diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h
index 05fff4db1b48bd6501aa6436238e96397e2de8f6..6b0f835f7361fb54b9826bdec7e1819333f989df 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h
@@ -5,125 +5,106 @@
 
 #include "device_math.h"
 
-extern "C"{
-
-  // NOTE: API for tensorGroupConvolution
-  // API for Running Tensor Convolution with CUTLASS
-  void* tensorConvCutlass(void* input, void* filter,
-			  int vertical_pad, int horizontal_pad,
-			  int vertical_stride, int horizontal_stride,
-			  int conv_mode, int conv_groups);
-
-  void* tensorHalfConvCutlass(void* input, void* filter,
-			      int vertical_pad, int horizontal_pad,
-			      int vertical_stride, int horizontal_stride,
-			      int conv_mode, int conv_groups);
-
-
-  // Perforated Tensor Conv with 'perforation_rate' parameter
-  void* tensorConvPerf(void* input, void* filter,
-		       int vertical_pad, int horizontal_pad,
-		       int vertical_stride, int horizontal_stride,
-		       int conv_mode, int conv_groups, int row, int col);
-
-  void* tensorConvolutionKernelSamp(void* input, void* filter_ptr,
-				    int vertical_pad, int horizontal_pad,
-				    int vertical_stride, int horizontal_stride,
-				    int conv_mode, int conv_groups,
-				    int skip_every);
-
-  void* tensorConvPerfCuda(void* input, void* filter,
-			   int vertical_pad, int horizontal_pad,
-			   int vertical_stride, int horizontal_stride,
-			   int conv_mode, int conv_groups,
-			   int row, int col, int start);
-
-  void* tensorConvPerfSim(void* input_ptr, void* filter_ptr,
-			  int vertical_pad, int horizontal_pad,
-			  int vertical_stride, int horizontal_stride,
-			  int conv_mode, int conv_groups, int row, int col);
-
-  
-  void* tensorConvPerfCudaHalf(void* input_ptr, void* filter_ptr,
-			       int vertical_pad, int horizontal_pad,
-			       int vertical_stride, int horizontal_stride,
-			       int conv_mode, int conv_groups,
-			       int row, int col, int start);
-  
-  void sampleFilter(Tensor* filter, int skip_rate, int skip_offset);
-
-  void* tensorConvSampSim(void* input_ptr, void* filter_ptr,
-			  int vertical_pad, int horizontal_pad,
-			  int vertical_stride, int horizontal_stride,
-			  int conv_mode, int conv_groups,
-			  int skip_rate, int skip_offset);
-
-  void* tensorConvSampSim2(void* input_ptr, void* filter_ptr,
-			  int vertical_pad, int horizontal_pad,
-			  int vertical_stride, int horizontal_stride,
-			  int conv_mode, int conv_groups,
-			  int skip_rate, int skip_offset, float interpolation_rate);
-  
-
-  void *autotuner_tensorFft(void *input, bool inverse);
-
-
-  void *autotuner_tensorReduce(void *input, size_t axis, MathOp func);
-
-
-  void *autotuner_tensorProjectiveT(void *input, void *transformation);
-  
-
-  void *autotuner_tensorMap1(MathOp func, void *input);
-
-  void *autotuner_tensorMap2(MathOp func, void *input1, void *input2);
-	
-
-  void *autotuner_tensorMap3(MathOp func, void *input1, void *input2, void *input3);
-
-  void* tensorConvInputHalf(void* input_ptr, void* filter_ptr,
-              int vertical_pad, int horizontal_pad, int vertical_stride,
-              int horizontal_stride, int conv_mode, int conv_groups,
-              int skip_every, int skip_offset);
-
-  void* tensorConvApproxHalf(void* input_ptr, void* filter_ptr,
-			     int vertical_pad, int horizontal_pad,
-			     int vertical_stride, int horizontal_stride,
-			     int conv_mode, int conv_groups,
-			     int row, int col,
-			     int skip_every, int skip_offset);
-
-  void* tensorConvApprox(void* input_ptr, void* filter_ptr,
-			 int vertical_pad, int horizontal_pad,
-			 int vertical_stride, int horizontal_stride,
-			 int conv_mode, int conv_groups,
-			 int row, int col,
-			 int skip_every, int skip_offset);
-
-  void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
-			     int vertical_pad, int horizontal_pad,
-			     int vertical_stride, int horizontal_stride,
-			     int conv_mode, int conv_groups,
-			     int row, int col,
-			     int skip_every, int skip_offset);
-
-
-  void* PROMISE_Conv(void* input, float i_min, float i_max,
-		     void* filter, float w_min, float w_max,
-		     void* bias, float b_min, float b_max,
-		     int conv_pad_h, int conv_pad_w,
-		     int conv_stride_h, int conv_stride_w,
-		     int pool_id, int pool_size, int pool_stride,
-		     int activation_id, // Relu, Tanh, ClipRelu
-		     float out_min, float out_max, int swing);
-
-
-  void* PROMISE_FC(void* input, float i_min, float i_max,
-		   void* weights, float w_min, float w_max,
-		   void* bias, float b_min, float b_max,
-		   int activation_id,
-		   float out_min, float out_max, int swing);
-  
+extern "C" {
+
+// NOTE: API for tensorGroupConvolution
+// API for running tensor convolution with CUTLASS
+void *tensorConvCutlass(void *input, void *filter, int vertical_pad,
+                        int horizontal_pad, int vertical_stride,
+                        int horizontal_stride, int conv_mode, int conv_groups);
+
+void *tensorHalfConvCutlass(void *input, void *filter, int vertical_pad,
+                            int horizontal_pad, int vertical_stride,
+                            int horizontal_stride, int conv_mode,
+                            int conv_groups);
+
+// Perforated tensor convolution; 'row' and 'col' set the perforation rate
+void *tensorConvPerf(void *input, void *filter, int vertical_pad,
+                     int horizontal_pad, int vertical_stride,
+                     int horizontal_stride, int conv_mode, int conv_groups,
+                     int row, int col);
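+// Illustrative call (hypothetical values, not from the original docs):
+// perforate every 2nd output row, keep all columns:
+//   void *out = tensorConvPerf(input, filter, 1, 1, 1, 1, 1, 1,
+//                              /*row=*/2, /*col=*/1);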
+
+void *tensorConvolutionKernelSamp(void *input, void *filter_ptr,
+                                  int vertical_pad, int horizontal_pad,
+                                  int vertical_stride, int horizontal_stride,
+                                  int conv_mode, int conv_groups,
+                                  int skip_every);
+
+void *tensorConvPerfCuda(void *input, void *filter, int vertical_pad,
+                         int horizontal_pad, int vertical_stride,
+                         int horizontal_stride, int conv_mode, int conv_groups,
+                         int row, int col, int start);
+
+void *tensorConvPerfSim(void *input_ptr, void *filter_ptr, int vertical_pad,
+                        int horizontal_pad, int vertical_stride,
+                        int horizontal_stride, int conv_mode, int conv_groups,
+                        int row, int col);
+
+void *tensorConvPerfCudaHalf(void *input_ptr, void *filter_ptr,
+                             int vertical_pad, int horizontal_pad,
+                             int vertical_stride, int horizontal_stride,
+                             int conv_mode, int conv_groups, int row, int col,
+                             int start);
+
+void sampleFilter(Tensor *filter, int skip_rate, int skip_offset);
+
+void *tensorConvSampSim(void *input_ptr, void *filter_ptr, int vertical_pad,
+                        int horizontal_pad, int vertical_stride,
+                        int horizontal_stride, int conv_mode, int conv_groups,
+                        int skip_rate, int skip_offset);
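+// Illustrative call (hypothetical values): sample the filter at skip_rate 2,
+// offset 0, so roughly every 2nd filter element is skipped:
+//   void *out = tensorConvSampSim(input, filter, 1, 1, 1, 1, 1, 1,
+//                                 /*skip_rate=*/2, /*skip_offset=*/0);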
+
+void *tensorConvSampSim2(void *input_ptr, void *filter_ptr, int vertical_pad,
+                         int horizontal_pad, int vertical_stride,
+                         int horizontal_stride, int conv_mode, int conv_groups,
+                         int skip_rate, int skip_offset,
+                         float interpolation_rate);
+
+void *autotuner_tensorFft(void *input, bool inverse);
+
+void *autotuner_tensorReduce(void *input, size_t axis, MathOp func);
+
+void *autotuner_tensorProjectiveT(void *input, void *transformation);
+
+void *autotuner_tensorMap1(MathOp func, void *input);
+
+void *autotuner_tensorMap2(MathOp func, void *input1, void *input2);
+
+void *autotuner_tensorMap3(MathOp func, void *input1, void *input2,
+                           void *input3);
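+// (The tensorMap* variants presumably apply 'func' elementwise over one, two,
+// or three input tensors; MathOp is defined in device_math.h.)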
+
+void *tensorConvInputHalf(void *input_ptr, void *filter_ptr, int vertical_pad,
+                          int horizontal_pad, int vertical_stride,
+                          int horizontal_stride, int conv_mode, int conv_groups,
+                          int skip_every, int skip_offset);
+
+void *tensorConvApproxHalf(void *input_ptr, void *filter_ptr, int vertical_pad,
+                           int horizontal_pad, int vertical_stride,
+                           int horizontal_stride, int conv_mode,
+                           int conv_groups, int row, int col, int skip_every,
+                           int skip_offset);
+
+void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad,
+                       int horizontal_pad, int vertical_stride,
+                       int horizontal_stride, int conv_mode, int conv_groups,
+                       int row, int col, int skip_every, int skip_offset);
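+// Illustrative call (hypothetical values): no perforation (row = col = 1),
+// skipping every 4th filter element:
+//   void *out = tensorConvApprox(input, filter, 1, 1, 1, 1, 1, 1,
+//                                /*row=*/1, /*col=*/1,
+//                                /*skip_every=*/4, /*skip_offset=*/0);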
+
+void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad,
+                            int horizontal_pad, int vertical_stride,
+                            int horizontal_stride, int conv_mode,
+                            int conv_groups, int row, int col, int skip_every,
+                            int skip_offset);
+
+void *PROMISE_Conv(void *input, float i_min, float i_max, void *filter,
+                   float w_min, float w_max, void *bias, float b_min,
+                   float b_max, int conv_pad_h, int conv_pad_w,
+                   int conv_stride_h, int conv_stride_w, int pool_id,
+                   int pool_size, int pool_stride,
+                   int activation_id, // Relu, Tanh, ClipRelu
+                   float out_min, float out_max, int swing);
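+// Illustrative call (all values hypothetical): convolution with unit padding
+// and stride, the given input/weight/bias/output value ranges, pooling and
+// activation ids, and swing level 4:
+//   void *out = PROMISE_Conv(input, -1.0f, 1.0f, filter, -0.5f, 0.5f,
+//                            bias, -0.1f, 0.1f, 1, 1, 1, 1,
+//                            /*pool_id=*/0, /*pool_size=*/2, /*pool_stride=*/2,
+//                            /*activation_id=*/0, -1.0f, 1.0f, /*swing=*/4);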
+
+void *PROMISE_FC(void *input, float i_min, float i_max, void *weights,
+                 float w_min, float w_max, void *bias, float b_min, float b_max,
+                 int activation_id, float out_min, float out_max, int swing);
 }
 
 #endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_knob_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_knob_utils.h
index eb6516faab23dfbe8b2c0b14e5bf16f52ee9cd4e..4611ae2218f2b838ff6cbae90824b5c8f07349ec 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_knob_utils.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_knob_utils.h
@@ -1,96 +1,68 @@
-
-
 #ifndef APPROX_KNOBS_UTILS
 #define APPROX_KNOBS_UTILS
 
-
-#include <sstream>
 #include <fstream>
 #include <map>
+#include <sstream>
 #include <vector>
 
+class PerfParams {
 
-class PerfParams{
-
- public:
+public:
   int row;
   int col;
   int skip_offset;
 
   PerfParams();
-  
+
   PerfParams(int row1, int col1, int skip_offset1);
-  
 };
 
-
-
-
-class PerfParamSet{
+class PerfParamSet {
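+  // Maps integer knob ids to their perforation parameters (perf_knob_map
+  // below); getPerfParams() performs the lookup.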
 
 private:
-  
   std::map<int, PerfParams> perf_knob_map;
-  
-public:
 
+public:
   PerfParamSet();
-  
-  PerfParams getPerfParams(int knob_id);  
 
+  PerfParams getPerfParams(int knob_id);
 };
- 
-
-
 
+class SampParams {
 
-
-class SampParams{
-
- public:  
+public:
   int skip_rate;
   int skip_offset;
   float interpolation_id;
-  
+
   SampParams();
-  
+
   SampParams(int skip_rate1, int skip_offset1, float interpolation_id1);
-  		
 };
 
+class SampParamSet {
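+  // Maps integer knob ids to their sampling parameters (samp_knob_map below);
+  // getSampParams() performs the lookup.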
 
-
-class SampParamSet{
-
- private:
-
+private:
   std::map<int, SampParams> samp_knob_map;
- 
- public:
 
+public:
   SampParamSet();
 
   SampParams getSampParams(int knob_id);
-   
 };
 
-
-
-  
-
 class RedSampParams {
 
- public:
+public:
   float skip_ratio;
   bool is_half;
 
   RedSampParams();
 
   RedSampParams(float skip_ratio1, bool is_half1);
-  		
 };
 
 RedSampParams getRedSampParams(int knob_id);
 
-
 #endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
index 22a6b5ca951793d26003f0c5ff4dc1e7d4c39f95..98d6d63eadc44b171b54bd09a9096d072c4be10d 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
@@ -1,213 +1,218 @@
 #include "tensor_utils.h"
 
-
-//produces N COL MAJOR matrixes with H_out*W_out rows and reduced_filter_elem cols
-__global__ void convToGemmApproxHalf(__half * const __restrict__ output,
-				     const __half * const __restrict input, const int N, const int C,
-				     const int H, const int W, const int KH,
-				     const int KW, const int V_pad,
-				     const int H_pad, const int H_out,
-				     const int W_out, const int V_stride,
-				     const int H_stride, const int reduced_filter_elem,
-				     const int skip_every) {
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-  const int w = tx % W_out; //output width index (col number)
-  const int inH = h * V_stride - V_pad; //input height index (row number)
-  const int inW = w * H_stride - H_pad; //input width index (col number)
-  if(n < N) { //is thread id within bounds?
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-	if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element?
-	  const int output_col = filter_elem_num - (filter_elem_num/skip_every); //calculate output column, taking skipping into account
-	  if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	  else
-	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = 0;
-	}
+// produces N COL MAJOR matrices with H_out*W_out rows and reduced_filter_elem
+// cols
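+// (e.g. with skip_every = 3, filter elements 2, 5, 8, ... are dropped and the
+// remaining elements are packed into the reduced_filter_elem output columns)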
+__global__ void convToGemmApproxHalf(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;             // output width index (col number)
+  const int inH = h * V_stride - V_pad; // input height index (row number)
+  const int inW = w * H_stride - H_pad; // input width index (col number)
+  if (n < N) {                          // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if (filter_elem_num % skip_every !=
+            skip_every - 1) { // are we including this filter element?
+          const int output_col =
+              filter_elem_num -
+              (filter_elem_num / skip_every); // calculate output column, taking
+                                              // skipping into account
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[((n * reduced_filter_elem + output_col) * H_out + h) *
+                       W_out +
+                   w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[((n * reduced_filter_elem + output_col) * H_out + h) *
+                       W_out +
+                   w] = 0;
+        }
       }
     }
   }
 }
 
-
-//This skips every xth row
-//H_eff is the number of rows calculated exactly
-__global__
-void convToGemmPerfRow(float * const __restrict__ output,
-		       const float * const __restrict input, const int N, const int C,
-		       const int H, const int W, const int KH, const int KW, const int V_pad,
-		       const int H_pad, const int H_out, const int W_out, const int V_stride,
-		       const int H_stride, const int x, const int start, const int H_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_eff * W_out); //output image number
-  const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
-  const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
-  const int w = tx % W_out; //output width index (col number)
+// This skips every xth row
+// H_eff is the number of rows calculated exactly
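+// (e.g. with x = 3, start = 0, output rows 2, 5, 8, ... are skipped here and
+// later reconstructed by approxInterpolateRow)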
+__global__ void
+convToGemmPerfRow(float *const __restrict__ output,
+                  const float *const __restrict input, const int N, const int C,
+                  const int H, const int W, const int KH, const int KW,
+                  const int V_pad, const int H_pad, const int H_out,
+                  const int W_out, const int V_stride, const int H_stride,
+                  const int x, const int start, const int H_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_eff * W_out);               // output image number
+  const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_eff * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out; // output width index (col number)
   int past_start = (h % (x - 1) >= (x - 1 - start));
-  const int inH = (h / (x - 1) * x + h % (x-1) +
-		   past_start) * V_stride - V_pad; //input height index (row number)
-  const int inW = w * H_stride - H_pad; //input width index (col number)
-  if(n < N) { //is thread id within bounds?
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-
-	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	  output[((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w] =
-	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	else
-	  output[((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w] = 0;
-
+  const int inH = (h / (x - 1) * x + h % (x - 1) + past_start) * V_stride -
+                  V_pad;                // input height index (row number)
+  const int inW = w * H_stride - H_pad; // input width index (col number)
+  if (n < N) {                          // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out +
+                 w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out +
+                 w] = 0;
       }
     }
   }
-
 }
 
-
-//For use in tensorConvPerfCuda
-//Interpolates every xth row starting from x - 1 - start
-//N is total number of elements in final output array
-__global__
-void approxInterpolateRow(int N, int old_h, int n, int c, int h, int w,
-			  float *old_data, float *new_data, int x, int start){
+// For use in tensorConvPerfCuda
+// Interpolates every xth row starting from x - 1 - start
+// N is total number of elements in final output array
+__global__ void approxInterpolateRow(int N, int old_h, int n, int c, int h,
+                                     int w, float *old_data, float *new_data,
+                                     int x, int start) {
 
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
 
-  for(int i = index; i < N; i += stride){
+  for (int i = index; i < N; i += stride) {
     int col = ((i % (c * h * w)) % (h * w)) % w;
     int row = ((i % (c * h * w)) % (h * w)) / w;
     int ch = (i % (c * h * w)) / (h * w);
     int n = i / (c * h * w);
     int past_start = ((row % x) >= (x - 1 - start));
 
-    if(row == h-1)
+    if (row == h - 1)
       new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) +
+                   col];
     else if (row == 0)
       new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
-    else if(row % x == x - 1 - start){
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
+    else if (row % x == x - 1 - start) {
       int past_startO = ((row - 1) % x) > (x - 1 - start);
-      int oldIdx1 = n * (c * old_h * w) + ch * (old_h * w) +
-	((x-1) * ((row - 1) / x) + (row-1) % x - past_startO) * (w) + col;
+      int oldIdx1 =
+          n * (c * old_h * w) + ch * (old_h * w) +
+          ((x - 1) * ((row - 1) / x) + (row - 1) % x - past_startO) * (w) + col;
 
       new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	(old_data[oldIdx1] + old_data[oldIdx1 + 1 * w]) / 2;
-    }
-    else
+          (old_data[oldIdx1] + old_data[oldIdx1 + 1 * w]) / 2;
+    } else
       new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * old_h * w) + ch * (old_h * w) +
-		 ((x-1) * (row / x) + row % x - past_start )  * (w) + col];
-
-
+          old_data[n * (c * old_h * w) + ch * (old_h * w) +
+                   ((x - 1) * (row / x) + row % x - past_start) * (w) + col];
   }
-
 }
 
-
-//This skips every xth row
-//W_eff is the number of cols calculated exactly
-__global__
-void convToGemmPerfCol(float * const __restrict__ output,
-		       const float * const __restrict input, const int N, const int C,
-		       const int H, const int W, const int KH, const int KW, const int V_pad,
-		       const int H_pad, const int H_out, const int W_out, const int V_stride,
-		       const int H_stride, const int x, const int start, const int W_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_eff); //output image number
-  const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
-  const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
-  const int w = tx % W_eff; //output width index (col number)
+// This skips every xth col
+// W_eff is the number of cols calculated exactly
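+// (analogous to convToGemmPerfRow: cols with col % x == x - 1 - start are
+// skipped here and later reconstructed by approxInterpolateCol)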
+__global__ void
+convToGemmPerfCol(float *const __restrict__ output,
+                  const float *const __restrict input, const int N, const int C,
+                  const int H, const int W, const int KH, const int KW,
+                  const int V_pad, const int H_pad, const int H_out,
+                  const int W_out, const int V_stride, const int H_stride,
+                  const int x, const int start, const int W_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_eff);               // output image number
+  const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_eff) / W_eff; // output height index (row
+                                              // number)
+  const int w = tx % W_eff; // output width index (col number)
   int past_start = (w % (x - 1)) >= (x - 1 - start);
-  const int inH = h * V_stride - V_pad; //input height index (row number)
-  const int inW = (w / (x - 1) * x + w % (x-1) +
-		   past_start) * H_stride - H_pad; //input width index (col number)
-  if(n < N) { //is thread id within bounds?
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-
-	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	  output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] =
-	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	else
-	  output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = 0;
-
+  const int inH = h * V_stride - V_pad; // input height index (row number)
+  const int inW = (w / (x - 1) * x + w % (x - 1) + past_start) * H_stride -
+                  H_pad; // input width index (col number)
+  if (n < N) {           // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff +
+                 w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff +
+                 w] = 0;
       }
     }
   }
-
 }
 
-
-//For use in tensorConvPerfCuda
-//Interpolates every xth col starting from x - 1 - start
-//N is total number of elements in final output array
-__global__
-void approxInterpolateCol(int N, int old_w, int n, int c, int h, int w,
-			  float *old_data, float *new_data, int x, int start){
+// For use in tensorConvPerfCuda
+// Interpolates every xth col starting from x - 1 - start
+// N is total number of elements in final output array
+__global__ void approxInterpolateCol(int N, int old_w, int n, int c, int h,
+                                     int w, float *old_data, float *new_data,
+                                     int x, int start) {
 
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
 
-  for(int i = index; i < N; i += stride){
+  for (int i = index; i < N; i += stride) {
     int col = ((i % (c * h * w)) % (h * w)) % w;
     int row = ((i % (c * h * w)) % (h * w)) / w;
     int ch = (i % (c * h * w)) / (h * w);
     int n = i / (c * h * w);
     int past_start = ((col % x) >= (x - 1 - start));
 
-    if(col == w-1)
+    if (col == w - 1)
       new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) +
+                   old_w - 1];
     else if (col == 0)
       new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
-    else if(col % x == x - 1 - start){
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+    else if (col % x == x - 1 - start) {
       int past_startO = ((col - 1) % x) > (x - 1 - start);
       int oldIdx1 = n * (c * h * old_w) + ch * (h * old_w) + row * old_w +
-	((x-1) * ((col - 1) / x) + (col-1) % x - past_startO);
+                    ((x - 1) * ((col - 1) / x) + (col - 1) % x - past_startO);
 
       new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	(old_data[oldIdx1] + old_data[oldIdx1 + 1]) / 2;
-    }
-    else
+          (old_data[oldIdx1] + old_data[oldIdx1 + 1]) / 2;
+    } else
       new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w +
-		 ((x-1) * (col / x) + col % x - past_start)];
-
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w +
+                   ((x - 1) * (col / x) + col % x - past_start)];
   }
-
 }
 
-
-
-//start has to be less than row or less than col
-//row and col have to be >= 0
-//row = col = 1 means no perforation
-void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
-			 int vertical_pad, int horizontal_pad, int vertical_stride,
-			 int horizontal_stride, int conv_mode, int conv_groups,
-			 int row, int col, int start){
+// start has to be less than row or less than col
+// row and col have to be >= 1
+// row = col = 1 means no perforation
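+// row > 1 perforates rows (and takes precedence), else col > 1 perforates
+// columns, else a dense im2col + GEMM convolution runs.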
+void *tensorConvPerfCuda(void *input_ptr, void *filter_ptr, int vertical_pad,
+                         int horizontal_pad, int vertical_stride,
+                         int horizontal_stride, int conv_mode, int conv_groups,
+                         int row, int col, int start) {
 
   INFO("*** TensorConvolution (output perforation) \n");
   profileEvent("Conv");
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
-  
-  Tensor* output;
+
+  Tensor *output;
   // TODO: Support other cases;
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
@@ -216,389 +221,377 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
   convertToFP32(input);
   convertToFP32(filter);
   profileEvent("H2F_end");
-  
+
   int n, c, h, w; // output dimensions
   n = input->dims.dim_sizes[0];
-  c = filter->dims.dim_sizes[0]; //number of filters
+  c = filter->dims.dim_sizes[0]; // number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
 
   h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
   int h_eff = h - h / row;
-  if(h % row > row - 1 - start)
+  if (h % row > row - 1 - start)
     h_eff = h_eff - 1;
 
-  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
+      1;
   int w_eff = w - w / col;
-  if(w % col > col - 1 - start)
+  if (w % col > col - 1 - start)
     w_eff = w_eff - 1;
 
-
   Tensor *new_output;
-  if(row > 1){
-    output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h_eff, w);
+  if (row > 1) {
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    float * convData;
+    float *convData;
     int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
-
-    convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1], input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3], KH, KW, vertical_pad,
-					       horizontal_pad, h, w,
-					       vertical_stride, horizontal_stride, row, start, h_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
 
+    convToGemmPerfRow<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        row, start, h_eff);
 
     checkCudaErrors(cudaDeviceSynchronize());
 
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h_eff * w, c, num_filter_elem,
-					      &alpha,
-					      convData, h_eff * w, num_filter_elem * h_eff * w,
-					      (float *)filter->gpu_data, num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data, h_eff * w, c * h_eff * w,
-					      n));
-
-    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-					 CUDNN_TENSOR_NCHW, n, c, h, w);
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem,
+        &alpha, convData, h_eff * w, num_filter_elem * h_eff * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h_eff * w, c * h_eff * w, n));
+
+    new_output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
 
-    //interpolate
-    int numBlocks = (n * c * h * w  + 127) / 128;
-    approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w,
-					    (float *) output->gpu_data, (float *) new_output->gpu_data,
-					    row, start);
+    // interpolate
+    int numBlocks = (n * c * h * w + 127) / 128;
+    approxInterpolateRow<<<numBlocks, 128>>>(
+        n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, row, start);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  }
-  else if(col > 1){
-    
-    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+  } else if (col > 1) {
+
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    float * convData;
+    float *convData;
     int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
-
-    convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1], input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3], KH, KW,
-					       vertical_pad, horizontal_pad, h, w,
-					       vertical_stride, horizontal_stride, col, start, w_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
 
+    convToGemmPerfCol<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        col, start, w_eff);
 
     checkCudaErrors(cudaDeviceSynchronize());
 
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w_eff, c, num_filter_elem,
-					      &alpha,
-					      convData, h * w_eff, num_filter_elem * h * w_eff,
-					      (float *)filter->gpu_data, num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data, h * w_eff, c * h * w_eff,
-					      n));
-
-    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-					 CUDNN_TENSOR_NCHW, n, c, h, w);
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem,
+        &alpha, convData, h * w_eff, num_filter_elem * h * w_eff,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h * w_eff, c * h * w_eff, n));
+
+    new_output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
 
-    //interpolate
-    int numBlocks = (n * c * h * w  + 127) / 128;
-    approxInterpolateCol<<<numBlocks,128>>>(n * c * h * w, w_eff, n, c, h, w,
-					    (float *)output->gpu_data, (float *)new_output->gpu_data,
-					    col, start);
+    // interpolate
+    int numBlocks = (n * c * h * w + 127) / 128;
+    approxInterpolateCol<<<numBlocks, 128>>>(
+        n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, col, start);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  }
-  else{
-    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w);
+  } else {
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    float * convData;
+    float *convData;
     int convDataSize = sizeof(float) * n * num_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-    convToGemmApprox<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					      input->dims.dim_sizes[1], input->dims.dim_sizes[2],
-					      input->dims.dim_sizes[3], KH, KW,
-					      vertical_pad, horizontal_pad, h, w,
-					      vertical_stride, horizontal_stride,
-					      num_filter_elem, c * h * w);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    convToGemmApprox<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        num_filter_elem, c * h * w);
     checkCudaErrors(cudaDeviceSynchronize());
-    //Do the matrix multiplication. Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
+    // Do the matrix multiplication. Want to multiply convData by
+    // filter->gpu_data[f * chan * KH * KW]
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w, c, num_filter_elem,
-					      &alpha,
-					      convData, h * w, num_filter_elem * h * w,
-					      (float *)filter->gpu_data, num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data, h * w, c * h * w,
-					      n));
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, num_filter_elem,
+        &alpha, convData, h * w, num_filter_elem * h * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h * w, c * h * w, n));
 
     new_output = output;
     cudaFree(convData);
   }
 
-
   profileEvent("Conv_end"); //, true);
- 
-  
+
   return new_output;
 }
 
-__global__
-void convToGemmPerfRowHalf(__half * const __restrict__ output,
-			   const __half * const __restrict input, const int N, const int C,
-			   const int H, const int W, const int KH, const int KW, const int V_pad,
-			   const int H_pad, const int H_out, const int W_out, const int V_stride,
-			   const int H_stride, const int x, const int start, const int H_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_eff * W_out); //output image number
-  const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
-  const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
-  const int w = tx % W_out; //output width index (col number)
+__global__ void convToGemmPerfRowHalf(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int H_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_eff * W_out);               // output image number
+  const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_eff * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out; // output width index (col number)
   int past_start = (h % (x - 1) >= (x - 1 - start));
-  const int inH = (h / (x - 1) * x + h % (x-1) +
-		   past_start) * V_stride - V_pad; //input height index (row number)
-  const int inW = w * H_stride - H_pad; //input width index (col number)
-  if(n < N) { //is thread id within bounds?
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-
-	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	  output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] =
-	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	else
-	  output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] = 0;
-
+  const int inH = (h / (x - 1) * x + h % (x - 1) + past_start) * V_stride -
+                  V_pad;                // input height index (row number)
+  const int inW = w * H_stride - H_pad; // input width index (col number)
+  if (n < N) {                          // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] = 0;
       }
     }
   }
-
 }
 
-
-//For use in tensorConvPerfCuda
-//Interpolates every xth row starting from x - 1 - start
-//N is total number of elements in final output array
-__global__
-void approxInterpolateRowHalf(int N, int old_h, int b, int c, int h, int w,
-			      __half *old_data, __half *new_data, int x, int start){
+// For use in tensorConvPerfCuda
+// Interpolates every xth row starting from x - 1 - start
+// N is total number of elements in final output array
+__global__ void approxInterpolateRowHalf(int N, int old_h, int b, int c, int h,
+                                         int w, __half *old_data,
+                                         __half *new_data, int x, int start) {
 
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
 
-  for(int i = index; i < N; i += stride){
+  for (int i = index; i < N; i += stride) {
     int col = ((i % (c * h * w)) % (h * w)) % w;
     int row = ((i % (c * h * w)) % (h * w)) / w;
     int ch = (i % (c * h * w)) / (h * w);
     int n = i / (c * h * w);
     int past_start = ((row % x) >= (x - 1 - start));
 
-    if(row == h-1)
+    if (row == h - 1)
       new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col];
+          old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) +
+                   col];
     else if (row == 0)
       new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col];
-    else if(row % x == x - 1 - start){
+          old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col];
+    else if (row % x == x - 1 - start) {
       int past_startO = ((row - 1) % x) > (x - 1 - start);
-      int oldIdx1 = ch * (b * old_h * w) + n * (old_h * w) +
-	((x-1) * ((row - 1) / x) + (row-1) % x - past_startO) * (w) + col;
+      int oldIdx1 =
+          ch * (b * old_h * w) + n * (old_h * w) +
+          ((x - 1) * ((row - 1) / x) + (row - 1) % x - past_startO) * (w) + col;
 
       new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	__hdiv(__hadd(old_data[oldIdx1], old_data[oldIdx1 + 1 * w]), 2);
-    }
-    else
+          __hdiv(__hadd(old_data[oldIdx1], old_data[oldIdx1 + 1 * w]), 2);
+    } else
       new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[ch * (b * old_h * w) + n * (old_h * w) +
-		 ((x-1) * (row / x) + row % x - past_start )  * (w) + col];
-
-
+          old_data[ch * (b * old_h * w) + n * (old_h * w) +
+                   ((x - 1) * (row / x) + row % x - past_start) * (w) + col];
   }
-
 }
 
-
-//This skips every xth row
-//W_eff is the number of cols calculated exactly
-__global__
-void convToGemmPerfColHalf(__half * const __restrict__ output,
-			   const __half * const __restrict input, const int N, const int C,
-			   const int H, const int W, const int KH, const int KW, const int V_pad,
-			   const int H_pad, const int H_out, const int W_out, const int V_stride,
-			   const int H_stride, const int x, const int start, const int W_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_eff); //output image number
-  const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
-  const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
-  const int w = tx % W_eff; //output width index (col number)
+// This skips every xth col
+// W_eff is the number of cols calculated exactly
+__global__ void convToGemmPerfColHalf(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int W_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_eff);               // output image number
+  const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_eff) / W_eff; // output height index (row
+                                              // number)
+  const int w = tx % W_eff; // output width index (col number)
   int past_start = (w % (x - 1)) >= (x - 1 - start);
-  const int inH = h * V_stride - V_pad; //input height index (row number)
-  const int inW = (w / (x - 1) * x + w % (x-1) +
-		   past_start) * H_stride - H_pad; //input width index (col number)
-  if(n < N) { //is thread id within bounds?
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-
-	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	  output[((filter_elem_num * N + n) * H_out + h) * W_eff + w] =
-	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	else
-	  output[((filter_elem_num * N + n) * H_out + h) * W_eff + w] = 0;
-
+  const int inH = h * V_stride - V_pad; // input height index (row number)
+  const int inW = (w / (x - 1) * x + w % (x - 1) + past_start) * H_stride -
+                  H_pad; // input width index (col number)
+  if (n < N) {           // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[((filter_elem_num * N + n) * H_out + h) * W_eff + w] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[((filter_elem_num * N + n) * H_out + h) * W_eff + w] = 0;
       }
     }
   }
-
 }
 
-
-//For use in tensorConvPerfCuda
-//Interpolates every xth col starting from x - 1 - start
-//N is total number of elements in final output array
-__global__
-void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w,
-			      __half *old_data, __half *new_data, int x, int start){
-
+// For use in tensorConvPerfCuda
+// Interpolates every xth col starting from x - 1 - start
+// N is total number of elements in final output array
+__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h,
+                                         int w, __half *old_data,
+                                         __half *new_data, int x, int start) {
 
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
 
-  for(int i = index; i < N; i += stride){
+  for (int i = index; i < N; i += stride) {
     int col = ((i % (c * h * w)) % (h * w)) % w;
     int row = ((i % (c * h * w)) % (h * w)) / w;
     int ch = (i % (c * h * w)) / (h * w);
     int n = i / (c * h * w);
     int past_start = ((col % x) >= (x - 1 - start));
 
-    if(col == w-1)
+    if (col == w - 1)
       new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1];
+          old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) +
+                   old_w - 1];
     else if (col == 0)
       new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)];
-    else if(col % x == x - 1 - start){
+          old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)];
+    else if (col % x == x - 1 - start) {
       int past_startO = ((col - 1) % x) > (x - 1 - start);
       int oldIdx1 = ch * (b * h * old_w) + n * (h * old_w) + row * old_w +
-	((x-1) * ((col - 1) / x) + (col-1) % x - past_startO);
+                    ((x - 1) * ((col - 1) / x) + (col - 1) % x - past_startO);
 
       new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	__hdiv(__hadd(old_data[oldIdx1], old_data[oldIdx1 + 1]), 2);
-    }
-    else
+          __hdiv(__hadd(old_data[oldIdx1], old_data[oldIdx1 + 1]), 2);
+    } else
       new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w +
-		 ((x-1) * (col / x) + col % x - past_start)];
-
-  } 
+          old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w +
+                   ((x - 1) * (col / x) + col % x - past_start)];
+  }
 }
 
-__global__
-void switchMatrix(int N, int n, int c, int h, int w, __half *old_data, __half *new_data){
+__global__ void switchMatrix(int N, int n, int c, int h, int w,
+                             __half *old_data, __half *new_data) {
 
   int i = blockIdx.x * blockDim.x + threadIdx.x;
-  if(i < N){
+  if (i < N) {
     int col = ((i % (c * h * w)) % (h * w)) % w;
     int row = ((i % (c * h * w)) % (h * w)) / w;
     int ch = (i % (c * h * w)) / (h * w);
     int n_new = i / (c * h * w);
 
-    new_data[((n_new * c + ch) * h + row ) * w + col] =
-      old_data[((ch * n + n_new) * h + row ) * w + col];
+    new_data[((n_new * c + ch) * h + row) * w + col] =
+        old_data[((ch * n + n_new) * h + row) * w + col];
   }
-
 }
-						
-
-//produces N COL MAJOR matrixes with H_out*W_out rows and reduced_filter_elem cols
-__global__ void convToGemmApproxHalfN(__half * const __restrict__ output,
-				     const __half * const __restrict input, const int N, const int C,
-				     const int H, const int W, const int KH, const int KW, const int V_pad,
-				     const int H_pad, const int H_out, const int W_out, const int V_stride,
-				     const int H_stride, const int reduced_filter_elem,
-				     const int skip_every) {
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-  const int w = tx % W_out; //output width index (col number)
-  const int inH = h * V_stride - V_pad; //input height index (row number)
-  const int inW = w * H_stride - H_pad; //input width index (col number)
-  if(n < N) { //is thread id within bounds?
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-	const int output_col = filter_elem_num; //calculate output column, taking skipping into account
-	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	  output[((output_col * N + n) * H_out + h) * W_out + w] =
-	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	else
-	  output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
 
+// produces N COL MAJOR matrices with H_out*W_out rows and reduced_filter_elem
+// cols
+__global__ void convToGemmApproxHalfN(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;             // output width index (col number)
+  const int inH = h * V_stride - V_pad; // input height index (row number)
+  const int inW = w * H_stride - H_pad; // input width index (col number)
+  if (n < N) {                          // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j;              // index of this filter element
+        const int output_col = filter_elem_num; // output column (no filter
+                                                // elements are skipped here)
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[((output_col * N + n) * H_out + h) * W_out + w] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
       }
     }
   }
 }
 
-//start has to be less than row or less than col
-//row and col have to be >= 0
-//row = col = 1 means no perforation
-void* tensorConvPerfCudaHalf(void* input_ptr, void* filter_ptr,
-			     int vertical_pad, int horizontal_pad, int vertical_stride,
-			     int horizontal_stride, int conv_mode, int conv_groups,
-			     int row, int col, int start){
+// start has to be less than row or less than col
+// row and col have to be >= 1
+// row = col = 1 means no perforation
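+// Half-precision variant of tensorConvPerfCuda: the im2col buffer is laid out
+// filter-element-major across the whole batch so a single cublasGemmEx covers
+// all images; switchMatrix (above) can then restore NCHW order.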
+void *tensorConvPerfCudaHalf(void *input_ptr, void *filter_ptr,
+                             int vertical_pad, int horizontal_pad,
+                             int vertical_stride, int horizontal_stride,
+                             int conv_mode, int conv_groups, int row, int col,
+                             int start) {
 
   INFO("*** TensorConvolution half perforation \n");
   profileEvent("#Conv");
 
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
@@ -611,48 +604,48 @@ void* tensorConvPerfCudaHalf(void* input_ptr, void* filter_ptr,
   convertToFP16(filter);
   profileEvent("F2H_end");
 
-  Tensor* output_half;
+  Tensor *output_half;
   int n, c, h, w; // output dimensions
   n = input->dims.dim_sizes[0];
-  c = filter->dims.dim_sizes[0]; //number of filters
+  c = filter->dims.dim_sizes[0]; // number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
 
   h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
   int h_eff = h - h / row;
-  if(h % row > row - 1 - start)
+  if (h % row > row - 1 - start)
     h_eff = h_eff - 1;
 
-  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
+      1;
   int w_eff = w - w / col;
-  if(w % col > col - 1 - start)
+  if (w % col > col - 1 - start)
     w_eff = w_eff - 1;
 
-
   Tensor *new_output;
-  if(row > 1){
-    output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, CUDNN_TENSOR_NCHW,
-					  n, c, h_eff, w);
+  if (row > 1) {
+    output_half = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                           CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output_half, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    __half * convData;
+    __half *convData;
     int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 256;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
-
-    convToGemmPerfRowHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-						   input->dims.dim_sizes[1], input->dims.dim_sizes[2],
-						   input->dims.dim_sizes[3], KH, KW, vertical_pad,
-						   horizontal_pad, h, w,
-						   vertical_stride, horizontal_stride, row, start, h_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
 
+    convToGemmPerfRowHalf<<<gridSize, blockSize>>>(
+        convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        row, start, h_eff);
 
     checkCudaErrors(cudaDeviceSynchronize());
 
@@ -661,56 +654,51 @@ void* tensorConvPerfCudaHalf(void* input_ptr, void* filter_ptr,
     const __half *alpha_half = &alf;
     const __half *beta_half = &bet;
 
-    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-				 n * h_eff * w, c, num_filter_elem,
-				 alpha_half,
-				 convData, CUDA_R_16F, n * h_eff * w,
-				 (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-				 beta_half,
-				 (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w,
-				 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+    checkCudaErrors(cublasGemmEx(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h_eff * w, c,
+        num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h_eff * w,
+        (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, beta_half,
+        (__half *)output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w,
+        CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
-    
-    new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-					 CUDNN_TENSOR_NCHW, n, c, h, w);
+    new_output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                          CUDNN_TENSOR_NCHW, n, c, h, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
 
-    //interpolate
-    int numBlocks = (n * c * h * w  + 255) / 256;
-    approxInterpolateRowHalf<<<numBlocks,256>>>(n * c * h * w, h_eff, n, c, h, w,
-						(__half *)output_half->gpu_half_data,
-						(__half *)new_output->gpu_half_data,
-						row, start);
+    // interpolate
+    int numBlocks = (n * c * h * w + 255) / 256;
+    approxInterpolateRowHalf<<<numBlocks, 256>>>(
+        n * c * h * w, h_eff, n, c, h, w, (__half *)output_half->gpu_half_data,
+        (__half *)new_output->gpu_half_data, row, start);
     cudaDeviceSynchronize();
 
     freeTensor(output_half);
     cudaFree(convData);
-  }
-  else if(col > 1){
-    output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-					  CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+  } else if (col > 1) {
+    output_half = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                           CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output_half, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    __half * convData;
+    __half *convData;
     int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 256;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
-
-    convToGemmPerfColHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-						   input->dims.dim_sizes[1], input->dims.dim_sizes[2],
-						   input->dims.dim_sizes[3], KH, KW, vertical_pad,
-						   horizontal_pad, h, w,
-						   vertical_stride, horizontal_stride, col, start, w_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
 
+    convToGemmPerfColHalf<<<gridSize, blockSize>>>(
+        convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        col, start, w_eff);
 
     checkCudaErrors(cudaDeviceSynchronize());
 
@@ -719,94 +707,84 @@ void* tensorConvPerfCudaHalf(void* input_ptr, void* filter_ptr,
     const __half *alpha_half = &alf;
     const __half *beta_half = &bet;
 
-    
-    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-				 n * h * w_eff, c, num_filter_elem,
-				 alpha_half,
-				 convData, CUDA_R_16F, n * h * w_eff,
-				 (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-				 beta_half,
-				 (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff,
-				 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+    checkCudaErrors(cublasGemmEx(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w_eff, c,
+        num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w_eff,
+        (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, beta_half,
+        (__half *)output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff,
+        CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
-    
-    new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-					 CUDNN_TENSOR_NCHW, n, c, h, w);
+    new_output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                          CUDNN_TENSOR_NCHW, n, c, h, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
 
-    //interpolate
-    int numBlocks = (n * c * h * w  + 255) / 256;
-    approxInterpolateColHalf<<<numBlocks,256>>>(n * c * h * w, w_eff, n, c, h, w,
-						(__half *)output_half->gpu_half_data,
-						(__half *)new_output->gpu_half_data,
-						col, start);
-    
+    // interpolate
+    int numBlocks = (n * c * h * w + 255) / 256;
+    approxInterpolateColHalf<<<numBlocks, 256>>>(
+        n * c * h * w, w_eff, n, c, h, w, (__half *)output_half->gpu_half_data,
+        (__half *)new_output->gpu_half_data, col, start);
+
     cudaDeviceSynchronize();
 
     freeTensor(output_half);
     cudaFree(convData);
 
-  }
-  else{
-    output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-					  CUDNN_TENSOR_NCHW, c, n, h, w);
+  } else {
+    output_half = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                           CUDNN_TENSOR_NCHW, c, n, h, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output_half, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    __half * convData;
+    __half *convData;
     int convDataSize = sizeof(__half) * n * num_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 256;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-    convToGemmApproxHalfN<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-						   input->dims.dim_sizes[1], input->dims.dim_sizes[2],
-						   input->dims.dim_sizes[3], KH, KW,
-						   vertical_pad, horizontal_pad, h, w,
-						   vertical_stride, horizontal_stride,
-						   num_filter_elem, c * h * w);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    convToGemmApproxHalfN<<<gridSize, blockSize>>>(
+        convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        num_filter_elem, c * h * w);
     checkCudaErrors(cudaDeviceSynchronize());
-    //Do the matrix multiplication. Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
+    // Do the matrix multiplication. Want to multiply convData by
+    // filter->gpu_half_data[f * chan * KH * KW]
     const __half alf = approx_float_to_half(1.0);
     const __half bet = approx_float_to_half(0.0);
     const __half *alpha_half = &alf;
     const __half *beta_half = &bet;
 
-    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-				 n * h * w, c, num_filter_elem,
-				 alpha_half,
-				 convData, CUDA_R_16F, n * h * w,
-				 (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-				 beta_half,
-				 (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w,
-				 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-
-
+    checkCudaErrors(cublasGemmEx(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem,
+        alpha_half, convData, CUDA_R_16F, n * h * w,
+        (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, beta_half,
+        (__half *)output_half->gpu_half_data, CUDA_R_16F, n * h * w, CUDA_R_16F,
+        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
     // profileEvent("gemm_end", true);
-    new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-					  CUDNN_TENSOR_NCHW, n, c, h, w);
+    new_output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                          CUDNN_TENSOR_NCHW, n, c, h, w);
     changeTensorPlacement(new_output, DEVICE);
 
-    
-    int numBlocks = (n * c * h * w  + 255) / 256;
-    switchMatrix<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
-				    (__half *)output_half->gpu_half_data,
-				    (__half *)new_output->gpu_half_data);
+    int numBlocks = (n * c * h * w + 255) / 256;
+    switchMatrix<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
+                                     (__half *)output_half->gpu_half_data,
+                                     (__half *)new_output->gpu_half_data);
 
     checkCudaErrors(cudaDeviceSynchronize());
-    
+
     cudaFree(convData);
     freeTensor(output_half);
   }
 
-  //profileEvent("Conv_end", true);
+  // profileEvent("Conv_end", true);
 
   profileEvent("H2F_start");
   convertToFP32_offline(new_output);
@@ -817,113 +795,117 @@ void* tensorConvPerfCudaHalf(void* input_ptr, void* filter_ptr,
   return new_output;
 }
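
The h_eff/w_eff bookkeeping above rewards a standalone check: roughly one in
every `row` output rows is perforated, and the trailing h % row rows cost one
extra skip when they contain a perforated position. A minimal sketch of the
same formula (illustrative values only):

    #include <cstdio>

    // Mirrors the effective-height computation in tensorConvPerfCudaHalf.
    static int effective_rows(int h, int row, int start) {
      int h_eff = h - h / row;
      if (h % row > row - 1 - start)
        h_eff = h_eff - 1;
      return h_eff;
    }

    int main() {
      printf("%d\n", effective_rows(10, 3, 0)); // 7: 3 of 10 rows dropped
      printf("%d\n", effective_rows(11, 3, 2)); // 7: remainder adds one skip
      return 0;
    }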
 
-
-//produces COL MAJOR matrix with reduced_filter_elem rows and NF cols
-__global__ void createReducedFiltersHalf(__half * output,
-					 const __half * const __restrict input, const int NF,
-					 const int num_filter_elem, const int reduced_filter_elem,
-					 const int skip_every, const int skip_offset) {
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int fIdx = tx / num_filter_elem; //filter index
-  const int offset = tx % num_filter_elem; //offset within filter
-  if(fIdx < NF) { //is thread id within bounds?
-    if(offset % skip_every != skip_every-1-skip_offset) { //are we including this filter element?
-      const int output_row = offset - ((offset + skip_every)/skip_every); //correct for skip_every = 2 
-      output[fIdx*reduced_filter_elem + output_row] =
-	__hmul((skip_every / (skip_every - 1)), input[tx]);
+// produces COL MAJOR matrix with reduced_filter_elem rows and NF cols
+__global__ void
+createReducedFiltersHalf(__half *output, const __half *const __restrict input,
+                         const int NF, const int num_filter_elem,
+                         const int reduced_filter_elem, const int skip_every,
+                         const int skip_offset) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int fIdx = tx / num_filter_elem;                // filter index
+  const int offset = tx % num_filter_elem;              // offset within filter
+  if (fIdx < NF) { // is thread id within bounds?
+    if (offset % skip_every !=
+        skip_every - 1 - skip_offset) { // are we including this filter element?
+      const int output_row =
+          offset -
+          ((offset + skip_every) / skip_every); // correct for skip_every = 2
+      output[fIdx * reduced_filter_elem + output_row] =
+          __hmul((skip_every / (skip_every - 1)), input[tx]);
     }
   }
 }
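
Two details of createReducedFiltersHalf are easy to miss. First, the
compaction index comes out contiguous when the dropped element leads each
skip_every-sized group (skip_offset = skip_every - 1, as in the sketch below).
Second, skip_every / (skip_every - 1) is integer division, so the rescaling of
surviving weights is exact only for skip_every == 2 (factor 2; it degenerates
to 1 for larger strides), consistent with the "correct for skip_every = 2"
remark. A host-side replay of the row mapping (toy numbers):

    #include <cstdio>

    int main() {
      // 9 filter elements, skip_every = 3, skip_offset = 2:
      // offsets 0, 3, 6 are dropped, survivors compact to rows 0..5.
      const int num_filter_elem = 9, skip_every = 3, skip_offset = 2;
      for (int offset = 0; offset < num_filter_elem; ++offset) {
        if (offset % skip_every != skip_every - 1 - skip_offset) {
          const int output_row =
              offset - ((offset + skip_every) / skip_every);
          printf("keep offset %d -> reduced row %d\n", offset, output_row);
        } else {
          printf("drop offset %d\n", offset);
        }
      }
      return 0;
    }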
 
-
-//COL Major matrix with N*H*W columns and reduced_filter_elem rows
-//skip_every = 1 means no perforation
-__global__ void convToGemmHalfInput(__half * const __restrict__ output,
-				    const __half * const __restrict input,
-				    const int N, const int C,
-				    const int H, const int W,
-				    const int KH, const int KW, const int V_pad,
-				    const int H_pad, const int H_out,
-				    const int W_out, const int V_stride,
-				    const int H_stride, const int reduced_filter_elem,
-				    const int skip_every, const int skip_offset) {
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-  const int w = tx % W_out; //output width index (col number)
-  const int inH = h * V_stride - V_pad; //input height index (row number)
-  const int inW = w * H_stride - H_pad; //input width index (col number)
-  if(n < N) { //is thread id within bounds?
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-	
-	if(filter_elem_num % skip_every != skip_every-1-skip_offset) {
-	  int output_col = filter_elem_num -
-	    ((filter_elem_num + skip_every)/skip_every);
-	  if(skip_every == 1)
-	    output_col = filter_elem_num;
-	  if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	    output[((output_col*N + n) * H_out + h) * W_out + w] =
-	      input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	  else
-	    output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
-	}
+// COL Major matrix with N*H*W columns and reduced_filter_elem rows
+// skip_every = 1 means no perforation
+__global__ void
+convToGemmHalfInput(__half *const __restrict__ output,
+                    const __half *const __restrict input, const int N,
+                    const int C, const int H, const int W, const int KH,
+                    const int KW, const int V_pad, const int H_pad,
+                    const int H_out, const int W_out, const int V_stride,
+                    const int H_stride, const int reduced_filter_elem,
+                    const int skip_every, const int skip_offset) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;             // output width index (col number)
+  const int inH = h * V_stride - V_pad; // input height index (row number)
+  const int inW = w * H_stride - H_pad; // input width index (col number)
+  if (n < N) {                          // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+
+        if (filter_elem_num % skip_every != skip_every - 1 - skip_offset) {
+          int output_col =
+              filter_elem_num - ((filter_elem_num + skip_every) / skip_every);
+          if (skip_every == 1)
+            output_col = filter_elem_num;
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[((output_col * N + n) * H_out + h) * W_out + w] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
+        }
       }
     }
   }
 }
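
convToGemmHalfInput keeps exactly the columns whose filter_elem_num survives
the modulus test, and the caller sizes convData from the closed-form
reduced_filter_elem; the two agree. A quick consistency check of that
invariant (toy numbers, illustrative only):

    #include <cstdio>

    int main() {
      const int num_filter_elem = 27, skip_every = 4, skip_offset = 3;
      // Caller's closed form (see tensorConvInputHalf below).
      int reduced_filter_elem =
          num_filter_elem - (num_filter_elem / skip_every);
      if (num_filter_elem % skip_every > skip_every - 1 - skip_offset)
        reduced_filter_elem = reduced_filter_elem - 1;
      // Kernel's per-element test.
      int kept = 0;
      for (int e = 0; e < num_filter_elem; ++e)
        if (e % skip_every != skip_every - 1 - skip_offset)
          ++kept;
      printf("closed form %d == counted %d\n", reduced_filter_elem, kept);
      return 0;
    }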
 
-
-//COL Major matrix with N*H*W columns and reduced_filter_elem rows
-//Can only be used when skipping every other element in input sampling
-__global__ void convToGemmHalfInput2(__half * const __restrict__ output,
-				     const __half * const __restrict input,
-				     const int N, const int C,
-				     const int H, const int W,
-				     const int KH, const int KW, const int V_pad,
-				     const int H_pad, const int H_out,
-				     const int W_out, const int V_stride,
-				     const int H_stride, const int reduced_filter_elem,
-				     const int skip_every, const int skip_offset) {
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-  const int w = tx % W_out; //output width index (col number)
-  const int inH = h * V_stride - V_pad; //input height index (row number)
-  const int inW = w * H_stride - H_pad; //input width index (col number)
-  if(n < N) { //is thread id within bounds?
+// COL Major matrix with N*H*W columns and reduced_filter_elem rows
+// Can only be used when skipping every other element in input sampling
+__global__ void
+convToGemmHalfInput2(__half *const __restrict__ output,
+                     const __half *const __restrict input, const int N,
+                     const int C, const int H, const int W, const int KH,
+                     const int KW, const int V_pad, const int H_pad,
+                     const int H_out, const int W_out, const int V_stride,
+                     const int H_stride, const int reduced_filter_elem,
+                     const int skip_every, const int skip_offset) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;             // output width index (col number)
+  const int inH = h * V_stride - V_pad; // input height index (row number)
+  const int inW = w * H_stride - H_pad; // input width index (col number)
+  if (n < N) {                          // is thread id within bounds?
     const int filter_elem_num = c * KH * KW;
-    for(int l = (filter_elem_num % 2) + skip_offset; l < KH * KW; l+=2) {
+    for (int l = (filter_elem_num % 2) + skip_offset; l < KH * KW; l += 2) {
       int i = l / KW;
       int j = l % KW;
 
       const int new_idx = filter_elem_num + i * KW + j;
-      const int output_col = new_idx - ((new_idx + skip_every)/2); //new output column
-      if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	output[((output_col*N + n) * H_out + h) * W_out + w] =
-	  input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+      const int output_col =
+          new_idx - ((new_idx + skip_every) / 2); // new output column
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+        output[((output_col * N + n) * H_out + h) * W_out + w] =
+            input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
       else
-	output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
-
+        output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
     }
   }
 }
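
The callers take convToGemmHalfInput2 only when skip_every == 2. It walks the
kept kernel offsets directly, and the (filter_elem_num % 2) term flips the
starting parity per channel block so the global every-other pattern stays
aligned when KH * KW is odd. A host enumeration of the sampled positions
(toy 3x3 kernel, illustrative only):

    #include <cstdio>

    int main() {
      const int KH = 3, KW = 3, skip_offset = 1;
      for (int c = 0; c < 2; ++c) { // two input channels of one filter
        const int filter_elem_num = c * KH * KW; // channel block start
        for (int l = (filter_elem_num % 2) + skip_offset; l < KH * KW;
             l += 2) {
          const int i = l / KW, j = l % KW; // kernel row/col actually read
          printf("chan %d keeps (i=%d, j=%d), absolute offset %d\n",
                 c, i, j, filter_elem_num + l);
        }
      }
      return 0;
    }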
 
-//Baseline: skip_offset = skip_every = 1
-void* tensorConvInputHalf(void* input_ptr, void* filter_ptr,
-			  int vertical_pad, int horizontal_pad, int vertical_stride,
-			  int horizontal_stride, int conv_mode, int conv_groups,
-			  int skip_every, int skip_offset){
+// Baseline: skip_offset = skip_every = 1
+void *tensorConvInputHalf(void *input_ptr, void *filter_ptr, int vertical_pad,
+                          int horizontal_pad, int vertical_stride,
+                          int horizontal_stride, int conv_mode, int conv_groups,
+                          int skip_every, int skip_offset) {
 
   INFO("*** TensorHConvolution input sampling \n");
   profileEvent("#Conv");
 
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
@@ -936,107 +918,97 @@ void* tensorConvInputHalf(void* input_ptr, void* filter_ptr,
   convertToFP16(filter);
   profileEvent("F2H_end");
 
-  Tensor* output;
-  Tensor* new_output;
+  Tensor *output;
+  Tensor *new_output;
   // TODO: Support other cases;
   int n, c, h, w; // output dimensions
   n = input->dims.dim_sizes[0];
-  c = filter->dims.dim_sizes[0]; //number of filters
+  c = filter->dims.dim_sizes[0]; // number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
   h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-  output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-				   CUDNN_TENSOR_NCHW, n, c, h, w);
-  new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-				       CUDNN_TENSOR_NCHW, n, c, h, w); 
+  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
+      1;
+  output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                    CUDNN_TENSOR_NCHW, n, c, h, w);
+  new_output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                        CUDNN_TENSOR_NCHW, n, c, h, w);
 
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(output, DEVICE);
   changeTensorPlacement(new_output, DEVICE);
   // NOTE: Necessary to insert the above call for every output tensor
 
-  //total number of filter elem
+  // total number of filter elem
   const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
-  //reduced number after skipping
+  // reduced number after skipping
   int reduced_filter_elem;
-  if(skip_offset != skip_every){
-    reduced_filter_elem = num_filter_elem - (num_filter_elem/skip_every);
-    if(num_filter_elem % skip_every > skip_every - 1 - skip_offset)
+  if (skip_offset != skip_every) {
+    reduced_filter_elem = num_filter_elem - (num_filter_elem / skip_every);
+    if (num_filter_elem % skip_every > skip_every - 1 - skip_offset)
       reduced_filter_elem = reduced_filter_elem - 1;
-  }
-  else
+  } else
     reduced_filter_elem = num_filter_elem;
-  
-  __half * convData;
+
+  __half *convData;
   int convDataSize = sizeof(__half) * n * reduced_filter_elem * h * w;
   checkCudaErrors(cudaMalloc(&convData, convDataSize));
-  __half * reducedFilter;
-  checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem));
+  __half *reducedFilter;
+  checkCudaErrors(
+      cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem));
   const int filtBlockSize = 128;
-  const int filtGridSize = (c * num_filter_elem + filtBlockSize - 1) / filtBlockSize;
-  if(skip_offset != skip_every)
-    createReducedFiltersHalf<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-							      (__half *)filter->gpu_half_data,
-							      c, num_filter_elem, reduced_filter_elem,
-							      skip_every, skip_offset);
+  const int filtGridSize =
+      (c * num_filter_elem + filtBlockSize - 1) / filtBlockSize;
+  if (skip_offset != skip_every)
+    createReducedFiltersHalf<<<filtGridSize, filtBlockSize>>>(
+        reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+        reduced_filter_elem, skip_every, skip_offset);
   checkCudaErrors(cudaDeviceSynchronize());
 
   const int blockSize = 256;
-  const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-  if(skip_every == 2){
-    convToGemmHalfInput2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-						  input->dims.dim_sizes[1],
-						  input->dims.dim_sizes[2],
-						  input->dims.dim_sizes[3],
-						  KH, KW, vertical_pad, horizontal_pad,
-						  h, w, vertical_stride, horizontal_stride,
-						  reduced_filter_elem, skip_every,
-						  skip_offset);
-  }
-  else{
-    convToGemmHalfInput<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-						 input->dims.dim_sizes[1],
-						 input->dims.dim_sizes[2],
-						 input->dims.dim_sizes[3],
-						 KH, KW, vertical_pad, horizontal_pad,
-						 h, w, vertical_stride, horizontal_stride,
-						 reduced_filter_elem, skip_every,
-						 skip_offset);
+  const int gridSize =
+      (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+  if (skip_every == 2) {
+    convToGemmHalfInput2<<<gridSize, blockSize>>>(
+        convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        reduced_filter_elem, skip_every, skip_offset);
+  } else {
+    convToGemmHalfInput<<<gridSize, blockSize>>>(
+        convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        reduced_filter_elem, skip_every, skip_offset);
   }
- 
+
   checkCudaErrors(cudaDeviceSynchronize());
-  //Do the matrix multiplication. Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
+  // Do the matrix multiplication. Want to multiply convData by
+  // filter->gpu_half_data[f * chan * KH * KW]
   const __half alf = approx_float_to_half(1.0);
   const __half bet = approx_float_to_half(0.0);
   const __half *alpha_half = &alf;
   const __half *beta_half = &bet;
 
-  if(skip_offset != skip_every)
-    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-				 n * h * w, c, reduced_filter_elem,
-				 alpha_half,
-				 convData, CUDA_R_16F, n * h * w,
-				 reducedFilter, CUDA_R_16F, reduced_filter_elem,
-				 beta_half,
-				 (__half*) output->gpu_half_data, CUDA_R_16F, n * h * w,
-				 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+  if (skip_offset != skip_every)
+    checkCudaErrors(
+        cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c,
+                     reduced_filter_elem, alpha_half, convData, CUDA_R_16F,
+                     n * h * w, reducedFilter, CUDA_R_16F, reduced_filter_elem,
+                     beta_half, (__half *)output->gpu_half_data, CUDA_R_16F,
+                     n * h * w, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
   else
-    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-				 n * h * w, c, reduced_filter_elem,
-				 alpha_half,
-				 convData, CUDA_R_16F, n * h * w,
-				 (__half*) filter->gpu_half_data, CUDA_R_16F,
-				 reduced_filter_elem,
-				 beta_half,
-				 (__half*) output->gpu_half_data, CUDA_R_16F, n * h * w,
-				 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-
-
-  int numBlocks = (n * c * h * w  + 255) / 256;
-  switchMatrix<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
-				  (__half *)output->gpu_half_data,
-				  (__half *)new_output->gpu_half_data);
+    checkCudaErrors(cublasGemmEx(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c,
+        reduced_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w,
+        (__half *)filter->gpu_half_data, CUDA_R_16F, reduced_filter_elem,
+        beta_half, (__half *)output->gpu_half_data, CUDA_R_16F, n * h * w,
+        CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+  int numBlocks = (n * c * h * w + 255) / 256;
+  switchMatrix<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
+                                   (__half *)output->gpu_half_data,
+                                   (__half *)new_output->gpu_half_data);
 
   checkCudaErrors(cudaDeviceSynchronize());
 
@@ -1054,131 +1026,133 @@ void* tensorConvInputHalf(void* input_ptr, void* filter_ptr,
   profileEvent("#Conv_end", true);
 
   return new_output;
-
 }
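
Each cublasGemmEx call above computes a column-major C = A * B with
m = n * h * w rows (lda = ldc = n * h * w), c output columns, and
k = reduced_filter_elem, so the product is laid out [c][n][h][w]; that is why
switchMatrix runs before returning. A toy CPU reference of the same shape
conventions (a sketch, not the runtime's GEMM):

    #include <cstdio>

    // Column-major C(m x nCols) = A(m x k, lda = m) * B(k x nCols, ldb = k).
    static void gemm_nn(int m, int nCols, int k,
                        const float *A, const float *B, float *C) {
      for (int col = 0; col < nCols; ++col)
        for (int r = 0; r < m; ++r) {
          float acc = 0.f;
          for (int kk = 0; kk < k; ++kk)
            acc += A[kk * m + r] * B[col * k + kk];
          C[col * m + r] = acc;
        }
    }

    int main() {
      const int m = 4, nCols = 2, k = 3; // m = n*h*w, nCols = c filters
      float A[m * k], B[k * nCols], C[m * nCols];
      for (int i = 0; i < m * k; ++i) A[i] = 1.f;
      for (int i = 0; i < k * nCols; ++i) B[i] = 0.5f;
      gemm_nn(m, nCols, k, A, B, C);
      printf("C[0] = %.1f (expect k * 0.5 = 1.5)\n", C[0]);
      return 0;
    }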
 
-//COL Major matrix with N*H*W columns and reduced_filter_elem rows
-//skip_every = 1 means no perforation
-__global__ void convToGemmFullInput(float * const __restrict__ output,
-				    const float * const __restrict input,
-				    const int N, const int C,
-				    const int H, const int W,
-				    const int KH, const int KW, const int V_pad,
-				    const int H_pad, const int H_out,
-				    const int W_out, const int V_stride,
-				    const int H_stride, const int reduced_filter_elem,
-				    const int skip_every, const int skip_offset) {
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-  const int w = tx % W_out; //output width index (col number)
-  const int inH = h * V_stride - V_pad; //input height index (row number)
-  const int inW = w * H_stride - H_pad; //input width index (col number)
-  if(n < N) { //is thread id within bounds?
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-
-	if(filter_elem_num % skip_every != skip_every-1-skip_offset) {
-	    int output_col = filter_elem_num -
-	      ((filter_elem_num + skip_every)/skip_every);
-	    if(skip_every == 1)
-	      output_col = filter_elem_num;
-	    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	      output[((output_col*N + n) * H_out + h) * W_out + w] =
-		input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	    else
-	      output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
-	}
+// COL Major matrix with N*H*W columns and reduced_filter_elem rows
+// skip_every = 1 means no perforation
+__global__ void
+convToGemmFullInput(float *const __restrict__ output,
+                    const float *const __restrict input, const int N,
+                    const int C, const int H, const int W, const int KH,
+                    const int KW, const int V_pad, const int H_pad,
+                    const int H_out, const int W_out, const int V_stride,
+                    const int H_stride, const int reduced_filter_elem,
+                    const int skip_every, const int skip_offset) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;             // output width index (col number)
+  const int inH = h * V_stride - V_pad; // input height index (row number)
+  const int inW = w * H_stride - H_pad; // input width index (col number)
+  if (n < N) {                          // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+
+        if (filter_elem_num % skip_every != skip_every - 1 - skip_offset) {
+          int output_col =
+              filter_elem_num - ((filter_elem_num + skip_every) / skip_every);
+          if (skip_every == 1)
+            output_col = filter_elem_num;
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[((output_col * N + n) * H_out + h) * W_out + w] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
+        }
       }
     }
   }
 }
 
-
-//COL Major matrix with N*H*W columns and reduced_filter_elem rows
-//Can only be used when skipping every other element in input sampling
-__global__ void convToGemmFullInput2(float * const __restrict__ output,
-				     const float * const __restrict input,
-				     const int N, const int C,
-				     const int H, const int W,
-				     const int KH, const int KW, const int V_pad,
-				     const int H_pad, const int H_out,
-				     const int W_out, const int V_stride,
-				     const int H_stride, const int reduced_filter_elem,
-				     const int skip_every, const int skip_offset) {
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-  const int w = tx % W_out; //output width index (col number)
-  const int inH = h * V_stride - V_pad; //input height index (row number)
-  const int inW = w * H_stride - H_pad; //input width index (col number)
-  if(n < N) { //is thread id within bounds?
+// COL Major matrix with N*H*W columns and reduced_filter_elem rows
+// Can only be used when skipping every other element in input sampling
+__global__ void
+convToGemmFullInput2(float *const __restrict__ output,
+                     const float *const __restrict input, const int N,
+                     const int C, const int H, const int W, const int KH,
+                     const int KW, const int V_pad, const int H_pad,
+                     const int H_out, const int W_out, const int V_stride,
+                     const int H_stride, const int reduced_filter_elem,
+                     const int skip_every, const int skip_offset) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;             // output width index (col number)
+  const int inH = h * V_stride - V_pad; // input height index (row number)
+  const int inW = w * H_stride - H_pad; // input width index (col number)
+  if (n < N) {                          // is thread id within bounds?
     const int filter_elem_num = c * KH * KW;
-    for(int l = (filter_elem_num % 2) + skip_offset; l < KH * KW; l+=2) {
+    for (int l = (filter_elem_num % 2) + skip_offset; l < KH * KW; l += 2) {
       int i = l / KW;
       int j = l % KW;
 
       const int new_idx = filter_elem_num + i * KW + j;
-      const int output_col = new_idx - ((new_idx + skip_every)/2); //new output column
-      if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	output[((output_col*N + n) * H_out + h) * W_out + w] =
-	  input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+      const int output_col =
+          new_idx - ((new_idx + skip_every) / 2); // new output column
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+        output[((output_col * N + n) * H_out + h) * W_out + w] =
+            input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
       else
-	output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
-
+        output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
     }
   }
 }
 
-
-//produces COL MAJOR matrix with reduced_filter_elem rows and NF cols
-__global__ void createReducedFiltersFull(float * output,
-					 const float * const __restrict input, const int NF,
-					 const int num_filter_elem, const int reduced_filter_elem,
-					 const int skip_every, const int skip_offset) {
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int fIdx = tx / num_filter_elem; //filter index
-  const int offset = tx % num_filter_elem; //offset within filter
-  if(fIdx < NF) { //is thread id within bounds?
-    if(offset % skip_every != skip_every-1-skip_offset) { //are we including this filter element?
-      const int output_row = offset - ((offset + skip_every)/skip_every); //correct for skip_every = 2
-            output[fIdx*reduced_filter_elem + output_row] =
-	      (skip_every / (skip_every - 1)) * input[tx];
+// produces COL MAJOR matrix with reduced_filter_elem rows and NF cols
+__global__ void
+createReducedFiltersFull(float *output, const float *const __restrict input,
+                         const int NF, const int num_filter_elem,
+                         const int reduced_filter_elem, const int skip_every,
+                         const int skip_offset) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int fIdx = tx / num_filter_elem;                // filter index
+  const int offset = tx % num_filter_elem;              // offset within filter
+  if (fIdx < NF) { // is thread id within bounds?
+    if (offset % skip_every !=
+        skip_every - 1 - skip_offset) { // are we including this filter element?
+      const int output_row =
+          offset -
+          ((offset + skip_every) / skip_every); // correct for skip_every = 2
+      output[fIdx * reduced_filter_elem + output_row] =
+          (skip_every / (skip_every - 1)) * input[tx];
     }
   }
 }
 
-__global__
-void switchMatrixFull(int N, int n, int c, int h, int w,
-		      float *old_data, float *new_data){
+__global__ void switchMatrixFull(int N, int n, int c, int h, int w,
+                                 float *old_data, float *new_data) {
 
   int i = blockIdx.x * blockDim.x + threadIdx.x;
-  if(i < N){
+  if (i < N) {
     int col = ((i % (c * h * w)) % (h * w)) % w;
     int row = ((i % (c * h * w)) % (h * w)) / w;
     int ch = (i % (c * h * w)) / (h * w);
     int n_new = i / (c * h * w);
 
-    new_data[((n_new * c + ch) * h + row ) * w + col] =
-      old_data[((ch * n + n_new) * h + row ) * w + col];
+    new_data[((n_new * c + ch) * h + row) * w + col] =
+        old_data[((ch * n + n_new) * h + row) * w + col];
   }
-
 }
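
switchMatrixFull (like its half twin switchMatrix) undoes that layout: element
(ch, n_new, row, col) of the GEMM output moves to NCHW position
(n_new, ch, row, col). A toy replay of the index swap (illustrative only):

    #include <cstdio>

    int main() {
      const int n = 2, c = 3, h = 1, w = 1; // tiny case: a 3x2 transpose
      for (int i = 0; i < n * c * h * w; ++i) {
        const int col = ((i % (c * h * w)) % (h * w)) % w;
        const int row = ((i % (c * h * w)) % (h * w)) / w;
        const int ch = (i % (c * h * w)) / (h * w);
        const int n_new = i / (c * h * w);
        const int dst = ((n_new * c + ch) * h + row) * w + col;
        const int src = ((ch * n + n_new) * h + row) * w + col;
        printf("new[%d] <- old[%d]\n", dst, src);
      }
      return 0;
    }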
 
-void* tensorConvApprox(void* input_ptr, void* filter_ptr,
-		       int vertical_pad, int horizontal_pad, int vertical_stride,
-		       int horizontal_stride, int conv_mode, int conv_groups,
-		       int row, int col, int skip_every, int offset){
+void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad,
+                       int horizontal_pad, int vertical_stride,
+                       int horizontal_stride, int conv_mode, int conv_groups,
+                       int row, int col, int skip_every, int offset) {
 
   INFO("*** TensorConvolution approximation \n");
   profileEvent("Conv");
 
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
@@ -1186,248 +1160,229 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
-  //profileEvent("H2F_start");
+  // profileEvent("H2F_start");
   convertToFP32(input);
   convertToFP32(filter);
-  //profileEvent("H2F_end");
+  // profileEvent("H2F_end");
 
   int n, c, h, w; // output dimensions
   n = input->dims.dim_sizes[0];
-  c = filter->dims.dim_sizes[0]; //number of filters
+  c = filter->dims.dim_sizes[0]; // number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
 
   h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
   int h_eff = h - h / row;
-  if(h % row > row - 1 - offset)
+  if (h % row > row - 1 - offset)
     h_eff = h_eff - 1;
 
-  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
+      1;
   int w_eff = w - w / col;
-  if(w % col > col - 1 - offset)
+  if (w % col > col - 1 - offset)
     w_eff = w_eff - 1;
 
-
-  Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
-				       CUDNN_TENSOR_NCHW, n, c, h, w);
+  Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                                CUDNN_TENSOR_NCHW, n, c, h, w);
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(new_output, DEVICE);
 
-  if(row > 1){
-    Tensor *output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				      CUDNN_TENSOR_NCHW, n, c, h_eff, w);
+  if (row > 1) {
+    Tensor *output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    float * convData;
+    float *convData;
     int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
-
-    convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1], input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3], KH, KW, vertical_pad,
-					       horizontal_pad, h, w,
-					       vertical_stride, horizontal_stride,
-					       row, offset, h_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
 
+    convToGemmPerfRow<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        row, offset, h_eff);
 
     checkCudaErrors(cudaDeviceSynchronize());
 
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h_eff * w, c, num_filter_elem,
-					      &alpha,
-					      convData, h_eff * w, num_filter_elem * h_eff * w,
-					      (float *)filter->gpu_data, num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data, h_eff * w, c * h_eff * w,
-					      n));
-
-    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-					  CUDNN_TENSOR_NCHW, n, c, h, w);
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem,
+        &alpha, convData, h_eff * w, num_filter_elem * h_eff * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h_eff * w, c * h_eff * w, n));
+
+    new_output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
 
-    //interpolate
-    int numBlocks = (n * c * h * w  + 127) / 128;
-    approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w,
-					    (float *) output->gpu_data, (float *) new_output->gpu_data,
-					    row, offset);
+    // interpolate
+    int numBlocks = (n * c * h * w + 127) / 128;
+    approxInterpolateRow<<<numBlocks, 128>>>(
+        n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, row, offset);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  }
-  else if(col > 1){
-    
-    Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+  } else if (col > 1) {
+
+    Tensor *output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    float * convData;
+    float *convData;
     int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
-
-    convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1], input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3], KH, KW,
-					       vertical_pad, horizontal_pad, h, w,
-					       vertical_stride, horizontal_stride,
-					       col, offset, w_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
 
+    convToGemmPerfCol<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        col, offset, w_eff);
 
     checkCudaErrors(cudaDeviceSynchronize());
 
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w_eff, c, num_filter_elem,
-					      &alpha,
-					      convData, h * w_eff, num_filter_elem * h * w_eff,
-					      (float *)filter->gpu_data, num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data, h * w_eff, c * h * w_eff,
-					      n));
-
-    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-					  CUDNN_TENSOR_NCHW, n, c, h, w);
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem,
+        &alpha, convData, h * w_eff, num_filter_elem * h * w_eff,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h * w_eff, c * h * w_eff, n));
+
+    new_output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
 
-    //interpolate
-    int numBlocks = (n * c * h * w  + 127) / 128;
-    approxInterpolateCol<<<numBlocks,128>>>(n * c * h * w, w_eff, n, c, h, w,
-					    (float *)output->gpu_data, (float *)new_output->gpu_data,
-					    col, offset);
+    // interpolate
+    int numBlocks = (n * c * h * w + 127) / 128;
+    approxInterpolateCol<<<numBlocks, 128>>>(
+        n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, col, offset);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  }
-  else{
-    Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
-					     CUDNN_TENSOR_NCHW, n, c, h, w);
+  } else {
+    Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                              CUDNN_TENSOR_NCHW, n, c, h, w);
 
-    //total number of filter elem
+    // total number of filter elem
     const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
-    //reduced number after skipping
+    // reduced number after skipping
     int reduced_filter_elem;
-    if(offset != skip_every){
-      reduced_filter_elem = num_filter_elem - (num_filter_elem/skip_every);
-      if(num_filter_elem % skip_every > skip_every - 1 - offset)
-	reduced_filter_elem = reduced_filter_elem - 1;
-    }
-    else
+    if (offset != skip_every) {
+      reduced_filter_elem = num_filter_elem - (num_filter_elem / skip_every);
+      if (num_filter_elem % skip_every > skip_every - 1 - offset)
+        reduced_filter_elem = reduced_filter_elem - 1;
+    } else
       reduced_filter_elem = num_filter_elem;
 
- 
-    float * convData;
+    float *convData;
     int convDataSize = sizeof(float) * n * reduced_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    float * reducedFilter;
-    checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem));
+    float *reducedFilter;
+    checkCudaErrors(
+        cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem));
     const int filtBlockSize = 128;
-    const int filtGridSize = (c * num_filter_elem + filtBlockSize - 1) / filtBlockSize;
-    if(offset != skip_every)
-      createReducedFiltersFull<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-								(float *)filter->gpu_data,
-								c, num_filter_elem, reduced_filter_elem,
-								skip_every, offset);
+    const int filtGridSize =
+        (c * num_filter_elem + filtBlockSize - 1) / filtBlockSize;
+    if (offset != skip_every)
+      createReducedFiltersFull<<<filtGridSize, filtBlockSize>>>(
+          reducedFilter, (float *)filter->gpu_data, c, num_filter_elem,
+          reduced_filter_elem, skip_every, offset);
     checkCudaErrors(cudaDeviceSynchronize());
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-    if(skip_every == 2){
-      convToGemmFullInput2<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-						    input->dims.dim_sizes[1],
-						    input->dims.dim_sizes[2],
-						    input->dims.dim_sizes[3],
-						    KH, KW, vertical_pad, horizontal_pad,
-						    h, w, vertical_stride, horizontal_stride,
-						    reduced_filter_elem, skip_every,
-						    offset);
-    }
-    else{
-      convToGemmFullInput<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-						   input->dims.dim_sizes[1],
-						   input->dims.dim_sizes[2],
-						   input->dims.dim_sizes[3],
-						   KH, KW, vertical_pad, horizontal_pad,
-						   h, w, vertical_stride, horizontal_stride,
-						   reduced_filter_elem, skip_every,
-						   offset);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    if (skip_every == 2) {
+      convToGemmFullInput2<<<gridSize, blockSize>>>(
+          convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, reduced_filter_elem, skip_every, offset);
+    } else {
+      convToGemmFullInput<<<gridSize, blockSize>>>(
+          convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, reduced_filter_elem, skip_every, offset);
     }
 
     checkCudaErrors(cudaDeviceSynchronize());
-    //Do the matrix multiplication. Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
+    // Do the matrix multiplication. Want to multiply convData by
+    // filter->gpu_data[f * chan * KH * KW]
     const float alpha = 1.0;
     const float beta = 0.0;
 
-    if(offset != skip_every)
-      checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-				   n * h * w, c, reduced_filter_elem,
-				   &alpha,
-				   convData, CUDA_R_32F, n * h * w,
-				   reducedFilter, CUDA_R_32F, reduced_filter_elem,
-				   &beta,
-				   (float *) output->gpu_data, CUDA_R_32F, n * h * w,
-				   CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+    if (offset != skip_every)
+      checkCudaErrors(cublasGemmEx(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c,
+          reduced_filter_elem, &alpha, convData, CUDA_R_32F, n * h * w,
+          reducedFilter, CUDA_R_32F, reduced_filter_elem, &beta,
+          (float *)output->gpu_data, CUDA_R_32F, n * h * w, CUDA_R_32F,
+          CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     else
-      checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-				   n * h * w, c, reduced_filter_elem,
-				   &alpha,
-				   convData, CUDA_R_32F, n * h * w,
-				   (float *) filter->gpu_data, CUDA_R_32F,
-				   reduced_filter_elem,
-				   &beta,
-				   (float *) output->gpu_data, CUDA_R_32F, n * h * w,
-				   CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-
-    int numBlocks = (n * c * h * w  + 255) / 256;
-    switchMatrixFull<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
-				    (float *)output->gpu_data,
-				    (float *)new_output->gpu_data);
+      checkCudaErrors(cublasGemmEx(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c,
+          reduced_filter_elem, &alpha, convData, CUDA_R_32F, n * h * w,
+          (float *)filter->gpu_data, CUDA_R_32F, reduced_filter_elem, &beta,
+          (float *)output->gpu_data, CUDA_R_32F, n * h * w, CUDA_R_32F,
+          CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    int numBlocks = (n * c * h * w + 255) / 256;
+    switchMatrixFull<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
+                                         (float *)output->gpu_data,
+                                         (float *)new_output->gpu_data);
 
     checkCudaErrors(cudaDeviceSynchronize());
-    
+
     cudaFree(convData);
     cudaFree(reducedFilter);
     freeTensor(output);
   }
 
   profileEvent("Conv_end");
-  
+
   return new_output;
-  
 }
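
The knobs compose as a fixed dispatch: row > 1 selects row perforation, else
col > 1 selects column perforation, else offset != skip_every selects filter
sampling, and offset == skip_every falls through to a dense im2col GEMM (the
baseline). A tiny sketch of that branch order (illustrative only):

    #include <cstdio>

    // Mirrors the branch structure of tensorConvApprox.
    static const char *approx_path(int row, int col, int skip_every,
                                   int offset) {
      if (row > 1) return "row perforation + interpolation";
      if (col > 1) return "column perforation + interpolation";
      if (offset != skip_every) return "filter sampling (reduced GEMM)";
      return "dense im2col GEMM (no approximation)";
    }

    int main() {
      printf("%s\n", approx_path(2, 1, 1, 1)); // perforate ~every 2nd row
      printf("%s\n", approx_path(1, 1, 4, 3)); // drop every 4th filter elem
      printf("%s\n", approx_path(1, 1, 1, 1)); // baseline
      return 0;
    }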
 
-void* tensorConvApproxHalf(void* input_ptr, void* filter_ptr,
-			   int vertical_pad, int horizontal_pad, int vertical_stride,
-			   int horizontal_stride, int conv_mode, int conv_groups,
-			   int row, int col, int skip_every, int offset){
+void *tensorConvApproxHalf(void *input_ptr, void *filter_ptr, int vertical_pad,
+                           int horizontal_pad, int vertical_stride,
+                           int horizontal_stride, int conv_mode,
+                           int conv_groups, int row, int col, int skip_every,
+                           int offset) {
 
   INFO("*** TensorConvolution half approximation \n");
   profileEvent("#Conv");
 
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
@@ -1442,50 +1397,49 @@ void* tensorConvApproxHalf(void* input_ptr, void* filter_ptr,
 
   int n, c, h, w; // output dimensions
   n = input->dims.dim_sizes[0];
-  c = filter->dims.dim_sizes[0]; //number of filters
+  c = filter->dims.dim_sizes[0]; // number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
 
   h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
   int h_eff = h - h / row;
-  if(h % row > row - 1 - offset)
+  if (h % row > row - 1 - offset)
     h_eff = h_eff - 1;
 
-  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
+      1;
   int w_eff = w - w / col;
-  if(w % col > col - 1 - offset)
+  if (w % col > col - 1 - offset)
     w_eff = w_eff - 1;
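+  // Worked example (illustrative): h = 10, row = 3, offset = 0 gives
+  // h_eff = 10 - 10/3 = 7 computed rows; since h % row = 1 is not greater
+  // than row - 1 - offset = 2, no extra row is dropped. Skipped rows are
+  // reconstructed later by approxInterpolateRowHalf.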
 
-
-  Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-				       CUDNN_TENSOR_NCHW, n, c, h, w);
+  Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                                CUDNN_TENSOR_NCHW, n, c, h, w);
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(new_output, DEVICE);
 
-  if(row > 1){
-    Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-						  CUDNN_TENSOR_NCHW,
-						  n, c, h_eff, w);
+  if (row > 1) {
+    Tensor *output_half = (Tensor *)create4DTensor(
+        (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output_half, DEVICE);
 
-    //total number of filter elem
+    // total number of filter elements
     const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    __half * convData;
+    __half *convData;
     int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 256;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
-
-    convToGemmPerfRowHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-						   input->dims.dim_sizes[1], input->dims.dim_sizes[2],
-						   input->dims.dim_sizes[3], KH, KW, vertical_pad,
-						   horizontal_pad, h, w, vertical_stride,
-						   horizontal_stride, row, offset, h_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
 
+    convToGemmPerfRowHalf<<<gridSize, blockSize>>>(
+        convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        row, offset, h_eff);
 
     checkCudaErrors(cudaDeviceSynchronize());
 
@@ -1494,49 +1448,45 @@ void* tensorConvApproxHalf(void* input_ptr, void* filter_ptr,
     const __half *alpha_half = &alf;
     const __half *beta_half = &bet;
 
-    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-				 n * h_eff * w, c, num_filter_elem,
-				 alpha_half,
-				 convData, CUDA_R_16F, n * h_eff * w,
-				 (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-				 beta_half,
-				 (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w,
-				 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-
-    //interpolate
-    int numBlocks = (n * c * h * w  + 255) / 256;
-    approxInterpolateRowHalf<<<numBlocks,256>>>(n * c * h * w, h_eff, n, c, h, w,
-						(__half *)output_half->gpu_half_data,
-						(__half *)new_output->gpu_half_data,
-						row, offset);
+    checkCudaErrors(cublasGemmEx(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h_eff * w, c,
+        num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h_eff * w,
+        (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, beta_half,
+        (__half *)output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w,
+        CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    // interpolate
+    int numBlocks = (n * c * h * w + 255) / 256;
+    approxInterpolateRowHalf<<<numBlocks, 256>>>(
+        n * c * h * w, h_eff, n, c, h, w, (__half *)output_half->gpu_half_data,
+        (__half *)new_output->gpu_half_data, row, offset);
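+    // The GEMM above produced only h_eff of the h output rows; the kernel
+    // launched here expands the result to the full n*c*h*w tensor, filling
+    // each skipped row from neighboring computed rows (see
+    // approxInterpolateRowHalf for the exact scheme).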
     cudaDeviceSynchronize();
 
     freeTensor(output_half);
     cudaFree(convData);
-  }
-  else if(col > 1){
-    Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-					  CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+  } else if (col > 1) {
+    Tensor *output_half = (Tensor *)create4DTensor(
+        (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output_half, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elements
     const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    __half * convData;
+    __half *convData;
     int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 256;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
-
-    convToGemmPerfColHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-						   input->dims.dim_sizes[1], input->dims.dim_sizes[2],
-						   input->dims.dim_sizes[3], KH, KW, vertical_pad,
-						   horizontal_pad, h, w, vertical_stride,
-						   horizontal_stride, col, offset, w_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
 
+    convToGemmPerfColHalf<<<gridSize, blockSize>>>(
+        convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        col, offset, w_eff);
 
     checkCudaErrors(cudaDeviceSynchronize());
 
@@ -1545,121 +1495,104 @@ void* tensorConvApproxHalf(void* input_ptr, void* filter_ptr,
     const __half *alpha_half = &alf;
     const __half *beta_half = &bet;
 
-    
-    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-				 n * h * w_eff, c, num_filter_elem,
-				 alpha_half,
-				 convData, CUDA_R_16F, n * h * w_eff,
-				 (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-				 beta_half,
-				 (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff,
-				 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-
-    //interpolate
-    int numBlocks = (n * c * h * w  + 255) / 256;
-    approxInterpolateColHalf<<<numBlocks,256>>>(n * c * h * w, w_eff, n, c, h, w,
-						(__half *)output_half->gpu_half_data,
-						(__half *)new_output->gpu_half_data,
-						col, offset);
-    
+    checkCudaErrors(cublasGemmEx(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w_eff, c,
+        num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w_eff,
+        (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, beta_half,
+        (__half *)output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff,
+        CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    // interpolate
+    int numBlocks = (n * c * h * w + 255) / 256;
+    approxInterpolateColHalf<<<numBlocks, 256>>>(
+        n * c * h * w, w_eff, n, c, h, w, (__half *)output_half->gpu_half_data,
+        (__half *)new_output->gpu_half_data, col, offset);
+
     cudaDeviceSynchronize();
 
     freeTensor(output_half);
     cudaFree(convData);
 
-  }
-  else{
-    Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-				   CUDNN_TENSOR_NCHW, n, c, h, w);
-    
-    //total number of filter elem
+  } else {
+    Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                              CUDNN_TENSOR_NCHW, n, c, h, w);
+
+    // total number of filter elements
     const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
-    //reduced number after skipping
+    // reduced number of filter elements after skipping
     int reduced_filter_elem;
-    if(offset != skip_every){
-      reduced_filter_elem = num_filter_elem - (num_filter_elem/skip_every);
-      if(num_filter_elem % skip_every > skip_every - 1 - offset)
-	reduced_filter_elem = reduced_filter_elem - 1;
-    }
-    else
+    if (offset != skip_every) {
+      reduced_filter_elem = num_filter_elem - (num_filter_elem / skip_every);
+      if (num_filter_elem % skip_every > skip_every - 1 - offset)
+        reduced_filter_elem = reduced_filter_elem - 1;
+    } else
       reduced_filter_elem = num_filter_elem;
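+    // Worked example (illustrative): num_filter_elem = 27, skip_every = 3,
+    // offset = 0 gives reduced_filter_elem = 27 - 27/3 = 18 retained
+    // weights per filter; with offset == skip_every, sampling is disabled
+    // and all 27 are kept.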
-    
-    __half * convData;
+
+    __half *convData;
     int convDataSize = sizeof(__half) * n * reduced_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    __half * reducedFilter;
-    checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem));
+    __half *reducedFilter;
+    checkCudaErrors(
+        cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem));
     const int filtBlockSize = 128;
-    const int filtGridSize = (c * num_filter_elem + filtBlockSize - 1) / filtBlockSize;
-    if(offset != skip_every)
-      createReducedFiltersHalf<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-								(__half *)filter->gpu_half_data,
-								c, num_filter_elem, reduced_filter_elem,
-								skip_every, offset);
+    const int filtGridSize =
+        (c * num_filter_elem + filtBlockSize - 1) / filtBlockSize;
+    if (offset != skip_every)
+      createReducedFiltersHalf<<<filtGridSize, filtBlockSize>>>(
+          reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+          reduced_filter_elem, skip_every, offset);
     checkCudaErrors(cudaDeviceSynchronize());
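+    // reducedFilter now holds, for each of the c filters, only the
+    // reduced_filter_elem retained weights, compacted contiguously so the
+    // GEMM below can use it as a dense operand; when offset == skip_every
+    // the original filter->gpu_half_data is used unchanged instead.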
-    
+
     const int blockSize = 256;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-    if(skip_every == 2){
-      convToGemmHalfInput2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-						    input->dims.dim_sizes[1],
-						    input->dims.dim_sizes[2],
-						    input->dims.dim_sizes[3],
-						    KH, KW, vertical_pad, horizontal_pad,
-						    h, w, vertical_stride, horizontal_stride,
-						    reduced_filter_elem, skip_every,
-						    offset);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    if (skip_every == 2) {
+      convToGemmHalfInput2<<<gridSize, blockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, reduced_filter_elem, skip_every, offset);
+    } else {
+      convToGemmHalfInput<<<gridSize, blockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, reduced_filter_elem, skip_every, offset);
     }
-    else{
-      convToGemmHalfInput<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-						   input->dims.dim_sizes[1],
-						   input->dims.dim_sizes[2],
-						   input->dims.dim_sizes[3],
-						   KH, KW, vertical_pad, horizontal_pad,
-						   h, w, vertical_stride, horizontal_stride,
-						   reduced_filter_elem, skip_every,
-						   offset);
-    }
-    
+
     checkCudaErrors(cudaDeviceSynchronize());
-    //Do the matrix multiplication. Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
+    // Do the matrix multiplication: multiply convData by
+    // filter->gpu_data[f * chan * KH * KW].
     const __half alf = approx_float_to_half(1.0);
     const __half bet = approx_float_to_half(0.0);
     const __half *alpha_half = &alf;
     const __half *beta_half = &bet;
-    
-    if(offset != skip_every)
-      checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-				   n * h * w, c, reduced_filter_elem,
-				   alpha_half,
-				   convData, CUDA_R_16F, n * h * w,
-				   reducedFilter, CUDA_R_16F, reduced_filter_elem,
-				   beta_half,
-				   (__half*) output->gpu_half_data, CUDA_R_16F, n * h * w,
-				   CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+
+    if (offset != skip_every)
+      checkCudaErrors(cublasGemmEx(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c,
+          reduced_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w,
+          reducedFilter, CUDA_R_16F, reduced_filter_elem, beta_half,
+          (__half *)output->gpu_half_data, CUDA_R_16F, n * h * w, CUDA_R_16F,
+          CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     else
-      checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-				   n * h * w, c, reduced_filter_elem,
-				   alpha_half,
-				   convData, CUDA_R_16F, n * h * w,
-				   (__half*) filter->gpu_half_data, CUDA_R_16F,
-				   reduced_filter_elem,
-				   beta_half,
-				   (__half*) output->gpu_half_data, CUDA_R_16F, n * h * w,
-				   CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-    
-    
-    int numBlocks = (n * c * h * w  + 255) / 256;
-    switchMatrix<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
-				    (__half *)output->gpu_half_data,
-				    (__half *)new_output->gpu_half_data);
-    
+      checkCudaErrors(cublasGemmEx(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c,
+          reduced_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w,
+          (__half *)filter->gpu_half_data, CUDA_R_16F, reduced_filter_elem,
+          beta_half, (__half *)output->gpu_half_data, CUDA_R_16F, n * h * w,
+          CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    int numBlocks = (n * c * h * w + 255) / 256;
+    switchMatrix<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
+                                     (__half *)output->gpu_half_data,
+                                     (__half *)new_output->gpu_half_data);
+
     checkCudaErrors(cudaDeviceSynchronize());
-    
+
     cudaFree(convData);
     cudaFree(reducedFilter);
     freeTensor(output);
-      
   }
 
   profileEvent("H2F_start");
@@ -1667,6 +1600,6 @@ void* tensorConvApproxHalf(void* input_ptr, void* filter_ptr,
   profileEvent("H2F_end");
 
   profileEvent("#Conv_end");
-  
+
   return new_output;
 }
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_utils.h
index 984ee7c25679e23b735320a419fc844a35055ea0..7118de5e20c7b565867b4a6282d72349b442584f 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_utils.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_utils.h
@@ -1,32 +1,29 @@
 
 
-extern "C"{
-
-
-__global__ void convToGemmApprox(float * const __restrict__ output,
-		      const float * const __restrict input, const int N, const int C,
-		      const int H, const int W,
-		      const int KH, const int KW, const int V_pad,
-		      const int H_pad, const int H_out,
-		      const int W_out, const int V_stride,
-		      const int H_stride, const int reduced_filter_elem,
-		      const int skip_every);
-
-
-void* tensorConvApprox(void* input_ptr, void* filter_ptr,
-		       int vertical_pad, int horizontal_pad, int vertical_stride,
-		       int horizontal_stride, int conv_mode, int conv_groups,
-		       int row, int col, int skip_every, int offset);
-
-
-void* tensorConvApproxHalf(void* input_ptr, void* filter_ptr,
-			   int vertical_pad, int horizontal_pad, int vertical_stride,
-			   int horizontal_stride, int conv_mode, int conv_groups,
-			   int row, int col, int skip_every, int offset);
-
-void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
-			   int vertical_pad, int horizontal_pad, int vertical_stride,
-			   int horizontal_stride, int conv_mode, int conv_groups,
-			   int row, int col, int skip_every, int offset);
-
+extern "C" {
+
+__global__ void
+convToGemmApprox(float *const __restrict__ output,
+                 const float *const __restrict__ input, const int N, const int C,
+                 const int H, const int W, const int KH, const int KW,
+                 const int V_pad, const int H_pad, const int H_out,
+                 const int W_out, const int V_stride, const int H_stride,
+                 const int reduced_filter_elem, const int skip_every);
+
+void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad,
+                       int horizontal_pad, int vertical_stride,
+                       int horizontal_stride, int conv_mode, int conv_groups,
+                       int row, int col, int skip_every, int offset);
+
+void *tensorConvApproxHalf(void *input_ptr, void *filter_ptr, int vertical_pad,
+                           int horizontal_pad, int vertical_stride,
+                           int horizontal_stride, int conv_mode,
+                           int conv_groups, int row, int col, int skip_every,
+                           int offset);
+
+void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad,
+                            int horizontal_pad, int vertical_stride,
+                            int horizontal_stride, int conv_mode,
+                            int conv_groups, int row, int col, int skip_every,
+                            int offset);
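+
+// Illustrative call (knob values assumed, not part of this patch): perforate
+// every 2nd output row of a stride-1, pad-1 convolution, with input sampling
+// disabled (offset == skip_every):
+//   void *out = tensorConvApprox(input, filter,
+//                                /*vertical_pad=*/1, /*horizontal_pad=*/1,
+//                                /*vertical_stride=*/1,
+//                                /*horizontal_stride=*/1,
+//                                /*conv_mode=*/1, /*conv_groups=*/1,
+//                                /*row=*/2, /*col=*/1,
+//                                /*skip_every=*/1, /*offset=*/1);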
 }
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h
index d89fc2f9ac9168ba09cd55ee03b389eca56973be..2545f07b48ddabfa6793f1d9eb01911542f4198e 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h
@@ -128,8 +128,8 @@ void *handleTensorReduceApproximationTuples(
       RC->reset_profiler();
       if (is_half) {
         RC->addToCurrentIterationComputeTime("tensorReduceHalf", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy(
-            "tensorReduceHalf", pinfo.second);
+        RC->addToCurrentIterationComputeEnergy("tensorReduceHalf",
+                                               pinfo.second);
       } else {
         RC->addToCurrentIterationComputeTime("tensorReduce", pinfo.first);
         RC->addToCurrentIterationComputeEnergy("tensorReduce", pinfo.second);
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
index 0b91030b717d257664ef2cb1bf8e06bd2bcc9508..138ddd0887b57ce583b8f5cfeaba19ad7d20eb4e 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
@@ -3,51 +3,48 @@
 #ifndef APPROXHPVM_RUNTIME_UTILS
 #define APPROXHPVM_RUNTIME_UTILS
 
-
-#include "tensor_runtime.h"
 #include "configuration.h"
 #include "hpvm-rt-controller.h"
+#include "tensor_runtime.h"
 
 #include "approx_knob_utils.h"
 
 // Utilities header for ApproxHPVM runtime API (wrapper runtime API)
 
-void* handleTensorAddApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input, void* bias) {
+void *handleTensorAddApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, void *bias) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorAdd(input, bias);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfAdd(input, bias);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorAdd(input, bias);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfAdd(input, bias);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
       // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
@@ -60,44 +57,42 @@ void* handleTensorAddApproximationTuples(
   return NULL;
 }
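+
+// Every handle*ApproximationTuples function in this header follows the same
+// profiled-dispatch pattern (illustrative sketch, not a new API):
+//   RC->resume_profiler();
+//   void *t_out = kernel(args...);               // FP32 or FP16 variant
+//   RC->pause_profiler();
+//   std::pair<double, double> pinfo = RC->get_time_energy();
+//   RC->reset_profiler();
+//   RC->addToCurrentIterationComputeTime(name, pinfo.first);
+//   RC->addToCurrentIterationComputeEnergy(name, pinfo.second);
+//   return t_out;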
 
-void* handleTensorMulApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* lhs, void* rhs) {
+void *handleTensorMulApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *lhs, void *rhs) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorGemmGPU(lhs, rhs);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfGemmGPU(lhs, rhs);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorGemmGPU(lhs, rhs);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfGemmGPU(lhs, rhs);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
       // TODO additional approx methods implemented here
-      }
+    }
   } else if (approxTuples.size() == 2) {
     ERROR("Currently unsupported case");
     abort();
@@ -108,102 +103,90 @@ void* handleTensorMulApproximationTuples(
   return NULL;
 }
 
-void* handleTensorConvApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input, void* filter, 
-  int conv_pad_h, int conv_pad_w,
-  int conv_stride_h, int conv_stride_w) {
+void *handleTensorConvApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, void *filter, int conv_pad_h, int conv_pad_w,
+    int conv_stride_h, int conv_stride_w) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorConvApprox(input, filter,
-                                 conv_pad_h, conv_pad_w,
-                                 conv_stride_h, conv_stride_w,
-                                 1, 1,
-                                 1, 1, 1, 1);
-	
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorConvApprox(input, filter, conv_pad_h, conv_pad_w,
+                               conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1);
 
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorConvApproxHalf2(input, filter,
-                                     conv_pad_h, conv_pad_w,
-                                     conv_stride_h, conv_stride_w,
-                                     1, 1,
-                                     1, 1, 1, 1);
-	
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out =
+          tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w,
+                                conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1);
 
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorConvApproxHalf", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::PERFORATION :
-      case GPUNodeConfiguration::APPROX::PERFORATION_HP :
-        {
-          PerfParams params = perfParamSet->getPerfParams(param);
-          //PerfParams params = PerfParamSet().getPerfParams(param);
-          INFO("perforation param = %i\n", param);
-          INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n",
-                params.row, params.col, params.skip_offset);
-          void* t_out;
-          RC->resume_profiler();
-          t_out = tensorConvApproxHalf2(input, filter,
-                                       conv_pad_h, conv_pad_w,
-                                       conv_stride_h, conv_stride_w,
-                                       1, 1,
-                                       params.row, params.col, 1, params.skip_offset);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApproxHalf", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf",
+                                             pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::PERFORATION:
+    case GPUNodeConfiguration::APPROX::PERFORATION_HP: {
+      PerfParams params = perfParamSet->getPerfParams(param);
+      // PerfParams params = PerfParamSet().getPerfParams(param);
+      INFO("perforation param = %i\n", param);
+      INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n",
+           params.row, params.col, params.skip_offset);
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorConvApproxHalf2(
+          input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w,
+          1, 1, params.row, params.col, 1, params.skip_offset);
 
-          RC->pause_profiler();
-          std::pair<double, double> pinfo = RC->get_time_energy();
-          RC->reset_profiler();
-          RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_perf)", pinfo.first);
-          RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_perf)", pinfo.second);
-          return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::INPUT_SAMPLING :
-      case GPUNodeConfiguration::APPROX::INPUT_SAMPLING_HP :
-        {
-          SampParams params = sampParamSet->getSampParams(param);
-          //SampParams params = SampParamSet().getSampParams(param);
-          INFO("sampling param = %i\n", param);
-          INFO("params.skip_rate = %i, params.skip_offset = %i\n",
-                params.skip_rate, params.skip_offset);
-          void* t_out;
-          RC->resume_profiler();
-          t_out = tensorConvApproxHalf2(input, filter,
-                                       conv_pad_h, conv_pad_w,
-                                       conv_stride_h, conv_stride_w,
-                                       1, 1,
-                                       1, 1,
-                                       params.skip_rate, params.skip_offset);
-          RC->pause_profiler();
-          std::pair<double, double> pinfo = RC->get_time_energy();
-          RC->reset_profiler();
-          RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_samp)", pinfo.first);
-          RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_samp)", pinfo.second);
-          return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_perf)",
+                                           pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_perf)",
+                                             pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::INPUT_SAMPLING:
+    case GPUNodeConfiguration::APPROX::INPUT_SAMPLING_HP: {
+      SampParams params = sampParamSet->getSampParams(param);
+      // SampParams params = SampParamSet().getSampParams(param);
+      INFO("sampling param = %i\n", param);
+      INFO("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate,
+           params.skip_offset);
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w,
+                                    conv_stride_h, conv_stride_w, 1, 1, 1, 1,
+                                    params.skip_rate, params.skip_offset);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_samp)",
+                                           pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_samp)",
+                                             pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
       // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
@@ -216,103 +199,99 @@ void* handleTensorConvApproximationTuples(
   return NULL;
 }
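+
+// Sketch of the knob decoding above (illustrative): one integer `param` from
+// the configuration selects the concrete approximation arguments, e.g.
+//   PerfParams p = perfParamSet->getPerfParams(param);   // perforation
+//   tensorConvApproxHalf2(..., /*row=*/p.row, /*col=*/p.col,
+//                         /*skip_every=*/1, /*offset=*/p.skip_offset);
+//   SampParams s = sampParamSet->getSampParams(param);   // input sampling
+//   tensorConvApproxHalf2(..., /*row=*/1, /*col=*/1,
+//                         /*skip_every=*/s.skip_rate,
+//                         /*offset=*/s.skip_offset);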
 
-void* handleTensorGroupConvApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input, void* filter,
-  int vertical_pad, int horizontal_pad,
-  int vertical_stride, int horizontal_stride,
-  int conv_mode, int conv_groups) {
+void *handleTensorGroupConvApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, void *filter, int vertical_pad, int horizontal_pad,
+    int vertical_stride, int horizontal_stride, int conv_mode,
+    int conv_groups) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorConvCutlass(input, filter,
-                                  vertical_pad, horizontal_pad,
-                                  vertical_stride, horizontal_stride,
-                                  conv_mode, conv_groups);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorConvCutlass", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorConvCutlass", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfConvCutlass(input, filter,
-                                      vertical_pad, horizontal_pad,
-                                      vertical_stride, horizontal_stride,
-                                      conv_mode, conv_groups);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfConvCutlass", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfConvCutlass", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorConvCutlass(input, filter, vertical_pad, horizontal_pad,
+                                vertical_stride, horizontal_stride, conv_mode,
+                                conv_groups);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvCutlass", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvCutlass", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfConvCutlass(input, filter, vertical_pad, horizontal_pad,
+                                    vertical_stride, horizontal_stride,
+                                    conv_mode, conv_groups);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfConvCutlass",
+                                           pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfConvCutlass",
+                                             pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorBatchNormApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input_ptr, void* gamma_ptr, void* beta_ptr,
-  void* mean_ptr, void* variance_ptr, double epsilon) {
+void *handleTensorBatchNormApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input_ptr, void *gamma_ptr, void *beta_ptr, void *mean_ptr,
+    void *variance_ptr, double epsilon) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr,
-                               mean_ptr, variance_ptr, epsilon);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr,
-                                   mean_ptr, variance_ptr, epsilon);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-    // TODO additional approx methods implemented here
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr,
+                              variance_ptr, epsilon);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr,
+                                  variance_ptr, epsilon);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm",
+                                             pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
+      // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
     ERROR("Currently unsupported case");
@@ -324,215 +303,202 @@ void* handleTensorBatchNormApproximationTuples(
   return NULL;
 }
 
-void* handleTensorReluApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input) {
+void *handleTensorReluApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorRelu(input);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfRelu(input);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorRelu(input);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfRelu(input);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorClippedReluApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input, float min, float max) {
+void *handleTensorClippedReluApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, float min, float max) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorRelu2(input, min, max);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfRelu2(input, min, max);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorRelu2(input, min, max);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfRelu2(input, min, max);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorTanhApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input) {
+void *handleTensorTanhApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorTanh(input);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfTanh(input);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorTanh(input);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfTanh(input);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorPoolingApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input_ptr, int poolFunction,
-  int window_height, int window_width,
-  int vertical_pad, int horizontal_pad,
-  int vertical_stride, int horizontal_stride) {
+void *handleTensorPoolingApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input_ptr, int poolFunction, int window_height, int window_width,
+    int vertical_pad, int horizontal_pad, int vertical_stride,
+    int horizontal_stride) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorPooling(input_ptr,
-                             poolFunction,
-                             window_height, window_width,
-                             vertical_pad, horizontal_pad,
-                             vertical_stride, horizontal_stride);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfPooling(input_ptr,
-                                 poolFunction,
-                                 window_height, window_width,
-                                 vertical_pad, horizontal_pad,
-                                 vertical_stride, horizontal_stride);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorPooling(input_ptr, poolFunction, window_height,
+                            window_width, vertical_pad, horizontal_pad,
+                            vertical_stride, horizontal_stride);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfPooling(input_ptr, poolFunction, window_height,
+                                window_width, vertical_pad, horizontal_pad,
+                                vertical_stride, horizontal_stride);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorSoftmaxApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input_ptr) {
-  //TODO: if approximation choices are added for softmax operation,
+void *handleTensorSoftmaxApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input_ptr) {
+  // TODO: if approximation choices are added for softmax operation,
   // implement this like the other handle* functions
-  void* t_out;
+  void *t_out;
   RC->resume_profiler();
   t_out = tensorSoftmax(input_ptr);
   RC->pause_profiler();
@@ -543,5 +509,4 @@ void* handleTensorSoftmaxApproximationTuples(
   return t_out;
 }
 
-
 #endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
index 2624ea43c12426edc3535471df5dafc0360b9a81..2067609c5a476291a27763b80a558da099e62e60 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
@@ -140,8 +140,8 @@ struct Configuration {
 // Comparison operator definition, in increasing accuracy loss
 // (for std sort, used in pareto optimal computation)
 struct ConfigurationLessThan {
-  bool operator()(
-      const struct Configuration &a, const struct Configuration &b) const;
+  bool operator()(const struct Configuration &a,
+                  const struct Configuration &b) const;
 };
 
 // Comparison operator definition, in increasing accuracy loss
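
For context, the comparator above is what the Pareto-front computation feeds to std::sort. A minimal sketch of that call site, assuming Configuration is copyable (its fields sit outside this hunk):

    #include <algorithm>
    #include <vector>

    // Orders candidate configurations by increasing accuracy loss, the
    // precondition for the pareto-optimal sweep mentioned above.
    void sortByAccuracyLoss(std::vector<struct Configuration> &configs) {
      std::sort(configs.begin(), configs.end(), ConfigurationLessThan{});
    }
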
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h
index 20a85478b18774c5add650bb273251a0722a35a4..7724a49edf2465ee5e3d9ed5568ef2d87f943030 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h
@@ -3,48 +3,49 @@
 #ifndef RUNTIME_DEBUG
 #define RUNTIME_DEBUG
 
-#define LOG_DEBUG 1   // Sets the debug logging to true
+#define LOG_DEBUG 1 // Sets the debug logging to true
 #define LOG_INFO 1  // Sets the info logging to true
 #define ASSERT_FLAG // Sets assertions to true (opposite of NDEBUG macro)
 
 #include "tensor.h"
-#include <sstream>
 #include <iostream>
-
 #include <sstream>
-#include <iostream>
+
 #include <cstdarg>
+#include <iostream>
+#include <sstream>
 
-#include <cudnn.h>
 #include <cublas_v2.h>
+#include <cudnn.h>
 #include <cufft.h>
 
-#define FatalError(s) do {                                             \
-    std::stringstream _where, _message;                                \
-    _where << __FILE__ << ':' << __LINE__;                             \
-    _message << std::string(s) + "\n" << __FILE__ << ':' << __LINE__;  \
-    std::cerr << _message.str() << "\nAborting...\n";                  \
-    cudaDeviceReset();                                                 \
-    exit(1);                                                           \
-} while(0)
-
-
-#define checkCUDNN(status) do {                                        \
-    std::stringstream _error;                                          \
-    if (status != CUDNN_STATUS_SUCCESS) {                              \
-      _error << "CUDNN failure: " << cudnnGetErrorString(status);      \
-      FatalError(_error.str());                                        \
-    }                                                                  \
-} while(0)
-
-
-#define checkCudaErrors(status) do {                                   \
-    std::stringstream _error;                                          \
-    if (status != 0) {                                                 \
-      _error << "Cuda failure: " << status;                            \
-      FatalError(_error.str());                                        \
-    }                                                                  \
-} while(0)
+#define FatalError(s)                                                          \
+  do {                                                                         \
+    std::stringstream _where, _message;                                        \
+    _where << __FILE__ << ':' << __LINE__;                                     \
+    _message << std::string(s) + "\n" << __FILE__ << ':' << __LINE__;          \
+    std::cerr << _message.str() << "\nAborting...\n";                          \
+    cudaDeviceReset();                                                         \
+    exit(1);                                                                   \
+  } while (0)
+
+#define checkCUDNN(status)                                                     \
+  do {                                                                         \
+    std::stringstream _error;                                                  \
+    if (status != CUDNN_STATUS_SUCCESS) {                                      \
+      _error << "CUDNN failure: " << cudnnGetErrorString(status);              \
+      FatalError(_error.str());                                                \
+    }                                                                          \
+  } while (0)
+
+#define checkCudaErrors(status)                                                \
+  do {                                                                         \
+    std::stringstream _error;                                                  \
+    if (status != 0) {                                                         \
+      _error << "Cuda failure: " << status;                                    \
+      FatalError(_error.str());                                                \
+    }                                                                          \
+  } while (0)
 
 void _checkCUBLAS(cublasStatus_t error, const char *file, int line);
 
@@ -58,33 +59,32 @@ void _checkCUDA(cudaError_t err, const char *file, int line);
 
 #define checkCUDA(err) _checkCUDA(err, __FILE__, __LINE__)
 
-void INFO(const char* format, ...);
-
-void DEBUG(const char* format, ...);
-
-void ERROR(const char* format, ...);
+void INFO(const char *format, ...);
 
+void DEBUG(const char *format, ...);
 
+void ERROR(const char *format, ...);
 
 #ifdef ASSERT_FLAG
-#define CUSTOM_ASSERT(x) do {                                 \
-  if (!(x)) {                                                 \
-    std::stringstream _message;                               \
-    _message << "Assertion failed at "                        \
-             << __FILE__ << ':' << __LINE__                   \
-             << " inside function " << __FUNCTION__ << "\n"   \
-             << "Condition: " << #x << "\n";                  \
-    std::cerr << _message.str();                              \
-    abort();                                                  \
-  }                                                           \
-} while (0)
+#define CUSTOM_ASSERT(x)                                                       \
+  do {                                                                         \
+    if (!(x)) {                                                                \
+      std::stringstream _message;                                              \
+      _message << "Assertion failed at " << __FILE__ << ':' << __LINE__        \
+               << " inside function " << __FUNCTION__ << "\n"                  \
+               << "Condition: " << #x << "\n";                                 \
+      std::cerr << _message.str();                                             \
+      abort();                                                                 \
+    }                                                                          \
+  } while (0)
 #else
-#define CUSTOM_ASSERT(x) do { } while (0)
+#define CUSTOM_ASSERT(x)                                                       \
+  do {                                                                         \
+  } while (0)
 #endif
 
-void fillOnes(struct Tensor* tensor);
-
-void printTensorDescInfo(struct Tensor* tensor);
+void fillOnes(struct Tensor *tensor);
 
+void printTensorDescInfo(struct Tensor *tensor);
 
 #endif
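
The reformatted macros keep their original semantics: each wraps a status check and funnels failures into FatalError, which prints file/line context and resets the device before exiting. A small illustrative sketch (the function itself is hypothetical; the cuDNN/CUDA calls are standard API):

    #include "debug.h"

    void createAndDestroyDescriptor() {
      cudnnTensorDescriptor_t desc;
      // Aborts with "CUDNN failure: ..." plus file/line if creation fails.
      checkCUDNN(cudnnCreateTensorDescriptor(&desc));
      // Aborts with "Cuda failure: <code>" on any non-zero status.
      checkCudaErrors(cudaDeviceSynchronize());
      // Compiled out entirely when ASSERT_FLAG is not defined.
      CUSTOM_ASSERT(desc != nullptr && "descriptor must be non-null");
      checkCUDNN(cudnnDestroyTensorDescriptor(desc));
    }
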
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h
index 6429ed1a6e695ff4ed5ad927b93d2f74ac82ae63..a3d51141acd9e45d3231689a39f43e97fbeb0a9f 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h
@@ -2,91 +2,78 @@
 #ifndef ERROR_HEADER
 #define ERROR_HEADER
 
-
 #include "debug.h"
 
+extern "C" {
 
-extern "C"{
-  
-void readSkipTensors(int* skip_tensor_ids, int op_count);
+void readSkipTensors(int *skip_tensor_ids, int op_count);
 
-void readOpenTunerFlags(const char* file_name);
+void readOpenTunerFlags(const char *file_name);
 
-void readQuantRanges(char* file_name);
+void readQuantRanges(char *file_name);
 
-Norm_t* calculateNorms(Tensor* x, Tensor* x_orig);
+Norm_t *calculateNorms(Tensor *x, Tensor *x_orig);
 
-Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig);
+Norm_t *calculateNorms2(Tensor *x, Tensor *x_orig);
 
-__global__ void normComputeKernel(float* A, float * B,
-				  double* l1_A, double* l2_A,
-				  double* l1_diff, double* l2_diff,
-				  unsigned int n);
+__global__ void normComputeKernel(float *A, float *B, double *l1_A,
+                                  double *l2_A, double *l1_diff,
+                                  double *l2_diff, unsigned int n);
 
 __inline__ __device__ double warpReduceSum(double val);
 
 __inline__ __device__ double blockReduceSum(double val);
 
+__global__ void deviceReduceBlockAtomicKernel(float *A, float *B, int N,
+                                              double *A_l1, double *A_l2,
+                                              double *diff_l1, double *diff_l2);
 
-__global__ void deviceReduceBlockAtomicKernel(float* A, float* B, int N,
-					      double* A_l1, double* A_l2,
-					      double* diff_l1, double* diff_l2);
-
-void deviceReduce(float* A, float* B, int N,
-		  double* A_l1, double* A_l2,
-		  double* diff_l1, double* diff_l2);
+void deviceReduce(float *A, float *B, int N, double *A_l1, double *A_l2,
+                  double *diff_l1, double *diff_l2);
 
 // Compute Norms on the GPU
-Norm_t* calculateNormsTreeReduction(Tensor* x, Tensor* x_orig);
+Norm_t *calculateNormsTreeReduction(Tensor *x, Tensor *x_orig);
 
 // Compute Norms on the GPU
-Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig);
+Norm_t *calculateNormsGPU(Tensor *x, Tensor *x_orig);
 
-__global__ void vecConstMul(float* A, float mul_factor, int n);
+__global__ void vecConstMul(float *A, float mul_factor, int n);
 
-__global__ void vecRound(float* A, int n);
+__global__ void vecRound(float *A, int n);
 
-__global__ void vecConstDiv(float* A, float div_factor, int n);
+__global__ void vecConstDiv(float *A, float div_factor, int n);
 
-__global__ void vecMul(float* A, float* B, int n);
+__global__ void vecMul(float *A, float *B, int n);
 
 /****  ERROR injection routines  ******/
-void initRandValues(Tensor* bias, int error_scale);
+void initRandValues(Tensor *bias, int error_scale);
 
-void initRandValues2(Tensor* bias, int error_scale);
+void initRandValues2(Tensor *bias, int error_scale);
 
-void* addBitError(void* x_ptr, int error_scale);
+void *addBitError(void *x_ptr, int error_scale);
 
-void randomCeilAndFloor(float* x, size_t num_elems);
+void randomCeilAndFloor(float *x, size_t num_elems);
 
 // Routine for Adding RoundOff Errors
-void* addRoundError(void* x_ptr, int error_scale);
-
+void *addRoundError(void *x_ptr, int error_scale);
 
 // Routine for Adding Gaussian Error
-void* addGaussianError(void* x_ptr, int error_scale);
-
-void initPromiseRandValues(Tensor* bias, int error_scale);
+void *addGaussianError(void *x_ptr, int error_scale);
 
+void initPromiseRandValues(Tensor *bias, int error_scale);
 
 // NOTE: Assumption is that x_ptr is FP32 tensor - doesn't work with FP16
 // Routine for Adding PROMISE bitline swing error
-void* addPromiseError(void* x_ptr, int error_scale);
-
-__global__ void quantizeAndClip(float* A, int n,
-				float mul_factor,
-				float min, float max);
+void *addPromiseError(void *x_ptr, int error_scale);
 
-__global__ void quantizeElem(float* A, int n,
-			     float mul_factor,
-			     float min);
+__global__ void quantizeAndClip(float *A, int n, float mul_factor, float min,
+                                float max);
 
-void* quantizeTensorPromise(void* input_ptr,
-			    float min, float max);
+__global__ void quantizeElem(float *A, int n, float mul_factor, float min);
 
-void* tensorAddError(void* x_ptr, int error_scale);
+void *quantizeTensorPromise(void *input_ptr, float min, float max);
 
+void *tensorAddError(void *x_ptr, int error_scale);
 }
 
-
 #endif
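
Of the routines above, quantizeTensorPromise and the quantizeAndClip kernel implement the PROMISE-style uniform quantization. Below is a host-side sketch of the arithmetic the kernel's signature suggests; the exact device computation is an assumption inferred from the parameter names, not the verified implementation:

    #include <algorithm>
    #include <cmath>

    void quantizeAndClipHost(float *A, int n, float mul_factor, float min,
                             float max) {
      for (int i = 0; i < n; ++i) {
        float v = std::min(std::max(A[i], min), max); // clip into [min, max]
        float q = std::round((v - min) * mul_factor); // scale to integer grid
        A[i] = min + q / mul_factor;                  // map back to real values
      }
    }
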
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_conversion.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_conversion.h
index 4c2fbe806d1758118f6d55c079f9c75de42599d8..0b80e043e327e5dec5169dcd0bde092313a1bdc9 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_conversion.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_conversion.h
@@ -24,101 +24,90 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-// This code modified from the public domain code here: 
+// This code is modified from the public domain code here:
 // https://gist.github.com/rygorous/2156668
 // The URL above includes more robust conversion routines
-// that handle Inf and NaN correctly. 
-// 
+// that handle Inf and NaN correctly.
+//
 // It is recommended to use the more robust versions in production code.
 
-
 #ifndef FP16_CONV_HEADER
 #define FP16_CONV_HEADER
 
-
-
 typedef unsigned uint;
 
-union FP32
-{
-    uint u;
-    float f;
-    struct
-    {
-        uint Mantissa : 23;
-        uint Exponent : 8;
-        uint Sign : 1;
-    };
+union FP32 {
+  uint u;
+  float f;
+  struct {
+    uint Mantissa : 23;
+    uint Exponent : 8;
+    uint Sign : 1;
+  };
 };
 
-union FP16
-{
-    unsigned short u;
-    struct
-    {
-        uint Mantissa : 10;
-        uint Exponent : 5;
-        uint Sign : 1;
-    };
+union FP16 {
+  unsigned short u;
+  struct {
+    uint Mantissa : 10;
+    uint Exponent : 5;
+    uint Sign : 1;
+  };
 };
 
 // Approximate solution. This is faster but converts some sNaNs to
 // infinity and doesn't round correctly. Handle with care.
-static half approx_float_to_half(float fl)
-{
-    FP32 f32infty = { 255 << 23 };
-    FP32 f16max = { (127 + 16) << 23 };
-    FP32 magic = { 15 << 23 };
-    FP32 expinf = { (255 ^ 31) << 23 };
-    uint sign_mask = 0x80000000u;
-    FP16 o = { 0 };
-
-    FP32 f = *((FP32*)&fl);
-
-    uint sign = f.u & sign_mask;
-    f.u ^= sign;
-
-    if (!(f.f < f32infty.u)) // Inf or NaN
-        o.u = f.u ^ expinf.u;
-    else
-    {
-        if (f.f > f16max.f) f.f = f16max.f;
-        f.f *= magic.f;
-    }
-
-    o.u = f.u >> 13; // Take the mantissa bits
-    o.u |= sign >> 16;
-    return *((half*)&o);
+static half approx_float_to_half(float fl) {
+  FP32 f32infty = {255 << 23};
+  FP32 f16max = {(127 + 16) << 23};
+  FP32 magic = {15 << 23};
+  FP32 expinf = {(255 ^ 31) << 23};
+  uint sign_mask = 0x80000000u;
+  FP16 o = {0};
+
+  FP32 f = *((FP32 *)&fl);
+
+  uint sign = f.u & sign_mask;
+  f.u ^= sign;
+
+  if (!(f.f < f32infty.u)) // Inf or NaN
+    o.u = f.u ^ expinf.u;
+  else {
+    if (f.f > f16max.f)
+      f.f = f16max.f;
+    f.f *= magic.f;
+  }
+
+  o.u = f.u >> 13; // Take the mantissa bits
+  o.u |= sign >> 16;
+  return *((half *)&o);
 }
 
 // from half->float code - just for verification.
-static float half_to_float(half hf)
-{
-    FP16 h = *((FP16*)&hf);
-
-    static const FP32 magic = { 113 << 23 };
-    static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
-    FP32 o;
-
-    o.u = (h.u & 0x7fff) << 13;     // exponent/mantissa bits
-    uint exp = shifted_exp & o.u;   // just the exponent
-    o.u += (127 - 15) << 23;        // exponent adjust
-
-    // handle exponent special cases
-    if (exp == shifted_exp) // Inf/NaN?
-        o.u += (128 - 16) << 23;    // extra exp adjust
-    else if (exp == 0) // Zero/Denormal?
-    {
-        o.u += 1 << 23;             // extra exp adjust
-        o.f -= magic.f;             // renormalize
-    }
-
-    o.u |= (h.u & 0x8000) << 16;    // sign bit
-    return o.f;
+static float half_to_float(half hf) {
+  FP16 h = *((FP16 *)&hf);
+
+  static const FP32 magic = {113 << 23};
+  static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
+  FP32 o;
+
+  o.u = (h.u & 0x7fff) << 13;   // exponent/mantissa bits
+  uint exp = shifted_exp & o.u; // just the exponent
+  o.u += (127 - 15) << 23;      // exponent adjust
+
+  // handle exponent special cases
+  if (exp == shifted_exp)    // Inf/NaN?
+    o.u += (128 - 16) << 23; // extra exp adjust
+  else if (exp == 0)         // Zero/Denormal?
+  {
+    o.u += 1 << 23; // extra exp adjust
+    o.f -= magic.f; // renormalize
+  }
+
+  o.u |= (h.u & 0x8000) << 16; // sign bit
+  return o.f;
 }
 
-
-
 #endif
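
A round-trip sketch showing the precision the approximate converter retains; it assumes compilation with nvcc, since fp16_conversion.h expects the CUDA 'half' type to be defined already:

    #include <cmath>
    #include <cstdio>
    #include <cuda_fp16.h> // provides 'half' for fp16_conversion.h
    #include "fp16_conversion.h"

    int main() {
      float x = 0.333333f;
      half h = approx_float_to_half(x);
      float back = half_to_float(h);
      // FP16 keeps a 10-bit mantissa, so roughly 3 decimal digits survive.
      std::printf("%f -> %f (abs err %g)\n", x, back, std::fabs(back - x));
      return 0;
    }
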
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_emu.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_emu.h
index 64aee8231b54d52710192fc7d598d6ed162f1338..8056b2b9071b8793b3d5e85d520c87dd1631035c 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_emu.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_emu.h
@@ -52,16 +52,16 @@
 #if !defined(_FP16_EMU_H_)
 #define _FP16_EMU_H_
 
-#include <driver_types.h>
 #include <cuda_fp16.h>
+#include <driver_types.h>
 
 // Necessary to ensure visibility of CUDART_VERSION macro
 #include <cuda_runtime_api.h>
 
 // Definition of '__half_raw' was not provided before CUDA 9.0.
-// '__half_raw' is our type where the unsigned 16-bit integer 
+// '__half_raw' is our type where the unsigned 16-bit integer
 // data member 'x' can be accessed in both CUDA 9.0 and 8.0.
-#if CUDART_VERSION < 9000 
+#if CUDART_VERSION < 9000
 typedef __half __half_raw;
 #endif
 
@@ -69,206 +69,174 @@ typedef __half __half_raw;
 typedef __half half1;
 
 #define HLF_EPSILON 4.887581E-04
-#define HLF_MIN     6.103516E-05
-#define HLF_MAX     6.550400E+04
+#define HLF_MIN 6.103516E-05
+#define HLF_MAX 6.550400E+04
 
 half1 cpu_float2half_rn(float f);
 
 float cpu_half2float(half1 h);
 
-static __inline__ __device__ __host__ half1 habs(half1 h)
-{
-    __half_raw hr = reinterpret_cast<__half_raw&>(h);
-    hr.x &= 0x7fffU;
-    return reinterpret_cast<half1&>(hr);
+static __inline__ __device__ __host__ half1 habs(half1 h) {
+  __half_raw hr = reinterpret_cast<__half_raw &>(h);
+  hr.x &= 0x7fffU;
+  return reinterpret_cast<half1 &>(hr);
 }
 
-static __inline__ __device__ __host__ half1 hneg(half1 h)
-{
-    __half_raw hr = reinterpret_cast<__half_raw&>(h);
-    hr.x ^= 0x8000U;
-    return reinterpret_cast<half1&>(hr);
+static __inline__ __device__ __host__ half1 hneg(half1 h) {
+  __half_raw hr = reinterpret_cast<__half_raw &>(h);
+  hr.x ^= 0x8000U;
+  return reinterpret_cast<half1 &>(hr);
 }
 
-static __inline__ __device__ __host__ int ishnan(half1 h)
-{
-    // When input is NaN, exponent is all ones and mantissa is non-zero.
-    __half_raw hr = reinterpret_cast<__half_raw&>(h);
-    return (hr.x & 0x7c00U) == 0x7c00U && (hr.x & 0x03ffU) != 0;
+static __inline__ __device__ __host__ int ishnan(half1 h) {
+  // When input is NaN, exponent is all ones and mantissa is non-zero.
+  __half_raw hr = reinterpret_cast<__half_raw &>(h);
+  return (hr.x & 0x7c00U) == 0x7c00U && (hr.x & 0x03ffU) != 0;
 }
 
-static __inline__ __device__ __host__ int ishinf(half1 h)
-{
-    // When input is +/- inf, exponent is all ones and mantissa is zero.
-    __half_raw hr = reinterpret_cast<__half_raw&>(h);
-    return (hr.x & 0x7c00U) == 0x7c00U && (hr.x & 0x03ffU) == 0;
+static __inline__ __device__ __host__ int ishinf(half1 h) {
+  // When input is +/- inf, exponent is all ones and mantissa is zero.
+  __half_raw hr = reinterpret_cast<__half_raw &>(h);
+  return (hr.x & 0x7c00U) == 0x7c00U && (hr.x & 0x03ffU) == 0;
 }
 
-static __inline__ __device__ __host__ int ishequ(half1 x, half1 y)
-{
-    __half_raw xr = reinterpret_cast<__half_raw&>(x);
-    __half_raw yr = reinterpret_cast<__half_raw&>(y);
-    return ishnan(x) == 0 && ishnan(y) == 0 && xr.x == yr.x;
+static __inline__ __device__ __host__ int ishequ(half1 x, half1 y) {
+  __half_raw xr = reinterpret_cast<__half_raw &>(x);
+  __half_raw yr = reinterpret_cast<__half_raw &>(y);
+  return ishnan(x) == 0 && ishnan(y) == 0 && xr.x == yr.x;
 }
 
 // Returns 0.0000 in FP16 binary form
-static __inline__ __device__ __host__ half1 hzero()
-{
-    __half_raw hr;
-    hr.x = 0x0000U;
-    return reinterpret_cast<half1&>(hr);
+static __inline__ __device__ __host__ half1 hzero() {
+  __half_raw hr;
+  hr.x = 0x0000U;
+  return reinterpret_cast<half1 &>(hr);
 }
 
 // Returns 1.0000 in FP16 binary form
-static __inline__ __device__ __host__ half1 hone()
-{
-    __half_raw hr;
-    hr.x = 0x3c00U;
-    return reinterpret_cast<half1&>(hr);
+static __inline__ __device__ __host__ half1 hone() {
+  __half_raw hr;
+  hr.x = 0x3c00U;
+  return reinterpret_cast<half1 &>(hr);
 }
 
 // Returns quiet NaN, the most significant fraction bit #9 is set
-static __inline__ __device__ __host__ half1 hnan()
-{
-    __half_raw hr;
-    hr.x = 0x7e00U;
-    return reinterpret_cast<half1&>(hr);
+static __inline__ __device__ __host__ half1 hnan() {
+  __half_raw hr;
+  hr.x = 0x7e00U;
+  return reinterpret_cast<half1 &>(hr);
 }
 
 // Largest positive FP16 value, corresponds to 6.5504e+04
-static __inline__ __device__ __host__ half1 hmax()
-{
-    // Exponent all ones except LSB (0x1e), mantissa is all ones (0x3ff)
-    __half_raw hr;
-    hr.x = 0x7bffU;
-    return reinterpret_cast<half1&>(hr);
+static __inline__ __device__ __host__ half1 hmax() {
+  // Exponent all ones except LSB (0x1e), mantissa is all ones (0x3ff)
+  __half_raw hr;
+  hr.x = 0x7bffU;
+  return reinterpret_cast<half1 &>(hr);
 }
 
 // Smallest positive (normalized) FP16 value, corresponds to 6.1035e-05
-static __inline__ __device__ __host__ half1 hmin()
-{
-    // Exponent is 0x01 (5 bits), mantissa is all zeros (10 bits)
-    __half_raw hr;
-    hr.x = 0x0400U;
-    return reinterpret_cast<half1&>(hr);
+static __inline__ __device__ __host__ half1 hmin() {
+  // Exponent is 0x01 (5 bits), mantissa is all zeros (10 bits)
+  __half_raw hr;
+  hr.x = 0x0400U;
+  return reinterpret_cast<half1 &>(hr);
 }
 
-
-
-
-
-
-
-
-
-
-
-#define STATIC_ASSERT(cond) do { typedef char compile_time_assert[(cond) ? 1 : -1]; } while (0)
+#define STATIC_ASSERT(cond)                                                    \
+  do {                                                                         \
+    typedef char compile_time_assert[(cond) ? 1 : -1];                         \
+  } while (0)
 
 // Host functions for converting between FP32 and FP16 formats
 // Paulius Micikevicius (pauliusm@nvidia.com)
 
-half1 cpu_float2half_rn(float f)
-{
-    unsigned x = *((int*)(void*)(&f));
-    unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
-    unsigned sign, exponent, mantissa;
-
-    __half_raw hr;
-
-    // Get rid of +NaN/-NaN case first.
-    if (u > 0x7f800000) {
-        hr.x = 0x7fffU;
-        return reinterpret_cast<half1&>(hr);
-    }
-  
-    sign = ((x >> 16) & 0x8000);
-  
-    // Get rid of +Inf/-Inf, +0/-0.
-    if (u > 0x477fefff) {
-        hr.x = sign | 0x7c00U;
-        return reinterpret_cast<half1&>(hr);
-    }
-    if (u < 0x33000001) {
-        hr.x = sign | 0x0000U;
-        return reinterpret_cast<half1&>(hr);
+half1 cpu_float2half_rn(float f) {
+  unsigned x = *((int *)(void *)(&f));
+  unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
+  unsigned sign, exponent, mantissa;
+
+  __half_raw hr;
+
+  // Get rid of +NaN/-NaN case first.
+  if (u > 0x7f800000) {
+    hr.x = 0x7fffU;
+    return reinterpret_cast<half1 &>(hr);
+  }
+
+  sign = ((x >> 16) & 0x8000);
+
+  // Get rid of +Inf/-Inf, +0/-0.
+  if (u > 0x477fefff) {
+    hr.x = sign | 0x7c00U;
+    return reinterpret_cast<half1 &>(hr);
+  }
+  if (u < 0x33000001) {
+    hr.x = sign | 0x0000U;
+    return reinterpret_cast<half1 &>(hr);
+  }
+
+  exponent = ((u >> 23) & 0xff);
+  mantissa = (u & 0x7fffff);
+
+  if (exponent > 0x70) {
+    shift = 13;
+    exponent -= 0x70;
+  } else {
+    shift = 0x7e - exponent;
+    exponent = 0;
+    mantissa |= 0x800000;
+  }
+  lsb = (1 << shift);
+  lsb_s1 = (lsb >> 1);
+  lsb_m1 = (lsb - 1);
+
+  // Round to nearest even.
+  remainder = (mantissa & lsb_m1);
+  mantissa >>= shift;
+  if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
+    ++mantissa;
+    if (!(mantissa & 0x3ff)) {
+      ++exponent;
+      mantissa = 0;
     }
+  }
 
-    exponent = ((u >> 23) & 0xff);
-    mantissa = (u & 0x7fffff);
+  hr.x = (sign | (exponent << 10) | mantissa);
 
-    if (exponent > 0x70) {
-        shift = 13;
-        exponent -= 0x70;
-    } else {
-        shift = 0x7e - exponent;
-        exponent = 0;
-        mantissa |= 0x800000;
-    }
-    lsb = (1 << shift);
-    lsb_s1 = (lsb >> 1);
-    lsb_m1 = (lsb - 1);
-  
-    // Round to nearest even.
-    remainder = (mantissa & lsb_m1);
-    mantissa >>= shift;
-    if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
-        ++mantissa;
-        if (!(mantissa & 0x3ff)) {
-            ++exponent;
-            mantissa = 0;
-        }
-    }  
-
-    hr.x = (sign | (exponent << 10) | mantissa);  
-
-    return reinterpret_cast<half1&>(hr);
+  return reinterpret_cast<half1 &>(hr);
 }
 
-
-float cpu_half2float(half1 h)
-{
-    STATIC_ASSERT(sizeof(int) == sizeof(float));
-
-    __half_raw hr = reinterpret_cast<__half_raw&>(h);
-
-    unsigned sign     = ((hr.x >> 15) & 1);
-    unsigned exponent = ((hr.x >> 10) & 0x1f);
-    unsigned mantissa = ((hr.x & 0x3ff) << 13);
-
-    if (exponent == 0x1f) {  /* NaN or Inf */
-        mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
-        exponent = 0xff;
-    } else if (!exponent) {  /* Denorm or Zero */
-        if (mantissa) {
-            unsigned int msb;
-            exponent = 0x71;
-            do {
-                msb = (mantissa & 0x400000);
-                mantissa <<= 1;  /* normalize */
-                --exponent;
-            } while (!msb);
-            mantissa &= 0x7fffff;  /* 1.mantissa is implicit */
-        }
-    } else {
-        exponent += 0x70;
+float cpu_half2float(half1 h) {
+  STATIC_ASSERT(sizeof(int) == sizeof(float));
+
+  __half_raw hr = reinterpret_cast<__half_raw &>(h);
+
+  unsigned sign = ((hr.x >> 15) & 1);
+  unsigned exponent = ((hr.x >> 10) & 0x1f);
+  unsigned mantissa = ((hr.x & 0x3ff) << 13);
+
+  if (exponent == 0x1f) { /* NaN or Inf */
+    mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
+    exponent = 0xff;
+  } else if (!exponent) { /* Denorm or Zero */
+    if (mantissa) {
+      unsigned int msb;
+      exponent = 0x71;
+      do {
+        msb = (mantissa & 0x400000);
+        mantissa <<= 1; /* normalize */
+        --exponent;
+      } while (!msb);
+      mantissa &= 0x7fffff; /* 1.mantissa is implicit */
     }
+  } else {
+    exponent += 0x70;
+  }
 
-    int temp = ((sign << 31) | (exponent << 23) | mantissa);
+  int temp = ((sign << 31) | (exponent << 23) | mantissa);
 
-    return reinterpret_cast<float&>(temp);
+  return reinterpret_cast<float &>(temp);
 }
 
-
-
-
-
-
-
-#endif  // _FP16_EMU_H_
-
-
-
-
-
-
+#endif // _FP16_EMU_H_
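
A quick host-side exercise of the emulation helpers above, compiled with nvcc; the values in the comments are what FP16 semantics predict:

    #include <cstdio>
    #include "fp16_emu.h"

    int main() {
      half1 one = hone();
      std::printf("hone            -> %f\n", cpu_half2float(one)); // 1.000000
      std::printf("habs(hneg(one)) -> %f\n",
                  cpu_half2float(habs(hneg(one))));                // 1.000000
      half1 pi = cpu_float2half_rn(3.14159f);
      std::printf("pi              -> %f\n", cpu_half2float(pi));  // 3.140625
      std::printf("ishinf(hmax())  =  %d\n", ishinf(hmax()));      // 0 (finite)
      return 0;
    }
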
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_gemm.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_gemm.h
index 7251d9dca2ed60a8030451bb3b6a858840d9b1c4..057b7f8b869f2a40d5f76345302dfb8df5235f1f 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_gemm.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_gemm.h
@@ -6,67 +6,45 @@
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
 
-
 inline cudaError_t checkCuda(cudaError_t result);
 inline cublasStatus_t checkCublas(cublasStatus_t result);
 
-
 template <typename T>
-inline void printArray(const T * const __restrict__ array,
+inline void printArray(const T *const __restrict__ array,
                        const unsigned elements);
 
 // initialization
 template <typename T>
-__global__ void initKernel(T * const __restrict__ array,
+__global__ void initKernel(T *const __restrict__ array,
                            const unsigned elements);
 
-
 template <typename T>
-void init(T * const __restrict__ array,
-          const unsigned elements);
-
+void init(T *const __restrict__ array, const unsigned elements);
 
 // float to half
-__global__ void f2hKernel(const float * const __restrict__ input,
+__global__ void f2hKernel(const float *const __restrict__ input,
                           const unsigned elements,
-                          half * const __restrict__ output);
-
-
-void f2h(const float * const __restrict__ input,
-         const unsigned elements,
-         half * const __restrict__ output);
+                          half *const __restrict__ output);
 
+void f2h(const float *const __restrict__ input, const unsigned elements,
+         half *const __restrict__ output);
 
 // half to float
-__global__ void h2fKernel(const half * const __restrict__ input,
+__global__ void h2fKernel(const half *const __restrict__ input,
                           const unsigned elements,
-                          float * const __restrict__ output);
-
-
-void h2f(const half * const __restrict__ input,
-         const unsigned elements,
-         float * const __restrict__ output);
-
-
-
-void sgemm(const float * const __restrict__ a,
-           const unsigned num_rows_a,
-           const unsigned num_cols_a,
-           const float * const __restrict__ b,
-           const unsigned num_rows_b,
-           const unsigned num_cols_b,
-           float * const __restrict__ c);
-
-
+                          float *const __restrict__ output);
 
-void hgemm(const float * const __restrict__ af,
-           const unsigned num_rows_a,
-           const unsigned num_cols_a,
-           const float * const __restrict__ bf,
-           const unsigned num_rows_b,
-           const unsigned num_cols_b,
-           float * const __restrict__ cf);
+void h2f(const half *const __restrict__ input, const unsigned elements,
+         float *const __restrict__ output);
 
+void sgemm(const float *const __restrict__ a, const unsigned num_rows_a,
+           const unsigned num_cols_a, const float *const __restrict__ b,
+           const unsigned num_rows_b, const unsigned num_cols_b,
+           float *const __restrict__ c);
 
+void hgemm(const float *const __restrict__ af, const unsigned num_rows_a,
+           const unsigned num_cols_a, const float *const __restrict__ bf,
+           const unsigned num_rows_b, const unsigned num_cols_b,
+           float *const __restrict__ cf);
 
 #endif
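
A hypothetical call sequence for the mixed-precision GEMM declared above. The buffer setup is an assumption, and the note about hgemm's internal f2h conversion is inferred from these declarations rather than verified:

    #include <cuda_runtime.h>

    void gemmSketch() {
      const unsigned M = 64, K = 32, N = 16;
      float *a, *b, *c;
      cudaMalloc(&a, M * K * sizeof(float)); // lhs, M x K
      cudaMalloc(&b, K * N * sizeof(float)); // rhs, K x N
      cudaMalloc(&c, M * N * sizeof(float)); // result, M x N
      init(a, M * K); // fill with the runtime's init kernel (defined elsewhere)
      init(b, K * N);
      // Presumably converts a/b to half via f2h, runs the FP16 GEMM, and
      // writes an FP32 result to c -- an assumption based on these headers.
      hgemm(a, M, K, b, K, N, c);
      cudaFree(a);
      cudaFree(b);
      cudaFree(c);
    }
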
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h
index c6c804fa00f1ae5eb324d6928d8f3c43b1231d14..54d919b3346047285bb0b89c2c8d97f625738183 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h
@@ -50,8 +50,8 @@ template <typename T, size_t N> using RepNTuple = RepNType<T, std::tuple, N>;
 
 namespace {
 template <typename TIterable, typename T, size_t... Is>
-constexpr RepNTuple<T, sizeof...(Is)>
-as_tuple(TIterable arr, std::index_sequence<Is...>) {
+constexpr RepNTuple<T, sizeof...(Is)> as_tuple(TIterable arr,
+                                               std::index_sequence<Is...>) {
   return std::make_tuple(arr[Is]...);
 }
 
@@ -76,7 +76,7 @@ __device__ auto call_on_tuple(Function f, Tuple t) {
   return call(f, t, std::make_index_sequence<size>{});
 }
 
-// Expands Array of type T and size N into parameters of Function 
+// Expands Array of type T and size N into parameters of Function
 template <typename Ret, typename T, size_t N>
 __device__ Ret call_on_c_array(NAToBF<Ret, T, N> f, const T arr[N]) {
   return call_on_tuple(f, as_tuple<const T *, T, N>(arr));
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
index e3a9b38af984eb220c0ce8aa14b9ffd65138c788..c91c5b9cc314df9536f2d5efc61c29032cca2c1a 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
@@ -2,35 +2,31 @@
 #ifndef GLOBAL_DATA_HEADER
 #define GLOBAL_DATA_HEADER
 
-
-#include <stdio.h>
-#include <stdarg.h>
 #include <cstdio>
 #include <cstdlib>
+#include <stdarg.h>
+#include <stdio.h>
 #include <unordered_set>
 
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
 
+#include <cublas_api.h>
 #include <cublas_v2.h>
 #include <cudnn.h>
-#include <cublas_api.h>
 #include <string>
 #include <unordered_map>
 
-#include "tensor.h"
 #include "approx_knob_utils.h"
-
+#include "tensor.h"
 
 #define ERROR_INJECTION_ENABLED 0
 #define PROMISE_MODE 1
 
-
 #ifdef NO_INJECTION
 #undef ERROR_INJECTION_ENABLED
 #endif
 
-
 //#define ERROR_INJECTION_ENABLED 1
 /* Data declarations */
 extern cudnnHandle_t cudnnHandle;
@@ -40,19 +36,17 @@ extern bool runtime_initialized;
 // NOTE: Layers Mode is True or Approxhpvm wrapper runtime mode
 extern bool approxhpvm_runtime_mode;
 
-
 extern int op_counter;
 extern int total_ops;
 
-
 // NOTE: Both vectors assume a linear CFG
 // FIXME: Each operation should have an ID passed to the runtime
 extern std::vector<int> op_accuracies;
-extern std::vector<Range*> quant_ranges;
+extern std::vector<Range *> quant_ranges;
 
-extern std::unordered_set<void*> tensors_ptr, host_ptr, obj_ptr;
+extern std::unordered_set<void *> tensors_ptr, host_ptr, obj_ptr;
 
-extern std::unordered_map<void*, int> tracked_tensors;
+extern std::unordered_map<void *, int> tracked_tensors;
 
 // Autotuning data
 extern std::unordered_map<int, int> skip_tensors;
@@ -61,8 +55,7 @@ extern std::unordered_map<int, int> skip_tensors;
 extern std::unordered_map<std::string, int> func_counters;
 extern std::string profile_data;
 
-extern PerfParamSet* perfParamSet;  
-extern SampParamSet* sampParamSet;
-
+extern PerfParamSet *perfParamSet;
+extern SampParamSet *sampParamSet;
 
 #endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/half_precision_api.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/half_precision_api.h
index 89e74f3bf8887be12b59bb1e40e3032760cba3de..7b907b16da9eb8eeac9fea5cc1d778da1274fe29 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/half_precision_api.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/half_precision_api.h
@@ -3,34 +3,27 @@
 #ifndef HALF_API_HEADER
 #define HALF_API_HEADER
 
+extern "C" {
 
-extern "C"{
+void *tensorHalfGemm(void *lhs_ptr, void *rhs_ptr);
+void *tensorHalfGemmGPU(void *lhs_ptr, void *rhs_ptr);
 
-void* tensorHalfGemm(void* lhs_ptr, void* rhs_ptr);
-void* tensorHalfGemmGPU(void* lhs_ptr, void* rhs_ptr);
+void *tensorHalfConvolution(void *input_ptr, void *filter_ptr, int vertical_pad,
+                            int horizontal_pad, int vertical_stride,
+                            int horizontal_stride, int conv_mode,
+                            int conv_groups);
 
+void *tensorHalfBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr,
+                          void *mean_ptr, void *variance_ptr, double epsilon);
 
-void* tensorHalfConvolution(void* input_ptr, void* filter_ptr,
-			    int vertical_pad, int horizontal_pad,
-			    int vertical_stride, int horizontal_stride,
-			    int conv_mode, int conv_groups);
-
-void* tensorHalfBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr,
-           		  void* mean_ptr, void* variance_ptr, double epsilon);
-
-
-void* tensorHalfPooling(void* input_ptr,
-			int poolFunction,
-			int window_height, int window_width,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride);
- 
-void* tensorHalfRelu2(void* input_ptr, float min, float max);
-void* tensorHalfRelu(void* input_ptr);
-void* tensorHalfTanh(void* input_ptr);
-void* tensorHalfAdd(void* x_ptr, void* bias_ptr);
-
+void *tensorHalfPooling(void *input_ptr, int poolFunction, int window_height,
+                        int window_width, int vertical_pad, int horizontal_pad,
+                        int vertical_stride, int horizontal_stride);
 
+void *tensorHalfRelu2(void *input_ptr, float min, float max);
+void *tensorHalfRelu(void *input_ptr);
+void *tensorHalfTanh(void *input_ptr);
+void *tensorHalfAdd(void *x_ptr, void *bias_ptr);
 }
 
 #endif
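
A hypothetical FP16 layer pipeline using the API above. The conv_mode value and the poolFunction code are assumptions, and input/filter/bias are the runtime's Tensor* handles passed through as void*:

    void *halfConvLayer(void *input, void *filter, void *bias) {
      void *conv = tensorHalfConvolution(input, filter,
                                         /*vertical_pad=*/1, /*horizontal_pad=*/1,
                                         /*vertical_stride=*/1,
                                         /*horizontal_stride=*/1,
                                         /*conv_mode=*/1, /*conv_groups=*/1);
      void *biased = tensorHalfAdd(conv, bias);
      void *act = tensorHalfRelu(biased);
      // poolFunction 0 assumed to select max pooling; 2x2 window, stride 2.
      return tensorHalfPooling(act, /*poolFunction=*/0, 2, 2, 0, 0, 2, 2);
    }
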
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
index f7d1018c9a2c7f25df642c8593b9612ae92dfa98..0a207edc51c6bf029d6a5100c8617e8d3e811b31 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
@@ -1,13 +1,13 @@
 #ifndef LLVM_HPVM_RT_CONTROLLER_H
 #define LLVM_HPVM_RT_CONTROLLER_H
 
+#include <cstdlib>
+#include <cstring>
 #include <fstream>
 #include <iostream>
 #include <map>
-#include <vector>
-#include <cstring>
-#include <cstdlib>
 #include <random>
+#include <vector>
 
 #include "configuration.h"
 
@@ -24,38 +24,38 @@
  * Check if a file exists
 * Return true if the file exists, false otherwise
  */
-bool fileExists(const std::string& file);
+bool fileExists(const std::string &file);
 
 class FrequencyIndexList {
-  private:
-    std::vector<int> idx_list;
-    unsigned rep_factor;
+private:
+  std::vector<int> idx_list;
+  unsigned rep_factor;
 
-    unsigned count;
-    unsigned idx;
+  unsigned count;
+  unsigned idx;
 
-  public:
-    FrequencyIndexList(std::vector<int>, unsigned);
-    unsigned getNextIndex();
+public:
+  FrequencyIndexList(std::vector<int>, unsigned);
+  unsigned getNextIndex();
 };
 
 class ProfileInfo {
-  private:
+private:
   // Members
-  double time_total;       // Total execution time of application
-  double time_compute;     // Compute
-  double time_control;     // Control
-  double time_config;      // Apply configuration
+  double time_total;   // Total execution time of application
+  double time_compute; // Compute
+  double time_control; // Control
+  double time_config;  // Apply configuration
 
-  double energy_total;     // Total energy consumed by applcation
-  double energy_compute;   // Compute
-  double energy_control;   // Control
-  double energy_config;    // Apply configuration
+  double energy_total;   // Total energy consumed by application
+  double energy_compute; // Compute
+  double energy_control; // Control
+  double energy_config;  // Apply configuration
 
   // Execution time of one loop iteration
-  double time_compute_current_iteration;   // Compute
-  double time_control_current_iteration;   // Control
-  double time_config_current_iteration;    // Apply configuration
+  double time_compute_current_iteration; // Compute
+  double time_control_current_iteration; // Control
+  double time_config_current_iteration;  // Apply configuration
 
   // Energy consumed by one loop iteration
   double energy_compute_current_iteration; // Compute
@@ -70,23 +70,24 @@ class ProfileInfo {
   // - per operation (inner vector)
   //                 (tensor operation for GPU, or whole layer for PROMISE)
   // is stored
-  std::vector< std::vector< std::pair< std::string, double > > > tensor_time_info;
-  std::vector< std::vector< std::pair< std::string, double > > > tensor_energy_info;
+  std::vector<std::vector<std::pair<std::string, double>>> tensor_time_info;
+  std::vector<std::vector<std::pair<std::string, double>>> tensor_energy_info;
 
-  // Vectors, where total compute time and energy information per iteration are stored
-  std::vector< double > compute_time_info;
-  std::vector< double > compute_energy_info;
+  // Vectors, where total compute time and energy information per iteration are
+  // stored
+  std::vector<double> compute_time_info;
+  std::vector<double> compute_energy_info;
 
   // Vectors, where control time and energy information per iteration are stored
-  std::vector< double > control_time_info;
-  std::vector< double > control_energy_info;
+  std::vector<double> control_time_info;
+  std::vector<double> control_energy_info;
 
   // Vectors, where config time and energy information per iteration are stored
-  std::vector< double > config_time_info;
-  std::vector< double > config_energy_info;
+  std::vector<double> config_time_info;
+  std::vector<double> config_energy_info;
 
   // Vector, where frequency information at the end of each iteration is stored
-  std::vector< unsigned long > frequency_info;
+  std::vector<unsigned long> frequency_info;
 
   bool in_iteration;
 
@@ -100,7 +101,7 @@ class ProfileInfo {
 
   void start_iteration();
 
-  public:
+public:
   void end_iteration();
 
   void addToCurrentIterationComputeTime(const char *s, double t);
@@ -132,21 +133,19 @@ class ProfileInfo {
   void printToFile();
 
   ProfileInfo();
-
 };
 
 class Slowdowns {
-  private:
+private:
   std::vector<float> slowdowns;
   unsigned idx;
 
-  public:
-    Slowdowns();
+public:
+  Slowdowns();
 
   unsigned getSlowdownsNumber();
 
   float getNextSlowdown();
-
 };
 
 class RuntimeController;
@@ -154,10 +153,10 @@ class RuntimeController;
 extern RuntimeController *RC;
 
 class RuntimeController {
-  private:
+private:
   // Members
   // Map from node names to quantization ranges
-  std::map<std::string, std::vector<float> > QuantizationMap;
+  std::map<std::string, std::vector<float>> QuantizationMap;
 
   // Configurations.
   // Configurations initially read - all generated from autotuner
@@ -197,7 +196,7 @@ class RuntimeController {
   // update the frequency of the Jetson board
   FrequencyIndexList *FIL;
 
-  //Functions
+  // Functions
 
   // Private functions of profiler
   void start_profiler();
@@ -210,7 +209,7 @@ class RuntimeController {
   void computeParetoConfigurationPoints();
   void compute3DParetoConfigurationPoints();
 
-  public:
+public:
   // For testing purposes only - do not use widely
   std::vector<struct Configuration *> &getSpeedupConfigurations();
   // For testing purposes only - do not use widely
@@ -281,25 +280,17 @@ class RuntimeController {
   std::pair<double, double> get_time_energy() const;
 
   // Exposing functionality of promise simulator
-  std::pair<double, double> fc_profile(const unsigned num_rows_a,
-                            const unsigned num_cols_a,
-                            const unsigned num_rows_b,
-                            const unsigned num_cols_b,
-                            const unsigned voltage_swing,
-                            const unsigned patch_factor);
-
-  std::pair<double, double> conv_profile(const unsigned n,
-                            const unsigned c,
-                            const unsigned h,
-                            const unsigned w,
-                            const unsigned c_out,
-                            const unsigned c_in,
-                            const unsigned k_h,
-                            const unsigned k_w,
-                            const unsigned s_h,
-                            const unsigned s_w,
-                            const unsigned voltage_swing,
-                            const unsigned patch_factor);
+  std::pair<double, double>
+  fc_profile(const unsigned num_rows_a, const unsigned num_cols_a,
+             const unsigned num_rows_b, const unsigned num_cols_b,
+             const unsigned voltage_swing, const unsigned patch_factor);
+
+  std::pair<double, double>
+  conv_profile(const unsigned n, const unsigned c, const unsigned h,
+               const unsigned w, const unsigned c_out, const unsigned c_in,
+               const unsigned k_h, const unsigned k_w, const unsigned s_h,
+               const unsigned s_w, const unsigned voltage_swing,
+               const unsigned patch_factor);
 
   // Constructor and destructor
   RuntimeController();
@@ -310,7 +301,6 @@ class RuntimeController {
   void printQuantizationMap();
   void printConfigurations(std::vector<struct Configuration> &);
   void printConfigurations(std::vector<struct Configuration *> &);
-
 };
 #define NODE_NAME_BUFFER_SIZE 10
 #define AL_THRESHOLD 0.01
@@ -318,9 +308,10 @@ class RuntimeController {
 
 //*** Methods to compute accuracy of a tensor by the runtime controller   ***//
 
-uint32_t* hpvm_rt_readLabelsBatch_cached(const char* labels_file, int start, int end);
+uint32_t *hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start,
+                                         int end);
 
 //*** Copied from dnn_sources/include/utils.h                             ***//
-float hpvm_rt_computeAccuracy3(uint32_t* labels, void* result_ptr);
+float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr);
 
 #endif
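
A sketch of the accuracy-measurement flow the last two declarations suggest; the label file name and batch bounds are placeholders:

    #include <cstdint>
    #include "hpvm-rt-controller.h"

    float checkBatchAccuracy(void *batch_output) {
      // Reads (and caches) labels [0, 500) from a placeholder label file.
      uint32_t *labels =
          hpvm_rt_readLabelsBatch_cached("labels.bin", /*start=*/0, /*end=*/500);
      // Compares the runtime's output tensor against the golden labels.
      return hpvm_rt_computeAccuracy3(labels, batch_output);
    }
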
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image.h
index f4b2b17f4b4b9af16cded2838fa0db31287ccbe4..da7337008d8d39d65a45ab906155ed409b35a991 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image.h
@@ -3,7 +3,8 @@
 
    Do this:
       #define STB_IMAGE_IMPLEMENTATION
-   before you include this file in *one* C or C++ file to create the implementation.
+   before you include this file in *one* C or C++ file to create the
+   implementation.
 
    // i.e. it should look like this:
    #include ...
@@ -13,15 +14,16 @@
    #include "stb_image.h"
 
    You can #define STBI_ASSERT(x) before the #include to avoid using assert.h.
-   And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free
+   And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using
+   malloc, realloc, free
 
 
    QUICK NOTES:
       Primarily of interest to game developers and other people who can
           avoid problematic images and only need the trivial interface
 
-      JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib)
-      PNG 1/2/4/8/16-bit-per-channel
+      JPEG baseline & progressive (12 bpc/arithmetic not supported, same as
+      stock IJG lib)
+      PNG 1/2/4/8/16-bit-per-channel
 
       TGA (not sure what subset, if a subset)
       BMP non-1bpp, non-RLE
@@ -51,21 +53,18 @@ RECENT REVISION HISTORY:
       2.23  (2019-08-11) fix clang static analysis warning
       2.22  (2019-03-04) gif fixes, fix warnings
       2.21  (2019-02-25) fix typo in comment
-      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs 
-      2.19  (2018-02-11) fix warning
-      2.18  (2018-01-30) fix warnings
+      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and
+                         platform ifdefs
+      2.19  (2018-02-11) fix warning
+      2.18  (2018-01-30) fix warnings
       2.17  (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings
-      2.16  (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes
-      2.15  (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC
-      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
-      2.13  (2016-12-04) experimental 16-bit API, only for PNG so far; fixes
-      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
-      2.11  (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64
-                         RGB-format JPEG; remove white matting in PSD;
-                         allocate large structures on the stack;
-                         correct channel count for PNG & BMP
-      2.10  (2016-01-22) avoid warning introduced in 2.09
-      2.09  (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED
+      2.16  (2017-07-23) all functions have 16-bit variants; optimizations;
+                         bugfixes
+      2.15  (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE
+                         detection on GCC
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet
+                         JPGs
+      2.13  (2016-12-04) experimental 16-bit API, only for PNG so far; fixes
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64
+                         RGB-format JPEG; remove white matting in PSD;
+                         allocate large structures on the stack;
+                         correct channel count for PNG & BMP
+      2.10  (2016-01-22) avoid warning introduced in 2.09
+      2.09  (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED
 
    See end of file for full revision history.
 
@@ -83,29 +82,29 @@ RECENT REVISION HISTORY:
     github:urraka (animated gif)           Junggon Kim (PNM comments)
     Christopher Forseth (animated gif)     Daniel Gibson (16-bit TGA)
                                            socks-the-fox (16-bit PNG)
-                                           Jeremy Sawicki (handle all ImageNet JPGs)
- Optimizations & bugfixes                  Mikhail Morozov (1-bit BMP)
+                                           Jeremy Sawicki (handle all ImageNet JPGs)
+ Optimizations & bugfixes                  Mikhail Morozov (1-bit BMP)
     Fabian "ryg" Giesen                    Anael Seghezzi (is-16-bit query)
     Arseny Kapoulkine
     John-Mark Allen
     Carmelo J Fdez-Aguera
 
  Bug & warning fixes
-    Marc LeBlanc            David Woo          Guillaume George   Martins Mozeiko
-    Christpher Lloyd        Jerry Jansson      Joseph Thomson     Phil Jordan
-    Dave Moore              Roy Eltham         Hayaki Saito       Nathan Reed
+    Marc LeBlanc            David Woo          Guillaume George   Martins Mozeiko
+    Christpher Lloyd        Jerry Jansson      Joseph Thomson     Phil Jordan
+    Dave Moore              Roy Eltham         Hayaki Saito       Nathan Reed
     Won Chun                Luke Graham        Johan Duparc       Nick Verigakis
     the Horde3D community   Thomas Ruf         Ronny Chevalier    github:rlyeh
-    Janez Zemva             John Bartholomew   Michal Cichon      github:romigrou
+    Janez Zemva             John Bartholomew   Michal Cichon      github:romigrou
     Jonathan Blow           Ken Hamada         Tero Hanninen      github:svdijk
     Laurent Gomila          Cort Stratton      Sergio Gonzalez    github:snagar
     Aruelien Pocheville     Thibault Reuille   Cass Everitt       github:Zelex
     Ryamond Barbiero        Paul Du Bois       Engin Manap        github:grim210
     Aldo Culquicondor       Philipp Wiesemann  Dale Weiler        github:sammyhw
     Oriol Ferrer Mesia      Josh Tobin         Matthew Gregan     github:phprus
-    Julian Raschke          Gregory Mullen     Baldur Karlsson    github:poppolopoppo
-    Christian Floisand      Kevin Schmidt      JR Smith           github:darealshinji
-    Blazej Dariusz Roszkowski                                     github:Michaelangel007
+    Julian Raschke          Gregory Mullen     Baldur Karlsson    github:poppolopoppo
+    Christian Floisand      Kevin Schmidt      JR Smith           github:darealshinji
+    Blazej Dariusz Roszkowski                                     github:Michaelangel007
 */
 
 #ifndef STBI_INCLUDE_STB_IMAGE_H
@@ -124,14 +123,15 @@ RECENT REVISION HISTORY:
 //    // ... process data if not NULL ...
 //    // ... x = width, y = height, n = # 8-bit components per pixel ...
 //    // ... replace '0' with '1'..'4' to force that many components per pixel
-//    // ... but 'n' will always be the number that it would have been if you said 0
-//    stbi_image_free(data)
+//    // ... but 'n' will always be the number that it would have been if
+//    // ... you said 0
+//    stbi_image_free(data)
 //
 // Standard parameters:
 //    int *x                 -- outputs image width in pixels
 //    int *y                 -- outputs image height in pixels
 //    int *channels_in_file  -- outputs # of image components in image file
-//    int desired_channels   -- if non-zero, # of image components requested in result
+//    int desired_channels   -- if non-zero, # of image components requested
+//                              in result
 //
 // The return value from an image loader is an 'unsigned char *' which points
 // to the pixel data, or NULL on an allocation failure or if the image is
@@ -159,8 +159,8 @@ RECENT REVISION HISTORY:
 // and *x, *y, *channels_in_file will be unchanged. The function
 // stbi_failure_reason() can be queried for an extremely brief, end-user
 // unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS
-// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly
-// more user-friendly ones.
+// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get
+// slightly more user-friendly ones.
 //
 // Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
 //
@@ -184,11 +184,12 @@ RECENT REVISION HISTORY:
 //    2. easy to maintain
 //    3. good performance
 //
-// Sometimes I let "good performance" creep up in priority over "easy to maintain",
-// and for best performance I may provide less-easy-to-use APIs that give higher
-// performance, in addition to the easy-to-use ones. Nevertheless, it's important
-// to keep in mind that from the standpoint of you, a client of this library,
-// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all.
+// Sometimes I let "good performance" creep up in priority over "easy to
+// maintain", and for best performance I may provide less-easy-to-use APIs that
+// give higher performance, in addition to the easy-to-use ones. Nevertheless,
+// it's important to keep in mind that from the standpoint of you, a client of
+// this library, all you care about is #1 and #3, and stb libraries DO NOT
+// emphasize #3 above all.
 //
 // Some secondary priorities arise directly from the first two, some of which
 // provide more explicit reasons why performance can't be emphasized.
@@ -207,7 +208,8 @@ RECENT REVISION HISTORY:
 // overhead.
 //
 // The three functions you must define are "read" (reads some bytes of data),
-// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end).
+// "skip" (skips some bytes of data), "eof" (reports if the stream is at the
+// end).
 //
 // ===========================================================================
 //
@@ -235,10 +237,11 @@ RECENT REVISION HISTORY:
 // HDR image support   (disable by defining STBI_NO_HDR)
 //
 // stb_image supports loading HDR images in general, and currently the Radiance
-// .HDR file format specifically. You can still load any file through the existing
-// interface; if you attempt to load an HDR file, it will be automatically remapped
-// to LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1;
-// both of these constants can be reconfigured through this interface:
+// .HDR file format specifically. You can still load any file through the
+// existing interface; if you attempt to load an HDR file, it will be
+// automatically remapped to LDR, assuming gamma 2.2 and an arbitrary scale
+// factor defaulting to 1; both of these constants can be reconfigured through
+// this interface:
 //
 //     stbi_hdr_to_ldr_gamma(2.2f);
 //     stbi_hdr_to_ldr_scale(1.0f);
@@ -316,21 +319,19 @@ RECENT REVISION HISTORY:
 //     want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
 //
 
-
 #ifndef STBI_NO_STDIO
 #include <stdio.h>
 #endif // STBI_NO_STDIO
 
 #define STBI_VERSION 1
 
-enum
-{
-   STBI_default = 0, // only used for desired_channels
+enum {
+  STBI_default = 0, // only used for desired_channels
 
-   STBI_grey       = 1,
-   STBI_grey_alpha = 2,
-   STBI_rgb        = 3,
-   STBI_rgb_alpha  = 4
+  STBI_grey = 1,
+  STBI_grey_alpha = 2,
+  STBI_rgb = 3,
+  STBI_rgb_alpha = 4
 };
 
 #include <stdlib.h>
@@ -358,11 +359,13 @@ extern "C" {
 // load image by filename, open file, or memory buffer
 //
 
-typedef struct
-{
-   int      (*read)  (void *user,char *data,int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read
-   void     (*skip)  (void *user,int n);                 // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
-   int      (*eof)   (void *user);                       // returns nonzero if we are at end of file/data
+typedef struct {
+  int (*read)(void *user, char *data,
+              int size); // fill 'data' with 'size' bytes.  return number of
+                         // bytes actually read
+  void (*skip)(void *user, int n); // skip the next 'n' bytes, or 'unget' the
+                                   // last -n bytes if negative
+  int (*eof)(void *user); // returns nonzero if we are at end of file/data
 } stbi_io_callbacks;
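
The callback interface above maps cleanly onto a FILE*; a minimal sketch using only standard C stdio (the helper names are illustrative):

    #include <cstdio>

    // Adapts a FILE* to the stbi_io_callbacks interface defined above.
    static int file_read(void *user, char *data, int size) {
      return (int)std::fread(data, 1, size, (FILE *)user);
    }
    static void file_skip(void *user, int n) {
      std::fseek((FILE *)user, n, SEEK_CUR);
    }
    static int file_eof(void *user) { return std::feof((FILE *)user); }

    static const stbi_io_callbacks file_callbacks = {file_read, file_skip,
                                                     file_eof};
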
 
 ////////////////////////////////////
@@ -370,21 +373,33 @@ typedef struct
 // 8-bits-per-channel interface
 //
 
-STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *channels_in_file, int desired_channels);
-STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x,
+                                       int *y, int *channels_in_file,
+                                       int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk,
+                                          void *user, int *x, int *y,
+                                          int *channels_in_file,
+                                          int desired_channels);
 
 #ifndef STBI_NO_STDIO
-STBIDEF stbi_uc *stbi_load            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
-STBIDEF stbi_uc *stbi_load_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
-// for stbi_load_from_file, file pointer is left pointing immediately after image
+STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y,
+                           int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y,
+                                     int *channels_in_file,
+                                     int desired_channels);
+// for stbi_load_from_file, file pointer is left pointing immediately after
+// image
 #endif
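
A sketch of typical use of this interface ("image.png" is a placeholder): desired_channels == 0 requests the file's native channel count, which comes back in channels_in_file, and the pixels are released with stbi_image_free:

    #include <stdio.h>

    int x, y, n;
    stbi_uc *pixels = stbi_load("image.png", &x, &y, &n, 0);
    if (pixels == NULL) {
      fprintf(stderr, "load failed: %s\n", stbi_failure_reason());
    } else {
      /* x*y pixels, each with n interleaved 8-bit channels */
      stbi_image_free(pixels);
    }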
 
 #ifndef STBI_NO_GIF
-STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len,
+                                           int **delays, int *x, int *y, int *z,
+                                           int *comp, int req_comp);
 #endif
 
 #ifdef STBI_WINDOWS_UTF8
-STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
+STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen,
+                                       const wchar_t *input);
 #endif
 
 ////////////////////////////////////
@@ -392,12 +407,20 @@ STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wch
 // 16-bits-per-channel interface
 //
 
-STBIDEF stbi_us *stbi_load_16_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
-STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len,
+                                          int *x, int *y, int *channels_in_file,
+                                          int desired_channels);
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk,
+                                             void *user, int *x, int *y,
+                                             int *channels_in_file,
+                                             int desired_channels);
 
 #ifndef STBI_NO_STDIO
-STBIDEF stbi_us *stbi_load_16          (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
-STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y,
+                              int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y,
+                                        int *channels_in_file,
+                                        int desired_channels);
 #endif
 
 ////////////////////////////////////
@@ -405,78 +428,96 @@ STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_i
 // float-per-channel interface
 //
 #ifndef STBI_NO_LINEAR
-   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
-   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y,  int *channels_in_file, int desired_channels);
+STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x,
+                                      int *y, int *channels_in_file,
+                                      int desired_channels);
+STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk,
+                                         void *user, int *x, int *y,
+                                         int *channels_in_file,
+                                         int desired_channels);
 
-   #ifndef STBI_NO_STDIO
-   STBIDEF float *stbi_loadf            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
-   STBIDEF float *stbi_loadf_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
-   #endif
+#ifndef STBI_NO_STDIO
+STBIDEF float *stbi_loadf(char const *filename, int *x, int *y,
+                          int *channels_in_file, int desired_channels);
+STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y,
+                                    int *channels_in_file,
+                                    int desired_channels);
+#endif
 #endif
 
 #ifndef STBI_NO_HDR
-   STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma);
-   STBIDEF void   stbi_hdr_to_ldr_scale(float scale);
+STBIDEF void stbi_hdr_to_ldr_gamma(float gamma);
+STBIDEF void stbi_hdr_to_ldr_scale(float scale);
 #endif // STBI_NO_HDR
 
 #ifndef STBI_NO_LINEAR
-   STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma);
-   STBIDEF void   stbi_ldr_to_hdr_scale(float scale);
+STBIDEF void stbi_ldr_to_hdr_gamma(float gamma);
+STBIDEF void stbi_ldr_to_hdr_scale(float scale);
 #endif // STBI_NO_LINEAR
 
 // stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR
-STBIDEF int    stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user);
-STBIDEF int    stbi_is_hdr_from_memory(stbi_uc const *buffer, int len);
+STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk,
+                                       void *user);
+STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len);
 #ifndef STBI_NO_STDIO
-STBIDEF int      stbi_is_hdr          (char const *filename);
-STBIDEF int      stbi_is_hdr_from_file(FILE *f);
+STBIDEF int stbi_is_hdr(char const *filename);
+STBIDEF int stbi_is_hdr_from_file(FILE *f);
 #endif // STBI_NO_STDIO
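
Since stbi_is_hdr is always defined, callers can branch between the float loader (which keeps linear values) and the 8-bit loader (which remaps HDR to LDR as described above); a sketch with a placeholder filename, assuming STBI_NO_LINEAR is not defined:

    int x, y, n;
    if (stbi_is_hdr("scene.hdr")) {
      float *linear = stbi_loadf("scene.hdr", &x, &y, &n, 0); /* linear floats */
      /* ... */
      stbi_image_free(linear);
    } else {
      stbi_uc *ldr = stbi_load("scene.hdr", &x, &y, &n, 0);   /* 8-bit LDR */
      /* ... */
      stbi_image_free(ldr);
    }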
 
-
 // get a VERY brief reason for failure
 // NOT THREADSAFE
-STBIDEF const char *stbi_failure_reason  (void);
+STBIDEF const char *stbi_failure_reason(void);
 
 // free the loaded image -- this is just free()
-STBIDEF void     stbi_image_free      (void *retval_from_stbi_load);
+STBIDEF void stbi_image_free(void *retval_from_stbi_load);
 
 // get image dimensions & components without fully decoding
-STBIDEF int      stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp);
-STBIDEF int      stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp);
-STBIDEF int      stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len);
-STBIDEF int      stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user);
+STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x,
+                                  int *y, int *comp);
+STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user,
+                                     int *x, int *y, int *comp);
+STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len);
+STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk,
+                                          void *user);
 
 #ifndef STBI_NO_STDIO
-STBIDEF int      stbi_info               (char const *filename,     int *x, int *y, int *comp);
-STBIDEF int      stbi_info_from_file     (FILE *f,                  int *x, int *y, int *comp);
-STBIDEF int      stbi_is_16_bit          (char const *filename);
-STBIDEF int      stbi_is_16_bit_from_file(FILE *f);
+STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp);
+STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp);
+STBIDEF int stbi_is_16_bit(char const *filename);
+STBIDEF int stbi_is_16_bit_from_file(FILE *f);
 #endif
 
-
-
 // for image formats that explicitly notate that they have premultiplied alpha,
 // we just return the colors as stored in the file. set this flag to force
 // unpremultiplication. results are undefined if the unpremultiplication overflows.
-STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply);
+STBIDEF void
+stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply);
 
 // indicate whether we should process iphone images back to canonical format,
 // or just pass them through "as-is"
 STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert);
 
-// flip the image vertically, so the first pixel in the output array is the bottom left
+// flip the image vertically, so the first pixel in the output array is the
+// bottom left
 STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip);
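
The flag is global and affects all subsequent loads; flipping is common when uploading to OpenGL, whose texture origin is the bottom-left corner. A sketch:

    int w, h, n;
    stbi_set_flip_vertically_on_load(1);                    /* applies from now on */
    stbi_uc *tex = stbi_load("texture.png", &w, &h, &n, 4); /* force RGBA */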
 
 // ZLIB client - used by PNG, available for other purposes
 
-STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen);
-STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header);
+STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len,
+                                                int initial_size, int *outlen);
+STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer,
+                                                           int len,
+                                                           int initial_size,
+                                                           int *outlen,
+                                                           int parse_header);
 STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen);
-STBIDEF int   stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
-
-STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen);
-STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
+STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen,
+                                    const char *ibuffer, int ilen);
 
+STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len,
+                                               int *outlen);
+STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen,
+                                             const char *ibuffer, int ilen);
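
A sketch of inflating a zlib stream (header included), assuming compressed and compressed_len hold the input; with the default allocators the returned buffer is released with free():

    #include <stdlib.h>

    int outlen;
    char *out = stbi_zlib_decode_malloc(compressed, compressed_len, &outlen);
    if (out != NULL) {
      /* outlen holds the decompressed byte count */
      free(out);
    }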
 
 #ifdef __cplusplus
 }
@@ -489,52 +530,53 @@ STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
 
 #ifdef STB_IMAGE_IMPLEMENTATION
 
-#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) \
-  || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \
-  || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \
-  || defined(STBI_ONLY_ZLIB)
-   #ifndef STBI_ONLY_JPEG
-   #define STBI_NO_JPEG
-   #endif
-   #ifndef STBI_ONLY_PNG
-   #define STBI_NO_PNG
-   #endif
-   #ifndef STBI_ONLY_BMP
-   #define STBI_NO_BMP
-   #endif
-   #ifndef STBI_ONLY_PSD
-   #define STBI_NO_PSD
-   #endif
-   #ifndef STBI_ONLY_TGA
-   #define STBI_NO_TGA
-   #endif
-   #ifndef STBI_ONLY_GIF
-   #define STBI_NO_GIF
-   #endif
-   #ifndef STBI_ONLY_HDR
-   #define STBI_NO_HDR
-   #endif
-   #ifndef STBI_ONLY_PIC
-   #define STBI_NO_PIC
-   #endif
-   #ifndef STBI_ONLY_PNM
-   #define STBI_NO_PNM
-   #endif
+#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) ||                       \
+    defined(STBI_ONLY_BMP) || defined(STBI_ONLY_TGA) ||                        \
+    defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) ||                        \
+    defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) ||                        \
+    defined(STBI_ONLY_PNM) || defined(STBI_ONLY_ZLIB)
+#ifndef STBI_ONLY_JPEG
+#define STBI_NO_JPEG
+#endif
+#ifndef STBI_ONLY_PNG
+#define STBI_NO_PNG
+#endif
+#ifndef STBI_ONLY_BMP
+#define STBI_NO_BMP
+#endif
+#ifndef STBI_ONLY_PSD
+#define STBI_NO_PSD
+#endif
+#ifndef STBI_ONLY_TGA
+#define STBI_NO_TGA
+#endif
+#ifndef STBI_ONLY_GIF
+#define STBI_NO_GIF
+#endif
+#ifndef STBI_ONLY_HDR
+#define STBI_NO_HDR
+#endif
+#ifndef STBI_ONLY_PIC
+#define STBI_NO_PIC
+#endif
+#ifndef STBI_ONLY_PNM
+#define STBI_NO_PNM
+#endif
 #endif
 
-#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB)
+#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) &&                     \
+    !defined(STBI_NO_ZLIB)
 #define STBI_NO_ZLIB
 #endif
 
-
+#include <limits.h>
 #include <stdarg.h>
 #include <stddef.h> // ptrdiff_t on osx
 #include <stdlib.h>
 #include <string.h>
-#include <limits.h>
 
 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
-#include <math.h>  // ldexp, pow
+#include <math.h> // ldexp, pow
 #endif
 
 #ifndef STBI_NO_STDIO
@@ -552,38 +594,36 @@ STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
 #define STBI_EXTERN extern
 #endif
 
-
 #ifndef _MSC_VER
-   #ifdef __cplusplus
-   #define stbi_inline inline
-   #else
-   #define stbi_inline
-   #endif
+#ifdef __cplusplus
+#define stbi_inline inline
 #else
-   #define stbi_inline __forceinline
+#define stbi_inline
+#endif
+#else
+#define stbi_inline __forceinline
 #endif
-
 
 #ifdef _MSC_VER
 typedef unsigned short stbi__uint16;
-typedef   signed short stbi__int16;
-typedef unsigned int   stbi__uint32;
-typedef   signed int   stbi__int32;
+typedef signed short stbi__int16;
+typedef unsigned int stbi__uint32;
+typedef signed int stbi__int32;
 #else
 #include <stdint.h>
 typedef uint16_t stbi__uint16;
-typedef int16_t  stbi__int16;
+typedef int16_t stbi__int16;
 typedef uint32_t stbi__uint32;
-typedef int32_t  stbi__int32;
+typedef int32_t stbi__int32;
 #endif
 
 // should produce compiler error if size is wrong
-typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
+typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1];
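
The trick: when the condition is false the array size evaluates to -1, which is ill-formed, so the typedef fails to compile. The same C89-compatible idiom works for any compile-time check; C11 code could use _Static_assert instead:

    /* compiles only if the condition holds */
    typedef unsigned char validate_char[sizeof(char) == 1 ? 1 : -1];
    /* C11 equivalent: _Static_assert(sizeof(char) == 1, "char must be 1 byte"); */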
 
 #ifdef _MSC_VER
-#define STBI_NOTUSED(v)  (void)(v)
+#define STBI_NOTUSED(v) (void)(v)
 #else
-#define STBI_NOTUSED(v)  (void)sizeof(v)
+#define STBI_NOTUSED(v) (void)sizeof(v)
 #endif
 
 #ifdef _MSC_VER
@@ -591,27 +631,30 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 #endif
 
 #ifdef STBI_HAS_LROTL
-   #define stbi_lrot(x,y)  _lrotl(x,y)
+#define stbi_lrot(x, y) _lrotl(x, y)
 #else
-   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
+#define stbi_lrot(x, y) (((x) << (y)) | ((x) >> (32 - (y))))
 #endif
 
-#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
+#if defined(STBI_MALLOC) && defined(STBI_FREE) &&                              \
+    (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
 // ok
-#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
+#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) &&                          \
+    !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
 // ok
 #else
-#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
+#error                                                                         \
+    "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
 #endif
 
 #ifndef STBI_MALLOC
-#define STBI_MALLOC(sz)           malloc(sz)
-#define STBI_REALLOC(p,newsz)     realloc(p,newsz)
-#define STBI_FREE(p)              free(p)
+#define STBI_MALLOC(sz) malloc(sz)
+#define STBI_REALLOC(p, newsz) realloc(p, newsz)
+#define STBI_FREE(p) free(p)
 #endif
 
 #ifndef STBI_REALLOC_SIZED
-#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz)
+#define STBI_REALLOC_SIZED(p, oldsz, newsz) STBI_REALLOC(p, newsz)
 #endif
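
All three must be overridden together, in the translation unit that defines the implementation; a sketch with hypothetical my_* allocators:

    #define STBI_MALLOC(sz)         my_alloc(sz)
    #define STBI_REALLOC(p, newsz)  my_realloc(p, newsz)
    #define STBI_FREE(p)            my_free(p)
    #define STB_IMAGE_IMPLEMENTATION
    #include "stb_image.h"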
 
 // x86/x64 detection
@@ -621,7 +664,8 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 #define STBI__X86_TARGET
 #endif
 
-#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
+#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) &&    \
+    !defined(STBI_NO_SIMD)
 // gcc doesn't support sse2 intrinsics unless you compile with -msse2,
 // which in turn means it gets to use SSE2 everywhere. This is unfortunate,
 // but previous attempts to provide the SSE2 functions with runtime
@@ -632,8 +676,10 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 #define STBI_NO_SIMD
 #endif
 
-#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
-// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET
+#if defined(__MINGW32__) && defined(STBI__X86_TARGET) &&                       \
+    !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
+// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid
+// STBI__X64_TARGET
 //
 // 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
 // Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
@@ -643,44 +689,43 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 // See https://github.com/nothings/stb/issues/81 for more information.
 //
 // So default to no SSE2 on 32-bit MinGW. If you've read this far and added
-// -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2.
+// -mstackrealign to your build settings, feel free to #define
+// STBI_MINGW_ENABLE_SSE2.
 #define STBI_NO_SIMD
 #endif
 
-#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
+#if !defined(STBI_NO_SIMD) &&                                                  \
+    (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
 #define STBI_SSE2
 #include <emmintrin.h>
 
 #ifdef _MSC_VER
 
-#if _MSC_VER >= 1400  // not VC6
-#include <intrin.h> // __cpuid
-static int stbi__cpuid3(void)
-{
-   int info[4];
-   __cpuid(info,1);
-   return info[3];
+#if _MSC_VER >= 1400 // not VC6
+#include <intrin.h>  // __cpuid
+static int stbi__cpuid3(void) {
+  int info[4];
+  __cpuid(info, 1);
+  return info[3];
 }
 #else
-static int stbi__cpuid3(void)
-{
-   int res;
-   __asm {
+static int stbi__cpuid3(void) {
+  int res;
+  __asm {
       mov  eax,1
       cpuid
       mov  res,edx
-   }
-   return res;
+  }
+  return res;
 }
 #endif
 
 #define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
 
 #if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
-static int stbi__sse2_available(void)
-{
-   int info3 = stbi__cpuid3();
-   return ((info3 >> 26) & 1) != 0;
+static int stbi__sse2_available(void) {
+  int info3 = stbi__cpuid3();
+  return ((info3 >> 26) & 1) != 0;
 }
 #endif
 
@@ -688,12 +733,11 @@ static int stbi__sse2_available(void)
 #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
 
 #if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
-static int stbi__sse2_available(void)
-{
-   // If we're even attempting to compile this on GCC/Clang, that means
-   // -msse2 is on, which means the compiler is allowed to use SSE2
-   // instructions at will, and so are we.
-   return 1;
+static int stbi__sse2_available(void) {
+  // If we're even attempting to compile this on GCC/Clang, that means
+  // -msse2 is on, which means the compiler is allowed to use SSE2
+  // instructions at will, and so are we.
+  return 1;
 }
 #endif
 
@@ -721,176 +765,164 @@ static int stbi__sse2_available(void)
 
 // stbi__context structure is our basic context used by all images, so it
 // contains all the IO context, plus some basic image information
-typedef struct
-{
-   stbi__uint32 img_x, img_y;
-   int img_n, img_out_n;
+typedef struct {
+  stbi__uint32 img_x, img_y;
+  int img_n, img_out_n;
 
-   stbi_io_callbacks io;
-   void *io_user_data;
+  stbi_io_callbacks io;
+  void *io_user_data;
 
-   int read_from_callbacks;
-   int buflen;
-   stbi_uc buffer_start[128];
+  int read_from_callbacks;
+  int buflen;
+  stbi_uc buffer_start[128];
 
-   stbi_uc *img_buffer, *img_buffer_end;
-   stbi_uc *img_buffer_original, *img_buffer_original_end;
+  stbi_uc *img_buffer, *img_buffer_end;
+  stbi_uc *img_buffer_original, *img_buffer_original_end;
 } stbi__context;
 
-
 static void stbi__refill_buffer(stbi__context *s);
 
 // initialize a memory-decode context
-static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
-{
-   s->io.read = NULL;
-   s->read_from_callbacks = 0;
-   s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer;
-   s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len;
+static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len) {
+  s->io.read = NULL;
+  s->read_from_callbacks = 0;
+  s->img_buffer = s->img_buffer_original = (stbi_uc *)buffer;
+  s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *)buffer + len;
 }
 
 // initialize a callback-based context
-static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user)
-{
-   s->io = *c;
-   s->io_user_data = user;
-   s->buflen = sizeof(s->buffer_start);
-   s->read_from_callbacks = 1;
-   s->img_buffer_original = s->buffer_start;
-   stbi__refill_buffer(s);
-   s->img_buffer_original_end = s->img_buffer_end;
+static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c,
+                                  void *user) {
+  s->io = *c;
+  s->io_user_data = user;
+  s->buflen = sizeof(s->buffer_start);
+  s->read_from_callbacks = 1;
+  s->img_buffer_original = s->buffer_start;
+  stbi__refill_buffer(s);
+  s->img_buffer_original_end = s->img_buffer_end;
 }
 
 #ifndef STBI_NO_STDIO
 
-static int stbi__stdio_read(void *user, char *data, int size)
-{
-   return (int) fread(data,1,size,(FILE*) user);
+static int stbi__stdio_read(void *user, char *data, int size) {
+  return (int)fread(data, 1, size, (FILE *)user);
 }
 
-static void stbi__stdio_skip(void *user, int n)
-{
-   fseek((FILE*) user, n, SEEK_CUR);
+static void stbi__stdio_skip(void *user, int n) {
+  fseek((FILE *)user, n, SEEK_CUR);
 }
 
-static int stbi__stdio_eof(void *user)
-{
-   return feof((FILE*) user);
-}
+static int stbi__stdio_eof(void *user) { return feof((FILE *)user); }
 
-static stbi_io_callbacks stbi__stdio_callbacks =
-{
-   stbi__stdio_read,
-   stbi__stdio_skip,
-   stbi__stdio_eof,
+static stbi_io_callbacks stbi__stdio_callbacks = {
+    stbi__stdio_read,
+    stbi__stdio_skip,
+    stbi__stdio_eof,
 };
 
-static void stbi__start_file(stbi__context *s, FILE *f)
-{
-   stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *) f);
+static void stbi__start_file(stbi__context *s, FILE *f) {
+  stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *)f);
 }
 
-//static void stop_file(stbi__context *s) { }
+// static void stop_file(stbi__context *s) { }
 
 #endif // !STBI_NO_STDIO
 
-static void stbi__rewind(stbi__context *s)
-{
-   // conceptually rewind SHOULD rewind to the beginning of the stream,
-   // but we just rewind to the beginning of the initial buffer, because
-   // we only use it after doing 'test', which only ever looks at at most 92 bytes
-   s->img_buffer = s->img_buffer_original;
-   s->img_buffer_end = s->img_buffer_original_end;
+static void stbi__rewind(stbi__context *s) {
+  // conceptually rewind SHOULD rewind to the beginning of the stream,
+  // but we just rewind to the beginning of the initial buffer, because
+  // we only use it after doing 'test', which only ever looks at at most 92
+  // bytes
+  s->img_buffer = s->img_buffer_original;
+  s->img_buffer_end = s->img_buffer_original_end;
 }
 
-enum
-{
-   STBI_ORDER_RGB,
-   STBI_ORDER_BGR
-};
+enum { STBI_ORDER_RGB, STBI_ORDER_BGR };
 
-typedef struct
-{
-   int bits_per_channel;
-   int num_channels;
-   int channel_order;
+typedef struct {
+  int bits_per_channel;
+  int num_channels;
+  int channel_order;
 } stbi__result_info;
 
 #ifndef STBI_NO_JPEG
-static int      stbi__jpeg_test(stbi__context *s);
-static void    *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
+static int stbi__jpeg_test(stbi__context *s);
+static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp,
+                             int req_comp, stbi__result_info *ri);
+static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PNG
-static int      stbi__png_test(stbi__context *s);
-static void    *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
-static int      stbi__png_is16(stbi__context *s);
+static int stbi__png_test(stbi__context *s);
+static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp,
+                            int req_comp, stbi__result_info *ri);
+static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
+static int stbi__png_is16(stbi__context *s);
 #endif
 
 #ifndef STBI_NO_BMP
-static int      stbi__bmp_test(stbi__context *s);
-static void    *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
+static int stbi__bmp_test(stbi__context *s);
+static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp,
+                            int req_comp, stbi__result_info *ri);
+static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_TGA
-static int      stbi__tga_test(stbi__context *s);
-static void    *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
+static int stbi__tga_test(stbi__context *s);
+static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp,
+                            int req_comp, stbi__result_info *ri);
+static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PSD
-static int      stbi__psd_test(stbi__context *s);
-static void    *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc);
-static int      stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
-static int      stbi__psd_is16(stbi__context *s);
+static int stbi__psd_test(stbi__context *s);
+static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp,
+                            int req_comp, stbi__result_info *ri, int bpc);
+static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
+static int stbi__psd_is16(stbi__context *s);
 #endif
 
 #ifndef STBI_NO_HDR
-static int      stbi__hdr_test(stbi__context *s);
-static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
+static int stbi__hdr_test(stbi__context *s);
+static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp,
+                             int req_comp, stbi__result_info *ri);
+static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PIC
-static int      stbi__pic_test(stbi__context *s);
-static void    *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
+static int stbi__pic_test(stbi__context *s);
+static void *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp,
+                            int req_comp, stbi__result_info *ri);
+static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_GIF
-static int      stbi__gif_test(stbi__context *s);
-static void    *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static void    *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
-static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
+static int stbi__gif_test(stbi__context *s);
+static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp,
+                            int req_comp, stbi__result_info *ri);
+static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y,
+                                 int *z, int *comp, int req_comp);
+static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PNM
-static int      stbi__pnm_test(stbi__context *s);
-static void    *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
+static int stbi__pnm_test(stbi__context *s);
+static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp,
+                            int req_comp, stbi__result_info *ri);
+static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 // this is not threadsafe
 static const char *stbi__g_failure_reason;
 
-STBIDEF const char *stbi_failure_reason(void)
-{
-   return stbi__g_failure_reason;
-}
+STBIDEF const char *stbi_failure_reason(void) { return stbi__g_failure_reason; }
 
-static int stbi__err(const char *str)
-{
-   stbi__g_failure_reason = str;
-   return 0;
+static int stbi__err(const char *str) {
+  stbi__g_failure_reason = str;
+  return 0;
 }
 
-static void *stbi__malloc(size_t size)
-{
-    return STBI_MALLOC(size);
-}
+static void *stbi__malloc(size_t size) { return STBI_MALLOC(size); }
 
 // stb_image uses ints pervasively, including for offset calculations.
 // therefore the largest decoded image size we can support with the
@@ -904,66 +936,66 @@ static void *stbi__malloc(size_t size)
 
 // return 1 if the sum is valid, 0 on overflow.
 // negative terms are considered invalid.
-static int stbi__addsizes_valid(int a, int b)
-{
-   if (b < 0) return 0;
-   // now 0 <= b <= INT_MAX, hence also
-   // 0 <= INT_MAX - b <= INTMAX.
-   // And "a + b <= INT_MAX" (which might overflow) is the
-   // same as a <= INT_MAX - b (no overflow)
-   return a <= INT_MAX - b;
+static int stbi__addsizes_valid(int a, int b) {
+  if (b < 0)
+    return 0;
+  // now 0 <= b <= INT_MAX, hence also
+  // 0 <= INT_MAX - b <= INT_MAX.
+  // And "a + b <= INT_MAX" (which might overflow) is the
+  // same as a <= INT_MAX - b (no overflow)
+  return a <= INT_MAX - b;
 }
 
 // returns 1 if the product is valid, 0 on overflow.
 // negative factors are considered invalid.
-static int stbi__mul2sizes_valid(int a, int b)
-{
-   if (a < 0 || b < 0) return 0;
-   if (b == 0) return 1; // mul-by-0 is always safe
-   // portable way to check for no overflows in a*b
-   return a <= INT_MAX/b;
+static int stbi__mul2sizes_valid(int a, int b) {
+  if (a < 0 || b < 0)
+    return 0;
+  if (b == 0)
+    return 1; // mul-by-0 is always safe
+  // portable way to check for no overflows in a*b
+  return a <= INT_MAX / b;
 }
 
 // returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
-static int stbi__mad2sizes_valid(int a, int b, int add)
-{
-   return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add);
+static int stbi__mad2sizes_valid(int a, int b, int add) {
+  return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a * b, add);
 }
 
 // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
-static int stbi__mad3sizes_valid(int a, int b, int c, int add)
-{
-   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
-      stbi__addsizes_valid(a*b*c, add);
+static int stbi__mad3sizes_valid(int a, int b, int c, int add) {
+  return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) &&
+         stbi__addsizes_valid(a * b * c, add);
 }
 
-// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
+// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't
+// overflow
 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
-static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
-{
-   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
-      stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add);
+static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) {
+  return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) &&
+         stbi__mul2sizes_valid(a * b * c, d) &&
+         stbi__addsizes_valid(a * b * c * d, add);
 }
 #endif
 
 // mallocs with size overflow checking
-static void *stbi__malloc_mad2(int a, int b, int add)
-{
-   if (!stbi__mad2sizes_valid(a, b, add)) return NULL;
-   return stbi__malloc(a*b + add);
+static void *stbi__malloc_mad2(int a, int b, int add) {
+  if (!stbi__mad2sizes_valid(a, b, add))
+    return NULL;
+  return stbi__malloc(a * b + add);
 }
 
-static void *stbi__malloc_mad3(int a, int b, int c, int add)
-{
-   if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL;
-   return stbi__malloc(a*b*c + add);
+static void *stbi__malloc_mad3(int a, int b, int c, int add) {
+  if (!stbi__mad3sizes_valid(a, b, c, add))
+    return NULL;
+  return stbi__malloc(a * b * c + add);
 }
 
 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
-static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
-{
-   if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL;
-   return stbi__malloc(a*b*c*d + add);
+static void *stbi__malloc_mad4(int a, int b, int c, int d, int add) {
+  if (!stbi__mad4sizes_valid(a, b, c, d, add))
+    return NULL;
+  return stbi__malloc(a * b * c * d + add);
 }
 #endif
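
This is the pattern the decoders use to size output buffers; e.g., allocating w*h RGBA pixels reduces to a sketch like (w and h assumed already read from the file header):

    /* NULL if w*h*4 + 0 would overflow an int, else a buffer of that size */
    unsigned char *out = (unsigned char *)stbi__malloc_mad3(w, h, 4, 0);
    if (out == NULL)
      return stbi__errpuc("outofmem", "Out of memory");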
 
@@ -972,395 +1004,434 @@ static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
 // stbi__errpuc - error returning pointer to unsigned char
 
 #ifdef STBI_NO_FAILURE_STRINGS
-   #define stbi__err(x,y)  0
+#define stbi__err(x, y) 0
 #elif defined(STBI_FAILURE_USERMSG)
-   #define stbi__err(x,y)  stbi__err(y)
+#define stbi__err(x, y) stbi__err(y)
 #else
-   #define stbi__err(x,y)  stbi__err(x)
+#define stbi__err(x, y) stbi__err(x)
 #endif
 
-#define stbi__errpf(x,y)   ((float *)(size_t) (stbi__err(x,y)?NULL:NULL))
-#define stbi__errpuc(x,y)  ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL))
+#define stbi__errpf(x, y) ((float *)(size_t)(stbi__err(x, y) ? NULL : NULL))
+#define stbi__errpuc(x, y)                                                     \
+  ((unsigned char *)(size_t)(stbi__err(x, y) ? NULL : NULL))
 
-STBIDEF void stbi_image_free(void *retval_from_stbi_load)
-{
-   STBI_FREE(retval_from_stbi_load);
+STBIDEF void stbi_image_free(void *retval_from_stbi_load) {
+  STBI_FREE(retval_from_stbi_load);
 }
 
 #ifndef STBI_NO_LINEAR
-static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
+static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
 #endif
 
 #ifndef STBI_NO_HDR
-static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp);
+static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp);
 #endif
 
 static int stbi__vertically_flip_on_load = 0;
 
-STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
-{
-    stbi__vertically_flip_on_load = flag_true_if_should_flip;
-}
-
-static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
-{
-   memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
-   ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed
-   ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
-   ri->num_channels = 0;
-
-   #ifndef STBI_NO_JPEG
-   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
-   #endif
-   #ifndef STBI_NO_PNG
-   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp, ri);
-   #endif
-   #ifndef STBI_NO_BMP
-   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp, ri);
-   #endif
-   #ifndef STBI_NO_GIF
-   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp, ri);
-   #endif
-   #ifndef STBI_NO_PSD
-   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc);
-   #endif
-   #ifndef STBI_NO_PIC
-   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp, ri);
-   #endif
-   #ifndef STBI_NO_PNM
-   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp, ri);
-   #endif
-
-   #ifndef STBI_NO_HDR
-   if (stbi__hdr_test(s)) {
-      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri);
-      return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
-   }
-   #endif
-
-   #ifndef STBI_NO_TGA
-   // test tga last because it's a crappy test!
-   if (stbi__tga_test(s))
-      return stbi__tga_load(s,x,y,comp,req_comp, ri);
-   #endif
-
-   return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
-}
-
-static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels)
-{
-   int i;
-   int img_len = w * h * channels;
-   stbi_uc *reduced;
-
-   reduced = (stbi_uc *) stbi__malloc(img_len);
-   if (reduced == NULL) return stbi__errpuc("outofmem", "Out of memory");
-
-   for (i = 0; i < img_len; ++i)
-      reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling
-
-   STBI_FREE(orig);
-   return reduced;
+STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip) {
+  stbi__vertically_flip_on_load = flag_true_if_should_flip;
 }
 
-static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels)
-{
-   int i;
-   int img_len = w * h * channels;
-   stbi__uint16 *enlarged;
+static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp,
+                             int req_comp, stbi__result_info *ri, int bpc) {
+  memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
+  ri->bits_per_channel =
+      8; // default is 8 so most paths don't have to be changed
+  ri->channel_order =
+      STBI_ORDER_RGB; // all current input & output are this, but this is here
+                      // so we can add BGR order
+  ri->num_channels = 0;
 
-   enlarged = (stbi__uint16 *) stbi__malloc(img_len*2);
-   if (enlarged == NULL) return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
+#ifndef STBI_NO_JPEG
+  if (stbi__jpeg_test(s))
+    return stbi__jpeg_load(s, x, y, comp, req_comp, ri);
+#endif
+#ifndef STBI_NO_PNG
+  if (stbi__png_test(s))
+    return stbi__png_load(s, x, y, comp, req_comp, ri);
+#endif
+#ifndef STBI_NO_BMP
+  if (stbi__bmp_test(s))
+    return stbi__bmp_load(s, x, y, comp, req_comp, ri);
+#endif
+#ifndef STBI_NO_GIF
+  if (stbi__gif_test(s))
+    return stbi__gif_load(s, x, y, comp, req_comp, ri);
+#endif
+#ifndef STBI_NO_PSD
+  if (stbi__psd_test(s))
+    return stbi__psd_load(s, x, y, comp, req_comp, ri, bpc);
+#endif
+#ifndef STBI_NO_PIC
+  if (stbi__pic_test(s))
+    return stbi__pic_load(s, x, y, comp, req_comp, ri);
+#endif
+#ifndef STBI_NO_PNM
+  if (stbi__pnm_test(s))
+    return stbi__pnm_load(s, x, y, comp, req_comp, ri);
+#endif
 
-   for (i = 0; i < img_len; ++i)
-      enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
+#ifndef STBI_NO_HDR
+  if (stbi__hdr_test(s)) {
+    float *hdr = stbi__hdr_load(s, x, y, comp, req_comp, ri);
+    return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
+  }
+#endif
 
-   STBI_FREE(orig);
-   return enlarged;
-}
+#ifndef STBI_NO_TGA
+  // test tga last because it's a crappy test!
+  if (stbi__tga_test(s))
+    return stbi__tga_load(s, x, y, comp, req_comp, ri);
+#endif
 
-static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel)
-{
-   int row;
-   size_t bytes_per_row = (size_t)w * bytes_per_pixel;
-   stbi_uc temp[2048];
-   stbi_uc *bytes = (stbi_uc *)image;
-
-   for (row = 0; row < (h>>1); row++) {
-      stbi_uc *row0 = bytes + row*bytes_per_row;
-      stbi_uc *row1 = bytes + (h - row - 1)*bytes_per_row;
-      // swap row0 with row1
-      size_t bytes_left = bytes_per_row;
-      while (bytes_left) {
-         size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp);
-         memcpy(temp, row0, bytes_copy);
-         memcpy(row0, row1, bytes_copy);
-         memcpy(row1, temp, bytes_copy);
-         row0 += bytes_copy;
-         row1 += bytes_copy;
-         bytes_left -= bytes_copy;
-      }
-   }
+  return stbi__errpuc("unknown image type",
+                      "Image not of any known type, or corrupt");
+}
+
+static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h,
+                                      int channels) {
+  int i;
+  int img_len = w * h * channels;
+  stbi_uc *reduced;
+
+  reduced = (stbi_uc *)stbi__malloc(img_len);
+  if (reduced == NULL)
+    return stbi__errpuc("outofmem", "Out of memory");
+
+  for (i = 0; i < img_len; ++i)
+    reduced[i] =
+        (stbi_uc)((orig[i] >> 8) & 0xFF); // the top byte of each 16-bit
+                                          // value is a sufficient approx of
+                                          // 16->8 bit scaling
+
+  STBI_FREE(orig);
+  return reduced;
+}
+
+static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h,
+                                           int channels) {
+  int i;
+  int img_len = w * h * channels;
+  stbi__uint16 *enlarged;
+
+  enlarged = (stbi__uint16 *)stbi__malloc(img_len * 2);
+  if (enlarged == NULL)
+    return (stbi__uint16 *)stbi__errpuc("outofmem", "Out of memory");
+
+  for (i = 0; i < img_len; ++i)
+    enlarged[i] = (stbi__uint16)(
+        (orig[i] << 8) +
+        orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
+
+  STBI_FREE(orig);
+  return enlarged;
+}
+
+static void stbi__vertical_flip(void *image, int w, int h,
+                                int bytes_per_pixel) {
+  int row;
+  size_t bytes_per_row = (size_t)w * bytes_per_pixel;
+  stbi_uc temp[2048];
+  stbi_uc *bytes = (stbi_uc *)image;
+
+  for (row = 0; row < (h >> 1); row++) {
+    stbi_uc *row0 = bytes + row * bytes_per_row;
+    stbi_uc *row1 = bytes + (h - row - 1) * bytes_per_row;
+    // swap row0 with row1
+    size_t bytes_left = bytes_per_row;
+    while (bytes_left) {
+      size_t bytes_copy =
+          (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp);
+      memcpy(temp, row0, bytes_copy);
+      memcpy(row0, row1, bytes_copy);
+      memcpy(row1, temp, bytes_copy);
+      row0 += bytes_copy;
+      row1 += bytes_copy;
+      bytes_left -= bytes_copy;
+    }
+  }
 }
 
 #ifndef STBI_NO_GIF
-static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel)
-{
-   int slice;
-   int slice_size = w * h * bytes_per_pixel;
+static void stbi__vertical_flip_slices(void *image, int w, int h, int z,
+                                       int bytes_per_pixel) {
+  int slice;
+  int slice_size = w * h * bytes_per_pixel;
 
-   stbi_uc *bytes = (stbi_uc *)image;
-   for (slice = 0; slice < z; ++slice) {
-      stbi__vertical_flip(bytes, w, h, bytes_per_pixel); 
-      bytes += slice_size; 
-   }
+  stbi_uc *bytes = (stbi_uc *)image;
+  for (slice = 0; slice < z; ++slice) {
+    stbi__vertical_flip(bytes, w, h, bytes_per_pixel);
+    bytes += slice_size;
+  }
 }
 #endif
 
-static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__result_info ri;
-   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
+static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x,
+                                                      int *y, int *comp,
+                                                      int req_comp) {
+  stbi__result_info ri;
+  void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
 
-   if (result == NULL)
-      return NULL;
+  if (result == NULL)
+    return NULL;
 
-   if (ri.bits_per_channel != 8) {
-      STBI_ASSERT(ri.bits_per_channel == 16);
-      result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
-      ri.bits_per_channel = 8;
-   }
+  if (ri.bits_per_channel != 8) {
+    STBI_ASSERT(ri.bits_per_channel == 16);
+    result = stbi__convert_16_to_8((stbi__uint16 *)result, *x, *y,
+                                   req_comp == 0 ? *comp : req_comp);
+    ri.bits_per_channel = 8;
+  }
 
-   // @TODO: move stbi__convert_format to here
+  // @TODO: move stbi__convert_format to here
 
-   if (stbi__vertically_flip_on_load) {
-      int channels = req_comp ? req_comp : *comp;
-      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
-   }
+  if (stbi__vertically_flip_on_load) {
+    int channels = req_comp ? req_comp : *comp;
+    stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
+  }
 
-   return (unsigned char *) result;
+  return (unsigned char *)result;
 }
 
-static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__result_info ri;
-   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
+static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x,
+                                                      int *y, int *comp,
+                                                      int req_comp) {
+  stbi__result_info ri;
+  void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
 
-   if (result == NULL)
-      return NULL;
+  if (result == NULL)
+    return NULL;
 
-   if (ri.bits_per_channel != 16) {
-      STBI_ASSERT(ri.bits_per_channel == 8);
-      result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
-      ri.bits_per_channel = 16;
-   }
+  if (ri.bits_per_channel != 16) {
+    STBI_ASSERT(ri.bits_per_channel == 8);
+    result = stbi__convert_8_to_16((stbi_uc *)result, *x, *y,
+                                   req_comp == 0 ? *comp : req_comp);
+    ri.bits_per_channel = 16;
+  }
 
-   // @TODO: move stbi__convert_format16 to here
-   // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision
+  // @TODO: move stbi__convert_format16 to here
+  // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to
+  // keep more precision
 
-   if (stbi__vertically_flip_on_load) {
-      int channels = req_comp ? req_comp : *comp;
-      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
-   }
+  if (stbi__vertically_flip_on_load) {
+    int channels = req_comp ? req_comp : *comp;
+    stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
+  }
 
-   return (stbi__uint16 *) result;
+  return (stbi__uint16 *)result;
 }
 
 #if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR)
-static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp)
-{
-   if (stbi__vertically_flip_on_load && result != NULL) {
-      int channels = req_comp ? req_comp : *comp;
-      stbi__vertical_flip(result, *x, *y, channels * sizeof(float));
-   }
+static void stbi__float_postprocess(float *result, int *x, int *y, int *comp,
+                                    int req_comp) {
+  if (stbi__vertically_flip_on_load && result != NULL) {
+    int channels = req_comp ? req_comp : *comp;
+    stbi__vertical_flip(result, *x, *y, channels * sizeof(float));
+  }
 }
 #endif
 
 #ifndef STBI_NO_STDIO
 
 #if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
-STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
-STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
+STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(
+    unsigned int cp, unsigned long flags, const char *str, int cbmb,
+    wchar_t *widestr, int cchwide);
+STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(
+    unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide,
+    char *str, int cbmb, const char *defchar, int *used_default);
 #endif
 
 #if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
-STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
-{
-	return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
+STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen,
+                                       const wchar_t *input) {
+  return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer,
+                             (int)bufferlen, NULL, NULL);
 }
 #endif
 
-static FILE *stbi__fopen(char const *filename, char const *mode)
-{
-   FILE *f;
+static FILE *stbi__fopen(char const *filename, char const *mode) {
+  FILE *f;
 #if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
-   wchar_t wMode[64];
-   wchar_t wFilename[1024];
-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)))
-      return 0;
-	
-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)))
-      return 0;
+  wchar_t wMode[64];
+  wchar_t wFilename[1024];
+  if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename,
+                               sizeof(wFilename)))
+    return 0;
+
+  if (0 ==
+      MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)))
+    return 0;
 
 #if _MSC_VER >= 1400
-	if (0 != _wfopen_s(&f, wFilename, wMode))
-		f = 0;
+  if (0 != _wfopen_s(&f, wFilename, wMode))
+    f = 0;
 #else
-   f = _wfopen(wFilename, wMode);
+  f = _wfopen(wFilename, wMode);
 #endif
 
 #elif defined(_MSC_VER) && _MSC_VER >= 1400
-   if (0 != fopen_s(&f, filename, mode))
-      f=0;
+  if (0 != fopen_s(&f, filename, mode))
+    f = 0;
 #else
-   f = fopen(filename, mode);
+  f = fopen(filename, mode);
 #endif
-   return f;
-}
-
-
-STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp)
-{
-   FILE *f = stbi__fopen(filename, "rb");
-   unsigned char *result;
-   if (!f) return stbi__errpuc("can't fopen", "Unable to open file");
-   result = stbi_load_from_file(f,x,y,comp,req_comp);
-   fclose(f);
-   return result;
-}
-
-STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
-{
-   unsigned char *result;
-   stbi__context s;
-   stbi__start_file(&s,f);
-   result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
-   if (result) {
-      // need to 'unget' all the characters in the IO buffer
-      fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
-   }
-   return result;
-}
-
-STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__uint16 *result;
-   stbi__context s;
-   stbi__start_file(&s,f);
-   result = stbi__load_and_postprocess_16bit(&s,x,y,comp,req_comp);
-   if (result) {
-      // need to 'unget' all the characters in the IO buffer
-      fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
-   }
-   return result;
-}
-
-STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp)
-{
-   FILE *f = stbi__fopen(filename, "rb");
-   stbi__uint16 *result;
-   if (!f) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file");
-   result = stbi_load_from_file_16(f,x,y,comp,req_comp);
-   fclose(f);
-   return result;
-}
-
-
-#endif //!STBI_NO_STDIO
-
-STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels)
-{
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
-}
-
-STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels)
-{
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
-   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
-}
-
-STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
-}
-
-STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
-   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
+  return f;
+}
+
+STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp,
+                           int req_comp) {
+  FILE *f = stbi__fopen(filename, "rb");
+  unsigned char *result;
+  if (!f)
+    return stbi__errpuc("can't fopen", "Unable to open file");
+  result = stbi_load_from_file(f, x, y, comp, req_comp);
+  fclose(f);
+  return result;
+}
+
+STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp,
+                                     int req_comp) {
+  unsigned char *result;
+  stbi__context s;
+  stbi__start_file(&s, f);
+  result = stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
+  if (result) {
+    // need to 'unget' all the characters in the IO buffer
+    fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR);
+  }
+  return result;
+}
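
Because the unread buffered bytes are 'ungot' with fseek, the FILE is left positioned immediately after the decoded image, so images concatenated in a single file can be read back-to-back; a sketch assuming such a (hypothetical) layout:

    FILE *f = fopen("pair.dat", "rb"); /* two images stored back-to-back */
    int x, y, n;
    stbi_uc *first = stbi_load_from_file(f, &x, &y, &n, 0);
    stbi_uc *second = stbi_load_from_file(f, &x, &y, &n, 0); /* resumes where
                                                                the first ended */
    fclose(f);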
+
+STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp,
+                                             int req_comp) {
+  stbi__uint16 *result;
+  stbi__context s;
+  stbi__start_file(&s, f);
+  result = stbi__load_and_postprocess_16bit(&s, x, y, comp, req_comp);
+  if (result) {
+    // need to 'unget' all the characters in the IO buffer
+    fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR);
+  }
+  return result;
+}
+
+STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp,
+                              int req_comp) {
+  FILE *f = stbi__fopen(filename, "rb");
+  stbi__uint16 *result;
+  if (!f)
+    return (stbi_us *)stbi__errpuc("can't fopen", "Unable to open file");
+  result = stbi_load_from_file_16(f, x, y, comp, req_comp);
+  fclose(f);
+  return result;
+}
+
+#endif // !STBI_NO_STDIO
+
+STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len,
+                                          int *x, int *y, int *channels_in_file,
+                                          int desired_channels) {
+  stbi__context s;
+  stbi__start_mem(&s, buffer, len);
+  return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file,
+                                          desired_channels);
+}
+
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk,
+                                             void *user, int *x, int *y,
+                                             int *channels_in_file,
+                                             int desired_channels) {
+  stbi__context s;
+  stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
+  return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file,
+                                          desired_channels);
+}
+
+STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x,
+                                       int *y, int *comp, int req_comp) {
+  stbi__context s;
+  stbi__start_mem(&s, buffer, len);
+  return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
+}
+
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk,
+                                          void *user, int *x, int *y, int *comp,
+                                          int req_comp) {
+  stbi__context s;
+  stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
+  return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
 }
 
 #ifndef STBI_NO_GIF
-STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
-{
-   unsigned char *result;
-   stbi__context s; 
-   stbi__start_mem(&s,buffer,len); 
-   
-   result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
-   if (stbi__vertically_flip_on_load) {
-      stbi__vertical_flip_slices( result, *x, *y, *z, *comp ); 
-   }
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len,
+                                           int **delays, int *x, int *y, int *z,
+                                           int *comp, int req_comp) {
+  unsigned char *result;
+  stbi__context s;
+  stbi__start_mem(&s, buffer, len);
+
+  result =
+      (unsigned char *)stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
+  if (stbi__vertically_flip_on_load) {
+    stbi__vertical_flip_slices(result, *x, *y, *z, *comp);
+  }
 
-   return result; 
+  return result;
 }
 #endif
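
The GIF loader returns every frame concatenated into one buffer: *z receives the frame count and *delays a heap-allocated per-frame delay array. A sketch, assuming gif_bytes/gif_len hold a complete GIF in memory:

    int w, h, frames, comp, *delays = NULL;
    stbi_uc *all = stbi_load_gif_from_memory(gif_bytes, gif_len, &delays,
                                             &w, &h, &frames, &comp, 4);
    if (all) {
      /* frame i begins at all + (size_t)i * w * h * 4 */
      stbi_image_free(all);
      STBI_FREE(delays); /* presumably freed with the library's allocator */
    }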
 
 #ifndef STBI_NO_LINEAR
-static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
-{
-   unsigned char *data;
-   #ifndef STBI_NO_HDR
-   if (stbi__hdr_test(s)) {
-      stbi__result_info ri;
-      float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri);
-      if (hdr_data)
-         stbi__float_postprocess(hdr_data,x,y,comp,req_comp);
-      return hdr_data;
-   }
-   #endif
-   data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
-   if (data)
-      return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
-   return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
-}
-
-STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__loadf_main(&s,x,y,comp,req_comp);
+static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp,
+                               int req_comp) {
+  unsigned char *data;
+#ifndef STBI_NO_HDR
+  if (stbi__hdr_test(s)) {
+    stbi__result_info ri;
+    float *hdr_data = stbi__hdr_load(s, x, y, comp, req_comp, &ri);
+    if (hdr_data)
+      stbi__float_postprocess(hdr_data, x, y, comp, req_comp);
+    return hdr_data;
+  }
+#endif
+  data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
+  if (data)
+    return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
+  return stbi__errpf("unknown image type",
+                     "Image not of any known type, or corrupt");
 }
 
-STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
-   return stbi__loadf_main(&s,x,y,comp,req_comp);
+STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x,
+                                      int *y, int *comp, int req_comp) {
+  stbi__context s;
+  stbi__start_mem(&s, buffer, len);
+  return stbi__loadf_main(&s, x, y, comp, req_comp);
 }
 
-#ifndef STBI_NO_STDIO
-STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp)
-{
-   float *result;
-   FILE *f = stbi__fopen(filename, "rb");
-   if (!f) return stbi__errpf("can't fopen", "Unable to open file");
-   result = stbi_loadf_from_file(f,x,y,comp,req_comp);
-   fclose(f);
-   return result;
+STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk,
+                                         void *user, int *x, int *y, int *comp,
+                                         int req_comp) {
+  stbi__context s;
+  stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
+  return stbi__loadf_main(&s, x, y, comp, req_comp);
 }
 
-STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__context s;
-   stbi__start_file(&s,f);
-   return stbi__loadf_main(&s,x,y,comp,req_comp);
+#ifndef STBI_NO_STDIO
+STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp,
+                          int req_comp) {
+  float *result;
+  FILE *f = stbi__fopen(filename, "rb");
+  if (!f)
+    return stbi__errpf("can't fopen", "Unable to open file");
+  result = stbi_loadf_from_file(f, x, y, comp, req_comp);
+  fclose(f);
+  return result;
+}
+
+STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp,
+                                    int req_comp) {
+  stbi__context s;
+  stbi__start_file(&s, f);
+  return stbi__loadf_main(&s, x, y, comp, req_comp);
 }
 #endif // !STBI_NO_STDIO
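
stbi__loadf_main above makes the float path concrete: genuine HDR (Radiance) input decodes natively, anything else is decoded as 8-bit and promoted through stbi__ldr_to_hdr. A minimal sketch, where "scene.hdr" is a hypothetical file:

    int w, h, n;
    float *pix = stbi_loadf("scene.hdr", &w, &h, &n, 0);
    if (pix) {
      /* linear floats, n components per pixel, not clamped to [0,1] */
      stbi_image_free(pix);
    }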
 
@@ -1370,198 +1441,186 @@ STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_
 // defined, for API simplicity; if STBI_NO_LINEAR is defined, it always
 // reports false!
 
-STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len)
-{
-   #ifndef STBI_NO_HDR
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__hdr_test(&s);
-   #else
-   STBI_NOTUSED(buffer);
-   STBI_NOTUSED(len);
-   return 0;
-   #endif
+STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len) {
+#ifndef STBI_NO_HDR
+  stbi__context s;
+  stbi__start_mem(&s, buffer, len);
+  return stbi__hdr_test(&s);
+#else
+  STBI_NOTUSED(buffer);
+  STBI_NOTUSED(len);
+  return 0;
+#endif
 }
 
 #ifndef STBI_NO_STDIO
-STBIDEF int      stbi_is_hdr          (char const *filename)
-{
-   FILE *f = stbi__fopen(filename, "rb");
-   int result=0;
-   if (f) {
-      result = stbi_is_hdr_from_file(f);
-      fclose(f);
-   }
-   return result;
+STBIDEF int stbi_is_hdr(char const *filename) {
+  FILE *f = stbi__fopen(filename, "rb");
+  int result = 0;
+  if (f) {
+    result = stbi_is_hdr_from_file(f);
+    fclose(f);
+  }
+  return result;
 }
 
-STBIDEF int stbi_is_hdr_from_file(FILE *f)
-{
-   #ifndef STBI_NO_HDR
-   long pos = ftell(f);
-   int res;
-   stbi__context s;
-   stbi__start_file(&s,f);
-   res = stbi__hdr_test(&s);
-   fseek(f, pos, SEEK_SET);
-   return res;
-   #else
-   STBI_NOTUSED(f);
-   return 0;
-   #endif
+STBIDEF int stbi_is_hdr_from_file(FILE *f) {
+#ifndef STBI_NO_HDR
+  long pos = ftell(f);
+  int res;
+  stbi__context s;
+  stbi__start_file(&s, f);
+  res = stbi__hdr_test(&s);
+  fseek(f, pos, SEEK_SET);
+  return res;
+#else
+  STBI_NOTUSED(f);
+  return 0;
+#endif
 }
 #endif // !STBI_NO_STDIO
 
-STBIDEF int      stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user)
-{
-   #ifndef STBI_NO_HDR
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
-   return stbi__hdr_test(&s);
-   #else
-   STBI_NOTUSED(clbk);
-   STBI_NOTUSED(user);
-   return 0;
-   #endif
+STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk,
+                                       void *user) {
+#ifndef STBI_NO_HDR
+  stbi__context s;
+  stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
+  return stbi__hdr_test(&s);
+#else
+  STBI_NOTUSED(clbk);
+  STBI_NOTUSED(user);
+  return 0;
+#endif
 }
 
 #ifndef STBI_NO_LINEAR
-static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f;
+static float stbi__l2h_gamma = 2.2f, stbi__l2h_scale = 1.0f;
 
-STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
-STBIDEF void   stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
+STBIDEF void stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
+STBIDEF void stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
 #endif
 
-static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f;
-
-STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; }
-STBIDEF void   stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; }
+static float stbi__h2l_gamma_i = 1.0f / 2.2f, stbi__h2l_scale_i = 1.0f;
 
+STBIDEF void stbi_hdr_to_ldr_gamma(float gamma) {
+  stbi__h2l_gamma_i = 1 / gamma;
+}
+STBIDEF void stbi_hdr_to_ldr_scale(float scale) {
+  stbi__h2l_scale_i = 1 / scale;
+}
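
Note that the HDR-to-LDR setters store reciprocals, so the conversion loop in stbi__hdr_to_ldr can multiply instead of divide:

    stbi_hdr_to_ldr_gamma(2.2f); /* stbi__h2l_gamma_i becomes 1/2.2f */
    stbi_hdr_to_ldr_scale(4.0f); /* stbi__h2l_scale_i becomes 0.25f  */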
 
 //////////////////////////////////////////////////////////////////////////////
 //
 // Common code used by all image loaders
 //
 
-enum
-{
-   STBI__SCAN_load=0,
-   STBI__SCAN_type,
-   STBI__SCAN_header
-};
-
-static void stbi__refill_buffer(stbi__context *s)
-{
-   int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen);
-   if (n == 0) {
-      // at end of file, treat same as if from memory, but need to handle case
-      // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
-      s->read_from_callbacks = 0;
-      s->img_buffer = s->buffer_start;
-      s->img_buffer_end = s->buffer_start+1;
-      *s->img_buffer = 0;
-   } else {
-      s->img_buffer = s->buffer_start;
-      s->img_buffer_end = s->buffer_start + n;
-   }
-}
-
-stbi_inline static stbi_uc stbi__get8(stbi__context *s)
-{
-   if (s->img_buffer < s->img_buffer_end)
-      return *s->img_buffer++;
-   if (s->read_from_callbacks) {
-      stbi__refill_buffer(s);
-      return *s->img_buffer++;
-   }
-   return 0;
-}
-
-stbi_inline static int stbi__at_eof(stbi__context *s)
-{
-   if (s->io.read) {
-      if (!(s->io.eof)(s->io_user_data)) return 0;
-      // if feof() is true, check if buffer = end
-      // special case: we've only got the special 0 character at the end
-      if (s->read_from_callbacks == 0) return 1;
-   }
+enum { STBI__SCAN_load = 0, STBI__SCAN_type, STBI__SCAN_header };
+
+static void stbi__refill_buffer(stbi__context *s) {
+  int n = (s->io.read)(s->io_user_data, (char *)s->buffer_start, s->buflen);
+  if (n == 0) {
+    // at end of file, treat same as if from memory, but need to handle case
+    // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
+    s->read_from_callbacks = 0;
+    s->img_buffer = s->buffer_start;
+    s->img_buffer_end = s->buffer_start + 1;
+    *s->img_buffer = 0;
+  } else {
+    s->img_buffer = s->buffer_start;
+    s->img_buffer_end = s->buffer_start + n;
+  }
+}
+
+stbi_inline static stbi_uc stbi__get8(stbi__context *s) {
+  if (s->img_buffer < s->img_buffer_end)
+    return *s->img_buffer++;
+  if (s->read_from_callbacks) {
+    stbi__refill_buffer(s);
+    return *s->img_buffer++;
+  }
+  return 0;
+}
+
+stbi_inline static int stbi__at_eof(stbi__context *s) {
+  if (s->io.read) {
+    if (!(s->io.eof)(s->io_user_data))
+      return 0;
+    // the eof callback being true isn't conclusive: bytes may still sit in
+    // the buffer, so fall through to the pointer comparison below; special
+    // case: read_from_callbacks == 0 means only the sentinel 0 byte remains
+    if (s->read_from_callbacks == 0)
+      return 1;
+  }
 
-   return s->img_buffer >= s->img_buffer_end;
+  return s->img_buffer >= s->img_buffer_end;
 }
 
-static void stbi__skip(stbi__context *s, int n)
-{
-   if (n < 0) {
+static void stbi__skip(stbi__context *s, int n) {
+  if (n < 0) {
+    s->img_buffer = s->img_buffer_end;
+    return;
+  }
+  if (s->io.read) {
+    int blen = (int)(s->img_buffer_end - s->img_buffer);
+    if (blen < n) {
       s->img_buffer = s->img_buffer_end;
+      (s->io.skip)(s->io_user_data, n - blen);
       return;
-   }
-   if (s->io.read) {
-      int blen = (int) (s->img_buffer_end - s->img_buffer);
-      if (blen < n) {
-         s->img_buffer = s->img_buffer_end;
-         (s->io.skip)(s->io_user_data, n - blen);
-         return;
-      }
-   }
-   s->img_buffer += n;
+    }
+  }
+  s->img_buffer += n;
 }
 
-static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n)
-{
-   if (s->io.read) {
-      int blen = (int) (s->img_buffer_end - s->img_buffer);
-      if (blen < n) {
-         int res, count;
+static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n) {
+  if (s->io.read) {
+    int blen = (int)(s->img_buffer_end - s->img_buffer);
+    if (blen < n) {
+      int res, count;
 
-         memcpy(buffer, s->img_buffer, blen);
+      memcpy(buffer, s->img_buffer, blen);
 
-         count = (s->io.read)(s->io_user_data, (char*) buffer + blen, n - blen);
-         res = (count == (n-blen));
-         s->img_buffer = s->img_buffer_end;
-         return res;
-      }
-   }
+      count = (s->io.read)(s->io_user_data, (char *)buffer + blen, n - blen);
+      res = (count == (n - blen));
+      s->img_buffer = s->img_buffer_end;
+      return res;
+    }
+  }
 
-   if (s->img_buffer+n <= s->img_buffer_end) {
-      memcpy(buffer, s->img_buffer, n);
-      s->img_buffer += n;
-      return 1;
-   } else
-      return 0;
+  if (s->img_buffer + n <= s->img_buffer_end) {
+    memcpy(buffer, s->img_buffer, n);
+    s->img_buffer += n;
+    return 1;
+  } else
+    return 0;
 }
 
-static int stbi__get16be(stbi__context *s)
-{
-   int z = stbi__get8(s);
-   return (z << 8) + stbi__get8(s);
+static int stbi__get16be(stbi__context *s) {
+  int z = stbi__get8(s);
+  return (z << 8) + stbi__get8(s);
 }
 
-static stbi__uint32 stbi__get32be(stbi__context *s)
-{
-   stbi__uint32 z = stbi__get16be(s);
-   return (z << 16) + stbi__get16be(s);
+static stbi__uint32 stbi__get32be(stbi__context *s) {
+  stbi__uint32 z = stbi__get16be(s);
+  return (z << 16) + stbi__get16be(s);
 }
 
 #if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF)
 // nothing
 #else
-static int stbi__get16le(stbi__context *s)
-{
-   int z = stbi__get8(s);
-   return z + (stbi__get8(s) << 8);
+static int stbi__get16le(stbi__context *s) {
+  int z = stbi__get8(s);
+  return z + (stbi__get8(s) << 8);
 }
 #endif
 
 #ifndef STBI_NO_BMP
-static stbi__uint32 stbi__get32le(stbi__context *s)
-{
-   stbi__uint32 z = stbi__get16le(s);
-   return z + (stbi__get16le(s) << 16);
+static stbi__uint32 stbi__get32le(stbi__context *s) {
+  stbi__uint32 z = stbi__get16le(s);
+  return z + (stbi__get16le(s) << 16);
 }
 #endif
 
-#define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // truncate int to byte without warnings
-
+#define STBI__BYTECAST(x)                                                      \
+  ((stbi_uc)((x)&255)) // truncate int to byte without warnings
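
The mask-then-cast truncates exactly like a plain (stbi_uc) cast while silencing narrowing warnings; for instance:

    int v = 300;                   /* 0x12C */
    stbi_uc b = STBI__BYTECAST(v); /* 300 & 255 == 44 */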
 
 //////////////////////////////////////////////////////////////////////////////
 //
@@ -1574,156 +1633,259 @@ static stbi__uint32 stbi__get32le(stbi__context *s)
 //  assume data buffer is malloced, so malloc a new one and free that one
 //  only failure mode is malloc failing
 
-static stbi_uc stbi__compute_y(int r, int g, int b)
-{
-   return (stbi_uc) (((r*77) + (g*150) +  (29*b)) >> 8);
-}
-
-static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y)
-{
-   int i,j;
-   unsigned char *good;
-
-   if (req_comp == img_n) return data;
-   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
-
-   good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0);
-   if (good == NULL) {
-      STBI_FREE(data);
-      return stbi__errpuc("outofmem", "Out of memory");
-   }
-
-   for (j=0; j < (int) y; ++j) {
-      unsigned char *src  = data + j * x * img_n   ;
-      unsigned char *dest = good + j * x * req_comp;
-
-      #define STBI__COMBO(a,b)  ((a)*8+(b))
-      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
-      // convert source image with img_n components to one with req_comp components;
-      // avoid switch per pixel, so use switch per scanline and massive macros
-      switch (STBI__COMBO(img_n, req_comp)) {
-         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=255;                                     } break;
-         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
-         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=255;                     } break;
-         STBI__CASE(2,1) { dest[0]=src[0];                                                  } break;
-         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
-         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                  } break;
-         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=255;        } break;
-         STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
-         STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = 255;    } break;
-         STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
-         STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break;
-         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                    } break;
-         default: STBI_ASSERT(0);
+static stbi_uc stbi__compute_y(int r, int g, int b) {
+  return (stbi_uc)(((r * 77) + (g * 150) + (29 * b)) >> 8);
+}
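
The weights 77/150/29 are the BT.601 luma coefficients (roughly 0.299, 0.587, 0.114) scaled by 256; since they sum to exactly 256, the >> 8 keeps white at white:

    /* stbi__compute_y(255, 255, 255) == (255 * 256) >> 8 == 255 */
    /* stbi__compute_y(255,   0,   0) == (255 *  77) >> 8 ==  76 */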
+
+static unsigned char *stbi__convert_format(unsigned char *data, int img_n,
+                                           int req_comp, unsigned int x,
+                                           unsigned int y) {
+  int i, j;
+  unsigned char *good;
+
+  if (req_comp == img_n)
+    return data;
+  STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
+
+  good = (unsigned char *)stbi__malloc_mad3(req_comp, x, y, 0);
+  if (good == NULL) {
+    STBI_FREE(data);
+    return stbi__errpuc("outofmem", "Out of memory");
+  }
+
+  for (j = 0; j < (int)y; ++j) {
+    unsigned char *src = data + j * x * img_n;
+    unsigned char *dest = good + j * x * req_comp;
+
+#define STBI__COMBO(a, b) ((a)*8 + (b))
+#define STBI__CASE(a, b)                                                       \
+  case STBI__COMBO(a, b):                                                      \
+    for (i = x - 1; i >= 0; --i, src += a, dest += b)
+    // convert source image with img_n components to one with req_comp
+    // components; avoid switch per pixel, so use switch per scanline and
+    // massive macros
+    switch (STBI__COMBO(img_n, req_comp)) {
+      STBI__CASE(1, 2) {
+        dest[0] = src[0];
+        dest[1] = 255;
       }
-      #undef STBI__CASE
-   }
-
-   STBI_FREE(data);
-   return good;
-}
-
-static stbi__uint16 stbi__compute_y_16(int r, int g, int b)
-{
-   return (stbi__uint16) (((r*77) + (g*150) +  (29*b)) >> 8);
-}
-
-static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y)
-{
-   int i,j;
-   stbi__uint16 *good;
-
-   if (req_comp == img_n) return data;
-   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
-
-   good = (stbi__uint16 *) stbi__malloc(req_comp * x * y * 2);
-   if (good == NULL) {
-      STBI_FREE(data);
-      return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
-   }
-
-   for (j=0; j < (int) y; ++j) {
-      stbi__uint16 *src  = data + j * x * img_n   ;
-      stbi__uint16 *dest = good + j * x * req_comp;
-
-      #define STBI__COMBO(a,b)  ((a)*8+(b))
-      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
-      // convert source image with img_n components to one with req_comp components;
-      // avoid switch per pixel, so use switch per scanline and massive macros
-      switch (STBI__COMBO(img_n, req_comp)) {
-         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=0xffff;                                     } break;
-         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
-         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=0xffff;                     } break;
-         STBI__CASE(2,1) { dest[0]=src[0];                                                     } break;
-         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
-         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                     } break;
-         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=0xffff;        } break;
-         STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
-         STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = 0xffff; } break;
-         STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
-         STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break;
-         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                       } break;
-         default: STBI_ASSERT(0);
+      break;
+      STBI__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; }
+      break;
+      STBI__CASE(1, 4) {
+        dest[0] = dest[1] = dest[2] = src[0];
+        dest[3] = 255;
+      }
+      break;
+      STBI__CASE(2, 1) { dest[0] = src[0]; }
+      break;
+      STBI__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; }
+      break;
+      STBI__CASE(2, 4) {
+        dest[0] = dest[1] = dest[2] = src[0];
+        dest[3] = src[1];
+      }
+      break;
+      STBI__CASE(3, 4) {
+        dest[0] = src[0];
+        dest[1] = src[1];
+        dest[2] = src[2];
+        dest[3] = 255;
+      }
+      break;
+      STBI__CASE(3, 1) { dest[0] = stbi__compute_y(src[0], src[1], src[2]); }
+      break;
+      STBI__CASE(3, 2) {
+        dest[0] = stbi__compute_y(src[0], src[1], src[2]);
+        dest[1] = 255;
+      }
+      break;
+      STBI__CASE(4, 1) { dest[0] = stbi__compute_y(src[0], src[1], src[2]); }
+      break;
+      STBI__CASE(4, 2) {
+        dest[0] = stbi__compute_y(src[0], src[1], src[2]);
+        dest[1] = src[3];
       }
-      #undef STBI__CASE
-   }
+      break;
+      STBI__CASE(4, 3) {
+        dest[0] = src[0];
+        dest[1] = src[1];
+        dest[2] = src[2];
+      }
+      break;
+    default:
+      STBI_ASSERT(0);
+    }
+#undef STBI__CASE
+  }
+
+  STBI_FREE(data);
+  return good;
+}
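
Each STBI__CASE expands to a case label plus a per-pixel loop, so the whole scanline is converted under a single switch. For the gray-to-RGBA combo, the (1, 4) case above is equivalent to this straight-line loop (illustrative expansion only):

    /* STBI__CASE(1, 4), expanded for one scanline of width x */
    for (i = x - 1; i >= 0; --i, src += 1, dest += 4) {
      dest[0] = dest[1] = dest[2] = src[0];
      dest[3] = 255;
    }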
+
+static stbi__uint16 stbi__compute_y_16(int r, int g, int b) {
+  return (stbi__uint16)(((r * 77) + (g * 150) + (29 * b)) >> 8);
+}
+
+static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n,
+                                            int req_comp, unsigned int x,
+                                            unsigned int y) {
+  int i, j;
+  stbi__uint16 *good;
+
+  if (req_comp == img_n)
+    return data;
+  STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
+
+  good = (stbi__uint16 *)stbi__malloc(req_comp * x * y * 2);
+  if (good == NULL) {
+    STBI_FREE(data);
+    return (stbi__uint16 *)stbi__errpuc("outofmem", "Out of memory");
+  }
+
+  for (j = 0; j < (int)y; ++j) {
+    stbi__uint16 *src = data + j * x * img_n;
+    stbi__uint16 *dest = good + j * x * req_comp;
+
+#define STBI__COMBO(a, b) ((a)*8 + (b))
+#define STBI__CASE(a, b)                                                       \
+  case STBI__COMBO(a, b):                                                      \
+    for (i = x - 1; i >= 0; --i, src += a, dest += b)
+    // convert source image with img_n components to one with req_comp
+    // components; avoid switch per pixel, so use switch per scanline and
+    // massive macros
+    switch (STBI__COMBO(img_n, req_comp)) {
+      STBI__CASE(1, 2) {
+        dest[0] = src[0];
+        dest[1] = 0xffff;
+      }
+      break;
+      STBI__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; }
+      break;
+      STBI__CASE(1, 4) {
+        dest[0] = dest[1] = dest[2] = src[0];
+        dest[3] = 0xffff;
+      }
+      break;
+      STBI__CASE(2, 1) { dest[0] = src[0]; }
+      break;
+      STBI__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; }
+      break;
+      STBI__CASE(2, 4) {
+        dest[0] = dest[1] = dest[2] = src[0];
+        dest[3] = src[1];
+      }
+      break;
+      STBI__CASE(3, 4) {
+        dest[0] = src[0];
+        dest[1] = src[1];
+        dest[2] = src[2];
+        dest[3] = 0xffff;
+      }
+      break;
+      STBI__CASE(3, 1) { dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); }
+      break;
+      STBI__CASE(3, 2) {
+        dest[0] = stbi__compute_y_16(src[0], src[1], src[2]);
+        dest[1] = 0xffff;
+      }
+      break;
+      STBI__CASE(4, 1) { dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); }
+      break;
+      STBI__CASE(4, 2) {
+        dest[0] = stbi__compute_y_16(src[0], src[1], src[2]);
+        dest[1] = src[3];
+      }
+      break;
+      STBI__CASE(4, 3) {
+        dest[0] = src[0];
+        dest[1] = src[1];
+        dest[2] = src[2];
+      }
+      break;
+    default:
+      STBI_ASSERT(0);
+    }
+#undef STBI__CASE
+  }
 
-   STBI_FREE(data);
-   return good;
+  STBI_FREE(data);
+  return good;
 }
 
 #ifndef STBI_NO_LINEAR
-static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
-{
-   int i,k,n;
-   float *output;
-   if (!data) return NULL;
-   output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
-   if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); }
-   // compute number of non-alpha components
-   if (comp & 1) n = comp; else n = comp-1;
-   for (i=0; i < x*y; ++i) {
-      for (k=0; k < n; ++k) {
-         output[i*comp + k] = (float) (pow(data[i*comp+k]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
-      }
-   }
-   if (n < comp) {
-      for (i=0; i < x*y; ++i) {
-         output[i*comp + n] = data[i*comp + n]/255.0f;
-      }
-   }
-   STBI_FREE(data);
-   return output;
+static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp) {
+  int i, k, n;
+  float *output;
+  if (!data)
+    return NULL;
+  output = (float *)stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
+  if (output == NULL) {
+    STBI_FREE(data);
+    return stbi__errpf("outofmem", "Out of memory");
+  }
+  // compute number of non-alpha components
+  if (comp & 1)
+    n = comp;
+  else
+    n = comp - 1;
+  for (i = 0; i < x * y; ++i) {
+    for (k = 0; k < n; ++k) {
+      output[i * comp + k] =
+          (float)(pow(data[i * comp + k] / 255.0f, stbi__l2h_gamma) *
+                  stbi__l2h_scale);
+    }
+  }
+  if (n < comp) {
+    for (i = 0; i < x * y; ++i) {
+      output[i * comp + n] = data[i * comp + n] / 255.0f;
+    }
+  }
+  STBI_FREE(data);
+  return output;
 }
 #endif
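
Per non-alpha channel the promotion is out = pow(in / 255.0, stbi__l2h_gamma) * stbi__l2h_scale, with alpha passed through as in / 255.0. With the defaults set above (gamma 2.2, scale 1.0), a mid-gray of 128 maps to pow(0.502, 2.2), about 0.22 in linear light.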
 
 #ifndef STBI_NO_HDR
-#define stbi__float2int(x)   ((int) (x))
-static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
-{
-   int i,k,n;
-   stbi_uc *output;
-   if (!data) return NULL;
-   output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0);
-   if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); }
-   // compute number of non-alpha components
-   if (comp & 1) n = comp; else n = comp-1;
-   for (i=0; i < x*y; ++i) {
-      for (k=0; k < n; ++k) {
-         float z = (float) pow(data[i*comp+k]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f;
-         if (z < 0) z = 0;
-         if (z > 255) z = 255;
-         output[i*comp + k] = (stbi_uc) stbi__float2int(z);
-      }
-      if (k < comp) {
-         float z = data[i*comp+k] * 255 + 0.5f;
-         if (z < 0) z = 0;
-         if (z > 255) z = 255;
-         output[i*comp + k] = (stbi_uc) stbi__float2int(z);
-      }
-   }
-   STBI_FREE(data);
-   return output;
+#define stbi__float2int(x) ((int)(x))
+static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp) {
+  int i, k, n;
+  stbi_uc *output;
+  if (!data)
+    return NULL;
+  output = (stbi_uc *)stbi__malloc_mad3(x, y, comp, 0);
+  if (output == NULL) {
+    STBI_FREE(data);
+    return stbi__errpuc("outofmem", "Out of memory");
+  }
+  // compute number of non-alpha components
+  if (comp & 1)
+    n = comp;
+  else
+    n = comp - 1;
+  for (i = 0; i < x * y; ++i) {
+    for (k = 0; k < n; ++k) {
+      float z = (float)pow(data[i * comp + k] * stbi__h2l_scale_i,
+                           stbi__h2l_gamma_i) *
+                    255 +
+                0.5f;
+      if (z < 0)
+        z = 0;
+      if (z > 255)
+        z = 255;
+      output[i * comp + k] = (stbi_uc)stbi__float2int(z);
+    }
+    if (k < comp) {
+      float z = data[i * comp + k] * 255 + 0.5f;
+      if (z < 0)
+        z = 0;
+      if (z > 255)
+        z = 255;
+      output[i * comp + k] = (stbi_uc)stbi__float2int(z);
+    }
+  }
+  STBI_FREE(data);
+  return output;
 }
 #endif
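
This is the inverse mapping, out = clamp(pow(in * stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5, 0, 255), again skipping alpha. With the default 1/2.2 gamma it round-trips the example above: pow(0.22, 1/2.2) * 255 + 0.5 is roughly 128.6, which stbi__float2int truncates back to 128.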
 
@@ -1751,749 +1913,788 @@ static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
 #ifndef STBI_NO_JPEG
 
 // huffman decoding acceleration
-#define FAST_BITS   9  // larger handles more cases; smaller stomps less cache
-
-typedef struct
-{
-   stbi_uc  fast[1 << FAST_BITS];
-   // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
-   stbi__uint16 code[256];
-   stbi_uc  values[256];
-   stbi_uc  size[257];
-   unsigned int maxcode[18];
-   int    delta[17];   // old 'firstsymbol' - old 'firstcode'
+#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache
+
+typedef struct {
+  stbi_uc fast[1 << FAST_BITS];
+  // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
+  stbi__uint16 code[256];
+  stbi_uc values[256];
+  stbi_uc size[257];
+  unsigned int maxcode[18];
+  int delta[17]; // old 'firstsymbol' - old 'firstcode'
 } stbi__huffman;
 
-typedef struct
-{
-   stbi__context *s;
-   stbi__huffman huff_dc[4];
-   stbi__huffman huff_ac[4];
-   stbi__uint16 dequant[4][64];
-   stbi__int16 fast_ac[4][1 << FAST_BITS];
-
-// sizes for components, interleaved MCUs
-   int img_h_max, img_v_max;
-   int img_mcu_x, img_mcu_y;
-   int img_mcu_w, img_mcu_h;
-
-// definition of jpeg image component
-   struct
-   {
-      int id;
-      int h,v;
-      int tq;
-      int hd,ha;
-      int dc_pred;
-
-      int x,y,w2,h2;
-      stbi_uc *data;
-      void *raw_data, *raw_coeff;
-      stbi_uc *linebuf;
-      short   *coeff;   // progressive only
-      int      coeff_w, coeff_h; // number of 8x8 coefficient blocks
-   } img_comp[4];
-
-   stbi__uint32   code_buffer; // jpeg entropy-coded buffer
-   int            code_bits;   // number of valid bits
-   unsigned char  marker;      // marker seen while filling entropy buffer
-   int            nomore;      // flag if we saw a marker so must stop
-
-   int            progressive;
-   int            spec_start;
-   int            spec_end;
-   int            succ_high;
-   int            succ_low;
-   int            eob_run;
-   int            jfif;
-   int            app14_color_transform; // Adobe APP14 tag
-   int            rgb;
-
-   int scan_n, order[4];
-   int restart_interval, todo;
-
-// kernels
-   void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]);
-   void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step);
-   stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs);
+typedef struct {
+  stbi__context *s;
+  stbi__huffman huff_dc[4];
+  stbi__huffman huff_ac[4];
+  stbi__uint16 dequant[4][64];
+  stbi__int16 fast_ac[4][1 << FAST_BITS];
+
+  // sizes for components, interleaved MCUs
+  int img_h_max, img_v_max;
+  int img_mcu_x, img_mcu_y;
+  int img_mcu_w, img_mcu_h;
+
+  // definition of jpeg image component
+  struct {
+    int id;
+    int h, v;
+    int tq;
+    int hd, ha;
+    int dc_pred;
+
+    int x, y, w2, h2;
+    stbi_uc *data;
+    void *raw_data, *raw_coeff;
+    stbi_uc *linebuf;
+    short *coeff;         // progressive only
+    int coeff_w, coeff_h; // number of 8x8 coefficient blocks
+  } img_comp[4];
+
+  stbi__uint32 code_buffer; // jpeg entropy-coded buffer
+  int code_bits;            // number of valid bits
+  unsigned char marker;     // marker seen while filling entropy buffer
+  int nomore;               // flag if we saw a marker so must stop
+
+  int progressive;
+  int spec_start;
+  int spec_end;
+  int succ_high;
+  int succ_low;
+  int eob_run;
+  int jfif;
+  int app14_color_transform; // Adobe APP14 tag
+  int rgb;
+
+  int scan_n, order[4];
+  int restart_interval, todo;
+
+  // kernels
+  void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]);
+  void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y,
+                              const stbi_uc *pcb, const stbi_uc *pcr, int count,
+                              int step);
+  stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near,
+                                       stbi_uc *in_far, int w, int hs);
 } stbi__jpeg;
 
-static int stbi__build_huffman(stbi__huffman *h, int *count)
-{
-   int i,j,k=0;
-   unsigned int code;
-   // build size list for each symbol (from JPEG spec)
-   for (i=0; i < 16; ++i)
-      for (j=0; j < count[i]; ++j)
-         h->size[k++] = (stbi_uc) (i+1);
-   h->size[k] = 0;
-
-   // compute actual symbols (from jpeg spec)
-   code = 0;
-   k = 0;
-   for(j=1; j <= 16; ++j) {
-      // compute delta to add to code to compute symbol id
-      h->delta[j] = k - code;
-      if (h->size[k] == j) {
-         while (h->size[k] == j)
-            h->code[k++] = (stbi__uint16) (code++);
-         if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG");
-      }
-      // compute largest code + 1 for this size, preshifted as needed later
-      h->maxcode[j] = code << (16-j);
-      code <<= 1;
-   }
-   h->maxcode[j] = 0xffffffff;
-
-   // build non-spec acceleration table; 255 is flag for not-accelerated
-   memset(h->fast, 255, 1 << FAST_BITS);
-   for (i=0; i < k; ++i) {
-      int s = h->size[i];
-      if (s <= FAST_BITS) {
-         int c = h->code[i] << (FAST_BITS-s);
-         int m = 1 << (FAST_BITS-s);
-         for (j=0; j < m; ++j) {
-            h->fast[c+j] = (stbi_uc) i;
-         }
+static int stbi__build_huffman(stbi__huffman *h, int *count) {
+  int i, j, k = 0;
+  unsigned int code;
+  // build size list for each symbol (from JPEG spec)
+  for (i = 0; i < 16; ++i)
+    for (j = 0; j < count[i]; ++j)
+      h->size[k++] = (stbi_uc)(i + 1);
+  h->size[k] = 0;
+
+  // compute actual symbols (from jpeg spec)
+  code = 0;
+  k = 0;
+  for (j = 1; j <= 16; ++j) {
+    // compute delta to add to code to compute symbol id
+    h->delta[j] = k - code;
+    if (h->size[k] == j) {
+      while (h->size[k] == j)
+        h->code[k++] = (stbi__uint16)(code++);
+      if (code - 1 >= (1u << j))
+        return stbi__err("bad code lengths", "Corrupt JPEG");
+    }
+    // compute largest code + 1 for this size, preshifted as needed later
+    h->maxcode[j] = code << (16 - j);
+    code <<= 1;
+  }
+  h->maxcode[j] = 0xffffffff;
+
+  // build non-spec acceleration table; 255 is flag for not-accelerated
+  memset(h->fast, 255, 1 << FAST_BITS);
+  for (i = 0; i < k; ++i) {
+    int s = h->size[i];
+    if (s <= FAST_BITS) {
+      int c = h->code[i] << (FAST_BITS - s);
+      int m = 1 << (FAST_BITS - s);
+      for (j = 0; j < m; ++j) {
+        h->fast[c + j] = (stbi_uc)i;
       }
-   }
-   return 1;
+    }
+  }
+  return 1;
 }
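
The assignment is canonical Huffman: within each length j, symbols take consecutive codes, and the running code doubles when moving to length j+1. A tiny worked instance, assuming count[] declares two 2-bit symbols and one 3-bit symbol:

    /* lengths {2, 2, 3} -> codes 00, 01, 100                         */
    /* j=2: code[0]=0, code[1]=1, then maxcode[2] = 2 << 14, code = 4 */
    /* j=3: code[2]=4 (binary 100),     maxcode[3] = 5 << 13          */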
 
 // build a table that decodes both magnitude and value of small ACs in
 // one go.
-static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
-{
-   int i;
-   for (i=0; i < (1 << FAST_BITS); ++i) {
-      stbi_uc fast = h->fast[i];
-      fast_ac[i] = 0;
-      if (fast < 255) {
-         int rs = h->values[fast];
-         int run = (rs >> 4) & 15;
-         int magbits = rs & 15;
-         int len = h->size[fast];
-
-         if (magbits && len + magbits <= FAST_BITS) {
-            // magnitude code followed by receive_extend code
-            int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
-            int m = 1 << (magbits - 1);
-            if (k < m) k += (~0U << magbits) + 1;
-            // if the result is small enough, we can fit it in fast_ac table
-            if (k >= -128 && k <= 127)
-               fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits));
-         }
+static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h) {
+  int i;
+  for (i = 0; i < (1 << FAST_BITS); ++i) {
+    stbi_uc fast = h->fast[i];
+    fast_ac[i] = 0;
+    if (fast < 255) {
+      int rs = h->values[fast];
+      int run = (rs >> 4) & 15;
+      int magbits = rs & 15;
+      int len = h->size[fast];
+
+      if (magbits && len + magbits <= FAST_BITS) {
+        // magnitude code followed by receive_extend code
+        int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
+        int m = 1 << (magbits - 1);
+        if (k < m)
+          k += (~0U << magbits) + 1;
+        // if the result is small enough, we can fit it in fast_ac table
+        if (k >= -128 && k <= 127)
+          fast_ac[i] = (stbi__int16)((k * 256) + (run * 16) + (len + magbits));
       }
-   }
-}
-
-static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
-{
-   do {
-      unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
-      if (b == 0xff) {
-         int c = stbi__get8(j->s);
-         while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes
-         if (c != 0) {
-            j->marker = (unsigned char) c;
-            j->nomore = 1;
-            return;
-         }
+    }
+  }
+}
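
Each nonzero fast_ac entry packs (value << 8) | (run << 4) | total_bits, so the scan loop can replace a Huffman decode plus stbi__extend_receive with a single table hit. For example value 3, run 2, 5 total bits packs as 3*256 + 2*16 + 5 == 805, and the consumer recovers them as 805 >> 8 (3), (805 >> 4) & 15 (2), and 805 & 15 (5).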
+
+static void stbi__grow_buffer_unsafe(stbi__jpeg *j) {
+  do {
+    unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
+    if (b == 0xff) {
+      int c = stbi__get8(j->s);
+      while (c == 0xff)
+        c = stbi__get8(j->s); // consume fill bytes
+      if (c != 0) {
+        j->marker = (unsigned char)c;
+        j->nomore = 1;
+        return;
       }
-      j->code_buffer |= b << (24 - j->code_bits);
-      j->code_bits += 8;
-   } while (j->code_bits <= 24);
+    }
+    j->code_buffer |= b << (24 - j->code_bits);
+    j->code_bits += 8;
+  } while (j->code_bits <= 24);
 }
 
 // (1 << n) - 1
-static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
+static const stbi__uint32 stbi__bmask[17] = {
+    0,   1,    3,    7,    15,   31,    63,    127,  255,
+    511, 1023, 2047, 4095, 8191, 16383, 32767, 65535};
 
 // decode a jpeg huffman value from the bitstream
-stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
-{
-   unsigned int temp;
-   int c,k;
-
-   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
-
-   // look at the top FAST_BITS and determine what symbol ID it is,
-   // if the code is <= FAST_BITS
-   c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
-   k = h->fast[c];
-   if (k < 255) {
-      int s = h->size[k];
-      if (s > j->code_bits)
-         return -1;
-      j->code_buffer <<= s;
-      j->code_bits -= s;
-      return h->values[k];
-   }
-
-   // naive test is to shift the code_buffer down so k bits are
-   // valid, then test against maxcode. To speed this up, we've
-   // preshifted maxcode left so that it has (16-k) 0s at the
-   // end; in other words, regardless of the number of bits, it
-   // wants to be compared against something shifted to have 16;
-   // that way we don't need to shift inside the loop.
-   temp = j->code_buffer >> 16;
-   for (k=FAST_BITS+1 ; ; ++k)
-      if (temp < h->maxcode[k])
-         break;
-   if (k == 17) {
-      // error! code not found
-      j->code_bits -= 16;
+stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h) {
+  unsigned int temp;
+  int c, k;
+
+  if (j->code_bits < 16)
+    stbi__grow_buffer_unsafe(j);
+
+  // look at the top FAST_BITS and determine what symbol ID it is,
+  // if the code is <= FAST_BITS
+  c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
+  k = h->fast[c];
+  if (k < 255) {
+    int s = h->size[k];
+    if (s > j->code_bits)
       return -1;
-   }
-
-   if (k > j->code_bits)
-      return -1;
-
-   // convert the huffman code to the symbol id
-   c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
-   STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
-
-   // convert the id to a symbol
-   j->code_bits -= k;
-   j->code_buffer <<= k;
-   return h->values[c];
+    j->code_buffer <<= s;
+    j->code_bits -= s;
+    return h->values[k];
+  }
+
+  // naive test is to shift the code_buffer down so k bits are
+  // valid, then test against maxcode. To speed this up, we've
+  // preshifted maxcode left so that it has (16-k) 0s at the
+  // end; in other words, regardless of the number of bits, it
+  // wants to be compared against something shifted to have 16;
+  // that way we don't need to shift inside the loop.
+  temp = j->code_buffer >> 16;
+  for (k = FAST_BITS + 1;; ++k)
+    if (temp < h->maxcode[k])
+      break;
+  if (k == 17) {
+    // error! code not found
+    j->code_bits -= 16;
+    return -1;
+  }
+
+  if (k > j->code_bits)
+    return -1;
+
+  // convert the huffman code to the symbol id
+  c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
+  STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) &
+               stbi__bmask[h->size[c]]) == h->code[c]);
+
+  // convert the id to a symbol
+  j->code_bits -= k;
+  j->code_buffer <<= k;
+  return h->values[c];
 }
 
 // bias[n] = (-1<<n) + 1
-static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
+static const int stbi__jbias[16] = {0,     -1,    -3,     -7,    -15,   -31,
+                                    -63,   -127,  -255,   -511,  -1023, -2047,
+                                    -4095, -8191, -16383, -32767};
 
 // combined JPEG 'receive' and JPEG 'extend', since baseline
 // always extends everything it receives.
-stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
-{
-   unsigned int k;
-   int sgn;
-   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n) {
+  unsigned int k;
+  int sgn;
+  if (j->code_bits < n)
+    stbi__grow_buffer_unsafe(j);
 
-   sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB
-   k = stbi_lrot(j->code_buffer, n);
-   STBI_ASSERT(n >= 0 && n < (int) (sizeof(stbi__bmask)/sizeof(*stbi__bmask)));
-   j->code_buffer = k & ~stbi__bmask[n];
-   k &= stbi__bmask[n];
-   j->code_bits -= n;
-   return k + (stbi__jbias[n] & ~sgn);
+  sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB
+  k = stbi_lrot(j->code_buffer, n);
+  STBI_ASSERT(n >= 0 && n < (int)(sizeof(stbi__bmask) / sizeof(*stbi__bmask)));
+  j->code_buffer = k & ~stbi__bmask[n];
+  k &= stbi__bmask[n];
+  j->code_bits -= n;
+  return k + (stbi__jbias[n] & ~sgn);
 }
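
This implements JPEG's "extend" step: an n-bit value whose leading bit is 0 encodes a negative magnitude, so 2^n - 1 must be subtracted; stbi__jbias[n] holds that -(2^n - 1), and the & ~sgn applies it only when the sign bit was clear. For n = 3:

    /* bits 110 -> k = 6, leading 1, ~sgn == 0           -> returns  6   */
    /* bits 010 -> k = 2, leading 0, ~sgn == 0xFFFFFFFF  -> 2 - 7 == -5  */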
 
 // get some unsigned bits
-stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
-{
-   unsigned int k;
-   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
-   k = stbi_lrot(j->code_buffer, n);
-   j->code_buffer = k & ~stbi__bmask[n];
-   k &= stbi__bmask[n];
-   j->code_bits -= n;
-   return k;
-}
-
-stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
-{
-   unsigned int k;
-   if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
-   k = j->code_buffer;
-   j->code_buffer <<= 1;
-   --j->code_bits;
-   return k & 0x80000000;
+stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n) {
+  unsigned int k;
+  if (j->code_bits < n)
+    stbi__grow_buffer_unsafe(j);
+  k = stbi_lrot(j->code_buffer, n);
+  j->code_buffer = k & ~stbi__bmask[n];
+  k &= stbi__bmask[n];
+  j->code_bits -= n;
+  return k;
+}
+
+stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j) {
+  unsigned int k;
+  if (j->code_bits < 1)
+    stbi__grow_buffer_unsafe(j);
+  k = j->code_buffer;
+  j->code_buffer <<= 1;
+  --j->code_bits;
+  return k & 0x80000000;
 }
 
 // given a value that's at position X in the zigzag stream,
 // where does it appear in the 8x8 matrix coded as row-major?
-static const stbi_uc stbi__jpeg_dezigzag[64+15] =
-{
-    0,  1,  8, 16,  9,  2,  3, 10,
-   17, 24, 32, 25, 18, 11,  4,  5,
-   12, 19, 26, 33, 40, 48, 41, 34,
-   27, 20, 13,  6,  7, 14, 21, 28,
-   35, 42, 49, 56, 57, 50, 43, 36,
-   29, 22, 15, 23, 30, 37, 44, 51,
-   58, 59, 52, 45, 38, 31, 39, 46,
-   53, 60, 61, 54, 47, 55, 62, 63,
-   // let corrupt input sample past end
-   63, 63, 63, 63, 63, 63, 63, 63,
-   63, 63, 63, 63, 63, 63, 63
-};
+static const stbi_uc stbi__jpeg_dezigzag[64 + 15] = {
+    0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40,
+    48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36,
+    29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61,
+    54, 47, 55, 62, 63,
+    // let corrupt input sample past end
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63};
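
For example, stbi__jpeg_dezigzag[2] == 8: the third coefficient in the zigzag stream lands at row 1, column 0 of the row-major 8x8 block. The fifteen trailing 63s mean a corrupt stream that pushes k past 63 merely rewrites the last coefficient instead of indexing out of bounds.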
 
 // decode one 64-entry block--
-static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
-{
-   int diff,dc,k;
-   int t;
-
-   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
-   t = stbi__jpeg_huff_decode(j, hdc);
-   if (t < 0) return stbi__err("bad huffman code","Corrupt JPEG");
-
-   // 0 all the ac values now so we can do it 32-bits at a time
-   memset(data,0,64*sizeof(data[0]));
-
-   diff = t ? stbi__extend_receive(j, t) : 0;
-   dc = j->img_comp[b].dc_pred + diff;
-   j->img_comp[b].dc_pred = dc;
-   data[0] = (short) (dc * dequant[0]);
-
-   // decode AC components, see JPEG spec
-   k = 1;
-   do {
-      unsigned int zig;
-      int c,r,s;
-      if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
-      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
-      r = fac[c];
-      if (r) { // fast-AC path
-         k += (r >> 4) & 15; // run
-         s = r & 15; // combined length
-         j->code_buffer <<= s;
-         j->code_bits -= s;
-         // decode into unzigzag'd location
-         zig = stbi__jpeg_dezigzag[k++];
-         data[zig] = (short) ((r >> 8) * dequant[zig]);
+static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64],
+                                   stbi__huffman *hdc, stbi__huffman *hac,
+                                   stbi__int16 *fac, int b,
+                                   stbi__uint16 *dequant) {
+  int diff, dc, k;
+  int t;
+
+  if (j->code_bits < 16)
+    stbi__grow_buffer_unsafe(j);
+  t = stbi__jpeg_huff_decode(j, hdc);
+  if (t < 0)
+    return stbi__err("bad huffman code", "Corrupt JPEG");
+
+  // 0 all the ac values now so we can do it 32-bits at a time
+  memset(data, 0, 64 * sizeof(data[0]));
+
+  diff = t ? stbi__extend_receive(j, t) : 0;
+  dc = j->img_comp[b].dc_pred + diff;
+  j->img_comp[b].dc_pred = dc;
+  data[0] = (short)(dc * dequant[0]);
+
+  // decode AC components, see JPEG spec
+  k = 1;
+  do {
+    unsigned int zig;
+    int c, r, s;
+    if (j->code_bits < 16)
+      stbi__grow_buffer_unsafe(j);
+    c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
+    r = fac[c];
+    if (r) {              // fast-AC path
+      k += (r >> 4) & 15; // run
+      s = r & 15;         // combined length
+      j->code_buffer <<= s;
+      j->code_bits -= s;
+      // decode into unzigzag'd location
+      zig = stbi__jpeg_dezigzag[k++];
+      data[zig] = (short)((r >> 8) * dequant[zig]);
+    } else {
+      int rs = stbi__jpeg_huff_decode(j, hac);
+      if (rs < 0)
+        return stbi__err("bad huffman code", "Corrupt JPEG");
+      s = rs & 15;
+      r = rs >> 4;
+      if (s == 0) {
+        if (rs != 0xf0)
+          break; // end block
+        k += 16;
       } else {
-         int rs = stbi__jpeg_huff_decode(j, hac);
-         if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
-         s = rs & 15;
-         r = rs >> 4;
-         if (s == 0) {
-            if (rs != 0xf0) break; // end block
-            k += 16;
-         } else {
-            k += r;
-            // decode into unzigzag'd location
-            zig = stbi__jpeg_dezigzag[k++];
-            data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]);
-         }
+        k += r;
+        // decode into unzigzag'd location
+        zig = stbi__jpeg_dezigzag[k++];
+        data[zig] = (short)(stbi__extend_receive(j, s) * dequant[zig]);
       }
-   } while (k < 64);
-   return 1;
+    }
+  } while (k < 64);
+  return 1;
 }
 
-static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b)
-{
-   int diff,dc;
-   int t;
-   if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64],
+                                           stbi__huffman *hdc, int b) {
+  int diff, dc;
+  int t;
+  if (j->spec_end != 0)
+    return stbi__err("can't merge dc and ac", "Corrupt JPEG");
 
-   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+  if (j->code_bits < 16)
+    stbi__grow_buffer_unsafe(j);
 
-   if (j->succ_high == 0) {
-      // first scan for DC coefficient, must be first
-      memset(data,0,64*sizeof(data[0])); // 0 all the ac values now
-      t = stbi__jpeg_huff_decode(j, hdc);
-      diff = t ? stbi__extend_receive(j, t) : 0;
+  if (j->succ_high == 0) {
+    // first scan for DC coefficient, must be first
+    memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now
+    t = stbi__jpeg_huff_decode(j, hdc);
+    diff = t ? stbi__extend_receive(j, t) : 0;
 
-      dc = j->img_comp[b].dc_pred + diff;
-      j->img_comp[b].dc_pred = dc;
-      data[0] = (short) (dc << j->succ_low);
-   } else {
-      // refinement scan for DC coefficient
-      if (stbi__jpeg_get_bit(j))
-         data[0] += (short) (1 << j->succ_low);
-   }
-   return 1;
+    dc = j->img_comp[b].dc_pred + diff;
+    j->img_comp[b].dc_pred = dc;
+    data[0] = (short)(dc << j->succ_low);
+  } else {
+    // refinement scan for DC coefficient
+    if (stbi__jpeg_get_bit(j))
+      data[0] += (short)(1 << j->succ_low);
+  }
+  return 1;
 }
 
 // @OPTIMIZE: store non-zigzagged during the decode passes,
 // and only de-zigzag when dequantizing
-static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac)
-{
-   int k;
-   if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
-
-   if (j->succ_high == 0) {
-      int shift = j->succ_low;
+static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64],
+                                           stbi__huffman *hac,
+                                           stbi__int16 *fac) {
+  int k;
+  if (j->spec_start == 0)
+    return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+
+  if (j->succ_high == 0) {
+    int shift = j->succ_low;
+
+    if (j->eob_run) {
+      --j->eob_run;
+      return 1;
+    }
 
-      if (j->eob_run) {
-         --j->eob_run;
-         return 1;
-      }
-
-      k = j->spec_start;
-      do {
-         unsigned int zig;
-         int c,r,s;
-         if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
-         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
-         r = fac[c];
-         if (r) { // fast-AC path
-            k += (r >> 4) & 15; // run
-            s = r & 15; // combined length
-            j->code_buffer <<= s;
-            j->code_bits -= s;
-            zig = stbi__jpeg_dezigzag[k++];
-            data[zig] = (short) ((r >> 8) << shift);
-         } else {
-            int rs = stbi__jpeg_huff_decode(j, hac);
-            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
-            s = rs & 15;
-            r = rs >> 4;
-            if (s == 0) {
-               if (r < 15) {
-                  j->eob_run = (1 << r);
-                  if (r)
-                     j->eob_run += stbi__jpeg_get_bits(j, r);
-                  --j->eob_run;
-                  break;
-               }
-               k += 16;
-            } else {
-               k += r;
-               zig = stbi__jpeg_dezigzag[k++];
-               data[zig] = (short) (stbi__extend_receive(j,s) << shift);
-            }
-         }
-      } while (k <= j->spec_end);
-   } else {
-      // refinement scan for these AC coefficients
-
-      short bit = (short) (1 << j->succ_low);
-
-      if (j->eob_run) {
-         --j->eob_run;
-         for (k = j->spec_start; k <= j->spec_end; ++k) {
-            short *p = &data[stbi__jpeg_dezigzag[k]];
-            if (*p != 0)
-               if (stbi__jpeg_get_bit(j))
-                  if ((*p & bit)==0) {
-                     if (*p > 0)
-                        *p += bit;
-                     else
-                        *p -= bit;
-                  }
-         }
+    k = j->spec_start;
+    do {
+      unsigned int zig;
+      int c, r, s;
+      if (j->code_bits < 16)
+        stbi__grow_buffer_unsafe(j);
+      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
+      r = fac[c];
+      if (r) {              // fast-AC path
+        k += (r >> 4) & 15; // run
+        s = r & 15;         // combined length
+        j->code_buffer <<= s;
+        j->code_bits -= s;
+        zig = stbi__jpeg_dezigzag[k++];
+        data[zig] = (short)((r >> 8) << shift);
       } else {
-         k = j->spec_start;
-         do {
-            int r,s;
-            int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
-            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
-            s = rs & 15;
-            r = rs >> 4;
-            if (s == 0) {
-               if (r < 15) {
-                  j->eob_run = (1 << r) - 1;
-                  if (r)
-                     j->eob_run += stbi__jpeg_get_bits(j, r);
-                  r = 64; // force end of block
-               } else {
-                  // r=15 s=0 should write 16 0s, so we just do
-                  // a run of 15 0s and then write s (which is 0),
-                  // so we don't have to do anything special here
-               }
-            } else {
-               if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG");
-               // sign bit
-               if (stbi__jpeg_get_bit(j))
-                  s = bit;
-               else
-                  s = -bit;
+        int rs = stbi__jpeg_huff_decode(j, hac);
+        if (rs < 0)
+          return stbi__err("bad huffman code", "Corrupt JPEG");
+        s = rs & 15;
+        r = rs >> 4;
+        if (s == 0) {
+          if (r < 15) {
+            j->eob_run = (1 << r);
+            if (r)
+              j->eob_run += stbi__jpeg_get_bits(j, r);
+            --j->eob_run;
+            break;
+          }
+          k += 16;
+        } else {
+          k += r;
+          zig = stbi__jpeg_dezigzag[k++];
+          data[zig] = (short)(stbi__extend_receive(j, s) << shift);
+        }
+      }
+    } while (k <= j->spec_end);
+  } else {
+    // refinement scan for these AC coefficients
+
+    short bit = (short)(1 << j->succ_low);
+
+    if (j->eob_run) {
+      --j->eob_run;
+      for (k = j->spec_start; k <= j->spec_end; ++k) {
+        short *p = &data[stbi__jpeg_dezigzag[k]];
+        if (*p != 0)
+          if (stbi__jpeg_get_bit(j))
+            if ((*p & bit) == 0) {
+              if (*p > 0)
+                *p += bit;
+              else
+                *p -= bit;
             }
+      }
+    } else {
+      k = j->spec_start;
+      do {
+        int r, s;
+        int rs = stbi__jpeg_huff_decode(
+            j, hac); // @OPTIMIZE see if we can use the fast path here,
+                     // advance-by-r is so slow, eh
+        if (rs < 0)
+          return stbi__err("bad huffman code", "Corrupt JPEG");
+        s = rs & 15;
+        r = rs >> 4;
+        if (s == 0) {
+          if (r < 15) {
+            j->eob_run = (1 << r) - 1;
+            if (r)
+              j->eob_run += stbi__jpeg_get_bits(j, r);
+            r = 64; // force end of block
+          } else {
+            // r=15 s=0 should write 16 0s, so we just do
+            // a run of 15 0s and then write s (which is 0),
+            // so we don't have to do anything special here
+          }
+        } else {
+          if (s != 1)
+            return stbi__err("bad huffman code", "Corrupt JPEG");
+          // sign bit
+          if (stbi__jpeg_get_bit(j))
+            s = bit;
+          else
+            s = -bit;
+        }
 
-            // advance by r
-            while (k <= j->spec_end) {
-               short *p = &data[stbi__jpeg_dezigzag[k++]];
-               if (*p != 0) {
-                  if (stbi__jpeg_get_bit(j))
-                     if ((*p & bit)==0) {
-                        if (*p > 0)
-                           *p += bit;
-                        else
-                           *p -= bit;
-                     }
-               } else {
-                  if (r == 0) {
-                     *p = (short) s;
-                     break;
-                  }
-                  --r;
-               }
+        // advance by r
+        while (k <= j->spec_end) {
+          short *p = &data[stbi__jpeg_dezigzag[k++]];
+          if (*p != 0) {
+            if (stbi__jpeg_get_bit(j))
+              if ((*p & bit) == 0) {
+                if (*p > 0)
+                  *p += bit;
+                else
+                  *p -= bit;
+              }
+          } else {
+            if (r == 0) {
+              *p = (short)s;
+              break;
             }
-         } while (k <= j->spec_end);
-      }
-   }
-   return 1;
+            --r;
+          }
+        }
+      } while (k <= j->spec_end);
+    }
+  }
+  return 1;
 }
 
 // take a -128..127 value and stbi__clamp it and convert to 0..255
-stbi_inline static stbi_uc stbi__clamp(int x)
-{
-   // trick to use a single test to catch both cases
-   if ((unsigned int) x > 255) {
-      if (x < 0) return 0;
-      if (x > 255) return 255;
-   }
-   return (stbi_uc) x;
+stbi_inline static stbi_uc stbi__clamp(int x) {
+  // trick to use a single test to catch both cases
+  if ((unsigned int)x > 255) {
+    if (x < 0)
+      return 0;
+    if (x > 255)
+      return 255;
+  }
+  return (stbi_uc)x;
 }
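
The cast to unsigned folds negative inputs into huge values, so a single comparison catches both out-of-range sides; e.g. x = -3 becomes 0xFFFFFFFD, which is > 255, and the inner test then returns 0.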
 
-#define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5)))
-#define stbi__fsh(x)  ((x) * 4096)
+#define stbi__f2f(x) ((int)(((x)*4096 + 0.5)))
+#define stbi__fsh(x) ((x)*4096)
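+// stbi__f2f rounds a float constant into fixed point with 12 fractional
+// bits; stbi__fsh shifts an integer up to the same 1<<12 scale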
 
 // derived from jidctint -- DCT_ISLOW
-#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
-   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
-   p2 = s2;                                    \
-   p3 = s6;                                    \
-   p1 = (p2+p3) * stbi__f2f(0.5411961f);       \
-   t2 = p1 + p3*stbi__f2f(-1.847759065f);      \
-   t3 = p1 + p2*stbi__f2f( 0.765366865f);      \
-   p2 = s0;                                    \
-   p3 = s4;                                    \
-   t0 = stbi__fsh(p2+p3);                      \
-   t1 = stbi__fsh(p2-p3);                      \
-   x0 = t0+t3;                                 \
-   x3 = t0-t3;                                 \
-   x1 = t1+t2;                                 \
-   x2 = t1-t2;                                 \
-   t0 = s7;                                    \
-   t1 = s5;                                    \
-   t2 = s3;                                    \
-   t3 = s1;                                    \
-   p3 = t0+t2;                                 \
-   p4 = t1+t3;                                 \
-   p1 = t0+t3;                                 \
-   p2 = t1+t2;                                 \
-   p5 = (p3+p4)*stbi__f2f( 1.175875602f);      \
-   t0 = t0*stbi__f2f( 0.298631336f);           \
-   t1 = t1*stbi__f2f( 2.053119869f);           \
-   t2 = t2*stbi__f2f( 3.072711026f);           \
-   t3 = t3*stbi__f2f( 1.501321110f);           \
-   p1 = p5 + p1*stbi__f2f(-0.899976223f);      \
-   p2 = p5 + p2*stbi__f2f(-2.562915447f);      \
-   p3 = p3*stbi__f2f(-1.961570560f);           \
-   p4 = p4*stbi__f2f(-0.390180644f);           \
-   t3 += p1+p4;                                \
-   t2 += p2+p3;                                \
-   t1 += p2+p4;                                \
-   t0 += p1+p3;
-
-static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
-{
-   int i,val[64],*v=val;
-   stbi_uc *o;
-   short *d = data;
-
-   // columns
-   for (i=0; i < 8; ++i,++d, ++v) {
-      // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
-      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
-           && d[40]==0 && d[48]==0 && d[56]==0) {
-         //    no shortcut                 0     seconds
-         //    (1|2|3|4|5|6|7)==0          0     seconds
-         //    all separate               -0.047 seconds
-         //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
-         int dcterm = d[0]*4;
-         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
-      } else {
-         STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
-         // constants scaled things up by 1<<12; let's bring them back
-         // down, but keep 2 extra bits of precision
-         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
-         v[ 0] = (x0+t3) >> 10;
-         v[56] = (x0-t3) >> 10;
-         v[ 8] = (x1+t2) >> 10;
-         v[48] = (x1-t2) >> 10;
-         v[16] = (x2+t1) >> 10;
-         v[40] = (x2-t1) >> 10;
-         v[24] = (x3+t0) >> 10;
-         v[32] = (x3-t0) >> 10;
-      }
-   }
-
-   for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) {
-      // no fast case since the first 1D IDCT spread components out
-      STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
-      // constants scaled things up by 1<<12, plus we had 1<<2 from first
-      // loop, plus horizontal and vertical each scale by sqrt(8) so together
-      // we've got an extra 1<<3, so 1<<17 total we need to remove.
-      // so we want to round that, which means adding 0.5 * 1<<17,
-      // aka 65536. Also, we'll end up with -128 to 127 that we want
-      // to encode as 0..255 by adding 128, so we'll add that before the shift
-      x0 += 65536 + (128<<17);
-      x1 += 65536 + (128<<17);
-      x2 += 65536 + (128<<17);
-      x3 += 65536 + (128<<17);
-      // tried computing the shifts into temps, or'ing the temps to see
-      // if any were out of range, but that was slower
-      o[0] = stbi__clamp((x0+t3) >> 17);
-      o[7] = stbi__clamp((x0-t3) >> 17);
-      o[1] = stbi__clamp((x1+t2) >> 17);
-      o[6] = stbi__clamp((x1-t2) >> 17);
-      o[2] = stbi__clamp((x2+t1) >> 17);
-      o[5] = stbi__clamp((x2-t1) >> 17);
-      o[3] = stbi__clamp((x3+t0) >> 17);
-      o[4] = stbi__clamp((x3-t0) >> 17);
-   }
+#define STBI__IDCT_1D(s0, s1, s2, s3, s4, s5, s6, s7)                          \
+  int t0, t1, t2, t3, p1, p2, p3, p4, p5, x0, x1, x2, x3;                      \
+  p2 = s2;                                                                     \
+  p3 = s6;                                                                     \
+  p1 = (p2 + p3) * stbi__f2f(0.5411961f);                                      \
+  t2 = p1 + p3 * stbi__f2f(-1.847759065f);                                     \
+  t3 = p1 + p2 * stbi__f2f(0.765366865f);                                      \
+  p2 = s0;                                                                     \
+  p3 = s4;                                                                     \
+  t0 = stbi__fsh(p2 + p3);                                                     \
+  t1 = stbi__fsh(p2 - p3);                                                     \
+  x0 = t0 + t3;                                                                \
+  x3 = t0 - t3;                                                                \
+  x1 = t1 + t2;                                                                \
+  x2 = t1 - t2;                                                                \
+  t0 = s7;                                                                     \
+  t1 = s5;                                                                     \
+  t2 = s3;                                                                     \
+  t3 = s1;                                                                     \
+  p3 = t0 + t2;                                                                \
+  p4 = t1 + t3;                                                                \
+  p1 = t0 + t3;                                                                \
+  p2 = t1 + t2;                                                                \
+  p5 = (p3 + p4) * stbi__f2f(1.175875602f);                                    \
+  t0 = t0 * stbi__f2f(0.298631336f);                                           \
+  t1 = t1 * stbi__f2f(2.053119869f);                                           \
+  t2 = t2 * stbi__f2f(3.072711026f);                                           \
+  t3 = t3 * stbi__f2f(1.501321110f);                                           \
+  p1 = p5 + p1 * stbi__f2f(-0.899976223f);                                     \
+  p2 = p5 + p2 * stbi__f2f(-2.562915447f);                                     \
+  p3 = p3 * stbi__f2f(-1.961570560f);                                          \
+  p4 = p4 * stbi__f2f(-0.390180644f);                                          \
+  t3 += p1 + p4;                                                               \
+  t2 += p2 + p3;                                                               \
+  t1 += p2 + p4;                                                               \
+  t0 += p1 + p3;
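+// on exit, x0..x3 hold the even-half results and t0..t3 the odd half; the
+// caller forms the eight outputs as sums and differences (x0+t3, x0-t3, ...)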
+
+static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64]) {
+  int i, val[64], *v = val;
+  stbi_uc *o;
+  short *d = data;
+
+  // columns
+  for (i = 0; i < 8; ++i, ++d, ++v) {
+    // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
+    if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 &&
+        d[48] == 0 && d[56] == 0) {
+      //    no shortcut                 0     seconds
+      //    (1|2|3|4|5|6|7)==0          0     seconds
+      //    all separate               -0.047 seconds
+      //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
+      int dcterm = d[0] * 4;
+      v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
+    } else {
+      STBI__IDCT_1D(d[0], d[8], d[16], d[24], d[32], d[40], d[48], d[56])
+      // constants scaled things up by 1<<12; let's bring them back
+      // down, but keep 2 extra bits of precision
+      x0 += 512;
+      x1 += 512;
+      x2 += 512;
+      x3 += 512;
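+      // (+512 is 0.5 at the 1<<10 scale, rounding the shift below)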
+      v[0] = (x0 + t3) >> 10;
+      v[56] = (x0 - t3) >> 10;
+      v[8] = (x1 + t2) >> 10;
+      v[48] = (x1 - t2) >> 10;
+      v[16] = (x2 + t1) >> 10;
+      v[40] = (x2 - t1) >> 10;
+      v[24] = (x3 + t0) >> 10;
+      v[32] = (x3 - t0) >> 10;
+    }
+  }
+
+  for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) {
+    // no fast case since the first 1D IDCT spread components out
+    STBI__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7])
+    // constants scaled things up by 1<<12, plus we had 1<<2 from first
+    // loop, plus horizontal and vertical each scale by sqrt(8) so together
+    // we've got an extra 1<<3, so 1<<17 total we need to remove.
+    // so we want to round that, which means adding 0.5 * 1<<17,
+    // aka 65536. Also, we'll end up with -128 to 127 that we want
+    // to encode as 0..255 by adding 128, so we'll add that before the shift
+    x0 += 65536 + (128 << 17);
+    x1 += 65536 + (128 << 17);
+    x2 += 65536 + (128 << 17);
+    x3 += 65536 + (128 << 17);
+    // tried computing the shifts into temps, or'ing the temps to see
+    // if any were out of range, but that was slower
+    o[0] = stbi__clamp((x0 + t3) >> 17);
+    o[7] = stbi__clamp((x0 - t3) >> 17);
+    o[1] = stbi__clamp((x1 + t2) >> 17);
+    o[6] = stbi__clamp((x1 - t2) >> 17);
+    o[2] = stbi__clamp((x2 + t1) >> 17);
+    o[5] = stbi__clamp((x2 - t1) >> 17);
+    o[3] = stbi__clamp((x3 + t0) >> 17);
+    o[4] = stbi__clamp((x3 - t0) >> 17);
+  }
 }
 
 #ifdef STBI_SSE2
 // sse2 integer IDCT. not the fastest possible implementation but it
 // produces bit-identical results to the generic C version so it's
 // fully "transparent".
-static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
-{
-   // This is constructed to match our regular (generic) integer IDCT exactly.
-   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
-   __m128i tmp;
-
-   // dot product constant: even elems=x, odd elems=y
-   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
-
-   // out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
-   // out(1) = c1[even]*x + c1[odd]*y
-   #define dct_rot(out0,out1, x,y,c0,c1) \
-      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
-      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
-      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
-      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
-      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
-      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
-
-   // out = in << 12  (in 16-bit, out 32-bit)
-   #define dct_widen(out, in) \
-      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
-      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
-
-   // wide add
-   #define dct_wadd(out, a, b) \
-      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
-      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
-
-   // wide sub
-   #define dct_wsub(out, a, b) \
-      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
-      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
-
-   // butterfly a/b, add bias, then shift by "s" and pack
-   #define dct_bfly32o(out0, out1, a,b,bias,s) \
-      { \
-         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
-         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
-         dct_wadd(sum, abiased, b); \
-         dct_wsub(dif, abiased, b); \
-         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
-         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
-      }
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) {
+  // This is constructed to match our regular (generic) integer IDCT exactly.
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+  __m128i tmp;
+
+// dot product constant: even elems=x, odd elems=y
+#define dct_const(x, y) _mm_setr_epi16((x), (y), (x), (y), (x), (y), (x), (y))
+
+// out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
+// out(1) = c1[even]*x + c1[odd]*y
+#define dct_rot(out0, out1, x, y, c0, c1)                                      \
+  __m128i c0##lo = _mm_unpacklo_epi16((x), (y));                               \
+  __m128i c0##hi = _mm_unpackhi_epi16((x), (y));                               \
+  __m128i out0##_l = _mm_madd_epi16(c0##lo, c0);                               \
+  __m128i out0##_h = _mm_madd_epi16(c0##hi, c0);                               \
+  __m128i out1##_l = _mm_madd_epi16(c0##lo, c1);                               \
+  __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
+
+// out = in << 12  (in 16-bit, out 32-bit)
+#define dct_widen(out, in)                                                     \
+  __m128i out##_l =                                                            \
+      _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4);        \
+  __m128i out##_h =                                                            \
+      _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
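+// (unpacking zeros beneath the 16-bit values places them in the high halves
+// of 32-bit lanes, i.e. <<16; the arithmetic >>4 turns that into a signed
+// <<12)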
 
-   // 8-bit interleave step (for transposes)
-   #define dct_interleave8(a, b) \
-      tmp = a; \
-      a = _mm_unpacklo_epi8(a, b); \
-      b = _mm_unpackhi_epi8(tmp, b)
-
-   // 16-bit interleave step (for transposes)
-   #define dct_interleave16(a, b) \
-      tmp = a; \
-      a = _mm_unpacklo_epi16(a, b); \
-      b = _mm_unpackhi_epi16(tmp, b)
-
-   #define dct_pass(bias,shift) \
-      { \
-         /* even part */ \
-         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
-         __m128i sum04 = _mm_add_epi16(row0, row4); \
-         __m128i dif04 = _mm_sub_epi16(row0, row4); \
-         dct_widen(t0e, sum04); \
-         dct_widen(t1e, dif04); \
-         dct_wadd(x0, t0e, t3e); \
-         dct_wsub(x3, t0e, t3e); \
-         dct_wadd(x1, t1e, t2e); \
-         dct_wsub(x2, t1e, t2e); \
-         /* odd part */ \
-         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
-         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
-         __m128i sum17 = _mm_add_epi16(row1, row7); \
-         __m128i sum35 = _mm_add_epi16(row3, row5); \
-         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
-         dct_wadd(x4, y0o, y4o); \
-         dct_wadd(x5, y1o, y5o); \
-         dct_wadd(x6, y2o, y5o); \
-         dct_wadd(x7, y3o, y4o); \
-         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
-         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
-         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
-         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
-      }
-
-   __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
-   __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f));
-   __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f));
-   __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
-   __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f));
-   __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f));
-   __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f));
-   __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f));
-
-   // rounding biases in column/row passes, see stbi__idct_block for explanation.
-   __m128i bias_0 = _mm_set1_epi32(512);
-   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
-
-   // load
-   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
-   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
-   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
-   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
-   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
-   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
-   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
-   row7 = _mm_load_si128((const __m128i *) (data + 7*8));
-
-   // column pass
-   dct_pass(bias_0, 10);
-
-   {
-      // 16bit 8x8 transpose pass 1
-      dct_interleave16(row0, row4);
-      dct_interleave16(row1, row5);
-      dct_interleave16(row2, row6);
-      dct_interleave16(row3, row7);
-
-      // transpose pass 2
-      dct_interleave16(row0, row2);
-      dct_interleave16(row1, row3);
-      dct_interleave16(row4, row6);
-      dct_interleave16(row5, row7);
-
-      // transpose pass 3
-      dct_interleave16(row0, row1);
-      dct_interleave16(row2, row3);
-      dct_interleave16(row4, row5);
-      dct_interleave16(row6, row7);
-   }
-
-   // row pass
-   dct_pass(bias_1, 17);
-
-   {
-      // pack
-      __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
-      __m128i p1 = _mm_packus_epi16(row2, row3);
-      __m128i p2 = _mm_packus_epi16(row4, row5);
-      __m128i p3 = _mm_packus_epi16(row6, row7);
-
-      // 8bit 8x8 transpose pass 1
-      dct_interleave8(p0, p2); // a0e0a1e1...
-      dct_interleave8(p1, p3); // c0g0c1g1...
-
-      // transpose pass 2
-      dct_interleave8(p0, p1); // a0c0e0g0...
-      dct_interleave8(p2, p3); // b0d0f0h0...
-
-      // transpose pass 3
-      dct_interleave8(p0, p2); // a0b0c0d0...
-      dct_interleave8(p1, p3); // a4b4c4d4...
+// wide add
+#define dct_wadd(out, a, b)                                                    \
+  __m128i out##_l = _mm_add_epi32(a##_l, b##_l);                               \
+  __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
 
-      // store
-      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
-   }
+// wide sub
+#define dct_wsub(out, a, b)                                                    \
+  __m128i out##_l = _mm_sub_epi32(a##_l, b##_l);                               \
+  __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
+
+// butterfly a/b, add bias, then shift by "s" and pack
+#define dct_bfly32o(out0, out1, a, b, bias, s)                                 \
+  {                                                                            \
+    __m128i abiased_l = _mm_add_epi32(a##_l, bias);                            \
+    __m128i abiased_h = _mm_add_epi32(a##_h, bias);                            \
+    dct_wadd(sum, abiased, b);                                                 \
+    dct_wsub(dif, abiased, b);                                                 \
+    out0 =                                                                     \
+        _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s));   \
+    out1 =                                                                     \
+        _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s));   \
+  }
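+// (the bias is added to 'a' alone; both the sum and the difference outputs
+// then carry it, which is exactly what the rounding bias requires)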
+
+// 8-bit interleave step (for transposes)
+#define dct_interleave8(a, b)                                                  \
+  tmp = a;                                                                     \
+  a = _mm_unpacklo_epi8(a, b);                                                 \
+  b = _mm_unpackhi_epi8(tmp, b)
+
+// 16-bit interleave step (for transposes)
+#define dct_interleave16(a, b)                                                 \
+  tmp = a;                                                                     \
+  a = _mm_unpacklo_epi16(a, b);                                                \
+  b = _mm_unpackhi_epi16(tmp, b)
+
+#define dct_pass(bias, shift)                                                  \
+  {                                                                            \
+    /* even part */                                                            \
+    dct_rot(t2e, t3e, row2, row6, rot0_0, rot0_1);                             \
+    __m128i sum04 = _mm_add_epi16(row0, row4);                                 \
+    __m128i dif04 = _mm_sub_epi16(row0, row4);                                 \
+    dct_widen(t0e, sum04);                                                     \
+    dct_widen(t1e, dif04);                                                     \
+    dct_wadd(x0, t0e, t3e);                                                    \
+    dct_wsub(x3, t0e, t3e);                                                    \
+    dct_wadd(x1, t1e, t2e);                                                    \
+    dct_wsub(x2, t1e, t2e);                                                    \
+    /* odd part */                                                             \
+    dct_rot(y0o, y2o, row7, row3, rot2_0, rot2_1);                             \
+    dct_rot(y1o, y3o, row5, row1, rot3_0, rot3_1);                             \
+    __m128i sum17 = _mm_add_epi16(row1, row7);                                 \
+    __m128i sum35 = _mm_add_epi16(row3, row5);                                 \
+    dct_rot(y4o, y5o, sum17, sum35, rot1_0, rot1_1);                           \
+    dct_wadd(x4, y0o, y4o);                                                    \
+    dct_wadd(x5, y1o, y5o);                                                    \
+    dct_wadd(x6, y2o, y5o);                                                    \
+    dct_wadd(x7, y3o, y4o);                                                    \
+    dct_bfly32o(row0, row7, x0, x7, bias, shift);                              \
+    dct_bfly32o(row1, row6, x1, x6, bias, shift);                              \
+    dct_bfly32o(row2, row5, x2, x5, bias, shift);                              \
+    dct_bfly32o(row3, row4, x3, x4, bias, shift);                              \
+  }
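+// dct_pass is the same 1D IDCT as STBI__IDCT_1D, vectorized across eight
+// columns at once; it runs once per axis, with a transpose in between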
+
+  __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f),
+                             stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
+  __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f(0.765366865f),
+                             stbi__f2f(0.5411961f));
+  __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f),
+                             stbi__f2f(1.175875602f));
+  __m128i rot1_1 =
+      dct_const(stbi__f2f(1.175875602f),
+                stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
+  __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f(0.298631336f),
+                             stbi__f2f(-1.961570560f));
+  __m128i rot2_1 =
+      dct_const(stbi__f2f(-1.961570560f),
+                stbi__f2f(-1.961570560f) + stbi__f2f(3.072711026f));
+  __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f(2.053119869f),
+                             stbi__f2f(-0.390180644f));
+  __m128i rot3_1 =
+      dct_const(stbi__f2f(-0.390180644f),
+                stbi__f2f(-0.390180644f) + stbi__f2f(1.501321110f));
+
+  // rounding biases in column/row passes, see stbi__idct_block for explanation.
+  __m128i bias_0 = _mm_set1_epi32(512);
+  __m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17));
+
+  // load
+  row0 = _mm_load_si128((const __m128i *)(data + 0 * 8));
+  row1 = _mm_load_si128((const __m128i *)(data + 1 * 8));
+  row2 = _mm_load_si128((const __m128i *)(data + 2 * 8));
+  row3 = _mm_load_si128((const __m128i *)(data + 3 * 8));
+  row4 = _mm_load_si128((const __m128i *)(data + 4 * 8));
+  row5 = _mm_load_si128((const __m128i *)(data + 5 * 8));
+  row6 = _mm_load_si128((const __m128i *)(data + 6 * 8));
+  row7 = _mm_load_si128((const __m128i *)(data + 7 * 8));
+
+  // column pass
+  dct_pass(bias_0, 10);
+
+  {
+    // 16bit 8x8 transpose pass 1
+    dct_interleave16(row0, row4);
+    dct_interleave16(row1, row5);
+    dct_interleave16(row2, row6);
+    dct_interleave16(row3, row7);
+
+    // transpose pass 2
+    dct_interleave16(row0, row2);
+    dct_interleave16(row1, row3);
+    dct_interleave16(row4, row6);
+    dct_interleave16(row5, row7);
+
+    // transpose pass 3
+    dct_interleave16(row0, row1);
+    dct_interleave16(row2, row3);
+    dct_interleave16(row4, row5);
+    dct_interleave16(row6, row7);
+  }
+
+  // row pass
+  dct_pass(bias_1, 17);
+
+  {
+    // pack
+    __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
+    __m128i p1 = _mm_packus_epi16(row2, row3);
+    __m128i p2 = _mm_packus_epi16(row4, row5);
+    __m128i p3 = _mm_packus_epi16(row6, row7);
+
+    // 8bit 8x8 transpose pass 1
+    dct_interleave8(p0, p2); // a0e0a1e1...
+    dct_interleave8(p1, p3); // c0g0c1g1...
+
+    // transpose pass 2
+    dct_interleave8(p0, p1); // a0c0e0g0...
+    dct_interleave8(p2, p3); // b0d0f0h0...
+
+    // transpose pass 3
+    dct_interleave8(p0, p2); // a0b0c0d0...
+    dct_interleave8(p1, p3); // a4b4c4d4...
+
+    // store
+    _mm_storel_epi64((__m128i *)out, p0);
+    out += out_stride;
+    _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p0, 0x4e));
+    out += out_stride;
+    _mm_storel_epi64((__m128i *)out, p2);
+    out += out_stride;
+    _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p2, 0x4e));
+    out += out_stride;
+    _mm_storel_epi64((__m128i *)out, p1);
+    out += out_stride;
+    _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p1, 0x4e));
+    out += out_stride;
+    _mm_storel_epi64((__m128i *)out, p3);
+    out += out_stride;
+    _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p3, 0x4e));
+  }
 
 #undef dct_const
 #undef dct_rot
@@ -2512,198 +2713,236 @@ static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
 
 // NEON integer IDCT. should produce bit-identical
 // results to the generic C version.
-static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
-{
-   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
-
-   int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
-   int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
-   int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f));
-   int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f));
-   int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
-   int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
-   int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
-   int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
-   int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f));
-   int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f));
-   int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f));
-   int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f));
-
-#define dct_long_mul(out, inq, coeff) \
-   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
-   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
-
-#define dct_long_mac(out, acc, inq, coeff) \
-   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
-   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
-
-#define dct_widen(out, inq) \
-   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
-   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) {
+  int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
+
+  int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
+  int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
+  int16x4_t rot0_2 = vdup_n_s16(stbi__f2f(0.765366865f));
+  int16x4_t rot1_0 = vdup_n_s16(stbi__f2f(1.175875602f));
+  int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
+  int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
+  int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
+  int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
+  int16x4_t rot3_0 = vdup_n_s16(stbi__f2f(0.298631336f));
+  int16x4_t rot3_1 = vdup_n_s16(stbi__f2f(2.053119869f));
+  int16x4_t rot3_2 = vdup_n_s16(stbi__f2f(3.072711026f));
+  int16x4_t rot3_3 = vdup_n_s16(stbi__f2f(1.501321110f));
+
+#define dct_long_mul(out, inq, coeff)                                          \
+  int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff);                     \
+  int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
+
+#define dct_long_mac(out, acc, inq, coeff)                                     \
+  int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff);            \
+  int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
+
+#define dct_widen(out, inq)                                                    \
+  int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12);                      \
+  int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
 
 // wide add
-#define dct_wadd(out, a, b) \
-   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
-   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
+#define dct_wadd(out, a, b)                                                    \
+  int32x4_t out##_l = vaddq_s32(a##_l, b##_l);                                 \
+  int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
 
 // wide sub
-#define dct_wsub(out, a, b) \
-   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
-   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
+#define dct_wsub(out, a, b)                                                    \
+  int32x4_t out##_l = vsubq_s32(a##_l, b##_l);                                 \
+  int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
 
 // butterfly a/b, then shift using "shiftop" by "s" and pack
-#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
-   { \
-      dct_wadd(sum, a, b); \
-      dct_wsub(dif, a, b); \
-      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
-      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
-   }
-
-#define dct_pass(shiftop, shift) \
-   { \
-      /* even part */ \
-      int16x8_t sum26 = vaddq_s16(row2, row6); \
-      dct_long_mul(p1e, sum26, rot0_0); \
-      dct_long_mac(t2e, p1e, row6, rot0_1); \
-      dct_long_mac(t3e, p1e, row2, rot0_2); \
-      int16x8_t sum04 = vaddq_s16(row0, row4); \
-      int16x8_t dif04 = vsubq_s16(row0, row4); \
-      dct_widen(t0e, sum04); \
-      dct_widen(t1e, dif04); \
-      dct_wadd(x0, t0e, t3e); \
-      dct_wsub(x3, t0e, t3e); \
-      dct_wadd(x1, t1e, t2e); \
-      dct_wsub(x2, t1e, t2e); \
-      /* odd part */ \
-      int16x8_t sum15 = vaddq_s16(row1, row5); \
-      int16x8_t sum17 = vaddq_s16(row1, row7); \
-      int16x8_t sum35 = vaddq_s16(row3, row5); \
-      int16x8_t sum37 = vaddq_s16(row3, row7); \
-      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
-      dct_long_mul(p5o, sumodd, rot1_0); \
-      dct_long_mac(p1o, p5o, sum17, rot1_1); \
-      dct_long_mac(p2o, p5o, sum35, rot1_2); \
-      dct_long_mul(p3o, sum37, rot2_0); \
-      dct_long_mul(p4o, sum15, rot2_1); \
-      dct_wadd(sump13o, p1o, p3o); \
-      dct_wadd(sump24o, p2o, p4o); \
-      dct_wadd(sump23o, p2o, p3o); \
-      dct_wadd(sump14o, p1o, p4o); \
-      dct_long_mac(x4, sump13o, row7, rot3_0); \
-      dct_long_mac(x5, sump24o, row5, rot3_1); \
-      dct_long_mac(x6, sump23o, row3, rot3_2); \
-      dct_long_mac(x7, sump14o, row1, rot3_3); \
-      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
-      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
-      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
-      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
-   }
-
-   // load
-   row0 = vld1q_s16(data + 0*8);
-   row1 = vld1q_s16(data + 1*8);
-   row2 = vld1q_s16(data + 2*8);
-   row3 = vld1q_s16(data + 3*8);
-   row4 = vld1q_s16(data + 4*8);
-   row5 = vld1q_s16(data + 5*8);
-   row6 = vld1q_s16(data + 6*8);
-   row7 = vld1q_s16(data + 7*8);
-
-   // add DC bias
-   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
-
-   // column pass
-   dct_pass(vrshrn_n_s32, 10);
-
-   // 16bit 8x8 transpose
-   {
+#define dct_bfly32o(out0, out1, a, b, shiftop, s)                              \
+  {                                                                            \
+    dct_wadd(sum, a, b);                                                       \
+    dct_wsub(dif, a, b);                                                       \
+    out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s));                 \
+    out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s));                 \
+  }
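+// shiftop selects the narrowing: vrshrn_n_s32 (rounding) in the column pass,
+// vshrn_n_s32 in the row pass, whose rounding happens later in vqrshrun_n_s16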
+
+#define dct_pass(shiftop, shift)                                               \
+  {                                                                            \
+    /* even part */                                                            \
+    int16x8_t sum26 = vaddq_s16(row2, row6);                                   \
+    dct_long_mul(p1e, sum26, rot0_0);                                          \
+    dct_long_mac(t2e, p1e, row6, rot0_1);                                      \
+    dct_long_mac(t3e, p1e, row2, rot0_2);                                      \
+    int16x8_t sum04 = vaddq_s16(row0, row4);                                   \
+    int16x8_t dif04 = vsubq_s16(row0, row4);                                   \
+    dct_widen(t0e, sum04);                                                     \
+    dct_widen(t1e, dif04);                                                     \
+    dct_wadd(x0, t0e, t3e);                                                    \
+    dct_wsub(x3, t0e, t3e);                                                    \
+    dct_wadd(x1, t1e, t2e);                                                    \
+    dct_wsub(x2, t1e, t2e);                                                    \
+    /* odd part */                                                             \
+    int16x8_t sum15 = vaddq_s16(row1, row5);                                   \
+    int16x8_t sum17 = vaddq_s16(row1, row7);                                   \
+    int16x8_t sum35 = vaddq_s16(row3, row5);                                   \
+    int16x8_t sum37 = vaddq_s16(row3, row7);                                   \
+    int16x8_t sumodd = vaddq_s16(sum17, sum35);                                \
+    dct_long_mul(p5o, sumodd, rot1_0);                                         \
+    dct_long_mac(p1o, p5o, sum17, rot1_1);                                     \
+    dct_long_mac(p2o, p5o, sum35, rot1_2);                                     \
+    dct_long_mul(p3o, sum37, rot2_0);                                          \
+    dct_long_mul(p4o, sum15, rot2_1);                                          \
+    dct_wadd(sump13o, p1o, p3o);                                               \
+    dct_wadd(sump24o, p2o, p4o);                                               \
+    dct_wadd(sump23o, p2o, p3o);                                               \
+    dct_wadd(sump14o, p1o, p4o);                                               \
+    dct_long_mac(x4, sump13o, row7, rot3_0);                                   \
+    dct_long_mac(x5, sump24o, row5, rot3_1);                                   \
+    dct_long_mac(x6, sump23o, row3, rot3_2);                                   \
+    dct_long_mac(x7, sump14o, row1, rot3_3);                                   \
+    dct_bfly32o(row0, row7, x0, x7, shiftop, shift);                           \
+    dct_bfly32o(row1, row6, x1, x6, shiftop, shift);                           \
+    dct_bfly32o(row2, row5, x2, x5, shiftop, shift);                           \
+    dct_bfly32o(row3, row4, x3, x4, shiftop, shift);                           \
+  }
+
+  // load
+  row0 = vld1q_s16(data + 0 * 8);
+  row1 = vld1q_s16(data + 1 * 8);
+  row2 = vld1q_s16(data + 2 * 8);
+  row3 = vld1q_s16(data + 3 * 8);
+  row4 = vld1q_s16(data + 4 * 8);
+  row5 = vld1q_s16(data + 5 * 8);
+  row6 = vld1q_s16(data + 6 * 8);
+  row7 = vld1q_s16(data + 7 * 8);
+
+  // add DC bias
+  row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
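+  // the two 1D passes scale by 1/8 overall, so adding 1024 to the DC term
+  // adds 1024/8 = 128 to every output pixel: the -128..127 -> 0..255 bias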
+
+  // column pass
+  dct_pass(vrshrn_n_s32, 10);
+
+  // 16bit 8x8 transpose
+  {
 // these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
 // whether compilers actually get this is another story, sadly.
-#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
-#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
-#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
-
-      // pass 1
-      dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
-      dct_trn16(row2, row3);
-      dct_trn16(row4, row5);
-      dct_trn16(row6, row7);
-
-      // pass 2
-      dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
-      dct_trn32(row1, row3);
-      dct_trn32(row4, row6);
-      dct_trn32(row5, row7);
-
-      // pass 3
-      dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
-      dct_trn64(row1, row5);
-      dct_trn64(row2, row6);
-      dct_trn64(row3, row7);
+#define dct_trn16(x, y)                                                        \
+  {                                                                            \
+    int16x8x2_t t = vtrnq_s16(x, y);                                           \
+    x = t.val[0];                                                              \
+    y = t.val[1];                                                              \
+  }
+#define dct_trn32(x, y)                                                        \
+  {                                                                            \
+    int32x4x2_t t =                                                            \
+        vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y));         \
+    x = vreinterpretq_s16_s32(t.val[0]);                                       \
+    y = vreinterpretq_s16_s32(t.val[1]);                                       \
+  }
+#define dct_trn64(x, y)                                                        \
+  {                                                                            \
+    int16x8_t x0 = x;                                                          \
+    int16x8_t y0 = y;                                                          \
+    x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0));                      \
+    y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0));                    \
+  }
+
+    // pass 1
+    dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
+    dct_trn16(row2, row3);
+    dct_trn16(row4, row5);
+    dct_trn16(row6, row7);
+
+    // pass 2
+    dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
+    dct_trn32(row1, row3);
+    dct_trn32(row4, row6);
+    dct_trn32(row5, row7);
+
+    // pass 3
+    dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
+    dct_trn64(row1, row5);
+    dct_trn64(row2, row6);
+    dct_trn64(row3, row7);
 
 #undef dct_trn16
 #undef dct_trn32
 #undef dct_trn64
-   }
-
-   // row pass
-   // vrshrn_n_s32 only supports shifts up to 16, we need
-   // 17. so do a non-rounding shift of 16 first then follow
-   // up with a rounding shift by 1.
-   dct_pass(vshrn_n_s32, 16);
-
-   {
-      // pack and round
-      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
-      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
-      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
-      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
-      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
-      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
-      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
-      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
-
-      // again, these can translate into one instruction, but often don't.
-#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
-#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
-#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
-
-      // sadly can't use interleaved stores here since we only write
-      // 8 bytes to each scan line!
-
-      // 8x8 8-bit transpose pass 1
-      dct_trn8_8(p0, p1);
-      dct_trn8_8(p2, p3);
-      dct_trn8_8(p4, p5);
-      dct_trn8_8(p6, p7);
-
-      // pass 2
-      dct_trn8_16(p0, p2);
-      dct_trn8_16(p1, p3);
-      dct_trn8_16(p4, p6);
-      dct_trn8_16(p5, p7);
-
-      // pass 3
-      dct_trn8_32(p0, p4);
-      dct_trn8_32(p1, p5);
-      dct_trn8_32(p2, p6);
-      dct_trn8_32(p3, p7);
-
-      // store
-      vst1_u8(out, p0); out += out_stride;
-      vst1_u8(out, p1); out += out_stride;
-      vst1_u8(out, p2); out += out_stride;
-      vst1_u8(out, p3); out += out_stride;
-      vst1_u8(out, p4); out += out_stride;
-      vst1_u8(out, p5); out += out_stride;
-      vst1_u8(out, p6); out += out_stride;
-      vst1_u8(out, p7);
+  }
+
+  // row pass
+  // vrshrn_n_s32 only supports shifts up to 16, we need
+  // 17. so do a non-rounding shift of 16 first then follow
+  // up with a rounding shift by 1.
+  dct_pass(vshrn_n_s32, 16);
+
+  {
+    // pack and round
+    uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
+    uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
+    uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
+    uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
+    uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
+    uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
+    uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
+    uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
+
+    // again, these can translate into one instruction, but often don't.
+#define dct_trn8_8(x, y)                                                       \
+  {                                                                            \
+    uint8x8x2_t t = vtrn_u8(x, y);                                             \
+    x = t.val[0];                                                              \
+    y = t.val[1];                                                              \
+  }
+#define dct_trn8_16(x, y)                                                      \
+  {                                                                            \
+    uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); \
+    x = vreinterpret_u8_u16(t.val[0]);                                         \
+    y = vreinterpret_u8_u16(t.val[1]);                                         \
+  }
+#define dct_trn8_32(x, y)                                                      \
+  {                                                                            \
+    uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); \
+    x = vreinterpret_u8_u32(t.val[0]);                                         \
+    y = vreinterpret_u8_u32(t.val[1]);                                         \
+  }
+
+    // sadly can't use interleaved stores here since we only write
+    // 8 bytes to each scan line!
+
+    // 8x8 8-bit transpose pass 1
+    dct_trn8_8(p0, p1);
+    dct_trn8_8(p2, p3);
+    dct_trn8_8(p4, p5);
+    dct_trn8_8(p6, p7);
+
+    // pass 2
+    dct_trn8_16(p0, p2);
+    dct_trn8_16(p1, p3);
+    dct_trn8_16(p4, p6);
+    dct_trn8_16(p5, p7);
+
+    // pass 3
+    dct_trn8_32(p0, p4);
+    dct_trn8_32(p1, p5);
+    dct_trn8_32(p2, p6);
+    dct_trn8_32(p3, p7);
+
+    // store
+    vst1_u8(out, p0);
+    out += out_stride;
+    vst1_u8(out, p1);
+    out += out_stride;
+    vst1_u8(out, p2);
+    out += out_stride;
+    vst1_u8(out, p3);
+    out += out_stride;
+    vst1_u8(out, p4);
+    out += out_stride;
+    vst1_u8(out, p5);
+    out += out_stride;
+    vst1_u8(out, p6);
+    out += out_stride;
+    vst1_u8(out, p7);
 
 #undef dct_trn8_8
 #undef dct_trn8_16
 #undef dct_trn8_32
-   }
+  }
 
 #undef dct_long_mul
 #undef dct_long_mac
@@ -2716,1130 +2955,1270 @@ static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
 
 #endif // STBI_NEON
 
-#define STBI__MARKER_none  0xff
+#define STBI__MARKER_none 0xff
 // if there's a pending marker from the entropy stream, return that
 // otherwise, fetch from the stream and get a marker. if there's no
 // marker, return 0xff, which is never a valid marker value
-static stbi_uc stbi__get_marker(stbi__jpeg *j)
-{
-   stbi_uc x;
-   if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; }
-   x = stbi__get8(j->s);
-   if (x != 0xff) return STBI__MARKER_none;
-   while (x == 0xff)
-      x = stbi__get8(j->s); // consume repeated 0xff fill bytes
-   return x;
+static stbi_uc stbi__get_marker(stbi__jpeg *j) {
+  stbi_uc x;
+  if (j->marker != STBI__MARKER_none) {
+    x = j->marker;
+    j->marker = STBI__MARKER_none;
+    return x;
+  }
+  x = stbi__get8(j->s);
+  if (x != 0xff)
+    return STBI__MARKER_none;
+  while (x == 0xff)
+    x = stbi__get8(j->s); // consume repeated 0xff fill bytes
+  return x;
 }
 
 // in each scan, we'll have scan_n components, and the order
 // of the components is specified by order[]
-#define STBI__RESTART(x)     ((x) >= 0xd0 && (x) <= 0xd7)
+#define STBI__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7)
 
 // after a restart interval, stbi__jpeg_reset the entropy decoder and
 // the dc prediction
-static void stbi__jpeg_reset(stbi__jpeg *j)
-{
-   j->code_bits = 0;
-   j->code_buffer = 0;
-   j->nomore = 0;
-   j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0;
-   j->marker = STBI__MARKER_none;
-   j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
-   j->eob_run = 0;
-   // no more than 1<<31 MCUs if no restart_interal? that's plenty safe,
-   // since we don't even allow 1<<30 pixels
-}
-
-static int stbi__parse_entropy_coded_data(stbi__jpeg *z)
-{
-   stbi__jpeg_reset(z);
-   if (!z->progressive) {
-      if (z->scan_n == 1) {
-         int i,j;
-         STBI_SIMD_ALIGN(short, data[64]);
-         int n = z->order[0];
-         // non-interleaved data, we just need to process one block at a time,
-         // in trivial scanline order
-         // number of blocks to do just depends on how many actual "pixels" this
-         // component has, independent of interleaved MCU blocking and such
-         int w = (z->img_comp[n].x+7) >> 3;
-         int h = (z->img_comp[n].y+7) >> 3;
-         for (j=0; j < h; ++j) {
-            for (i=0; i < w; ++i) {
-               int ha = z->img_comp[n].ha;
-               if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
-               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
-               // every data block is an MCU, so countdown the restart interval
-               if (--z->todo <= 0) {
-                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
-                  // if it's NOT a restart, then just bail, so we get corrupt data
-                  // rather than no data
-                  if (!STBI__RESTART(z->marker)) return 1;
-                  stbi__jpeg_reset(z);
-               }
-            }
-         }
-         return 1;
-      } else { // interleaved
-         int i,j,k,x,y;
-         STBI_SIMD_ALIGN(short, data[64]);
-         for (j=0; j < z->img_mcu_y; ++j) {
-            for (i=0; i < z->img_mcu_x; ++i) {
-               // scan an interleaved mcu... process scan_n components in order
-               for (k=0; k < z->scan_n; ++k) {
-                  int n = z->order[k];
-                  // scan out an mcu's worth of this component; that's just determined
-                  // by the basic H and V specified for the component
-                  for (y=0; y < z->img_comp[n].v; ++y) {
-                     for (x=0; x < z->img_comp[n].h; ++x) {
-                        int x2 = (i*z->img_comp[n].h + x)*8;
-                        int y2 = (j*z->img_comp[n].v + y)*8;
-                        int ha = z->img_comp[n].ha;
-                        if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
-                        z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data);
-                     }
-                  }
-               }
-               // after all interleaved components, that's an interleaved MCU,
-               // so now count down the restart interval
-               if (--z->todo <= 0) {
-                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
-                  if (!STBI__RESTART(z->marker)) return 1;
-                  stbi__jpeg_reset(z);
-               }
-            }
-         }
-         return 1;
+static void stbi__jpeg_reset(stbi__jpeg *j) {
+  j->code_bits = 0;
+  j->code_buffer = 0;
+  j->nomore = 0;
+  j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred =
+      j->img_comp[3].dc_pred = 0;
+  j->marker = STBI__MARKER_none;
+  j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
+  j->eob_run = 0;
+  // no more than 1<<31 MCUs if no restart_interval? that's plenty safe,
+  // since we don't even allow 1<<30 pixels
+}
+
+static int stbi__parse_entropy_coded_data(stbi__jpeg *z) {
+  stbi__jpeg_reset(z);
+  if (!z->progressive) {
+    if (z->scan_n == 1) {
+      int i, j;
+      STBI_SIMD_ALIGN(short, data[64]);
+      int n = z->order[0];
+      // non-interleaved data: we just need to process one block at a time,
+      // in trivial scanline order; the number of blocks to do depends only
+      // on how many actual "pixels" this component has, independent of
+      // interleaved MCU blocking and such
+      int w = (z->img_comp[n].x + 7) >> 3;
+      int h = (z->img_comp[n].y + 7) >> 3;
+      for (j = 0; j < h; ++j) {
+        for (i = 0; i < w; ++i) {
+          int ha = z->img_comp[n].ha;
+          if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd,
+                                       z->huff_ac + ha, z->fast_ac[ha], n,
+                                       z->dequant[z->img_comp[n].tq]))
+            return 0;
+          z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 +
+                                   i * 8,
+                               z->img_comp[n].w2, data);
+          // every data block is an MCU, so count down the restart interval
+          if (--z->todo <= 0) {
+            if (z->code_bits < 24)
+              stbi__grow_buffer_unsafe(z);
+            // if it's NOT a restart, then just bail, so we get corrupt data
+            // rather than no data
+            if (!STBI__RESTART(z->marker))
+              return 1;
+            stbi__jpeg_reset(z);
+          }
+        }
       }
-   } else {
-      if (z->scan_n == 1) {
-         int i,j;
-         int n = z->order[0];
-         // non-interleaved data, we just need to process one block at a time,
-         // in trivial scanline order
-         // number of blocks to do just depends on how many actual "pixels" this
-         // component has, independent of interleaved MCU blocking and such
-         int w = (z->img_comp[n].x+7) >> 3;
-         int h = (z->img_comp[n].y+7) >> 3;
-         for (j=0; j < h; ++j) {
-            for (i=0; i < w; ++i) {
-               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
-               if (z->spec_start == 0) {
-                  if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
-                     return 0;
-               } else {
-                  int ha = z->img_comp[n].ha;
-                  if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
-                     return 0;
-               }
-               // every data block is an MCU, so countdown the restart interval
-               if (--z->todo <= 0) {
-                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
-                  if (!STBI__RESTART(z->marker)) return 1;
-                  stbi__jpeg_reset(z);
-               }
-            }
-         }
-         return 1;
-      } else { // interleaved
-         int i,j,k,x,y;
-         for (j=0; j < z->img_mcu_y; ++j) {
-            for (i=0; i < z->img_mcu_x; ++i) {
-               // scan an interleaved mcu... process scan_n components in order
-               for (k=0; k < z->scan_n; ++k) {
-                  int n = z->order[k];
-                  // scan out an mcu's worth of this component; that's just determined
-                  // by the basic H and V specified for the component
-                  for (y=0; y < z->img_comp[n].v; ++y) {
-                     for (x=0; x < z->img_comp[n].h; ++x) {
-                        int x2 = (i*z->img_comp[n].h + x);
-                        int y2 = (j*z->img_comp[n].v + y);
-                        short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
-                        if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
-                           return 0;
-                     }
-                  }
-               }
-               // after all interleaved components, that's an interleaved MCU,
-               // so now count down the restart interval
-               if (--z->todo <= 0) {
-                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
-                  if (!STBI__RESTART(z->marker)) return 1;
-                  stbi__jpeg_reset(z);
-               }
+      return 1;
+    } else { // interleaved
+      int i, j, k, x, y;
+      STBI_SIMD_ALIGN(short, data[64]);
+      for (j = 0; j < z->img_mcu_y; ++j) {
+        for (i = 0; i < z->img_mcu_x; ++i) {
+          // scan an interleaved mcu... process scan_n components in order
+          for (k = 0; k < z->scan_n; ++k) {
+            int n = z->order[k];
+            // scan out an mcu's worth of this component; that's just determined
+            // by the basic H and V specified for the component
+            for (y = 0; y < z->img_comp[n].v; ++y) {
+              for (x = 0; x < z->img_comp[n].h; ++x) {
+                int x2 = (i * z->img_comp[n].h + x) * 8;
+                int y2 = (j * z->img_comp[n].v + y) * 8;
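+                // (x2, y2) = top-left pixel of this block within the
+                // component's plane; w2 is the plane's row stride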
+                int ha = z->img_comp[n].ha;
+                if (!stbi__jpeg_decode_block(z, data,
+                                             z->huff_dc + z->img_comp[n].hd,
+                                             z->huff_ac + ha, z->fast_ac[ha], n,
+                                             z->dequant[z->img_comp[n].tq]))
+                  return 0;
+                z->idct_block_kernel(z->img_comp[n].data +
+                                         z->img_comp[n].w2 * y2 + x2,
+                                     z->img_comp[n].w2, data);
+              }
             }
-         }
-         return 1;
+          }
+          // after all interleaved components, that's an interleaved MCU,
+          // so now count down the restart interval
+          if (--z->todo <= 0) {
+            if (z->code_bits < 24)
+              stbi__grow_buffer_unsafe(z);
+            if (!STBI__RESTART(z->marker))
+              return 1;
+            stbi__jpeg_reset(z);
+          }
+        }
       }
-   }
-}
-
-static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant)
-{
-   int i;
-   for (i=0; i < 64; ++i)
-      data[i] *= dequant[i];
-}
-
-static void stbi__jpeg_finish(stbi__jpeg *z)
-{
-   if (z->progressive) {
-      // dequantize and idct the data
-      int i,j,n;
-      for (n=0; n < z->s->img_n; ++n) {
-         int w = (z->img_comp[n].x+7) >> 3;
-         int h = (z->img_comp[n].y+7) >> 3;
-         for (j=0; j < h; ++j) {
-            for (i=0; i < w; ++i) {
-               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
-               stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
-               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
-            }
-         }
+      return 1;
+    }
+  } else {
+    if (z->scan_n == 1) {
+      int i, j;
+      int n = z->order[0];
+      // non-interleaved data: we just need to process one block at a time,
+      // in trivial scanline order; the number of blocks to do depends only
+      // on how many actual "pixels" this component has, independent of
+      // interleaved MCU blocking and such
+      int w = (z->img_comp[n].x + 7) >> 3;
+      int h = (z->img_comp[n].y + 7) >> 3;
+      for (j = 0; j < h; ++j) {
+        for (i = 0; i < w; ++i) {
+          short *data =
+              z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
+          if (z->spec_start == 0) {
+            if (!stbi__jpeg_decode_block_prog_dc(
+                    z, data, &z->huff_dc[z->img_comp[n].hd], n))
+              return 0;
+          } else {
+            int ha = z->img_comp[n].ha;
+            if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha],
+                                                 z->fast_ac[ha]))
+              return 0;
+          }
+          // every data block is an MCU, so count down the restart interval
+          if (--z->todo <= 0) {
+            if (z->code_bits < 24)
+              stbi__grow_buffer_unsafe(z);
+            if (!STBI__RESTART(z->marker))
+              return 1;
+            stbi__jpeg_reset(z);
+          }
+        }
       }
-   }
-}
-
-static int stbi__process_marker(stbi__jpeg *z, int m)
-{
-   int L;
-   switch (m) {
-      case STBI__MARKER_none: // no marker found
-         return stbi__err("expected marker","Corrupt JPEG");
-
-      case 0xDD: // DRI - specify restart interval
-         if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG");
-         z->restart_interval = stbi__get16be(z->s);
-         return 1;
-
-      case 0xDB: // DQT - define quantization table
-         L = stbi__get16be(z->s)-2;
-         while (L > 0) {
-            int q = stbi__get8(z->s);
-            int p = q >> 4, sixteen = (p != 0);
-            int t = q & 15,i;
-            if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG");
-            if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG");
-
-            for (i=0; i < 64; ++i)
-               z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
-            L -= (sixteen ? 129 : 65);
-         }
-         return L==0;
-
-      case 0xC4: // DHT - define huffman table
-         L = stbi__get16be(z->s)-2;
-         while (L > 0) {
-            stbi_uc *v;
-            int sizes[16],i,n=0;
-            int q = stbi__get8(z->s);
-            int tc = q >> 4;
-            int th = q & 15;
-            if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG");
-            for (i=0; i < 16; ++i) {
-               sizes[i] = stbi__get8(z->s);
-               n += sizes[i];
-            }
-            L -= 17;
-            if (tc == 0) {
-               if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0;
-               v = z->huff_dc[th].values;
-            } else {
-               if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0;
-               v = z->huff_ac[th].values;
+      return 1;
+    } else { // interleaved
+      int i, j, k, x, y;
+      for (j = 0; j < z->img_mcu_y; ++j) {
+        for (i = 0; i < z->img_mcu_x; ++i) {
+          // scan an interleaved mcu... process scan_n components in order
+          for (k = 0; k < z->scan_n; ++k) {
+            int n = z->order[k];
+            // scan out an mcu's worth of this component; that's just determined
+            // by the basic H and V specified for the component
+            for (y = 0; y < z->img_comp[n].v; ++y) {
+              for (x = 0; x < z->img_comp[n].h; ++x) {
+                int x2 = (i * z->img_comp[n].h + x);
+                int y2 = (j * z->img_comp[n].v + y);
+                short *data = z->img_comp[n].coeff +
+                              64 * (x2 + y2 * z->img_comp[n].coeff_w);
+                if (!stbi__jpeg_decode_block_prog_dc(
+                        z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                  return 0;
+              }
             }
-            for (i=0; i < n; ++i)
-               v[i] = stbi__get8(z->s);
-            if (tc != 0)
-               stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
-            L -= n;
-         }
-         return L==0;
-   }
-
-   // check for comment block or APP blocks
-   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
-      L = stbi__get16be(z->s);
-      if (L < 2) {
-         if (m == 0xFE)
-            return stbi__err("bad COM len","Corrupt JPEG");
-         else
-            return stbi__err("bad APP len","Corrupt JPEG");
+          }
+          // after all interleaved components, that's an interleaved MCU,
+          // so now count down the restart interval
+          if (--z->todo <= 0) {
+            if (z->code_bits < 24)
+              stbi__grow_buffer_unsafe(z);
+            if (!STBI__RESTART(z->marker))
+              return 1;
+            stbi__jpeg_reset(z);
+          }
+        }
       }
-      L -= 2;
-
-      if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
-         static const unsigned char tag[5] = {'J','F','I','F','\0'};
-         int ok = 1;
-         int i;
-         for (i=0; i < 5; ++i)
-            if (stbi__get8(z->s) != tag[i])
-               ok = 0;
-         L -= 5;
-         if (ok)
-            z->jfif = 1;
-      } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
-         static const unsigned char tag[6] = {'A','d','o','b','e','\0'};
-         int ok = 1;
-         int i;
-         for (i=0; i < 6; ++i)
-            if (stbi__get8(z->s) != tag[i])
-               ok = 0;
-         L -= 6;
-         if (ok) {
-            stbi__get8(z->s); // version
-            stbi__get16be(z->s); // flags0
-            stbi__get16be(z->s); // flags1
-            z->app14_color_transform = stbi__get8(z->s); // color transform
-            L -= 6;
-         }
+      return 1;
+    }
+  }
+}
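+
+// MCU layout sketch for the interleaved paths above (illustrative): each
+// interleaved MCU carries h*v blocks of every scan component, in scan
+// order, so a 4:2:0 image (Y h=v=2, Cb/Cr h=v=1) decodes six blocks per
+// MCU: Y Y Y Y Cb Cr. A non-interleaved scan (scan_n == 1) instead walks
+// one component's blocks in plain raster order.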
+
+static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant) {
+  int i;
+  for (i = 0; i < 64; ++i)
+    data[i] *= dequant[i];
+}
+
+static void stbi__jpeg_finish(stbi__jpeg *z) {
+  if (z->progressive) {
+    // dequantize and idct the data
+    int i, j, n;
+    for (n = 0; n < z->s->img_n; ++n) {
+      int w = (z->img_comp[n].x + 7) >> 3;
+      int h = (z->img_comp[n].y + 7) >> 3;
+      for (j = 0; j < h; ++j) {
+        for (i = 0; i < w; ++i) {
+          short *data =
+              z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
+          stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
+          z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 +
+                                   i * 8,
+                               z->img_comp[n].w2, data);
+        }
       }
+    }
+  }
+}
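+
+// Worked example for the block walk above (illustrative): a component 33
+// pixels wide spans w = (33+7)>>3 = 5 blocks, i.e. ceil(33/8). Block (i,j)
+// lives at coeff + 64*(i + j*coeff_w); dequantization scales each of its
+// 64 coefficients in place, and the IDCT then writes an 8x8 pixel patch at
+// data + w2*(j*8) + i*8.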
 
-      stbi__skip(z->s, L);
-      return 1;
-   }
+static int stbi__process_marker(stbi__jpeg *z, int m) {
+  int L;
+  switch (m) {
+  case STBI__MARKER_none: // no marker found
+    return stbi__err("expected marker", "Corrupt JPEG");
 
-   return stbi__err("unknown marker","Corrupt JPEG");
-}
+  case 0xDD: // DRI - specify restart interval
+    if (stbi__get16be(z->s) != 4)
+      return stbi__err("bad DRI len", "Corrupt JPEG");
+    z->restart_interval = stbi__get16be(z->s);
+    return 1;
 
-// after we see SOS
-static int stbi__process_scan_header(stbi__jpeg *z)
-{
-   int i;
-   int Ls = stbi__get16be(z->s);
-   z->scan_n = stbi__get8(z->s);
-   if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG");
-   if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG");
-   for (i=0; i < z->scan_n; ++i) {
-      int id = stbi__get8(z->s), which;
+  case 0xDB: // DQT - define quantization table
+    L = stbi__get16be(z->s) - 2;
+    while (L > 0) {
       int q = stbi__get8(z->s);
-      for (which = 0; which < z->s->img_n; ++which)
-         if (z->img_comp[which].id == id)
-            break;
-      if (which == z->s->img_n) return 0; // no match
-      z->img_comp[which].hd = q >> 4;   if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG");
-      z->img_comp[which].ha = q & 15;   if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG");
-      z->order[i] = which;
-   }
-
-   {
-      int aa;
-      z->spec_start = stbi__get8(z->s);
-      z->spec_end   = stbi__get8(z->s); // should be 63, but might be 0
-      aa = stbi__get8(z->s);
-      z->succ_high = (aa >> 4);
-      z->succ_low  = (aa & 15);
-      if (z->progressive) {
-         if (z->spec_start > 63 || z->spec_end > 63  || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
-            return stbi__err("bad SOS", "Corrupt JPEG");
+      int p = q >> 4, sixteen = (p != 0);
+      int t = q & 15, i;
+      if (p != 0 && p != 1)
+        return stbi__err("bad DQT type", "Corrupt JPEG");
+      if (t > 3)
+        return stbi__err("bad DQT table", "Corrupt JPEG");
+
+      for (i = 0; i < 64; ++i)
+        z->dequant[t][stbi__jpeg_dezigzag[i]] =
+            (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
+      L -= (sixteen ? 129 : 65);
+    }
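+    // Length accounting sketch (illustrative): each table costs 1
+    // descriptor byte plus 64 entries of 1 byte (8-bit) or 2 bytes
+    // (16-bit), i.e. 65 or 129 bytes, so a well-formed DQT segment
+    // drains L exactly to 0 here.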
+    return L == 0;
+
+  case 0xC4: // DHT - define huffman table
+    L = stbi__get16be(z->s) - 2;
+    while (L > 0) {
+      stbi_uc *v;
+      int sizes[16], i, n = 0;
+      int q = stbi__get8(z->s);
+      int tc = q >> 4;
+      int th = q & 15;
+      if (tc > 1 || th > 3)
+        return stbi__err("bad DHT header", "Corrupt JPEG");
+      for (i = 0; i < 16; ++i) {
+        sizes[i] = stbi__get8(z->s);
+        n += sizes[i];
+      }
+      L -= 17;
+      if (tc == 0) {
+        if (!stbi__build_huffman(z->huff_dc + th, sizes))
+          return 0;
+        v = z->huff_dc[th].values;
       } else {
-         if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG");
-         if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG");
-         z->spec_end = 63;
+        if (!stbi__build_huffman(z->huff_ac + th, sizes))
+          return 0;
+        v = z->huff_ac[th].values;
+      }
+      for (i = 0; i < n; ++i)
+        v[i] = stbi__get8(z->s);
+      if (tc != 0)
+        stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
+      L -= n;
+    }
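+    // Same bookkeeping sketch for DHT (illustrative): 1 descriptor byte +
+    // 16 size bytes = 17, then n = sum(sizes) code values; e.g. the
+    // standard baseline luma DC table has n = 12, costing 17 + 12 = 29
+    // bytes of L per table.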
+    return L == 0;
+  }
+
+  // check for comment block or APP blocks
+  if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
+    L = stbi__get16be(z->s);
+    if (L < 2) {
+      if (m == 0xFE)
+        return stbi__err("bad COM len", "Corrupt JPEG");
+      else
+        return stbi__err("bad APP len", "Corrupt JPEG");
+    }
+    L -= 2;
+
+    if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
+      static const unsigned char tag[5] = {'J', 'F', 'I', 'F', '\0'};
+      int ok = 1;
+      int i;
+      for (i = 0; i < 5; ++i)
+        if (stbi__get8(z->s) != tag[i])
+          ok = 0;
+      L -= 5;
+      if (ok)
+        z->jfif = 1;
+    } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
+      static const unsigned char tag[6] = {'A', 'd', 'o', 'b', 'e', '\0'};
+      int ok = 1;
+      int i;
+      for (i = 0; i < 6; ++i)
+        if (stbi__get8(z->s) != tag[i])
+          ok = 0;
+      L -= 6;
+      if (ok) {
+        stbi__get8(z->s);                            // version
+        stbi__get16be(z->s);                         // flags0
+        stbi__get16be(z->s);                         // flags1
+        z->app14_color_transform = stbi__get8(z->s); // color transform
+        L -= 6;
       }
-   }
+    }
+
+    stbi__skip(z->s, L);
+    return 1;
+  }
 
-   return 1;
+  return stbi__err("unknown marker", "Corrupt JPEG");
 }
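+
+// DRI usage sketch (illustrative, with a made-up payload): a DRI segment
+// is two length bytes (which must read back as 4) plus a 16-bit MCU count,
+// so a payload of 00 04 00 20 sets restart_interval = 32, and the entropy
+// decoder's todo countdown then expects a restart marker after every 32
+// MCUs.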
 
-static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why)
-{
-   int i;
-   for (i=0; i < ncomp; ++i) {
-      if (z->img_comp[i].raw_data) {
-         STBI_FREE(z->img_comp[i].raw_data);
-         z->img_comp[i].raw_data = NULL;
-         z->img_comp[i].data = NULL;
-      }
-      if (z->img_comp[i].raw_coeff) {
-         STBI_FREE(z->img_comp[i].raw_coeff);
-         z->img_comp[i].raw_coeff = 0;
-         z->img_comp[i].coeff = 0;
-      }
-      if (z->img_comp[i].linebuf) {
-         STBI_FREE(z->img_comp[i].linebuf);
-         z->img_comp[i].linebuf = NULL;
-      }
-   }
-   return why;
+// after we see SOS
+static int stbi__process_scan_header(stbi__jpeg *z) {
+  int i;
+  int Ls = stbi__get16be(z->s);
+  z->scan_n = stbi__get8(z->s);
+  if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int)z->s->img_n)
+    return stbi__err("bad SOS component count", "Corrupt JPEG");
+  if (Ls != 6 + 2 * z->scan_n)
+    return stbi__err("bad SOS len", "Corrupt JPEG");
+  for (i = 0; i < z->scan_n; ++i) {
+    int id = stbi__get8(z->s), which;
+    int q = stbi__get8(z->s);
+    for (which = 0; which < z->s->img_n; ++which)
+      if (z->img_comp[which].id == id)
+        break;
+    if (which == z->s->img_n)
+      return 0; // no match
+    z->img_comp[which].hd = q >> 4;
+    if (z->img_comp[which].hd > 3)
+      return stbi__err("bad DC huff", "Corrupt JPEG");
+    z->img_comp[which].ha = q & 15;
+    if (z->img_comp[which].ha > 3)
+      return stbi__err("bad AC huff", "Corrupt JPEG");
+    z->order[i] = which;
+  }
+
+  {
+    int aa;
+    z->spec_start = stbi__get8(z->s);
+    z->spec_end = stbi__get8(z->s); // should be 63, but might be 0
+    aa = stbi__get8(z->s);
+    z->succ_high = (aa >> 4);
+    z->succ_low = (aa & 15);
+    if (z->progressive) {
+      if (z->spec_start > 63 || z->spec_end > 63 ||
+          z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
+        return stbi__err("bad SOS", "Corrupt JPEG");
+    } else {
+      if (z->spec_start != 0)
+        return stbi__err("bad SOS", "Corrupt JPEG");
+      if (z->succ_high != 0 || z->succ_low != 0)
+        return stbi__err("bad SOS", "Corrupt JPEG");
+      z->spec_end = 63;
+    }
+  }
+
+  return 1;
 }
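+
+// Sketch of the SOS tail parsed above (illustrative): the last three bytes
+// are Ss (spec_start), Se (spec_end) and Ah/Al packed into one byte. A
+// first progressive DC scan typically sends Ss=0, Se=0, aa=0x01
+// (succ_high=0, succ_low=1); its refinement scan sends aa=0x10. Baseline
+// scans must send Ss=0 and aa=0, with Se forced to 63 above.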
 
-static int stbi__process_frame_header(stbi__jpeg *z, int scan)
-{
-   stbi__context *s = z->s;
-   int Lf,p,i,q, h_max=1,v_max=1,c;
-   Lf = stbi__get16be(s);         if (Lf < 11) return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG
-   p  = stbi__get8(s);            if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline
-   s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
-   s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires
-   c = stbi__get8(s);
-   if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG");
-   s->img_n = c;
-   for (i=0; i < c; ++i) {
+static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why) {
+  int i;
+  for (i = 0; i < ncomp; ++i) {
+    if (z->img_comp[i].raw_data) {
+      STBI_FREE(z->img_comp[i].raw_data);
+      z->img_comp[i].raw_data = NULL;
       z->img_comp[i].data = NULL;
-      z->img_comp[i].linebuf = NULL;
-   }
-
-   if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG");
-
-   z->rgb = 0;
-   for (i=0; i < s->img_n; ++i) {
-      static const unsigned char rgb[3] = { 'R', 'G', 'B' };
-      z->img_comp[i].id = stbi__get8(s);
-      if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
-         ++z->rgb;
-      q = stbi__get8(s);
-      z->img_comp[i].h = (q >> 4);  if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG");
-      z->img_comp[i].v = q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG");
-      z->img_comp[i].tq = stbi__get8(s);  if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG");
-   }
-
-   if (scan != STBI__SCAN_load) return 1;
-
-   if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode");
-
-   for (i=0; i < s->img_n; ++i) {
-      if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
-      if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
-   }
-
-   // compute interleaved mcu info
-   z->img_h_max = h_max;
-   z->img_v_max = v_max;
-   z->img_mcu_w = h_max * 8;
-   z->img_mcu_h = v_max * 8;
-   // these sizes can't be more than 17 bits
-   z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
-   z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
-
-   for (i=0; i < s->img_n; ++i) {
-      // number of effective pixels (e.g. for non-interleaved MCU)
-      z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
-      z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
-      // to simplify generation, we'll allocate enough memory to decode
-      // the bogus oversized data from using interleaved MCUs and their
-      // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
-      // discard the extra data until colorspace conversion
-      //
-      // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier)
-      // so these muls can't overflow with 32-bit ints (which we require)
-      z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
-      z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
-      z->img_comp[i].coeff = 0;
+    }
+    if (z->img_comp[i].raw_coeff) {
+      STBI_FREE(z->img_comp[i].raw_coeff);
       z->img_comp[i].raw_coeff = 0;
+      z->img_comp[i].coeff = 0;
+    }
+    if (z->img_comp[i].linebuf) {
+      STBI_FREE(z->img_comp[i].linebuf);
       z->img_comp[i].linebuf = NULL;
-      z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
-      if (z->img_comp[i].raw_data == NULL)
-         return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
-      // align blocks for idct using mmx/sse
-      z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
-      if (z->progressive) {
-         // w2, h2 are multiples of 8 (see above)
-         z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
-         z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
-         z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
-         if (z->img_comp[i].raw_coeff == NULL)
-            return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
-         z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
-      }
-   }
+    }
+  }
+  return why;
+}
+
+static int stbi__process_frame_header(stbi__jpeg *z, int scan) {
+  stbi__context *s = z->s;
+  int Lf, p, i, q, h_max = 1, v_max = 1, c;
+  Lf = stbi__get16be(s);
+  if (Lf < 11)
+    return stbi__err("bad SOF len", "Corrupt JPEG"); // JPEG
+  p = stbi__get8(s);
+  if (p != 8)
+    return stbi__err("only 8-bit",
+                     "JPEG format not supported: 8-bit only"); // JPEG baseline
+  s->img_y = stbi__get16be(s);
+  if (s->img_y == 0)
+    return stbi__err(
+        "no header height",
+        "JPEG format not supported: delayed height"); // Legal, but we don't
+                                                      // handle it--but neither
+                                                      // does IJG
+  s->img_x = stbi__get16be(s);
+  if (s->img_x == 0)
+    return stbi__err("0 width", "Corrupt JPEG"); // JPEG requires
+  c = stbi__get8(s);
+  if (c != 3 && c != 1 && c != 4)
+    return stbi__err("bad component count", "Corrupt JPEG");
+  s->img_n = c;
+  for (i = 0; i < c; ++i) {
+    z->img_comp[i].data = NULL;
+    z->img_comp[i].linebuf = NULL;
+  }
+
+  if (Lf != 8 + 3 * s->img_n)
+    return stbi__err("bad SOF len", "Corrupt JPEG");
+
+  z->rgb = 0;
+  for (i = 0; i < s->img_n; ++i) {
+    static const unsigned char rgb[3] = {'R', 'G', 'B'};
+    z->img_comp[i].id = stbi__get8(s);
+    if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
+      ++z->rgb;
+    q = stbi__get8(s);
+    z->img_comp[i].h = (q >> 4);
+    if (!z->img_comp[i].h || z->img_comp[i].h > 4)
+      return stbi__err("bad H", "Corrupt JPEG");
+    z->img_comp[i].v = q & 15;
+    if (!z->img_comp[i].v || z->img_comp[i].v > 4)
+      return stbi__err("bad V", "Corrupt JPEG");
+    z->img_comp[i].tq = stbi__get8(s);
+    if (z->img_comp[i].tq > 3)
+      return stbi__err("bad TQ", "Corrupt JPEG");
+  }
+
+  if (scan != STBI__SCAN_load)
+    return 1;
+
+  if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0))
+    return stbi__err("too large", "Image too large to decode");
+
+  for (i = 0; i < s->img_n; ++i) {
+    if (z->img_comp[i].h > h_max)
+      h_max = z->img_comp[i].h;
+    if (z->img_comp[i].v > v_max)
+      v_max = z->img_comp[i].v;
+  }
+
+  // compute interleaved mcu info
+  z->img_h_max = h_max;
+  z->img_v_max = v_max;
+  z->img_mcu_w = h_max * 8;
+  z->img_mcu_h = v_max * 8;
+  // these sizes can't be more than 17 bits
+  z->img_mcu_x = (s->img_x + z->img_mcu_w - 1) / z->img_mcu_w;
+  z->img_mcu_y = (s->img_y + z->img_mcu_h - 1) / z->img_mcu_h;
+
+  for (i = 0; i < s->img_n; ++i) {
+    // number of effective pixels (e.g. for non-interleaved MCU)
+    z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max - 1) / h_max;
+    z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max - 1) / v_max;
+    // to simplify generation, we'll allocate enough memory to decode
+    // the bogus oversized data from using interleaved MCUs and their
+    // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
+    // discard the extra data until colorspace conversion
+    //
+    // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked
+    // earlier) so these muls can't overflow with 32-bit ints (which we require)
+    z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
+    z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
+    z->img_comp[i].coeff = 0;
+    z->img_comp[i].raw_coeff = 0;
+    z->img_comp[i].linebuf = NULL;
+    z->img_comp[i].raw_data =
+        stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
+    if (z->img_comp[i].raw_data == NULL)
+      return stbi__free_jpeg_components(z, i + 1,
+                                        stbi__err("outofmem", "Out of memory"));
+    // align blocks for idct using mmx/sse
+    z->img_comp[i].data =
+        (stbi_uc *)(((size_t)z->img_comp[i].raw_data + 15) & ~15);
+    if (z->progressive) {
+      // w2, h2 are multiples of 8 (see above)
+      z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
+      z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
+      z->img_comp[i].raw_coeff = stbi__malloc_mad3(
+          z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
+      if (z->img_comp[i].raw_coeff == NULL)
+        return stbi__free_jpeg_components(
+            z, i + 1, stbi__err("outofmem", "Out of memory"));
+      z->img_comp[i].coeff =
+          (short *)(((size_t)z->img_comp[i].raw_coeff + 15) & ~15);
+    }
+  }
 
-   return 1;
+  return 1;
 }
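+
+// Geometry worked example (illustrative): a 33x17 image with
+// 2x2-subsampled chroma has h_max = v_max = 2, so the iMCU is 16x16 and
+// img_mcu_x = (33+15)/16 = 3, img_mcu_y = (17+15)/16 = 2. Luma then gets
+// w2 = 3*2*8 = 48 and h2 = 2*2*8 = 32 (padded past 33x17), while each
+// chroma plane has x = (33*1+1)/2 = 17 effective pixels and w2 = 3*1*8 = 24.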
 
 // use comparisons since in some cases we handle more than one case (e.g. SOF)
-#define stbi__DNL(x)         ((x) == 0xdc)
-#define stbi__SOI(x)         ((x) == 0xd8)
-#define stbi__EOI(x)         ((x) == 0xd9)
-#define stbi__SOF(x)         ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
-#define stbi__SOS(x)         ((x) == 0xda)
-
-#define stbi__SOF_progressive(x)   ((x) == 0xc2)
-
-static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
-{
-   int m;
-   z->jfif = 0;
-   z->app14_color_transform = -1; // valid values are 0,1,2
-   z->marker = STBI__MARKER_none; // initialize cached marker to empty
-   m = stbi__get_marker(z);
-   if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG");
-   if (scan == STBI__SCAN_type) return 1;
-   m = stbi__get_marker(z);
-   while (!stbi__SOF(m)) {
-      if (!stbi__process_marker(z,m)) return 0;
+#define stbi__DNL(x) ((x) == 0xdc)
+#define stbi__SOI(x) ((x) == 0xd8)
+#define stbi__EOI(x) ((x) == 0xd9)
+#define stbi__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
+#define stbi__SOS(x) ((x) == 0xda)
+
+#define stbi__SOF_progressive(x) ((x) == 0xc2)
+
+static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan) {
+  int m;
+  z->jfif = 0;
+  z->app14_color_transform = -1; // valid values are 0,1,2
+  z->marker = STBI__MARKER_none; // initialize cached marker to empty
+  m = stbi__get_marker(z);
+  if (!stbi__SOI(m))
+    return stbi__err("no SOI", "Corrupt JPEG");
+  if (scan == STBI__SCAN_type)
+    return 1;
+  m = stbi__get_marker(z);
+  while (!stbi__SOF(m)) {
+    if (!stbi__process_marker(z, m))
+      return 0;
+    m = stbi__get_marker(z);
+    while (m == STBI__MARKER_none) {
+      // some files have extra padding after their blocks, so ok, we'll scan
+      if (stbi__at_eof(z->s))
+        return stbi__err("no SOF", "Corrupt JPEG");
       m = stbi__get_marker(z);
-      while (m == STBI__MARKER_none) {
-         // some files have extra padding after their blocks, so ok, we'll scan
-         if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG");
-         m = stbi__get_marker(z);
-      }
-   }
-   z->progressive = stbi__SOF_progressive(m);
-   if (!stbi__process_frame_header(z, scan)) return 0;
-   return 1;
+    }
+  }
+  z->progressive = stbi__SOF_progressive(m);
+  if (!stbi__process_frame_header(z, scan))
+    return 0;
+  return 1;
 }
 
 // decode image to YCbCr format
-static int stbi__decode_jpeg_image(stbi__jpeg *j)
-{
-   int m;
-   for (m = 0; m < 4; m++) {
-      j->img_comp[m].raw_data = NULL;
-      j->img_comp[m].raw_coeff = NULL;
-   }
-   j->restart_interval = 0;
-   if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0;
-   m = stbi__get_marker(j);
-   while (!stbi__EOI(m)) {
-      if (stbi__SOS(m)) {
-         if (!stbi__process_scan_header(j)) return 0;
-         if (!stbi__parse_entropy_coded_data(j)) return 0;
-         if (j->marker == STBI__MARKER_none ) {
-            // handle 0s at the end of image data from IP Kamera 9060
-            while (!stbi__at_eof(j->s)) {
-               int x = stbi__get8(j->s);
-               if (x == 255) {
-                  j->marker = stbi__get8(j->s);
-                  break;
-               }
-            }
-            // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
-         }
-      } else if (stbi__DNL(m)) {
-         int Ld = stbi__get16be(j->s);
-         stbi__uint32 NL = stbi__get16be(j->s);
-         if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG");
-         if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG");
-      } else {
-         if (!stbi__process_marker(j, m)) return 0;
+static int stbi__decode_jpeg_image(stbi__jpeg *j) {
+  int m;
+  for (m = 0; m < 4; m++) {
+    j->img_comp[m].raw_data = NULL;
+    j->img_comp[m].raw_coeff = NULL;
+  }
+  j->restart_interval = 0;
+  if (!stbi__decode_jpeg_header(j, STBI__SCAN_load))
+    return 0;
+  m = stbi__get_marker(j);
+  while (!stbi__EOI(m)) {
+    if (stbi__SOS(m)) {
+      if (!stbi__process_scan_header(j))
+        return 0;
+      if (!stbi__parse_entropy_coded_data(j))
+        return 0;
+      if (j->marker == STBI__MARKER_none) {
+        // handle 0s at the end of image data from IP Kamera 9060
+        while (!stbi__at_eof(j->s)) {
+          int x = stbi__get8(j->s);
+          if (x == 255) {
+            j->marker = stbi__get8(j->s);
+            break;
+          }
+        }
+        // if we reach eof without hitting a marker, stbi__get_marker() below
+        // will fail and we'll eventually return 0
       }
-      m = stbi__get_marker(j);
-   }
-   if (j->progressive)
-      stbi__jpeg_finish(j);
-   return 1;
+    } else if (stbi__DNL(m)) {
+      int Ld = stbi__get16be(j->s);
+      stbi__uint32 NL = stbi__get16be(j->s);
+      if (Ld != 4)
+        return stbi__err("bad DNL len", "Corrupt JPEG");
+      if (NL != j->s->img_y)
+        return stbi__err("bad DNL height", "Corrupt JPEG");
+    } else {
+      if (!stbi__process_marker(j, m))
+        return 0;
+    }
+    m = stbi__get_marker(j);
+  }
+  if (j->progressive)
+    stbi__jpeg_finish(j);
+  return 1;
 }
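+
+// Marker flow handled above (illustrative): SOI, then table/APPn segments,
+// then SOF, then one or more SOS + entropy-coded scans (several for
+// progressive files), optionally DNL, and finally EOI; e.g. a minimal
+// baseline stream is FFD8 FFDB FFC0 FFC4 FFDA <entropy data> FFD9.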
 
 // static jfif-centered resampling (across block boundaries)
 
 typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1,
-                                    int w, int hs);
-
-#define stbi__div4(x) ((stbi_uc) ((x) >> 2))
-
-static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   STBI_NOTUSED(out);
-   STBI_NOTUSED(in_far);
-   STBI_NOTUSED(w);
-   STBI_NOTUSED(hs);
-   return in_near;
-}
-
-static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   // need to generate two samples vertically for every one in input
-   int i;
-   STBI_NOTUSED(hs);
-   for (i=0; i < w; ++i)
-      out[i] = stbi__div4(3*in_near[i] + in_far[i] + 2);
-   return out;
-}
-
-static stbi_uc*  stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   // need to generate two samples horizontally for every one in input
-   int i;
-   stbi_uc *input = in_near;
-
-   if (w == 1) {
-      // if only one sample, can't do any interpolation
-      out[0] = out[1] = input[0];
-      return out;
-   }
-
-   out[0] = input[0];
-   out[1] = stbi__div4(input[0]*3 + input[1] + 2);
-   for (i=1; i < w-1; ++i) {
-      int n = 3*input[i]+2;
-      out[i*2+0] = stbi__div4(n+input[i-1]);
-      out[i*2+1] = stbi__div4(n+input[i+1]);
-   }
-   out[i*2+0] = stbi__div4(input[w-2]*3 + input[w-1] + 2);
-   out[i*2+1] = input[w-1];
-
-   STBI_NOTUSED(in_far);
-   STBI_NOTUSED(hs);
-
-   return out;
-}
-
-#define stbi__div16(x) ((stbi_uc) ((x) >> 4))
-
-static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   // need to generate 2x2 samples for every one in input
-   int i,t0,t1;
-   if (w == 1) {
-      out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
-      return out;
-   }
-
-   t1 = 3*in_near[0] + in_far[0];
-   out[0] = stbi__div4(t1+2);
-   for (i=1; i < w; ++i) {
-      t0 = t1;
-      t1 = 3*in_near[i]+in_far[i];
-      out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
-      out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
-   }
-   out[w*2-1] = stbi__div4(t1+2);
-
-   STBI_NOTUSED(hs);
-
-   return out;
+                                      int w, int hs);
+
+#define stbi__div4(x) ((stbi_uc)((x) >> 2))
+
+static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far,
+                               int w, int hs) {
+  STBI_NOTUSED(out);
+  STBI_NOTUSED(in_far);
+  STBI_NOTUSED(w);
+  STBI_NOTUSED(hs);
+  return in_near;
+}
+
+static stbi_uc *stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near,
+                                       stbi_uc *in_far, int w, int hs) {
+  // need to generate two samples vertically for every one in input
+  int i;
+  STBI_NOTUSED(hs);
+  for (i = 0; i < w; ++i)
+    out[i] = stbi__div4(3 * in_near[i] + in_far[i] + 2);
+  return out;
+}
+
+static stbi_uc *stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near,
+                                       stbi_uc *in_far, int w, int hs) {
+  // need to generate two samples horizontally for every one in input
+  int i;
+  stbi_uc *input = in_near;
+
+  if (w == 1) {
+    // if only one sample, can't do any interpolation
+    out[0] = out[1] = input[0];
+    return out;
+  }
+
+  out[0] = input[0];
+  out[1] = stbi__div4(input[0] * 3 + input[1] + 2);
+  for (i = 1; i < w - 1; ++i) {
+    int n = 3 * input[i] + 2;
+    out[i * 2 + 0] = stbi__div4(n + input[i - 1]);
+    out[i * 2 + 1] = stbi__div4(n + input[i + 1]);
+  }
+  out[i * 2 + 0] = stbi__div4(input[w - 2] * 3 + input[w - 1] + 2);
+  out[i * 2 + 1] = input[w - 1];
+
+  STBI_NOTUSED(in_far);
+  STBI_NOTUSED(hs);
+
+  return out;
+}
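+
+// Tap-weight sketch for the loop above (illustrative): interior outputs
+// blend the source pixel with one neighbour as (3*cur + other + 2) >> 2,
+// a 0.75/0.25 linear interpolation with rounding; e.g. cur=200,
+// neighbour=100 gives (600+100+2)>>2 = 175.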
+
+#define stbi__div16(x) ((stbi_uc)((x) >> 4))
+
+static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near,
+                                        stbi_uc *in_far, int w, int hs) {
+  // need to generate 2x2 samples for every one in input
+  int i, t0, t1;
+  if (w == 1) {
+    out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2);
+    return out;
+  }
+
+  t1 = 3 * in_near[0] + in_far[0];
+  out[0] = stbi__div4(t1 + 2);
+  for (i = 1; i < w; ++i) {
+    t0 = t1;
+    t1 = 3 * in_near[i] + in_far[i];
+    out[i * 2 - 1] = stbi__div16(3 * t0 + t1 + 8);
+    out[i * 2] = stbi__div16(3 * t1 + t0 + 8);
+  }
+  out[w * 2 - 1] = stbi__div4(t1 + 2);
+
+  STBI_NOTUSED(hs);
+
+  return out;
 }
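+
+// Weight sketch for the 2x2 path above (illustrative): the vertical pass
+// forms t = 3*near + far, and the horizontal pass mixes 3*t0 + t1 (or
+// 3*t1 + t0), so each output is a 9:3:3:1 blend of four source pixels
+// divided by 16: bilinear interpolation at the JFIF quarter-pixel centers,
+// with +8 providing round-to-nearest before the >>4.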
 
 #if defined(STBI_SSE2) || defined(STBI_NEON)
-static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   // need to generate 2x2 samples for every one in input
-   int i=0,t0,t1;
-
-   if (w == 1) {
-      out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
-      return out;
-   }
-
-   t1 = 3*in_near[0] + in_far[0];
-   // process groups of 8 pixels for as long as we can.
-   // note we can't handle the last pixel in a row in this loop
-   // because we need to handle the filter boundary conditions.
-   for (; i < ((w-1) & ~7); i += 8) {
+static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near,
+                                             stbi_uc *in_far, int w, int hs) {
+  // need to generate 2x2 samples for every one in input
+  int i = 0, t0, t1;
+
+  if (w == 1) {
+    out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2);
+    return out;
+  }
+
+  t1 = 3 * in_near[0] + in_far[0];
+  // process groups of 8 pixels for as long as we can.
+  // note we can't handle the last pixel in a row in this loop
+  // because we need to handle the filter boundary conditions.
+  for (; i < ((w - 1) & ~7); i += 8) {
 #if defined(STBI_SSE2)
-      // load and perform the vertical filtering pass
-      // this uses 3*x + y = 4*x + (y - x)
-      __m128i zero  = _mm_setzero_si128();
-      __m128i farb  = _mm_loadl_epi64((__m128i *) (in_far + i));
-      __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
-      __m128i farw  = _mm_unpacklo_epi8(farb, zero);
-      __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
-      __m128i diff  = _mm_sub_epi16(farw, nearw);
-      __m128i nears = _mm_slli_epi16(nearw, 2);
-      __m128i curr  = _mm_add_epi16(nears, diff); // current row
-
-      // horizontal filter works the same based on shifted vers of current
-      // row. "prev" is current row shifted right by 1 pixel; we need to
-      // insert the previous pixel value (from t1).
-      // "next" is current row shifted left by 1 pixel, with first pixel
-      // of next block of 8 pixels added in.
-      __m128i prv0 = _mm_slli_si128(curr, 2);
-      __m128i nxt0 = _mm_srli_si128(curr, 2);
-      __m128i prev = _mm_insert_epi16(prv0, t1, 0);
-      __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);
-
-      // horizontal filter, polyphase implementation since it's convenient:
-      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
-      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
-      // note the shared term.
-      __m128i bias  = _mm_set1_epi16(8);
-      __m128i curs = _mm_slli_epi16(curr, 2);
-      __m128i prvd = _mm_sub_epi16(prev, curr);
-      __m128i nxtd = _mm_sub_epi16(next, curr);
-      __m128i curb = _mm_add_epi16(curs, bias);
-      __m128i even = _mm_add_epi16(prvd, curb);
-      __m128i odd  = _mm_add_epi16(nxtd, curb);
-
-      // interleave even and odd pixels, then undo scaling.
-      __m128i int0 = _mm_unpacklo_epi16(even, odd);
-      __m128i int1 = _mm_unpackhi_epi16(even, odd);
-      __m128i de0  = _mm_srli_epi16(int0, 4);
-      __m128i de1  = _mm_srli_epi16(int1, 4);
-
-      // pack and write output
-      __m128i outv = _mm_packus_epi16(de0, de1);
-      _mm_storeu_si128((__m128i *) (out + i*2), outv);
+    // load and perform the vertical filtering pass
+    // this uses 3*x + y = 4*x + (y - x)
+    __m128i zero = _mm_setzero_si128();
+    __m128i farb = _mm_loadl_epi64((__m128i *)(in_far + i));
+    __m128i nearb = _mm_loadl_epi64((__m128i *)(in_near + i));
+    __m128i farw = _mm_unpacklo_epi8(farb, zero);
+    __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
+    __m128i diff = _mm_sub_epi16(farw, nearw);
+    __m128i nears = _mm_slli_epi16(nearw, 2);
+    __m128i curr = _mm_add_epi16(nears, diff); // current row
+
+    // horizontal filter works the same based on shifted vers of current
+    // row. "prev" is current row shifted right by 1 pixel; we need to
+    // insert the previous pixel value (from t1).
+    // "next" is current row shifted left by 1 pixel, with first pixel
+    // of next block of 8 pixels added in.
+    __m128i prv0 = _mm_slli_si128(curr, 2);
+    __m128i nxt0 = _mm_srli_si128(curr, 2);
+    __m128i prev = _mm_insert_epi16(prv0, t1, 0);
+    __m128i next =
+        _mm_insert_epi16(nxt0, 3 * in_near[i + 8] + in_far[i + 8], 7);
+
+    // horizontal filter, polyphase implementation since it's convenient:
+    // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+    // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+    // note the shared term.
+    __m128i bias = _mm_set1_epi16(8);
+    __m128i curs = _mm_slli_epi16(curr, 2);
+    __m128i prvd = _mm_sub_epi16(prev, curr);
+    __m128i nxtd = _mm_sub_epi16(next, curr);
+    __m128i curb = _mm_add_epi16(curs, bias);
+    __m128i even = _mm_add_epi16(prvd, curb);
+    __m128i odd = _mm_add_epi16(nxtd, curb);
+
+    // interleave even and odd pixels, then undo scaling.
+    __m128i int0 = _mm_unpacklo_epi16(even, odd);
+    __m128i int1 = _mm_unpackhi_epi16(even, odd);
+    __m128i de0 = _mm_srli_epi16(int0, 4);
+    __m128i de1 = _mm_srli_epi16(int1, 4);
+
+    // pack and write output
+    __m128i outv = _mm_packus_epi16(de0, de1);
+    _mm_storeu_si128((__m128i *)(out + i * 2), outv);
 #elif defined(STBI_NEON)
-      // load and perform the vertical filtering pass
-      // this uses 3*x + y = 4*x + (y - x)
-      uint8x8_t farb  = vld1_u8(in_far + i);
-      uint8x8_t nearb = vld1_u8(in_near + i);
-      int16x8_t diff  = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
-      int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
-      int16x8_t curr  = vaddq_s16(nears, diff); // current row
-
-      // horizontal filter works the same based on shifted vers of current
-      // row. "prev" is current row shifted right by 1 pixel; we need to
-      // insert the previous pixel value (from t1).
-      // "next" is current row shifted left by 1 pixel, with first pixel
-      // of next block of 8 pixels added in.
-      int16x8_t prv0 = vextq_s16(curr, curr, 7);
-      int16x8_t nxt0 = vextq_s16(curr, curr, 1);
-      int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
-      int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);
-
-      // horizontal filter, polyphase implementation since it's convenient:
-      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
-      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
-      // note the shared term.
-      int16x8_t curs = vshlq_n_s16(curr, 2);
-      int16x8_t prvd = vsubq_s16(prev, curr);
-      int16x8_t nxtd = vsubq_s16(next, curr);
-      int16x8_t even = vaddq_s16(curs, prvd);
-      int16x8_t odd  = vaddq_s16(curs, nxtd);
-
-      // undo scaling and round, then store with even/odd phases interleaved
-      uint8x8x2_t o;
-      o.val[0] = vqrshrun_n_s16(even, 4);
-      o.val[1] = vqrshrun_n_s16(odd,  4);
-      vst2_u8(out + i*2, o);
+    // load and perform the vertical filtering pass
+    // this uses 3*x + y = 4*x + (y - x)
+    uint8x8_t farb = vld1_u8(in_far + i);
+    uint8x8_t nearb = vld1_u8(in_near + i);
+    int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
+    int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
+    int16x8_t curr = vaddq_s16(nears, diff); // current row
+
+    // horizontal filter works the same based on shifted vers of current
+    // row. "prev" is current row shifted right by 1 pixel; we need to
+    // insert the previous pixel value (from t1).
+    // "next" is current row shifted left by 1 pixel, with first pixel
+    // of next block of 8 pixels added in.
+    int16x8_t prv0 = vextq_s16(curr, curr, 7);
+    int16x8_t nxt0 = vextq_s16(curr, curr, 1);
+    int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
+    int16x8_t next =
+        vsetq_lane_s16(3 * in_near[i + 8] + in_far[i + 8], nxt0, 7);
+
+    // horizontal filter, polyphase implementation since it's convenient:
+    // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+    // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+    // note the shared term.
+    int16x8_t curs = vshlq_n_s16(curr, 2);
+    int16x8_t prvd = vsubq_s16(prev, curr);
+    int16x8_t nxtd = vsubq_s16(next, curr);
+    int16x8_t even = vaddq_s16(curs, prvd);
+    int16x8_t odd = vaddq_s16(curs, nxtd);
+
+    // undo scaling and round, then store with even/odd phases interleaved
+    uint8x8x2_t o;
+    o.val[0] = vqrshrun_n_s16(even, 4);
+    o.val[1] = vqrshrun_n_s16(odd, 4);
+    vst2_u8(out + i * 2, o);
 #endif
 
-      // "previous" value for next iter
-      t1 = 3*in_near[i+7] + in_far[i+7];
-   }
+    // "previous" value for next iter
+    t1 = 3 * in_near[i + 7] + in_far[i + 7];
+  }
 
-   t0 = t1;
-   t1 = 3*in_near[i] + in_far[i];
-   out[i*2] = stbi__div16(3*t1 + t0 + 8);
+  t0 = t1;
+  t1 = 3 * in_near[i] + in_far[i];
+  out[i * 2] = stbi__div16(3 * t1 + t0 + 8);
 
-   for (++i; i < w; ++i) {
-      t0 = t1;
-      t1 = 3*in_near[i]+in_far[i];
-      out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
-      out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
-   }
-   out[w*2-1] = stbi__div4(t1+2);
+  for (++i; i < w; ++i) {
+    t0 = t1;
+    t1 = 3 * in_near[i] + in_far[i];
+    out[i * 2 - 1] = stbi__div16(3 * t0 + t1 + 8);
+    out[i * 2] = stbi__div16(3 * t1 + t0 + 8);
+  }
+  out[w * 2 - 1] = stbi__div4(t1 + 2);
 
-   STBI_NOTUSED(hs);
+  STBI_NOTUSED(hs);
 
-   return out;
+  return out;
 }
 #endif
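+
+// Note on the SIMD trick above (illustrative): 3*x + y is computed as
+// 4*x + (y - x) because a widening shift-left-by-2 plus one subtract/add
+// is cheaper than a multiply; e.g. x=100, y=50 gives 400 + (50-100) = 350
+// = 3*100 + 50, and since x,y <= 255 every intermediate (at most
+// 3*1020 + 1020 + 8 = 4088 after the horizontal pass) fits in int16.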
 
-static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   // resample with nearest-neighbor
-   int i,j;
-   STBI_NOTUSED(in_far);
-   for (i=0; i < w; ++i)
-      for (j=0; j < hs; ++j)
-         out[i*hs+j] = in_near[i];
-   return out;
+static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near,
+                                           stbi_uc *in_far, int w, int hs) {
+  // resample with nearest-neighbor
+  int i, j;
+  STBI_NOTUSED(in_far);
+  for (i = 0; i < w; ++i)
+    for (j = 0; j < hs; ++j)
+      out[i * hs + j] = in_near[i];
+  return out;
 }
 
 // this is a reduced-precision calculation of YCbCr-to-RGB introduced
 // to make sure the code produces the same results in both SIMD and scalar
-#define stbi__float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
-static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
-{
-   int i;
-   for (i=0; i < count; ++i) {
-      int y_fixed = (y[i] << 20) + (1<<19); // rounding
-      int r,g,b;
-      int cr = pcr[i] - 128;
-      int cb = pcb[i] - 128;
-      r = y_fixed +  cr* stbi__float2fixed(1.40200f);
-      g = y_fixed + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
-      b = y_fixed                                     +   cb* stbi__float2fixed(1.77200f);
-      r >>= 20;
-      g >>= 20;
-      b >>= 20;
-      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
-      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
-      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
-      out[0] = (stbi_uc)r;
-      out[1] = (stbi_uc)g;
-      out[2] = (stbi_uc)b;
-      out[3] = 255;
-      out += step;
-   }
+#define stbi__float2fixed(x) (((int)((x)*4096.0f + 0.5f)) << 8)
+static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y,
+                                   const stbi_uc *pcb, const stbi_uc *pcr,
+                                   int count, int step) {
+  int i;
+  for (i = 0; i < count; ++i) {
+    int y_fixed = (y[i] << 20) + (1 << 19); // rounding
+    int r, g, b;
+    int cr = pcr[i] - 128;
+    int cb = pcb[i] - 128;
+    r = y_fixed + cr * stbi__float2fixed(1.40200f);
+    g = y_fixed + (cr * -stbi__float2fixed(0.71414f)) +
+        ((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000);
+    b = y_fixed + cb * stbi__float2fixed(1.77200f);
+    r >>= 20;
+    g >>= 20;
+    b >>= 20;
+    if ((unsigned)r > 255) {
+      if (r < 0)
+        r = 0;
+      else
+        r = 255;
+    }
+    if ((unsigned)g > 255) {
+      if (g < 0)
+        g = 0;
+      else
+        g = 255;
+    }
+    if ((unsigned)b > 255) {
+      if (b < 0)
+        b = 0;
+      else
+        b = 255;
+    }
+    out[0] = (stbi_uc)r;
+    out[1] = (stbi_uc)g;
+    out[2] = (stbi_uc)b;
+    out[3] = 255;
+    out += step;
+  }
 }
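+
+// Worked example of the fixed-point math above (illustrative):
+// stbi__float2fixed(1.40200f) = ((int)(1.402*4096 + 0.5)) << 8 = 5743 << 8
+// = 1470208, i.e. 1.402 scaled by 2^20. With y=128, cr=127, cb=0:
+//   r = ((128<<20) + (1<<19) + 127*1470208) >> 20 = 306, clamped to 255
+//   b = ((128<<20) + (1<<19)) >> 20 = 128
+// The "& 0xffff0000" on g's cb term keeps only the high 16 bits of that
+// product, matching the precision of the SIMD version below.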
 
 #if defined(STBI_SSE2) || defined(STBI_NEON)
-static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
-{
-   int i = 0;
+static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y,
+                                    stbi_uc const *pcb, stbi_uc const *pcr,
+                                    int count, int step) {
+  int i = 0;
 
 #ifdef STBI_SSE2
-   // step == 3 is pretty ugly on the final interleave, and i'm not convinced
-   // it's useful in practice (you wouldn't use it for textures, for example).
-   // so just accelerate step == 4 case.
-   if (step == 4) {
-      // this is a fairly straightforward implementation and not super-optimized.
-      __m128i signflip  = _mm_set1_epi8(-0x80);
-      __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
-      __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
-      __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
-      __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
-      __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
-      __m128i xw = _mm_set1_epi16(255); // alpha channel
-
-      for (; i+7 < count; i += 8) {
-         // load
-         __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
-         __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
-         __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
-         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
-         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
-
-         // unpack to short (and left-shift cr, cb by 8)
-         __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
-         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
-         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
-
-         // color transform
-         __m128i yws = _mm_srli_epi16(yw, 4);
-         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
-         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
-         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
-         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
-         __m128i rws = _mm_add_epi16(cr0, yws);
-         __m128i gwt = _mm_add_epi16(cb0, yws);
-         __m128i bws = _mm_add_epi16(yws, cb1);
-         __m128i gws = _mm_add_epi16(gwt, cr1);
-
-         // descale
-         __m128i rw = _mm_srai_epi16(rws, 4);
-         __m128i bw = _mm_srai_epi16(bws, 4);
-         __m128i gw = _mm_srai_epi16(gws, 4);
-
-         // back to byte, set up for transpose
-         __m128i brb = _mm_packus_epi16(rw, bw);
-         __m128i gxb = _mm_packus_epi16(gw, xw);
-
-         // transpose to interleave channels
-         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
-         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
-         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
-         __m128i o1 = _mm_unpackhi_epi16(t0, t1);
-
-         // store
-         _mm_storeu_si128((__m128i *) (out + 0), o0);
-         _mm_storeu_si128((__m128i *) (out + 16), o1);
-         out += 32;
-      }
-   }
+  // step == 3 is pretty ugly on the final interleave, and i'm not convinced
+  // it's useful in practice (you wouldn't use it for textures, for example).
+  // so just accelerate step == 4 case.
+  if (step == 4) {
+    // this is a fairly straightforward implementation and not super-optimized.
+    __m128i signflip = _mm_set1_epi8(-0x80);
+    __m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f));
+    __m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f));
+    __m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f));
+    __m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f));
+    __m128i y_bias = _mm_set1_epi8((char)(unsigned char)128);
+    __m128i xw = _mm_set1_epi16(255); // alpha channel
+
+    for (; i + 7 < count; i += 8) {
+      // load
+      __m128i y_bytes = _mm_loadl_epi64((__m128i *)(y + i));
+      __m128i cr_bytes = _mm_loadl_epi64((__m128i *)(pcr + i));
+      __m128i cb_bytes = _mm_loadl_epi64((__m128i *)(pcb + i));
+      __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
+      __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
+
+      // unpack to short (and left-shift cr, cb by 8)
+      __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes);
+      __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
+      __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
+
+      // color transform
+      __m128i yws = _mm_srli_epi16(yw, 4);
+      __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
+      __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
+      __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
+      __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
+      __m128i rws = _mm_add_epi16(cr0, yws);
+      __m128i gwt = _mm_add_epi16(cb0, yws);
+      __m128i bws = _mm_add_epi16(yws, cb1);
+      __m128i gws = _mm_add_epi16(gwt, cr1);
+
+      // descale
+      __m128i rw = _mm_srai_epi16(rws, 4);
+      __m128i bw = _mm_srai_epi16(bws, 4);
+      __m128i gw = _mm_srai_epi16(gws, 4);
+
+      // back to byte, set up for transpose
+      __m128i brb = _mm_packus_epi16(rw, bw);
+      __m128i gxb = _mm_packus_epi16(gw, xw);
+
+      // transpose to interleave channels
+      __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
+      __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
+      __m128i o0 = _mm_unpacklo_epi16(t0, t1);
+      __m128i o1 = _mm_unpackhi_epi16(t0, t1);
+
+      // store
+      _mm_storeu_si128((__m128i *)(out + 0), o0);
+      _mm_storeu_si128((__m128i *)(out + 16), o1);
+      out += 32;
+    }
+  }
 #endif
 
 #ifdef STBI_NEON
-   // in this version, step=3 support would be easy to add. but is there demand?
-   if (step == 4) {
-      // this is a fairly straightforward implementation and not super-optimized.
-      uint8x8_t signflip = vdup_n_u8(0x80);
-      int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
-      int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
-      int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
-      int16x8_t cb_const1 = vdupq_n_s16(   (short) ( 1.77200f*4096.0f+0.5f));
-
-      for (; i+7 < count; i += 8) {
-         // load
-         uint8x8_t y_bytes  = vld1_u8(y + i);
-         uint8x8_t cr_bytes = vld1_u8(pcr + i);
-         uint8x8_t cb_bytes = vld1_u8(pcb + i);
-         int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
-         int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
-
-         // expand to s16
-         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
-         int16x8_t crw = vshll_n_s8(cr_biased, 7);
-         int16x8_t cbw = vshll_n_s8(cb_biased, 7);
-
-         // color transform
-         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
-         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
-         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
-         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
-         int16x8_t rws = vaddq_s16(yws, cr0);
-         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
-         int16x8_t bws = vaddq_s16(yws, cb1);
-
-         // undo scaling, round, convert to byte
-         uint8x8x4_t o;
-         o.val[0] = vqrshrun_n_s16(rws, 4);
-         o.val[1] = vqrshrun_n_s16(gws, 4);
-         o.val[2] = vqrshrun_n_s16(bws, 4);
-         o.val[3] = vdup_n_u8(255);
-
-         // store, interleaving r/g/b/a
-         vst4_u8(out, o);
-         out += 8*4;
-      }
-   }
+  // in this version, step=3 support would be easy to add. but is there demand?
+  if (step == 4) {
+    // this is a fairly straightforward implementation and not super-optimized.
+    uint8x8_t signflip = vdup_n_u8(0x80);
+    int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f));
+    int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f));
+    int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f));
+    int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f));
+
+    for (; i + 7 < count; i += 8) {
+      // load
+      uint8x8_t y_bytes = vld1_u8(y + i);
+      uint8x8_t cr_bytes = vld1_u8(pcr + i);
+      uint8x8_t cb_bytes = vld1_u8(pcb + i);
+      int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
+      int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
+
+      // expand to s16
+      int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
+      int16x8_t crw = vshll_n_s8(cr_biased, 7);
+      int16x8_t cbw = vshll_n_s8(cb_biased, 7);
+
+      // color transform
+      int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
+      int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
+      int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
+      int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
+      int16x8_t rws = vaddq_s16(yws, cr0);
+      int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
+      int16x8_t bws = vaddq_s16(yws, cb1);
+
+      // undo scaling, round, convert to byte
+      uint8x8x4_t o;
+      o.val[0] = vqrshrun_n_s16(rws, 4);
+      o.val[1] = vqrshrun_n_s16(gws, 4);
+      o.val[2] = vqrshrun_n_s16(bws, 4);
+      o.val[3] = vdup_n_u8(255);
+
+      // store, interleaving r/g/b/a
+      vst4_u8(out, o);
+      out += 8 * 4;
+    }
+  }
 #endif
 
-   for (; i < count; ++i) {
-      int y_fixed = (y[i] << 20) + (1<<19); // rounding
-      int r,g,b;
-      int cr = pcr[i] - 128;
-      int cb = pcb[i] - 128;
-      r = y_fixed + cr* stbi__float2fixed(1.40200f);
-      g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
-      b = y_fixed                                   +   cb* stbi__float2fixed(1.77200f);
-      r >>= 20;
-      g >>= 20;
-      b >>= 20;
-      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
-      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
-      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
-      out[0] = (stbi_uc)r;
-      out[1] = (stbi_uc)g;
-      out[2] = (stbi_uc)b;
-      out[3] = 255;
-      out += step;
-   }
+  for (; i < count; ++i) {
+    int y_fixed = (y[i] << 20) + (1 << 19); // rounding
+    int r, g, b;
+    int cr = pcr[i] - 128;
+    int cb = pcb[i] - 128;
+    r = y_fixed + cr * stbi__float2fixed(1.40200f);
+    g = y_fixed + cr * -stbi__float2fixed(0.71414f) +
+        ((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000);
+    b = y_fixed + cb * stbi__float2fixed(1.77200f);
+    r >>= 20;
+    g >>= 20;
+    b >>= 20;
+    if ((unsigned)r > 255) {
+      if (r < 0)
+        r = 0;
+      else
+        r = 255;
+    }
+    if ((unsigned)g > 255) {
+      if (g < 0)
+        g = 0;
+      else
+        g = 255;
+    }
+    if ((unsigned)b > 255) {
+      if (b < 0)
+        b = 0;
+      else
+        b = 255;
+    }
+    out[0] = (stbi_uc)r;
+    out[1] = (stbi_uc)g;
+    out[2] = (stbi_uc)b;
+    out[3] = 255;
+    out += step;
+  }
 }
 #endif
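+
+// Scale bookkeeping for the SSE2 path above (illustrative): unpacking puts
+// cr-128 in the high byte, so crw = (cr-128)*256, and the constants are
+// round(coef*4096); _mm_mulhi_epi16 keeps the top 16 bits of the product,
+// giving ((cr-128)*256 * coef*4096) >> 16 = (cr-128)*coef*16. Likewise
+// yws = (y*256 + 128) >> 4 = y*16 + 8, so every sum carries a factor of 16
+// plus a rounding bias of 8, which the arithmetic >>4 removes before
+// packing back to bytes.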
 
 // set up the kernels
-static void stbi__setup_jpeg(stbi__jpeg *j)
-{
-   j->idct_block_kernel = stbi__idct_block;
-   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row;
-   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
+static void stbi__setup_jpeg(stbi__jpeg *j) {
+  j->idct_block_kernel = stbi__idct_block;
+  j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row;
+  j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
 
 #ifdef STBI_SSE2
-   if (stbi__sse2_available()) {
-      j->idct_block_kernel = stbi__idct_simd;
-      j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-      j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
-   }
+  if (stbi__sse2_available()) {
+    j->idct_block_kernel = stbi__idct_simd;
+    j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+    j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+  }
 #endif
 
 #ifdef STBI_NEON
-   j->idct_block_kernel = stbi__idct_simd;
-   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+  j->idct_block_kernel = stbi__idct_simd;
+  j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+  j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
 #endif
 }
 
 // clean up the temporary component buffers
-static void stbi__cleanup_jpeg(stbi__jpeg *j)
-{
-   stbi__free_jpeg_components(j, j->s->img_n, 0);
+static void stbi__cleanup_jpeg(stbi__jpeg *j) {
+  stbi__free_jpeg_components(j, j->s->img_n, 0);
 }
 
-typedef struct
-{
-   resample_row_func resample;
-   stbi_uc *line0,*line1;
-   int hs,vs;   // expansion factor in each axis
-   int w_lores; // horizontal pixels pre-expansion
-   int ystep;   // how far through vertical expansion we are
-   int ypos;    // which pre-expansion row we're on
+typedef struct {
+  resample_row_func resample;
+  stbi_uc *line0, *line1;
+  int hs, vs;  // expansion factor in each axis
+  int w_lores; // horizontal pixels pre-expansion
+  int ystep;   // how far through vertical expansion we are
+  int ypos;    // which pre-expansion row we're on
 } stbi__resample;
 
 // fast 0..255 * 0..255 => 0..255 rounded multiplication
-static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y)
-{
-   unsigned int t = x*y + 128;
-   return (stbi_uc) ((t + (t >>8)) >> 8);
-}
-
-static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
-{
-   int n, decode_n, is_rgb;
-   z->s->img_n = 0; // make stbi__cleanup_jpeg safe
-
-   // validate req_comp
-   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
-
-   // load a jpeg image from whichever source, but leave in YCbCr format
-   if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
-
-   // determine actual number of components to generate
-   n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
-
-   is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
-
-   if (z->s->img_n == 3 && n < 3 && !is_rgb)
-      decode_n = 1;
-   else
-      decode_n = z->s->img_n;
-
-   // resample and color-convert
-   {
-      int k;
-      unsigned int i,j;
-      stbi_uc *output;
-      stbi_uc *coutput[4] = { NULL, NULL, NULL, NULL };
-
-      stbi__resample res_comp[4];
-
-      for (k=0; k < decode_n; ++k) {
-         stbi__resample *r = &res_comp[k];
-
-         // allocate line buffer big enough for upsampling off the edges
-         // with upsample factor of 4
-         z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3);
-         if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
-
-         r->hs      = z->img_h_max / z->img_comp[k].h;
-         r->vs      = z->img_v_max / z->img_comp[k].v;
-         r->ystep   = r->vs >> 1;
-         r->w_lores = (z->s->img_x + r->hs-1) / r->hs;
-         r->ypos    = 0;
-         r->line0   = r->line1 = z->img_comp[k].data;
-
-         if      (r->hs == 1 && r->vs == 1) r->resample = resample_row_1;
-         else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2;
-         else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2;
-         else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel;
-         else                               r->resample = stbi__resample_row_generic;
+static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y) {
+  unsigned int t = x * y + 128;
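+  // (t + (t >> 8)) >> 8 computes round(x*y / 255) exactly for 8-bit inputs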
+  return (stbi_uc)((t + (t >> 8)) >> 8);
+}
+
+static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y,
+                                int *comp, int req_comp) {
+  int n, decode_n, is_rgb;
+  z->s->img_n = 0; // make stbi__cleanup_jpeg safe
+
+  // validate req_comp
+  if (req_comp < 0 || req_comp > 4)
+    return stbi__errpuc("bad req_comp", "Internal error");
+
+  // load a jpeg image from whichever source, but leave in YCbCr format
+  if (!stbi__decode_jpeg_image(z)) {
+    stbi__cleanup_jpeg(z);
+    return NULL;
+  }
+
+  // determine actual number of components to generate
+  n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
+
+  is_rgb = z->s->img_n == 3 &&
+           (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
+
+  if (z->s->img_n == 3 && n < 3 && !is_rgb)
+    decode_n = 1;
+  else
+    decode_n = z->s->img_n;
+
+  // resample and color-convert
+  {
+    int k;
+    unsigned int i, j;
+    stbi_uc *output;
+    stbi_uc *coutput[4] = {NULL, NULL, NULL, NULL};
+
+    stbi__resample res_comp[4];
+
+    for (k = 0; k < decode_n; ++k) {
+      stbi__resample *r = &res_comp[k];
+
+      // allocate line buffer big enough for upsampling off the edges
+      // with upsample factor of 4
+      z->img_comp[k].linebuf = (stbi_uc *)stbi__malloc(z->s->img_x + 3);
+      if (!z->img_comp[k].linebuf) {
+        stbi__cleanup_jpeg(z);
+        return stbi__errpuc("outofmem", "Out of memory");
       }
 
-      // can't error after this so, this is safe
-      output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
-      if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
-
-      // now go ahead and resample
-      for (j=0; j < z->s->img_y; ++j) {
-         stbi_uc *out = output + n * z->s->img_x * j;
-         for (k=0; k < decode_n; ++k) {
-            stbi__resample *r = &res_comp[k];
-            int y_bot = r->ystep >= (r->vs >> 1);
-            coutput[k] = r->resample(z->img_comp[k].linebuf,
-                                     y_bot ? r->line1 : r->line0,
-                                     y_bot ? r->line0 : r->line1,
-                                     r->w_lores, r->hs);
-            if (++r->ystep >= r->vs) {
-               r->ystep = 0;
-               r->line0 = r->line1;
-               if (++r->ypos < z->img_comp[k].y)
-                  r->line1 += z->img_comp[k].w2;
+      r->hs = z->img_h_max / z->img_comp[k].h;
+      r->vs = z->img_v_max / z->img_comp[k].v;
+      r->ystep = r->vs >> 1;
+      r->w_lores = (z->s->img_x + r->hs - 1) / r->hs;
+      r->ypos = 0;
+      r->line0 = r->line1 = z->img_comp[k].data;
+
+      if (r->hs == 1 && r->vs == 1)
+        r->resample = resample_row_1;
+      else if (r->hs == 1 && r->vs == 2)
+        r->resample = stbi__resample_row_v_2;
+      else if (r->hs == 2 && r->vs == 1)
+        r->resample = stbi__resample_row_h_2;
+      else if (r->hs == 2 && r->vs == 2)
+        r->resample = z->resample_row_hv_2_kernel;
+      else
+        r->resample = stbi__resample_row_generic;
+    }
+
+    // can't error after this, so this is safe
+    output = (stbi_uc *)stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
+    if (!output) {
+      stbi__cleanup_jpeg(z);
+      return stbi__errpuc("outofmem", "Out of memory");
+    }
+
+    // now go ahead and resample
+    for (j = 0; j < z->s->img_y; ++j) {
+      stbi_uc *out = output + n * z->s->img_x * j;
+      for (k = 0; k < decode_n; ++k) {
+        stbi__resample *r = &res_comp[k];
+        int y_bot = r->ystep >= (r->vs >> 1);
+        coutput[k] =
+            r->resample(z->img_comp[k].linebuf, y_bot ? r->line1 : r->line0,
+                        y_bot ? r->line0 : r->line1, r->w_lores, r->hs);
+        if (++r->ystep >= r->vs) {
+          r->ystep = 0;
+          r->line0 = r->line1;
+          if (++r->ypos < z->img_comp[k].y)
+            r->line1 += z->img_comp[k].w2;
+        }
+      }
+      if (n >= 3) {
+        stbi_uc *y = coutput[0];
+        if (z->s->img_n == 3) {
+          if (is_rgb) {
+            for (i = 0; i < z->s->img_x; ++i) {
+              out[0] = y[i];
+              out[1] = coutput[1][i];
+              out[2] = coutput[2][i];
+              out[3] = 255;
+              out += n;
+            }
+          } else {
+            z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x,
+                                   n);
+          }
+        } else if (z->s->img_n == 4) {
+          if (z->app14_color_transform == 0) { // CMYK
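+            // Adobe convention: CMYK is stored inverted, so RGB falls out
+            // as component * K / 255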
+            for (i = 0; i < z->s->img_x; ++i) {
+              stbi_uc m = coutput[3][i];
+              out[0] = stbi__blinn_8x8(coutput[0][i], m);
+              out[1] = stbi__blinn_8x8(coutput[1][i], m);
+              out[2] = stbi__blinn_8x8(coutput[2][i], m);
+              out[3] = 255;
+              out += n;
+            }
+          } else if (z->app14_color_transform == 2) { // YCCK
+            z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x,
+                                   n);
+            for (i = 0; i < z->s->img_x; ++i) {
+              stbi_uc m = coutput[3][i];
+              out[0] = stbi__blinn_8x8(255 - out[0], m);
+              out[1] = stbi__blinn_8x8(255 - out[1], m);
+              out[2] = stbi__blinn_8x8(255 - out[2], m);
+              out += n;
+            }
+          } else { // YCbCr + alpha?  Ignore the fourth channel for now
+            z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x,
+                                   n);
+          }
+        } else
+          for (i = 0; i < z->s->img_x; ++i) {
+            out[0] = out[1] = out[2] = y[i];
+            out[3] = 255; // not used if n==3
+            out += n;
+          }
+      } else {
+        if (is_rgb) {
+          if (n == 1)
+            for (i = 0; i < z->s->img_x; ++i)
+              *out++ =
+                  stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+          else {
+            for (i = 0; i < z->s->img_x; ++i, out += 2) {
+              out[0] =
+                  stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+              out[1] = 255;
             }
-         }
-         if (n >= 3) {
-            stbi_uc *y = coutput[0];
-            if (z->s->img_n == 3) {
-               if (is_rgb) {
-                  for (i=0; i < z->s->img_x; ++i) {
-                     out[0] = y[i];
-                     out[1] = coutput[1][i];
-                     out[2] = coutput[2][i];
-                     out[3] = 255;
-                     out += n;
-                  }
-               } else {
-                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
-               }
-            } else if (z->s->img_n == 4) {
-               if (z->app14_color_transform == 0) { // CMYK
-                  for (i=0; i < z->s->img_x; ++i) {
-                     stbi_uc m = coutput[3][i];
-                     out[0] = stbi__blinn_8x8(coutput[0][i], m);
-                     out[1] = stbi__blinn_8x8(coutput[1][i], m);
-                     out[2] = stbi__blinn_8x8(coutput[2][i], m);
-                     out[3] = 255;
-                     out += n;
-                  }
-               } else if (z->app14_color_transform == 2) { // YCCK
-                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
-                  for (i=0; i < z->s->img_x; ++i) {
-                     stbi_uc m = coutput[3][i];
-                     out[0] = stbi__blinn_8x8(255 - out[0], m);
-                     out[1] = stbi__blinn_8x8(255 - out[1], m);
-                     out[2] = stbi__blinn_8x8(255 - out[2], m);
-                     out += n;
-                  }
-               } else { // YCbCr + alpha?  Ignore the fourth channel for now
-                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
-               }
-            } else
-               for (i=0; i < z->s->img_x; ++i) {
-                  out[0] = out[1] = out[2] = y[i];
-                  out[3] = 255; // not used if n==3
-                  out += n;
-               }
-         } else {
-            if (is_rgb) {
-               if (n == 1)
-                  for (i=0; i < z->s->img_x; ++i)
-                     *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
-               else {
-                  for (i=0; i < z->s->img_x; ++i, out += 2) {
-                     out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
-                     out[1] = 255;
-                  }
-               }
-            } else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
-               for (i=0; i < z->s->img_x; ++i) {
-                  stbi_uc m = coutput[3][i];
-                  stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
-                  stbi_uc g = stbi__blinn_8x8(coutput[1][i], m);
-                  stbi_uc b = stbi__blinn_8x8(coutput[2][i], m);
-                  out[0] = stbi__compute_y(r, g, b);
-                  out[1] = 255;
-                  out += n;
-               }
-            } else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
-               for (i=0; i < z->s->img_x; ++i) {
-                  out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
-                  out[1] = 255;
-                  out += n;
-               }
-            } else {
-               stbi_uc *y = coutput[0];
-               if (n == 1)
-                  for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
-               else
-                  for (i=0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; }
+          }
+        } else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
+          for (i = 0; i < z->s->img_x; ++i) {
+            stbi_uc m = coutput[3][i];
+            stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
+            stbi_uc g = stbi__blinn_8x8(coutput[1][i], m);
+            stbi_uc b = stbi__blinn_8x8(coutput[2][i], m);
+            out[0] = stbi__compute_y(r, g, b);
+            out[1] = 255;
+            out += n;
+          }
+        } else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
+          for (i = 0; i < z->s->img_x; ++i) {
+            out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
+            out[1] = 255;
+            out += n;
+          }
+        } else {
+          stbi_uc *y = coutput[0];
+          if (n == 1)
+            for (i = 0; i < z->s->img_x; ++i)
+              out[i] = y[i];
+          else
+            for (i = 0; i < z->s->img_x; ++i) {
+              *out++ = y[i];
+              *out++ = 255;
             }
-         }
+        }
       }
-      stbi__cleanup_jpeg(z);
-      *out_x = z->s->img_x;
-      *out_y = z->s->img_y;
-      if (comp) *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
-      return output;
-   }
-}
-
-static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   unsigned char* result;
-   stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
-   STBI_NOTUSED(ri);
-   j->s = s;
-   stbi__setup_jpeg(j);
-   result = load_jpeg_image(j, x,y,comp,req_comp);
-   STBI_FREE(j);
-   return result;
-}
-
-static int stbi__jpeg_test(stbi__context *s)
-{
-   int r;
-   stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
-   j->s = s;
-   stbi__setup_jpeg(j);
-   r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
-   stbi__rewind(s);
-   STBI_FREE(j);
-   return r;
-}
-
-static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
-{
-   if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) {
-      stbi__rewind( j->s );
-      return 0;
-   }
-   if (x) *x = j->s->img_x;
-   if (y) *y = j->s->img_y;
-   if (comp) *comp = j->s->img_n >= 3 ? 3 : 1;
-   return 1;
-}
-
-static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   int result;
-   stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
-   j->s = s;
-   result = stbi__jpeg_info_raw(j, x, y, comp);
-   STBI_FREE(j);
-   return result;
+    }
+    stbi__cleanup_jpeg(z);
+    *out_x = z->s->img_x;
+    *out_y = z->s->img_y;
+    if (comp)
+      // report original components, not output
+      *comp = z->s->img_n >= 3 ? 3 : 1;
+    return output;
+  }
+}
+
+static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp,
+                             int req_comp, stbi__result_info *ri) {
+  unsigned char *result;
+  stbi__jpeg *j = (stbi__jpeg *)stbi__malloc(sizeof(stbi__jpeg));
+  STBI_NOTUSED(ri);
+  j->s = s;
+  stbi__setup_jpeg(j);
+  result = load_jpeg_image(j, x, y, comp, req_comp);
+  STBI_FREE(j);
+  return result;
+}
+
+static int stbi__jpeg_test(stbi__context *s) {
+  int r;
+  stbi__jpeg *j = (stbi__jpeg *)stbi__malloc(sizeof(stbi__jpeg));
+  j->s = s;
+  stbi__setup_jpeg(j);
+  r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
+  stbi__rewind(s);
+  STBI_FREE(j);
+  return r;
+}
+
+static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp) {
+  if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) {
+    stbi__rewind(j->s);
+    return 0;
+  }
+  if (x)
+    *x = j->s->img_x;
+  if (y)
+    *y = j->s->img_y;
+  if (comp)
+    *comp = j->s->img_n >= 3 ? 3 : 1;
+  return 1;
+}
+
+static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp) {
+  int result;
+  stbi__jpeg *j = (stbi__jpeg *)(stbi__malloc(sizeof(stbi__jpeg)));
+  j->s = s;
+  result = stbi__jpeg_info_raw(j, x, y, comp);
+  STBI_FREE(j);
+  return result;
 }
 #endif
 
@@ -3853,83 +4232,81 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
 #ifndef STBI_NO_ZLIB
 
 // the fast path is quicker to check than jpeg's huffman, but the slow path is slower
-#define STBI__ZFAST_BITS  9 // accelerate all cases in default tables
-#define STBI__ZFAST_MASK  ((1 << STBI__ZFAST_BITS) - 1)
+#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables
+#define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1)
 
 // zlib-style huffman encoding
 // (jpeg packs from left, zlib from right, so can't share code)
-typedef struct
-{
-   stbi__uint16 fast[1 << STBI__ZFAST_BITS];
-   stbi__uint16 firstcode[16];
-   int maxcode[17];
-   stbi__uint16 firstsymbol[16];
-   stbi_uc  size[288];
-   stbi__uint16 value[288];
+typedef struct {
+  stbi__uint16 fast[1 << STBI__ZFAST_BITS];
+  stbi__uint16 firstcode[16];
+  int maxcode[17];
+  stbi__uint16 firstsymbol[16];
+  stbi_uc size[288];
+  stbi__uint16 value[288];
 } stbi__zhuffman;
 
-stbi_inline static int stbi__bitreverse16(int n)
-{
-  n = ((n & 0xAAAA) >>  1) | ((n & 0x5555) << 1);
-  n = ((n & 0xCCCC) >>  2) | ((n & 0x3333) << 2);
-  n = ((n & 0xF0F0) >>  4) | ((n & 0x0F0F) << 4);
-  n = ((n & 0xFF00) >>  8) | ((n & 0x00FF) << 8);
+stbi_inline static int stbi__bitreverse16(int n) {
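+  // reverse all 16 bits by swapping adjacent bits, then pairs, nibbles, bytes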
+  n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1);
+  n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2);
+  n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4);
+  n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8);
   return n;
 }
 
-stbi_inline static int stbi__bit_reverse(int v, int bits)
-{
-   STBI_ASSERT(bits <= 16);
-   // to bit reverse n bits, reverse 16 and shift
-   // e.g. 11 bits, bit reverse and shift away 5
-   return stbi__bitreverse16(v) >> (16-bits);
-}
-
-static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num)
-{
-   int i,k=0;
-   int code, next_code[16], sizes[17];
-
-   // DEFLATE spec for generating codes
-   memset(sizes, 0, sizeof(sizes));
-   memset(z->fast, 0, sizeof(z->fast));
-   for (i=0; i < num; ++i)
-      ++sizes[sizelist[i]];
-   sizes[0] = 0;
-   for (i=1; i < 16; ++i)
-      if (sizes[i] > (1 << i))
-         return stbi__err("bad sizes", "Corrupt PNG");
-   code = 0;
-   for (i=1; i < 16; ++i) {
-      next_code[i] = code;
-      z->firstcode[i] = (stbi__uint16) code;
-      z->firstsymbol[i] = (stbi__uint16) k;
-      code = (code + sizes[i]);
-      if (sizes[i])
-         if (code-1 >= (1 << i)) return stbi__err("bad codelengths","Corrupt PNG");
-      z->maxcode[i] = code << (16-i); // preshift for inner loop
-      code <<= 1;
-      k += sizes[i];
-   }
-   z->maxcode[16] = 0x10000; // sentinel
-   for (i=0; i < num; ++i) {
-      int s = sizelist[i];
-      if (s) {
-         int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s];
-         stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i);
-         z->size [c] = (stbi_uc     ) s;
-         z->value[c] = (stbi__uint16) i;
-         if (s <= STBI__ZFAST_BITS) {
-            int j = stbi__bit_reverse(next_code[s],s);
-            while (j < (1 << STBI__ZFAST_BITS)) {
-               z->fast[j] = fastv;
-               j += (1 << s);
-            }
-         }
-         ++next_code[s];
+stbi_inline static int stbi__bit_reverse(int v, int bits) {
+  STBI_ASSERT(bits <= 16);
+  // to bit reverse n bits, reverse 16 and shift
+  // e.g. 11 bits, bit reverse and shift away 5
+  return stbi__bitreverse16(v) >> (16 - bits);
+}
+
+static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist,
+                                int num) {
+  int i, k = 0;
+  int code, next_code[16], sizes[17];
+
+  // DEFLATE spec for generating codes
+  memset(sizes, 0, sizeof(sizes));
+  memset(z->fast, 0, sizeof(z->fast));
+  for (i = 0; i < num; ++i)
+    ++sizes[sizelist[i]];
+  sizes[0] = 0;
+  for (i = 1; i < 16; ++i)
+    if (sizes[i] > (1 << i))
+      return stbi__err("bad sizes", "Corrupt PNG");
+  code = 0;
+  for (i = 1; i < 16; ++i) {
+    next_code[i] = code;
+    z->firstcode[i] = (stbi__uint16)code;
+    z->firstsymbol[i] = (stbi__uint16)k;
+    code = (code + sizes[i]);
+    if (sizes[i])
+      if (code - 1 >= (1 << i))
+        return stbi__err("bad codelengths", "Corrupt PNG");
+    z->maxcode[i] = code << (16 - i); // preshift for inner loop
+    code <<= 1;
+    k += sizes[i];
+  }
+  z->maxcode[16] = 0x10000; // sentinel
+  for (i = 0; i < num; ++i) {
+    int s = sizelist[i];
+    if (s) {
+      int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s];
+      stbi__uint16 fastv = (stbi__uint16)((s << 9) | i);
+      z->size[c] = (stbi_uc)s;
+      z->value[c] = (stbi__uint16)i;
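+      // zlib streams are LSB-first, so the fast table is indexed by the
+      // bit-reversed code; the entry repeats every (1 << s) slots to cover
+      // all settings of the unused high bits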
+      if (s <= STBI__ZFAST_BITS) {
+        int j = stbi__bit_reverse(next_code[s], s);
+        while (j < (1 << STBI__ZFAST_BITS)) {
+          z->fast[j] = fastv;
+          j += (1 << s);
+        }
       }
-   }
-   return 1;
+      ++next_code[s];
+    }
+  }
+  return 1;
 }
 
 // zlib-from-memory implementation for PNG reading
@@ -3938,259 +4315,292 @@ static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int
 //    we require PNG to read all the IDATs and combine them into a single
 //    memory buffer
 
-typedef struct
-{
-   stbi_uc *zbuffer, *zbuffer_end;
-   int num_bits;
-   stbi__uint32 code_buffer;
+typedef struct {
+  stbi_uc *zbuffer, *zbuffer_end;
+  int num_bits;
+  stbi__uint32 code_buffer;
 
-   char *zout;
-   char *zout_start;
-   char *zout_end;
-   int   z_expandable;
+  char *zout;
+  char *zout_start;
+  char *zout_end;
+  int z_expandable;
 
-   stbi__zhuffman z_length, z_distance;
+  stbi__zhuffman z_length, z_distance;
 } stbi__zbuf;
 
-stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z)
-{
-   if (z->zbuffer >= z->zbuffer_end) return 0;
-   return *z->zbuffer++;
-}
-
-static void stbi__fill_bits(stbi__zbuf *z)
-{
-   do {
-      STBI_ASSERT(z->code_buffer < (1U << z->num_bits));
-      z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits;
-      z->num_bits += 8;
-   } while (z->num_bits <= 24);
-}
-
-stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n)
-{
-   unsigned int k;
-   if (z->num_bits < n) stbi__fill_bits(z);
-   k = z->code_buffer & ((1 << n) - 1);
-   z->code_buffer >>= n;
-   z->num_bits -= n;
-   return k;
-}
-
-static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
-{
-   int b,s,k;
-   // not resolved by fast table, so compute it the slow way
-   // use jpeg approach, which requires MSbits at top
-   k = stbi__bit_reverse(a->code_buffer, 16);
-   for (s=STBI__ZFAST_BITS+1; ; ++s)
-      if (k < z->maxcode[s])
-         break;
-   if (s == 16) return -1; // invalid code!
-   // code size is s, so:
-   b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s];
-   STBI_ASSERT(z->size[b] == s);
-   a->code_buffer >>= s;
-   a->num_bits -= s;
-   return z->value[b];
-}
-
-stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
-{
-   int b,s;
-   if (a->num_bits < 16) stbi__fill_bits(a);
-   b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
-   if (b) {
-      s = b >> 9;
-      a->code_buffer >>= s;
-      a->num_bits -= s;
-      return b & 511;
-   }
-   return stbi__zhuffman_decode_slowpath(a, z);
-}
-
-static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room for n bytes
-{
-   char *q;
-   int cur, limit, old_limit __attribute__((unused));
-   z->zout = zout;
-   if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG");
-   cur   = (int) (z->zout     - z->zout_start);
-   limit = old_limit = (int) (z->zout_end - z->zout_start);
-   while (cur + n > limit)
-      limit *= 2;
-   q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
-   STBI_NOTUSED(old_limit);
-   if (q == NULL) return stbi__err("outofmem", "Out of memory");
-   z->zout_start = q;
-   z->zout       = q + cur;
-   z->zout_end   = q + limit;
-   return 1;
+stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z) {
+  if (z->zbuffer >= z->zbuffer_end)
+    return 0;
+  return *z->zbuffer++;
+}
+
+static void stbi__fill_bits(stbi__zbuf *z) {
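+  // top up the bit buffer to at least 25 bits; reads past the end yield 0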
+  do {
+    STBI_ASSERT(z->code_buffer < (1U << z->num_bits));
+    z->code_buffer |= (unsigned int)stbi__zget8(z) << z->num_bits;
+    z->num_bits += 8;
+  } while (z->num_bits <= 24);
+}
+
+stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n) {
+  unsigned int k;
+  if (z->num_bits < n)
+    stbi__fill_bits(z);
+  k = z->code_buffer & ((1 << n) - 1);
+  z->code_buffer >>= n;
+  z->num_bits -= n;
+  return k;
+}
+
+static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z) {
+  int b, s, k;
+  // not resolved by fast table, so compute it the slow way
+  // use jpeg approach, which requires MSbits at top
+  k = stbi__bit_reverse(a->code_buffer, 16);
+  for (s = STBI__ZFAST_BITS + 1;; ++s)
+    if (k < z->maxcode[s])
+      break;
+  if (s == 16)
+    return -1; // invalid code!
+  // code size is s, so:
+  b = (k >> (16 - s)) - z->firstcode[s] + z->firstsymbol[s];
+  STBI_ASSERT(z->size[b] == s);
+  a->code_buffer >>= s;
+  a->num_bits -= s;
+  return z->value[b];
+}
+
+stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z) {
+  int b, s;
+  if (a->num_bits < 16)
+    stbi__fill_bits(a);
+  b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
+  if (b) {
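+    // fast table hit: entry packs (code length << 9) | symbol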
+    s = b >> 9;
+    a->code_buffer >>= s;
+    a->num_bits -= s;
+    return b & 511;
+  }
+  return stbi__zhuffman_decode_slowpath(a, z);
+}
+
+static int stbi__zexpand(stbi__zbuf *z, char *zout,
+                         int n) // need to make room for n bytes
+{
+  char *q;
+  int cur, limit, old_limit;
+  z->zout = zout;
+  if (!z->z_expandable)
+    return stbi__err("output buffer limit", "Corrupt PNG");
+  cur = (int)(z->zout - z->zout_start);
+  limit = old_limit = (int)(z->zout_end - z->zout_start);
+  while (cur + n > limit)
+    limit *= 2;
+  q = (char *)STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
+  STBI_NOTUSED(old_limit);
+  if (q == NULL)
+    return stbi__err("outofmem", "Out of memory");
+  z->zout_start = q;
+  z->zout = q + cur;
+  z->zout_end = q + limit;
+  return 1;
 }
 
 static const int stbi__zlength_base[31] = {
-   3,4,5,6,7,8,9,10,11,13,
-   15,17,19,23,27,31,35,43,51,59,
-   67,83,99,115,131,163,195,227,258,0,0 };
-
-static const int stbi__zlength_extra[31]=
-{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
-
-static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
-257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0};
-
-static const int stbi__zdist_extra[32] =
-{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
-
-static int stbi__parse_huffman_block(stbi__zbuf *a)
-{
-   char *zout = a->zout;
-   for(;;) {
-      int z = stbi__zhuffman_decode(a, &a->z_length);
-      if (z < 256) {
-         if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes
-         if (zout >= a->zout_end) {
-            if (!stbi__zexpand(a, zout, 1)) return 0;
-            zout = a->zout;
-         }
-         *zout++ = (char) z;
+    3,  4,  5,  6,  7,  8,  9,  10,  11,  13,  15,  17,  19,  23, 27, 31,
+    35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0,  0};
+
+static const int stbi__zlength_extra[31] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
+                                            1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4,
+                                            4, 4, 5, 5, 5, 5, 0, 0, 0};
+
+static const int stbi__zdist_base[32] = {
+    1,    2,    3,    4,    5,    7,     9,     13,    17,  25,   33,
+    49,   65,   97,   129,  193,  257,   385,   513,   769, 1025, 1537,
+    2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0,   0};
+
+static const int stbi__zdist_extra[32] = {0, 0, 0,  0,  1,  1,  2,  2,  3,  3,
+                                          4, 4, 5,  5,  6,  6,  7,  7,  8,  8,
+                                          9, 9, 10, 10, 11, 11, 12, 12, 13, 13};
+
+static int stbi__parse_huffman_block(stbi__zbuf *a) {
+  char *zout = a->zout;
+  for (;;) {
+    int z = stbi__zhuffman_decode(a, &a->z_length);
+    if (z < 256) {
+      if (z < 0)
+        return stbi__err("bad huffman code",
+                         "Corrupt PNG"); // error in huffman codes
+      if (zout >= a->zout_end) {
+        if (!stbi__zexpand(a, zout, 1))
+          return 0;
+        zout = a->zout;
+      }
+      *zout++ = (char)z;
+    } else {
+      stbi_uc *p;
+      int len, dist;
+      if (z == 256) {
+        a->zout = zout;
+        return 1;
+      }
+      z -= 257;
+      len = stbi__zlength_base[z];
+      if (stbi__zlength_extra[z])
+        len += stbi__zreceive(a, stbi__zlength_extra[z]);
+      z = stbi__zhuffman_decode(a, &a->z_distance);
+      if (z < 0)
+        return stbi__err("bad huffman code", "Corrupt PNG");
+      dist = stbi__zdist_base[z];
+      if (stbi__zdist_extra[z])
+        dist += stbi__zreceive(a, stbi__zdist_extra[z]);
+      if (zout - a->zout_start < dist)
+        return stbi__err("bad dist", "Corrupt PNG");
+      if (zout + len > a->zout_end) {
+        if (!stbi__zexpand(a, zout, len))
+          return 0;
+        zout = a->zout;
+      }
+      p = (stbi_uc *)(zout - dist);
+      if (dist == 1) { // run of one byte; common in images.
+        stbi_uc v = *p;
+        if (len) {
+          do
+            *zout++ = v;
+          while (--len);
+        }
       } else {
-         stbi_uc *p;
-         int len,dist;
-         if (z == 256) {
-            a->zout = zout;
-            return 1;
-         }
-         z -= 257;
-         len = stbi__zlength_base[z];
-         if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]);
-         z = stbi__zhuffman_decode(a, &a->z_distance);
-         if (z < 0) return stbi__err("bad huffman code","Corrupt PNG");
-         dist = stbi__zdist_base[z];
-         if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
-         if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG");
-         if (zout + len > a->zout_end) {
-            if (!stbi__zexpand(a, zout, len)) return 0;
-            zout = a->zout;
-         }
-         p = (stbi_uc *) (zout - dist);
-         if (dist == 1) { // run of one byte; common in images.
-            stbi_uc v = *p;
-            if (len) { do *zout++ = v; while (--len); }
-         } else {
-            if (len) { do *zout++ = *p++; while (--len); }
-         }
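+        // copy byte-by-byte: the match may overlap the bytes being written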
+        if (len) {
+          do
+            *zout++ = *p++;
+          while (--len);
+        }
       }
-   }
-}
-
-static int stbi__compute_huffman_codes(stbi__zbuf *a)
-{
-   static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
-   stbi__zhuffman z_codelength;
-   stbi_uc lencodes[286+32+137];//padding for maximum single op
-   stbi_uc codelength_sizes[19];
-   int i,n;
-
-   int hlit  = stbi__zreceive(a,5) + 257;
-   int hdist = stbi__zreceive(a,5) + 1;
-   int hclen = stbi__zreceive(a,4) + 4;
-   int ntot  = hlit + hdist;
-
-   memset(codelength_sizes, 0, sizeof(codelength_sizes));
-   for (i=0; i < hclen; ++i) {
-      int s = stbi__zreceive(a,3);
-      codelength_sizes[length_dezigzag[i]] = (stbi_uc) s;
-   }
-   if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
-
-   n = 0;
-   while (n < ntot) {
-      int c = stbi__zhuffman_decode(a, &z_codelength);
-      if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG");
-      if (c < 16)
-         lencodes[n++] = (stbi_uc) c;
+    }
+  }
+}
+
+static int stbi__compute_huffman_codes(stbi__zbuf *a) {
+  static const stbi_uc length_dezigzag[19] = {
+      16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+  stbi__zhuffman z_codelength;
+  stbi_uc lencodes[286 + 32 + 137]; // padding for maximum single op
+  stbi_uc codelength_sizes[19];
+  int i, n;
+
+  int hlit = stbi__zreceive(a, 5) + 257;
+  int hdist = stbi__zreceive(a, 5) + 1;
+  int hclen = stbi__zreceive(a, 4) + 4;
+  int ntot = hlit + hdist;
+
+  memset(codelength_sizes, 0, sizeof(codelength_sizes));
+  for (i = 0; i < hclen; ++i) {
+    int s = stbi__zreceive(a, 3);
+    codelength_sizes[length_dezigzag[i]] = (stbi_uc)s;
+  }
+  if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19))
+    return 0;
+
+  n = 0;
+  while (n < ntot) {
+    int c = stbi__zhuffman_decode(a, &z_codelength);
+    if (c < 0 || c >= 19)
+      return stbi__err("bad codelengths", "Corrupt PNG");
+    if (c < 16)
+      lencodes[n++] = (stbi_uc)c;
+    else {
+      stbi_uc fill = 0;
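+      // code 16 repeats the previous length 3-6 times;
+      // 17 and 18 zero-fill 3-10 and 11-138 lengths respectively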
+      if (c == 16) {
+        c = stbi__zreceive(a, 2) + 3;
+        if (n == 0)
+          return stbi__err("bad codelengths", "Corrupt PNG");
+        fill = lencodes[n - 1];
+      } else if (c == 17)
+        c = stbi__zreceive(a, 3) + 3;
       else {
-         stbi_uc fill = 0;
-         if (c == 16) {
-            c = stbi__zreceive(a,2)+3;
-            if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG");
-            fill = lencodes[n-1];
-         } else if (c == 17)
-            c = stbi__zreceive(a,3)+3;
-         else {
-            STBI_ASSERT(c == 18);
-            c = stbi__zreceive(a,7)+11;
-         }
-         if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG");
-         memset(lencodes+n, fill, c);
-         n += c;
+        STBI_ASSERT(c == 18);
+        c = stbi__zreceive(a, 7) + 11;
       }
-   }
-   if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG");
-   if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0;
-   if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0;
-   return 1;
-}
-
-static int stbi__parse_uncompressed_block(stbi__zbuf *a)
-{
-   stbi_uc header[4];
-   int len,nlen,k;
-   if (a->num_bits & 7)
-      stbi__zreceive(a, a->num_bits & 7); // discard
-   // drain the bit-packed data into header
-   k = 0;
-   while (a->num_bits > 0) {
-      header[k++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check
-      a->code_buffer >>= 8;
-      a->num_bits -= 8;
-   }
-   STBI_ASSERT(a->num_bits == 0);
-   // now fill header the normal way
-   while (k < 4)
-      header[k++] = stbi__zget8(a);
-   len  = header[1] * 256 + header[0];
-   nlen = header[3] * 256 + header[2];
-   if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt","Corrupt PNG");
-   if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer","Corrupt PNG");
-   if (a->zout + len > a->zout_end)
-      if (!stbi__zexpand(a, a->zout, len)) return 0;
-   memcpy(a->zout, a->zbuffer, len);
-   a->zbuffer += len;
-   a->zout += len;
-   return 1;
-}
-
-static int stbi__parse_zlib_header(stbi__zbuf *a)
-{
-   int cmf   = stbi__zget8(a);
-   int cm    = cmf & 15;
-   /* int cinfo = cmf >> 4; */
-   int flg   = stbi__zget8(a);
-   if ((cmf*256+flg) % 31 != 0) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
-   if (flg & 32) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png
-   if (cm != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png
-   // window = 1 << (8 + cinfo)... but who cares, we fully buffer output
-   return 1;
-}
-
-static const stbi_uc stbi__zdefault_length[288] =
-{
-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
-   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
-   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
-   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
-   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8
-};
-static const stbi_uc stbi__zdefault_distance[32] =
-{
-   5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
-};
+      if (ntot - n < c)
+        return stbi__err("bad codelengths", "Corrupt PNG");
+      memset(lencodes + n, fill, c);
+      n += c;
+    }
+  }
+  if (n != ntot)
+    return stbi__err("bad codelengths", "Corrupt PNG");
+  if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit))
+    return 0;
+  if (!stbi__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist))
+    return 0;
+  return 1;
+}
+
+static int stbi__parse_uncompressed_block(stbi__zbuf *a) {
+  stbi_uc header[4];
+  int len, nlen, k;
+  if (a->num_bits & 7)
+    stbi__zreceive(a, a->num_bits & 7); // discard
+  // drain the bit-packed data into header
+  k = 0;
+  while (a->num_bits > 0) {
+    header[k++] =
+        (stbi_uc)(a->code_buffer & 255); // suppress MSVC run-time check
+    a->code_buffer >>= 8;
+    a->num_bits -= 8;
+  }
+  STBI_ASSERT(a->num_bits == 0);
+  // now fill header the normal way
+  while (k < 4)
+    header[k++] = stbi__zget8(a);
+  len = header[1] * 256 + header[0];
+  nlen = header[3] * 256 + header[2];
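+  // stored-block header: nlen must be the one's complement of len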
+  if (nlen != (len ^ 0xffff))
+    return stbi__err("zlib corrupt", "Corrupt PNG");
+  if (a->zbuffer + len > a->zbuffer_end)
+    return stbi__err("read past buffer", "Corrupt PNG");
+  if (a->zout + len > a->zout_end)
+    if (!stbi__zexpand(a, a->zout, len))
+      return 0;
+  memcpy(a->zout, a->zbuffer, len);
+  a->zbuffer += len;
+  a->zout += len;
+  return 1;
+}
+
+static int stbi__parse_zlib_header(stbi__zbuf *a) {
+  int cmf = stbi__zget8(a);
+  int cm = cmf & 15;
+  /* int cinfo = cmf >> 4; */
+  int flg = stbi__zget8(a);
+  if ((cmf * 256 + flg) % 31 != 0)
+    return stbi__err("bad zlib header", "Corrupt PNG"); // zlib spec
+  if (flg & 32)
+    return stbi__err("no preset dict",
+                     "Corrupt PNG"); // preset dictionary not allowed in png
+  if (cm != 8)
+    return stbi__err("bad compression",
+                     "Corrupt PNG"); // DEFLATE required for png
+  // window = 1 << (8 + cinfo)... but who cares, we fully buffer output
+  return 1;
+}
+
+static const stbi_uc stbi__zdefault_length[288] = {
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8};
+static const stbi_uc stbi__zdefault_distance[32] = {
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
 /*
 Init algorithm:
 {
@@ -4204,117 +4614,131 @@ Init algorithm:
 }
 */
 
-static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
-{
-   int final, type;
-   if (parse_header)
-      if (!stbi__parse_zlib_header(a)) return 0;
-   a->num_bits = 0;
-   a->code_buffer = 0;
-   do {
-      final = stbi__zreceive(a,1);
-      type = stbi__zreceive(a,2);
-      if (type == 0) {
-         if (!stbi__parse_uncompressed_block(a)) return 0;
-      } else if (type == 3) {
-         return 0;
+static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) {
+  int final, type;
+  if (parse_header)
+    if (!stbi__parse_zlib_header(a))
+      return 0;
+  a->num_bits = 0;
+  a->code_buffer = 0;
+  do {
+    final = stbi__zreceive(a, 1);
+    type = stbi__zreceive(a, 2);
+    if (type == 0) {
+      if (!stbi__parse_uncompressed_block(a))
+        return 0;
+    } else if (type == 3) {
+      return 0;
+    } else {
+      if (type == 1) {
+        // use fixed code lengths
+        if (!stbi__zbuild_huffman(&a->z_length, stbi__zdefault_length, 288))
+          return 0;
+        if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32))
+          return 0;
       } else {
-         if (type == 1) {
-            // use fixed code lengths
-            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , 288)) return 0;
-            if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
-         } else {
-            if (!stbi__compute_huffman_codes(a)) return 0;
-         }
-         if (!stbi__parse_huffman_block(a)) return 0;
+        if (!stbi__compute_huffman_codes(a))
+          return 0;
       }
-   } while (!final);
-   return 1;
-}
-
-static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header)
-{
-   a->zout_start = obuf;
-   a->zout       = obuf;
-   a->zout_end   = obuf + olen;
-   a->z_expandable = exp;
-
-   return stbi__parse_zlib(a, parse_header);
-}
-
-STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen)
-{
-   stbi__zbuf a;
-   char *p = (char *) stbi__malloc(initial_size);
-   if (p == NULL) return NULL;
-   a.zbuffer = (stbi_uc *) buffer;
-   a.zbuffer_end = (stbi_uc *) buffer + len;
-   if (stbi__do_zlib(&a, p, initial_size, 1, 1)) {
-      if (outlen) *outlen = (int) (a.zout - a.zout_start);
-      return a.zout_start;
-   } else {
-      STBI_FREE(a.zout_start);
-      return NULL;
-   }
-}
-
-STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen)
-{
-   return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen);
-}
-
-STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header)
-{
-   stbi__zbuf a;
-   char *p = (char *) stbi__malloc(initial_size);
-   if (p == NULL) return NULL;
-   a.zbuffer = (stbi_uc *) buffer;
-   a.zbuffer_end = (stbi_uc *) buffer + len;
-   if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) {
-      if (outlen) *outlen = (int) (a.zout - a.zout_start);
-      return a.zout_start;
-   } else {
-      STBI_FREE(a.zout_start);
-      return NULL;
-   }
-}
-
-STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen)
-{
-   stbi__zbuf a;
-   a.zbuffer = (stbi_uc *) ibuffer;
-   a.zbuffer_end = (stbi_uc *) ibuffer + ilen;
-   if (stbi__do_zlib(&a, obuffer, olen, 0, 1))
-      return (int) (a.zout - a.zout_start);
-   else
-      return -1;
-}
-
-STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen)
-{
-   stbi__zbuf a;
-   char *p = (char *) stbi__malloc(16384);
-   if (p == NULL) return NULL;
-   a.zbuffer = (stbi_uc *) buffer;
-   a.zbuffer_end = (stbi_uc *) buffer+len;
-   if (stbi__do_zlib(&a, p, 16384, 1, 0)) {
-      if (outlen) *outlen = (int) (a.zout - a.zout_start);
-      return a.zout_start;
-   } else {
-      STBI_FREE(a.zout_start);
-      return NULL;
-   }
-}
-
-STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen)
-{
-   stbi__zbuf a;
-   a.zbuffer = (stbi_uc *) ibuffer;
-   a.zbuffer_end = (stbi_uc *) ibuffer + ilen;
-   if (stbi__do_zlib(&a, obuffer, olen, 0, 0))
-      return (int) (a.zout - a.zout_start);
-   else
-      return -1;
+      if (!stbi__parse_huffman_block(a))
+        return 0;
+    }
+  } while (!final);
+  return 1;
+}
+
+static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp,
+                         int parse_header) {
+  a->zout_start = obuf;
+  a->zout = obuf;
+  a->zout_end = obuf + olen;
+  a->z_expandable = exp;
+
+  return stbi__parse_zlib(a, parse_header);
+}
+
+STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len,
+                                                int initial_size, int *outlen) {
+  stbi__zbuf a;
+  char *p = (char *)stbi__malloc(initial_size);
+  if (p == NULL)
+    return NULL;
+  a.zbuffer = (stbi_uc *)buffer;
+  a.zbuffer_end = (stbi_uc *)buffer + len;
+  if (stbi__do_zlib(&a, p, initial_size, 1, 1)) {
+    if (outlen)
+      *outlen = (int)(a.zout - a.zout_start);
+    return a.zout_start;
+  } else {
+    STBI_FREE(a.zout_start);
+    return NULL;
+  }
+}
+
+STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len,
+                                      int *outlen) {
+  return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen);
+}
+
+STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer,
+                                                           int len,
+                                                           int initial_size,
+                                                           int *outlen,
+                                                           int parse_header) {
+  stbi__zbuf a;
+  char *p = (char *)stbi__malloc(initial_size);
+  if (p == NULL)
+    return NULL;
+  a.zbuffer = (stbi_uc *)buffer;
+  a.zbuffer_end = (stbi_uc *)buffer + len;
+  if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) {
+    if (outlen)
+      *outlen = (int)(a.zout - a.zout_start);
+    return a.zout_start;
+  } else {
+    STBI_FREE(a.zout_start);
+    return NULL;
+  }
+}
+
+STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen,
+                                    char const *ibuffer, int ilen) {
+  stbi__zbuf a;
+  a.zbuffer = (stbi_uc *)ibuffer;
+  a.zbuffer_end = (stbi_uc *)ibuffer + ilen;
+  if (stbi__do_zlib(&a, obuffer, olen, 0, 1))
+    return (int)(a.zout - a.zout_start);
+  else
+    return -1;
+}
+
+STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len,
+                                               int *outlen) {
+  stbi__zbuf a;
+  char *p = (char *)stbi__malloc(16384);
+  if (p == NULL)
+    return NULL;
+  a.zbuffer = (stbi_uc *)buffer;
+  a.zbuffer_end = (stbi_uc *)buffer + len;
+  if (stbi__do_zlib(&a, p, 16384, 1, 0)) {
+    if (outlen)
+      *outlen = (int)(a.zout - a.zout_start);
+    return a.zout_start;
+  } else {
+    STBI_FREE(a.zout_start);
+    return NULL;
+  }
+}
+
+STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen,
+                                             const char *ibuffer, int ilen) {
+  stbi__zbuf a;
+  a.zbuffer = (stbi_uc *)ibuffer;
+  a.zbuffer_end = (stbi_uc *)ibuffer + ilen;
+  if (stbi__do_zlib(&a, obuffer, olen, 0, 0))
+    return (int)(a.zout - a.zout_start);
+  else
+    return -1;
 }
 #endif
 
@@ -4329,1062 +4753,1276 @@ STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char
 //      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
 
 #ifndef STBI_NO_PNG
-typedef struct
-{
-   stbi__uint32 length;
-   stbi__uint32 type;
+typedef struct {
+  stbi__uint32 length;
+  stbi__uint32 type;
 } stbi__pngchunk;
 
-static stbi__pngchunk stbi__get_chunk_header(stbi__context *s)
-{
-   stbi__pngchunk c;
-   c.length = stbi__get32be(s);
-   c.type   = stbi__get32be(s);
-   return c;
+static stbi__pngchunk stbi__get_chunk_header(stbi__context *s) {
+  stbi__pngchunk c;
+  c.length = stbi__get32be(s);
+  c.type = stbi__get32be(s);
+  return c;
 }
 
-static int stbi__check_png_header(stbi__context *s)
-{
-   static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
-   int i;
-   for (i=0; i < 8; ++i)
-      if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig","Not a PNG");
-   return 1;
+static int stbi__check_png_header(stbi__context *s) {
+  static const stbi_uc png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10};
+  int i;
+  for (i = 0; i < 8; ++i)
+    if (stbi__get8(s) != png_sig[i])
+      return stbi__err("bad png sig", "Not a PNG");
+  return 1;
 }
 
-typedef struct
-{
-   stbi__context *s;
-   stbi_uc *idata, *expanded, *out;
-   int depth;
+typedef struct {
+  stbi__context *s;
+  stbi_uc *idata, *expanded, *out;
+  int depth;
 } stbi__png;
 
-
 enum {
-   STBI__F_none=0,
-   STBI__F_sub=1,
-   STBI__F_up=2,
-   STBI__F_avg=3,
-   STBI__F_paeth=4,
-   // synthetic filters used for first scanline to avoid needing a dummy row of 0s
-   STBI__F_avg_first,
-   STBI__F_paeth_first
+  STBI__F_none = 0,
+  STBI__F_sub = 1,
+  STBI__F_up = 2,
+  STBI__F_avg = 3,
+  STBI__F_paeth = 4,
+  // synthetic filters used for the first scanline to avoid
+  // needing a dummy row of 0s
+  STBI__F_avg_first,
+  STBI__F_paeth_first
 };
 
-static stbi_uc first_row_filter[5] =
-{
-   STBI__F_none,
-   STBI__F_sub,
-   STBI__F_none,
-   STBI__F_avg_first,
-   STBI__F_paeth_first
-};
+static stbi_uc first_row_filter[5] = {STBI__F_none, STBI__F_sub, STBI__F_none,
+                                      STBI__F_avg_first, STBI__F_paeth_first};
 
-static int stbi__paeth(int a, int b, int c)
-{
-   int p = a + b - c;
-   int pa = abs(p-a);
-   int pb = abs(p-b);
-   int pc = abs(p-c);
-   if (pa <= pb && pa <= pc) return a;
-   if (pb <= pc) return b;
-   return c;
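+// Paeth predictor (PNG spec): pick whichever of left (a), up (b), or
+// up-left (c) is closest to the linear estimate p = a + b - c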
+static int stbi__paeth(int a, int b, int c) {
+  int p = a + b - c;
+  int pa = abs(p - a);
+  int pb = abs(p - b);
+  int pc = abs(p - c);
+  if (pa <= pb && pa <= pc)
+    return a;
+  if (pb <= pc)
+    return b;
+  return c;
 }
 
-static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
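+// multiplying an n-bit sample by this factor replicates it out to 8 bits:
+// 1-bit * 0xff, 2-bit * 0x55, 4-bit * 0x11, 8-bit * 0x01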
+static const stbi_uc stbi__depth_scale_table[9] = {0, 0xff, 0x55, 0,   0x11,
+                                                   0, 0,    0,    0x01};
 
 // create the png data from post-deflated data
-static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color)
-{
-   int bytes = (depth == 16? 2 : 1);
-   stbi__context *s = a->s;
-   stbi__uint32 i,j,stride = x*out_n*bytes;
-   stbi__uint32 img_len, img_width_bytes;
-   int k;
-   int img_n = s->img_n; // copy it into a local for later
-
-   int output_bytes = out_n*bytes;
-   int filter_bytes = img_n*bytes;
-   int width = x;
-
-   STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1);
-   a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
-   if (!a->out) return stbi__err("outofmem", "Out of memory");
-
-   if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG");
-   img_width_bytes = (((img_n * x * depth) + 7) >> 3);
-   img_len = (img_width_bytes + 1) * y;
-
-   // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
-   // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
-   // so just check for raw_len < img_len always.
-   if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
-
-   for (j=0; j < y; ++j) {
-      stbi_uc *cur = a->out + stride*j;
-      stbi_uc *prior;
-      int filter = *raw++;
-
-      if (filter > 4)
-         return stbi__err("invalid filter","Corrupt PNG");
-
-      if (depth < 8) {
-         STBI_ASSERT(img_width_bytes <= x);
-         cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
-         filter_bytes = 1;
-         width = img_width_bytes;
-      }
-      prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
-
-      // if first row, use special filter that doesn't sample previous row
-      if (j == 0) filter = first_row_filter[filter];
-
-      // handle first byte explicitly
-      for (k=0; k < filter_bytes; ++k) {
-         switch (filter) {
-            case STBI__F_none       : cur[k] = raw[k]; break;
-            case STBI__F_sub        : cur[k] = raw[k]; break;
-            case STBI__F_up         : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
-            case STBI__F_avg        : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break;
-            case STBI__F_paeth      : cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0,prior[k],0)); break;
-            case STBI__F_avg_first  : cur[k] = raw[k]; break;
-            case STBI__F_paeth_first: cur[k] = raw[k]; break;
-         }
+static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw,
+                                      stbi__uint32 raw_len, int out_n,
+                                      stbi__uint32 x, stbi__uint32 y, int depth,
+                                      int color) {
+  int bytes = (depth == 16 ? 2 : 1);
+  stbi__context *s = a->s;
+  stbi__uint32 i, j, stride = x * out_n * bytes;
+  stbi__uint32 img_len, img_width_bytes;
+  int k;
+  int img_n = s->img_n; // copy it into a local for later
+
+  int output_bytes = out_n * bytes;
+  int filter_bytes = img_n * bytes;
+  int width = x;
+
+  STBI_ASSERT(out_n == s->img_n || out_n == s->img_n + 1);
+  a->out = (stbi_uc *)stbi__malloc_mad3(
+      x, y, output_bytes, 0); // extra bytes to write off the end into
+  if (!a->out)
+    return stbi__err("outofmem", "Out of memory");
+
+  if (!stbi__mad3sizes_valid(img_n, x, depth, 7))
+    return stbi__err("too large", "Corrupt PNG");
+  img_width_bytes = (((img_n * x * depth) + 7) >> 3);
+  img_len = (img_width_bytes + 1) * y;
+
+  // we used to check for exact match between raw_len and img_len on
+  // non-interlaced PNGs, but issue #276 reported a PNG in the wild that had
+  // extra data at the end (all zeros), so just check for raw_len < img_len
+  // always.
+  if (raw_len < img_len)
+    return stbi__err("not enough pixels", "Corrupt PNG");
+
+  for (j = 0; j < y; ++j) {
+    stbi_uc *cur = a->out + stride * j;
+    stbi_uc *prior;
+    int filter = *raw++;
+
+    if (filter > 4)
+      return stbi__err("invalid filter", "Corrupt PNG");
+
+    if (depth < 8) {
+      STBI_ASSERT(img_width_bytes <= x);
+      // store output to the rightmost img_width_bytes so we can decode in place
+      cur += x * out_n - img_width_bytes;
+      filter_bytes = 1;
+      width = img_width_bytes;
+    }
+    // bugfix: need to compute this after the 'cur +=' computation above
+    prior = cur - stride;
+
+    // if first row, use special filter that doesn't sample previous row
+    if (j == 0)
+      filter = first_row_filter[filter];
+
+    // handle first byte explicitly
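+    // (no pixel to the left: sub, avg_first and paeth_first reduce to a raw copy)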
+    for (k = 0; k < filter_bytes; ++k) {
+      switch (filter) {
+      case STBI__F_none:
+        cur[k] = raw[k];
+        break;
+      case STBI__F_sub:
+        cur[k] = raw[k];
+        break;
+      case STBI__F_up:
+        cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
+        break;
+      case STBI__F_avg:
+        cur[k] = STBI__BYTECAST(raw[k] + (prior[k] >> 1));
+        break;
+      case STBI__F_paeth:
+        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0, prior[k], 0));
+        break;
+      case STBI__F_avg_first:
+        cur[k] = raw[k];
+        break;
+      case STBI__F_paeth_first:
+        cur[k] = raw[k];
+        break;
       }
+    }
 
-      if (depth == 8) {
-         if (img_n != out_n)
-            cur[img_n] = 255; // first pixel
-         raw += img_n;
-         cur += out_n;
-         prior += out_n;
-      } else if (depth == 16) {
-         if (img_n != out_n) {
-            cur[filter_bytes]   = 255; // first pixel top byte
-            cur[filter_bytes+1] = 255; // first pixel bottom byte
-         }
-         raw += filter_bytes;
-         cur += output_bytes;
-         prior += output_bytes;
-      } else {
-         raw += 1;
-         cur += 1;
-         prior += 1;
+    if (depth == 8) {
+      if (img_n != out_n)
+        cur[img_n] = 255; // first pixel
+      raw += img_n;
+      cur += out_n;
+      prior += out_n;
+    } else if (depth == 16) {
+      if (img_n != out_n) {
+        cur[filter_bytes] = 255;     // first pixel top byte
+        cur[filter_bytes + 1] = 255; // first pixel bottom byte
       }
+      raw += filter_bytes;
+      cur += output_bytes;
+      prior += output_bytes;
+    } else {
+      raw += 1;
+      cur += 1;
+      prior += 1;
+    }
 
-      // this is a little gross, so that we don't switch per-pixel or per-component
-      if (depth < 8 || img_n == out_n) {
-         int nk = (width - 1)*filter_bytes;
-         #define STBI__CASE(f) \
-             case f:     \
-                for (k=0; k < nk; ++k)
-         switch (filter) {
-            // "none" filter turns into a memcpy here; make that explicit.
-            case STBI__F_none:         memcpy(cur, raw, nk); break;
-            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break;
-            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
-            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break;
-            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break;
-            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break;
-            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break;
-         }
-         #undef STBI__CASE
-         raw += nk;
-      } else {
-         STBI_ASSERT(img_n+1 == out_n);
-         #define STBI__CASE(f) \
-             case f:     \
-                for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
-                   for (k=0; k < filter_bytes; ++k)
-         switch (filter) {
-            STBI__CASE(STBI__F_none)         { cur[k] = raw[k]; } break;
-            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break;
-            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
-            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break;
-            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break;
-            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break;
-            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break;
-         }
-         #undef STBI__CASE
-
-         // the loop above sets the high byte of the pixels' alpha, but for
-         // 16 bit png files we also need the low byte set. we'll do that here.
-         if (depth == 16) {
-            cur = a->out + stride*j; // start at the beginning of the row again
-            for (i=0; i < x; ++i,cur+=output_bytes) {
-               cur[filter_bytes+1] = 255;
-            }
-         }
+    // this is a little gross, so that we don't switch per-pixel or
+    // per-component
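+    // fast path first: when input and output layouts match, filter the whole
+    // row of nk bytes flat; otherwise interleave the alpha byte while
+    // advancing pixel by pixel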
+    if (depth < 8 || img_n == out_n) {
+      int nk = (width - 1) * filter_bytes;
+#define STBI__CASE(f)                                                          \
+  case f:                                                                      \
+    for (k = 0; k < nk; ++k)
+      switch (filter) {
+      // "none" filter turns into a memcpy here; make that explicit.
+      case STBI__F_none:
+        memcpy(cur, raw, nk);
+        break;
+        STBI__CASE(STBI__F_sub) {
+          cur[k] = STBI__BYTECAST(raw[k] + cur[k - filter_bytes]);
+        }
+        break;
+        STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); }
+        break;
+        STBI__CASE(STBI__F_avg) {
+          cur[k] = STBI__BYTECAST(raw[k] +
+                                  ((prior[k] + cur[k - filter_bytes]) >> 1));
+        }
+        break;
+        STBI__CASE(STBI__F_paeth) {
+          cur[k] = STBI__BYTECAST(raw[k] +
+                                  stbi__paeth(cur[k - filter_bytes], prior[k],
+                                              prior[k - filter_bytes]));
+        }
+        break;
+        STBI__CASE(STBI__F_avg_first) {
+          cur[k] = STBI__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1));
+        }
+        break;
+        STBI__CASE(STBI__F_paeth_first) {
+          cur[k] =
+              STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], 0, 0));
+        }
+        break;
       }
-   }
-
-   // we make a separate pass to expand bits to pixels; for performance,
-   // this could run two scanlines behind the above code, so it won't
-   // intefere with filtering but will still be in the cache.
-   if (depth < 8) {
-      for (j=0; j < y; ++j) {
-         stbi_uc *cur = a->out + stride*j;
-         stbi_uc *in  = a->out + stride*j + x*out_n - img_width_bytes;
-         // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit
-         // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop
-         stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
-
-         // note that the final byte might overshoot and write more data than desired.
-         // we can allocate enough data that this never writes out of memory, but it
-         // could also overwrite the next scanline. can it overwrite non-empty data
-         // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
-         // so we need to explicitly clamp the final ones
-
-         if (depth == 4) {
-            for (k=x*img_n; k >= 2; k-=2, ++in) {
-               *cur++ = scale * ((*in >> 4)       );
-               *cur++ = scale * ((*in     ) & 0x0f);
-            }
-            if (k > 0) *cur++ = scale * ((*in >> 4)       );
-         } else if (depth == 2) {
-            for (k=x*img_n; k >= 4; k-=4, ++in) {
-               *cur++ = scale * ((*in >> 6)       );
-               *cur++ = scale * ((*in >> 4) & 0x03);
-               *cur++ = scale * ((*in >> 2) & 0x03);
-               *cur++ = scale * ((*in     ) & 0x03);
-            }
-            if (k > 0) *cur++ = scale * ((*in >> 6)       );
-            if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03);
-            if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03);
-         } else if (depth == 1) {
-            for (k=x*img_n; k >= 8; k-=8, ++in) {
-               *cur++ = scale * ((*in >> 7)       );
-               *cur++ = scale * ((*in >> 6) & 0x01);
-               *cur++ = scale * ((*in >> 5) & 0x01);
-               *cur++ = scale * ((*in >> 4) & 0x01);
-               *cur++ = scale * ((*in >> 3) & 0x01);
-               *cur++ = scale * ((*in >> 2) & 0x01);
-               *cur++ = scale * ((*in >> 1) & 0x01);
-               *cur++ = scale * ((*in     ) & 0x01);
-            }
-            if (k > 0) *cur++ = scale * ((*in >> 7)       );
-            if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01);
-            if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01);
-            if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01);
-            if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01);
-            if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01);
-            if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
-         }
-         if (img_n != out_n) {
-            int q;
-            // insert alpha = 255
-            cur = a->out + stride*j;
-            if (img_n == 1) {
-               for (q=x-1; q >= 0; --q) {
-                  cur[q*2+1] = 255;
-                  cur[q*2+0] = cur[q];
-               }
-            } else {
-               STBI_ASSERT(img_n == 3);
-               for (q=x-1; q >= 0; --q) {
-                  cur[q*4+3] = 255;
-                  cur[q*4+2] = cur[q*3+2];
-                  cur[q*4+1] = cur[q*3+1];
-                  cur[q*4+0] = cur[q*3+0];
-               }
-            }
-         }
+#undef STBI__CASE
+      raw += nk;
+    } else {
+      STBI_ASSERT(img_n + 1 == out_n);
+#define STBI__CASE(f)                                                          \
+  case f:                                                                      \
+    for (i = x - 1; i >= 1; --i, cur[filter_bytes] = 255, raw += filter_bytes, \
+        cur += output_bytes, prior += output_bytes)                            \
+      for (k = 0; k < filter_bytes; ++k)
+      switch (filter) {
+        STBI__CASE(STBI__F_none) { cur[k] = raw[k]; }
+        break;
+        STBI__CASE(STBI__F_sub) {
+          cur[k] = STBI__BYTECAST(raw[k] + cur[k - output_bytes]);
+        }
+        break;
+        STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); }
+        break;
+        STBI__CASE(STBI__F_avg) {
+          cur[k] = STBI__BYTECAST(raw[k] +
+                                  ((prior[k] + cur[k - output_bytes]) >> 1));
+        }
+        break;
+        STBI__CASE(STBI__F_paeth) {
+          cur[k] = STBI__BYTECAST(raw[k] +
+                                  stbi__paeth(cur[k - output_bytes], prior[k],
+                                              prior[k - output_bytes]));
+        }
+        break;
+        STBI__CASE(STBI__F_avg_first) {
+          cur[k] = STBI__BYTECAST(raw[k] + (cur[k - output_bytes] >> 1));
+        }
+        break;
+        STBI__CASE(STBI__F_paeth_first) {
+          cur[k] =
+              STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], 0, 0));
+        }
+        break;
       }
-   } else if (depth == 16) {
-      // force the image data from big-endian to platform-native.
-      // this is done in a separate pass due to the decoding relying
-      // on the data being untouched, but could probably be done
-      // per-line during decode if care is taken.
-      stbi_uc *cur = a->out;
-      stbi__uint16 *cur16 = (stbi__uint16*)cur;
-
-      for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) {
-         *cur16 = (cur[0] << 8) | cur[1];
+#undef STBI__CASE
+
+      // the loop above sets the high byte of the pixels' alpha, but for
+      // 16 bit png files we also need the low byte set. we'll do that here.
+      if (depth == 16) {
+        cur = a->out + stride * j; // start at the beginning of the row again
+        for (i = 0; i < x; ++i, cur += output_bytes) {
+          cur[filter_bytes + 1] = 255;
+        }
       }
-   }
+    }
+  }
+
+  // we make a separate pass to expand bits to pixels; for performance,
+  // this could run two scanlines behind the above code, so it won't
+  // interfere with filtering but will still be in the cache.
+  if (depth < 8) {
+    for (j = 0; j < y; ++j) {
+      stbi_uc *cur = a->out + stride * j;
+      stbi_uc *in = a->out + stride * j + x * out_n - img_width_bytes;
+      // unpack 1/2/4-bit into an 8-bit buffer. allows us to keep the common
+      // 8-bit path optimal at minimal cost for 1/2/4-bit. PNG guarantees byte
+      // alignment; if width is not a multiple of 8/4/2 we'll decode dummy
+      // trailing data that will be skipped in the later loop
+      stbi_uc scale = (color == 0)
+                          ? stbi__depth_scale_table[depth]
+                          : 1; // scale grayscale values to 0..255 range
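+      // (stbi__depth_scale_table holds 0xff/0x55/0x11 for depths 1/2/4, so
+      // e.g. a 2-bit sample of 3 becomes 3 * 0x55 == 255)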
+
+      // note that the final byte might overshoot and write more data than
+      // desired. we can allocate enough data that this never writes outside
+      // the allocation, but it could still overwrite the next scanline. can it
+      // overwrite non-empty data on the next scanline? yes, consider
+      // 1-pixel-wide scanlines with 1-bit-per-pixel. so we need to explicitly
+      // clamp the final ones
+
+      if (depth == 4) {
+        for (k = x * img_n; k >= 2; k -= 2, ++in) {
+          *cur++ = scale * ((*in >> 4));
+          *cur++ = scale * ((*in) & 0x0f);
+        }
+        if (k > 0)
+          *cur++ = scale * ((*in >> 4));
+      } else if (depth == 2) {
+        for (k = x * img_n; k >= 4; k -= 4, ++in) {
+          *cur++ = scale * ((*in >> 6));
+          *cur++ = scale * ((*in >> 4) & 0x03);
+          *cur++ = scale * ((*in >> 2) & 0x03);
+          *cur++ = scale * ((*in) & 0x03);
+        }
+        if (k > 0)
+          *cur++ = scale * ((*in >> 6));
+        if (k > 1)
+          *cur++ = scale * ((*in >> 4) & 0x03);
+        if (k > 2)
+          *cur++ = scale * ((*in >> 2) & 0x03);
+      } else if (depth == 1) {
+        for (k = x * img_n; k >= 8; k -= 8, ++in) {
+          *cur++ = scale * ((*in >> 7));
+          *cur++ = scale * ((*in >> 6) & 0x01);
+          *cur++ = scale * ((*in >> 5) & 0x01);
+          *cur++ = scale * ((*in >> 4) & 0x01);
+          *cur++ = scale * ((*in >> 3) & 0x01);
+          *cur++ = scale * ((*in >> 2) & 0x01);
+          *cur++ = scale * ((*in >> 1) & 0x01);
+          *cur++ = scale * ((*in) & 0x01);
+        }
+        if (k > 0)
+          *cur++ = scale * ((*in >> 7));
+        if (k > 1)
+          *cur++ = scale * ((*in >> 6) & 0x01);
+        if (k > 2)
+          *cur++ = scale * ((*in >> 5) & 0x01);
+        if (k > 3)
+          *cur++ = scale * ((*in >> 4) & 0x01);
+        if (k > 4)
+          *cur++ = scale * ((*in >> 3) & 0x01);
+        if (k > 5)
+          *cur++ = scale * ((*in >> 2) & 0x01);
+        if (k > 6)
+          *cur++ = scale * ((*in >> 1) & 0x01);
+      }
+      if (img_n != out_n) {
+        int q;
+        // insert alpha = 255
+        cur = a->out + stride * j;
+        if (img_n == 1) {
+          for (q = x - 1; q >= 0; --q) {
+            cur[q * 2 + 1] = 255;
+            cur[q * 2 + 0] = cur[q];
+          }
+        } else {
+          STBI_ASSERT(img_n == 3);
+          for (q = x - 1; q >= 0; --q) {
+            cur[q * 4 + 3] = 255;
+            cur[q * 4 + 2] = cur[q * 3 + 2];
+            cur[q * 4 + 1] = cur[q * 3 + 1];
+            cur[q * 4 + 0] = cur[q * 3 + 0];
+          }
+        }
+      }
+    }
+  } else if (depth == 16) {
+    // force the image data from big-endian to platform-native.
+    // this is done in a separate pass due to the decoding relying
+    // on the data being untouched, but could probably be done
+    // per-line during decode if care is taken.
+    stbi_uc *cur = a->out;
+    stbi__uint16 *cur16 = (stbi__uint16 *)cur;
+
+    for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2) {
+      *cur16 = (cur[0] << 8) | cur[1];
+    }
+  }
+
+  return 1;
+}
+
+static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data,
+                                  stbi__uint32 image_data_len, int out_n,
+                                  int depth, int color, int interlaced) {
+  int bytes = (depth == 16 ? 2 : 1);
+  int out_bytes = out_n * bytes;
+  stbi_uc *final;
+  int p;
+  if (!interlaced)
+    return stbi__create_png_image_raw(a, image_data, image_data_len, out_n,
+                                      a->s->img_x, a->s->img_y, depth, color);
+
+  // de-interlacing
+  final = (stbi_uc *)stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
+  if (!final)
+    return stbi__err("outofmem", "Out of memory");
+  for (p = 0; p < 7; ++p) {
+    int xorig[] = {0, 4, 0, 2, 0, 1, 0};
+    int yorig[] = {0, 0, 4, 0, 2, 0, 1};
+    int xspc[] = {8, 8, 4, 4, 2, 2, 1};
+    int yspc[] = {8, 8, 8, 4, 4, 2, 2};
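+    // Adam7 geometry: pass p covers pixels (xorig[p] + i*xspc[p],
+    // yorig[p] + j*yspc[p]); x and y below are the pass dimensions,
+    // computed with ceiling division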
+    int i, j, x, y;
+    // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
+    x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p];
+    y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p];
+    if (x && y) {
+      stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
+      if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x,
+                                      y, depth, color)) {
+        STBI_FREE(final);
+        return 0;
+      }
+      for (j = 0; j < y; ++j) {
+        for (i = 0; i < x; ++i) {
+          int out_y = j * yspc[p] + yorig[p];
+          int out_x = i * xspc[p] + xorig[p];
+          memcpy(final + out_y * a->s->img_x * out_bytes + out_x * out_bytes,
+                 a->out + (j * x + i) * out_bytes, out_bytes);
+        }
+      }
+      STBI_FREE(a->out);
+      image_data += img_len;
+      image_data_len -= img_len;
+    }
+  }
+  a->out = final;
 
-   return 1;
+  return 1;
 }
 
-static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced)
-{
-   int bytes = (depth == 16 ? 2 : 1);
-   int out_bytes = out_n * bytes;
-   stbi_uc *final;
-   int p;
-   if (!interlaced)
-      return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
-
-   // de-interlacing
-   final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
-   for (p=0; p < 7; ++p) {
-      int xorig[] = { 0,4,0,2,0,1,0 };
-      int yorig[] = { 0,0,4,0,2,0,1 };
-      int xspc[]  = { 8,8,4,4,2,2,1 };
-      int yspc[]  = { 8,8,8,4,4,2,2 };
-      int i,j,x,y;
-      // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
-      x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p];
-      y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p];
-      if (x && y) {
-         stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
-         if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) {
-            STBI_FREE(final);
-            return 0;
-         }
-         for (j=0; j < y; ++j) {
-            for (i=0; i < x; ++i) {
-               int out_y = j*yspc[p]+yorig[p];
-               int out_x = i*xspc[p]+xorig[p];
-               memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes,
-                      a->out + (j*x+i)*out_bytes, out_bytes);
-            }
-         }
-         STBI_FREE(a->out);
-         image_data += img_len;
-         image_data_len -= img_len;
-      }
-   }
-   a->out = final;
+static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n) {
+  stbi__context *s = z->s;
+  stbi__uint32 i, pixel_count = s->img_x * s->img_y;
+  stbi_uc *p = z->out;
 
-   return 1;
-}
+  // compute color-based transparency, assuming we've
+  // already got 255 as the alpha value in the output
+  STBI_ASSERT(out_n == 2 || out_n == 4);
 
-static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n)
-{
-   stbi__context *s = z->s;
-   stbi__uint32 i, pixel_count = s->img_x * s->img_y;
-   stbi_uc *p = z->out;
-
-   // compute color-based transparency, assuming we've
-   // already got 255 as the alpha value in the output
-   STBI_ASSERT(out_n == 2 || out_n == 4);
-
-   if (out_n == 2) {
-      for (i=0; i < pixel_count; ++i) {
-         p[1] = (p[0] == tc[0] ? 0 : 255);
-         p += 2;
-      }
-   } else {
-      for (i=0; i < pixel_count; ++i) {
-         if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
-            p[3] = 0;
-         p += 4;
-      }
-   }
-   return 1;
+  if (out_n == 2) {
+    for (i = 0; i < pixel_count; ++i) {
+      p[1] = (p[0] == tc[0] ? 0 : 255);
+      p += 2;
+    }
+  } else {
+    for (i = 0; i < pixel_count; ++i) {
+      if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+        p[3] = 0;
+      p += 4;
+    }
+  }
+  return 1;
 }
 
-static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n)
-{
-   stbi__context *s = z->s;
-   stbi__uint32 i, pixel_count = s->img_x * s->img_y;
-   stbi__uint16 *p = (stbi__uint16*) z->out;
+static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3],
+                                        int out_n) {
+  stbi__context *s = z->s;
+  stbi__uint32 i, pixel_count = s->img_x * s->img_y;
+  stbi__uint16 *p = (stbi__uint16 *)z->out;
 
-   // compute color-based transparency, assuming we've
-   // already got 65535 as the alpha value in the output
-   STBI_ASSERT(out_n == 2 || out_n == 4);
+  // compute color-based transparency, assuming we've
+  // already got 65535 as the alpha value in the output
+  STBI_ASSERT(out_n == 2 || out_n == 4);
 
-   if (out_n == 2) {
-      for (i = 0; i < pixel_count; ++i) {
-         p[1] = (p[0] == tc[0] ? 0 : 65535);
-         p += 2;
-      }
-   } else {
-      for (i = 0; i < pixel_count; ++i) {
-         if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
-            p[3] = 0;
-         p += 4;
-      }
-   }
-   return 1;
+  if (out_n == 2) {
+    for (i = 0; i < pixel_count; ++i) {
+      p[1] = (p[0] == tc[0] ? 0 : 65535);
+      p += 2;
+    }
+  } else {
+    for (i = 0; i < pixel_count; ++i) {
+      if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+        p[3] = 0;
+      p += 4;
+    }
+  }
+  return 1;
 }
 
-static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n)
-{
-   stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
-   stbi_uc *p, *temp_out, *orig = a->out;
-
-   p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0);
-   if (p == NULL) return stbi__err("outofmem", "Out of memory");
-
-   // between here and free(out) below, exitting would leak
-   temp_out = p;
-
-   if (pal_img_n == 3) {
-      for (i=0; i < pixel_count; ++i) {
-         int n = orig[i]*4;
-         p[0] = palette[n  ];
-         p[1] = palette[n+1];
-         p[2] = palette[n+2];
-         p += 3;
-      }
-   } else {
-      for (i=0; i < pixel_count; ++i) {
-         int n = orig[i]*4;
-         p[0] = palette[n  ];
-         p[1] = palette[n+1];
-         p[2] = palette[n+2];
-         p[3] = palette[n+3];
-         p += 4;
-      }
-   }
-   STBI_FREE(a->out);
-   a->out = temp_out;
+static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len,
+                                    int pal_img_n) {
+  stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
+  stbi_uc *p, *temp_out, *orig = a->out;
+
+  p = (stbi_uc *)stbi__malloc_mad2(pixel_count, pal_img_n, 0);
+  if (p == NULL)
+    return stbi__err("outofmem", "Out of memory");
 
-   STBI_NOTUSED(len);
+  // between here and free(out) below, exiting would leak
+  temp_out = p;
+
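+  // palette entries were parsed into RGBA quads, hence the * 4 stride below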
+  if (pal_img_n == 3) {
+    for (i = 0; i < pixel_count; ++i) {
+      int n = orig[i] * 4;
+      p[0] = palette[n];
+      p[1] = palette[n + 1];
+      p[2] = palette[n + 2];
+      p += 3;
+    }
+  } else {
+    for (i = 0; i < pixel_count; ++i) {
+      int n = orig[i] * 4;
+      p[0] = palette[n];
+      p[1] = palette[n + 1];
+      p[2] = palette[n + 2];
+      p[3] = palette[n + 3];
+      p += 4;
+    }
+  }
+  STBI_FREE(a->out);
+  a->out = temp_out;
 
-   return 1;
+  STBI_NOTUSED(len);
+
+  return 1;
 }
 
 static int stbi__unpremultiply_on_load = 0;
 static int stbi__de_iphone_flag = 0;
 
-STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
-{
-   stbi__unpremultiply_on_load = flag_true_if_should_unpremultiply;
+STBIDEF void
+stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply) {
+  stbi__unpremultiply_on_load = flag_true_if_should_unpremultiply;
 }
 
-STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
-{
-   stbi__de_iphone_flag = flag_true_if_should_convert;
+STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert) {
+  stbi__de_iphone_flag = flag_true_if_should_convert;
 }
 
-static void stbi__de_iphone(stbi__png *z)
-{
-   stbi__context *s = z->s;
-   stbi__uint32 i, pixel_count = s->img_x * s->img_y;
-   stbi_uc *p = z->out;
-
-   if (s->img_out_n == 3) {  // convert bgr to rgb
-      for (i=0; i < pixel_count; ++i) {
-         stbi_uc t = p[0];
-         p[0] = p[2];
-         p[2] = t;
-         p += 3;
+static void stbi__de_iphone(stbi__png *z) {
+  stbi__context *s = z->s;
+  stbi__uint32 i, pixel_count = s->img_x * s->img_y;
+  stbi_uc *p = z->out;
+
+  if (s->img_out_n == 3) { // convert bgr to rgb
+    for (i = 0; i < pixel_count; ++i) {
+      stbi_uc t = p[0];
+      p[0] = p[2];
+      p[2] = t;
+      p += 3;
+    }
+  } else {
+    STBI_ASSERT(s->img_out_n == 4);
+    if (stbi__unpremultiply_on_load) {
+      // convert bgr to rgb and unpremultiply
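+      // (rounded integer division: c_out = (c_in * 255 + a/2) / a)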
+      for (i = 0; i < pixel_count; ++i) {
+        stbi_uc a = p[3];
+        stbi_uc t = p[0];
+        if (a) {
+          stbi_uc half = a / 2;
+          p[0] = (p[2] * 255 + half) / a;
+          p[1] = (p[1] * 255 + half) / a;
+          p[2] = (t * 255 + half) / a;
+        } else {
+          p[0] = p[2];
+          p[2] = t;
+        }
+        p += 4;
       }
-   } else {
-      STBI_ASSERT(s->img_out_n == 4);
-      if (stbi__unpremultiply_on_load) {
-         // convert bgr to rgb and unpremultiply
-         for (i=0; i < pixel_count; ++i) {
-            stbi_uc a = p[3];
-            stbi_uc t = p[0];
-            if (a) {
-               stbi_uc half = a / 2;
-               p[0] = (p[2] * 255 + half) / a;
-               p[1] = (p[1] * 255 + half) / a;
-               p[2] = ( t   * 255 + half) / a;
-            } else {
-               p[0] = p[2];
-               p[2] = t;
-            }
-            p += 4;
-         }
+    } else {
+      // convert bgr to rgb
+      for (i = 0; i < pixel_count; ++i) {
+        stbi_uc t = p[0];
+        p[0] = p[2];
+        p[2] = t;
+        p += 4;
+      }
+    }
+  }
+}
+
+#define STBI__PNG_TYPE(a, b, c, d)                                             \
+  (((unsigned)(a) << 24) + ((unsigned)(b) << 16) + ((unsigned)(c) << 8) +      \
+   (unsigned)(d))
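+// packs a four-character chunk name into a big-endian 32-bit tag,
+// e.g. STBI__PNG_TYPE('I','H','D','R') == 0x49484452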
+
+static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) {
+  stbi_uc palette[1024], pal_img_n = 0;
+  stbi_uc has_trans = 0, tc[3] = {0};
+  stbi__uint16 tc16[3];
+  stbi__uint32 ioff = 0, idata_limit = 0, i, pal_len = 0;
+  int first = 1, k, interlace = 0, color = 0, is_iphone = 0;
+  stbi__context *s = z->s;
+
+  z->expanded = NULL;
+  z->idata = NULL;
+  z->out = NULL;
+
+  if (!stbi__check_png_header(s))
+    return 0;
+
+  if (scan == STBI__SCAN_type)
+    return 1;
+
+  for (;;) {
+    stbi__pngchunk c = stbi__get_chunk_header(s);
+    switch (c.type) {
+    case STBI__PNG_TYPE('C', 'g', 'B', 'I'):
+      is_iphone = 1;
+      stbi__skip(s, c.length);
+      break;
+    case STBI__PNG_TYPE('I', 'H', 'D', 'R'): {
+      int comp, filter;
+      if (!first)
+        return stbi__err("multiple IHDR", "Corrupt PNG");
+      first = 0;
+      if (c.length != 13)
+        return stbi__err("bad IHDR len", "Corrupt PNG");
+      s->img_x = stbi__get32be(s);
+      if (s->img_x > (1 << 24))
+        return stbi__err("too large", "Very large image (corrupt?)");
+      s->img_y = stbi__get32be(s);
+      if (s->img_y > (1 << 24))
+        return stbi__err("too large", "Very large image (corrupt?)");
+      z->depth = stbi__get8(s);
+      if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 &&
+          z->depth != 16)
+        return stbi__err("1/2/4/8/16-bit only",
+                         "PNG not supported: 1/2/4/8/16-bit only");
+      color = stbi__get8(s);
+      if (color > 6)
+        return stbi__err("bad ctype", "Corrupt PNG");
+      if (color == 3 && z->depth == 16)
+        return stbi__err("bad ctype", "Corrupt PNG");
+      if (color == 3)
+        pal_img_n = 3;
+      else if (color & 1)
+        return stbi__err("bad ctype", "Corrupt PNG");
+      comp = stbi__get8(s);
+      if (comp)
+        return stbi__err("bad comp method", "Corrupt PNG");
+      filter = stbi__get8(s);
+      if (filter)
+        return stbi__err("bad filter method", "Corrupt PNG");
+      interlace = stbi__get8(s);
+      if (interlace > 1)
+        return stbi__err("bad interlace method", "Corrupt PNG");
+      if (!s->img_x || !s->img_y)
+        return stbi__err("0-pixel image", "Corrupt PNG");
+      if (!pal_img_n) {
+        s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
+        if ((1 << 30) / s->img_x / s->img_n < s->img_y)
+          return stbi__err("too large", "Image too large to decode");
+        if (scan == STBI__SCAN_header)
+          return 1;
       } else {
-         // convert bgr to rgb
-         for (i=0; i < pixel_count; ++i) {
-            stbi_uc t = p[0];
-            p[0] = p[2];
-            p[2] = t;
-            p += 4;
-         }
+        // if paletted, then pal_n is our final components, and
+        // img_n is # components to decompress/filter.
+        s->img_n = 1;
+        if ((1 << 30) / s->img_x / 4 < s->img_y)
+          return stbi__err("too large", "Corrupt PNG");
+        // if SCAN_header, have to scan to see if we have a tRNS
       }
-   }
-}
-
-#define STBI__PNG_TYPE(a,b,c,d)  (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d))
+      break;
+    }
 
-static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
-{
-   stbi_uc palette[1024], pal_img_n=0;
-   stbi_uc has_trans=0, tc[3]={0};
-   stbi__uint16 tc16[3];
-   stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0;
-   int first=1,k,interlace=0, color=0, is_iphone=0;
-   stbi__context *s = z->s;
-
-   z->expanded = NULL;
-   z->idata = NULL;
-   z->out = NULL;
-
-   if (!stbi__check_png_header(s)) return 0;
-
-   if (scan == STBI__SCAN_type) return 1;
-
-   for (;;) {
-      stbi__pngchunk c = stbi__get_chunk_header(s);
-      switch (c.type) {
-         case STBI__PNG_TYPE('C','g','B','I'):
-            is_iphone = 1;
-            stbi__skip(s, c.length);
-            break;
-         case STBI__PNG_TYPE('I','H','D','R'): {
-            int comp,filter;
-            if (!first) return stbi__err("multiple IHDR","Corrupt PNG");
-            first = 0;
-            if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG");
-            s->img_x = stbi__get32be(s); if (s->img_x > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
-            s->img_y = stbi__get32be(s); if (s->img_y > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
-            z->depth = stbi__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only");
-            color = stbi__get8(s);  if (color > 6)         return stbi__err("bad ctype","Corrupt PNG");
-            if (color == 3 && z->depth == 16)                  return stbi__err("bad ctype","Corrupt PNG");
-            if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG");
-            comp  = stbi__get8(s);  if (comp) return stbi__err("bad comp method","Corrupt PNG");
-            filter= stbi__get8(s);  if (filter) return stbi__err("bad filter method","Corrupt PNG");
-            interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG");
-            if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG");
-            if (!pal_img_n) {
-               s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
-               if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
-               if (scan == STBI__SCAN_header) return 1;
-            } else {
-               // if paletted, then pal_n is our final components, and
-               // img_n is # components to decompress/filter.
-               s->img_n = 1;
-               if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG");
-               // if SCAN_header, have to scan to see if we have a tRNS
-            }
-            break;
-         }
-
-         case STBI__PNG_TYPE('P','L','T','E'):  {
-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
-            if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG");
-            pal_len = c.length / 3;
-            if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG");
-            for (i=0; i < pal_len; ++i) {
-               palette[i*4+0] = stbi__get8(s);
-               palette[i*4+1] = stbi__get8(s);
-               palette[i*4+2] = stbi__get8(s);
-               palette[i*4+3] = 255;
-            }
-            break;
-         }
-
-         case STBI__PNG_TYPE('t','R','N','S'): {
-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
-            if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG");
-            if (pal_img_n) {
-               if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; }
-               if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG");
-               if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG");
-               pal_img_n = 4;
-               for (i=0; i < c.length; ++i)
-                  palette[i*4+3] = stbi__get8(s);
-            } else {
-               if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
-               if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
-               has_trans = 1;
-               if (z->depth == 16) {
-                  for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
-               } else {
-                  for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
-               }
-            }
-            break;
-         }
-
-         case STBI__PNG_TYPE('I','D','A','T'): {
-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
-            if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
-            if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; }
-            if ((int)(ioff + c.length) < (int)ioff) return 0;
-            if (ioff + c.length > idata_limit) {
-               stbi__uint32 idata_limit_old __attribute__((unused)) = idata_limit;
-               stbi_uc *p;
-               if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
-               while (ioff + c.length > idata_limit)
-                  idata_limit *= 2;
-               STBI_NOTUSED(idata_limit_old);
-               p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
-               z->idata = p;
-            }
-            if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG");
-            ioff += c.length;
-            break;
-         }
-
-         case STBI__PNG_TYPE('I','E','N','D'): {
-            stbi__uint32 raw_len, bpl;
-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
-            if (scan != STBI__SCAN_load) return 1;
-            if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG");
-            // initial guess for decoded data size to avoid unnecessary reallocs
-            bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
-            raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
-            z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone);
-            if (z->expanded == NULL) return 0; // zlib should set error
-            STBI_FREE(z->idata); z->idata = NULL;
-            if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans)
-               s->img_out_n = s->img_n+1;
-            else
-               s->img_out_n = s->img_n;
-            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0;
-            if (has_trans) {
-               if (z->depth == 16) {
-                  if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0;
-               } else {
-                  if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
-               }
-            }
-            if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
-               stbi__de_iphone(z);
-            if (pal_img_n) {
-               // pal_img_n == 3 or 4
-               s->img_n = pal_img_n; // record the actual colors we had
-               s->img_out_n = pal_img_n;
-               if (req_comp >= 3) s->img_out_n = req_comp;
-               if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
-                  return 0;
-            } else if (has_trans) {
-               // non-paletted image with tRNS -> source image has (constant) alpha
-               ++s->img_n;
-            }
-            STBI_FREE(z->expanded); z->expanded = NULL;
-            return 1;
-         }
-
-         default:
-            // if critical, fail
-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
-            if ((c.type & (1 << 29)) == 0) {
-               #ifndef STBI_NO_FAILURE_STRINGS
-               // not threadsafe
-               static char invalid_chunk[] = "XXXX PNG chunk not known";
-               invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
-               invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
-               invalid_chunk[2] = STBI__BYTECAST(c.type >>  8);
-               invalid_chunk[3] = STBI__BYTECAST(c.type >>  0);
-               #endif
-               return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
-            }
-            stbi__skip(s, c.length);
-            break;
+    case STBI__PNG_TYPE('P', 'L', 'T', 'E'): {
+      if (first)
+        return stbi__err("first not IHDR", "Corrupt PNG");
+      if (c.length > 256 * 3)
+        return stbi__err("invalid PLTE", "Corrupt PNG");
+      pal_len = c.length / 3;
+      if (pal_len * 3 != c.length)
+        return stbi__err("invalid PLTE", "Corrupt PNG");
+      for (i = 0; i < pal_len; ++i) {
+        palette[i * 4 + 0] = stbi__get8(s);
+        palette[i * 4 + 1] = stbi__get8(s);
+        palette[i * 4 + 2] = stbi__get8(s);
+        palette[i * 4 + 3] = 255;
       }
-      // end of PNG chunk, read and skip CRC
-      stbi__get32be(s);
-   }
-}
+      break;
+    }
 
-static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri)
-{
-   void *result=NULL;
-   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
-   if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
-      if (p->depth < 8)
-         ri->bits_per_channel = 8;
-      else
-         ri->bits_per_channel = p->depth;
-      result = p->out;
-      p->out = NULL;
-      if (req_comp && req_comp != p->s->img_out_n) {
-         if (ri->bits_per_channel == 8)
-            result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
-         else
-            result = stbi__convert_format16((stbi__uint16 *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
-         p->s->img_out_n = req_comp;
-         if (result == NULL) return result;
+    case STBI__PNG_TYPE('t', 'R', 'N', 'S'): {
+      if (first)
+        return stbi__err("first not IHDR", "Corrupt PNG");
+      if (z->idata)
+        return stbi__err("tRNS after IDAT", "Corrupt PNG");
+      if (pal_img_n) {
+        if (scan == STBI__SCAN_header) {
+          s->img_n = 4;
+          return 1;
+        }
+        if (pal_len == 0)
+          return stbi__err("tRNS before PLTE", "Corrupt PNG");
+        if (c.length > pal_len)
+          return stbi__err("bad tRNS len", "Corrupt PNG");
+        pal_img_n = 4;
+        for (i = 0; i < c.length; ++i)
+          palette[i * 4 + 3] = stbi__get8(s);
+      } else {
+        if (!(s->img_n & 1))
+          return stbi__err("tRNS with alpha", "Corrupt PNG");
+        if (c.length != (stbi__uint32)s->img_n * 2)
+          return stbi__err("bad tRNS len", "Corrupt PNG");
+        has_trans = 1;
+        if (z->depth == 16) {
+          for (k = 0; k < s->img_n; ++k)
+            tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
+        } else {
+          for (k = 0; k < s->img_n; ++k)
+            tc[k] = (stbi_uc)(stbi__get16be(s) & 255) *
+                    stbi__depth_scale_table[z->depth]; // non 8-bit images will
+                                                       // be larger
+        }
       }
-      *x = p->s->img_x;
-      *y = p->s->img_y;
-      if (n) *n = p->s->img_n;
-   }
-   STBI_FREE(p->out);      p->out      = NULL;
-   STBI_FREE(p->expanded); p->expanded = NULL;
-   STBI_FREE(p->idata);    p->idata    = NULL;
-
-   return result;
-}
-
-static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   stbi__png p;
-   p.s = s;
-   return stbi__do_png(&p, x,y,comp,req_comp, ri);
-}
-
-static int stbi__png_test(stbi__context *s)
-{
-   int r;
-   r = stbi__check_png_header(s);
-   stbi__rewind(s);
-   return r;
-}
+      break;
+    }
 
-static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp)
-{
-   if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) {
-      stbi__rewind( p->s );
-      return 0;
-   }
-   if (x) *x = p->s->img_x;
-   if (y) *y = p->s->img_y;
-   if (comp) *comp = p->s->img_n;
-   return 1;
-}
+    case STBI__PNG_TYPE('I', 'D', 'A', 'T'): {
+      if (first)
+        return stbi__err("first not IHDR", "Corrupt PNG");
+      if (pal_img_n && !pal_len)
+        return stbi__err("no PLTE", "Corrupt PNG");
+      if (scan == STBI__SCAN_header) {
+        s->img_n = pal_img_n;
+        return 1;
+      }
+      if ((int)(ioff + c.length) < (int)ioff)
+        return 0;
+      if (ioff + c.length > idata_limit) {
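+        // grow the IDAT accumulation buffer geometrically: start at
+        // max(c.length, 4096) and double until the new chunk fits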
+        stbi__uint32 idata_limit_old __attribute__((unused)) = idata_limit;
+        stbi_uc *p;
+        if (idata_limit == 0)
+          idata_limit = c.length > 4096 ? c.length : 4096;
+        while (ioff + c.length > idata_limit)
+          idata_limit *= 2;
+        STBI_NOTUSED(idata_limit_old);
+        p = (stbi_uc *)STBI_REALLOC_SIZED(z->idata, idata_limit_old,
+                                          idata_limit);
+        if (p == NULL)
+          return stbi__err("outofmem", "Out of memory");
+        z->idata = p;
+      }
+      if (!stbi__getn(s, z->idata + ioff, c.length))
+        return stbi__err("outofdata", "Corrupt PNG");
+      ioff += c.length;
+      break;
+    }
 
-static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   stbi__png p;
-   p.s = s;
-   return stbi__png_info_raw(&p, x, y, comp);
-}
+    case STBI__PNG_TYPE('I', 'E', 'N', 'D'): {
+      stbi__uint32 raw_len, bpl;
+      if (first)
+        return stbi__err("first not IHDR", "Corrupt PNG");
+      if (scan != STBI__SCAN_load)
+        return 1;
+      if (z->idata == NULL)
+        return stbi__err("no IDAT", "Corrupt PNG");
+      // initial guess for decoded data size to avoid unnecessary reallocs
+      bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
+      raw_len = bpl * s->img_y * s->img_n /* pixels */ +
+                s->img_y /* filter mode per row */;
+      z->expanded = (stbi_uc *)stbi_zlib_decode_malloc_guesssize_headerflag(
+          (char *)z->idata, ioff, raw_len, (int *)&raw_len, !is_iphone);
+      if (z->expanded == NULL)
+        return 0; // zlib should set error
+      STBI_FREE(z->idata);
+      z->idata = NULL;
+      if ((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) ||
+          has_trans)
+        s->img_out_n = s->img_n + 1;
+      else
+        s->img_out_n = s->img_n;
+      if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n,
+                                  z->depth, color, interlace))
+        return 0;
+      if (has_trans) {
+        if (z->depth == 16) {
+          if (!stbi__compute_transparency16(z, tc16, s->img_out_n))
+            return 0;
+        } else {
+          if (!stbi__compute_transparency(z, tc, s->img_out_n))
+            return 0;
+        }
+      }
+      if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
+        stbi__de_iphone(z);
+      if (pal_img_n) {
+        // pal_img_n == 3 or 4
+        s->img_n = pal_img_n; // record the actual colors we had
+        s->img_out_n = pal_img_n;
+        if (req_comp >= 3)
+          s->img_out_n = req_comp;
+        if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
+          return 0;
+      } else if (has_trans) {
+        // non-paletted image with tRNS -> source image has (constant) alpha
+        ++s->img_n;
+      }
+      STBI_FREE(z->expanded);
+      z->expanded = NULL;
+      return 1;
+    }
 
-static int stbi__png_is16(stbi__context *s)
-{
-   stbi__png p;
-   p.s = s;
-   if (!stbi__png_info_raw(&p, NULL, NULL, NULL))
-	   return 0;
-   if (p.depth != 16) {
-      stbi__rewind(p.s);
-      return 0;
-   }
-   return 1;
+    default:
+      // if critical, fail
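+      // (bit 29 of the packed tag is bit 5 of the first type byte -- the PNG
+      // "ancillary" bit; a lowercase first letter marks a chunk safe to skip)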
+      if (first)
+        return stbi__err("first not IHDR", "Corrupt PNG");
+      if ((c.type & (1 << 29)) == 0) {
+#ifndef STBI_NO_FAILURE_STRINGS
+        // not threadsafe
+        static char invalid_chunk[] = "XXXX PNG chunk not known";
+        invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
+        invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
+        invalid_chunk[2] = STBI__BYTECAST(c.type >> 8);
+        invalid_chunk[3] = STBI__BYTECAST(c.type >> 0);
+#endif
+        return stbi__err(invalid_chunk,
+                         "PNG not supported: unknown PNG chunk type");
+      }
+      stbi__skip(s, c.length);
+      break;
+    }
+    // end of PNG chunk, read and skip CRC
+    stbi__get32be(s);
+  }
+}
+
+static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp,
+                          stbi__result_info *ri) {
+  void *result = NULL;
+  if (req_comp < 0 || req_comp > 4)
+    return stbi__errpuc("bad req_comp", "Internal error");
+  if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
+    if (p->depth < 8)
+      ri->bits_per_channel = 8;
+    else
+      ri->bits_per_channel = p->depth;
+    result = p->out;
+    p->out = NULL;
+    if (req_comp && req_comp != p->s->img_out_n) {
+      if (ri->bits_per_channel == 8)
+        result = stbi__convert_format((unsigned char *)result, p->s->img_out_n,
+                                      req_comp, p->s->img_x, p->s->img_y);
+      else
+        result = stbi__convert_format16((stbi__uint16 *)result, p->s->img_out_n,
+                                        req_comp, p->s->img_x, p->s->img_y);
+      p->s->img_out_n = req_comp;
+      if (result == NULL)
+        return result;
+    }
+    *x = p->s->img_x;
+    *y = p->s->img_y;
+    if (n)
+      *n = p->s->img_n;
+  }
+  STBI_FREE(p->out);
+  p->out = NULL;
+  STBI_FREE(p->expanded);
+  p->expanded = NULL;
+  STBI_FREE(p->idata);
+  p->idata = NULL;
+
+  return result;
+}
+
+static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp,
+                            int req_comp, stbi__result_info *ri) {
+  stbi__png p;
+  p.s = s;
+  return stbi__do_png(&p, x, y, comp, req_comp, ri);
+}
+
+static int stbi__png_test(stbi__context *s) {
+  int r;
+  r = stbi__check_png_header(s);
+  stbi__rewind(s);
+  return r;
+}
+
+static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp) {
+  if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) {
+    stbi__rewind(p->s);
+    return 0;
+  }
+  if (x)
+    *x = p->s->img_x;
+  if (y)
+    *y = p->s->img_y;
+  if (comp)
+    *comp = p->s->img_n;
+  return 1;
+}
+
+static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp) {
+  stbi__png p;
+  p.s = s;
+  return stbi__png_info_raw(&p, x, y, comp);
+}
+
+static int stbi__png_is16(stbi__context *s) {
+  stbi__png p;
+  p.s = s;
+  if (!stbi__png_info_raw(&p, NULL, NULL, NULL))
+    return 0;
+  if (p.depth != 16) {
+    stbi__rewind(p.s);
+    return 0;
+  }
+  return 1;
 }
 #endif
 
 // Microsoft/Windows BMP image
 
 #ifndef STBI_NO_BMP
-static int stbi__bmp_test_raw(stbi__context *s)
-{
-   int r;
-   int sz;
-   if (stbi__get8(s) != 'B') return 0;
-   if (stbi__get8(s) != 'M') return 0;
-   stbi__get32le(s); // discard filesize
-   stbi__get16le(s); // discard reserved
-   stbi__get16le(s); // discard reserved
-   stbi__get32le(s); // discard data offset
-   sz = stbi__get32le(s);
-   r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124);
-   return r;
-}
-
-static int stbi__bmp_test(stbi__context *s)
-{
-   int r = stbi__bmp_test_raw(s);
-   stbi__rewind(s);
-   return r;
+static int stbi__bmp_test_raw(stbi__context *s) {
+  int r;
+  int sz;
+  if (stbi__get8(s) != 'B')
+    return 0;
+  if (stbi__get8(s) != 'M')
+    return 0;
+  stbi__get32le(s); // discard filesize
+  stbi__get16le(s); // discard reserved
+  stbi__get16le(s); // discard reserved
+  stbi__get32le(s); // discard data offset
+  sz = stbi__get32le(s);
+  r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124);
+  return r;
+}
+
+static int stbi__bmp_test(stbi__context *s) {
+  int r = stbi__bmp_test_raw(s);
+  stbi__rewind(s);
+  return r;
 }
 
-
 // returns 0..31 for the highest set bit
-static int stbi__high_bit(unsigned int z)
-{
-   int n=0;
-   if (z == 0) return -1;
-   if (z >= 0x10000) { n += 16; z >>= 16; }
-   if (z >= 0x00100) { n +=  8; z >>=  8; }
-   if (z >= 0x00010) { n +=  4; z >>=  4; }
-   if (z >= 0x00004) { n +=  2; z >>=  2; }
-   if (z >= 0x00002) { n +=  1;/* >>=  1;*/ }
-   return n;
+static int stbi__high_bit(unsigned int z) {
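+  // binary search for the top set bit: test the upper half of the remaining
+  // bit range, accumulate the shift, and narrow down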
+  int n = 0;
+  if (z == 0)
+    return -1;
+  if (z >= 0x10000) {
+    n += 16;
+    z >>= 16;
+  }
+  if (z >= 0x00100) {
+    n += 8;
+    z >>= 8;
+  }
+  if (z >= 0x00010) {
+    n += 4;
+    z >>= 4;
+  }
+  if (z >= 0x00004) {
+    n += 2;
+    z >>= 2;
+  }
+  if (z >= 0x00002) {
+    n += 1; /* >>=  1;*/
+  }
+  return n;
 }
 
-static int stbi__bitcount(unsigned int a)
-{
-   a = (a & 0x55555555) + ((a >>  1) & 0x55555555); // max 2
-   a = (a & 0x33333333) + ((a >>  2) & 0x33333333); // max 4
-   a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits
-   a = (a + (a >> 8)); // max 16 per 8 bits
-   a = (a + (a >> 16)); // max 32 per 8 bits
-   return a & 0xff;
+static int stbi__bitcount(unsigned int a) {
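+  // classic parallel popcount: sum adjacent 1-bit, 2-bit, then 4-bit fields,
+  // then fold the byte sums together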
+  a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2
+  a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4
+  a = (a + (a >> 4)) & 0x0f0f0f0f;                // max 8 per 4, now 8 bits
+  a = (a + (a >> 8));                             // max 16 per 8 bits
+  a = (a + (a >> 16));                            // max 32 per 8 bits
+  return a & 0xff;
 }
 
 // extract an arbitrarily-aligned N-bit value (N=bits)
 // from v, and then make it 8-bits long and fractionally
 // extend it to full full range.
-static int stbi__shiftsigned(unsigned int v, int shift, int bits)
-{
-   static unsigned int mul_table[9] = {
+static int stbi__shiftsigned(unsigned int v, int shift, int bits) {
+  static unsigned int mul_table[9] = {
       0,
-      0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/,
-      0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/,
-   };
-   static unsigned int shift_table[9] = {
-      0, 0,0,1,0,2,4,6,0,
-   };
-   if (shift < 0)
-      v <<= -shift;
-   else
-      v >>= shift;
-   STBI_ASSERT(v < 256);
-   v >>= (8-bits);
-   STBI_ASSERT(bits >= 0 && bits <= 8);
-   return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits];
-}
-
-typedef struct
-{
-   int bpp, offset, hsz;
-   unsigned int mr,mg,mb,ma, all_a;
+      0xff /*0b11111111*/,
+      0x55 /*0b01010101*/,
+      0x49 /*0b01001001*/,
+      0x11 /*0b00010001*/,
+      0x21 /*0b00100001*/,
+      0x41 /*0b01000001*/,
+      0x81 /*0b10000001*/,
+      0x01 /*0b00000001*/,
+  };
+  static unsigned int shift_table[9] = {
+      0, 0, 0, 1, 0, 2, 4, 6, 0,
+  };
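+  // mul_table[n] replicates an n-bit value across 8 bits; e.g. for n == 5,
+  // (v * 0x21) >> 2 == (v << 3) | (v >> 2), the usual bit-replication expand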
+  if (shift < 0)
+    v <<= -shift;
+  else
+    v >>= shift;
+  STBI_ASSERT(v < 256);
+  v >>= (8 - bits);
+  STBI_ASSERT(bits >= 0 && bits <= 8);
+  return (int)((unsigned)v * mul_table[bits]) >> shift_table[bits];
+}
+
+typedef struct {
+  int bpp, offset, hsz;
+  unsigned int mr, mg, mb, ma, all_a;
 } stbi__bmp_data;
 
-static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
-{
-   int hsz;
-   if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP");
-   stbi__get32le(s); // discard filesize
-   stbi__get16le(s); // discard reserved
-   stbi__get16le(s); // discard reserved
-   info->offset = stbi__get32le(s);
-   info->hsz = hsz = stbi__get32le(s);
-   info->mr = info->mg = info->mb = info->ma = 0;
-
-   if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
-   if (hsz == 12) {
-      s->img_x = stbi__get16le(s);
-      s->img_y = stbi__get16le(s);
-   } else {
-      s->img_x = stbi__get32le(s);
-      s->img_y = stbi__get32le(s);
-   }
-   if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP");
-   info->bpp = stbi__get16le(s);
-   if (hsz != 12) {
-      int compress = stbi__get32le(s);
-      if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
-      stbi__get32le(s); // discard sizeof
-      stbi__get32le(s); // discard hres
-      stbi__get32le(s); // discard vres
-      stbi__get32le(s); // discard colorsused
-      stbi__get32le(s); // discard max important
-      if (hsz == 40 || hsz == 56) {
-         if (hsz == 56) {
-            stbi__get32le(s);
-            stbi__get32le(s);
-            stbi__get32le(s);
-            stbi__get32le(s);
-         }
-         if (info->bpp == 16 || info->bpp == 32) {
-            if (compress == 0) {
-               if (info->bpp == 32) {
-                  info->mr = 0xffu << 16;
-                  info->mg = 0xffu <<  8;
-                  info->mb = 0xffu <<  0;
-                  info->ma = 0xffu << 24;
-                  info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
-               } else {
-                  info->mr = 31u << 10;
-                  info->mg = 31u <<  5;
-                  info->mb = 31u <<  0;
-               }
-            } else if (compress == 3) {
-               info->mr = stbi__get32le(s);
-               info->mg = stbi__get32le(s);
-               info->mb = stbi__get32le(s);
-               // not documented, but generated by photoshop and handled by mspaint
-               if (info->mr == info->mg && info->mg == info->mb) {
-                  // ?!?!?
-                  return stbi__errpuc("bad BMP", "bad BMP");
-               }
-            } else
-               return stbi__errpuc("bad BMP", "bad BMP");
-         }
-      } else {
-         int i;
-         if (hsz != 108 && hsz != 124)
+static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) {
+  int hsz;
+  if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M')
+    return stbi__errpuc("not BMP", "Corrupt BMP");
+  stbi__get32le(s); // discard filesize
+  stbi__get16le(s); // discard reserved
+  stbi__get16le(s); // discard reserved
+  info->offset = stbi__get32le(s);
+  info->hsz = hsz = stbi__get32le(s);
+  info->mr = info->mg = info->mb = info->ma = 0;
+
+  if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124)
+    return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
+  if (hsz == 12) {
+    s->img_x = stbi__get16le(s);
+    s->img_y = stbi__get16le(s);
+  } else {
+    s->img_x = stbi__get32le(s);
+    s->img_y = stbi__get32le(s);
+  }
+  if (stbi__get16le(s) != 1)
+    return stbi__errpuc("bad BMP", "bad BMP");
+  info->bpp = stbi__get16le(s);
+  if (hsz != 12) {
+    int compress = stbi__get32le(s);
+    if (compress == 1 || compress == 2)
+      return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
+    stbi__get32le(s); // discard sizeof
+    stbi__get32le(s); // discard hres
+    stbi__get32le(s); // discard vres
+    stbi__get32le(s); // discard colorsused
+    stbi__get32le(s); // discard max important
+    if (hsz == 40 || hsz == 56) {
+      if (hsz == 56) {
+        stbi__get32le(s);
+        stbi__get32le(s);
+        stbi__get32le(s);
+        stbi__get32le(s);
+      }
+      if (info->bpp == 16 || info->bpp == 32) {
+        if (compress == 0) {
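+          // BI_RGB with no explicit masks: assume ARGB8888 for 32bpp and
+          // RGB555 for 16bpp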
+          if (info->bpp == 32) {
+            info->mr = 0xffu << 16;
+            info->mg = 0xffu << 8;
+            info->mb = 0xffu << 0;
+            info->ma = 0xffu << 24;
+            info->all_a = 0; // if all_a is 0 at end, then we loaded alpha
+                             // channel but it was all 0
+          } else {
+            info->mr = 31u << 10;
+            info->mg = 31u << 5;
+            info->mb = 31u << 0;
+          }
+        } else if (compress == 3) {
+          info->mr = stbi__get32le(s);
+          info->mg = stbi__get32le(s);
+          info->mb = stbi__get32le(s);
+          // not documented, but generated by photoshop and handled by mspaint
+          if (info->mr == info->mg && info->mg == info->mb) {
+            // ?!?!?
             return stbi__errpuc("bad BMP", "bad BMP");
-         info->mr = stbi__get32le(s);
-         info->mg = stbi__get32le(s);
-         info->mb = stbi__get32le(s);
-         info->ma = stbi__get32le(s);
-         stbi__get32le(s); // discard color space
-         for (i=0; i < 12; ++i)
-            stbi__get32le(s); // discard color space parameters
-         if (hsz == 124) {
-            stbi__get32le(s); // discard rendering intent
-            stbi__get32le(s); // discard offset of profile data
-            stbi__get32le(s); // discard size of profile data
-            stbi__get32le(s); // discard reserved
-         }
+          }
+        } else
+          return stbi__errpuc("bad BMP", "bad BMP");
       }
-   }
-   return (void *) 1;
-}
-
-
-static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   stbi_uc *out;
-   unsigned int mr=0,mg=0,mb=0,ma=0, all_a;
-   stbi_uc pal[256][4];
-   int psize=0,i,j,width;
-   int flip_vertically, pad, target;
-   stbi__bmp_data info;
-   STBI_NOTUSED(ri);
-
-   info.all_a = 255;
-   if (stbi__bmp_parse_header(s, &info) == NULL)
-      return NULL; // error code already set
-
-   flip_vertically = ((int) s->img_y) > 0;
-   s->img_y = abs((int) s->img_y);
-
-   mr = info.mr;
-   mg = info.mg;
-   mb = info.mb;
-   ma = info.ma;
-   all_a = info.all_a;
-
-   if (info.hsz == 12) {
-      if (info.bpp < 24)
-         psize = (info.offset - 14 - 24) / 3;
-   } else {
-      if (info.bpp < 16)
-         psize = (info.offset - 14 - info.hsz) >> 2;
-   }
-
-   if (info.bpp == 24 && ma == 0xff000000)
-      s->img_n = 3;
-   else
-      s->img_n = ma ? 4 : 3;
-   if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
-      target = req_comp;
-   else
-      target = s->img_n; // if they want monochrome, we'll post-convert
-
-   // sanity-check size
-   if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
-      return stbi__errpuc("too large", "Corrupt BMP");
-
-   out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
-   if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   if (info.bpp < 16) {
-      int z=0;
-      if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); }
-      for (i=0; i < psize; ++i) {
-         pal[i][2] = stbi__get8(s);
-         pal[i][1] = stbi__get8(s);
-         pal[i][0] = stbi__get8(s);
-         if (info.hsz != 12) stbi__get8(s);
-         pal[i][3] = 255;
+    } else {
+      int i;
+      if (hsz != 108 && hsz != 124)
+        return stbi__errpuc("bad BMP", "bad BMP");
+      info->mr = stbi__get32le(s);
+      info->mg = stbi__get32le(s);
+      info->mb = stbi__get32le(s);
+      info->ma = stbi__get32le(s);
+      stbi__get32le(s); // discard color space
+      for (i = 0; i < 12; ++i)
+        stbi__get32le(s); // discard color space parameters
+      if (hsz == 124) {
+        stbi__get32le(s); // discard rendering intent
+        stbi__get32le(s); // discard offset of profile data
+        stbi__get32le(s); // discard size of profile data
+        stbi__get32le(s); // discard reserved
       }
-      stbi__skip(s, info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
-      if (info.bpp == 1) width = (s->img_x + 7) >> 3;
-      else if (info.bpp == 4) width = (s->img_x + 1) >> 1;
-      else if (info.bpp == 8) width = s->img_x;
-      else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); }
-      pad = (-width)&3;
-      if (info.bpp == 1) {
-         for (j=0; j < (int) s->img_y; ++j) {
-            int bit_offset = 7, v = stbi__get8(s);
-            for (i=0; i < (int) s->img_x; ++i) {
-               int color = (v>>bit_offset)&0x1;
-               out[z++] = pal[color][0];
-               out[z++] = pal[color][1];
-               out[z++] = pal[color][2];
-               if (target == 4) out[z++] = 255;
-               if (i+1 == (int) s->img_x) break;
-               if((--bit_offset) < 0) {
-                  bit_offset = 7;
-                  v = stbi__get8(s);
-               }
-            }
-            stbi__skip(s, pad);
-         }
-      } else {
-         for (j=0; j < (int) s->img_y; ++j) {
-            for (i=0; i < (int) s->img_x; i += 2) {
-               int v=stbi__get8(s),v2=0;
-               if (info.bpp == 4) {
-                  v2 = v & 15;
-                  v >>= 4;
-               }
-               out[z++] = pal[v][0];
-               out[z++] = pal[v][1];
-               out[z++] = pal[v][2];
-               if (target == 4) out[z++] = 255;
-               if (i+1 == (int) s->img_x) break;
-               v = (info.bpp == 8) ? stbi__get8(s) : v2;
-               out[z++] = pal[v][0];
-               out[z++] = pal[v][1];
-               out[z++] = pal[v][2];
-               if (target == 4) out[z++] = 255;
-            }
-            stbi__skip(s, pad);
-         }
+    }
+  }
+  return (void *)1;
+}
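+
+// For reference, the byte layout the parser above walks (an informal sketch;
+// the names are descriptive, not actual struct fields):
+//   offset  0: 'B','M'                -- signature
+//   offset  2: u32 file size          -- discarded
+//   offset  6: u16 x2 reserved        -- discarded
+//   offset 10: u32 pixel data offset  -> info->offset
+//   offset 14: u32 DIB header size    -> info->hsz (12/40/56/108/124)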
+
+static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp,
+                            int req_comp, stbi__result_info *ri) {
+  stbi_uc *out;
+  unsigned int mr = 0, mg = 0, mb = 0, ma = 0, all_a;
+  stbi_uc pal[256][4];
+  int psize = 0, i, j, width;
+  int flip_vertically, pad, target;
+  stbi__bmp_data info;
+  STBI_NOTUSED(ri);
+
+  info.all_a = 255;
+  if (stbi__bmp_parse_header(s, &info) == NULL)
+    return NULL; // error code already set
+
+  flip_vertically = ((int)s->img_y) > 0;
+  s->img_y = abs((int)s->img_y);
+
+  mr = info.mr;
+  mg = info.mg;
+  mb = info.mb;
+  ma = info.ma;
+  all_a = info.all_a;
+
+  if (info.hsz == 12) {
+    if (info.bpp < 24)
+      psize = (info.offset - 14 - 24) / 3;
+  } else {
+    if (info.bpp < 16)
+      psize = (info.offset - 14 - info.hsz) >> 2;
+  }
+
+  if (info.bpp == 24 && ma == 0xff000000)
+    s->img_n = 3;
+  else
+    s->img_n = ma ? 4 : 3;
+  if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
+    target = req_comp;
+  else
+    target = s->img_n; // if they want monochrome, we'll post-convert
+
+  // sanity-check size
+  if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
+    return stbi__errpuc("too large", "Corrupt BMP");
+
+  out = (stbi_uc *)stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
+  if (!out)
+    return stbi__errpuc("outofmem", "Out of memory");
+  if (info.bpp < 16) {
+    int z = 0;
+    if (psize == 0 || psize > 256) {
+      STBI_FREE(out);
+      return stbi__errpuc("invalid", "Corrupt BMP");
+    }
+    for (i = 0; i < psize; ++i) {
+      pal[i][2] = stbi__get8(s);
+      pal[i][1] = stbi__get8(s);
+      pal[i][0] = stbi__get8(s);
+      if (info.hsz != 12)
+        stbi__get8(s);
+      pal[i][3] = 255;
+    }
+    stbi__skip(s,
+               info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
+    if (info.bpp == 1)
+      width = (s->img_x + 7) >> 3;
+    else if (info.bpp == 4)
+      width = (s->img_x + 1) >> 1;
+    else if (info.bpp == 8)
+      width = s->img_x;
+    else {
+      STBI_FREE(out);
+      return stbi__errpuc("bad bpp", "Corrupt BMP");
+    }
+    pad = (-width) & 3;
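+    // each row of the file is padded to a 4-byte boundary; (-width) & 3 is
+    // the number of padding bytes needed (e.g. width == 5 -> pad == 3,
+    // width == 8 -> pad == 0)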
+    if (info.bpp == 1) {
+      for (j = 0; j < (int)s->img_y; ++j) {
+        int bit_offset = 7, v = stbi__get8(s);
+        for (i = 0; i < (int)s->img_x; ++i) {
+          int color = (v >> bit_offset) & 0x1;
+          out[z++] = pal[color][0];
+          out[z++] = pal[color][1];
+          out[z++] = pal[color][2];
+          if (target == 4)
+            out[z++] = 255;
+          if (i + 1 == (int)s->img_x)
+            break;
+          if ((--bit_offset) < 0) {
+            bit_offset = 7;
+            v = stbi__get8(s);
+          }
+        }
+        stbi__skip(s, pad);
       }
-   } else {
-      int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
-      int z = 0;
-      int easy=0;
-      stbi__skip(s, info.offset - 14 - info.hsz);
-      if (info.bpp == 24) width = 3 * s->img_x;
-      else if (info.bpp == 16) width = 2*s->img_x;
-      else /* bpp = 32 and pad = 0 */ width=0;
-      pad = (-width) & 3;
-      if (info.bpp == 24) {
-         easy = 1;
-      } else if (info.bpp == 32) {
-         if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
-            easy = 2;
+    } else {
+      for (j = 0; j < (int)s->img_y; ++j) {
+        for (i = 0; i < (int)s->img_x; i += 2) {
+          int v = stbi__get8(s), v2 = 0;
+          if (info.bpp == 4) {
+            v2 = v & 15;
+            v >>= 4;
+          }
+          out[z++] = pal[v][0];
+          out[z++] = pal[v][1];
+          out[z++] = pal[v][2];
+          if (target == 4)
+            out[z++] = 255;
+          if (i + 1 == (int)s->img_x)
+            break;
+          v = (info.bpp == 8) ? stbi__get8(s) : v2;
+          out[z++] = pal[v][0];
+          out[z++] = pal[v][1];
+          out[z++] = pal[v][2];
+          if (target == 4)
+            out[z++] = 255;
+        }
+        stbi__skip(s, pad);
       }
-      if (!easy) {
-         if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
-         // right shift amt to put high bit in position #7
-         rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr);
-         gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg);
-         bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb);
-         ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma);
+    }
+  } else {
+    int rshift = 0, gshift = 0, bshift = 0, ashift = 0, rcount = 0, gcount = 0,
+        bcount = 0, acount = 0;
+    int z = 0;
+    int easy = 0;
+    stbi__skip(s, info.offset - 14 - info.hsz);
+    if (info.bpp == 24)
+      width = 3 * s->img_x;
+    else if (info.bpp == 16)
+      width = 2 * s->img_x;
+    else /* bpp = 32 and pad = 0 */
+      width = 0;
+    pad = (-width) & 3;
+    if (info.bpp == 24) {
+      easy = 1;
+    } else if (info.bpp == 32) {
+      if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
+        easy = 2;
+    }
+    if (!easy) {
+      if (!mr || !mg || !mb) {
+        STBI_FREE(out);
+        return stbi__errpuc("bad masks", "Corrupt BMP");
       }
-      for (j=0; j < (int) s->img_y; ++j) {
-         if (easy) {
-            for (i=0; i < (int) s->img_x; ++i) {
-               unsigned char a;
-               out[z+2] = stbi__get8(s);
-               out[z+1] = stbi__get8(s);
-               out[z+0] = stbi__get8(s);
-               z += 3;
-               a = (easy == 2 ? stbi__get8(s) : 255);
-               all_a |= a;
-               if (target == 4) out[z++] = a;
-            }
-         } else {
-            int bpp = info.bpp;
-            for (i=0; i < (int) s->img_x; ++i) {
-               stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s));
-               unsigned int a;
-               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
-               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
-               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
-               a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
-               all_a |= a;
-               if (target == 4) out[z++] = STBI__BYTECAST(a);
-            }
-         }
-         stbi__skip(s, pad);
+      // right shift amt to put high bit in position #7
+      rshift = stbi__high_bit(mr) - 7;
+      rcount = stbi__bitcount(mr);
+      gshift = stbi__high_bit(mg) - 7;
+      gcount = stbi__bitcount(mg);
+      bshift = stbi__high_bit(mb) - 7;
+      bcount = stbi__bitcount(mb);
+      ashift = stbi__high_bit(ma) - 7;
+      acount = stbi__bitcount(ma);
+    }
+    for (j = 0; j < (int)s->img_y; ++j) {
+      if (easy) {
+        for (i = 0; i < (int)s->img_x; ++i) {
+          unsigned char a;
+          out[z + 2] = stbi__get8(s);
+          out[z + 1] = stbi__get8(s);
+          out[z + 0] = stbi__get8(s);
+          z += 3;
+          a = (easy == 2 ? stbi__get8(s) : 255);
+          all_a |= a;
+          if (target == 4)
+            out[z++] = a;
+        }
+      } else {
+        int bpp = info.bpp;
+        for (i = 0; i < (int)s->img_x; ++i) {
+          stbi__uint32 v =
+              (bpp == 16 ? (stbi__uint32)stbi__get16le(s) : stbi__get32le(s));
+          unsigned int a;
+          out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
+          out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
+          out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
+          a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
+          all_a |= a;
+          if (target == 4)
+            out[z++] = STBI__BYTECAST(a);
+        }
       }
-   }
-
-   // if alpha channel is all 0s, replace with all 255s
-   if (target == 4 && all_a == 0)
-      for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4)
-         out[i] = 255;
-
-   if (flip_vertically) {
-      stbi_uc t;
-      for (j=0; j < (int) s->img_y>>1; ++j) {
-         stbi_uc *p1 = out +      j     *s->img_x*target;
-         stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target;
-         for (i=0; i < (int) s->img_x*target; ++i) {
-            t = p1[i]; p1[i] = p2[i]; p2[i] = t;
-         }
+      stbi__skip(s, pad);
+    }
+  }
+
+  // if alpha channel is all 0s, replace with all 255s
+  if (target == 4 && all_a == 0)
+    for (i = 4 * s->img_x * s->img_y - 1; i >= 0; i -= 4)
+      out[i] = 255;
+
+  if (flip_vertically) {
+    stbi_uc t;
+    for (j = 0; j < ((int)s->img_y >> 1); ++j) {
+      stbi_uc *p1 = out + j * s->img_x * target;
+      stbi_uc *p2 = out + (s->img_y - 1 - j) * s->img_x * target;
+      for (i = 0; i < (int)s->img_x * target; ++i) {
+        t = p1[i];
+        p1[i] = p2[i];
+        p2[i] = t;
       }
-   }
+    }
+  }
 
-   if (req_comp && req_comp != target) {
-      out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y);
-      if (out == NULL) return out; // stbi__convert_format frees input on failure
-   }
+  if (req_comp && req_comp != target) {
+    out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y);
+    if (out == NULL)
+      return out; // stbi__convert_format frees input on failure
+  }
 
-   *x = s->img_x;
-   *y = s->img_y;
-   if (comp) *comp = s->img_n;
-   return out;
+  *x = s->img_x;
+  *y = s->img_y;
+  if (comp)
+    *comp = s->img_n;
+  return out;
 }
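+
+// For reference, the mask arithmetic above for a common 5-6-5 16-bit BMP
+// (illustrative values only, assuming compress == 3 bitfield masks):
+//   mr = 0xF800: high_bit = 15 -> rshift = 8,  rcount = 5
+//   mg = 0x07E0: high_bit = 10 -> gshift = 3,  gcount = 6
+//   mb = 0x001F: high_bit =  4 -> bshift = -3, bcount = 5
+// stbi__shiftsigned() shifts each masked value so the mask's top bit lands in
+// bit 7 (a negative shift meaning a left shift) and then replicates the high
+// bits so a full-scale 5- or 6-bit value maps to exactly 255.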
 #endif
 
@@ -5392,581 +6030,610 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req
 // by Jonathan Dummer
 #ifndef STBI_NO_TGA
 // returns STBI_rgb or whatever, 0 on error
-static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16)
-{
-   // only RGB or RGBA (incl. 16bit) or grey allowed
-   if (is_rgb16) *is_rgb16 = 0;
-   switch(bits_per_pixel) {
-      case 8:  return STBI_grey;
-      case 16: if(is_grey) return STBI_grey_alpha;
-               // fallthrough
-      case 15: if(is_rgb16) *is_rgb16 = 1;
-               return STBI_rgb;
-      case 24: // fallthrough
-      case 32: return bits_per_pixel/8;
-      default: return 0;
-   }
-}
-
-static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
-{
-    int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp;
-    int sz, tga_colormap_type;
-    stbi__get8(s);                   // discard Offset
-    tga_colormap_type = stbi__get8(s); // colormap type
-    if( tga_colormap_type > 1 ) {
-        stbi__rewind(s);
-        return 0;      // only RGB or indexed allowed
-    }
-    tga_image_type = stbi__get8(s); // image type
-    if ( tga_colormap_type == 1 ) { // colormapped (paletted) image
-        if (tga_image_type != 1 && tga_image_type != 9) {
-            stbi__rewind(s);
-            return 0;
-        }
-        stbi__skip(s,4);       // skip index of first colormap entry and number of entries
-        sz = stbi__get8(s);    //   check bits per palette color entry
-        if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) {
-            stbi__rewind(s);
-            return 0;
-        }
-        stbi__skip(s,4);       // skip image x and y origin
-        tga_colormap_bpp = sz;
-    } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
-        if ( (tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11) ) {
-            stbi__rewind(s);
-            return 0; // only RGB or grey allowed, +/- RLE
-        }
-        stbi__skip(s,9); // skip colormap specification and image x/y origin
-        tga_colormap_bpp = 0;
-    }
-    tga_w = stbi__get16le(s);
-    if( tga_w < 1 ) {
-        stbi__rewind(s);
-        return 0;   // test width
+static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int *is_rgb16) {
+  // only RGB or RGBA (incl. 16bit) or grey allowed
+  if (is_rgb16)
+    *is_rgb16 = 0;
+  switch (bits_per_pixel) {
+  case 8:
+    return STBI_grey;
+  case 16:
+    if (is_grey)
+      return STBI_grey_alpha;
+    // fallthrough
+  case 15:
+    if (is_rgb16)
+      *is_rgb16 = 1;
+    return STBI_rgb;
+  case 24: // fallthrough
+  case 32:
+    return bits_per_pixel / 8;
+  default:
+    return 0;
+  }
+}
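+
+// In summary, the mapping above is:
+//   8 bpp             -> STBI_grey       (1 component)
+//   16 bpp, greyscale -> STBI_grey_alpha (2 components)
+//   15/16 bpp, color  -> STBI_rgb        (3 components, *is_rgb16 set)
+//   24 bpp            -> 3 components
+//   32 bpp            -> 4 components
+//   anything else     -> 0 (error)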
+
+static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp) {
+  int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel,
+      tga_colormap_bpp;
+  int sz, tga_colormap_type;
+  stbi__get8(s);                     // discard Offset
+  tga_colormap_type = stbi__get8(s); // colormap type
+  if (tga_colormap_type > 1) {
+    stbi__rewind(s);
+    return 0; // only RGB or indexed allowed
+  }
+  tga_image_type = stbi__get8(s); // image type
+  if (tga_colormap_type == 1) {   // colormapped (paletted) image
+    if (tga_image_type != 1 && tga_image_type != 9) {
+      stbi__rewind(s);
+      return 0;
     }
-    tga_h = stbi__get16le(s);
-    if( tga_h < 1 ) {
-        stbi__rewind(s);
-        return 0;   // test height
+    // skip index of first colormap entry and number of entries
+    stbi__skip(s, 4);
+    sz = stbi__get8(s); //   check bits per palette color entry
+    if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) {
+      stbi__rewind(s);
+      return 0;
     }
-    tga_bits_per_pixel = stbi__get8(s); // bits per pixel
-    stbi__get8(s); // ignore alpha bits
-    if (tga_colormap_bpp != 0) {
-        if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) {
-            // when using a colormap, tga_bits_per_pixel is the size of the indexes
-            // I don't think anything but 8 or 16bit indexes makes sense
-            stbi__rewind(s);
-            return 0;
-        }
-        tga_comp = stbi__tga_get_comp(tga_colormap_bpp, 0, NULL);
-    } else {
-        tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL);
+    stbi__skip(s, 4); // skip image x and y origin
+    tga_colormap_bpp = sz;
+  } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
+    if ((tga_image_type != 2) && (tga_image_type != 3) &&
+        (tga_image_type != 10) && (tga_image_type != 11)) {
+      stbi__rewind(s);
+      return 0; // only RGB or grey allowed, +/- RLE
     }
-    if(!tga_comp) {
+    stbi__skip(s, 9); // skip colormap specification and image x/y origin
+    tga_colormap_bpp = 0;
+  }
+  tga_w = stbi__get16le(s);
+  if (tga_w < 1) {
+    stbi__rewind(s);
+    return 0; // test width
+  }
+  tga_h = stbi__get16le(s);
+  if (tga_h < 1) {
+    stbi__rewind(s);
+    return 0; // test height
+  }
+  tga_bits_per_pixel = stbi__get8(s); // bits per pixel
+  stbi__get8(s);                      // ignore alpha bits
+  if (tga_colormap_bpp != 0) {
+    if ((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) {
+      // when using a colormap, tga_bits_per_pixel is the size of the indexes
+      // I don't think anything but 8 or 16bit indexes makes sense
       stbi__rewind(s);
       return 0;
     }
-    if (x) *x = tga_w;
-    if (y) *y = tga_h;
-    if (comp) *comp = tga_comp;
-    return 1;                   // seems to have passed everything
-}
-
-static int stbi__tga_test(stbi__context *s)
-{
-   int res = 0;
-   int sz, tga_color_type;
-   stbi__get8(s);      //   discard Offset
-   tga_color_type = stbi__get8(s);   //   color type
-   if ( tga_color_type > 1 ) goto errorEnd;   //   only RGB or indexed allowed
-   sz = stbi__get8(s);   //   image type
-   if ( tga_color_type == 1 ) { // colormapped (paletted) image
-      if (sz != 1 && sz != 9) goto errorEnd; // colortype 1 demands image type 1 or 9
-      stbi__skip(s,4);       // skip index of first colormap entry and number of entries
-      sz = stbi__get8(s);    //   check bits per palette color entry
-      if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
-      stbi__skip(s,4);       // skip image x and y origin
-   } else { // "normal" image w/o colormap
-      if ( (sz != 2) && (sz != 3) && (sz != 10) && (sz != 11) ) goto errorEnd; // only RGB or grey allowed, +/- RLE
-      stbi__skip(s,9); // skip colormap specification and image x/y origin
-   }
-   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test width
-   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test height
-   sz = stbi__get8(s);   //   bits per pixel
-   if ( (tga_color_type == 1) && (sz != 8) && (sz != 16) ) goto errorEnd; // for colormapped images, bpp is size of an index
-   if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
-
-   res = 1; // if we got this far, everything's good and we can return 1 instead of 0
+    tga_comp = stbi__tga_get_comp(tga_colormap_bpp, 0, NULL);
+  } else {
+    tga_comp = stbi__tga_get_comp(
+        tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11),
+        NULL);
+  }
+  if (!tga_comp) {
+    stbi__rewind(s);
+    return 0;
+  }
+  if (x)
+    *x = tga_w;
+  if (y)
+    *y = tga_h;
+  if (comp)
+    *comp = tga_comp;
+  return 1; // seems to have passed everything
+}
+
+static int stbi__tga_test(stbi__context *s) {
+  int res = 0;
+  int sz, tga_color_type;
+  stbi__get8(s);                  //   discard Offset
+  tga_color_type = stbi__get8(s); //   color type
+  if (tga_color_type > 1)
+    goto errorEnd;           //   only RGB or indexed allowed
+  sz = stbi__get8(s);        //   image type
+  if (tga_color_type == 1) { // colormapped (paletted) image
+    if (sz != 1 && sz != 9)
+      goto errorEnd; // colortype 1 demands image type 1 or 9
+    // skip index of first colormap entry and number of entries
+    stbi__skip(s, 4);
+    sz = stbi__get8(s); //   check bits per palette color entry
+    if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
+      goto errorEnd;
+    stbi__skip(s, 4); // skip image x and y origin
+  } else {            // "normal" image w/o colormap
+    if ((sz != 2) && (sz != 3) && (sz != 10) && (sz != 11))
+      goto errorEnd;  // only RGB or grey allowed, +/- RLE
+    stbi__skip(s, 9); // skip colormap specification and image x/y origin
+  }
+  if (stbi__get16le(s) < 1)
+    goto errorEnd; //   test width
+  if (stbi__get16le(s) < 1)
+    goto errorEnd;    //   test height
+  sz = stbi__get8(s); //   bits per pixel
+  if ((tga_color_type == 1) && (sz != 8) && (sz != 16))
+    goto errorEnd; // for colormapped images, bpp is size of an index
+  if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
+    goto errorEnd;
+
+  res = 1; // if we got this far, everything's good and we can return 1 instead
+           // of 0
 
 errorEnd:
-   stbi__rewind(s);
-   return res;
+  stbi__rewind(s);
+  return res;
 }
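+
+// For reference, the fixed 18-byte TGA header that stbi__tga_test and
+// stbi__tga_info walk (an informal sketch; stbi__tga_load below reads the
+// same fields):
+//   u8  id length         u8  colormap type   u8  image type
+//   u16 first cmap entry  u16 cmap length     u8  cmap entry bits
+//   u16 x origin          u16 y origin
+//   u16 width             u16 height
+//   u8  bits per pixel    u8  image descriptor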
 
 // read 16bit value and convert to 24bit RGB
-static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
-{
-   stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
-   stbi__uint16 fiveBitMask = 31;
-   // we have 3 channels with 5bits each
-   int r = (px >> 10) & fiveBitMask;
-   int g = (px >> 5) & fiveBitMask;
-   int b = px & fiveBitMask;
-   // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later
-   out[0] = (stbi_uc)((r * 255)/31);
-   out[1] = (stbi_uc)((g * 255)/31);
-   out[2] = (stbi_uc)((b * 255)/31);
-
-   // some people claim that the most significant bit might be used for alpha
-   // (possibly if an alpha-bit is set in the "image descriptor byte")
-   // but that only made 16bit test images completely translucent..
-   // so let's treat all 15 and 16bit TGAs as RGB with no alpha.
-}
-
-static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   //   read in the TGA header stuff
-   int tga_offset = stbi__get8(s);
-   int tga_indexed = stbi__get8(s);
-   int tga_image_type = stbi__get8(s);
-   int tga_is_RLE = 0;
-   int tga_palette_start = stbi__get16le(s);
-   int tga_palette_len = stbi__get16le(s);
-   int tga_palette_bits = stbi__get8(s);
-   int tga_x_origin = stbi__get16le(s);
-   int tga_y_origin = stbi__get16le(s);
-   int tga_width = stbi__get16le(s);
-   int tga_height = stbi__get16le(s);
-   int tga_bits_per_pixel = stbi__get8(s);
-   int tga_comp, tga_rgb16=0;
-   int tga_inverted = stbi__get8(s);
-   // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?)
-   //   image data
-   unsigned char *tga_data;
-   unsigned char *tga_palette = NULL;
-   int i, j;
-   unsigned char raw_data[4] = {0};
-   int RLE_count = 0;
-   int RLE_repeating = 0;
-   int read_next_pixel = 1;
-   STBI_NOTUSED(ri);
-   STBI_NOTUSED(tga_x_origin); // @TODO
-   STBI_NOTUSED(tga_y_origin); // @TODO
-
-   //   do a tiny bit of precessing
-   if ( tga_image_type >= 8 )
-   {
-      tga_image_type -= 8;
-      tga_is_RLE = 1;
-   }
-   tga_inverted = 1 - ((tga_inverted >> 5) & 1);
-
-   //   If I'm paletted, then I'll use the number of bits from the palette
-   if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
-   else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
-
-   if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
-      return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
-
-   //   tga info
-   *x = tga_width;
-   *y = tga_height;
-   if (comp) *comp = tga_comp;
-
-   if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
-      return stbi__errpuc("too large", "Corrupt TGA");
-
-   tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
-   if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
-
-   // skip to the data's starting position (offset usually = 0)
-   stbi__skip(s, tga_offset );
-
-   if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) {
-      for (i=0; i < tga_height; ++i) {
-         int row = tga_inverted ? tga_height -i - 1 : i;
-         stbi_uc *tga_row = tga_data + row*tga_width*tga_comp;
-         stbi__getn(s, tga_row, tga_width * tga_comp);
-      }
-   } else  {
-      //   do I need to load a palette?
-      if ( tga_indexed)
-      {
-         //   any data to skip? (offset usually = 0)
-         stbi__skip(s, tga_palette_start );
-         //   load the palette
-         tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
-         if (!tga_palette) {
-            STBI_FREE(tga_data);
-            return stbi__errpuc("outofmem", "Out of memory");
-         }
-         if (tga_rgb16) {
-            stbi_uc *pal_entry = tga_palette;
-            STBI_ASSERT(tga_comp == STBI_rgb);
-            for (i=0; i < tga_palette_len; ++i) {
-               stbi__tga_read_rgb16(s, pal_entry);
-               pal_entry += tga_comp;
-            }
-         } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
-               STBI_FREE(tga_data);
-               STBI_FREE(tga_palette);
-               return stbi__errpuc("bad palette", "Corrupt TGA");
-         }
-      }
-      //   load the data
-      for (i=0; i < tga_width * tga_height; ++i)
-      {
-         //   if I'm in RLE mode, do I need to get a RLE stbi__pngchunk?
-         if ( tga_is_RLE )
-         {
-            if ( RLE_count == 0 )
-            {
-               //   yep, get the next byte as a RLE command
-               int RLE_cmd = stbi__get8(s);
-               RLE_count = 1 + (RLE_cmd & 127);
-               RLE_repeating = RLE_cmd >> 7;
-               read_next_pixel = 1;
-            } else if ( !RLE_repeating )
-            {
-               read_next_pixel = 1;
-            }
-         } else
-         {
-            read_next_pixel = 1;
-         }
-         //   OK, if I need to read a pixel, do it now
-         if ( read_next_pixel )
-         {
-            //   load however much data we did have
-            if ( tga_indexed )
-            {
-               // read in index, then perform the lookup
-               int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
-               if ( pal_idx >= tga_palette_len ) {
-                  // invalid index
-                  pal_idx = 0;
-               }
-               pal_idx *= tga_comp;
-               for (j = 0; j < tga_comp; ++j) {
-                  raw_data[j] = tga_palette[pal_idx+j];
-               }
-            } else if(tga_rgb16) {
-               STBI_ASSERT(tga_comp == STBI_rgb);
-               stbi__tga_read_rgb16(s, raw_data);
-            } else {
-               //   read in the data raw
-               for (j = 0; j < tga_comp; ++j) {
-                  raw_data[j] = stbi__get8(s);
-               }
-            }
-            //   clear the reading flag for the next pixel
-            read_next_pixel = 0;
-         } // end of reading a pixel
-
-         // copy data
-         for (j = 0; j < tga_comp; ++j)
-           tga_data[i*tga_comp+j] = raw_data[j];
-
-         //   in case we're in RLE mode, keep counting down
-         --RLE_count;
+static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc *out) {
+  stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
+  stbi__uint16 fiveBitMask = 31;
+  // we have 3 channels with 5bits each
+  int r = (px >> 10) & fiveBitMask;
+  int g = (px >> 5) & fiveBitMask;
+  int b = px & fiveBitMask;
+  // Note that this saves the data in RGB(A) order, so it doesn't need to be
+  // swapped later
+  out[0] = (stbi_uc)((r * 255) / 31);
+  out[1] = (stbi_uc)((g * 255) / 31);
+  out[2] = (stbi_uc)((b * 255) / 31);
+
+  // some people claim that the most significant bit might be used for alpha
+  // (possibly if an alpha-bit is set in the "image descriptor byte")
+  // but that only made 16bit test images completely translucent...
+  // so let's treat all 15 and 16bit TGAs as RGB with no alpha.
+}
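+
+// For reference: px = 0x7FFF gives r = g = b = 31, and (31*255)/31 = 255;
+// a mid-range r = 16 gives (16*255)/31 = 131, so the 0..31 range scales onto
+// 0..255 with both endpoints exact.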
+
+static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp,
+                            int req_comp, stbi__result_info *ri) {
+  //   read in the TGA header stuff
+  int tga_offset = stbi__get8(s);
+  int tga_indexed = stbi__get8(s);
+  int tga_image_type = stbi__get8(s);
+  int tga_is_RLE = 0;
+  int tga_palette_start = stbi__get16le(s);
+  int tga_palette_len = stbi__get16le(s);
+  int tga_palette_bits = stbi__get8(s);
+  int tga_x_origin = stbi__get16le(s);
+  int tga_y_origin = stbi__get16le(s);
+  int tga_width = stbi__get16le(s);
+  int tga_height = stbi__get16le(s);
+  int tga_bits_per_pixel = stbi__get8(s);
+  int tga_comp, tga_rgb16 = 0;
+  int tga_inverted = stbi__get8(s);
+  // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?)
+  //   image data
+  unsigned char *tga_data;
+  unsigned char *tga_palette = NULL;
+  int i, j;
+  unsigned char raw_data[4] = {0};
+  int RLE_count = 0;
+  int RLE_repeating = 0;
+  int read_next_pixel = 1;
+  STBI_NOTUSED(ri);
+  STBI_NOTUSED(tga_x_origin); // @TODO
+  STBI_NOTUSED(tga_y_origin); // @TODO
+
+  //   do a tiny bit of preprocessing
+  if (tga_image_type >= 8) {
+    tga_image_type -= 8;
+    tga_is_RLE = 1;
+  }
+  tga_inverted = 1 - ((tga_inverted >> 5) & 1);
+
+  //   If I'm paletted, then I'll use the number of bits from the palette
+  if (tga_indexed)
+    tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
+  else
+    tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3),
+                                  &tga_rgb16);
+
+  if (!tga_comp) // shouldn't really happen, stbi__tga_test() should have
+                 // ensured basic consistency
+    return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
+
+  //   tga info
+  *x = tga_width;
+  *y = tga_height;
+  if (comp)
+    *comp = tga_comp;
+
+  if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
+    return stbi__errpuc("too large", "Corrupt TGA");
+
+  tga_data =
+      (unsigned char *)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
+  if (!tga_data)
+    return stbi__errpuc("outofmem", "Out of memory");
+
+  // skip to the data's starting position (offset usually = 0)
+  stbi__skip(s, tga_offset);
+
+  if (!tga_indexed && !tga_is_RLE && !tga_rgb16) {
+    for (i = 0; i < tga_height; ++i) {
+      int row = tga_inverted ? tga_height - i - 1 : i;
+      stbi_uc *tga_row = tga_data + row * tga_width * tga_comp;
+      stbi__getn(s, tga_row, tga_width * tga_comp);
+    }
+  } else {
+    //   do I need to load a palette?
+    if (tga_indexed) {
+      //   any data to skip? (offset usually = 0)
+      stbi__skip(s, tga_palette_start);
+      //   load the palette
+      tga_palette =
+          (unsigned char *)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
+      if (!tga_palette) {
+        STBI_FREE(tga_data);
+        return stbi__errpuc("outofmem", "Out of memory");
       }
-      //   do I need to invert the image?
-      if ( tga_inverted )
-      {
-         for (j = 0; j*2 < tga_height; ++j)
-         {
-            int index1 = j * tga_width * tga_comp;
-            int index2 = (tga_height - 1 - j) * tga_width * tga_comp;
-            for (i = tga_width * tga_comp; i > 0; --i)
-            {
-               unsigned char temp = tga_data[index1];
-               tga_data[index1] = tga_data[index2];
-               tga_data[index2] = temp;
-               ++index1;
-               ++index2;
-            }
-         }
+      if (tga_rgb16) {
+        stbi_uc *pal_entry = tga_palette;
+        STBI_ASSERT(tga_comp == STBI_rgb);
+        for (i = 0; i < tga_palette_len; ++i) {
+          stbi__tga_read_rgb16(s, pal_entry);
+          pal_entry += tga_comp;
+        }
+      } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
+        STBI_FREE(tga_data);
+        STBI_FREE(tga_palette);
+        return stbi__errpuc("bad palette", "Corrupt TGA");
       }
-      //   clear my palette, if I had one
-      if ( tga_palette != NULL )
-      {
-         STBI_FREE( tga_palette );
+    }
+    //   load the data
+    for (i = 0; i < tga_width * tga_height; ++i) {
+      //   if I'm in RLE mode, do I need to get an RLE packet?
+      if (tga_is_RLE) {
+        if (RLE_count == 0) {
+          //   yep, get the next byte as an RLE command
+          int RLE_cmd = stbi__get8(s);
+          RLE_count = 1 + (RLE_cmd & 127);
+          RLE_repeating = RLE_cmd >> 7;
+          read_next_pixel = 1;
+        } else if (!RLE_repeating) {
+          read_next_pixel = 1;
+        }
+      } else {
+        read_next_pixel = 1;
       }
-   }
+      //   OK, if I need to read a pixel, do it now
+      if (read_next_pixel) {
+        //   load however much data we did have
+        if (tga_indexed) {
+          // read in index, then perform the lookup
+          int pal_idx =
+              (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
+          if (pal_idx >= tga_palette_len) {
+            // invalid index
+            pal_idx = 0;
+          }
+          pal_idx *= tga_comp;
+          for (j = 0; j < tga_comp; ++j) {
+            raw_data[j] = tga_palette[pal_idx + j];
+          }
+        } else if (tga_rgb16) {
+          STBI_ASSERT(tga_comp == STBI_rgb);
+          stbi__tga_read_rgb16(s, raw_data);
+        } else {
+          //   read in the data raw
+          for (j = 0; j < tga_comp; ++j) {
+            raw_data[j] = stbi__get8(s);
+          }
+        }
+        //   clear the reading flag for the next pixel
+        read_next_pixel = 0;
+      } // end of reading a pixel
 
-   // swap RGB - if the source data was RGB16, it already is in the right order
-   if (tga_comp >= 3 && !tga_rgb16)
-   {
-      unsigned char* tga_pixel = tga_data;
-      for (i=0; i < tga_width * tga_height; ++i)
-      {
-         unsigned char temp = tga_pixel[0];
-         tga_pixel[0] = tga_pixel[2];
-         tga_pixel[2] = temp;
-         tga_pixel += tga_comp;
+      // copy data
+      for (j = 0; j < tga_comp; ++j)
+        tga_data[i * tga_comp + j] = raw_data[j];
+
+      //   in case we're in RLE mode, keep counting down
+      --RLE_count;
+    }
+    //   do I need to invert the image?
+    if (tga_inverted) {
+      for (j = 0; j * 2 < tga_height; ++j) {
+        int index1 = j * tga_width * tga_comp;
+        int index2 = (tga_height - 1 - j) * tga_width * tga_comp;
+        for (i = tga_width * tga_comp; i > 0; --i) {
+          unsigned char temp = tga_data[index1];
+          tga_data[index1] = tga_data[index2];
+          tga_data[index2] = temp;
+          ++index1;
+          ++index2;
+        }
       }
-   }
+    }
+    //   clear my palette, if I had one
+    if (tga_palette != NULL) {
+      STBI_FREE(tga_palette);
+    }
+  }
+
+  // swap RGB - if the source data was RGB16, it already is in the right order
+  if (tga_comp >= 3 && !tga_rgb16) {
+    unsigned char *tga_pixel = tga_data;
+    for (i = 0; i < tga_width * tga_height; ++i) {
+      unsigned char temp = tga_pixel[0];
+      tga_pixel[0] = tga_pixel[2];
+      tga_pixel[2] = temp;
+      tga_pixel += tga_comp;
+    }
+  }
 
-   // convert to target component count
-   if (req_comp && req_comp != tga_comp)
-      tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height);
+  // convert to target component count
+  if (req_comp && req_comp != tga_comp)
+    tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width,
+                                    tga_height);
 
-   //   the things I do to get rid of an error message, and yet keep
-   //   Microsoft's C compilers happy... [8^(
-   tga_palette_start = tga_palette_len = tga_palette_bits =
-         tga_x_origin = tga_y_origin = 0;
-   STBI_NOTUSED(tga_palette_start);
-   //   OK, done
-   return tga_data;
+  //   the things I do to get rid of an error message, and yet keep
+  //   Microsoft's C compilers happy... [8^(
+  tga_palette_start = tga_palette_len = tga_palette_bits = tga_x_origin =
+      tga_y_origin = 0;
+  STBI_NOTUSED(tga_palette_start);
+  //   OK, done
+  return tga_data;
 }
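+
+// For reference, the RLE commands handled above (illustrative values only):
+//   RLE_cmd = 0x85: high bit set   -> run packet, RLE_count = 1+(0x85 & 127)
+//                   = 6; one pixel is read and then repeated 6 times.
+//   RLE_cmd = 0x05: high bit clear -> raw packet, RLE_count = 6; the next 6
+//                   pixels are each read literally.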
 #endif
 
 // *************************************************************************************************
-// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB
+// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz,
+// tweaked by STB
 
 #ifndef STBI_NO_PSD
-static int stbi__psd_test(stbi__context *s)
-{
-   int r = (stbi__get32be(s) == 0x38425053);
-   stbi__rewind(s);
-   return r;
-}
-
-static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount)
-{
-   int count, nleft, len;
-
-   count = 0;
-   while ((nleft = pixelCount - count) > 0) {
-      len = stbi__get8(s);
-      if (len == 128) {
-         // No-op.
-      } else if (len < 128) {
-         // Copy next len+1 bytes literally.
-         len++;
-         if (len > nleft) return 0; // corrupt data
-         count += len;
-         while (len) {
-            *p = stbi__get8(s);
-            p += 4;
-            len--;
-         }
-      } else if (len > 128) {
-         stbi_uc   val;
-         // Next -len+1 bytes in the dest are replicated from next source byte.
-         // (Interpret len as a negative 8-bit int.)
-         len = 257 - len;
-         if (len > nleft) return 0; // corrupt data
-         val = stbi__get8(s);
-         count += len;
-         while (len) {
-            *p = val;
-            p += 4;
-            len--;
-         }
+static int stbi__psd_test(stbi__context *s) {
+  int r = (stbi__get32be(s) == 0x38425053);
+  stbi__rewind(s);
+  return r;
+}
+
+static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount) {
+  int count, nleft, len;
+
+  count = 0;
+  while ((nleft = pixelCount - count) > 0) {
+    len = stbi__get8(s);
+    if (len == 128) {
+      // No-op.
+    } else if (len < 128) {
+      // Copy next len+1 bytes literally.
+      len++;
+      if (len > nleft)
+        return 0; // corrupt data
+      count += len;
+      while (len) {
+        *p = stbi__get8(s);
+        p += 4;
+        len--;
       }
-   }
-
-   return 1;
-}
+    } else if (len > 128) {
+      stbi_uc val;
+      // Next -len+1 bytes in the dest are replicated from next source byte.
+      // (Interpret len as a negative 8-bit int.)
+      len = 257 - len;
+      if (len > nleft)
+        return 0; // corrupt data
+      val = stbi__get8(s);
+      count += len;
+      while (len) {
+        *p = val;
+        p += 4;
+        len--;
+      }
+    }
+  }
+
+  return 1;
+}
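+
+// A stand-alone sketch of the same PackBits-style scheme over a plain byte
+// buffer (packbits_decode is a hypothetical helper, not part of stb_image;
+// the decoder above additionally strides by 4 to interleave channels):
+#if 0
+static int packbits_decode(const unsigned char *src, unsigned char *dst,
+                           int n) {
+  int count = 0;
+  while (count < n) {
+    int len = *src++;
+    if (len == 128)
+      continue; // no-op
+    if (len < 128) { // copy the next len+1 bytes literally
+      len++;
+      if (len > n - count)
+        return 0; // corrupt data
+      count += len;
+      while (len--)
+        *dst++ = *src++;
+    } else { // replicate the next byte 257-len times
+      unsigned char val;
+      len = 257 - len;
+      if (len > n - count)
+        return 0; // corrupt data
+      val = *src++;
+      count += len;
+      while (len--)
+        *dst++ = val;
+    }
+  }
+  return 1;
+}
+#endif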
+
+static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp,
+                            int req_comp, stbi__result_info *ri, int bpc) {
+  int pixelCount;
+  int channelCount, compression;
+  int channel, i;
+  int bitdepth;
+  int w, h;
+  stbi_uc *out;
+  STBI_NOTUSED(ri);
+
+  // Check identifier
+  if (stbi__get32be(s) != 0x38425053) // "8BPS"
+    return stbi__errpuc("not PSD", "Corrupt PSD image");
+
+  // Check file type version.
+  if (stbi__get16be(s) != 1)
+    return stbi__errpuc("wrong version", "Unsupported version of PSD image");
+
+  // Skip 6 reserved bytes.
+  stbi__skip(s, 6);
+
+  // Read the number of channels (R, G, B, A, etc).
+  channelCount = stbi__get16be(s);
+  if (channelCount < 0 || channelCount > 16)
+    return stbi__errpuc("wrong channel count",
+                        "Unsupported number of channels in PSD image");
+
+  // Read the rows and columns of the image.
+  h = stbi__get32be(s);
+  w = stbi__get32be(s);
+
+  // Make sure the depth is 8 or 16 bits.
+  bitdepth = stbi__get16be(s);
+  if (bitdepth != 8 && bitdepth != 16)
+    return stbi__errpuc("unsupported bit depth",
+                        "PSD bit depth is not 8 or 16 bit");
+
+  // Make sure the color mode is RGB.
+  // Valid options are:
+  //   0: Bitmap
+  //   1: Grayscale
+  //   2: Indexed color
+  //   3: RGB color
+  //   4: CMYK color
+  //   7: Multichannel
+  //   8: Duotone
+  //   9: Lab color
+  if (stbi__get16be(s) != 3)
+    return stbi__errpuc("wrong color format", "PSD is not in RGB color format");
+
+  // Skip the Mode Data.  (It's the palette for indexed color; other info for
+  // other modes.)
+  stbi__skip(s, stbi__get32be(s));
+
+  // Skip the image resources.  (resolution, pen tool paths, etc)
+  stbi__skip(s, stbi__get32be(s));
+
+  // Skip the reserved data.
+  stbi__skip(s, stbi__get32be(s));
+
+  // Find out if the data is compressed.
+  // Known values:
+  //   0: no compression
+  //   1: RLE compressed
+  compression = stbi__get16be(s);
+  if (compression > 1)
+    return stbi__errpuc("bad compression",
+                        "PSD has an unknown compression format");
+
+  // Check size
+  if (!stbi__mad3sizes_valid(4, w, h, 0))
+    return stbi__errpuc("too large", "Corrupt PSD");
+
+  // Create the destination image.
+
+  if (!compression && bitdepth == 16 && bpc == 16) {
+    out = (stbi_uc *)stbi__malloc_mad3(8, w, h, 0);
+    ri->bits_per_channel = 16;
+  } else
+    out = (stbi_uc *)stbi__malloc(4 * w * h);
+
+  if (!out)
+    return stbi__errpuc("outofmem", "Out of memory");
+  pixelCount = w * h;
+
+  // Initialize the data to zero.
+  // memset( out, 0, pixelCount * 4 );
+
+  // Finally, the image data.
+  if (compression) {
+    // RLE as used by .PSD and .TIFF
+    // Loop until you get the number of unpacked bytes you are expecting:
+    //     Read the next source byte into n.
+    //     If n is between 0 and 127 inclusive, copy the next n+1 bytes literally.
+    //     Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times.
+    //     Else if n is 128, noop.
+    // Endloop
+
+    // The RLE-compressed data is preceded by a 2-byte data count for each row
+    // in the data, which we're going to just skip.
+    stbi__skip(s, h * channelCount * 2);
+
+    // Read the RLE data by channel.
+    for (channel = 0; channel < 4; channel++) {
+      stbi_uc *p;
+
+      p = out + channel;
+      if (channel >= channelCount) {
+        // Fill this channel with default data.
+        for (i = 0; i < pixelCount; i++, p += 4)
+          *p = (channel == 3 ? 255 : 0);
+      } else {
+        // Read the RLE data.
+        if (!stbi__psd_decode_rle(s, p, pixelCount)) {
+          STBI_FREE(out);
+          return stbi__errpuc("corrupt", "bad RLE data");
+        }
+      }
+    }
 
-static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
-{
-   int pixelCount;
-   int channelCount, compression;
-   int channel, i;
-   int bitdepth;
-   int w,h;
-   stbi_uc *out;
-   STBI_NOTUSED(ri);
-
-   // Check identifier
-   if (stbi__get32be(s) != 0x38425053)   // "8BPS"
-      return stbi__errpuc("not PSD", "Corrupt PSD image");
-
-   // Check file type version.
-   if (stbi__get16be(s) != 1)
-      return stbi__errpuc("wrong version", "Unsupported version of PSD image");
-
-   // Skip 6 reserved bytes.
-   stbi__skip(s, 6 );
-
-   // Read the number of channels (R, G, B, A, etc).
-   channelCount = stbi__get16be(s);
-   if (channelCount < 0 || channelCount > 16)
-      return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image");
-
-   // Read the rows and columns of the image.
-   h = stbi__get32be(s);
-   w = stbi__get32be(s);
-
-   // Make sure the depth is 8 bits.
-   bitdepth = stbi__get16be(s);
-   if (bitdepth != 8 && bitdepth != 16)
-      return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit");
-
-   // Make sure the color mode is RGB.
-   // Valid options are:
-   //   0: Bitmap
-   //   1: Grayscale
-   //   2: Indexed color
-   //   3: RGB color
-   //   4: CMYK color
-   //   7: Multichannel
-   //   8: Duotone
-   //   9: Lab color
-   if (stbi__get16be(s) != 3)
-      return stbi__errpuc("wrong color format", "PSD is not in RGB color format");
-
-   // Skip the Mode Data.  (It's the palette for indexed color; other info for other modes.)
-   stbi__skip(s,stbi__get32be(s) );
-
-   // Skip the image resources.  (resolution, pen tool paths, etc)
-   stbi__skip(s, stbi__get32be(s) );
-
-   // Skip the reserved data.
-   stbi__skip(s, stbi__get32be(s) );
-
-   // Find out if the data is compressed.
-   // Known values:
-   //   0: no compression
-   //   1: RLE compressed
-   compression = stbi__get16be(s);
-   if (compression > 1)
-      return stbi__errpuc("bad compression", "PSD has an unknown compression format");
-
-   // Check size
-   if (!stbi__mad3sizes_valid(4, w, h, 0))
-      return stbi__errpuc("too large", "Corrupt PSD");
-
-   // Create the destination image.
-
-   if (!compression && bitdepth == 16 && bpc == 16) {
-      out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0);
-      ri->bits_per_channel = 16;
-   } else
-      out = (stbi_uc *) stbi__malloc(4 * w*h);
-
-   if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   pixelCount = w*h;
-
-   // Initialize the data to zero.
-   //memset( out, 0, pixelCount * 4 );
-
-   // Finally, the image data.
-   if (compression) {
-      // RLE as used by .PSD and .TIFF
-      // Loop until you get the number of unpacked bytes you are expecting:
-      //     Read the next source byte into n.
-      //     If n is between 0 and 127 inclusive, copy the next n+1 bytes literally.
-      //     Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times.
-      //     Else if n is 128, noop.
-      // Endloop
-
-      // The RLE-compressed data is preceded by a 2-byte data count for each row in the data,
-      // which we're going to just skip.
-      stbi__skip(s, h * channelCount * 2 );
-
-      // Read the RLE data by channel.
-      for (channel = 0; channel < 4; channel++) {
-         stbi_uc *p;
-
-         p = out+channel;
-         if (channel >= channelCount) {
-            // Fill this channel with default data.
+  } else {
+    // We're at the raw image data.  It's each channel in order (Red, Green,
+    // Blue, Alpha, ...) where each channel consists of an 8-bit (or 16-bit)
+    // value for each pixel in the image.
+
+    // Read the data by channel.
+    for (channel = 0; channel < 4; channel++) {
+      if (channel >= channelCount) {
+        // Fill this channel with default data.
+        if (bitdepth == 16 && bpc == 16) {
+          stbi__uint16 *q = ((stbi__uint16 *)out) + channel;
+          stbi__uint16 val = channel == 3 ? 65535 : 0;
+          for (i = 0; i < pixelCount; i++, q += 4)
+            *q = val;
+        } else {
+          stbi_uc *p = out + channel;
+          stbi_uc val = channel == 3 ? 255 : 0;
+          for (i = 0; i < pixelCount; i++, p += 4)
+            *p = val;
+        }
+      } else {
+        if (ri->bits_per_channel == 16) { // output bpc
+          stbi__uint16 *q = ((stbi__uint16 *)out) + channel;
+          for (i = 0; i < pixelCount; i++, q += 4)
+            *q = (stbi__uint16)stbi__get16be(s);
+        } else {
+          stbi_uc *p = out + channel;
+          if (bitdepth == 16) { // input bpc
             for (i = 0; i < pixelCount; i++, p += 4)
-               *p = (channel == 3 ? 255 : 0);
-         } else {
-            // Read the RLE data.
-            if (!stbi__psd_decode_rle(s, p, pixelCount)) {
-               STBI_FREE(out);
-               return stbi__errpuc("corrupt", "bad RLE data");
-            }
-         }
+              *p = (stbi_uc)(stbi__get16be(s) >> 8);
+          } else {
+            for (i = 0; i < pixelCount; i++, p += 4)
+              *p = stbi__get8(s);
+          }
+        }
       }
-
-   } else {
-      // We're at the raw image data.  It's each channel in order (Red, Green, Blue, Alpha, ...)
-      // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image.
-
-      // Read the data by channel.
-      for (channel = 0; channel < 4; channel++) {
-         if (channel >= channelCount) {
-            // Fill this channel with default data.
-            if (bitdepth == 16 && bpc == 16) {
-               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
-               stbi__uint16 val = channel == 3 ? 65535 : 0;
-               for (i = 0; i < pixelCount; i++, q += 4)
-                  *q = val;
-            } else {
-               stbi_uc *p = out+channel;
-               stbi_uc val = channel == 3 ? 255 : 0;
-               for (i = 0; i < pixelCount; i++, p += 4)
-                  *p = val;
-            }
-         } else {
-            if (ri->bits_per_channel == 16) {    // output bpc
-               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
-               for (i = 0; i < pixelCount; i++, q += 4)
-                  *q = (stbi__uint16) stbi__get16be(s);
-            } else {
-               stbi_uc *p = out+channel;
-               if (bitdepth == 16) {  // input bpc
-                  for (i = 0; i < pixelCount; i++, p += 4)
-                     *p = (stbi_uc) (stbi__get16be(s) >> 8);
-               } else {
-                  for (i = 0; i < pixelCount; i++, p += 4)
-                     *p = stbi__get8(s);
-               }
-            }
-         }
+    }
+  }
+
+  // remove weird white matte from PSD
+  if (channelCount >= 4) {
+    if (ri->bits_per_channel == 16) {
+      for (i = 0; i < w * h; ++i) {
+        stbi__uint16 *pixel = (stbi__uint16 *)out + 4 * i;
+        if (pixel[3] != 0 && pixel[3] != 65535) {
+          float a = pixel[3] / 65535.0f;
+          float ra = 1.0f / a;
+          float inv_a = 65535.0f * (1 - ra);
+          pixel[0] = (stbi__uint16)(pixel[0] * ra + inv_a);
+          pixel[1] = (stbi__uint16)(pixel[1] * ra + inv_a);
+          pixel[2] = (stbi__uint16)(pixel[2] * ra + inv_a);
+        }
       }
-   }
-
-   // remove weird white matte from PSD
-   if (channelCount >= 4) {
-      if (ri->bits_per_channel == 16) {
-         for (i=0; i < w*h; ++i) {
-            stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i;
-            if (pixel[3] != 0 && pixel[3] != 65535) {
-               float a = pixel[3] / 65535.0f;
-               float ra = 1.0f / a;
-               float inv_a = 65535.0f * (1 - ra);
-               pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a);
-               pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a);
-               pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a);
-            }
-         }
-      } else {
-         for (i=0; i < w*h; ++i) {
-            unsigned char *pixel = out + 4*i;
-            if (pixel[3] != 0 && pixel[3] != 255) {
-               float a = pixel[3] / 255.0f;
-               float ra = 1.0f / a;
-               float inv_a = 255.0f * (1 - ra);
-               pixel[0] = (unsigned char) (pixel[0]*ra + inv_a);
-               pixel[1] = (unsigned char) (pixel[1]*ra + inv_a);
-               pixel[2] = (unsigned char) (pixel[2]*ra + inv_a);
-            }
-         }
+    } else {
+      for (i = 0; i < w * h; ++i) {
+        unsigned char *pixel = out + 4 * i;
+        if (pixel[3] != 0 && pixel[3] != 255) {
+          float a = pixel[3] / 255.0f;
+          float ra = 1.0f / a;
+          float inv_a = 255.0f * (1 - ra);
+          pixel[0] = (unsigned char)(pixel[0] * ra + inv_a);
+          pixel[1] = (unsigned char)(pixel[1] * ra + inv_a);
+          pixel[2] = (unsigned char)(pixel[2] * ra + inv_a);
+        }
       }
-   }
-
-   // convert to desired output format
-   if (req_comp && req_comp != 4) {
-      if (ri->bits_per_channel == 16)
-         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h);
-      else
-         out = stbi__convert_format(out, 4, req_comp, w, h);
-      if (out == NULL) return out; // stbi__convert_format frees input on failure
-   }
-
-   if (comp) *comp = 4;
-   *y = h;
-   *x = w;
-
-   return out;
+    }
+  }
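+  // (The math above inverts compositing against a white matte:
+  //    stored = a*c + (1-a)*max  =>  c = stored*ra + max*(1-ra), ra = 1/a,
+  //  i.e. exactly pixel[i]*ra + inv_a, with max = 65535 or 255.)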
+
+  // convert to desired output format
+  if (req_comp && req_comp != 4) {
+    if (ri->bits_per_channel == 16)
+      out = (stbi_uc *)stbi__convert_format16((stbi__uint16 *)out, 4, req_comp,
+                                              w, h);
+    else
+      out = stbi__convert_format(out, 4, req_comp, w, h);
+    if (out == NULL)
+      return out; // stbi__convert_format frees input on failure
+  }
+
+  if (comp)
+    *comp = 4;
+  *y = h;
+  *x = w;
+
+  return out;
 }
 #endif
 
@@ -5978,211 +6645,216 @@ static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req
 // See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/
 
 #ifndef STBI_NO_PIC
-static int stbi__pic_is4(stbi__context *s,const char *str)
-{
-   int i;
-   for (i=0; i<4; ++i)
-      if (stbi__get8(s) != (stbi_uc)str[i])
-         return 0;
+static int stbi__pic_is4(stbi__context *s, const char *str) {
+  int i;
+  for (i = 0; i < 4; ++i)
+    if (stbi__get8(s) != (stbi_uc)str[i])
+      return 0;
 
-   return 1;
+  return 1;
 }
 
-static int stbi__pic_test_core(stbi__context *s)
-{
-   int i;
+static int stbi__pic_test_core(stbi__context *s) {
+  int i;
 
-   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34"))
-      return 0;
+  if (!stbi__pic_is4(s, "\x53\x80\xF6\x34"))
+    return 0;
 
-   for(i=0;i<84;++i)
-      stbi__get8(s);
+  for (i = 0; i < 84; ++i)
+    stbi__get8(s);
 
-   if (!stbi__pic_is4(s,"PICT"))
-      return 0;
+  if (!stbi__pic_is4(s, "PICT"))
+    return 0;
 
-   return 1;
+  return 1;
 }
 
-typedef struct
-{
-   stbi_uc size,type,channel;
+typedef struct {
+  stbi_uc size, type, channel;
 } stbi__pic_packet;
 
-static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest)
-{
-   int mask=0x80, i;
+static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest) {
+  int mask = 0x80, i;
 
-   for (i=0; i<4; ++i, mask>>=1) {
-      if (channel & mask) {
-         if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short");
-         dest[i]=stbi__get8(s);
-      }
-   }
+  for (i = 0; i < 4; ++i, mask >>= 1) {
+    if (channel & mask) {
+      if (stbi__at_eof(s))
+        return stbi__errpuc("bad file", "PIC file too short");
+      dest[i] = stbi__get8(s);
+    }
+  }
 
-   return dest;
+  return dest;
 }
 
-static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src)
-{
-   int mask=0x80,i;
+static void stbi__copyval(int channel, stbi_uc *dest, const stbi_uc *src) {
+  int mask = 0x80, i;
 
-   for (i=0;i<4; ++i, mask>>=1)
-      if (channel&mask)
-         dest[i]=src[i];
+  for (i = 0; i < 4; ++i, mask >>= 1)
+    if (channel & mask)
+      dest[i] = src[i];
 }
 
-static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result)
-{
-   int act_comp=0,num_packets=0,y,chained;
-   stbi__pic_packet packets[10];
+static stbi_uc *stbi__pic_load_core(stbi__context *s, int width, int height,
+                                    int *comp, stbi_uc *result) {
+  int act_comp = 0, num_packets = 0, y, chained;
+  stbi__pic_packet packets[10];
 
-   // this will (should...) cater for even some bizarre stuff like having data
-    // for the same channel in multiple packets.
-   do {
-      stbi__pic_packet *packet;
+  // this will (should...) cater for even some bizarre stuff like having data
+  // for the same channel in multiple packets.
+  do {
+    stbi__pic_packet *packet;
 
-      if (num_packets==sizeof(packets)/sizeof(packets[0]))
-         return stbi__errpuc("bad format","too many packets");
+    if (num_packets == sizeof(packets) / sizeof(packets[0]))
+      return stbi__errpuc("bad format", "too many packets");
 
-      packet = &packets[num_packets++];
+    packet = &packets[num_packets++];
 
-      chained = stbi__get8(s);
-      packet->size    = stbi__get8(s);
-      packet->type    = stbi__get8(s);
-      packet->channel = stbi__get8(s);
+    chained = stbi__get8(s);
+    packet->size = stbi__get8(s);
+    packet->type = stbi__get8(s);
+    packet->channel = stbi__get8(s);
 
-      act_comp |= packet->channel;
+    act_comp |= packet->channel;
 
-      if (stbi__at_eof(s))          return stbi__errpuc("bad file","file too short (reading packets)");
-      if (packet->size != 8)  return stbi__errpuc("bad format","packet isn't 8bpp");
-   } while (chained);
+    if (stbi__at_eof(s))
+      return stbi__errpuc("bad file", "file too short (reading packets)");
+    if (packet->size != 8)
+      return stbi__errpuc("bad format", "packet isn't 8bpp");
+  } while (chained);
 
-   *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
+  *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
 
-   for(y=0; y<height; ++y) {
-      int packet_idx;
+  for (y = 0; y < height; ++y) {
+    int packet_idx;
 
-      for(packet_idx=0; packet_idx < num_packets; ++packet_idx) {
-         stbi__pic_packet *packet = &packets[packet_idx];
-         stbi_uc *dest = result+y*width*4;
+    for (packet_idx = 0; packet_idx < num_packets; ++packet_idx) {
+      stbi__pic_packet *packet = &packets[packet_idx];
+      stbi_uc *dest = result + y * width * 4;
 
-         switch (packet->type) {
-            default:
-               return stbi__errpuc("bad format","packet has bad compression type");
+      switch (packet->type) {
+      default:
+        return stbi__errpuc("bad format", "packet has bad compression type");
 
-            case 0: {//uncompressed
-               int x;
+      case 0: { // uncompressed
+        int x;
 
-               for(x=0;x<width;++x, dest+=4)
-                  if (!stbi__readval(s,packet->channel,dest))
-                     return 0;
-               break;
-            }
+        for (x = 0; x < width; ++x, dest += 4)
+          if (!stbi__readval(s, packet->channel, dest))
+            return 0;
+        break;
+      }
 
-            case 1://Pure RLE
-               {
-                  int left=width, i;
-
-                  while (left>0) {
-                     stbi_uc count,value[4];
-
-                     count=stbi__get8(s);
-                     if (stbi__at_eof(s))   return stbi__errpuc("bad file","file too short (pure read count)");
-
-                     if (count > left)
-                        count = (stbi_uc) left;
-
-                     if (!stbi__readval(s,packet->channel,value))  return 0;
-
-                     for(i=0; i<count; ++i,dest+=4)
-                        stbi__copyval(packet->channel,dest,value);
-                     left -= count;
-                  }
-               }
-               break;
-
-            case 2: {//Mixed RLE
-               int left=width;
-               while (left>0) {
-                  int count = stbi__get8(s), i;
-                  if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (mixed read count)");
-
-                  if (count >= 128) { // Repeated
-                     stbi_uc value[4];
-
-                     if (count==128)
-                        count = stbi__get16be(s);
-                     else
-                        count -= 127;
-                     if (count > left)
-                        return stbi__errpuc("bad file","scanline overrun");
-
-                     if (!stbi__readval(s,packet->channel,value))
-                        return 0;
-
-                     for(i=0;i<count;++i, dest += 4)
-                        stbi__copyval(packet->channel,dest,value);
-                  } else { // Raw
-                     ++count;
-                     if (count>left) return stbi__errpuc("bad file","scanline overrun");
-
-                     for(i=0;i<count;++i, dest+=4)
-                        if (!stbi__readval(s,packet->channel,dest))
-                           return 0;
-                  }
-                  left-=count;
-               }
-               break;
-            }
-         }
+      case 1: // Pure RLE
+      {
+        int left = width, i;
+
+        while (left > 0) {
+          stbi_uc count, value[4];
+
+          count = stbi__get8(s);
+          if (stbi__at_eof(s))
+            return stbi__errpuc("bad file", "file too short (pure read count)");
+
+          if (count > left)
+            count = (stbi_uc)left;
+
+          if (!stbi__readval(s, packet->channel, value))
+            return 0;
+
+          for (i = 0; i < count; ++i, dest += 4)
+            stbi__copyval(packet->channel, dest, value);
+          left -= count;
+        }
+      } break;
+
+      case 2: { // Mixed RLE
+        int left = width;
+        while (left > 0) {
+          int count = stbi__get8(s), i;
+          if (stbi__at_eof(s))
+            return stbi__errpuc("bad file",
+                                "file too short (mixed read count)");
+
+          if (count >= 128) { // Repeated
+            stbi_uc value[4];
+
+            if (count == 128)
+              count = stbi__get16be(s);
+            else
+              count -= 127;
+            if (count > left)
+              return stbi__errpuc("bad file", "scanline overrun");
+
+            if (!stbi__readval(s, packet->channel, value))
+              return 0;
+
+            for (i = 0; i < count; ++i, dest += 4)
+              stbi__copyval(packet->channel, dest, value);
+          } else { // Raw
+            ++count;
+            if (count > left)
+              return stbi__errpuc("bad file", "scanline overrun");
+
+            for (i = 0; i < count; ++i, dest += 4)
+              if (!stbi__readval(s, packet->channel, dest))
+                return 0;
+          }
+          left -= count;
+        }
+        break;
       }
-   }
+      }
+    }
+  }
 
-   return result;
+  return result;
 }
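
// Aside (hedged sketch, not part of the original source): the count-byte
// grammar that `case 2` (Mixed RLE) above decodes, restated over a plain
// buffer. A byte c >= 128 starts a run -- c == 128 is followed by a 16-bit
// big-endian run length, otherwise the length is c - 127 -- while c < 128
// announces c + 1 literal pixel values.
static int pic_mixed_rle_count(const unsigned char *p, int *consumed,
                               int *is_run) {
  int c = p[0];
  if (c >= 128) { // repeated value follows
    *is_run = 1;
    if (c == 128) {
      *consumed = 3; // count byte + 16-bit big-endian length
      return (p[1] << 8) | p[2];
    }
    *consumed = 1;
    return c - 127;
  }
  *is_run = 0; // raw: c + 1 literal values follow
  *consumed = 1;
  return c + 1;
}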
 
-static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
-{
-   stbi_uc *result;
-   int i, x,y, internal_comp;
-   STBI_NOTUSED(ri);
+static void *stbi__pic_load(stbi__context *s, int *px, int *py, int *comp,
+                            int req_comp, stbi__result_info *ri) {
+  stbi_uc *result;
+  int i, x, y, internal_comp;
+  STBI_NOTUSED(ri);
 
-   if (!comp) comp = &internal_comp;
+  if (!comp)
+    comp = &internal_comp;
 
-   for (i=0; i<92; ++i)
-      stbi__get8(s);
+  for (i = 0; i < 92; ++i)
+    stbi__get8(s);
 
-   x = stbi__get16be(s);
-   y = stbi__get16be(s);
-   if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
-   if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
+  x = stbi__get16be(s);
+  y = stbi__get16be(s);
+  if (stbi__at_eof(s))
+    return stbi__errpuc("bad file", "file too short (pic header)");
+  if (!stbi__mad3sizes_valid(x, y, 4, 0))
+    return stbi__errpuc("too large", "PIC image too large to decode");
 
-   stbi__get32be(s); //skip `ratio'
-   stbi__get16be(s); //skip `fields'
-   stbi__get16be(s); //skip `pad'
+  stbi__get32be(s); // skip `ratio'
+  stbi__get16be(s); // skip `fields'
+  stbi__get16be(s); // skip `pad'
 
-   // intermediate buffer is RGBA
-   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
-   memset(result, 0xff, x*y*4);
+  // intermediate buffer is RGBA
+  result = (stbi_uc *)stbi__malloc_mad3(x, y, 4, 0);
+  memset(result, 0xff, x * y * 4);
 
-   if (!stbi__pic_load_core(s,x,y,comp, result)) {
-      STBI_FREE(result);
-      result=0;
-   }
-   *px = x;
-   *py = y;
-   if (req_comp == 0) req_comp = *comp;
-   result=stbi__convert_format(result,4,req_comp,x,y);
+  if (!stbi__pic_load_core(s, x, y, comp, result)) {
+    STBI_FREE(result);
+    result = 0;
+  }
+  *px = x;
+  *py = y;
+  if (req_comp == 0)
+    req_comp = *comp;
+  result = stbi__convert_format(result, 4, req_comp, x, y);
 
-   return result;
+  return result;
 }
 
-static int stbi__pic_test(stbi__context *s)
-{
-   int r = stbi__pic_test_core(s);
-   stbi__rewind(s);
-   return r;
+static int stbi__pic_test(stbi__context *s) {
+  int r = stbi__pic_test_core(s);
+  stbi__rewind(s);
+  return r;
 }
 #endif
 
@@ -6190,495 +6862,517 @@ static int stbi__pic_test(stbi__context *s)
 // GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb
 
 #ifndef STBI_NO_GIF
-typedef struct
-{
-   stbi__int16 prefix;
-   stbi_uc first;
-   stbi_uc suffix;
+typedef struct {
+  stbi__int16 prefix;
+  stbi_uc first;
+  stbi_uc suffix;
 } stbi__gif_lzw;
 
-typedef struct
-{
-   int w,h;
-   stbi_uc *out;                 // output buffer (always 4 components)
-   stbi_uc *background;          // The current "background" as far as a gif is concerned
-   stbi_uc *history; 
-   int flags, bgindex, ratio, transparent, eflags;
-   stbi_uc  pal[256][4];
-   stbi_uc lpal[256][4];
-   stbi__gif_lzw codes[8192];
-   stbi_uc *color_table;
-   int parse, step;
-   int lflags;
-   int start_x, start_y;
-   int max_x, max_y;
-   int cur_x, cur_y;
-   int line_size;
-   int delay;
+typedef struct {
+  int w, h;
+  stbi_uc *out;        // output buffer (always 4 components)
+  stbi_uc *background; // The current "background" as far as a gif is concerned
+  stbi_uc *history;
+  int flags, bgindex, ratio, transparent, eflags;
+  stbi_uc pal[256][4];
+  stbi_uc lpal[256][4];
+  stbi__gif_lzw codes[8192];
+  stbi_uc *color_table;
+  int parse, step;
+  int lflags;
+  int start_x, start_y;
+  int max_x, max_y;
+  int cur_x, cur_y;
+  int line_size;
+  int delay;
 } stbi__gif;
 
-static int stbi__gif_test_raw(stbi__context *s)
-{
-   int sz;
-   if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') return 0;
-   sz = stbi__get8(s);
-   if (sz != '9' && sz != '7') return 0;
-   if (stbi__get8(s) != 'a') return 0;
-   return 1;
-}
-
-static int stbi__gif_test(stbi__context *s)
-{
-   int r = stbi__gif_test_raw(s);
-   stbi__rewind(s);
-   return r;
-}
+static int stbi__gif_test_raw(stbi__context *s) {
+  int sz;
+  if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' ||
+      stbi__get8(s) != '8')
+    return 0;
+  sz = stbi__get8(s);
+  if (sz != '9' && sz != '7')
+    return 0;
+  if (stbi__get8(s) != 'a')
+    return 0;
+  return 1;
+}
+
+static int stbi__gif_test(stbi__context *s) {
+  int r = stbi__gif_test_raw(s);
+  stbi__rewind(s);
+  return r;
+}
+
+static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4],
+                                       int num_entries, int transp) {
+  int i;
+  for (i = 0; i < num_entries; ++i) {
+    pal[i][2] = stbi__get8(s);
+    pal[i][1] = stbi__get8(s);
+    pal[i][0] = stbi__get8(s);
+    pal[i][3] = transp == i ? 0 : 255;
+  }
+}
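
// Aside (illustrative, not part of the original source): the loop above
// stores each GIF palette entry reversed -- the file's R,G,B bytes land in
// pal[i][2], pal[i][1], pal[i][0] -- and stbi__out_gif_code below undoes the
// swap when writing pixels, so the final output buffer is ordinary RGBA:
static void gif_palette_to_rgba(const unsigned char pal_entry[4],
                                unsigned char rgba[4]) {
  rgba[0] = pal_entry[2]; // R was read first, into slot 2
  rgba[1] = pal_entry[1]; // G
  rgba[2] = pal_entry[0]; // B was read last, into slot 0
  rgba[3] = pal_entry[3]; // A (0 for the transparent index, else 255)
}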
+
+static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp,
+                            int is_info) {
+  stbi_uc version;
+  if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' ||
+      stbi__get8(s) != '8')
+    return stbi__err("not GIF", "Corrupt GIF");
+
+  version = stbi__get8(s);
+  if (version != '7' && version != '9')
+    return stbi__err("not GIF", "Corrupt GIF");
+  if (stbi__get8(s) != 'a')
+    return stbi__err("not GIF", "Corrupt GIF");
+
+  stbi__g_failure_reason = "";
+  g->w = stbi__get16le(s);
+  g->h = stbi__get16le(s);
+  g->flags = stbi__get8(s);
+  g->bgindex = stbi__get8(s);
+  g->ratio = stbi__get8(s);
+  g->transparent = -1;
+
+  if (comp != 0)
+    *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the
+               // comments
+
+  if (is_info)
+    return 1;
+
+  if (g->flags & 0x80)
+    stbi__gif_parse_colortable(s, g->pal, 2 << (g->flags & 7), -1);
+
+  return 1;
+}
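
// Aside (illustrative, not part of the original source): the logical-screen
// flags byte tested above packs the global color table descriptor -- bit
// 0x80 says a table follows the header, and the low three bits give its
// size as 2 << (flags & 7) entries, i.e. 2 up to 256:
static int gif_global_table_entries(int flags) {
  return (flags & 0x80) ? (2 << (flags & 7)) : 0; // 0 means no global table
}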
+
+static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp) {
+  stbi__gif *g = (stbi__gif *)stbi__malloc(sizeof(stbi__gif));
+  if (!stbi__gif_header(s, g, comp, 1)) {
+    STBI_FREE(g);
+    stbi__rewind(s);
+    return 0;
+  }
+  if (x)
+    *x = g->w;
+  if (y)
+    *y = g->h;
+  STBI_FREE(g);
+  return 1;
+}
+
+static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code) {
+  stbi_uc *p, *c;
+  int idx;
+
+  // recurse to decode the prefixes, since the linked-list is backwards,
+  // and working backwards through an interleaved image would be nasty
+  if (g->codes[code].prefix >= 0)
+    stbi__out_gif_code(g, g->codes[code].prefix);
+
+  if (g->cur_y >= g->max_y)
+    return;
+
+  idx = g->cur_x + g->cur_y;
+  p = &g->out[idx];
+  g->history[idx / 4] = 1;
+
+  c = &g->color_table[g->codes[code].suffix * 4];
+  if (c[3] > 128) { // don't render transparent pixels;
+    p[0] = c[2];
+    p[1] = c[1];
+    p[2] = c[0];
+    p[3] = c[3];
+  }
+  g->cur_x += 4;
+
+  if (g->cur_x >= g->max_x) {
+    g->cur_x = g->start_x;
+    g->cur_y += g->step;
+
+    while (g->cur_y >= g->max_y && g->parse > 0) {
+      g->step = (1 << g->parse) * g->line_size;
+      g->cur_y = g->start_y + (g->step >> 1);
+      --g->parse;
+    }
+  }
+}
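
// Aside (hedged restatement, not part of the original source): the
// parse/step bookkeeping above walks the standard GIF interlace schedule --
// four passes over the rows starting at 0, 4, 2, 1 with spacings 8, 8, 4, 2.
// The update "step = (1 << parse) * line_size; cur_y = start_y + (step >> 1)"
// produces exactly these pass starts as parse counts down from 3:
static void gif_interlaced_row_order(int height, void (*emit)(int row)) {
  static const int start[4] = {0, 4, 2, 1};
  static const int step[4] = {8, 8, 4, 2};
  int pass, y;
  for (pass = 0; pass < 4; ++pass)
    for (y = start[pass]; y < height; y += step[pass])
      emit(y);
}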
+
+static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g) {
+  stbi_uc lzw_cs;
+  stbi__int32 len, init_code;
+  stbi__uint32 first;
+  stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
+  stbi__gif_lzw *p;
+
+  lzw_cs = stbi__get8(s);
+  if (lzw_cs > 12)
+    return NULL;
+  clear = 1 << lzw_cs;
+  first = 1;
+  codesize = lzw_cs + 1;
+  codemask = (1 << codesize) - 1;
+  bits = 0;
+  valid_bits = 0;
+  for (init_code = 0; init_code < clear; init_code++) {
+    g->codes[init_code].prefix = -1;
+    g->codes[init_code].first = (stbi_uc)init_code;
+    g->codes[init_code].suffix = (stbi_uc)init_code;
+  }
+
+  // support no starting clear code
+  avail = clear + 2;
+  oldcode = -1;
+
+  len = 0;
+  for (;;) {
+    if (valid_bits < codesize) {
+      if (len == 0) {
+        len = stbi__get8(s); // start new block
+        if (len == 0)
+          return g->out;
+      }
+      --len;
+      bits |= (stbi__int32)stbi__get8(s) << valid_bits;
+      valid_bits += 8;
+    } else {
+      stbi__int32 code = bits & codemask;
+      bits >>= codesize;
+      valid_bits -= codesize;
+      // @OPTIMIZE: is there some way we can accelerate the non-clear path?
+      if (code == clear) { // clear code
+        codesize = lzw_cs + 1;
+        codemask = (1 << codesize) - 1;
+        avail = clear + 2;
+        oldcode = -1;
+        first = 0;
+      } else if (code == clear + 1) { // end of stream code
+        stbi__skip(s, len);
+        while ((len = stbi__get8(s)) > 0)
+          stbi__skip(s, len);
+        return g->out;
+      } else if (code <= avail) {
+        if (first) {
+          return stbi__errpuc("no clear code", "Corrupt GIF");
+        }
 
-static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp)
-{
-   int i;
-   for (i=0; i < num_entries; ++i) {
-      pal[i][2] = stbi__get8(s);
-      pal[i][1] = stbi__get8(s);
-      pal[i][0] = stbi__get8(s);
-      pal[i][3] = transp == i ? 0 : 255;
-   }
-}
+        if (oldcode >= 0) {
+          p = &g->codes[avail++];
+          if (avail > 8192) {
+            return stbi__errpuc("too many codes", "Corrupt GIF");
+          }
 
-static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info)
-{
-   stbi_uc version;
-   if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
-      return stbi__err("not GIF", "Corrupt GIF");
+          p->prefix = (stbi__int16)oldcode;
+          p->first = g->codes[oldcode].first;
+          p->suffix = (code == avail) ? p->first : g->codes[code].first;
+        } else if (code == avail)
+          return stbi__errpuc("illegal code in raster", "Corrupt GIF");
 
-   version = stbi__get8(s);
-   if (version != '7' && version != '9')    return stbi__err("not GIF", "Corrupt GIF");
-   if (stbi__get8(s) != 'a')                return stbi__err("not GIF", "Corrupt GIF");
+        stbi__out_gif_code(g, (stbi__uint16)code);
 
-   stbi__g_failure_reason = "";
-   g->w = stbi__get16le(s);
-   g->h = stbi__get16le(s);
-   g->flags = stbi__get8(s);
-   g->bgindex = stbi__get8(s);
-   g->ratio = stbi__get8(s);
-   g->transparent = -1;
+        if ((avail & codemask) == 0 && avail <= 0x0FFF) {
+          codesize++;
+          codemask = (1 << codesize) - 1;
+        }
 
-   if (comp != 0) *comp = 4;  // can't actually tell whether it's 3 or 4 until we parse the comments
+        oldcode = code;
+      } else {
+        return stbi__errpuc("illegal code in raster", "Corrupt GIF");
+      }
+    }
+  }
+}
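
// Aside (hedged helper, not part of the original source): the code-width
// growth rule buried in the decoder above. After a new dictionary entry is
// added, the reader widens its codes by one bit exactly when the next free
// index crosses the current power of two ((avail & codemask) == 0), capped
// at GIF's 12-bit maximum (0x0FFF):
static int gif_lzw_next_codesize(int codesize, int avail) {
  int codemask = (1 << codesize) - 1;
  if ((avail & codemask) == 0 && avail <= 0x0FFF)
    return codesize + 1;
  return codesize;
}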
+
+// this function is designed to support animated gifs, although stb_image
+// doesn't support it; two_back is the image from two frames ago, used for a
+// very specific disposal format
+static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp,
+                                    int req_comp, stbi_uc *two_back) {
+  int dispose;
+  int first_frame;
+  int pi;
+  int pcount;
+  STBI_NOTUSED(req_comp);
+
+  // on first frame, any non-written pixels get the background colour
+  // (non-transparent)
+  first_frame = 0;
+  if (g->out == 0) {
+    if (!stbi__gif_header(s, g, comp, 0))
+      return 0; // stbi__g_failure_reason set by stbi__gif_header
+    if (!stbi__mad3sizes_valid(4, g->w, g->h, 0))
+      return stbi__errpuc("too large", "GIF image is too large");
+    pcount = g->w * g->h;
+    g->out = (stbi_uc *)stbi__malloc(4 * pcount);
+    g->background = (stbi_uc *)stbi__malloc(4 * pcount);
+    g->history = (stbi_uc *)stbi__malloc(pcount);
+    if (!g->out || !g->background || !g->history)
+      return stbi__errpuc("outofmem", "Out of memory");
 
-   if (is_info) return 1;
+    // image is treated as "transparent" at the start - ie, nothing overwrites
+    // the current background; background colour is only used for pixels that
+    // are not rendered in the first frame; after that, "background" refers to
+    // the color that was there the previous frame.
+    memset(g->out, 0x00, 4 * pcount);
+    memset(g->background, 0x00,
+           4 * pcount); // state of the background (starts transparent)
+    memset(g->history, 0x00,
+           pcount); // pixels that were affected previous frame
+    first_frame = 1;
+  } else {
+    // second frame - how do we dispose of the previous one?
+    dispose = (g->eflags & 0x1C) >> 2;
+    pcount = g->w * g->h;
+
+    if ((dispose == 3) && (two_back == 0)) {
+      dispose = 2; // if I don't have an image to revert back to, default to the
+                   // old background
+    }
 
-   if (g->flags & 0x80)
-      stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1);
+    if (dispose == 3) { // use previous graphic
+      for (pi = 0; pi < pcount; ++pi) {
+        if (g->history[pi]) {
+          memcpy(&g->out[pi * 4], &two_back[pi * 4], 4);
+        }
+      }
+    } else if (dispose == 2) {
+      // restore what was changed last frame to background before that frame;
+      for (pi = 0; pi < pcount; ++pi) {
+        if (g->history[pi]) {
+          memcpy(&g->out[pi * 4], &g->background[pi * 4], 4);
+        }
+      }
+    } else {
+      // This is a non-disposal case either way, so just
+      // leave the pixels as is, and they will become the new background
+      // 1: do not dispose
+      // 0:  not specified.
+    }
 
-   return 1;
-}
+    // background is what out is after the undoing of the previous frame;
+    memcpy(g->background, g->out, 4 * g->w * g->h);
+  }
+
+  // clear my history;
+  memset(g->history, 0x00,
+         g->w * g->h); // pixels that were affected previous frame
+
+  for (;;) {
+    int tag = stbi__get8(s);
+    switch (tag) {
+    case 0x2C: /* Image Descriptor */
+    {
+      stbi__int32 x, y, w, h;
+      stbi_uc *o;
+
+      x = stbi__get16le(s);
+      y = stbi__get16le(s);
+      w = stbi__get16le(s);
+      h = stbi__get16le(s);
+      if (((x + w) > (g->w)) || ((y + h) > (g->h)))
+        return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
+
+      g->line_size = g->w * 4;
+      g->start_x = x * 4;
+      g->start_y = y * g->line_size;
+      g->max_x = g->start_x + w * 4;
+      g->max_y = g->start_y + h * g->line_size;
+      g->cur_x = g->start_x;
+      g->cur_y = g->start_y;
 
-static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
-{
-   stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
-   if (!stbi__gif_header(s, g, comp, 1)) {
-      STBI_FREE(g);
-      stbi__rewind( s );
-      return 0;
-   }
-   if (x) *x = g->w;
-   if (y) *y = g->h;
-   STBI_FREE(g);
-   return 1;
-}
+      // if the width of the specified rectangle is 0, that means
+      // we may not see *any* pixels or the image is malformed;
+      // to make sure this is caught, move the current y down to
+      // max_y (which is what out_gif_code checks).
+      if (w == 0)
+        g->cur_y = g->max_y;
 
-static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
-{
-   stbi_uc *p, *c;
-   int idx; 
-
-   // recurse to decode the prefixes, since the linked-list is backwards,
-   // and working backwards through an interleaved image would be nasty
-   if (g->codes[code].prefix >= 0)
-      stbi__out_gif_code(g, g->codes[code].prefix);
-
-   if (g->cur_y >= g->max_y) return;
-
-   idx = g->cur_x + g->cur_y; 
-   p = &g->out[idx];
-   g->history[idx / 4] = 1;  
-
-   c = &g->color_table[g->codes[code].suffix * 4];
-   if (c[3] > 128) { // don't render transparent pixels; 
-      p[0] = c[2];
-      p[1] = c[1];
-      p[2] = c[0];
-      p[3] = c[3];
-   }
-   g->cur_x += 4;
-
-   if (g->cur_x >= g->max_x) {
-      g->cur_x = g->start_x;
-      g->cur_y += g->step;
+      g->lflags = stbi__get8(s);
 
-      while (g->cur_y >= g->max_y && g->parse > 0) {
-         g->step = (1 << g->parse) * g->line_size;
-         g->cur_y = g->start_y + (g->step >> 1);
-         --g->parse;
+      if (g->lflags & 0x40) {
+        g->step = 8 * g->line_size; // first interlaced spacing
+        g->parse = 3;
+      } else {
+        g->step = g->line_size;
+        g->parse = 0;
       }
-   }
-}
 
-static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
-{
-   stbi_uc lzw_cs;
-   stbi__int32 len, init_code;
-   stbi__uint32 first;
-   stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
-   stbi__gif_lzw *p;
-
-   lzw_cs = stbi__get8(s);
-   if (lzw_cs > 12) return NULL;
-   clear = 1 << lzw_cs;
-   first = 1;
-   codesize = lzw_cs + 1;
-   codemask = (1 << codesize) - 1;
-   bits = 0;
-   valid_bits = 0;
-   for (init_code = 0; init_code < clear; init_code++) {
-      g->codes[init_code].prefix = -1;
-      g->codes[init_code].first = (stbi_uc) init_code;
-      g->codes[init_code].suffix = (stbi_uc) init_code;
-   }
-
-   // support no starting clear code
-   avail = clear+2;
-   oldcode = -1;
-
-   len = 0;
-   for(;;) {
-      if (valid_bits < codesize) {
-         if (len == 0) {
-            len = stbi__get8(s); // start new block
-            if (len == 0)
-               return g->out;
-         }
-         --len;
-         bits |= (stbi__int32) stbi__get8(s) << valid_bits;
-         valid_bits += 8;
-      } else {
-         stbi__int32 code = bits & codemask;
-         bits >>= codesize;
-         valid_bits -= codesize;
-         // @OPTIMIZE: is there some way we can accelerate the non-clear path?
-         if (code == clear) {  // clear code
-            codesize = lzw_cs + 1;
-            codemask = (1 << codesize) - 1;
-            avail = clear + 2;
-            oldcode = -1;
-            first = 0;
-         } else if (code == clear + 1) { // end of stream code
-            stbi__skip(s, len);
-            while ((len = stbi__get8(s)) > 0)
-               stbi__skip(s,len);
-            return g->out;
-         } else if (code <= avail) {
-            if (first) {
-               return stbi__errpuc("no clear code", "Corrupt GIF");
-            }
+      if (g->lflags & 0x80) {
+        stbi__gif_parse_colortable(s, g->lpal, 2 << (g->lflags & 7),
+                                   g->eflags & 0x01 ? g->transparent : -1);
+        g->color_table = (stbi_uc *)g->lpal;
+      } else if (g->flags & 0x80) {
+        g->color_table = (stbi_uc *)g->pal;
+      } else
+        return stbi__errpuc("missing color table", "Corrupt GIF");
 
-            if (oldcode >= 0) {
-               p = &g->codes[avail++];
-               if (avail > 8192) {
-                  return stbi__errpuc("too many codes", "Corrupt GIF");
-               }
+      o = stbi__process_gif_raster(s, g);
+      if (!o)
+        return NULL;
 
-               p->prefix = (stbi__int16) oldcode;
-               p->first = g->codes[oldcode].first;
-               p->suffix = (code == avail) ? p->first : g->codes[code].first;
-            } else if (code == avail)
-               return stbi__errpuc("illegal code in raster", "Corrupt GIF");
+      // if this was the first frame,
+      pcount = g->w * g->h;
+      if (first_frame && (g->bgindex > 0)) {
+        // if first frame, any pixel not drawn to gets the background color
+        for (pi = 0; pi < pcount; ++pi) {
+          if (g->history[pi] == 0) {
+            g->pal[g->bgindex][3] =
+                255; // just in case it was made transparent, undo that; it
+                     // will be reset next frame if need be;
+            memcpy(&g->out[pi * 4], &g->pal[g->bgindex], 4);
+          }
+        }
+      }
 
-            stbi__out_gif_code(g, (stbi__uint16) code);
+      return o;
+    }
 
-            if ((avail & codemask) == 0 && avail <= 0x0FFF) {
-               codesize++;
-               codemask = (1 << codesize) - 1;
+    case 0x21: // Comment Extension.
+    {
+      int len;
+      int ext = stbi__get8(s);
+      if (ext == 0xF9) { // Graphic Control Extension.
+        len = stbi__get8(s);
+        if (len == 4) {
+          g->eflags = stbi__get8(s);
+          g->delay =
+              10 * stbi__get16le(
+                       s); // delay - 1/100th of a second, saving as 1/1000ths.
+
+          // unset old transparent
+          if (g->transparent >= 0) {
+            g->pal[g->transparent][3] = 255;
+          }
+          if (g->eflags & 0x01) {
+            g->transparent = stbi__get8(s);
+            if (g->transparent >= 0) {
+              g->pal[g->transparent][3] = 0;
             }
-
-            oldcode = code;
-         } else {
-            return stbi__errpuc("illegal code in raster", "Corrupt GIF");
-         }
+          } else {
+            // don't need transparent
+            stbi__skip(s, 1);
+            g->transparent = -1;
+          }
+        } else {
+          stbi__skip(s, len);
+          break;
+        }
       }
-   }
-}
-
-// this function is designed to support animated gifs, although stb_image doesn't support it
-// two back is the image from two frames ago, used for a very specific disposal format
-static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back)
-{
-   int dispose; 
-   int first_frame; 
-   int pi; 
-   int pcount; 
-   STBI_NOTUSED(req_comp);
-
-   // on first frame, any non-written pixels get the background colour (non-transparent)
-   first_frame = 0; 
-   if (g->out == 0) {
-      if (!stbi__gif_header(s, g, comp,0)) return 0; // stbi__g_failure_reason set by stbi__gif_header
-      if (!stbi__mad3sizes_valid(4, g->w, g->h, 0))
-         return stbi__errpuc("too large", "GIF image is too large");
-      pcount = g->w * g->h;
-      g->out = (stbi_uc *) stbi__malloc(4 * pcount);
-      g->background = (stbi_uc *) stbi__malloc(4 * pcount);
-      g->history = (stbi_uc *) stbi__malloc(pcount);
-      if (!g->out || !g->background || !g->history)
-         return stbi__errpuc("outofmem", "Out of memory");
-
-      // image is treated as "transparent" at the start - ie, nothing overwrites the current background; 
-      // background colour is only used for pixels that are not rendered first frame, after that "background"
-      // color refers to the color that was there the previous frame. 
-      memset(g->out, 0x00, 4 * pcount);
-      memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent)
-      memset(g->history, 0x00, pcount);        // pixels that were affected previous frame
-      first_frame = 1; 
-   } else {
-      // second frame - how do we dispoase of the previous one?
-      dispose = (g->eflags & 0x1C) >> 2; 
-      pcount = g->w * g->h; 
-
-      if ((dispose == 3) && (two_back == 0)) {
-         dispose = 2; // if I don't have an image to revert back to, default to the old background
+      while ((len = stbi__get8(s)) != 0) {
+        stbi__skip(s, len);
       }
+      break;
+    }
 
-      if (dispose == 3) { // use previous graphic
-         for (pi = 0; pi < pcount; ++pi) {
-            if (g->history[pi]) {
-               memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 ); 
-            }
-         }
-      } else if (dispose == 2) { 
-         // restore what was changed last frame to background before that frame; 
-         for (pi = 0; pi < pcount; ++pi) {
-            if (g->history[pi]) {
-               memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 ); 
-            }
-         }
-      } else {
-         // This is a non-disposal case eithe way, so just 
-         // leave the pixels as is, and they will become the new background
-         // 1: do not dispose
-         // 0:  not specified.
-      }
+    case 0x3B:             // gif stream termination code
+      return (stbi_uc *)s; // using '1' causes warning on some compilers
 
-      // background is what out is after the undoing of the previou frame; 
-      memcpy( g->background, g->out, 4 * g->w * g->h ); 
-   }
-
-   // clear my history; 
-   memset( g->history, 0x00, g->w * g->h );        // pixels that were affected previous frame
-
-   for (;;) {
-      int tag = stbi__get8(s); 
-      switch (tag) {
-         case 0x2C: /* Image Descriptor */
-         {
-            stbi__int32 x, y, w, h;
-            stbi_uc *o;
-
-            x = stbi__get16le(s);
-            y = stbi__get16le(s);
-            w = stbi__get16le(s);
-            h = stbi__get16le(s);
-            if (((x + w) > (g->w)) || ((y + h) > (g->h)))
-               return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
-
-            g->line_size = g->w * 4;
-            g->start_x = x * 4;
-            g->start_y = y * g->line_size;
-            g->max_x   = g->start_x + w * 4;
-            g->max_y   = g->start_y + h * g->line_size;
-            g->cur_x   = g->start_x;
-            g->cur_y   = g->start_y;
-
-            // if the width of the specified rectangle is 0, that means
-            // we may not see *any* pixels or the image is malformed;
-            // to make sure this is caught, move the current y down to
-            // max_y (which is what out_gif_code checks).
-            if (w == 0)
-               g->cur_y = g->max_y;
-
-            g->lflags = stbi__get8(s);
-
-            if (g->lflags & 0x40) {
-               g->step = 8 * g->line_size; // first interlaced spacing
-               g->parse = 3;
-            } else {
-               g->step = g->line_size;
-               g->parse = 0;
-            }
+    default:
+      return stbi__errpuc("unknown code", "Corrupt GIF");
+    }
+  }
+}
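
// Aside (illustrative summary, not part of the original source): the GIF
// disposal field decoded from (g->eflags & 0x1C) >> 2 near the top of
// stbi__gif_load_next. Modes 0 and 1 keep the rendered frame as the next
// background, mode 2 restores the saved background, and mode 3 restores the
// frame from two frames back (downgraded to mode 2 above when two_back is
// not available yet):
static const char *gif_dispose_name(int dispose) {
  switch (dispose) {
  case 0: return "not specified (leave as-is)";
  case 1: return "do not dispose (leave as-is)";
  case 2: return "restore to background";
  case 3: return "restore to previous frame (two_back)";
  default: return "reserved";
  }
}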
+
+static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y,
+                                 int *z, int *comp, int req_comp) {
+  if (stbi__gif_test(s)) {
+    int layers = 0;
+    stbi_uc *u = 0;
+    stbi_uc *out = 0;
+    stbi_uc *two_back = 0;
+    stbi__gif g;
+    int stride;
+    memset(&g, 0, sizeof(g));
+    if (delays) {
+      *delays = 0;
+    }
 
-            if (g->lflags & 0x80) {
-               stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
-               g->color_table = (stbi_uc *) g->lpal;
-            } else if (g->flags & 0x80) {
-               g->color_table = (stbi_uc *) g->pal;
-            } else
-               return stbi__errpuc("missing color table", "Corrupt GIF");            
-            
-            o = stbi__process_gif_raster(s, g);
-            if (!o) return NULL;
-
-            // if this was the first frame, 
-            pcount = g->w * g->h; 
-            if (first_frame && (g->bgindex > 0)) {
-               // if first frame, any pixel not drawn to gets the background color
-               for (pi = 0; pi < pcount; ++pi) {
-                  if (g->history[pi] == 0) {
-                     g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be; 
-                     memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 ); 
-                  }
-               }
-            }
+    do {
+      u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
+      if (u == (stbi_uc *)s)
+        u = 0; // end of animated gif marker
+
+      if (u) {
+        *x = g.w;
+        *y = g.h;
+        ++layers;
+        stride = g.w * g.h * 4;
+
+        if (out) {
+          out = (stbi_uc *)STBI_REALLOC(out, layers * stride);
+          if (delays) {
+            *delays = (int *)STBI_REALLOC(*delays, sizeof(int) * layers);
+          }
+        } else {
+          out = (stbi_uc *)stbi__malloc(layers * stride);
+          if (delays) {
+            *delays = (int *)stbi__malloc(layers * sizeof(int));
+          }
+        }
+        memcpy(out + ((layers - 1) * stride), u, stride);
+        if (layers >= 2) {
+          two_back = out - 2 * stride;
+        }
 
-            return o;
-         }
-
-         case 0x21: // Comment Extension.
-         {
-            int len;
-            int ext = stbi__get8(s); 
-            if (ext == 0xF9) { // Graphic Control Extension.
-               len = stbi__get8(s);
-               if (len == 4) {
-                  g->eflags = stbi__get8(s);
-                  g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths.
-
-                  // unset old transparent
-                  if (g->transparent >= 0) {
-                     g->pal[g->transparent][3] = 255; 
-                  } 
-                  if (g->eflags & 0x01) {
-                     g->transparent = stbi__get8(s);
-                     if (g->transparent >= 0) {
-                        g->pal[g->transparent][3] = 0; 
-                     }
-                  } else {
-                     // don't need transparent
-                     stbi__skip(s, 1); 
-                     g->transparent = -1; 
-                  }
-               } else {
-                  stbi__skip(s, len);
-                  break;
-               }
-            } 
-            while ((len = stbi__get8(s)) != 0) {
-               stbi__skip(s, len);
-            }
-            break;
-         }
+        if (delays) {
+          (*delays)[layers - 1U] = g.delay;
+        }
+      }
+    } while (u != 0);
 
-         case 0x3B: // gif stream termination code
-            return (stbi_uc *) s; // using '1' causes warning on some compilers
+    // free temp buffer;
+    STBI_FREE(g.out);
+    STBI_FREE(g.history);
+    STBI_FREE(g.background);
 
-         default:
-            return stbi__errpuc("unknown code", "Corrupt GIF");
-      }
-   }
-}
+    // do the final conversion after loading everything;
+    if (req_comp && req_comp != 4)
+      out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
 
-static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
-{
-   if (stbi__gif_test(s)) {
-      int layers = 0; 
-      stbi_uc *u = 0;
-      stbi_uc *out = 0;
-      stbi_uc *two_back = 0; 
-      stbi__gif g;
-      int stride; 
-      memset(&g, 0, sizeof(g));
-      if (delays) {
-         *delays = 0; 
-      }
+    *z = layers;
+    return out;
+  } else {
+    return stbi__errpuc("not GIF", "Image was not as a gif type.");
+  }
+}
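
// Aside (hedged usage sketch, not part of the original source): driving the
// multi-frame loader above through the public stbi_load_gif_from_memory
// wrapper declared elsewhere in this header; assumes <stdio.h> for printf.
// Frames come back as one contiguous block of z RGBA layers, and *delays
// receives one entry per frame in milliseconds (g->delay stores 10x the
// GIF's centisecond value).
static void gif_dump_frames(const unsigned char *buf, int len) {
  int *delays = NULL;
  int x, y, z, comp, i;
  unsigned char *frames =
      stbi_load_gif_from_memory(buf, len, &delays, &x, &y, &z, &comp, 4);
  if (!frames)
    return;
  for (i = 0; i < z; ++i)
    printf("frame %d: %dx%d RGBA at byte offset %d, delay %d ms\n", i, x, y,
           i * x * y * 4, delays[i]);
  STBI_FREE(frames);
  STBI_FREE(delays);
}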
 
-      do {
-         u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
-         if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
-
-         if (u) {
-            *x = g.w;
-            *y = g.h;
-            ++layers; 
-            stride = g.w * g.h * 4; 
-         
-            if (out) {
-               out = (stbi_uc*) STBI_REALLOC( out, layers * stride ); 
-               if (delays) {
-                  *delays = (int*) STBI_REALLOC( *delays, sizeof(int) * layers ); 
-               }
-            } else {
-               out = (stbi_uc*)stbi__malloc( layers * stride ); 
-               if (delays) {
-                  *delays = (int*) stbi__malloc( layers * sizeof(int) ); 
-               }
-            }
-            memcpy( out + ((layers - 1) * stride), u, stride ); 
-            if (layers >= 2) {
-               two_back = out - 2 * stride; 
-            }
+static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp,
+                            int req_comp, stbi__result_info *ri) {
+  stbi_uc *u = 0;
+  stbi__gif g;
+  memset(&g, 0, sizeof(g));
+  STBI_NOTUSED(ri);
 
-            if (delays) {
-               (*delays)[layers - 1U] = g.delay; 
-            }
-         }
-      } while (u != 0); 
+  u = stbi__gif_load_next(s, &g, comp, req_comp, 0);
+  if (u == (stbi_uc *)s)
+    u = 0; // end of animated gif marker
+  if (u) {
+    *x = g.w;
+    *y = g.h;
 
-      // free temp buffer; 
-      STBI_FREE(g.out); 
-      STBI_FREE(g.history); 
-      STBI_FREE(g.background); 
+    // moved conversion to after successful load so that the same
+    // can be done for multiple frames.
+    if (req_comp && req_comp != 4)
+      u = stbi__convert_format(u, 4, req_comp, g.w, g.h);
+  } else if (g.out) {
+    // if there was an error and we allocated an image buffer, free it!
+    STBI_FREE(g.out);
+  }
 
-      // do the final conversion after loading everything; 
-      if (req_comp && req_comp != 4)
-         out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
+  // free buffers needed for multiple frame loading;
+  STBI_FREE(g.history);
+  STBI_FREE(g.background);
 
-      *z = layers; 
-      return out;
-   } else {
-      return stbi__errpuc("not GIF", "Image was not as a gif type."); 
-   }
+  return u;
 }
 
-static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   stbi_uc *u = 0;
-   stbi__gif g;
-   memset(&g, 0, sizeof(g));
-   STBI_NOTUSED(ri);
-
-   u = stbi__gif_load_next(s, &g, comp, req_comp, 0);
-   if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
-   if (u) {
-      *x = g.w;
-      *y = g.h;
-
-      // moved conversion to after successful load so that the same
-      // can be done for multiple frames. 
-      if (req_comp && req_comp != 4)
-         u = stbi__convert_format(u, 4, req_comp, g.w, g.h);
-   } else if (g.out) {
-      // if there was an error and we allocated an image buffer, free it!
-      STBI_FREE(g.out);
-   }
-
-   // free buffers needed for multiple frame loading; 
-   STBI_FREE(g.history);
-   STBI_FREE(g.background); 
-
-   return u;
-}
-
-static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   return stbi__gif_info_raw(s,x,y,comp);
+static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp) {
+  return stbi__gif_info_raw(s, x, y, comp);
 }
 #endif
 
@@ -6686,393 +7380,429 @@ static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
 // Radiance RGBE HDR loader
 // originally by Nicolas Schulz
 #ifndef STBI_NO_HDR
-static int stbi__hdr_test_core(stbi__context *s, const char *signature)
-{
-   int i;
-   for (i=0; signature[i]; ++i)
-      if (stbi__get8(s) != signature[i])
-          return 0;
-   stbi__rewind(s);
-   return 1;
-}
-
-static int stbi__hdr_test(stbi__context* s)
-{
-   int r = stbi__hdr_test_core(s, "#?RADIANCE\n");
-   stbi__rewind(s);
-   if(!r) {
-       r = stbi__hdr_test_core(s, "#?RGBE\n");
-       stbi__rewind(s);
-   }
-   return r;
-}
-
-#define STBI__HDR_BUFLEN  1024
-static char *stbi__hdr_gettoken(stbi__context *z, char *buffer)
-{
-   int len=0;
-   char c = '\0';
-
-   c = (char) stbi__get8(z);
-
-   while (!stbi__at_eof(z) && c != '\n') {
-      buffer[len++] = c;
-      if (len == STBI__HDR_BUFLEN-1) {
-         // flush to end of line
-         while (!stbi__at_eof(z) && stbi__get8(z) != '\n')
-            ;
-         break;
+static int stbi__hdr_test_core(stbi__context *s, const char *signature) {
+  int i;
+  for (i = 0; signature[i]; ++i)
+    if (stbi__get8(s) != signature[i])
+      return 0;
+  stbi__rewind(s);
+  return 1;
+}
+
+static int stbi__hdr_test(stbi__context *s) {
+  int r = stbi__hdr_test_core(s, "#?RADIANCE\n");
+  stbi__rewind(s);
+  if (!r) {
+    r = stbi__hdr_test_core(s, "#?RGBE\n");
+    stbi__rewind(s);
+  }
+  return r;
+}
+
+#define STBI__HDR_BUFLEN 1024
+static char *stbi__hdr_gettoken(stbi__context *z, char *buffer) {
+  int len = 0;
+  char c = '\0';
+
+  c = (char)stbi__get8(z);
+
+  while (!stbi__at_eof(z) && c != '\n') {
+    buffer[len++] = c;
+    if (len == STBI__HDR_BUFLEN - 1) {
+      // flush to end of line
+      while (!stbi__at_eof(z) && stbi__get8(z) != '\n')
+        ;
+      break;
+    }
+    c = (char)stbi__get8(z);
+  }
+
+  buffer[len] = 0;
+  return buffer;
+}
+
+static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp) {
+  if (input[3] != 0) {
+    float f1;
+    // Exponent
+    f1 = (float)ldexp(1.0f, input[3] - (int)(128 + 8));
+    if (req_comp <= 2)
+      output[0] = (input[0] + input[1] + input[2]) * f1 / 3;
+    else {
+      output[0] = input[0] * f1;
+      output[1] = input[1] * f1;
+      output[2] = input[2] * f1;
+    }
+    if (req_comp == 2)
+      output[1] = 1;
+    if (req_comp == 4)
+      output[3] = 1;
+  } else {
+    switch (req_comp) {
+    case 4:
+      output[3] = 1; /* fallthrough */
+    case 3:
+      output[0] = output[1] = output[2] = 0;
+      break;
+    case 2:
+      output[1] = 1; /* fallthrough */
+    case 1:
+      output[0] = 0;
+      break;
+    }
+  }
+}
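
// Aside (worked restatement, not part of the original source): the RGBE
// decode above scales three 8-bit mantissas by a shared exponent byte E with
// bias 128, and the extra -8 in the ldexp folds in the /256 mantissa
// normalization, i.e. channel = byte * 2^(E - 128 - 8). For example,
// rgbe = {128, 64, 32, 128} decodes to {0.5f, 0.25f, 0.125f}.
static void rgbe_to_rgb(const unsigned char rgbe[4], float rgb[3]) {
  float scale;
  if (rgbe[3] == 0) { // a zero exponent byte encodes black
    rgb[0] = rgb[1] = rgb[2] = 0.0f;
    return;
  }
  scale = (float)ldexp(1.0, rgbe[3] - (128 + 8));
  rgb[0] = rgbe[0] * scale;
  rgb[1] = rgbe[1] * scale;
  rgb[2] = rgbe[2] * scale;
}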
+
+static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp,
+                             int req_comp, stbi__result_info *ri) {
+  char buffer[STBI__HDR_BUFLEN];
+  char *token;
+  int valid = 0;
+  int width, height;
+  stbi_uc *scanline;
+  float *hdr_data;
+  int len;
+  unsigned char count, value;
+  int i, j, k, c1, c2, z;
+  const char *headerToken;
+  STBI_NOTUSED(ri);
+
+  // Check identifier
+  headerToken = stbi__hdr_gettoken(s, buffer);
+  if (strcmp(headerToken, "#?RADIANCE") != 0 &&
+      strcmp(headerToken, "#?RGBE") != 0)
+    return stbi__errpf("not HDR", "Corrupt HDR image");
+
+  // Parse header
+  for (;;) {
+    token = stbi__hdr_gettoken(s, buffer);
+    if (token[0] == 0)
+      break;
+    if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0)
+      valid = 1;
+  }
+
+  if (!valid)
+    return stbi__errpf("unsupported format", "Unsupported HDR format");
+
+  // Parse width and height
+  // can't use sscanf() if we're not using stdio!
+  token = stbi__hdr_gettoken(s, buffer);
+  if (strncmp(token, "-Y ", 3))
+    return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+  token += 3;
+  height = (int)strtol(token, &token, 10);
+  while (*token == ' ')
+    ++token;
+  if (strncmp(token, "+X ", 3))
+    return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+  token += 3;
+  width = (int)strtol(token, NULL, 10);
+
+  *x = width;
+  *y = height;
+
+  if (comp)
+    *comp = 3;
+  if (req_comp == 0)
+    req_comp = 3;
+
+  if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
+    return stbi__errpf("too large", "HDR image is too large");
+
+  // Read data
+  hdr_data =
+      (float *)stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
+  if (!hdr_data)
+    return stbi__errpf("outofmem", "Out of memory");
+
+  // Load image data
+  // image data is stored as some number of scanlines
+  if (width < 8 || width >= 32768) {
+    // Read flat data
+    for (j = 0; j < height; ++j) {
+      for (i = 0; i < width; ++i) {
+        stbi_uc rgbe[4];
+      main_decode_loop:
+        stbi__getn(s, rgbe, 4);
+        stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe,
+                          req_comp);
       }
-      c = (char) stbi__get8(z);
-   }
-
-   buffer[len] = 0;
-   return buffer;
-}
+    }
+  } else {
+    // Read RLE-encoded data
+    scanline = NULL;
 
-static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp)
-{
-   if ( input[3] != 0 ) {
-      float f1;
-      // Exponent
-      f1 = (float) ldexp(1.0f, input[3] - (int)(128 + 8));
-      if (req_comp <= 2)
-         output[0] = (input[0] + input[1] + input[2]) * f1 / 3;
-      else {
-         output[0] = input[0] * f1;
-         output[1] = input[1] * f1;
-         output[2] = input[2] * f1;
+    for (j = 0; j < height; ++j) {
+      c1 = stbi__get8(s);
+      c2 = stbi__get8(s);
+      len = stbi__get8(s);
+      if (c1 != 2 || c2 != 2 || (len & 0x80)) {
+        // not run-length encoded, so we have to actually use THIS data as a
+        // decoded pixel (note this can't be a valid pixel--one of RGB must be
+        // >= 128)
+        stbi_uc rgbe[4];
+        rgbe[0] = (stbi_uc)c1;
+        rgbe[1] = (stbi_uc)c2;
+        rgbe[2] = (stbi_uc)len;
+        rgbe[3] = (stbi_uc)stbi__get8(s);
+        stbi__hdr_convert(hdr_data, rgbe, req_comp);
+        i = 1;
+        j = 0;
+        STBI_FREE(scanline);
+        goto main_decode_loop; // yes, this makes no sense
       }
-      if (req_comp == 2) output[1] = 1;
-      if (req_comp == 4) output[3] = 1;
-   } else {
-      switch (req_comp) {
-         case 4: output[3] = 1; /* fallthrough */
-         case 3: output[0] = output[1] = output[2] = 0;
-                 break;
-         case 2: output[1] = 1; /* fallthrough */
-         case 1: output[0] = 0;
-                 break;
+      len <<= 8;
+      len |= stbi__get8(s);
+      if (len != width) {
+        STBI_FREE(hdr_data);
+        STBI_FREE(scanline);
+        return stbi__errpf("invalid decoded scanline length", "corrupt HDR");
       }
-   }
-}
-
-static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   char buffer[STBI__HDR_BUFLEN];
-   char *token;
-   int valid = 0;
-   int width, height;
-   stbi_uc *scanline;
-   float *hdr_data;
-   int len;
-   unsigned char count, value;
-   int i, j, k, c1,c2, z;
-   const char *headerToken;
-   STBI_NOTUSED(ri);
-
-   // Check identifier
-   headerToken = stbi__hdr_gettoken(s,buffer);
-   if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
-      return stbi__errpf("not HDR", "Corrupt HDR image");
-
-   // Parse header
-   for(;;) {
-      token = stbi__hdr_gettoken(s,buffer);
-      if (token[0] == 0) break;
-      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
-   }
-
-   if (!valid)    return stbi__errpf("unsupported format", "Unsupported HDR format");
-
-   // Parse width and height
-   // can't use sscanf() if we're not using stdio!
-   token = stbi__hdr_gettoken(s,buffer);
-   if (strncmp(token, "-Y ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
-   token += 3;
-   height = (int) strtol(token, &token, 10);
-   while (*token == ' ') ++token;
-   if (strncmp(token, "+X ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
-   token += 3;
-   width = (int) strtol(token, NULL, 10);
-
-   *x = width;
-   *y = height;
-
-   if (comp) *comp = 3;
-   if (req_comp == 0) req_comp = 3;
-
-   if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
-      return stbi__errpf("too large", "HDR image is too large");
-
-   // Read data
-   hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
-   if (!hdr_data)
-      return stbi__errpf("outofmem", "Out of memory");
-
-   // Load image data
-   // image data is stored as some number of sca
-   if ( width < 8 || width >= 32768) {
-      // Read flat data
-      for (j=0; j < height; ++j) {
-         for (i=0; i < width; ++i) {
-            stbi_uc rgbe[4];
-           main_decode_loop:
-            stbi__getn(s, rgbe, 4);
-            stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp);
-         }
+      if (scanline == NULL) {
+        scanline = (stbi_uc *)stbi__malloc_mad2(width, 4, 0);
+        if (!scanline) {
+          STBI_FREE(hdr_data);
+          return stbi__errpf("outofmem", "Out of memory");
+        }
       }
-   } else {
-      // Read RLE-encoded data
-      scanline = NULL;
-
-      for (j = 0; j < height; ++j) {
-         c1 = stbi__get8(s);
-         c2 = stbi__get8(s);
-         len = stbi__get8(s);
-         if (c1 != 2 || c2 != 2 || (len & 0x80)) {
-            // not run-length encoded, so we have to actually use THIS data as a decoded
-            // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
-            stbi_uc rgbe[4];
-            rgbe[0] = (stbi_uc) c1;
-            rgbe[1] = (stbi_uc) c2;
-            rgbe[2] = (stbi_uc) len;
-            rgbe[3] = (stbi_uc) stbi__get8(s);
-            stbi__hdr_convert(hdr_data, rgbe, req_comp);
-            i = 1;
-            j = 0;
-            STBI_FREE(scanline);
-            goto main_decode_loop; // yes, this makes no sense
-         }
-         len <<= 8;
-         len |= stbi__get8(s);
-         if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
-         if (scanline == NULL) {
-            scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0);
-            if (!scanline) {
-               STBI_FREE(hdr_data);
-               return stbi__errpf("outofmem", "Out of memory");
+
+      for (k = 0; k < 4; ++k) {
+        int nleft;
+        i = 0;
+        while ((nleft = width - i) > 0) {
+          count = stbi__get8(s);
+          if (count > 128) {
+            // Run
+            value = stbi__get8(s);
+            count -= 128;
+            if (count > nleft) {
+              STBI_FREE(hdr_data);
+              STBI_FREE(scanline);
+              return stbi__errpf("corrupt", "bad RLE data in HDR");
             }
-         }
-
-         for (k = 0; k < 4; ++k) {
-            int nleft;
-            i = 0;
-            while ((nleft = width - i) > 0) {
-               count = stbi__get8(s);
-               if (count > 128) {
-                  // Run
-                  value = stbi__get8(s);
-                  count -= 128;
-                  if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
-                  for (z = 0; z < count; ++z)
-                     scanline[i++ * 4 + k] = value;
-               } else {
-                  // Dump
-                  if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
-                  for (z = 0; z < count; ++z)
-                     scanline[i++ * 4 + k] = stbi__get8(s);
-               }
+            for (z = 0; z < count; ++z)
+              scanline[i++ * 4 + k] = value;
+          } else {
+            // Dump
+            if (count > nleft) {
+              STBI_FREE(hdr_data);
+              STBI_FREE(scanline);
+              return stbi__errpf("corrupt", "bad RLE data in HDR");
             }
-         }
-         for (i=0; i < width; ++i)
-            stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
+            for (z = 0; z < count; ++z)
+              scanline[i++ * 4 + k] = stbi__get8(s);
+          }
+        }
       }
-      if (scanline)
-         STBI_FREE(scanline);
-   }
-
-   return hdr_data;
-}
-
-static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   char buffer[STBI__HDR_BUFLEN];
-   char *token;
-   int valid = 0;
-   int dummy;
-
-   if (!x) x = &dummy;
-   if (!y) y = &dummy;
-   if (!comp) comp = &dummy;
-
-   if (stbi__hdr_test(s) == 0) {
-       stbi__rewind( s );
-       return 0;
-   }
-
-   for(;;) {
-      token = stbi__hdr_gettoken(s,buffer);
-      if (token[0] == 0) break;
-      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
-   }
-
-   if (!valid) {
-       stbi__rewind( s );
-       return 0;
-   }
-   token = stbi__hdr_gettoken(s,buffer);
-   if (strncmp(token, "-Y ", 3)) {
-       stbi__rewind( s );
-       return 0;
-   }
-   token += 3;
-   *y = (int) strtol(token, &token, 10);
-   while (*token == ' ') ++token;
-   if (strncmp(token, "+X ", 3)) {
-       stbi__rewind( s );
-       return 0;
-   }
-   token += 3;
-   *x = (int) strtol(token, NULL, 10);
-   *comp = 3;
-   return 1;
+      for (i = 0; i < width; ++i)
+        stbi__hdr_convert(hdr_data + (j * width + i) * req_comp,
+                          scanline + i * 4, req_comp);
+    }
+    if (scanline)
+      STBI_FREE(scanline);
+  }
+
+  return hdr_data;
+}
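
// Aside (hedged sketch, not part of the original source): one new-style RLE
// scanline as consumed above -- header bytes {2, 2, hi, lo} where
// (hi << 8) | lo must equal the image width, followed by four planes
// (R, G, B, E) encoded independently. Within a plane, a count byte c > 128
// introduces a run of c - 128 copies of the next byte; c <= 128 introduces
// c literal bytes:
static int hdr_decode_plane(const unsigned char *src, int srclen,
                            unsigned char *dst, int width) {
  int i = 0, pos = 0;
  while (i < width && pos < srclen) {
    int c = src[pos++];
    if (c > 128) { // run
      unsigned char v;
      c -= 128;
      if (pos >= srclen || c > width - i)
        return -1; // bad RLE data
      v = src[pos++];
      while (c--)
        dst[i++] = v;
    } else { // dump of c literal bytes
      if (c > width - i || c > srclen - pos)
        return -1;
      while (c--)
        dst[i++] = src[pos++];
    }
  }
  return pos; // bytes consumed from src
}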
+
+static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp) {
+  char buffer[STBI__HDR_BUFLEN];
+  char *token;
+  int valid = 0;
+  int dummy;
+
+  if (!x)
+    x = &dummy;
+  if (!y)
+    y = &dummy;
+  if (!comp)
+    comp = &dummy;
+
+  if (stbi__hdr_test(s) == 0) {
+    stbi__rewind(s);
+    return 0;
+  }
+
+  for (;;) {
+    token = stbi__hdr_gettoken(s, buffer);
+    if (token[0] == 0)
+      break;
+    if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0)
+      valid = 1;
+  }
+
+  if (!valid) {
+    stbi__rewind(s);
+    return 0;
+  }
+  token = stbi__hdr_gettoken(s, buffer);
+  if (strncmp(token, "-Y ", 3)) {
+    stbi__rewind(s);
+    return 0;
+  }
+  token += 3;
+  *y = (int)strtol(token, &token, 10);
+  while (*token == ' ')
+    ++token;
+  if (strncmp(token, "+X ", 3)) {
+    stbi__rewind(s);
+    return 0;
+  }
+  token += 3;
+  *x = (int)strtol(token, NULL, 10);
+  *comp = 3;
+  return 1;
 }
 #endif // STBI_NO_HDR
 
 #ifndef STBI_NO_BMP
-static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   void *p;
-   stbi__bmp_data info;
-
-   info.all_a = 255;
-   p = stbi__bmp_parse_header(s, &info);
-   stbi__rewind( s );
-   if (p == NULL)
-      return 0;
-   if (x) *x = s->img_x;
-   if (y) *y = s->img_y;
-   if (comp) {
-      if (info.bpp == 24 && info.ma == 0xff000000)
-         *comp = 3;
-      else
-         *comp = info.ma ? 4 : 3;
-   }
-   return 1;
+static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp) {
+  void *p;
+  stbi__bmp_data info;
+
+  info.all_a = 255;
+  p = stbi__bmp_parse_header(s, &info);
+  stbi__rewind(s);
+  if (p == NULL)
+    return 0;
+  if (x)
+    *x = s->img_x;
+  if (y)
+    *y = s->img_y;
+  if (comp) {
+    if (info.bpp == 24 && info.ma == 0xff000000)
+      *comp = 3;
+    else
+      *comp = info.ma ? 4 : 3;
+  }
+  return 1;
 }
 #endif
 
 #ifndef STBI_NO_PSD
-static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   int channelCount, dummy, depth;
-   if (!x) x = &dummy;
-   if (!y) y = &dummy;
-   if (!comp) comp = &dummy;
-   if (stbi__get32be(s) != 0x38425053) {
-       stbi__rewind( s );
-       return 0;
-   }
-   if (stbi__get16be(s) != 1) {
-       stbi__rewind( s );
-       return 0;
-   }
-   stbi__skip(s, 6);
-   channelCount = stbi__get16be(s);
-   if (channelCount < 0 || channelCount > 16) {
-       stbi__rewind( s );
-       return 0;
-   }
-   *y = stbi__get32be(s);
-   *x = stbi__get32be(s);
-   depth = stbi__get16be(s);
-   if (depth != 8 && depth != 16) {
-       stbi__rewind( s );
-       return 0;
-   }
-   if (stbi__get16be(s) != 3) {
-       stbi__rewind( s );
-       return 0;
-   }
-   *comp = 4;
-   return 1;
-}
-
-static int stbi__psd_is16(stbi__context *s)
-{
-   int channelCount, depth;
-   if (stbi__get32be(s) != 0x38425053) {
-       stbi__rewind( s );
-       return 0;
-   }
-   if (stbi__get16be(s) != 1) {
-       stbi__rewind( s );
-       return 0;
-   }
-   stbi__skip(s, 6);
-   channelCount = stbi__get16be(s);
-   if (channelCount < 0 || channelCount > 16) {
-       stbi__rewind( s );
-       return 0;
-   }
-   (void) stbi__get32be(s);
-   (void) stbi__get32be(s);
-   depth = stbi__get16be(s);
-   if (depth != 16) {
-       stbi__rewind( s );
-       return 0;
-   }
-   return 1;
+static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp) {
+  int channelCount, dummy, depth;
+  if (!x)
+    x = &dummy;
+  if (!y)
+    y = &dummy;
+  if (!comp)
+    comp = &dummy;
+  if (stbi__get32be(s) != 0x38425053) {
+    stbi__rewind(s);
+    return 0;
+  }
+  if (stbi__get16be(s) != 1) {
+    stbi__rewind(s);
+    return 0;
+  }
+  stbi__skip(s, 6);
+  channelCount = stbi__get16be(s);
+  if (channelCount < 0 || channelCount > 16) {
+    stbi__rewind(s);
+    return 0;
+  }
+  *y = stbi__get32be(s);
+  *x = stbi__get32be(s);
+  depth = stbi__get16be(s);
+  if (depth != 8 && depth != 16) {
+    stbi__rewind(s);
+    return 0;
+  }
+  if (stbi__get16be(s) != 3) {
+    stbi__rewind(s);
+    return 0;
+  }
+  *comp = 4;
+  return 1;
+}
+
+static int stbi__psd_is16(stbi__context *s) {
+  int channelCount, depth;
+  if (stbi__get32be(s) != 0x38425053) {
+    stbi__rewind(s);
+    return 0;
+  }
+  if (stbi__get16be(s) != 1) {
+    stbi__rewind(s);
+    return 0;
+  }
+  stbi__skip(s, 6);
+  channelCount = stbi__get16be(s);
+  if (channelCount < 0 || channelCount > 16) {
+    stbi__rewind(s);
+    return 0;
+  }
+  (void)stbi__get32be(s);
+  (void)stbi__get32be(s);
+  depth = stbi__get16be(s);
+  if (depth != 16) {
+    stbi__rewind(s);
+    return 0;
+  }
+  return 1;
 }
 #endif
 
 #ifndef STBI_NO_PIC
-static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   int act_comp=0,num_packets=0,chained,dummy;
-   stbi__pic_packet packets[10];
-
-   if (!x) x = &dummy;
-   if (!y) y = &dummy;
-   if (!comp) comp = &dummy;
-
-   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) {
-      stbi__rewind(s);
+static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp) {
+  int act_comp = 0, num_packets = 0, chained, dummy;
+  stbi__pic_packet packets[10];
+
+  if (!x)
+    x = &dummy;
+  if (!y)
+    y = &dummy;
+  if (!comp)
+    comp = &dummy;
+
+  if (!stbi__pic_is4(s, "\x53\x80\xF6\x34")) {
+    stbi__rewind(s);
+    return 0;
+  }
+
+  stbi__skip(s, 88);
+
+  *x = stbi__get16be(s);
+  *y = stbi__get16be(s);
+  if (stbi__at_eof(s)) {
+    stbi__rewind(s);
+    return 0;
+  }
+  if ((*x) != 0 && (1 << 28) / (*x) < (*y)) {
+    stbi__rewind(s);
+    return 0;
+  }
+
+  stbi__skip(s, 8);
+
+  do {
+    stbi__pic_packet *packet;
+
+    if (num_packets == sizeof(packets) / sizeof(packets[0]))
       return 0;
-   }
 
-   stbi__skip(s, 88);
+    packet = &packets[num_packets++];
+    chained = stbi__get8(s);
+    packet->size = stbi__get8(s);
+    packet->type = stbi__get8(s);
+    packet->channel = stbi__get8(s);
+    act_comp |= packet->channel;
 
-   *x = stbi__get16be(s);
-   *y = stbi__get16be(s);
-   if (stbi__at_eof(s)) {
-      stbi__rewind( s);
+    if (stbi__at_eof(s)) {
+      stbi__rewind(s);
       return 0;
-   }
-   if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) {
-      stbi__rewind( s );
+    }
+    if (packet->size != 8) {
+      stbi__rewind(s);
       return 0;
-   }
-
-   stbi__skip(s, 8);
-
-   do {
-      stbi__pic_packet *packet;
-
-      if (num_packets==sizeof(packets)/sizeof(packets[0]))
-         return 0;
-
-      packet = &packets[num_packets++];
-      chained = stbi__get8(s);
-      packet->size    = stbi__get8(s);
-      packet->type    = stbi__get8(s);
-      packet->channel = stbi__get8(s);
-      act_comp |= packet->channel;
-
-      if (stbi__at_eof(s)) {
-          stbi__rewind( s );
-          return 0;
-      }
-      if (packet->size != 8) {
-          stbi__rewind( s );
-          return 0;
-      }
-   } while (chained);
+    }
+  } while (chained);
 
-   *comp = (act_comp & 0x10 ? 4 : 3);
+  *comp = (act_comp & 0x10 ? 4 : 3);
 
-   return 1;
+  return 1;
 }
 #endif
 
@@ -7090,254 +7820,259 @@ static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
 
 #ifndef STBI_NO_PNM
 
-static int      stbi__pnm_test(stbi__context *s)
-{
-   char p, t;
-   p = (char) stbi__get8(s);
-   t = (char) stbi__get8(s);
-   if (p != 'P' || (t != '5' && t != '6')) {
-       stbi__rewind( s );
-       return 0;
-   }
-   return 1;
+static int stbi__pnm_test(stbi__context *s) {
+  char p, t;
+  p = (char)stbi__get8(s);
+  t = (char)stbi__get8(s);
+  if (p != 'P' || (t != '5' && t != '6')) {
+    stbi__rewind(s);
+    return 0;
+  }
+  return 1;
 }
 
-static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   stbi_uc *out;
-   STBI_NOTUSED(ri);
+static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp,
+                            int req_comp, stbi__result_info *ri) {
+  stbi_uc *out;
+  STBI_NOTUSED(ri);
 
-   if (!stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n))
-      return 0;
+  if (!stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n))
+    return 0;
 
-   *x = s->img_x;
-   *y = s->img_y;
-   if (comp) *comp = s->img_n;
+  *x = s->img_x;
+  *y = s->img_y;
+  if (comp)
+    *comp = s->img_n;
 
-   if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0))
-      return stbi__errpuc("too large", "PNM too large");
+  if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0))
+    return stbi__errpuc("too large", "PNM too large");
 
-   out = (stbi_uc *) stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0);
-   if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   stbi__getn(s, out, s->img_n * s->img_x * s->img_y);
+  out = (stbi_uc *)stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0);
+  if (!out)
+    return stbi__errpuc("outofmem", "Out of memory");
+  stbi__getn(s, out, s->img_n * s->img_x * s->img_y);
 
-   if (req_comp && req_comp != s->img_n) {
-      out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
-      if (out == NULL) return out; // stbi__convert_format frees input on failure
-   }
-   return out;
+  if (req_comp && req_comp != s->img_n) {
+    out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
+    if (out == NULL)
+      return out; // stbi__convert_format frees input on failure
+  }
+  return out;
 }
 
-static int      stbi__pnm_isspace(char c)
-{
-   return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r';
+static int stbi__pnm_isspace(char c) {
+  return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' ||
+         c == '\r';
 }
 
-static void     stbi__pnm_skip_whitespace(stbi__context *s, char *c)
-{
-   for (;;) {
-      while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
-         *c = (char) stbi__get8(s);
+static void stbi__pnm_skip_whitespace(stbi__context *s, char *c) {
+  for (;;) {
+    while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
+      *c = (char)stbi__get8(s);
 
-      if (stbi__at_eof(s) || *c != '#')
-         break;
+    if (stbi__at_eof(s) || *c != '#')
+      break;
 
-      while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' )
-         *c = (char) stbi__get8(s);
-   }
+    while (!stbi__at_eof(s) && *c != '\n' && *c != '\r')
+      *c = (char)stbi__get8(s);
+  }
 }
 
-static int      stbi__pnm_isdigit(char c)
-{
-   return c >= '0' && c <= '9';
-}
+static int stbi__pnm_isdigit(char c) { return c >= '0' && c <= '9'; }
 
-static int      stbi__pnm_getinteger(stbi__context *s, char *c)
-{
-   int value = 0;
+static int stbi__pnm_getinteger(stbi__context *s, char *c) {
+  int value = 0;
 
-   while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
-      value = value*10 + (*c - '0');
-      *c = (char) stbi__get8(s);
-   }
+  while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
+    value = value * 10 + (*c - '0');
+    *c = (char)stbi__get8(s);
+  }
 
-   return value;
+  return value;
 }
 
-static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   int maxv, dummy;
-   char c, p, t;
+static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp) {
+  int maxv, dummy;
+  char c, p, t;
 
-   if (!x) x = &dummy;
-   if (!y) y = &dummy;
-   if (!comp) comp = &dummy;
+  if (!x)
+    x = &dummy;
+  if (!y)
+    y = &dummy;
+  if (!comp)
+    comp = &dummy;
 
-   stbi__rewind(s);
+  stbi__rewind(s);
 
-   // Get identifier
-   p = (char) stbi__get8(s);
-   t = (char) stbi__get8(s);
-   if (p != 'P' || (t != '5' && t != '6')) {
-       stbi__rewind(s);
-       return 0;
-   }
+  // Get identifier
+  p = (char)stbi__get8(s);
+  t = (char)stbi__get8(s);
+  if (p != 'P' || (t != '5' && t != '6')) {
+    stbi__rewind(s);
+    return 0;
+  }
 
-   *comp = (t == '6') ? 3 : 1;  // '5' is 1-component .pgm; '6' is 3-component .ppm
+  *comp =
+      (t == '6') ? 3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm
 
-   c = (char) stbi__get8(s);
-   stbi__pnm_skip_whitespace(s, &c);
+  c = (char)stbi__get8(s);
+  stbi__pnm_skip_whitespace(s, &c);
 
-   *x = stbi__pnm_getinteger(s, &c); // read width
-   stbi__pnm_skip_whitespace(s, &c);
+  *x = stbi__pnm_getinteger(s, &c); // read width
+  stbi__pnm_skip_whitespace(s, &c);
 
-   *y = stbi__pnm_getinteger(s, &c); // read height
-   stbi__pnm_skip_whitespace(s, &c);
+  *y = stbi__pnm_getinteger(s, &c); // read height
+  stbi__pnm_skip_whitespace(s, &c);
 
-   maxv = stbi__pnm_getinteger(s, &c);  // read max value
+  maxv = stbi__pnm_getinteger(s, &c); // read max value
 
-   if (maxv > 255)
-      return stbi__err("max value > 255", "PPM image not 8-bit");
-   else
-      return 1;
+  if (maxv > 255)
+    return stbi__err("max value > 255", "PPM image not 8-bit");
+  else
+    return 1;
 }
 #endif
 
-static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
-{
-   #ifndef STBI_NO_JPEG
-   if (stbi__jpeg_info(s, x, y, comp)) return 1;
-   #endif
+static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp) {
+#ifndef STBI_NO_JPEG
+  if (stbi__jpeg_info(s, x, y, comp))
+    return 1;
+#endif
 
-   #ifndef STBI_NO_PNG
-   if (stbi__png_info(s, x, y, comp))  return 1;
-   #endif
+#ifndef STBI_NO_PNG
+  if (stbi__png_info(s, x, y, comp))
+    return 1;
+#endif
 
-   #ifndef STBI_NO_GIF
-   if (stbi__gif_info(s, x, y, comp))  return 1;
-   #endif
+#ifndef STBI_NO_GIF
+  if (stbi__gif_info(s, x, y, comp))
+    return 1;
+#endif
 
-   #ifndef STBI_NO_BMP
-   if (stbi__bmp_info(s, x, y, comp))  return 1;
-   #endif
+#ifndef STBI_NO_BMP
+  if (stbi__bmp_info(s, x, y, comp))
+    return 1;
+#endif
 
-   #ifndef STBI_NO_PSD
-   if (stbi__psd_info(s, x, y, comp))  return 1;
-   #endif
+#ifndef STBI_NO_PSD
+  if (stbi__psd_info(s, x, y, comp))
+    return 1;
+#endif
 
-   #ifndef STBI_NO_PIC
-   if (stbi__pic_info(s, x, y, comp))  return 1;
-   #endif
+#ifndef STBI_NO_PIC
+  if (stbi__pic_info(s, x, y, comp))
+    return 1;
+#endif
 
-   #ifndef STBI_NO_PNM
-   if (stbi__pnm_info(s, x, y, comp))  return 1;
-   #endif
+#ifndef STBI_NO_PNM
+  if (stbi__pnm_info(s, x, y, comp))
+    return 1;
+#endif
 
-   #ifndef STBI_NO_HDR
-   if (stbi__hdr_info(s, x, y, comp))  return 1;
-   #endif
+#ifndef STBI_NO_HDR
+  if (stbi__hdr_info(s, x, y, comp))
+    return 1;
+#endif
 
-   // test tga last because it's a crappy test!
-   #ifndef STBI_NO_TGA
-   if (stbi__tga_info(s, x, y, comp))
-       return 1;
-   #endif
-   return stbi__err("unknown image type", "Image not of any known type, or corrupt");
+// test tga last because it's a crappy test!
+#ifndef STBI_NO_TGA
+  if (stbi__tga_info(s, x, y, comp))
+    return 1;
+#endif
+  return stbi__err("unknown image type",
+                   "Image not of any known type, or corrupt");
 }
 
-static int stbi__is_16_main(stbi__context *s)
-{
-   #ifndef STBI_NO_PNG
-   if (stbi__png_is16(s))  return 1;
-   #endif
+static int stbi__is_16_main(stbi__context *s) {
+#ifndef STBI_NO_PNG
+  if (stbi__png_is16(s))
+    return 1;
+#endif
 
-   #ifndef STBI_NO_PSD
-   if (stbi__psd_is16(s))  return 1;
-   #endif
+#ifndef STBI_NO_PSD
+  if (stbi__psd_is16(s))
+    return 1;
+#endif
 
-   return 0;
+  return 0;
 }
 
 #ifndef STBI_NO_STDIO
-STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp)
-{
-    FILE *f = stbi__fopen(filename, "rb");
-    int result;
-    if (!f) return stbi__err("can't fopen", "Unable to open file");
-    result = stbi_info_from_file(f, x, y, comp);
-    fclose(f);
-    return result;
-}
-
-STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp)
-{
-   int r;
-   stbi__context s;
-   long pos = ftell(f);
-   stbi__start_file(&s, f);
-   r = stbi__info_main(&s,x,y,comp);
-   fseek(f,pos,SEEK_SET);
-   return r;
-}
-
-STBIDEF int stbi_is_16_bit(char const *filename)
-{
-    FILE *f = stbi__fopen(filename, "rb");
-    int result;
-    if (!f) return stbi__err("can't fopen", "Unable to open file");
-    result = stbi_is_16_bit_from_file(f);
-    fclose(f);
-    return result;
-}
-
-STBIDEF int stbi_is_16_bit_from_file(FILE *f)
-{
-   int r;
-   stbi__context s;
-   long pos = ftell(f);
-   stbi__start_file(&s, f);
-   r = stbi__is_16_main(&s);
-   fseek(f,pos,SEEK_SET);
-   return r;
+STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp) {
+  FILE *f = stbi__fopen(filename, "rb");
+  int result;
+  if (!f)
+    return stbi__err("can't fopen", "Unable to open file");
+  result = stbi_info_from_file(f, x, y, comp);
+  fclose(f);
+  return result;
+}
+
+STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp) {
+  int r;
+  stbi__context s;
+  long pos = ftell(f);
+  stbi__start_file(&s, f);
+  r = stbi__info_main(&s, x, y, comp);
+  fseek(f, pos, SEEK_SET);
+  return r;
+}
+
+STBIDEF int stbi_is_16_bit(char const *filename) {
+  FILE *f = stbi__fopen(filename, "rb");
+  int result;
+  if (!f)
+    return stbi__err("can't fopen", "Unable to open file");
+  result = stbi_is_16_bit_from_file(f);
+  fclose(f);
+  return result;
+}
+
+STBIDEF int stbi_is_16_bit_from_file(FILE *f) {
+  int r;
+  stbi__context s;
+  long pos = ftell(f);
+  stbi__start_file(&s, f);
+  r = stbi__is_16_main(&s);
+  fseek(f, pos, SEEK_SET);
+  return r;
 }
 #endif // !STBI_NO_STDIO
 
-STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp)
-{
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__info_main(&s,x,y,comp);
+STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x,
+                                  int *y, int *comp) {
+  stbi__context s;
+  stbi__start_mem(&s, buffer, len);
+  return stbi__info_main(&s, x, y, comp);
 }
 
-STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp)
-{
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user);
-   return stbi__info_main(&s,x,y,comp);
+STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user,
+                                     int *x, int *y, int *comp) {
+  stbi__context s;
+  stbi__start_callbacks(&s, (stbi_io_callbacks *)c, user);
+  return stbi__info_main(&s, x, y, comp);
 }
 
-STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len)
-{
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__is_16_main(&s);
+STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len) {
+  stbi__context s;
+  stbi__start_mem(&s, buffer, len);
+  return stbi__is_16_main(&s);
 }
 
-STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user)
-{
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user);
-   return stbi__is_16_main(&s);
+STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c,
+                                          void *user) {
+  stbi__context s;
+  stbi__start_callbacks(&s, (stbi_io_callbacks *)c, user);
+  return stbi__is_16_main(&s);
 }
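+
+// Typical use of the info API (a sketch; error handling omitted): query
+// dimensions and component count without decoding the pixel data:
+//
+//   int w, h, n;
+//   if (stbi_info("photo.png", &w, &h, &n))
+//     printf("%d x %d pixels, %d components\n", w, h, n);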
 
 #endif // STB_IMAGE_IMPLEMENTATION
 
 /*
    revision history:
-      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs 
-      2.19  (2018-02-11) fix warning
-      2.18  (2018-01-30) fix warnings
-      2.17  (2018-01-29) change sbti__shiftsigned to avoid clang -O2 bug
+      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and
+                         platform ifdefs
+      2.19  (2018-02-11) fix warning
+      2.18  (2018-01-30) fix warnings
+      2.17  (2018-01-29) change sbti__shiftsigned to avoid clang -O2 bug
                          1-bit BMP
                          *_is_16_bit api
                          avoid warnings
@@ -7352,13 +8087,11 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user
                          warning fixes; disable run-time SSE detection on gcc;
                          uniform handling of optional "return" values;
                          thread-safe initialization of zlib tables
-      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
-      2.13  (2016-11-29) add 16-bit API, only supported for PNG right now
-      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
-      2.11  (2016-04-02) allocate large structures on the stack
-                         remove white matting for transparent PSD
-                         fix reported channel count for PNG & BMP
-                         re-enable SSE2 in non-gcc 64-bit
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
+      2.13  (2016-11-29) add 16-bit API, only supported for PNG right now
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) allocate large structures on the stack
+                         remove white matting for transparent PSD
+                         fix reported channel count for PNG & BMP
+                         re-enable SSE2 in non-gcc 64-bit
                          support RGB-formatted JPEG
                          read 16-bit PNGs (only as 8-bit)
       2.10  (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED
@@ -7366,11 +8099,9 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user
                          16-bit-per-pixel TGA (not bit-per-component)
                          info() for TGA could break due to .hdr handling
                          info() for BMP to shares code instead of sloppy parse
-                         can use STBI_REALLOC_SIZED if allocator doesn't support realloc
-                         code cleanup
-      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
-      2.07  (2015-09-13) fix compiler warnings
-                         partial animated GIF support
+                         can use STBI_REALLOC_SIZED if allocator doesn't support realloc
+                         code cleanup
+      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
+      2.07  (2015-09-13) fix compiler warnings
+                         partial animated GIF support
                          limited 16-bpc PSD support
                          #ifdef unused functions
                          bug with < 92 byte PIC,PNM,HDR,TGA
@@ -7381,23 +8112,18 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user
                          stbi_set_flip_vertically_on_load (nguillemot)
                          fix NEON support; fix mingw support
       2.02  (2015-01-19) fix incorrect assert, fix warning
-      2.01  (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2
-      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
-      2.00  (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg)
-                         progressive JPEG (stb)
-                         PGM/PPM support (Ken Miller)
-                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
+      2.01  (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2
+      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
+      2.00  (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg)
+                         progressive JPEG (stb)
+                         PGM/PPM support (Ken Miller)
+                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
                          GIF bugfix -- seemingly never worked
                          STBI_NO_*, STBI_ONLY_*
       1.48  (2014-12-14) fix incorrectly-named assert()
-      1.47  (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb)
-                         optimize PNG (ryg)
-                         fix bug in interlaced PNG with user-specified channel count (stb)
-      1.46  (2014-08-26)
-              fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG
-      1.45  (2014-08-16)
-              fix MSVC-ARM internal compiler error by wrapping malloc
-      1.44  (2014-08-07)
+      1.47  (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb)
+                         optimize PNG (ryg)
+                         fix bug in interlaced PNG with user-specified channel count (stb)
+      1.46  (2014-08-26)
+              fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG
+      1.45  (2014-08-16)
+              fix MSVC-ARM internal compiler error by wrapping malloc
+      1.44  (2014-08-07)
               various warning fixes from Ronny Chevalier
       1.43  (2014-07-15)
               fix MSVC-only compiler problem in code changed in 1.42
@@ -7406,73 +8132,48 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user
               fixes to stbi__cleanup_jpeg path
               added STBI_ASSERT to avoid requiring assert.h
       1.41  (2014-06-25)
-              fix search&replace from 1.36 that messed up comments/error messages
-      1.40  (2014-06-22)
-              fix gcc struct-initialization warning
-      1.39  (2014-06-15)
-              fix to TGA optimization when req_comp != number of components in TGA;
-              fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite)
-              add support for BMP version 5 (more ignored fields)
-      1.38  (2014-06-06)
-              suppress MSVC warnings on integer casts truncating values
-              fix accidental rename of 'skip' field of I/O
-      1.37  (2014-06-04)
-              remove duplicate typedef
-      1.36  (2014-06-03)
-              convert to header file single-file library
-              if de-iphone isn't set, load iphone images color-swapped instead of returning NULL
-      1.35  (2014-05-27)
-              various warnings
-              fix broken STBI_SIMD path
-              fix bug where stbi_load_from_file no longer left file pointer in correct place
-              fix broken non-easy path for 32-bit BMP (possibly never used)
-              TGA optimization by Arseny Kapoulkine
-      1.34  (unknown)
-              use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case
-      1.33  (2011-07-14)
-              make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements
-      1.32  (2011-07-13)
-              support for "info" function for all supported filetypes (SpartanJ)
-      1.31  (2011-06-20)
-              a few more leak fixes, bug in PNG handling (SpartanJ)
-      1.30  (2011-06-11)
-              added ability to load files via callbacks to accomidate custom input streams (Ben Wenger)
+              fix search&replace from 1.36 that messed up comments/error messages
+      1.40  (2014-06-22)
+              fix gcc struct-initialization warning
+      1.39  (2014-06-15)
+              fix to TGA optimization when req_comp != number of components in TGA;
+              fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite)
+              add support for BMP version 5 (more ignored fields)
+      1.38  (2014-06-06)
+              suppress MSVC warnings on integer casts truncating values
+              fix accidental rename of 'skip' field of I/O
+      1.37  (2014-06-04)
+              remove duplicate typedef
+      1.36  (2014-06-03)
+              convert to header file single-file library
+              if de-iphone isn't set, load iphone images color-swapped instead of returning NULL
+      1.35  (2014-05-27)
+              various warnings
+              fix broken STBI_SIMD path
+              fix bug where stbi_load_from_file no longer left file pointer in correct place
+              fix broken non-easy path for 32-bit BMP (possibly never used)
+              TGA optimization by Arseny Kapoulkine
+      1.34  (unknown)
+              use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case
+      1.33  (2011-07-14)
+              make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements
+      1.32  (2011-07-13)
+              support for "info" function for all supported filetypes (SpartanJ)
+      1.31  (2011-06-20)
+              a few more leak fixes, bug in PNG handling (SpartanJ)
+      1.30  (2011-06-11)
+              added ability to load files via callbacks to accommodate custom input streams (Ben Wenger)
               removed deprecated format-specific test/load functions
-              removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway
-              error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha)
-              fix inefficiency in decoding 32-bit BMP (David Woo)
-      1.29  (2010-08-16)
-              various warning fixes from Aurelien Pocheville
-      1.28  (2010-08-01)
-              fix bug in GIF palette transparency (SpartanJ)
-      1.27  (2010-08-01)
-              cast-to-stbi_uc to fix warnings
-      1.26  (2010-07-24)
-              fix bug in file buffering for PNG reported by SpartanJ
-      1.25  (2010-07-17)
-              refix trans_data warning (Won Chun)
-      1.24  (2010-07-12)
-              perf improvements reading from files on platforms with lock-heavy fgetc()
-              minor perf improvements for jpeg
-              deprecated type-specific functions so we'll get feedback if they're needed
-              attempt to fix trans_data warning (Won Chun)
-      1.23    fixed bug in iPhone support
-      1.22  (2010-07-10)
-              removed image *writing* support
-              stbi_info support from Jetro Lauha
-              GIF support from Jean-Marc Lienher
+              removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway
+              error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha)
+              fix inefficiency in decoding 32-bit BMP (David Woo)
+      1.29  (2010-08-16)
+              various warning fixes from Aurelien Pocheville
+      1.28  (2010-08-01)
+              fix bug in GIF palette transparency (SpartanJ)
+      1.27  (2010-08-01)
+              cast-to-stbi_uc to fix warnings
+      1.26  (2010-07-24)
+              fix bug in file buffering for PNG reported by SpartanJ
+      1.25  (2010-07-17)
+              refix trans_data warning (Won Chun)
+      1.24  (2010-07-12)
+              perf improvements reading from files on platforms with lock-heavy fgetc()
+              minor perf improvements for jpeg
+              deprecated type-specific functions so we'll get feedback if they're needed
+              attempt to fix trans_data warning (Won Chun)
+      1.23    fixed bug in iPhone support
+      1.22  (2010-07-10)
+              removed image *writing* support
+              stbi_info support from Jetro Lauha
+              GIF support from Jean-Marc Lienher
               iPhone PNG-extensions from James Brown
-              warning-fixes from Nicolas Schulz and Janez Zemva (i.stbi__err. Janez (U+017D)emva)
-      1.21    fix use of 'stbi_uc' in header (reported by jon blow)
-      1.20    added support for Softimage PIC, by Tom Seddon
-      1.19    bug in interlaced PNG corruption check (found by ryg)
-      1.18  (2008-08-02)
-              fix a threading bug (local mutable static)
-      1.17    support interlaced PNG
-      1.16    major bugfix - stbi__convert_format converted one too many pixels
-      1.15    initialize some fields for thread safety
-      1.14    fix threadsafe conversion bug
-              header-file-only version (#define STBI_HEADER_FILE_ONLY before including)
+              warning-fixes from Nicolas Schulz and Janez Zemva (i.stbi__err. Janez (U+017D)emva)
+      1.21    fix use of 'stbi_uc' in header (reported by jon blow)
+      1.20    added support for Softimage PIC, by Tom Seddon
+      1.19    bug in interlaced PNG corruption check (found by ryg)
+      1.18  (2008-08-02)
+              fix a threading bug (local mutable static)
+      1.17    support interlaced PNG
+      1.16    major bugfix - stbi__convert_format converted one too many pixels
+      1.15    initialize some fields for thread safety
+      1.14    fix threadsafe conversion bug
+              header-file-only version (#define STBI_HEADER_FILE_ONLY before including)
       1.13    threadsafe
       1.12    const qualifiers in the API
       1.11    Support installable IDCT, colorspace conversion routines
@@ -7482,15 +8183,14 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user
       1.08    Thatcher Ulrich's PSD code integrated by Nicolas Schulz
       1.07    attempt to fix C++ warning/errors again
       1.06    attempt to fix C++ warning/errors again
-      1.05    fix TGA loading to return correct *comp and use good luminance calc
-      1.04    default float alpha is 1, not 255; use 'void *' for stbi_image_free
-      1.03    bugfixes to STBI_NO_STDIO, STBI_NO_HDR
-      1.02    support for (subset of) HDR files, float interface for preferred access to them
-      1.01    fix bug: possible bug in handling right-side up bmps... not sure
-              fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all
-      1.00    interface to zlib that skips zlib header
-      0.99    correct handling of alpha in palette
-      0.98    TGA loader by lonesock; dynamically add loaders (untested)
+      1.05    fix TGA loading to return correct *comp and use good luminance calc
+      1.04    default float alpha is 1, not 255; use 'void *' for stbi_image_free
+      1.03    bugfixes to STBI_NO_STDIO, STBI_NO_HDR
+      1.02    support for (subset of) HDR files, float interface for preferred access to them
+      1.01    fix bug: possible bug in handling right-side up bmps... not sure
+              fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all
+      1.00    interface to zlib that skips zlib header
+      0.99    correct handling of alpha in palette
+      0.98    TGA loader by lonesock; dynamically add loaders (untested)
       0.97    jpeg errors on too large a file; also catch another malloc failure
       0.96    fix detection of invalid v value - particleman@mollyrocket forum
       0.95    during header scan, seek to markers in case of padding
@@ -7503,8 +8203,8 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user
       0.60    fix compiling as c++
       0.59    fix warnings: merge Dave Moore's -Wall fixes
       0.58    fix bug: zlib uncompressed mode len/nlen was wrong endian
-      0.57    fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available
-      0.56    fix bug: zlib uncompressed mode len vs. nlen
+      0.57    fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available
+      0.56    fix bug: zlib uncompressed mode len vs. nlen
       0.55    fix bug: restart_interval not initialized to 0
       0.54    allow NULL for 'int *comp'
       0.53    fix bug in png 3->4; speedup png decoding
@@ -7515,7 +8215,6 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user
               first released version
 */
 
-
 /*
 ------------------------------------------------------------------------------
 This software is available under 2 licenses -- choose whichever you prefer.
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image_write.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image_write.h
index a9bf66c14e1f783eb505c2a09a8e762a5d9694dc..84b84981b44876c35c9bb6cce1af402ec302c3eb 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image_write.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image_write.h
@@ -26,11 +26,12 @@ BUILDING:
    You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
    malloc,realloc,free.
    You can #define STBIW_MEMMOVE() to replace memmove()
-   You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function
-   for PNG compression (instead of the builtin one), it must have the following signature:
-   unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality);
-   The returned data will be freed with STBIW_FREE() (free() by default),
-   so it must be heap allocated with STBIW_MALLOC() (malloc() by default),
+   You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress
+   function for PNG compression (instead of the builtin one); it must have the
+   following signature:
+      unsigned char *my_compress(unsigned char *data, int data_len, int *out_len, int quality);
+   The returned data will be freed with STBIW_FREE() (free() by default),
+   so it must be heap allocated with STBIW_MALLOC() (malloc() by default).
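+
+   For example, a minimal sketch that plugs in zlib itself ('my_compress' is
+   just an illustrative name; assumes you link against zlib and that the
+   quality value is a valid zlib level):
+
+      #define STBIW_ZLIB_COMPRESS my_compress
+      #include <zlib.h>
+
+      unsigned char *my_compress(unsigned char *data, int data_len,
+                                 int *out_len, int quality) {
+         uLongf n = compressBound(data_len);
+         unsigned char *buf = (unsigned char *)malloc(n); // STBIW_MALLOC by default
+         if (!buf || compress2(buf, &n, data, data_len, quality) != Z_OK) {
+            free(buf);
+            return NULL;
+         }
+         *out_len = (int)n;
+         return buf; // freed by the writer via STBIW_FREE()
+      }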
 
 UNICODE:
 
@@ -44,30 +45,37 @@ USAGE:
 
    There are five functions, one for each image file format:
 
-     int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
-     int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
-     int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
-     int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality);
-     int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
-
-     void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically
-
-   There are also five equivalent functions that use an arbitrary write function. You are
-   expected to open/close your file-equivalent before and after calling these:
-
-     int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
-     int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
-     int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
-     int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
-     int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
+     int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
+     int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
+     int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
+     int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality);
+     int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+
+     void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically
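+
+   For example, a minimal sketch (assuming 'pixels' points to w*h*4 bytes of
+   RGBA data with tightly packed rows):
+
+     stbi_write_png("out.png", w, h, 4, pixels, w * 4);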
+
+   There are also five equivalent functions that use an arbitrary write
+   function. You are expected to open/close your file-equivalent before and
+   after calling these:
+
+     int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data, int stride_in_bytes);
+     int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
+     int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
+     int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+     int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
 
    where the callback is:
       void stbi_write_func(void *context, void *data, int size);
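+
+   For example, a minimal sketch that collects the encoded image in a growing
+   memory buffer ('my_sink' and 'my_write' are illustrative names, not part
+   of the API; error handling omitted):
+
+     typedef struct { unsigned char *buf; int len; } my_sink;
+
+     void my_write(void *context, void *data, int size) {
+        my_sink *s = (my_sink *)context;
+        s->buf = (unsigned char *)realloc(s->buf, s->len + size);
+        memcpy(s->buf + s->len, data, size);
+        s->len += size;
+     }
+
+     my_sink sink = {0, 0};
+     stbi_write_png_to_func(my_write, &sink, w, h, 4, pixels, w * 4);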
 
    You can configure it with these global variables:
-      int stbi_write_tga_with_rle;             // defaults to true; set to 0 to disable RLE
-      int stbi_write_png_compression_level;    // defaults to 8; set to higher for more compression
-      int stbi_write_force_png_filter;         // defaults to -1; set to 0..5 to force a filter mode
+      int stbi_write_tga_with_rle;             // defaults to true; set to 0 to disable RLE
+      int stbi_write_png_compression_level;    // defaults to 8; set to higher for more compression
+      int stbi_write_force_png_filter;         // defaults to -1; set to 0..5 to force a filter mode
 
 
    You can define STBI_WRITE_NO_STDIO to disable the file variant of these
@@ -105,7 +113,7 @@ USAGE:
 
    TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
    data, set the global variable 'stbi_write_tga_with_rle' to 0.
-   
+
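+   For example:
+
+     stbi_write_tga_with_rle = 0; // write uncompressed TGA data
+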
    JPEG does ignore alpha channels in input data; quality is between 1 and 100.
    Higher quality looks better but results in a bigger image.
    JPEG baseline (no JPEG progressive).
@@ -113,7 +121,7 @@ USAGE:
 CREDITS:
 
 
-   Sean Barrett           -    PNG/BMP/TGA 
+   Sean Barrett           -    PNG/BMP/TGA
    Baldur Karlsson        -    HDR
    Jean-Sebastien Guay    -    TGA monochrome
    Tim Kelsey             -    misc enhancements
@@ -152,135 +160,147 @@ LICENSE
 
 #include <stdlib.h>
 
-// if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline'
+// if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline'
+// or 'static inline'
 #ifndef STBIWDEF
 #ifdef STB_IMAGE_WRITE_STATIC
-#define STBIWDEF  static
+#define STBIWDEF static
 #else
 #ifdef __cplusplus
-#define STBIWDEF  extern "C"
+#define STBIWDEF extern "C"
 #else
-#define STBIWDEF  extern
+#define STBIWDEF extern
 #endif
 #endif
 #endif
 
-#ifndef STB_IMAGE_WRITE_STATIC  // C++ forbids static forward declarations
+#ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations
 extern int stbi_write_tga_with_rle;
 extern int stbi_write_png_compression_level;
 extern int stbi_write_force_png_filter;
 #endif
 
 #ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void  *data, int stride_in_bytes);
-STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void  *data);
-STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void  *data);
-STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
-STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void  *data, int quality);
+STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp,
+                            const void *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp,
+                            const void *data);
+STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp,
+                            const void *data);
+STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp,
+                            const float *data);
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp,
+                            const void *data, int quality);
 
 #ifdef STBI_WINDOWS_UTF8
-STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
+STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen,
+                                         const wchar_t *input);
 #endif
 #endif
 
 typedef void stbi_write_func(void *context, void *data, int size);
 
-STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
-STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
-STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
-STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
-STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void  *data, int quality);
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w,
+                                    int h, int comp, const void *data,
+                                    int stride_in_bytes);
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w,
+                                    int h, int comp, const void *data);
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w,
+                                    int h, int comp, const void *data);
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w,
+                                    int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x,
+                                    int y, int comp, const void *data,
+                                    int quality);
 
 STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
 
-#endif//INCLUDE_STB_IMAGE_WRITE_H
+#endif // INCLUDE_STB_IMAGE_WRITE_H
 
 #ifdef STB_IMAGE_WRITE_IMPLEMENTATION
 
 #ifdef _WIN32
-   #ifndef _CRT_SECURE_NO_WARNINGS
-   #define _CRT_SECURE_NO_WARNINGS
-   #endif
-   #ifndef _CRT_NONSTDC_NO_DEPRECATE
-   #define _CRT_NONSTDC_NO_DEPRECATE
-   #endif
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+#ifndef _CRT_NONSTDC_NO_DEPRECATE
+#define _CRT_NONSTDC_NO_DEPRECATE
+#endif
 #endif
 
 #ifndef STBI_WRITE_NO_STDIO
 #include <stdio.h>
 #endif // STBI_WRITE_NO_STDIO
 
+#include <math.h>
 #include <stdarg.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
 
-#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
+#if defined(STBIW_MALLOC) && defined(STBIW_FREE) &&                            \
+    (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
 // ok
-#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
+#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) &&                        \
+    !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
 // ok
 #else
-#error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
+#error                                                                         \
+    "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
 #endif
 
 #ifndef STBIW_MALLOC
-#define STBIW_MALLOC(sz)        malloc(sz)
-#define STBIW_REALLOC(p,newsz)  realloc(p,newsz)
-#define STBIW_FREE(p)           free(p)
+#define STBIW_MALLOC(sz) malloc(sz)
+#define STBIW_REALLOC(p, newsz) realloc(p, newsz)
+#define STBIW_FREE(p) free(p)
 #endif
 
 #ifndef STBIW_REALLOC_SIZED
-#define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz)
+#define STBIW_REALLOC_SIZED(p, oldsz, newsz) STBIW_REALLOC(p, newsz)
 #endif
 
-
 #ifndef STBIW_MEMMOVE
-#define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz)
+#define STBIW_MEMMOVE(a, b, sz) memmove(a, b, sz)
 #endif
 
-
 #ifndef STBIW_ASSERT
 #include <assert.h>
 #define STBIW_ASSERT(x) assert(x)
 #endif
 
-#define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff)
+#define STBIW_UCHAR(x) (unsigned char)((x)&0xff)
 
 #ifdef STB_IMAGE_WRITE_STATIC
-static int stbi__flip_vertically_on_write=0;
+static int stbi__flip_vertically_on_write = 0;
 static int stbi_write_png_compression_level = 8;
 static int stbi_write_tga_with_rle = 1;
 static int stbi_write_force_png_filter = -1;
 #else
 int stbi_write_png_compression_level = 8;
-int stbi__flip_vertically_on_write=0;
+int stbi__flip_vertically_on_write = 0;
 int stbi_write_tga_with_rle = 1;
 int stbi_write_force_png_filter = -1;
 #endif
 
-STBIWDEF void stbi_flip_vertically_on_write(int flag)
-{
-   stbi__flip_vertically_on_write = flag;
+STBIWDEF void stbi_flip_vertically_on_write(int flag) {
+  stbi__flip_vertically_on_write = flag;
 }
 
-typedef struct
-{
-   stbi_write_func *func;
-   void *context;
+typedef struct {
+  stbi_write_func *func;
+  void *context;
 } stbi__write_context;
 
 // initialize a callback-based context
-static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context)
-{
-   s->func    = c;
-   s->context = context;
+static void stbi__start_write_callbacks(stbi__write_context *s,
+                                        stbi_write_func *c, void *context) {
+  s->func = c;
+  s->context = context;
 }
 
 #ifndef STBI_WRITE_NO_STDIO
 
-static void stbi__stdio_write(void *context, void *data, int size)
-{
-   fwrite(data,1,size,(FILE*) context);
+static void stbi__stdio_write(void *context, void *data, int size) {
+  fwrite(data, 1, size, (FILE *)context);
 }
 
 #if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
@@ -289,306 +309,332 @@ static void stbi__stdio_write(void *context, void *data, int size)
 #else
 #define STBIW_EXTERN extern
 #endif
-STBIW_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
-STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
-
-STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
-{
-	return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
+STBIW_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(
+    unsigned int cp, unsigned long flags, const char *str, int cbmb,
+    wchar_t *widestr, int cchwide);
+STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(
+    unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide,
+    char *str, int cbmb, const char *defchar, int *used_default);
+
+STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen,
+                                         const wchar_t *input) {
+  return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer,
+                             (int)bufferlen, NULL, NULL);
 }
 #endif
 
-static FILE *stbiw__fopen(char const *filename, char const *mode)
-{
-   FILE *f;
+static FILE *stbiw__fopen(char const *filename, char const *mode) {
+  FILE *f;
 #if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
-   wchar_t wMode[64];
-   wchar_t wFilename[1024];
-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)))
-      return 0;
-	
-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)))
-      return 0;
+  wchar_t wMode[64];
+  wchar_t wFilename[1024];
+  if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename,
+                               sizeof(wFilename)))
+    return 0;
+
+  if (0 ==
+      MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)))
+    return 0;
 
 #if _MSC_VER >= 1400
-	if (0 != _wfopen_s(&f, wFilename, wMode))
-		f = 0;
+  if (0 != _wfopen_s(&f, wFilename, wMode))
+    f = 0;
 #else
-   f = _wfopen(wFilename, wMode);
+  f = _wfopen(wFilename, wMode);
 #endif
 
 #elif defined(_MSC_VER) && _MSC_VER >= 1400
-   if (0 != fopen_s(&f, filename, mode))
-      f=0;
+  if (0 != fopen_s(&f, filename, mode))
+    f = 0;
 #else
-   f = fopen(filename, mode);
+  f = fopen(filename, mode);
 #endif
-   return f;
+  return f;
 }
 
-static int stbi__start_write_file(stbi__write_context *s, const char *filename)
-{
-   FILE *f = stbiw__fopen(filename, "wb");
-   stbi__start_write_callbacks(s, stbi__stdio_write, (void *) f);
-   return f != NULL;
+static int stbi__start_write_file(stbi__write_context *s,
+                                  const char *filename) {
+  FILE *f = stbiw__fopen(filename, "wb");
+  stbi__start_write_callbacks(s, stbi__stdio_write, (void *)f);
+  return f != NULL;
 }
 
-static void stbi__end_write_file(stbi__write_context *s)
-{
-   fclose((FILE *)s->context);
+static void stbi__end_write_file(stbi__write_context *s) {
+  fclose((FILE *)s->context);
 }
 
 #endif // !STBI_WRITE_NO_STDIO
 
 typedef unsigned int stbiw_uint32;
-typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 1 : -1];
-
-static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v)
-{
-   while (*fmt) {
-      switch (*fmt++) {
-         case ' ': break;
-         case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int));
-                     s->func(s->context,&x,1);
-                     break; }
-         case '2': { int x = va_arg(v,int);
-                     unsigned char b[2];
-                     b[0] = STBIW_UCHAR(x);
-                     b[1] = STBIW_UCHAR(x>>8);
-                     s->func(s->context,b,2);
-                     break; }
-         case '4': { stbiw_uint32 x = va_arg(v,int);
-                     unsigned char b[4];
-                     b[0]=STBIW_UCHAR(x);
-                     b[1]=STBIW_UCHAR(x>>8);
-                     b[2]=STBIW_UCHAR(x>>16);
-                     b[3]=STBIW_UCHAR(x>>24);
-                     s->func(s->context,b,4);
-                     break; }
-         default:
-            STBIW_ASSERT(0);
-            return;
-      }
-   }
+typedef int stb_image_write_test[sizeof(stbiw_uint32) == 4 ? 1 : -1];
+
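+// stbiw__writefv emits binary header fields driven by a tiny format string:
+// '1' writes one byte, '2' a 16-bit little-endian value, '4' a 32-bit
+// little-endian value; ' ' is ignored and only groups fields for readability.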
+static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v) {
+  while (*fmt) {
+    switch (*fmt++) {
+    case ' ':
+      break;
+    case '1': {
+      unsigned char x = STBIW_UCHAR(va_arg(v, int));
+      s->func(s->context, &x, 1);
+      break;
+    }
+    case '2': {
+      int x = va_arg(v, int);
+      unsigned char b[2];
+      b[0] = STBIW_UCHAR(x);
+      b[1] = STBIW_UCHAR(x >> 8);
+      s->func(s->context, b, 2);
+      break;
+    }
+    case '4': {
+      stbiw_uint32 x = va_arg(v, int);
+      unsigned char b[4];
+      b[0] = STBIW_UCHAR(x);
+      b[1] = STBIW_UCHAR(x >> 8);
+      b[2] = STBIW_UCHAR(x >> 16);
+      b[3] = STBIW_UCHAR(x >> 24);
+      s->func(s->context, b, 4);
+      break;
+    }
+    default:
+      STBIW_ASSERT(0);
+      return;
+    }
+  }
 }
 
-static void stbiw__writef(stbi__write_context *s, const char *fmt, ...)
-{
-   va_list v;
-   va_start(v, fmt);
-   stbiw__writefv(s, fmt, v);
-   va_end(v);
+static void stbiw__writef(stbi__write_context *s, const char *fmt, ...) {
+  va_list v;
+  va_start(v, fmt);
+  stbiw__writefv(s, fmt, v);
+  va_end(v);
 }
 
-static void stbiw__putc(stbi__write_context *s, unsigned char c)
-{
-   s->func(s->context, &c, 1);
+static void stbiw__putc(stbi__write_context *s, unsigned char c) {
+  s->func(s->context, &c, 1);
 }
 
-static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c)
-{
-   unsigned char arr[3];
-   arr[0] = a; arr[1] = b; arr[2] = c;
-   s->func(s->context, arr, 3);
+static void stbiw__write3(stbi__write_context *s, unsigned char a,
+                          unsigned char b, unsigned char c) {
+  unsigned char arr[3];
+  arr[0] = a;
+  arr[1] = b;
+  arr[2] = c;
+  s->func(s->context, arr, 3);
 }
 
-static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d)
-{
-   unsigned char bg[3] = { 255, 0, 255}, px[3];
-   int k;
-
-   if (write_alpha < 0)
-      s->func(s->context, &d[comp - 1], 1);
-
-   switch (comp) {
-      case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case
-      case 1:
-         if (expand_mono)
-            stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
-         else
-            s->func(s->context, d, 1);  // monochrome TGA
-         break;
-      case 4:
-         if (!write_alpha) {
-            // composite against pink background
-            for (k = 0; k < 3; ++k)
-               px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255;
-            stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
-            break;
-         }
-         /* FALLTHROUGH */
-      case 3:
-         stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
-         break;
-   }
-   if (write_alpha > 0)
-      s->func(s->context, &d[comp - 1], 1);
+static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp,
+                               int write_alpha, int expand_mono,
+                               unsigned char *d) {
+  unsigned char bg[3] = {255, 0, 255}, px[3];
+  int k;
+
+  if (write_alpha < 0)
+    s->func(s->context, &d[comp - 1], 1);
+
+  switch (comp) {
+  case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as
+          // 1-channel case
+  case 1:
+    if (expand_mono)
+      stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
+    else
+      s->func(s->context, d, 1); // monochrome TGA
+    break;
+  case 4:
+    if (!write_alpha) {
+      // composite against pink background
+      for (k = 0; k < 3; ++k)
+        px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255;
+      stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
+      break;
+    }
+    /* FALLTHROUGH */
+  case 3:
+    stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
+    break;
+  }
+  if (write_alpha > 0)
+    s->func(s->context, &d[comp - 1], 1);
 }
 
-static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
-{
-   stbiw_uint32 zero = 0;
-   int i,j, j_end;
-
-   if (y <= 0)
-      return;
-
-   if (stbi__flip_vertically_on_write)
-      vdir *= -1;
-
-   if (vdir < 0) {
-      j_end = -1; j = y-1;
-   } else {
-      j_end =  y; j = 0;
-   }
-
-   for (; j != j_end; j += vdir) {
-      for (i=0; i < x; ++i) {
-         unsigned char *d = (unsigned char *) data + (j*x+i)*comp;
-         stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
-      }
-      s->func(s->context, &zero, scanline_pad);
-   }
+static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir,
+                                int x, int y, int comp, void *data,
+                                int write_alpha, int scanline_pad,
+                                int expand_mono) {
+  stbiw_uint32 zero = 0;
+  int i, j, j_end;
+
+  if (y <= 0)
+    return;
+
+  if (stbi__flip_vertically_on_write)
+    vdir *= -1;
+
+  if (vdir < 0) {
+    j_end = -1;
+    j = y - 1;
+  } else {
+    j_end = y;
+    j = 0;
+  }
+
+  for (; j != j_end; j += vdir) {
+    for (i = 0; i < x; ++i) {
+      unsigned char *d = (unsigned char *)data + (j * x + i) * comp;
+      stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
+    }
+    s->func(s->context, &zero, scanline_pad);
+  }
 }
 
-static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...)
-{
-   if (y < 0 || x < 0) {
-      return 0;
-   } else {
-      va_list v;
-      va_start(v, fmt);
-      stbiw__writefv(s, fmt, v);
-      va_end(v);
-      stbiw__write_pixels(s,rgb_dir,vdir,x,y,comp,data,alpha,pad, expand_mono);
-      return 1;
-   }
+static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x,
+                          int y, int comp, int expand_mono, void *data,
+                          int alpha, int pad, const char *fmt, ...) {
+  if (y < 0 || x < 0) {
+    return 0;
+  } else {
+    va_list v;
+    va_start(v, fmt);
+    stbiw__writefv(s, fmt, v);
+    va_end(v);
+    stbiw__write_pixels(s, rgb_dir, vdir, x, y, comp, data, alpha, pad,
+                        expand_mono);
+    return 1;
+  }
 }
 
-static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data)
-{
-   int pad = (-x*3) & 3;
-   return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
-           "11 4 22 4" "4 44 22 444444",
-           'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
-            40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
+static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp,
+                               const void *data) {
+  int pad = (-x * 3) & 3;
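+  // each digit in the writef format string below is the byte width of the
+  // matching vararg; 24-bit rows are padded with zero bytes to a 4-byte
+  // boundary via 'pad'.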
+  return stbiw__outfile(s, -1, -1, x, y, comp, 1, (void *)data, 0, pad,
+                        "11 4 22 4"
+                        "4 44 22 444444",
+                        'B', 'M', 14 + 40 + (x * 3 + pad) * y, 0, 0,
+                        14 + 40,                            // file header
+                        40, x, y, 1, 24, 0, 0, 0, 0, 0, 0); // bitmap header
 }
 
-STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
-{
-   stbi__write_context s;
-   stbi__start_write_callbacks(&s, func, context);
-   return stbi_write_bmp_core(&s, x, y, comp, data);
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x,
+                                    int y, int comp, const void *data) {
+  stbi__write_context s;
+  stbi__start_write_callbacks(&s, func, context);
+  return stbi_write_bmp_core(&s, x, y, comp, data);
 }
 
 #ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
-{
-   stbi__write_context s;
-   if (stbi__start_write_file(&s,filename)) {
-      int r = stbi_write_bmp_core(&s, x, y, comp, data);
-      stbi__end_write_file(&s);
-      return r;
-   } else
-      return 0;
+STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp,
+                            const void *data) {
+  stbi__write_context s;
+  if (stbi__start_write_file(&s, filename)) {
+    int r = stbi_write_bmp_core(&s, x, y, comp, data);
+    stbi__end_write_file(&s);
+    return r;
+  } else
+    return 0;
 }
-#endif //!STBI_WRITE_NO_STDIO
-
-static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data)
-{
-   int has_alpha = (comp == 2 || comp == 4);
-   int colorbytes = has_alpha ? comp-1 : comp;
-   int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
-
-   if (y < 0 || x < 0)
-      return 0;
-
-   if (!stbi_write_tga_with_rle) {
-      return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0,
-         "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
-   } else {
-      int i,j,k;
-      int jend, jdir;
-
-      stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8);
-
-      if (stbi__flip_vertically_on_write) {
-         j = 0;
-         jend = y;
-         jdir = 1;
-      } else {
-         j = y-1;
-         jend = -1;
-         jdir = -1;
-      }
-      for (; j != jend; j += jdir) {
-         unsigned char *row = (unsigned char *) data + j * x * comp;
-         int len;
-
-         for (i = 0; i < x; i += len) {
-            unsigned char *begin = row + i * comp;
-            int diff = 1;
-            len = 1;
-
-            if (i < x - 1) {
-               ++len;
-               diff = memcmp(begin, row + (i + 1) * comp, comp);
-               if (diff) {
-                  const unsigned char *prev = begin;
-                  for (k = i + 2; k < x && len < 128; ++k) {
-                     if (memcmp(prev, row + k * comp, comp)) {
-                        prev += comp;
-                        ++len;
-                     } else {
-                        --len;
-                        break;
-                     }
-                  }
-               } else {
-                  for (k = i + 2; k < x && len < 128; ++k) {
-                     if (!memcmp(begin, row + k * comp, comp)) {
-                        ++len;
-                     } else {
-                        break;
-                     }
-                  }
-               }
+#endif // !STBI_WRITE_NO_STDIO
+
+static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp,
+                               void *data) {
+  int has_alpha = (comp == 2 || comp == 4);
+  int colorbytes = has_alpha ? comp - 1 : comp;
+  // 3 color channels (RGB/RGBA) = format 2, 1 color channel (Y/YA) = format 3
+  int format = colorbytes < 2 ? 3 : 2;
+
+  if (y < 0 || x < 0)
+    return 0;
+
+  if (!stbi_write_tga_with_rle) {
+    return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *)data, has_alpha, 0,
+                          "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y,
+                          (colorbytes + has_alpha) * 8, has_alpha * 8);
+  } else {
+    int i, j, k;
+    int jend, jdir;
+
+    stbiw__writef(s, "111 221 2222 11", 0, 0, format + 8, 0, 0, 0, 0, 0, x, y,
+                  (colorbytes + has_alpha) * 8, has_alpha * 8);
+
+    if (stbi__flip_vertically_on_write) {
+      j = 0;
+      jend = y;
+      jdir = 1;
+    } else {
+      j = y - 1;
+      jend = -1;
+      jdir = -1;
+    }
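+    // TGA RLE packets: a raw packet is a count byte (len - 1, high bit clear)
+    // followed by len literal pixels; a run packet sets the high bit
+    // (len - 129 wraps to 0x80 | (len - 1)) and stores the repeated pixel once.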
+    for (; j != jend; j += jdir) {
+      unsigned char *row = (unsigned char *)data + j * x * comp;
+      int len;
+
+      for (i = 0; i < x; i += len) {
+        unsigned char *begin = row + i * comp;
+        int diff = 1;
+        len = 1;
+
+        if (i < x - 1) {
+          ++len;
+          diff = memcmp(begin, row + (i + 1) * comp, comp);
+          if (diff) {
+            const unsigned char *prev = begin;
+            for (k = i + 2; k < x && len < 128; ++k) {
+              if (memcmp(prev, row + k * comp, comp)) {
+                prev += comp;
+                ++len;
+              } else {
+                --len;
+                break;
+              }
             }
-
-            if (diff) {
-               unsigned char header = STBIW_UCHAR(len - 1);
-               s->func(s->context, &header, 1);
-               for (k = 0; k < len; ++k) {
-                  stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
-               }
-            } else {
-               unsigned char header = STBIW_UCHAR(len - 129);
-               s->func(s->context, &header, 1);
-               stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
+          } else {
+            for (k = i + 2; k < x && len < 128; ++k) {
+              if (!memcmp(begin, row + k * comp, comp)) {
+                ++len;
+              } else {
+                break;
+              }
             }
-         }
+          }
+        }
+
+        if (diff) {
+          unsigned char header = STBIW_UCHAR(len - 1);
+          s->func(s->context, &header, 1);
+          for (k = 0; k < len; ++k) {
+            stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
+          }
+        } else {
+          unsigned char header = STBIW_UCHAR(len - 129);
+          s->func(s->context, &header, 1);
+          stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
+        }
       }
-   }
-   return 1;
+    }
+  }
+  return 1;
 }
 
-STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
-{
-   stbi__write_context s;
-   stbi__start_write_callbacks(&s, func, context);
-   return stbi_write_tga_core(&s, x, y, comp, (void *) data);
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x,
+                                    int y, int comp, const void *data) {
+  stbi__write_context s;
+  stbi__start_write_callbacks(&s, func, context);
+  return stbi_write_tga_core(&s, x, y, comp, (void *)data);
 }
 
 #ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
-{
-   stbi__write_context s;
-   if (stbi__start_write_file(&s,filename)) {
-      int r = stbi_write_tga_core(&s, x, y, comp, (void *) data);
-      stbi__end_write_file(&s);
-      return r;
-   } else
-      return 0;
+STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp,
+                            const void *data) {
+  stbi__write_context s;
+  if (stbi__start_write_file(&s, filename)) {
+    int r = stbi_write_tga_core(&s, x, y, comp, (void *)data);
+    stbi__end_write_file(&s);
+    return r;
+  } else
+    return 0;
 }
 #endif
 
@@ -596,934 +642,1213 @@ STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const
 // Radiance RGBE HDR writer
 // by Baldur Karlsson
 
-#define stbiw__max(a, b)  ((a) > (b) ? (a) : (b))
+#define stbiw__max(a, b) ((a) > (b) ? (a) : (b))
 
-static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
-{
-   int exponent;
-   float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));
+static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear) {
+  int exponent;
+  float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));
 
-   if (maxcomp < 1e-32f) {
-      rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
-   } else {
-      float normalize = (float) frexp(maxcomp, &exponent) * 256.0f/maxcomp;
+  if (maxcomp < 1e-32f) {
+    rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
+  } else {
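+    // frexp splits maxcomp into m * 2^exponent with m in [0.5, 1); scaling by
+    // m * 256 / maxcomp maps the largest channel into [128, 256), so the
+    // shared exponent (stored biased by 128) keeps full precision for it.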
+    float normalize = (float)frexp(maxcomp, &exponent) * 256.0f / maxcomp;
 
-      rgbe[0] = (unsigned char)(linear[0] * normalize);
-      rgbe[1] = (unsigned char)(linear[1] * normalize);
-      rgbe[2] = (unsigned char)(linear[2] * normalize);
-      rgbe[3] = (unsigned char)(exponent + 128);
-   }
+    rgbe[0] = (unsigned char)(linear[0] * normalize);
+    rgbe[1] = (unsigned char)(linear[1] * normalize);
+    rgbe[2] = (unsigned char)(linear[2] * normalize);
+    rgbe[3] = (unsigned char)(exponent + 128);
+  }
 }
 
-static void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte)
-{
-   unsigned char lengthbyte = STBIW_UCHAR(length+128);
-   STBIW_ASSERT(length+128 <= 255);
-   s->func(s->context, &lengthbyte, 1);
-   s->func(s->context, &databyte, 1);
+static void stbiw__write_run_data(stbi__write_context *s, int length,
+                                  unsigned char databyte) {
+  unsigned char lengthbyte = STBIW_UCHAR(length + 128);
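+  // RGBE run packet: a count byte with the high bit set (length + 128)
+  // followed by the single byte value to repeat.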
+  STBIW_ASSERT(length + 128 <= 255);
+  s->func(s->context, &lengthbyte, 1);
+  s->func(s->context, &databyte, 1);
 }
 
-static void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data)
-{
-   unsigned char lengthbyte = STBIW_UCHAR(length);
-   STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
-   s->func(s->context, &lengthbyte, 1);
-   s->func(s->context, data, length);
+static void stbiw__write_dump_data(stbi__write_context *s, int length,
+                                   unsigned char *data) {
+  unsigned char lengthbyte = STBIW_UCHAR(length);
+  // inconsistent with spec but consistent with the official Radiance code
+  STBIW_ASSERT(length <= 128);
+  s->func(s->context, &lengthbyte, 1);
+  s->func(s->context, data, length);
 }
 
-static void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline)
-{
-   unsigned char scanlineheader[4] = { 2, 2, 0, 0 };
-   unsigned char rgbe[4];
-   float linear[3];
-   int x;
-
-   scanlineheader[2] = (width&0xff00)>>8;
-   scanlineheader[3] = (width&0x00ff);
-
-   /* skip RLE for images too small or large */
-   if (width < 8 || width >= 32768) {
-      for (x=0; x < width; x++) {
-         switch (ncomp) {
-            case 4: /* fallthrough */
-            case 3: linear[2] = scanline[x*ncomp + 2];
-                    linear[1] = scanline[x*ncomp + 1];
-                    linear[0] = scanline[x*ncomp + 0];
-                    break;
-            default:
-                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
-                    break;
-         }
-         stbiw__linear_to_rgbe(rgbe, linear);
-         s->func(s->context, rgbe, 4);
+static void stbiw__write_hdr_scanline(stbi__write_context *s, int width,
+                                      int ncomp, unsigned char *scratch,
+                                      float *scanline) {
+  unsigned char scanlineheader[4] = {2, 2, 0, 0};
+  unsigned char rgbe[4];
+  float linear[3];
+  int x;
+
+  scanlineheader[2] = (width & 0xff00) >> 8;
+  scanlineheader[3] = (width & 0x00ff);
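+  // a scanline beginning 2, 2 with the width in the next two bytes marks the
+  // new-style adaptive-RLE encoding used in the else branch below.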
+
+  /* skip RLE for images too small or large */
+  if (width < 8 || width >= 32768) {
+    for (x = 0; x < width; x++) {
+      switch (ncomp) {
+      case 4: /* fallthrough */
+      case 3:
+        linear[2] = scanline[x * ncomp + 2];
+        linear[1] = scanline[x * ncomp + 1];
+        linear[0] = scanline[x * ncomp + 0];
+        break;
+      default:
+        linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0];
+        break;
       }
-   } else {
-      int c,r;
-      /* encode into scratch buffer */
-      for (x=0; x < width; x++) {
-         switch(ncomp) {
-            case 4: /* fallthrough */
-            case 3: linear[2] = scanline[x*ncomp + 2];
-                    linear[1] = scanline[x*ncomp + 1];
-                    linear[0] = scanline[x*ncomp + 0];
-                    break;
-            default:
-                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
-                    break;
-         }
-         stbiw__linear_to_rgbe(rgbe, linear);
-         scratch[x + width*0] = rgbe[0];
-         scratch[x + width*1] = rgbe[1];
-         scratch[x + width*2] = rgbe[2];
-         scratch[x + width*3] = rgbe[3];
+      stbiw__linear_to_rgbe(rgbe, linear);
+      s->func(s->context, rgbe, 4);
+    }
+  } else {
+    int c, r;
+    /* encode into scratch buffer */
+    for (x = 0; x < width; x++) {
+      switch (ncomp) {
+      case 4: /* fallthrough */
+      case 3:
+        linear[2] = scanline[x * ncomp + 2];
+        linear[1] = scanline[x * ncomp + 1];
+        linear[0] = scanline[x * ncomp + 0];
+        break;
+      default:
+        linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0];
+        break;
       }
-
-      s->func(s->context, scanlineheader, 4);
-
-      /* RLE each component separately */
-      for (c=0; c < 4; c++) {
-         unsigned char *comp = &scratch[width*c];
-
-         x = 0;
-         while (x < width) {
-            // find first run
-            r = x;
-            while (r+2 < width) {
-               if (comp[r] == comp[r+1] && comp[r] == comp[r+2])
-                  break;
-               ++r;
-            }
-            if (r+2 >= width)
-               r = width;
-            // dump up to first run
-            while (x < r) {
-               int len = r-x;
-               if (len > 128) len = 128;
-               stbiw__write_dump_data(s, len, &comp[x]);
-               x += len;
-            }
-            // if there's a run, output it
-            if (r+2 < width) { // same test as what we break out of in search loop, so only true if we break'd
-               // find next byte after run
-               while (r < width && comp[r] == comp[x])
-                  ++r;
-               // output run up to r
-               while (x < r) {
-                  int len = r-x;
-                  if (len > 127) len = 127;
-                  stbiw__write_run_data(s, len, comp[x]);
-                  x += len;
-               }
-            }
-         }
+      stbiw__linear_to_rgbe(rgbe, linear);
+      scratch[x + width * 0] = rgbe[0];
+      scratch[x + width * 1] = rgbe[1];
+      scratch[x + width * 2] = rgbe[2];
+      scratch[x + width * 3] = rgbe[3];
+    }
+
+    s->func(s->context, scanlineheader, 4);
+
+    /* RLE each component separately */
+    for (c = 0; c < 4; c++) {
+      unsigned char *comp = &scratch[width * c];
+
+      x = 0;
+      while (x < width) {
+        // find first run
+        r = x;
+        while (r + 2 < width) {
+          if (comp[r] == comp[r + 1] && comp[r] == comp[r + 2])
+            break;
+          ++r;
+        }
+        if (r + 2 >= width)
+          r = width;
+        // dump up to first run
+        while (x < r) {
+          int len = r - x;
+          if (len > 128)
+            len = 128;
+          stbiw__write_dump_data(s, len, &comp[x]);
+          x += len;
+        }
+        // if there's a run, output it
+        if (r + 2 < width) { // same test as the one we break out of in the
+                             // search loop, so only true if we hit that break
+          // find next byte after run
+          while (r < width && comp[r] == comp[x])
+            ++r;
+          // output run up to r
+          while (x < r) {
+            int len = r - x;
+            if (len > 127)
+              len = 127;
+            stbiw__write_run_data(s, len, comp[x]);
+            x += len;
+          }
+        }
       }
-   }
+    }
+  }
 }
 
-static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data)
-{
-   if (y <= 0 || x <= 0 || data == NULL)
-      return 0;
-   else {
-      // Each component is stored separately. Allocate scratch space for full output scanline.
-      unsigned char *scratch = (unsigned char *) STBIW_MALLOC(x*4);
-      int i, len;
-      char buffer[128];
-      char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
-      s->func(s->context, header, sizeof(header)-1);
+static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp,
+                               float *data) {
+  if (y <= 0 || x <= 0 || data == NULL)
+    return 0;
+  else {
+    // Each component is stored separately. Allocate scratch space for full
+    // output scanline.
+    unsigned char *scratch = (unsigned char *)STBIW_MALLOC(x * 4);
+    int i, len;
+    char buffer[128];
+    char header[] =
+        "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
+    s->func(s->context, header, sizeof(header) - 1);
 
 #ifdef __STDC_WANT_SECURE_LIB__
-      len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
+    len =
+        sprintf_s(buffer, sizeof(buffer),
+                  "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
 #else
-      len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
+    len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n",
+                  y, x);
 #endif
-      s->func(s->context, buffer, len);
-
-      for(i=0; i < y; i++)
-         stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*x*(stbi__flip_vertically_on_write ? y-1-i : i));
-      STBIW_FREE(scratch);
-      return 1;
-   }
+    s->func(s->context, buffer, len);
+
+    for (i = 0; i < y; i++)
+      stbiw__write_hdr_scanline(
+          s, x, comp, scratch,
+          data + comp * x * (stbi__flip_vertically_on_write ? y - 1 - i : i));
+    STBIW_FREE(scratch);
+    return 1;
+  }
 }
 
-STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data)
-{
-   stbi__write_context s;
-   stbi__start_write_callbacks(&s, func, context);
-   return stbi_write_hdr_core(&s, x, y, comp, (float *) data);
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x,
+                                    int y, int comp, const float *data) {
+  stbi__write_context s;
+  stbi__start_write_callbacks(&s, func, context);
+  return stbi_write_hdr_core(&s, x, y, comp, (float *)data);
 }
 
 #ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
-{
-   stbi__write_context s;
-   if (stbi__start_write_file(&s,filename)) {
-      int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data);
-      stbi__end_write_file(&s);
-      return r;
-   } else
-      return 0;
+STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp,
+                            const float *data) {
+  stbi__write_context s;
+  if (stbi__start_write_file(&s, filename)) {
+    int r = stbi_write_hdr_core(&s, x, y, comp, (float *)data);
+    stbi__end_write_file(&s);
+    return r;
+  } else
+    return 0;
 }
 #endif // STBI_WRITE_NO_STDIO
 
-
 //////////////////////////////////////////////////////////////////////////////
 //
 // PNG writer
 //
 
 #ifndef STBIW_ZLIB_COMPRESS
-// stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
-#define stbiw__sbraw(a) ((int *) (a) - 2)
-#define stbiw__sbm(a)   stbiw__sbraw(a)[0]
-#define stbiw__sbn(a)   stbiw__sbraw(a)[1]
-
-#define stbiw__sbneedgrow(a,n)  ((a)==0 || stbiw__sbn(a)+n >= stbiw__sbm(a))
-#define stbiw__sbmaybegrow(a,n) (stbiw__sbneedgrow(a,(n)) ? stbiw__sbgrow(a,n) : 0)
-#define stbiw__sbgrow(a,n)  stbiw__sbgrowf((void **) &(a), (n), sizeof(*(a)))
-
-#define stbiw__sbpush(a, v)      (stbiw__sbmaybegrow(a,1), (a)[stbiw__sbn(a)++] = (v))
-#define stbiw__sbcount(a)        ((a) ? stbiw__sbn(a) : 0)
-#define stbiw__sbfree(a)         ((a) ? STBIW_FREE(stbiw__sbraw(a)),0 : 0)
-
-static void *stbiw__sbgrowf(void **arr, int increment, int itemsize)
-{
-   int m = *arr ? 2*stbiw__sbm(*arr)+increment : increment+1;
-   void *p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * m + sizeof(int)*2);
-   STBIW_ASSERT(p);
-   if (p) {
-      if (!*arr) ((int *) p)[1] = 0;
-      *arr = (void *) ((int *) p + 2);
-      stbiw__sbm(*arr) = m;
-   }
-   return *arr;
+// stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount()
+// == vector<>::size()
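+// layout: two ints (capacity, then count) precede the elements; the user
+// pointer aims at the elements, so stbiw__sbraw() backs up over the header.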
+#define stbiw__sbraw(a) ((int *)(a)-2)
+#define stbiw__sbm(a) stbiw__sbraw(a)[0]
+#define stbiw__sbn(a) stbiw__sbraw(a)[1]
+
+#define stbiw__sbneedgrow(a, n) ((a) == 0 || stbiw__sbn(a) + n >= stbiw__sbm(a))
+#define stbiw__sbmaybegrow(a, n)                                               \
+  (stbiw__sbneedgrow(a, (n)) ? stbiw__sbgrow(a, n) : 0)
+#define stbiw__sbgrow(a, n) stbiw__sbgrowf((void **)&(a), (n), sizeof(*(a)))
+
+#define stbiw__sbpush(a, v)                                                    \
+  (stbiw__sbmaybegrow(a, 1), (a)[stbiw__sbn(a)++] = (v))
+#define stbiw__sbcount(a) ((a) ? stbiw__sbn(a) : 0)
+#define stbiw__sbfree(a) ((a) ? STBIW_FREE(stbiw__sbraw(a)), 0 : 0)
+
+static void *stbiw__sbgrowf(void **arr, int increment, int itemsize) {
+  int m = *arr ? 2 * stbiw__sbm(*arr) + increment : increment + 1;
+  void *p = STBIW_REALLOC_SIZED(
+      *arr ? stbiw__sbraw(*arr) : 0,
+      *arr ? (stbiw__sbm(*arr) * itemsize + sizeof(int) * 2) : 0,
+      itemsize * m + sizeof(int) * 2);
+  STBIW_ASSERT(p);
+  if (p) {
+    if (!*arr)
+      ((int *)p)[1] = 0;
+    *arr = (void *)((int *)p + 2);
+    stbiw__sbm(*arr) = m;
+  }
+  return *arr;
 }
 
-static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount)
-{
-   while (*bitcount >= 8) {
-      stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
-      *bitbuffer >>= 8;
-      *bitcount -= 8;
-   }
-   return data;
+static unsigned char *stbiw__zlib_flushf(unsigned char *data,
+                                         unsigned int *bitbuffer,
+                                         int *bitcount) {
+  while (*bitcount >= 8) {
+    stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
+    *bitbuffer >>= 8;
+    *bitcount -= 8;
+  }
+  return data;
 }
 
-static int stbiw__zlib_bitrev(int code, int codebits)
-{
-   int res=0;
-   while (codebits--) {
-      res = (res << 1) | (code & 1);
-      code >>= 1;
-   }
-   return res;
+static int stbiw__zlib_bitrev(int code, int codebits) {
+  int res = 0;
+  while (codebits--) {
+    res = (res << 1) | (code & 1);
+    code >>= 1;
+  }
+  return res;
 }
 
-static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b, int limit)
-{
-   int i;
-   for (i=0; i < limit && i < 258; ++i)
-      if (a[i] != b[i]) break;
-   return i;
+static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b,
+                                       int limit) {
+  int i;
+  for (i = 0; i < limit && i < 258; ++i)
+    if (a[i] != b[i])
+      break;
+  return i;
 }
 
-static unsigned int stbiw__zhash(unsigned char *data)
-{
-   stbiw_uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16);
-   hash ^= hash << 3;
-   hash += hash >> 5;
-   hash ^= hash << 4;
-   hash += hash >> 17;
-   hash ^= hash << 25;
-   hash += hash >> 6;
-   return hash;
+static unsigned int stbiw__zhash(unsigned char *data) {
+  stbiw_uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16);
+  hash ^= hash << 3;
+  hash += hash >> 5;
+  hash ^= hash << 4;
+  hash += hash >> 17;
+  hash ^= hash << 25;
+  hash += hash >> 6;
+  return hash;
 }
 
 #define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount))
-#define stbiw__zlib_add(code,codebits) \
-      (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())
-#define stbiw__zlib_huffa(b,c)  stbiw__zlib_add(stbiw__zlib_bitrev(b,c),c)
+#define stbiw__zlib_add(code, codebits)                                        \
+  (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())
+#define stbiw__zlib_huffa(b, c) stbiw__zlib_add(stbiw__zlib_bitrev(b, c), c)
 // default huffman tables
-#define stbiw__zlib_huff1(n)  stbiw__zlib_huffa(0x30 + (n), 8)
-#define stbiw__zlib_huff2(n)  stbiw__zlib_huffa(0x190 + (n)-144, 9)
-#define stbiw__zlib_huff3(n)  stbiw__zlib_huffa(0 + (n)-256,7)
-#define stbiw__zlib_huff4(n)  stbiw__zlib_huffa(0xc0 + (n)-280,8)
-#define stbiw__zlib_huff(n)  ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n))
-#define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n))
-
-#define stbiw__ZHASH   16384
+#define stbiw__zlib_huff1(n) stbiw__zlib_huffa(0x30 + (n), 8)
+#define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + (n)-144, 9)
+#define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + (n)-256, 7)
+#define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + (n)-280, 8)
+#define stbiw__zlib_huff(n)                                                    \
+  ((n) <= 143 ? stbiw__zlib_huff1(n)                                           \
+              : (n) <= 255 ? stbiw__zlib_huff2(n)                              \
+                           : (n) <= 279 ? stbiw__zlib_huff3(n)                 \
+                                        : stbiw__zlib_huff4(n))
+#define stbiw__zlib_huffb(n)                                                   \
+  ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n))
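+// the four stbiw__zlib_huffN ranges implement RFC 1951's fixed Huffman code:
+// literals 0..143 get 8-bit codes from 0x30, 144..255 9-bit codes from 0x190,
+// symbols 256..279 7-bit codes from 0, 280..287 8-bit codes from 0xC0, each
+// emitted LSB-first by stbiw__zlib_huffa.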
+
+#define stbiw__ZHASH 16384
 
 #endif // STBIW_ZLIB_COMPRESS
 
-STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality)
-{
+STBIWDEF unsigned char *stbi_zlib_compress(unsigned char *data, int data_len,
+                                           int *out_len, int quality) {
 #ifdef STBIW_ZLIB_COMPRESS
-   // user provided a zlib compress implementation, use that
-   return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
-#else // use builtin
-   static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 };
-   static unsigned char  lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,  4,  5,  5,  5,  5,  0 };
-   static unsigned short distc[]   = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 };
-   static unsigned char  disteb[]  = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 };
-   unsigned int bitbuf=0;
-   int i,j, bitcount=0;
-   unsigned char *out = NULL;
-   unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(unsigned char**));
-   if (hash_table == NULL)
-      return NULL;
-   if (quality < 5) quality = 5;
-
-   stbiw__sbpush(out, 0x78);   // DEFLATE 32K window
-   stbiw__sbpush(out, 0x5e);   // FLEVEL = 1
-   stbiw__zlib_add(1,1);  // BFINAL = 1
-   stbiw__zlib_add(1,2);  // BTYPE = 1 -- fixed huffman
-
-   for (i=0; i < stbiw__ZHASH; ++i)
-      hash_table[i] = NULL;
-
-   i=0;
-   while (i < data_len-3) {
-      // hash next 3 bytes of data to be compressed
-      int h = stbiw__zhash(data+i)&(stbiw__ZHASH-1), best=3;
-      unsigned char *bestloc = 0;
-      unsigned char **hlist = hash_table[h];
-      int n = stbiw__sbcount(hlist);
-      for (j=0; j < n; ++j) {
-         if (hlist[j]-data > i-32768) { // if entry lies within window
-            int d = stbiw__zlib_countm(hlist[j], data+i, data_len-i);
-            if (d >= best) { best=d; bestloc=hlist[j]; }
-         }
-      }
-      // when hash table entry is too long, delete half the entries
-      if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2*quality) {
-         STBIW_MEMMOVE(hash_table[h], hash_table[h]+quality, sizeof(hash_table[h][0])*quality);
-         stbiw__sbn(hash_table[h]) = quality;
-      }
-      stbiw__sbpush(hash_table[h],data+i);
-
-      if (bestloc) {
-         // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal
-         h = stbiw__zhash(data+i+1)&(stbiw__ZHASH-1);
-         hlist = hash_table[h];
-         n = stbiw__sbcount(hlist);
-         for (j=0; j < n; ++j) {
-            if (hlist[j]-data > i-32767) {
-               int e = stbiw__zlib_countm(hlist[j], data+i+1, data_len-i-1);
-               if (e > best) { // if next match is better, bail on current match
-                  bestloc = NULL;
-                  break;
-               }
-            }
-         }
+  // user provided a zlib compress implementation, use that
+  return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
+#else  // use builtin
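+  // lengthc/lengtheb and distc/disteb hold the DEFLATE base values and
+  // extra-bit counts for length codes 257..285 and distance codes 0..29; the
+  // trailing 259/32768 entries are sentinels for the linear searches below.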
+  static unsigned short lengthc[] = {
+      3,  4,  5,  6,  7,  8,  9,  10, 11,  13,  15,  17,  19,  23,  27,
+      31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 259};
+  static unsigned char lengtheb[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
+                                     1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
+                                     4, 4, 4, 4, 5, 5, 5, 5, 0};
+  static unsigned short distc[] = {
+      1,    2,    3,    4,    5,    7,     9,     13,    17,   25,   33,
+      49,   65,   97,   129,  193,  257,   385,   513,   769,  1025, 1537,
+      2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 32768};
+  static unsigned char disteb[] = {0, 0, 0,  0,  1,  1,  2,  2,  3,  3,
+                                   4, 4, 5,  5,  6,  6,  7,  7,  8,  8,
+                                   9, 9, 10, 10, 11, 11, 12, 12, 13, 13};
+  unsigned int bitbuf = 0;
+  int i, j, bitcount = 0;
+  unsigned char *out = NULL;
+  unsigned char ***hash_table =
+      (unsigned char ***)STBIW_MALLOC(stbiw__ZHASH * sizeof(unsigned char **));
+  if (hash_table == NULL)
+    return NULL;
+  if (quality < 5)
+    quality = 5;
+
+  stbiw__sbpush(out, 0x78); // DEFLATE 32K window
+  stbiw__sbpush(out, 0x5e); // FLEVEL = 1
+  stbiw__zlib_add(1, 1);    // BFINAL = 1
+  stbiw__zlib_add(1, 2);    // BTYPE = 1 -- fixed huffman
+
+  for (i = 0; i < stbiw__ZHASH; ++i)
+    hash_table[i] = NULL;
+
+  i = 0;
+  while (i < data_len - 3) {
+    // hash next 3 bytes of data to be compressed
+    int h = stbiw__zhash(data + i) & (stbiw__ZHASH - 1), best = 3;
+    unsigned char *bestloc = 0;
+    unsigned char **hlist = hash_table[h];
+    int n = stbiw__sbcount(hlist);
+    for (j = 0; j < n; ++j) {
+      if (hlist[j] - data > i - 32768) { // if entry lies within window
+        int d = stbiw__zlib_countm(hlist[j], data + i, data_len - i);
+        if (d >= best) {
+          best = d;
+          bestloc = hlist[j];
+        }
       }
-
-      if (bestloc) {
-         int d = (int) (data+i - bestloc); // distance back
-         STBIW_ASSERT(d <= 32767 && best <= 258);
-         for (j=0; best > lengthc[j+1]-1; ++j);
-         stbiw__zlib_huff(j+257);
-         if (lengtheb[j]) stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
-         for (j=0; d > distc[j+1]-1; ++j);
-         stbiw__zlib_add(stbiw__zlib_bitrev(j,5),5);
-         if (disteb[j]) stbiw__zlib_add(d - distc[j], disteb[j]);
-         i += best;
-      } else {
-         stbiw__zlib_huffb(data[i]);
-         ++i;
+    }
+    // when hash table entry is too long, delete half the entries
+    if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2 * quality) {
+      STBIW_MEMMOVE(hash_table[h], hash_table[h] + quality,
+                    sizeof(hash_table[h][0]) * quality);
+      stbiw__sbn(hash_table[h]) = quality;
+    }
+    stbiw__sbpush(hash_table[h], data + i);
+
+    if (bestloc) {
+      // "lazy matching" - check match at *next* byte, and if it's better, do
+      // cur byte as literal
+      h = stbiw__zhash(data + i + 1) & (stbiw__ZHASH - 1);
+      hlist = hash_table[h];
+      n = stbiw__sbcount(hlist);
+      for (j = 0; j < n; ++j) {
+        if (hlist[j] - data > i - 32767) {
+          int e = stbiw__zlib_countm(hlist[j], data + i + 1, data_len - i - 1);
+          if (e > best) { // if next match is better, bail on current match
+            bestloc = NULL;
+            break;
+          }
+        }
       }
-   }
-   // write out final bytes
-   for (;i < data_len; ++i)
+    }
+
+    if (bestloc) {
+      int d = (int)(data + i - bestloc); // distance back
+      STBIW_ASSERT(d <= 32767 && best <= 258);
+      for (j = 0; best > lengthc[j + 1] - 1; ++j)
+        ;
+      stbiw__zlib_huff(j + 257);
+      if (lengtheb[j])
+        stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
+      for (j = 0; d > distc[j + 1] - 1; ++j)
+        ;
+      stbiw__zlib_add(stbiw__zlib_bitrev(j, 5), 5);
+      if (disteb[j])
+        stbiw__zlib_add(d - distc[j], disteb[j]);
+      i += best;
+    } else {
       stbiw__zlib_huffb(data[i]);
-   stbiw__zlib_huff(256); // end of block
-   // pad with 0 bits to byte boundary
-   while (bitcount)
-      stbiw__zlib_add(0,1);
-
-   for (i=0; i < stbiw__ZHASH; ++i)
-      (void) stbiw__sbfree(hash_table[i]);
-   STBIW_FREE(hash_table);
-
-   {
-      // compute adler32 on input
-      unsigned int s1=1, s2=0;
-      int blocklen = (int) (data_len % 5552);
-      j=0;
-      while (j < data_len) {
-         for (i=0; i < blocklen; ++i) { s1 += data[j+i]; s2 += s1; }
-         s1 %= 65521; s2 %= 65521;
-         j += blocklen;
-         blocklen = 5552;
+      ++i;
+    }
+  }
+  // write out final bytes
+  for (; i < data_len; ++i)
+    stbiw__zlib_huffb(data[i]);
+  stbiw__zlib_huff(256); // end of block
+  // pad with 0 bits to byte boundary
+  while (bitcount)
+    stbiw__zlib_add(0, 1);
+
+  for (i = 0; i < stbiw__ZHASH; ++i)
+    (void)stbiw__sbfree(hash_table[i]);
+  STBIW_FREE(hash_table);
+
+  {
+    // compute adler32 on input
+    unsigned int s1 = 1, s2 = 0;
+    int blocklen = (int)(data_len % 5552);
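+    // process at most 5552 bytes per block: the largest count for which the
+    // 32-bit sums cannot overflow before the mod-65521 reduction (per zlib).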
+    j = 0;
+    while (j < data_len) {
+      for (i = 0; i < blocklen; ++i) {
+        s1 += data[j + i];
+        s2 += s1;
       }
-      stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));
-      stbiw__sbpush(out, STBIW_UCHAR(s2));
-      stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
-      stbiw__sbpush(out, STBIW_UCHAR(s1));
-   }
-   *out_len = stbiw__sbn(out);
-   // make returned pointer freeable
-   STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
-   return (unsigned char *) stbiw__sbraw(out);
+      s1 %= 65521;
+      s2 %= 65521;
+      j += blocklen;
+      blocklen = 5552;
+    }
+    stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));
+    stbiw__sbpush(out, STBIW_UCHAR(s2));
+    stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
+    stbiw__sbpush(out, STBIW_UCHAR(s1));
+  }
+  *out_len = stbiw__sbn(out);
+  // make returned pointer freeable
+  STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
+  return (unsigned char *)stbiw__sbraw(out);
 #endif // STBIW_ZLIB_COMPRESS
 }
 
-static unsigned int stbiw__crc32(unsigned char *buffer, int len)
-{
+static unsigned int stbiw__crc32(unsigned char *buffer, int len) {
 #ifdef STBIW_CRC32
-    return STBIW_CRC32(buffer, len);
+  return STBIW_CRC32(buffer, len);
 #else
-   static unsigned int crc_table[256] =
-   {
-      0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
-      0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
-      0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
-      0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
-      0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
-      0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
-      0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
-      0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
-      0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
-      0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
-      0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
-      0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
-      0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
-      0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
-      0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
-      0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
-      0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
-      0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
-      0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
-      0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
-      0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
-      0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
-      0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
-      0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
-      0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
-      0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
-      0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
-      0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
-      0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
-      0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
-      0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
-      0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
-   };
-
-   unsigned int crc = ~0u;
-   int i;
-   for (i=0; i < len; ++i)
-      crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
-   return ~crc;
+  static unsigned int crc_table[256] = {
+      0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F,
+      0xE963A535, 0x9E6495A3, 0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
+      0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2,
+      0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+      0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9,
+      0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
+      0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C,
+      0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+      0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423,
+      0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
+      0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x01DB7106,
+      0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+      0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D,
+      0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
+      0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950,
+      0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+      0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7,
+      0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
+      0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA,
+      0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+      0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81,
+      0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
+      0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84,
+      0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+      0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB,
+      0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
+      0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8, 0xA1D1937E,
+      0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+      0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55,
+      0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
+      0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28,
+      0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+      0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F,
+      0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
+      0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242,
+      0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+      0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69,
+      0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
+      0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC,
+      0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+      0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693,
+      0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
+      0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D};
+
+  unsigned int crc = ~0u;
+  int i;
+  for (i = 0; i < len; ++i)
+    crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
+  return ~crc;
 #endif
 }
 
-#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4)
-#define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v));
-#define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3])
+#define stbiw__wpng4(o, a, b, c, d)                                            \
+  ((o)[0] = STBIW_UCHAR(a), (o)[1] = STBIW_UCHAR(b), (o)[2] = STBIW_UCHAR(c),  \
+   (o)[3] = STBIW_UCHAR(d), (o) += 4)
+#define stbiw__wp32(data, v)                                                   \
+  stbiw__wpng4(data, (v) >> 24, (v) >> 16, (v) >> 8, (v));
+#define stbiw__wptag(data, s) stbiw__wpng4(data, s[0], s[1], s[2], s[3])
 
-static void stbiw__wpcrc(unsigned char **data, int len)
-{
-   unsigned int crc = stbiw__crc32(*data - len - 4, len+4);
-   stbiw__wp32(*data, crc);
+static void stbiw__wpcrc(unsigned char **data, int len) {
+  unsigned int crc = stbiw__crc32(*data - len - 4, len + 4);
+  stbiw__wp32(*data, crc);
 }
 
-static unsigned char stbiw__paeth(int a, int b, int c)
-{
-   int p = a + b - c, pa = abs(p-a), pb = abs(p-b), pc = abs(p-c);
-   if (pa <= pb && pa <= pc) return STBIW_UCHAR(a);
-   if (pb <= pc) return STBIW_UCHAR(b);
-   return STBIW_UCHAR(c);
+static unsigned char stbiw__paeth(int a, int b, int c) {
+  int p = a + b - c, pa = abs(p - a), pb = abs(p - b), pc = abs(p - c);
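+  // PNG Paeth predictor: return whichever of left (a), above (b), upper-left
+  // (c) is closest to the linear estimate p = a + b - c; ties prefer a, then b.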
+  if (pa <= pb && pa <= pc)
+    return STBIW_UCHAR(a);
+  if (pb <= pc)
+    return STBIW_UCHAR(b);
+  return STBIW_UCHAR(c);
 }
 
 // @OPTIMIZE: provide an option that always forces left-predict or paeth predict
-static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, int width, int height, int y, int n, int filter_type, signed char *line_buffer)
-{
-   static int mapping[] = { 0,1,2,3,4 };
-   static int firstmap[] = { 0,1,0,5,6 };
-   int *mymap = (y != 0) ? mapping : firstmap;
-   int i;
-   int type = mymap[filter_type];
-   unsigned char *z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height-1-y : y);
-   int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes;
-    
-   if (type==0) {
-      memcpy(line_buffer, z, width*n);
-      return;
-   }
-
-   // first loop isn't optimized since it's just one pixel    
-   for (i = 0; i < n; ++i) {
-      switch (type) {
-         case 1: line_buffer[i] = z[i]; break;
-         case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break;
-         case 3: line_buffer[i] = z[i] - (z[i-signed_stride]>>1); break;
-         case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-signed_stride],0)); break;
-         case 5: line_buffer[i] = z[i]; break;
-         case 6: line_buffer[i] = z[i]; break;
-      }
-   }
-   switch (type) {
-      case 1: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-n]; break;
-      case 2: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-signed_stride]; break;
-      case 3: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - ((z[i-n] + z[i-signed_stride])>>1); break;
-      case 4: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-signed_stride], z[i-signed_stride-n]); break;
-      case 5: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - (z[i-n]>>1); break;
-      case 6: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
-   }
+static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes,
+                                   int width, int height, int y, int n,
+                                   int filter_type, signed char *line_buffer) {
+  static int mapping[] = {0, 1, 2, 3, 4};
+  static int firstmap[] = {0, 1, 0, 5, 6};
+  int *mymap = (y != 0) ? mapping : firstmap;
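+  // row 0 has no prior scanline, so filters that reference it degrade there:
+  // up -> none, average -> left-only (internal type 5), Paeth -> left-only
+  // (internal type 6); the PNG filter byte written later is unaffected.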
+  int i;
+  int type = mymap[filter_type];
+  unsigned char *z =
+      pixels +
+      stride_bytes * (stbi__flip_vertically_on_write ? height - 1 - y : y);
+  int signed_stride =
+      stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes;
+
+  if (type == 0) {
+    memcpy(line_buffer, z, width * n);
+    return;
+  }
+
+  // first loop isn't optimized since it's just one pixel
+  for (i = 0; i < n; ++i) {
+    switch (type) {
+    case 1:
+      line_buffer[i] = z[i];
+      break;
+    case 2:
+      line_buffer[i] = z[i] - z[i - signed_stride];
+      break;
+    case 3:
+      line_buffer[i] = z[i] - (z[i - signed_stride] >> 1);
+      break;
+    case 4:
+      line_buffer[i] =
+          (signed char)(z[i] - stbiw__paeth(0, z[i - signed_stride], 0));
+      break;
+    case 5:
+      line_buffer[i] = z[i];
+      break;
+    case 6:
+      line_buffer[i] = z[i];
+      break;
+    }
+  }
+  switch (type) {
+  case 1:
+    for (i = n; i < width * n; ++i)
+      line_buffer[i] = z[i] - z[i - n];
+    break;
+  case 2:
+    for (i = n; i < width * n; ++i)
+      line_buffer[i] = z[i] - z[i - signed_stride];
+    break;
+  case 3:
+    for (i = n; i < width * n; ++i)
+      line_buffer[i] = z[i] - ((z[i - n] + z[i - signed_stride]) >> 1);
+    break;
+  case 4:
+    for (i = n; i < width * n; ++i)
+      line_buffer[i] = z[i] - stbiw__paeth(z[i - n], z[i - signed_stride],
+                                           z[i - signed_stride - n]);
+    break;
+  case 5:
+    for (i = n; i < width * n; ++i)
+      line_buffer[i] = z[i] - (z[i - n] >> 1);
+    break;
+  case 6:
+    for (i = n; i < width * n; ++i)
+      line_buffer[i] = z[i] - stbiw__paeth(z[i - n], 0, 0);
+    break;
+  }
 }
 
-STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len)
-{
-   int force_filter = stbi_write_force_png_filter;
-   int ctype[5] = { -1, 0, 4, 2, 6 };
-   unsigned char sig[8] = { 137,80,78,71,13,10,26,10 };
-   unsigned char *out,*o, *filt, *zlib;
-   signed char *line_buffer;
-   int j,zlen;
-
-   if (stride_bytes == 0)
-      stride_bytes = x * n;
-
-   if (force_filter >= 5) {
-      force_filter = -1;
-   }
-
-   filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0;
-   line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; }
-   for (j=0; j < y; ++j) {
-      int filter_type;
-      if (force_filter > -1) {
-         filter_type = force_filter;
-         stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, force_filter, line_buffer);
-      } else { // Estimate the best filter by running through all of them:
-         int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
-         for (filter_type = 0; filter_type < 5; filter_type++) {
-            stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, filter_type, line_buffer);
-
-            // Estimate the entropy of the line using this filter; the less, the better.
-            est = 0;
-            for (i = 0; i < x*n; ++i) {
-               est += abs((signed char) line_buffer[i]);
-            }
-            if (est < best_filter_val) {
-               best_filter_val = est;
-               best_filter = filter_type;
-            }
-         }
-         if (filter_type != best_filter) {  // If the last iteration already got us the best filter, don't redo it
-            stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, best_filter, line_buffer);
-            filter_type = best_filter;
-         }
+STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels,
+                                              int stride_bytes, int x, int y,
+                                              int n, int *out_len) {
+  int force_filter = stbi_write_force_png_filter;
+  int ctype[5] = {-1, 0, 4, 2, 6};
+  unsigned char sig[8] = {137, 80, 78, 71, 13, 10, 26, 10};
+  unsigned char *out, *o, *filt, *zlib;
+  signed char *line_buffer;
+  int j, zlen;
+
+  if (stride_bytes == 0)
+    stride_bytes = x * n;
+
+  if (force_filter >= 5) {
+    force_filter = -1;
+  }
+
+  filt = (unsigned char *)STBIW_MALLOC((x * n + 1) * y);
+  if (!filt)
+    return 0;
+  line_buffer = (signed char *)STBIW_MALLOC(x * n);
+  if (!line_buffer) {
+    STBIW_FREE(filt);
+    return 0;
+  }
+  for (j = 0; j < y; ++j) {
+    int filter_type;
+    if (force_filter > -1) {
+      filter_type = force_filter;
+      stbiw__encode_png_line((unsigned char *)(pixels), stride_bytes, x, y, j,
+                             n, force_filter, line_buffer);
+    } else { // Estimate the best filter by running through all of them:
+      int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
+      for (filter_type = 0; filter_type < 5; filter_type++) {
+        stbiw__encode_png_line((unsigned char *)(pixels), stride_bytes, x, y, j,
+                               n, filter_type, line_buffer);
+
+        // Estimate the entropy of the line using this filter; the less, the
+        // better.
+        est = 0;
+        for (i = 0; i < x * n; ++i) {
+          est += abs((signed char)line_buffer[i]);
+        }
+        if (est < best_filter_val) {
+          best_filter_val = est;
+          best_filter = filter_type;
+        }
+      }
+      if (filter_type != best_filter) { // If the last iteration already got us
+                                        // the best filter, don't redo it
+        stbiw__encode_png_line((unsigned char *)(pixels), stride_bytes, x, y, j,
+                               n, best_filter, line_buffer);
+        filter_type = best_filter;
       }
-      // when we get here, filter_type contains the filter type, and line_buffer contains the data
-      filt[j*(x*n+1)] = (unsigned char) filter_type;
-      STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n);
-   }
-   STBIW_FREE(line_buffer);
-   zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, stbi_write_png_compression_level);
-   STBIW_FREE(filt);
-   if (!zlib) return 0;
-
-   // each tag requires 12 bytes of overhead
-   out = (unsigned char *) STBIW_MALLOC(8 + 12+13 + 12+zlen + 12);
-   if (!out) return 0;
-   *out_len = 8 + 12+13 + 12+zlen + 12;
-
-   o=out;
-   STBIW_MEMMOVE(o,sig,8); o+= 8;
-   stbiw__wp32(o, 13); // header length
-   stbiw__wptag(o, "IHDR");
-   stbiw__wp32(o, x);
-   stbiw__wp32(o, y);
-   *o++ = 8;
-   *o++ = STBIW_UCHAR(ctype[n]);
-   *o++ = 0;
-   *o++ = 0;
-   *o++ = 0;
-   stbiw__wpcrc(&o,13);
-
-   stbiw__wp32(o, zlen);
-   stbiw__wptag(o, "IDAT");
-   STBIW_MEMMOVE(o, zlib, zlen);
-   o += zlen;
-   STBIW_FREE(zlib);
-   stbiw__wpcrc(&o, zlen);
-
-   stbiw__wp32(o,0);
-   stbiw__wptag(o, "IEND");
-   stbiw__wpcrc(&o,0);
-
-   STBIW_ASSERT(o == out + *out_len);
-
-   return out;
+    }
+    // when we get here, filter_type contains the filter type, and line_buffer
+    // contains the data
+    filt[j * (x * n + 1)] = (unsigned char)filter_type;
+    STBIW_MEMMOVE(filt + j * (x * n + 1) + 1, line_buffer, x * n);
+  }
+  STBIW_FREE(line_buffer);
+  zlib = stbi_zlib_compress(filt, y * (x * n + 1), &zlen,
+                            stbi_write_png_compression_level);
+  STBIW_FREE(filt);
+  if (!zlib)
+    return 0;
+
+  // each tag requires 12 bytes of overhead
+  out = (unsigned char *)STBIW_MALLOC(8 + 12 + 13 + 12 + zlen + 12);
+  if (!out)
+    return 0;
+  *out_len = 8 + 12 + 13 + 12 + zlen + 12;
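+  // layout: 8-byte PNG signature, IHDR (12 bytes overhead + 13 payload), one
+  // IDAT (12 + zlen), and an empty IEND; each chunk is length, tag, data, CRC.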
+
+  o = out;
+  STBIW_MEMMOVE(o, sig, 8);
+  o += 8;
+  stbiw__wp32(o, 13); // header length
+  stbiw__wptag(o, "IHDR");
+  stbiw__wp32(o, x);
+  stbiw__wp32(o, y);
+  *o++ = 8;
+  *o++ = STBIW_UCHAR(ctype[n]);
+  *o++ = 0;
+  *o++ = 0;
+  *o++ = 0;
+  stbiw__wpcrc(&o, 13);
+
+  stbiw__wp32(o, zlen);
+  stbiw__wptag(o, "IDAT");
+  STBIW_MEMMOVE(o, zlib, zlen);
+  o += zlen;
+  STBIW_FREE(zlib);
+  stbiw__wpcrc(&o, zlen);
+
+  stbiw__wp32(o, 0);
+  stbiw__wptag(o, "IEND");
+  stbiw__wpcrc(&o, 0);
+
+  STBIW_ASSERT(o == out + *out_len);
+
+  return out;
 }
 
 #ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes)
-{
-   FILE *f;
-   int len;
-   unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len);
-   if (png == NULL) return 0;
-
-   f = stbiw__fopen(filename, "wb");
-   if (!f) { STBIW_FREE(png); return 0; }
-   fwrite(png, 1, len, f);
-   fclose(f);
-   STBIW_FREE(png);
-   return 1;
+STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp,
+                            const void *data, int stride_bytes) {
+  FILE *f;
+  int len;
+  unsigned char *png = stbi_write_png_to_mem((const unsigned char *)data,
+                                             stride_bytes, x, y, comp, &len);
+  if (png == NULL)
+    return 0;
+
+  f = stbiw__fopen(filename, "wb");
+  if (!f) {
+    STBIW_FREE(png);
+    return 0;
+  }
+  fwrite(png, 1, len, f);
+  fclose(f);
+  STBIW_FREE(png);
+  return 1;
 }
 #endif
 
-STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes)
-{
-   int len;
-   unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len);
-   if (png == NULL) return 0;
-   func(context, png, len);
-   STBIW_FREE(png);
-   return 1;
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x,
+                                    int y, int comp, const void *data,
+                                    int stride_bytes) {
+  int len;
+  unsigned char *png = stbi_write_png_to_mem((const unsigned char *)data,
+                                             stride_bytes, x, y, comp, &len);
+  if (png == NULL)
+    return 0;
+  func(context, png, len);
+  STBIW_FREE(png);
+  return 1;
 }
 
-
 /* ***************************************************************************
  *
  * JPEG writer
  *
  * This is based on Jon Olick's jo_jpeg.cpp:
- * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
+ * public domain Simple, Minimalistic JPEG writer -
+ * http://www.jonolick.com/code.html
  */
 
-static const unsigned char stbiw__jpg_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18,
-      24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 };
-
-static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, int *bitCntP, const unsigned short *bs) {
-   int bitBuf = *bitBufP, bitCnt = *bitCntP;
-   bitCnt += bs[1];
-   bitBuf |= bs[0] << (24 - bitCnt);
-   while(bitCnt >= 8) {
-      unsigned char c = (bitBuf >> 16) & 255;
-      stbiw__putc(s, c);
-      if(c == 255) {
-         stbiw__putc(s, 0);
-      }
-      bitBuf <<= 8;
-      bitCnt -= 8;
-   }
-   *bitBufP = bitBuf;
-   *bitCntP = bitCnt;
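+// Maps the row-major DCT coefficient index to its position in the JPEG
+// zigzag scan order.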
+static const unsigned char stbiw__jpg_ZigZag[] = {
+    0,  1,  5,  6,  14, 15, 27, 28, 2,  4,  7,  13, 16, 26, 29, 42,
+    3,  8,  12, 17, 25, 30, 41, 43, 9,  11, 18, 24, 31, 40, 44, 53,
+    10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60,
+    21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63};
+
+static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP,
+                                 int *bitCntP, const unsigned short *bs) {
+  int bitBuf = *bitBufP, bitCnt = *bitCntP;
+  // bs[0] = Huffman code, bs[1] = its length in bits; codes are left-aligned
+  // into a 24-bit window and flushed one byte at a time.
+  bitCnt += bs[1];
+  bitBuf |= bs[0] << (24 - bitCnt);
+  while (bitCnt >= 8) {
+    unsigned char c = (bitBuf >> 16) & 255;
+    stbiw__putc(s, c);
+    if (c == 255) {
+      // JPEG byte stuffing: a 0xFF in the entropy-coded stream must be
+      // followed by 0x00 so decoders don't mistake it for a marker.
+      stbiw__putc(s, 0);
+    }
+    bitBuf <<= 8;
+    bitCnt -= 8;
+  }
+  *bitBufP = bitBuf;
+  *bitCntP = bitCnt;
 }
 
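+// 8-point 1-D DCT applied in place to one row or column. This appears to be
+// the scaled AAN-type factorization also used by libjpeg's float DCT; the
+// leftover per-frequency scale factors are folded into fdtbl via aasf below.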
-static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, float *d4p, float *d5p, float *d6p, float *d7p) {
-   float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p;
-   float z1, z2, z3, z4, z5, z11, z13;
-
-   float tmp0 = d0 + d7;
-   float tmp7 = d0 - d7;
-   float tmp1 = d1 + d6;
-   float tmp6 = d1 - d6;
-   float tmp2 = d2 + d5;
-   float tmp5 = d2 - d5;
-   float tmp3 = d3 + d4;
-   float tmp4 = d3 - d4;
-
-   // Even part
-   float tmp10 = tmp0 + tmp3;   // phase 2
-   float tmp13 = tmp0 - tmp3;
-   float tmp11 = tmp1 + tmp2;
-   float tmp12 = tmp1 - tmp2;
-
-   d0 = tmp10 + tmp11;       // phase 3
-   d4 = tmp10 - tmp11;
-
-   z1 = (tmp12 + tmp13) * 0.707106781f; // c4
-   d2 = tmp13 + z1;       // phase 5
-   d6 = tmp13 - z1;
-
-   // Odd part
-   tmp10 = tmp4 + tmp5;       // phase 2
-   tmp11 = tmp5 + tmp6;
-   tmp12 = tmp6 + tmp7;
-
-   // The rotator is modified from fig 4-8 to avoid extra negations.
-   z5 = (tmp10 - tmp12) * 0.382683433f; // c6
-   z2 = tmp10 * 0.541196100f + z5; // c2-c6
-   z4 = tmp12 * 1.306562965f + z5; // c2+c6
-   z3 = tmp11 * 0.707106781f; // c4
-
-   z11 = tmp7 + z3;      // phase 5
-   z13 = tmp7 - z3;
-
-   *d5p = z13 + z2;         // phase 6
-   *d3p = z13 - z2;
-   *d1p = z11 + z4;
-   *d7p = z11 - z4;
-
-   *d0p = d0;  *d2p = d2;  *d4p = d4;  *d6p = d6;
+static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p,
+                           float *d4p, float *d5p, float *d6p, float *d7p) {
+  float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p,
+        d6 = *d6p, d7 = *d7p;
+  float z1, z2, z3, z4, z5, z11, z13;
+
+  float tmp0 = d0 + d7;
+  float tmp7 = d0 - d7;
+  float tmp1 = d1 + d6;
+  float tmp6 = d1 - d6;
+  float tmp2 = d2 + d5;
+  float tmp5 = d2 - d5;
+  float tmp3 = d3 + d4;
+  float tmp4 = d3 - d4;
+
+  // Even part
+  float tmp10 = tmp0 + tmp3; // phase 2
+  float tmp13 = tmp0 - tmp3;
+  float tmp11 = tmp1 + tmp2;
+  float tmp12 = tmp1 - tmp2;
+
+  d0 = tmp10 + tmp11; // phase 3
+  d4 = tmp10 - tmp11;
+
+  z1 = (tmp12 + tmp13) * 0.707106781f; // c4
+  d2 = tmp13 + z1;                     // phase 5
+  d6 = tmp13 - z1;
+
+  // Odd part
+  tmp10 = tmp4 + tmp5; // phase 2
+  tmp11 = tmp5 + tmp6;
+  tmp12 = tmp6 + tmp7;
+
+  // The rotator is modified from fig 4-8 to avoid extra negations.
+  z5 = (tmp10 - tmp12) * 0.382683433f; // c6
+  z2 = tmp10 * 0.541196100f + z5;      // c2-c6
+  z4 = tmp12 * 1.306562965f + z5;      // c2+c6
+  z3 = tmp11 * 0.707106781f;           // c4
+
+  z11 = tmp7 + z3; // phase 5
+  z13 = tmp7 - z3;
+
+  *d5p = z13 + z2; // phase 6
+  *d3p = z13 - z2;
+  *d1p = z11 + z4;
+  *d7p = z11 - z4;
+
+  *d0p = d0;
+  *d2p = d2;
+  *d4p = d4;
+  *d6p = d6;
 }
 
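+// Encodes val as a JPEG (size, amplitude) pair: bits[1] is the bit count of
+// |val|, bits[0] holds the amplitude bits, with negative values stored as
+// val - 1 masked to bits[1] bits (one's-complement convention).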
 static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
-   int tmp1 = val < 0 ? -val : val;
-   val = val < 0 ? val-1 : val;
-   bits[1] = 1;
-   while(tmp1 >>= 1) {
-      ++bits[1];
-   }
-   bits[0] = val & ((1<<bits[1])-1);
+  int tmp1 = val < 0 ? -val : val;
+  val = val < 0 ? val - 1 : val;
+  bits[1] = 1;
+  while (tmp1 >>= 1) {
+    ++bits[1];
+  }
+  bits[0] = val & ((1 << bits[1]) - 1);
 }
 
-static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2]) {
-   const unsigned short EOB[2] = { HTAC[0x00][0], HTAC[0x00][1] };
-   const unsigned short M16zeroes[2] = { HTAC[0xF0][0], HTAC[0xF0][1] };
-   int dataOff, i, diff, end0pos;
-   int DU[64];
-
-   // DCT rows
-   for(dataOff=0; dataOff<64; dataOff+=8) {
-      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+1], &CDU[dataOff+2], &CDU[dataOff+3], &CDU[dataOff+4], &CDU[dataOff+5], &CDU[dataOff+6], &CDU[dataOff+7]);
-   }
-   // DCT columns
-   for(dataOff=0; dataOff<8; ++dataOff) {
-      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+8], &CDU[dataOff+16], &CDU[dataOff+24], &CDU[dataOff+32], &CDU[dataOff+40], &CDU[dataOff+48], &CDU[dataOff+56]);
-   }
-   // Quantize/descale/zigzag the coefficients
-   for(i=0; i<64; ++i) {
-      float v = CDU[i]*fdtbl[i];
-      // DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
-      // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
-      DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
-   }
-
-   // Encode DC
-   diff = DU[0] - DC;
-   if (diff == 0) {
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
-   } else {
-      unsigned short bits[2];
-      stbiw__jpg_calcBits(diff, bits);
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
-   }
-   // Encode ACs
-   end0pos = 63;
-   for(; (end0pos>0)&&(DU[end0pos]==0); --end0pos) {
-   }
-   // end0pos = first element in reverse order !=0
-   if(end0pos == 0) {
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
-      return DU[0];
-   }
-   for(i = 1; i <= end0pos; ++i) {
-      int startpos = i;
-      int nrzeroes;
-      unsigned short bits[2];
-      for (; DU[i]==0 && i<=end0pos; ++i) {
-      }
-      nrzeroes = i-startpos;
-      if ( nrzeroes >= 16 ) {
-         int lng = nrzeroes>>4;
-         int nrmarker;
-         for (nrmarker=1; nrmarker <= lng; ++nrmarker)
-            stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
-         nrzeroes &= 15;
-      }
-      stbiw__jpg_calcBits(DU[i], bits);
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes<<4)+bits[1]]);
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
-   }
-   if(end0pos != 63) {
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
-   }
-   return DU[0];
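+// Encodes one 8x8 data unit: 2-D DCT (rows then columns), quantization into
+// zigzag order, Huffman coding of the DC difference, then run-length coded
+// AC coefficients. Returns this block's DC value for the caller's prediction.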
+static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf,
+                                int *bitCnt, float *CDU, float *fdtbl, int DC,
+                                const unsigned short HTDC[256][2],
+                                const unsigned short HTAC[256][2]) {
+  const unsigned short EOB[2] = {HTAC[0x00][0], HTAC[0x00][1]};
+  const unsigned short M16zeroes[2] = {HTAC[0xF0][0], HTAC[0xF0][1]};
+  int dataOff, i, diff, end0pos;
+  int DU[64];
+
+  // DCT rows
+  for (dataOff = 0; dataOff < 64; dataOff += 8) {
+    stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + 1], &CDU[dataOff + 2],
+                   &CDU[dataOff + 3], &CDU[dataOff + 4], &CDU[dataOff + 5],
+                   &CDU[dataOff + 6], &CDU[dataOff + 7]);
+  }
+  // DCT columns
+  for (dataOff = 0; dataOff < 8; ++dataOff) {
+    stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + 8], &CDU[dataOff + 16],
+                   &CDU[dataOff + 24], &CDU[dataOff + 32], &CDU[dataOff + 40],
+                   &CDU[dataOff + 48], &CDU[dataOff + 56]);
+  }
+  // Quantize/descale/zigzag the coefficients
+  for (i = 0; i < 64; ++i) {
+    float v = CDU[i] * fdtbl[i];
+    // DU[stbiw__jpg_ZigZag[i]] =
+    //     (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
+    // ceilf() and floorf() are C99, not C89, but I /think/ they're not
+    // needed here anyway?
+    DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
+  }
+
+  // Encode DC
+  diff = DU[0] - DC;
+  if (diff == 0) {
+    stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
+  } else {
+    unsigned short bits[2];
+    stbiw__jpg_calcBits(diff, bits);
+    stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);
+    stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+  }
+  // Encode ACs
+  end0pos = 63;
+  for (; (end0pos > 0) && (DU[end0pos] == 0); --end0pos) {
+  }
+  // end0pos = index of the last non-zero AC coefficient
+  if (end0pos == 0) {
+    stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+    return DU[0];
+  }
+  for (i = 1; i <= end0pos; ++i) {
+    int startpos = i;
+    int nrzeroes;
+    unsigned short bits[2];
+    for (; DU[i] == 0 && i <= end0pos; ++i) {
+    }
+    nrzeroes = i - startpos;
+    if (nrzeroes >= 16) {
+      int lng = nrzeroes >> 4;
+      int nrmarker;
+      for (nrmarker = 1; nrmarker <= lng; ++nrmarker)
+        stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
+      nrzeroes &= 15;
+    }
+    stbiw__jpg_calcBits(DU[i], bits);
+    stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes << 4) + bits[1]]);
+    stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+  }
+  if (end0pos != 63) {
+    stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+  }
+  return DU[0];
 }
 
-static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
-   // Constants that don't pollute global namespace
-   static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
-   static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
-   static const unsigned char std_ac_luminance_nrcodes[] = {0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d};
-   static const unsigned char std_ac_luminance_values[] = {
-      0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,
-      0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28,
-      0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,
-      0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
-      0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,
-      0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2,
-      0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
-   };
-   static const unsigned char std_dc_chrominance_nrcodes[] = {0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0};
-   static const unsigned char std_dc_chrominance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
-   static const unsigned char std_ac_chrominance_nrcodes[] = {0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77};
-   static const unsigned char std_ac_chrominance_values[] = {
-      0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91,
-      0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26,
-      0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,
-      0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87,
-      0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,
-      0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,
-      0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
-   };
-   // Huffman tables
-   static const unsigned short YDC_HT[256][2] = { {0,2},{2,3},{3,3},{4,3},{5,3},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9}};
-   static const unsigned short UVDC_HT[256][2] = { {0,2},{1,2},{2,2},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9},{1022,10},{2046,11}};
-   static const unsigned short YAC_HT[256][2] = {
-      {10,4},{0,2},{1,2},{4,3},{11,4},{26,5},{120,7},{248,8},{1014,10},{65410,16},{65411,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {12,4},{27,5},{121,7},{502,9},{2038,11},{65412,16},{65413,16},{65414,16},{65415,16},{65416,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {28,5},{249,8},{1015,10},{4084,12},{65417,16},{65418,16},{65419,16},{65420,16},{65421,16},{65422,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {58,6},{503,9},{4085,12},{65423,16},{65424,16},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {59,6},{1016,10},{65430,16},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {122,7},{2039,11},{65438,16},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {123,7},{4086,12},{65446,16},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {250,8},{4087,12},{65454,16},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {504,9},{32704,15},{65462,16},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {505,9},{65470,16},{65471,16},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {506,9},{65479,16},{65480,16},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {1017,10},{65488,16},{65489,16},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {1018,10},{65497,16},{65498,16},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {2040,11},{65506,16},{65507,16},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {65515,16},{65516,16},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {2041,11},{65525,16},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
-   };
-   static const unsigned short UVAC_HT[256][2] = {
-      {0,2},{1,2},{4,3},{10,4},{24,5},{25,5},{56,6},{120,7},{500,9},{1014,10},{4084,12},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {11,4},{57,6},{246,8},{501,9},{2038,11},{4085,12},{65416,16},{65417,16},{65418,16},{65419,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {26,5},{247,8},{1015,10},{4086,12},{32706,15},{65420,16},{65421,16},{65422,16},{65423,16},{65424,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {27,5},{248,8},{1016,10},{4087,12},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{65430,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {58,6},{502,9},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{65438,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {59,6},{1017,10},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{65446,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {121,7},{2039,11},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{65454,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {122,7},{2040,11},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{65462,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {249,8},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{65470,16},{65471,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {503,9},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{65479,16},{65480,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {504,9},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{65488,16},{65489,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {505,9},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{65497,16},{65498,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {506,9},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{65506,16},{65507,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {2041,11},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{65515,16},{65516,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {16352,14},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{65525,16},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {1018,10},{32707,15},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
-   };
-   static const int YQT[] = {16,11,10,16,24,40,51,61,12,12,14,19,26,58,60,55,14,13,16,24,40,57,69,56,14,17,22,29,51,87,80,62,18,22,
-                             37,56,68,109,103,77,24,35,55,64,81,104,113,92,49,64,78,87,103,121,120,101,72,92,95,98,112,100,103,99};
-   static const int UVQT[] = {17,18,24,47,99,99,99,99,18,21,26,66,99,99,99,99,24,26,56,99,99,99,99,99,47,66,99,99,99,99,99,99,
-                              99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99};
-   static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f, 
-                                 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f };
-
-   int row, col, i, k;
-   float fdtbl_Y[64], fdtbl_UV[64];
-   unsigned char YTable[64], UVTable[64];
-
-   if(!data || !width || !height || comp > 4 || comp < 1) {
-      return 0;
-   }
-
-   quality = quality ? quality : 90;
-   quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
-   quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
-
-   for(i = 0; i < 64; ++i) {
-      int uvti, yti = (YQT[i]*quality+50)/100;
-      YTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (yti < 1 ? 1 : yti > 255 ? 255 : yti);
-      uvti = (UVQT[i]*quality+50)/100;
-      UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
-   }
-
-   for(row = 0, k = 0; row < 8; ++row) {
-      for(col = 0; col < 8; ++col, ++k) {
-         fdtbl_Y[k]  = 1 / (YTable [stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
-         fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
-      }
-   }
-
-   // Write Headers
-   {
-      static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 };
-      static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 };
-      const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width),
-                                      3,1,0x11,0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 };
-      s->func(s->context, (void*)head0, sizeof(head0));
-      s->func(s->context, (void*)YTable, sizeof(YTable));
-      stbiw__putc(s, 1);
-      s->func(s->context, UVTable, sizeof(UVTable));
-      s->func(s->context, (void*)head1, sizeof(head1));
-      s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1);
-      s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
-      stbiw__putc(s, 0x10); // HTYACinfo
-      s->func(s->context, (void*)(std_ac_luminance_nrcodes+1), sizeof(std_ac_luminance_nrcodes)-1);
-      s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values));
-      stbiw__putc(s, 1); // HTUDCinfo
-      s->func(s->context, (void*)(std_dc_chrominance_nrcodes+1), sizeof(std_dc_chrominance_nrcodes)-1);
-      s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
-      stbiw__putc(s, 0x11); // HTUACinfo
-      s->func(s->context, (void*)(std_ac_chrominance_nrcodes+1), sizeof(std_ac_chrominance_nrcodes)-1);
-      s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
-      s->func(s->context, (void*)head2, sizeof(head2));
-   }
-
-   // Encode 8x8 macroblocks
-   {
-      static const unsigned short fillBits[] = {0x7F, 7};
-      const unsigned char *imageData = (const unsigned char *)data;
-      int DCY=0, DCU=0, DCV=0;
-      int bitBuf=0, bitCnt=0;
-      // comp == 2 is grey+alpha (alpha is ignored)
-      int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
-      int x, y, pos;
-      for(y = 0; y < height; y += 8) {
-         for(x = 0; x < width; x += 8) {
-            float YDU[64], UDU[64], VDU[64];
-            for(row = y, pos = 0; row < y+8; ++row) {
-               // row >= height => use last input row
-               int clamped_row = (row < height) ? row : height - 1;
-               int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
-               for(col = x; col < x+8; ++col, ++pos) {
-                  float r, g, b;
-                  // if col >= width => use pixel from last input column
-                  int p = base_p + ((col < width) ? col : (width-1))*comp;
-
-                  r = imageData[p+0];
-                  g = imageData[p+ofsG];
-                  b = imageData[p+ofsB];
-                  YDU[pos]=+0.29900f*r+0.58700f*g+0.11400f*b-128;
-                  UDU[pos]=-0.16874f*r-0.33126f*g+0.50000f*b;
-                  VDU[pos]=+0.50000f*r-0.41869f*g-0.08131f*b;
-               }
-            }
-
-            DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, YDU, fdtbl_Y, DCY, YDC_HT, YAC_HT);
-            DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, UDU, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
-            DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, VDU, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
-         }
+static int stbi_write_jpg_core(stbi__write_context *s, int width, int height,
+                               int comp, const void *data, int quality) {
+  // Constants that don't pollute global namespace
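+  // (bit-length counts and symbol values of the default Huffman tables from
+  // the JPEG spec, Annex K)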
+  static const unsigned char std_dc_luminance_nrcodes[] = {
+      0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0};
+  static const unsigned char std_dc_luminance_values[] = {0, 1, 2, 3, 4,  5,
+                                                          6, 7, 8, 9, 10, 11};
+  static const unsigned char std_ac_luminance_nrcodes[] = {
+      0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d};
+  static const unsigned char std_ac_luminance_values[] = {
+      0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06,
+      0x13, 0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
+      0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0, 0x24, 0x33, 0x62, 0x72,
+      0x82, 0x09, 0x0a, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
+      0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45,
+      0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+      0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75,
+      0x76, 0x77, 0x78, 0x79, 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+      0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3,
+      0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
+      0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9,
+      0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
+      0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf1, 0xf2, 0xf3, 0xf4,
+      0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa};
+  static const unsigned char std_dc_chrominance_nrcodes[] = {
+      0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0};
+  static const unsigned char std_dc_chrominance_values[] = {0, 1, 2, 3, 4,  5,
+                                                            6, 7, 8, 9, 10, 11};
+  static const unsigned char std_ac_chrominance_nrcodes[] = {
+      0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77};
+  static const unsigned char std_ac_chrominance_values[] = {
+      0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, 0x31, 0x06, 0x12, 0x41,
+      0x51, 0x07, 0x61, 0x71, 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
+      0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0, 0x15, 0x62, 0x72, 0xd1,
+      0x0a, 0x16, 0x24, 0x34, 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
+      0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44,
+      0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
+      0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74,
+      0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+      0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a,
+      0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
+      0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+      0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
+      0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4,
+      0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa};
+  // Huffman tables
+  static const unsigned short YDC_HT[256][2] = {
+      {0, 2},  {2, 3},  {3, 3},  {4, 3},   {5, 3},   {6, 3},
+      {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}};
+  static const unsigned short UVDC_HT[256][2] = {
+      {0, 2},  {1, 2},   {2, 2},   {6, 3},   {14, 4},    {30, 5},
+      {62, 6}, {126, 7}, {254, 8}, {510, 9}, {1022, 10}, {2046, 11}};
+  static const unsigned short YAC_HT[256][2] = {
+      {10, 4},     {0, 2},      {1, 2},      {4, 3},      {11, 4},
+      {26, 5},     {120, 7},    {248, 8},    {1014, 10},  {65410, 16},
+      {65411, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {12, 4},     {27, 5},     {121, 7},
+      {502, 9},    {2038, 11},  {65412, 16}, {65413, 16}, {65414, 16},
+      {65415, 16}, {65416, 16}, {0, 0},      {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {28, 5},     {249, 8},
+      {1015, 10},  {4084, 12},  {65417, 16}, {65418, 16}, {65419, 16},
+      {65420, 16}, {65421, 16}, {65422, 16}, {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {58, 6},
+      {503, 9},    {4085, 12},  {65423, 16}, {65424, 16}, {65425, 16},
+      {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {59, 6},     {1016, 10},  {65430, 16}, {65431, 16}, {65432, 16},
+      {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {0, 0},      {122, 7},    {2039, 11},  {65438, 16}, {65439, 16},
+      {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16},
+      {65445, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {123, 7},    {4086, 12},  {65446, 16},
+      {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16},
+      {65452, 16}, {65453, 16}, {0, 0},      {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {250, 8},    {4087, 12},
+      {65454, 16}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16},
+      {65459, 16}, {65460, 16}, {65461, 16}, {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {504, 9},
+      {32704, 15}, {65462, 16}, {65463, 16}, {65464, 16}, {65465, 16},
+      {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {505, 9},    {65470, 16}, {65471, 16}, {65472, 16}, {65473, 16},
+      {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {0, 0},      {506, 9},    {65479, 16}, {65480, 16}, {65481, 16},
+      {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16},
+      {65487, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {1017, 10},  {65488, 16}, {65489, 16},
+      {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16},
+      {65495, 16}, {65496, 16}, {0, 0},      {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {1018, 10},  {65497, 16},
+      {65498, 16}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16},
+      {65503, 16}, {65504, 16}, {65505, 16}, {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {2040, 11},
+      {65506, 16}, {65507, 16}, {65508, 16}, {65509, 16}, {65510, 16},
+      {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {65515, 16}, {65516, 16}, {65517, 16}, {65518, 16}, {65519, 16},
+      {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {2041, 11},  {65525, 16}, {65526, 16}, {65527, 16}, {65528, 16},
+      {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16},
+      {65534, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {0, 0}};
+  static const unsigned short UVAC_HT[256][2] = {
+      {0, 2},      {1, 2},      {4, 3},      {10, 4},     {24, 5},
+      {25, 5},     {56, 6},     {120, 7},    {500, 9},    {1014, 10},
+      {4084, 12},  {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {11, 4},     {57, 6},     {246, 8},
+      {501, 9},    {2038, 11},  {4085, 12},  {65416, 16}, {65417, 16},
+      {65418, 16}, {65419, 16}, {0, 0},      {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {26, 5},     {247, 8},
+      {1015, 10},  {4086, 12},  {32706, 15}, {65420, 16}, {65421, 16},
+      {65422, 16}, {65423, 16}, {65424, 16}, {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {27, 5},
+      {248, 8},    {1016, 10},  {4087, 12},  {65425, 16}, {65426, 16},
+      {65427, 16}, {65428, 16}, {65429, 16}, {65430, 16}, {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {58, 6},     {502, 9},    {65431, 16}, {65432, 16}, {65433, 16},
+      {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {65438, 16},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {0, 0},      {59, 6},     {1017, 10},  {65439, 16}, {65440, 16},
+      {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16},
+      {65446, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {121, 7},    {2039, 11},  {65447, 16},
+      {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16},
+      {65453, 16}, {65454, 16}, {0, 0},      {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {122, 7},    {2040, 11},
+      {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16},
+      {65460, 16}, {65461, 16}, {65462, 16}, {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {249, 8},
+      {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16},
+      {65468, 16}, {65469, 16}, {65470, 16}, {65471, 16}, {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {503, 9},    {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16},
+      {65476, 16}, {65477, 16}, {65478, 16}, {65479, 16}, {65480, 16},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {0, 0},      {504, 9},    {65481, 16}, {65482, 16}, {65483, 16},
+      {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {65488, 16},
+      {65489, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {505, 9},    {65490, 16}, {65491, 16},
+      {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16},
+      {65497, 16}, {65498, 16}, {0, 0},      {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {506, 9},    {65499, 16},
+      {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16},
+      {65505, 16}, {65506, 16}, {65507, 16}, {0, 0},      {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {2041, 11},
+      {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16},
+      {65513, 16}, {65514, 16}, {65515, 16}, {65516, 16}, {0, 0},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {16352, 14}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16},
+      {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {65525, 16},
+      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {1018, 10},  {32707, 15}, {65526, 16}, {65527, 16}, {65528, 16},
+      {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16},
+      {65534, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
+      {0, 0}};
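+  // Base luminance/chrominance quantization tables from JPEG Annex K,
+  // scaled by the quality setting below.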
+  static const int YQT[] = {
+      16, 11, 10, 16, 24,  40,  51,  61,  12, 12, 14, 19, 26,  58,  60,  55,
+      14, 13, 16, 24, 40,  57,  69,  56,  14, 17, 22, 29, 51,  87,  80,  62,
+      18, 22, 37, 56, 68,  109, 103, 77,  24, 35, 55, 64, 81,  104, 113, 92,
+      49, 64, 78, 87, 103, 121, 120, 101, 72, 92, 95, 98, 112, 100, 103, 99};
+  static const int UVQT[] = {17, 18, 24, 47, 99, 99, 99, 99, 18, 21, 26, 66, 99,
+                             99, 99, 99, 24, 26, 56, 99, 99, 99, 99, 99, 47, 66,
+                             99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+                             99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+                             99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99};
+  static const float aasf[] = {
+      1.0f * 2.828427125f,         1.387039845f * 2.828427125f,
+      1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f,
+      1.0f * 2.828427125f,         0.785694958f * 2.828427125f,
+      0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f};
+
+  int row, col, i, k;
+  float fdtbl_Y[64], fdtbl_UV[64];
+  unsigned char YTable[64], UVTable[64];
+
+  if (!data || !width || !height || comp > 4 || comp < 1) {
+    return 0;
+  }
+
+  // Map quality (1..100, default 90) to the IJG-style scale factor applied
+  // to the base quantization tables: <50 scales them up, >50 scales down.
+  quality = quality ? quality : 90;
+  quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
+  quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
+
+  for (i = 0; i < 64; ++i) {
+    int uvti, yti = (YQT[i] * quality + 50) / 100;
+    YTable[stbiw__jpg_ZigZag[i]] =
+        (unsigned char)(yti < 1 ? 1 : yti > 255 ? 255 : yti);
+    uvti = (UVQT[i] * quality + 50) / 100;
+    UVTable[stbiw__jpg_ZigZag[i]] =
+        (unsigned char)(uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
+  }
+
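+  // Precompute reciprocal quantization divisors with the DCT's aasf scale
+  // factors folded in, so quantization in processDU is a single multiply.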
+  for (row = 0, k = 0; row < 8; ++row) {
+    for (col = 0; col < 8; ++col, ++k) {
+      fdtbl_Y[k] = 1 / (YTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
+      fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
+    }
+  }
+
+  // Write Headers
+  {
+    static const unsigned char head0[] = {
+        0xFF, 0xD8, 0xFF, 0xE0, 0, 0x10, 'J', 'F',  'I',  'F', 0,    1, 1,
+        0,    0,    1,    0,    1, 0,    0,   0xFF, 0xDB, 0,   0x84, 0};
+    static const unsigned char head2[] = {0xFF, 0xDA, 0, 0xC,  3, 1,    0,
+                                          2,    0x11, 3, 0x11, 0, 0x3F, 0};
+    const unsigned char head1[] = {0xFF,
+                                   0xC0,
+                                   0,
+                                   0x11,
+                                   8,
+                                   (unsigned char)(height >> 8),
+                                   STBIW_UCHAR(height),
+                                   (unsigned char)(width >> 8),
+                                   STBIW_UCHAR(width),
+                                   3,
+                                   1,
+                                   0x11,
+                                   0,
+                                   2,
+                                   0x11,
+                                   1,
+                                   3,
+                                   0x11,
+                                   1,
+                                   0xFF,
+                                   0xC4,
+                                   0x01,
+                                   0xA2,
+                                   0};
+    s->func(s->context, (void *)head0, sizeof(head0));
+    s->func(s->context, (void *)YTable, sizeof(YTable));
+    stbiw__putc(s, 1);
+    s->func(s->context, UVTable, sizeof(UVTable));
+    s->func(s->context, (void *)head1, sizeof(head1));
+    s->func(s->context, (void *)(std_dc_luminance_nrcodes + 1),
+            sizeof(std_dc_luminance_nrcodes) - 1);
+    s->func(s->context, (void *)std_dc_luminance_values,
+            sizeof(std_dc_luminance_values));
+    stbiw__putc(s, 0x10); // HTYACinfo
+    s->func(s->context, (void *)(std_ac_luminance_nrcodes + 1),
+            sizeof(std_ac_luminance_nrcodes) - 1);
+    s->func(s->context, (void *)std_ac_luminance_values,
+            sizeof(std_ac_luminance_values));
+    stbiw__putc(s, 1); // HTUDCinfo
+    s->func(s->context, (void *)(std_dc_chrominance_nrcodes + 1),
+            sizeof(std_dc_chrominance_nrcodes) - 1);
+    s->func(s->context, (void *)std_dc_chrominance_values,
+            sizeof(std_dc_chrominance_values));
+    stbiw__putc(s, 0x11); // HTUACinfo
+    s->func(s->context, (void *)(std_ac_chrominance_nrcodes + 1),
+            sizeof(std_ac_chrominance_nrcodes) - 1);
+    s->func(s->context, (void *)std_ac_chrominance_values,
+            sizeof(std_ac_chrominance_values));
+    s->func(s->context, (void *)head2, sizeof(head2));
+  }
+
+  // Encode 8x8 macroblocks (4:4:4 -- one Y, one U, and one V unit per block)
+  {
+    static const unsigned short fillBits[] = {0x7F, 7};
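+    // fillBits: up to seven 1-bits used to pad the final partial byte before
+    // the EOI marker.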
+    const unsigned char *imageData = (const unsigned char *)data;
+    int DCY = 0, DCU = 0, DCV = 0;
+    int bitBuf = 0, bitCnt = 0;
+    // comp == 2 is grey+alpha (alpha is ignored)
+    int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
+    int x, y, pos;
+    for (y = 0; y < height; y += 8) {
+      for (x = 0; x < width; x += 8) {
+        float YDU[64], UDU[64], VDU[64];
+        for (row = y, pos = 0; row < y + 8; ++row) {
+          // row >= height => use last input row
+          int clamped_row = (row < height) ? row : height - 1;
+          int base_p =
+              (stbi__flip_vertically_on_write ? (height - 1 - clamped_row)
+                                              : clamped_row) *
+              width * comp;
+          for (col = x; col < x + 8; ++col, ++pos) {
+            float r, g, b;
+            // if col >= width => use pixel from last input column
+            int p = base_p + ((col < width) ? col : (width - 1)) * comp;
+
+            r = imageData[p + 0];
+            g = imageData[p + ofsG];
+            b = imageData[p + ofsB];
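+            // JFIF RGB -> YCbCr (BT.601 weights); Y is level-shifted by -128
+            // ahead of the DCT.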
+            YDU[pos] = +0.29900f * r + 0.58700f * g + 0.11400f * b - 128;
+            UDU[pos] = -0.16874f * r - 0.33126f * g + 0.50000f * b;
+            VDU[pos] = +0.50000f * r - 0.41869f * g - 0.08131f * b;
+          }
+        }
+
+        DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, YDU, fdtbl_Y, DCY,
+                                   YDC_HT, YAC_HT);
+        DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, UDU, fdtbl_UV, DCU,
+                                   UVDC_HT, UVAC_HT);
+        DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, VDU, fdtbl_UV, DCV,
+                                   UVDC_HT, UVAC_HT);
       }
+    }
 
-      // Do the bit alignment of the EOI marker
-      stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits);
-   }
+    // Do the bit alignment of the EOI marker
+    stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits);
+  }
 
-   // EOI
-   stbiw__putc(s, 0xFF);
-   stbiw__putc(s, 0xD9);
+  // EOI
+  stbiw__putc(s, 0xFF);
+  stbiw__putc(s, 0xD9);
 
-   return 1;
+  return 1;
 }
 
-STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality)
-{
-   stbi__write_context s;
-   stbi__start_write_callbacks(&s, func, context);
-   return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality);
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x,
+                                    int y, int comp, const void *data,
+                                    int quality) {
+  stbi__write_context s;
+  stbi__start_write_callbacks(&s, func, context);
+  return stbi_write_jpg_core(&s, x, y, comp, (void *)data, quality);
 }
 
-
 #ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
-{
-   stbi__write_context s;
-   if (stbi__start_write_file(&s,filename)) {
-      int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
-      stbi__end_write_file(&s);
-      return r;
-   } else
-      return 0;
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp,
+                            const void *data, int quality) {
+  stbi__write_context s;
+  if (stbi__start_write_file(&s, filename)) {
+    int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
+    stbi__end_write_file(&s);
+    return r;
+  } else
+    return 0;
 }
 #endif
 
@@ -1531,30 +1856,19 @@ STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const
 
 /* Revision history
       1.11  (2019-08-11)
-             
+
       1.10  (2019-02-07)
-             support utf8 filenames in Windows; fix warnings and platform ifdefs 
+             support utf8 filenames in Windows; fix warnings and platform ifdefs
       1.09  (2018-02-11)
              fix typo in zlib quality API, improve STB_I_W_STATIC in C++
       1.08  (2018-01-29)
-             add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter
-      1.07  (2017-07-24)
-             doc fix
-      1.06 (2017-07-23)
-             writing JPEG (using Jon Olick's code)
-      1.05   ???
-      1.04 (2017-03-03)
-             monochrome BMP expansion
-      1.03   ???
-      1.02 (2016-04-02)
-             avoid allocating large structures on the stack
-      1.01 (2016-01-16)
-             STBIW_REALLOC_SIZED: support allocators with no realloc support
-             avoid race-condition in crc initialization
-             minor compile issues
-      1.00 (2015-09-14)
-             installable file IO function
-      0.99 (2015-09-13)
+             add stbi__flip_vertically_on_write, external zlib, zlib quality,
+             choose PNG filter
+      1.07  (2017-07-24)
+             doc fix
+      1.06 (2017-07-23)
+             writing JPEG (using Jon Olick's code)
+      1.05   ???
+      1.04 (2017-03-03)
+             monochrome BMP expansion
+      1.03   ???
+      1.02 (2016-04-02)
+             avoid allocating large structures on the stack
+      1.01 (2016-01-16)
+             STBIW_REALLOC_SIZED: support allocators with no realloc support
+             avoid race-condition in crc initialization
+             minor compile issues
+      1.00 (2015-09-14)
+             installable file IO function
+      0.99 (2015-09-13)
              warning fixes; TGA rle support
       0.98 (2015-04-08)
              added STBIW_MALLOC, STBIW_ASSERT etc
@@ -1564,7 +1878,7 @@ STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const
              add HDR output
              fix monochrome BMP
       0.95 (2014-08-17)
-		       add monochrome TGA output
+                       add monochrome TGA output
       0.94 (2014-05-31)
              rename private functions to avoid conflicts with stb_image.h
       0.93 (2014-05-27)
@@ -1582,38 +1896,38 @@ This software is available under 2 licenses -- choose whichever you prefer.
 ------------------------------------------------------------------------------
 ALTERNATIVE A - MIT License
 Copyright (c) 2017 Sean Barrett
-Permission is hereby granted, free of charge, to any person obtaining a copy of 
-this software and associated documentation files (the "Software"), to deal in 
-the Software without restriction, including without limitation the rights to 
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 
-of the Software, and to permit persons to whom the Software is furnished to do 
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
 so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all 
+The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 ------------------------------------------------------------------------------
 ALTERNATIVE B - Public Domain (www.unlicense.org)
 This is free and unencumbered software released into the public domain.
-Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 
-software, either in source code form or as a compiled binary, for any purpose, 
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
 commercial or non-commercial, and by any means.
-In jurisdictions that recognize copyright laws, the author or authors of this 
-software dedicate any and all copyright interest in the software to the public 
-domain. We make this dedication for the benefit of the public at large and to 
-the detriment of our heirs and successors. We intend this dedication to be an 
-overt act of relinquishment in perpetuity of all present and future rights to 
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
 this software under copyright law.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
-AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 
-ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ------------------------------------------------------------------------------
 */
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
index 608107e1dfb39bb268899227dc21f45d969de1f7..52f08730620945d3559c58d26051e81437996eac 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
@@ -8,10 +8,10 @@
 // ***                        Runtime declaration                        *** //
 void *tensorFft(void *input, bool inverse);
 void *tensorFftHalf(void *input, bool inverse);
-void *
-tensorReduce(void *input, size_t axis, MathOp func, float skip_ratio = 0.0f);
-void *tensorReduceHalf(
-    void *input, size_t axis, MathOp func, float skip_ratio = 0.0f);
+void *tensorReduce(void *input, size_t axis, MathOp func,
+                   float skip_ratio = 0.0f);
+void *tensorReduceHalf(void *input, size_t axis, MathOp func,
+                       float skip_ratio = 0.0f);
 void *tensorProjectiveT(void *input, void *transformation);
 void *tensorMap1(MathOp f, void *i);
 void *tensorMap2(MathOp f2, void *i1, void *i2);
@@ -23,16 +23,15 @@ void *tensorMap3Half(MathOp f3, void *i1, void *i2, void *i3);
 // ***                      Wrapper API declaration                      *** //
 extern "C" {
 void *wrapper_tensorFft(const char *hpvm_node_id, void *input, bool inverse);
-void *
-wrapper_tensorReduce(const char *hpvm_node_id, void *input, int axis, int func);
-void *wrapper_tensorProjectiveT(
-    const char *hpvm_node_id, void *input, void *transformation);
+void *wrapper_tensorReduce(const char *hpvm_node_id, void *input, int axis,
+                           int func);
+void *wrapper_tensorProjectiveT(const char *hpvm_node_id, void *input,
+                                void *transformation);
 void *wrapper_tensorMap1(const char *hpvm_node_id, int func, void *input);
-void *wrapper_tensorMap2(
-    const char *hpvm_node_id, int func, void *input1, void *input2);
-void *wrapper_tensorMap3(
-    const char *hpvm_node_id, int func, void *input1, void *input2,
-    void *input3);
+void *wrapper_tensorMap2(const char *hpvm_node_id, int func, void *input1,
+                         void *input2);
+void *wrapper_tensorMap3(const char *hpvm_node_id, int func, void *input1,
+                         void *input2, void *input3);
 
 // Tentative
 void *wrapper_tensorStencil(const char *hpvm_node_id, void *input);
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h
index bf6664b0e87ce7fb68d0a8c0b992ba12e045c4d1..5dc3fe3dbc3cec9ea81fa33bc56471e2d6daaae5 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h
@@ -17,22 +17,21 @@ void *loadAsImage(const char *filename, size_t n_color = N_RGB_CHAN);
 
 void saveToImage(const char *filename, Tensor *tensor);
 
-Tensor *readDataSet(
-    const char *path, size_t start = 0, size_t count = std::string::npos,
-    size_t n_color = N_RGB_CHAN);
+Tensor *readDataSet(const char *path, size_t start = 0,
+                    size_t count = std::string::npos,
+                    size_t n_color = N_RGB_CHAN);
 
-void saveDataSet(
-    const char *path, Tensor *batch, size_t start_idx = 0, size_t write_n = 0);
+void saveDataSet(const char *path, Tensor *batch, size_t start_idx = 0,
+                 size_t write_n = 0);
 
 // Kernel constructor
-void *createFilterFromData(
-    int data_type, void *data, size_t w, size_t h, size_t n_chan);
+void *createFilterFromData(int data_type, void *data, size_t w, size_t h,
+                           size_t n_chan);
 
 std::vector<float> PSNR(void *gold_ptr, void *approx_ptr);
 
-float violationRate(
-    const std::vector<float> &values, float threshold,
-    bool higher_better = true);
+float violationRate(const std::vector<float> &values, float threshold,
+                    bool higher_better = true);
 
 float mean(const std::vector<float> &values);
 
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/init_api.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/init_api.h
index ac742876b054c88d50634ecae306b25d471f5c06..962f3e726513f265b8e1fbb27084e48b76a386bf 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/init_api.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/init_api.h
@@ -1,32 +1,29 @@
 
 
-
-#include <stdio.h>
-#include <stdarg.h>
 #include <cstdio>
 #include <cstdlib>
+#include <cublas_api.h>
+#include <cublas_v2.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <cudnn.h>
 #include <iostream>
 #include <map>
 #include <sstream>
+#include <stdarg.h>
+#include <stdio.h>
 #include <string>
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-#include <cudnn.h>
-#include <cublas_api.h>
-#include <cuda_fp16.h>
 
 // Tensor runtime header files
-#include "tensor_runtime.h"
-#include "tensor_utils.h"
+#include "approx_simulation.h"
 #include "debug.h"
-#include "profiling.h"
-#include "global_data.h"
 #include "error.h"
-#include "tensor.h"
+#include "global_data.h"
 #include "op_overheads.h"
-#include "approx_simulation.h"
-
-
+#include "profiling.h"
+#include "tensor.h"
+#include "tensor_runtime.h"
+#include "tensor_utils.h"
 
 void llvm_hpvm_initTensorRt(int gpuid);
 
@@ -50,4 +47,3 @@ void freeOutputTensors();
 void clearOpCounter();
 
 void freeBatchMemory();
-
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/op_overheads.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/op_overheads.h
index efa5c5e92ee8eeca3c1dc7644c4894451d7c24eb..aed7102fe621e09ab61ade91c402d36ae995af76 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/op_overheads.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/op_overheads.h
@@ -3,54 +3,43 @@
 #ifndef OP_OVERHEADS_HEADER
 #define OP_OVERHEADS_HEADER
 
-
+#include "tensor.h"
 #include <math.h>
 #include <sstream>
-#include "tensor.h"
-
-
 
 extern float scale_down_factor;
 extern std::string result_str;
 
+extern "C" {
 
-extern "C"{
-
-  
-  static float scaleDownComps(double total_comps);
-
-  // private function
-  static float getScaledComps(double total_comps, int error_scale, int factor_type);
-
-  static void addNormToResult(float comps);
-
-  static void addCompsToResult(float total_comps,
-			       float opt_comps1,
-			       float opt_comps2,
-			       float opt_comps3);
-
-  void dumpCompOverheads(double total_comps, int error_scale);
+static float scaleDownComps(double total_comps);
 
+// private function
+static float getScaledComps(double total_comps, int error_scale,
+                            int factor_type);
 
-  void add_conv_overheads(void* input_ptr, void* filter_ptr,
-			  int vertical_stride, int horizontal_stride,
-			  int error_scale);
+static void addNormToResult(float comps);
 
-  void add_gemm_overheads(void* lhs_ptr, void* rhs_ptr, int error_scale);
+static void addCompsToResult(float total_comps, float opt_comps1,
+                             float opt_comps2, float opt_comps3);
 
+void dumpCompOverheads(double total_comps, int error_scale);
 
-  void add_bias_overheads(void* input_ptr, int error_scale);
+void add_conv_overheads(void *input_ptr, void *filter_ptr, int vertical_stride,
+                        int horizontal_stride, int error_scale);
 
+void add_gemm_overheads(void *lhs_ptr, void *rhs_ptr, int error_scale);
 
-  void add_relu_overheads(void* input_ptr, int error_scale);
+void add_bias_overheads(void *input_ptr, int error_scale);
 
-  void add_pool_overheads(void* input_ptr, int kernel_size,
-			  int stride_size, int error_scale);
+void add_relu_overheads(void *input_ptr, int error_scale);
 
-  void add_norms(void* norms_ptr, char* op_name, int error_value);
+void add_pool_overheads(void *input_ptr, int kernel_size, int stride_size,
+                        int error_scale);
 
-  void dump_result(const char* file_name);
+void add_norms(void *norms_ptr, char *op_name, int error_value);
 
+void dump_result(const char *file_name);
 }
 
 #endif
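
A sketch of how the overhead-accounting hooks above might compose, assuming each add_*_overheads call accumulates an estimated op cost into result_str and dump_result writes the accumulated text out. The knob value and output file name are illustrative, not taken from the source.

    #include "op_overheads.h"

    // 'input' and 'filter' are Tensor* handles created elsewhere.
    void costConvBlock(void *input, void *filter) {
      int error_scale = 1; // illustrative approximation knob
      add_conv_overheads(input, filter, /*vertical_stride=*/1,
                         /*horizontal_stride=*/1, error_scale);
      add_bias_overheads(input, error_scale);
      add_relu_overheads(input, error_scale);
      dump_result("op_overheads.txt"); // hypothetical output file
    }
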
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/profiling.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/profiling.h
index 84e15e266ae7e4b9b812b394fc71f2b91c1fd8f4..db802079711b48c08ccf2203f91c04fb0ade59ef 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/profiling.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/profiling.h
@@ -2,17 +2,15 @@
 #ifndef PROFILING_HEADER
 #define PROFILING_HEADER
 
-
-
 /***** Profiling routines ***/
 
-extern "C"{
+extern "C" {
+
+void startProfiling();
 
-  void startProfiling();
+void stopProfiling();
 
-  void stopProfiling();
-  
-  void profileEvent(const char* event_name, bool compare_previous = false);
+void profileEvent(const char *event_name, bool compare_previous = false);
 }
 
 #endif
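
A minimal usage sketch of the profiling API above, not part of the patch: it assumes the runtime library is linked and that the optional compare_previous flag of profileEvent reports the delta against the previously recorded event.

    #include "profiling.h"

    void runProfiledRegion() {
      startProfiling();                // open the profiling scope
      profileEvent("Conv1");           // mark the start of an event
      // ... run tensorConvolution(...) here ...
      profileEvent("Conv1_end", true); // assumed: compare against "Conv1"
      stopProfiling();                 // close the profiling scope
    }
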
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h
index 6dd06cb10a189b0a55eff854ec689b51c815a994..28dbf715e7350b496a2cfb6f550e8e3a83865671 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h
@@ -2,7 +2,7 @@ extern "C" {
 // Functions to be inserted with initializeTensorRT and clearTensorRT
 void llvm_hpvm_initializeRuntimeController(const char *, const char *);
 void llvm_hpvm_clearRuntimeController();
-void llvm_hpvm_invokeRtControl(
-    void *result, const char *str, int start, int end);
+void llvm_hpvm_invokeRtControl(void *result, const char *str, int start,
+                               int end);
 void llvm_hpvm_imgInvokeRtControl(void *result, void *gold, int start, int end);
 }
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h
index a5461d1884c7d21fb124a8b4882f04a50a1b2fe3..6e81c7a3fbfbe4cae3cd1c40f43a4c7d5ea2d7c8 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h
@@ -5,15 +5,14 @@
 
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
-#include <cublas_v2.h>
 #include <cudnn.h>
+#include <cublas_v2.h>
+// Must come after cublas_v2.h
 #include <cublas_api.h>
 #include <cuda_fp16.h>
 #include <driver_types.h>
 
-
-
-struct Norm_t{
+struct Norm_t {
   float mean_l1;
   float mean_l2;
   float orig_inf_norm;
@@ -23,48 +22,40 @@ struct Norm_t{
   float inf_norm;
 };
 
-
-struct Dimension{
+struct Dimension {
   int num_dims;
-  size_t* dim_sizes;
+  size_t *dim_sizes;
 };
 
-enum data_location_t{
-  HOST,
-  DEVICE
-};
+enum data_location_t { HOST, DEVICE };
 
-
-struct Tensor{
+struct Tensor {
   int data_type;
   int cur_type;
   int data_format;
-  data_location_t data_placement; // Maintains the location of the tensor {host, device...} 
+  data_location_t
+      data_placement; // Maintains the location of the tensor {host, device...}
   cudnnTensorDescriptor_t tensor_desc;
-  cudnnFilterDescriptor_t filter_desc; // FIXIT: Rethink if this should be in tensor struct
+  cudnnFilterDescriptor_t
+      filter_desc; // FIXIT: Rethink if this should be in tensor struct
   cudnnTensorDescriptor_t tensor_half_desc;
-  cudnnFilterDescriptor_t filter_half_desc; // FIXIT: Rethink if this should be in tensor struct
-  void* host_data;
-  void* gpu_data; // Pointer to GPU FP32 data
-  void* gpu_half_data; // Pointer to GPU FP16 data
-  size_t num_elems; // Total elements
+  cudnnFilterDescriptor_t
+      filter_half_desc; // FIXIT: Rethink if this should be in tensor struct
+  void *host_data;
+  void *gpu_data;       // Pointer to GPU FP32 data
+  void *gpu_half_data;  // Pointer to GPU FP16 data
+  size_t num_elems;     // Total elements
   size_t size_in_bytes; // Total size in bytes
   struct Dimension dims;
 };
 
-
-
-struct Range{
+struct Range {
   float min;
   float max;
 };
 
-
 // NOTE: Currently only NCHW is supported due to limited cuDNN support
-enum Tensor_format_t{
-  nchw,
-  nhwc 
-};
+enum Tensor_format_t { nchw, nhwc };
 
 enum Tensor_type_t {
   float_type = CUDNN_DATA_FLOAT,
@@ -72,7 +63,7 @@ enum Tensor_type_t {
   half_type = CUDNN_DATA_HALF,
   int_type = CUDNN_DATA_INT8,
   float2_type, // complex<float>, for fft,
-  half2_type // complex<half>
+  half2_type   // complex<half>
 };
 
 #endif
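
Since the Tensor struct now carries both FP32 and FP16 device pointers plus a placement tag, a small inspection helper illustrates the fields; this is a sketch, not part of the patch, and uses only the struct members defined above.

    #include "tensor.h"
    #include <cstdio>

    // Print where a tensor's live copy resides and its shape.
    void describeTensor(struct Tensor *t) {
      std::printf("elems=%zu bytes=%zu placement=%s\n", t->num_elems,
                  t->size_in_bytes,
                  t->data_placement == HOST ? "host" : "device");
      for (int i = 0; i < t->dims.num_dims; i++)
        std::printf("  dim[%d] = %zu\n", i, t->dims.dim_sizes[i]);
    }
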
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu.h
index 07fb766493a8ddeccc90db60c4345dda7889e193..19b655465d6e9d9c5c2a64b130e506b71f46f4b7 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu.h
@@ -3,37 +3,24 @@
 #ifndef TENSOR_HEADER
 #define TENSOR_HEADER
 
-
-struct Dimension{
+struct Dimension {
   int num_dims;
-  size_t* dim_sizes;
+  size_t *dim_sizes;
 };
 
-
-struct Tensor{
+struct Tensor {
   int data_type;
   int data_format;
-  void* host_data;
-  void* gpu_data; // Pointers should not be device specific - Think: Better design
-  size_t num_elems; // Total elements
+  void *host_data;
+  void *gpu_data;       // Pointers should not be device specific -
+                        // Think: Better design
+  size_t num_elems;     // Total elements
   size_t size_in_bytes; // Total size in bytes
   struct Dimension dims;
 };
 
+enum Tensor_format_t { nchw, nhwc };
 
-enum Tensor_format_t{
-  nchw,
-  nhwc 
-};
-
-enum Tensor_type_t{
-  float_type,
-  double_type,
-  half_type,
-  int_type
-};
-
-
+enum Tensor_type_t { float_type, double_type, half_type, int_type };
 
 #endif
-
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h
index 05cb1e5cb931e5cc824697df30b2066e41d99e79..24f69c03903faf29b074284482f172efa334549f 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h
@@ -1,67 +1,63 @@
-#include <stdio.h>
-#include <cstdlib>
 #include <cmath>
+#include <cstdlib>
 #include <memory>
+#include <stdio.h>
 #include <string>
 
-
 #ifndef CUDNN_HEADER
 #define CUDNN_HEADER
 
+extern "C" {
+/****  Initialization Routine - Must be inserted at program start (in the
+ * backend)  ****/
+void llvm_hpvm_initTensorRt(int gpuid = 0);
+void llvm_hpvm_cleanupTensorRt();
 
-extern "C"{
-  /****  Initialization Routine - Must be inserted at program start (in the backend)  ****/
-  void llvm_hpvm_initTensorRt(int gpuid = 0);
-  void llvm_hpvm_cleanupTensorRt();
-
-  // Routine to moving tensor data (from and to GPU,CPU)
-  void hpvm_request_tensor(void* tensor, int destination);
-
-
-  // NOTE: Currently only using 4-D tensors - 2D and 3D tensors not supported for cuDNN operations
-  // NOTE: The only data format supported as of now is: NCHW (batch_dimension, channels, Height, Width)
-  void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t dim2_size,
-		       size_t dim3_size, size_t dim4_size, bool freeMemory = true);
-  
-  void initTensorData(void* tensor, void* data_ptr, size_t size_in_bytes);
-
-  /********** Tensor Operation API ******/
-
-  // NOTE: For conv_mode, only value '1' is supported
-void* tensorConvolutionCPU(void *input_ptr, void *filter_ptr,
-                          int vertical_pad, int horizontal_pad,
-                          int vertical_stride, int horizontal_stride,
-                          int conv_mode, int compute_precision,
-                          int row, int col, int skip_every, int start);
- 
-void* tensorConvCutlassCPU(void* input_ptr, void* filter_ptr,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride,
-			int conv_mode, int conv_groups);
-			
- void *tensorBatchNormCPU(void* input_ptr, void* gamma_ptr, void* beta_ptr,
-                         void* mean_ptr, void* variance_ptr, double epsilon);
-
-  void* tensorPoolingCPU(void* input,
-			 int poolFunction,
-			 int window_height, int window_width,
-			 int vertical_pad, int horizontal_pad,
-			 int vertical_stride, int horizontal_stride);
-
-  void* tensorGemmCPU(void* lhs, void* rhs);
-
-  void* tensorAddCPU(void* x, void* bias);
-
-  void* tensorReluCPU(void* input);
-
-  void* tensorRelu2CPU(void* input, float min, float max);
-  
-  void* tensorTanhCPU(void* input);
-  
-  void* tensorSoftmaxCPU(void* input);
-    
-}
+// Routine for moving tensor data (to and from the GPU/CPU)
+void hpvm_request_tensor(void *tensor, int destination);
+
+// NOTE: Currently only using 4-D tensors - 2D and 3D tensors are not
+// supported for cuDNN operations
+// NOTE: Only NCHW data format (batch, channels, height, width) is supported
+void *create4DTensor(int data_type, int data_format, size_t dim1_size,
+                     size_t dim2_size, size_t dim3_size, size_t dim4_size,
+                     bool freeMemory = true);
+
+void initTensorData(void *tensor, void *data_ptr, size_t size_in_bytes);
+
+/********** Tensor Operation API ******/
+
+// NOTE: For conv_mode, only value '1' is supported
+void *tensorConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
+                           int horizontal_pad, int vertical_stride,
+                           int horizontal_stride, int conv_mode,
+                           int compute_precision, int row, int col,
+                           int skip_every, int start);
 
+void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
+                           int horizontal_pad, int vertical_stride,
+                           int horizontal_stride, int conv_mode,
+                           int conv_groups);
+
+void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr,
+                         void *mean_ptr, void *variance_ptr, double epsilon);
+
+void *tensorPoolingCPU(void *input, int poolFunction, int window_height,
+                       int window_width, int vertical_pad, int horizontal_pad,
+                       int vertical_stride, int horizontal_stride);
+
+void *tensorGemmCPU(void *lhs, void *rhs);
+
+void *tensorAddCPU(void *x, void *bias);
+
+void *tensorReluCPU(void *input);
+
+void *tensorRelu2CPU(void *input, float min, float max);
+
+void *tensorTanhCPU(void *input);
+
+void *tensorSoftmaxCPU(void *input);
+}
 
 /*
 void dummyFunction(){
@@ -96,9 +92,8 @@ void dummyFunction(){
   void* tensorTanhPtr = (void*) &tensorTanh;
   void* tensorHalfTanhPtr = (void*) &tensorHalfTanh;
   void* tensorSoftmaxPtr = (void*) &tensorSoftmax;
-  void* tensorAddErrorPtr = (void*) &tensorAddError;    
+  void* tensorAddErrorPtr = (void*) &tensorAddError;
 }
 */
 
-
 #endif
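
A hedged end-to-end sketch of the CPU path declared above. The 0,0 data_type/data_format arguments mirror the create4DTensor(0, 0, ...) calls in tensor_custom_ops_cpu.h (assumed to mean float/NCHW), and the trailing zeros passed to tensorConvolutionCPU are assumed to disable perforation and sampling; none of these values are confirmed by the source.

    #include "tensor_cpu_runtime.h"

    void cpuForward(float *image, float *weights, float *bias_vals) {
      llvm_hpvm_initTensorRt();
      void *input  = create4DTensor(0, 0, 1, 3, 32, 32);
      void *filter = create4DTensor(0, 0, 8, 3, 3, 3);
      void *bias   = create4DTensor(0, 0, 1, 8, 1, 1);
      initTensorData(input,  image,     1 * 3 * 32 * 32 * sizeof(float));
      initTensorData(filter, weights,   8 * 3 * 3 * 3 * sizeof(float));
      initTensorData(bias,   bias_vals, 8 * sizeof(float));

      void *conv = tensorConvolutionCPU(input, filter, 1, 1, 1, 1,
                                        /*conv_mode=*/1,
                                        /*compute_precision=*/0, 0, 0, 0, 0);
      void *out = tensorReluCPU(tensorAddCPU(conv, bias));
      tensorSoftmaxCPU(out);
      llvm_hpvm_cleanupTensorRt();
    }
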
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h
index b9128c1a24ca5bd95a7e6fb9e962d56501558f8f..080b84c6017bd6153c7feaf9c5efc22114de1913 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h
@@ -4,175 +4,157 @@
 #include <stdlib.h>
 #include <vector>
 
+void *tensorArgMax(void *input_ptr) {
 
-void* tensorArgMax(void* input_ptr){
-
-  Tensor* input = (Tensor*) input_ptr;
-  float* host_ptr = (float*) input->host_data;
+  Tensor *input = (Tensor *)input_ptr;
+  float *host_ptr = (float *)input->host_data;
 
   int batch_size = input->dims.dim_sizes[0];
   int channels = input->dims.dim_sizes[1];
 
-  Tensor* output = (Tensor *) create4DTensor(0, 0, batch_size, 1, 1, 1);
+  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, 1, 1, 1);
   changeTensorPlacement(output, HOST);
-    
-  float* out_ptr = (float*) output->host_data;
-  
-  for(int i = 0; i < batch_size; i++){
+
+  float *out_ptr = (float *)output->host_data;
+
+  for (int i = 0; i < batch_size; i++) {
 
     int start = i * channels;
     float max_index = 0;
     float max_val = host_ptr[start];
-    for(int j = 0; j < channels; j++){
-      
+    for (int j = 0; j < channels; j++) {
+
       int index = start + j;
-      //printf ("index = %d \n", index);
+      // printf ("index = %d \n", index);
       float val = host_ptr[index];
-      if (val > max_val){
-	max_val = val;
-	max_index = j;
-      }	
+      if (val > max_val) {
+        max_val = val;
+        max_index = j;
+      }
     }
 
     out_ptr[i] = max_index;
   }
-  
 
   return output;
-  
 }
 
+void *tensorSelect(void *input_ptr, float target_value) {
 
-
-
-
-void* tensorSelect(void* input_ptr, float target_value){
-
-  Tensor* input = (Tensor*) input_ptr;
-  float* host_ptr = (float*) input->host_data;
+  Tensor *input = (Tensor *)input_ptr;
+  float *host_ptr = (float *)input->host_data;
 
   int batch_size = input->dims.dim_sizes[0];
   int channels = input->dims.dim_sizes[1];
 
-  if (channels != 1){
+  if (channels != 1) {
     printf("* Channels dimension must be 1 \n");
     abort();
   }
 
-  Tensor* output = (Tensor *) create4DTensor(0, 0, batch_size, 1, 1, 1);
-  changeTensorPlacement(output, HOST);    
-  float* out_ptr = (float*) output->host_data;
+  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, 1, 1, 1);
+  changeTensorPlacement(output, HOST);
+  float *out_ptr = (float *)output->host_data;
 
-  for(int i = 0; i < batch_size; i++){
-    if (host_ptr[i] == target_value){
+  for (int i = 0; i < batch_size; i++) {
+    if (host_ptr[i] == target_value) {
       out_ptr[i] = 1;
-    }
-    else{
+    } else {
       out_ptr[i] = 0;
-    }	   
+    }
   }
-      
+
   return output;
 }
 
+void *tensorSelect2(void *input_ptr, std::vector<int> index_vector) {
 
-
-
-void* tensorSelect2(void* input_ptr, std::vector<int> index_vector){
-
-  Tensor* input = (Tensor*) input_ptr;
-  float* host_ptr = (float*) input->host_data;
+  Tensor *input = (Tensor *)input_ptr;
+  float *host_ptr = (float *)input->host_data;
 
   int batch_size = input->dims.dim_sizes[0];
   int channels = input->dims.dim_sizes[1];
 
-  if (channels != 1){
+  if (channels != 1) {
     printf("* Channels dimension must be 1 \n");
     abort();
   }
 
-  Tensor* output = (Tensor *) create4DTensor(0, 0, batch_size, 1, 1, 1);
-  changeTensorPlacement(output, HOST);    
-  float* out_ptr = (float*) output->host_data;
+  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, 1, 1, 1);
+  changeTensorPlacement(output, HOST);
+  float *out_ptr = (float *)output->host_data;
 
-  for(int i = 0; i < batch_size; i++){
+  for (int i = 0; i < batch_size; i++) {
 
-    for(int j = 0; j < index_vector.size(); j++){
+    for (int j = 0; j < index_vector.size(); j++) {
       int target_value = index_vector[j];
-      if (host_ptr[i] == target_value){
-	out_ptr[i] = 1;
-	break;
-      }
-      else{
-	out_ptr[i] = 0;
+      if (host_ptr[i] == target_value) {
+        out_ptr[i] = 1;
+        break;
+      } else {
+        out_ptr[i] = 0;
       }
     }
-      
   }
-      
+
   return output;
 }
 
-
-
-
-
-
-long getOnesInVector(float* vector_host_ptr, long vector_length){
+long getOnesInVector(float *vector_host_ptr, long vector_length) {
 
   long ones_count = 0;
-  for(int i = 0; i < vector_length; i++){
+  for (int i = 0; i < vector_length; i++) {
 
-    if(vector_host_ptr[i] == 1)
+    if (vector_host_ptr[i] == 1)
       ones_count += 1;
   }
 
   return ones_count;
 }
 
+void *tensorContract(void *input_ptr, void *bitvector_ptr) {
 
-void* tensorContract(void* input_ptr, void* bitvector_ptr){
+  Tensor *input = (Tensor *)input_ptr;
+  float *host_ptr = (float *)input->host_data;
 
-  Tensor* input = (Tensor*) input_ptr;
-  float* host_ptr = (float*) input->host_data;
-
-  Tensor* bitvector = (Tensor*) bitvector_ptr;
-  float* vector_host_ptr = (float*) bitvector->host_data;  
+  Tensor *bitvector = (Tensor *)bitvector_ptr;
+  float *vector_host_ptr = (float *)bitvector->host_data;
   long vector_length = bitvector->dims.dim_sizes[0];
-  
-  long reduced_batch_size = getOnesInVector(vector_host_ptr, vector_length); 
-  
-  long batch_size = input->dims.dim_sizes[0]; 
+
+  long reduced_batch_size = getOnesInVector(vector_host_ptr, vector_length);
+
+  long batch_size = input->dims.dim_sizes[0];
   long channels = input->dims.dim_sizes[1];
   long height = input->dims.dim_sizes[2];
   long width = input->dims.dim_sizes[3];
 
   long image_size = channels * height * width; // Computing size of each image
-  
-  if (batch_size != vector_length){
+
+  if (batch_size != vector_length) {
     printf("ERROR: bitvector length has to match input batch size \n");
     abort();
   }
 
-  Tensor* output = (Tensor *) create4DTensor(0, 0, reduced_batch_size, channels, height, width);
-  changeTensorPlacement(output, HOST);    
-  float* out_ptr = (float*) output->host_data;
+  Tensor *output = (Tensor *)create4DTensor(0, 0, reduced_batch_size, channels,
+                                            height, width);
+  changeTensorPlacement(output, HOST);
+  float *out_ptr = (float *)output->host_data;
 
   long out_index = 0;
-  for(int i = 0; i < batch_size; i++){
+  for (int i = 0; i < batch_size; i++) {
 
     // Include image if corresponding index in bitvector is '1'
-    if (vector_host_ptr[i] == 1){
+    if (vector_host_ptr[i] == 1) {
 
-      for(int j = 0; j < image_size; j++){
+      for (int j = 0; j < image_size; j++) {
 
-	out_ptr[j] = host_ptr[i * image_size + j];
+        out_ptr[j] = host_ptr[i * image_size + j];
       }
 
-      out_ptr += image_size; // Update the output pointer to the next image boundary
-    }	   
+      out_ptr +=
+          image_size; // Update the output pointer to the next image boundary
+    }
   }
-      
+
   return output;
 }
-
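
The three custom ops above compose into a batch-filtering flow: tensorArgMax reduces logits to per-image labels, tensorSelect turns labels into a 0/1 mask, and tensorContract drops the unselected images. A short sketch; the class id is illustrative.

    #include "tensor_custom_ops_cpu.h"

    // logits: [batch, classes, 1, 1]; images: [batch, C, H, W], both on host.
    void *keepClass(void *logits, void *images, float class_id) {
      void *labels = tensorArgMax(logits);         // [batch,1,1,1] of indices
      void *mask = tensorSelect(labels, class_id); // 1.0 where label matches
      return tensorContract(images, mask);         // batch shrunk to matches
    }
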
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.cc
index 2216172eab78414b46814e0d457908f5584c606a..0de2808221adfb122860a031eea4ed8c89d6e2ba 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.cc
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.cc
@@ -1,114 +1,105 @@
 
-#include <stdio.h>
-#include <cstdlib>
 #include <cmath>
+#include <cstdlib>
 #include <memory>
+#include <stdio.h>
 #include <string>
 
-
 #ifndef CUDNN_HEADER
 #define CUDNN_HEADER
 
-
-extern "C"{
-  /****  Initialization Routine - Must be inserted at program start (in the backend)  ****/
-  void llvm_hpvm_initTensorRt(int gpuid = 0);
-  void llvm_hpvm_cleanupTensorRt();
-
-  // Routine to moving tensor data (from and to GPU,CPU)
-  void hpvm_request_tensor(void* tensor, int destination);
-
-  /****** Profiling API - defines profiling scope */
-  void startProfiling();
-  void stopProfiling();
-
-  /****** Routines for tensor creation and initialization *******/
-  void* create2DTensor(int data_type, size_t dim1_size, size_t dim2_size);
-  void* create3DTensor(int data_type, size_t dim1_size, size_t dim2_size,
-		       size_t dim3_size);
-
-  // NOTE: Currently only using 4-D tensors - 2D and 3D tensors not supported for cuDNN operations
-  // NOTE: The only data format supported as of now is: CUDNN_NCHW
-  void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t dim2_size,
-		       size_t dim3_size, size_t dim4_size);
-  void initTensorData(void* tensor, void* data_ptr, size_t size_in_bytes);
-
-  /********** Tensor Operation API ******/
-
-  void** tensorSplit(void* tensor, int num_splits, int split_dim);
-  void* tensorConcat(void** tensors, int num_splits, int split_dim);
-
-  // NOTE: For conv_mode, only value '1' is supported
-  void* tensorConvolution(void* input, void* filter,
-			  int vertical_pad, int horizontal_pad,
-			  int vertical_stride, int horizontal_stride,
-			  int conv_mode, int compute_precision);
-  void* tensorHConvolution(void* input, void* filter,
-			  int vertical_pad, int horizontal_pad,
-			  int vertical_stride, int horizontal_stride,
-			  int conv_mode, int compute_precision);
-
-  void* tensorPooling(void* input,
-		      int poolFunction,
-		      int window_height, int window_width,
-		      int vertical_pad, int horizontal_pad,
-		      int vertical_stride, int horizontal_stride);
-
-  void* tensorLRN(void* input, unsigned int LRN_window,
-		  double LRN_alpha, double LRN_beta, double LRN_k);
-
-
-  /* 4 different Gemm versions */
-  void* tensorGemm(void* lhs, void* rhs);
-  void* tensorGemmCPU(void* lhs, void* rhs);
-  void* tensorGemmGPU(void* lhs, void* rhs);
-  void* tensorHgemm(void* lhs, void* rhs);
-
-  
-  // NOTE: In-place operation
-  void* tensorGemmBias(void* input, void* bias);
-  // NOTE: In place operation
-  void* tensorAdd(void* x, void* bias);
-  // NOTE: In-place operation
-  void* tensorRelu(void* input);
-  // NOTE: In-place operation
-  void* tensorSoftmax(void* input);
-
-  /* Error injection API - used for accuracy tuning */
-  void* tensorAddError(void* x_ptr);  
+extern "C" {
+/****  Initialization Routine - Must be inserted at program start (in the
+ * backend)  ****/
+void llvm_hpvm_initTensorRt(int gpuid = 0);
+void llvm_hpvm_cleanupTensorRt();
+
+// Routine for moving tensor data (to and from the GPU/CPU)
+void hpvm_request_tensor(void *tensor, int destination);
+
+/****** Profiling API - defines profiling scope */
+void startProfiling();
+void stopProfiling();
+
+/****** Routines for tensor creation and initialization *******/
+void *create2DTensor(int data_type, size_t dim1_size, size_t dim2_size);
+void *create3DTensor(int data_type, size_t dim1_size, size_t dim2_size,
+                     size_t dim3_size);
+
+// NOTE: Currently only using 4-D tensors - 2D and 3D tensors are not
+// supported for cuDNN operations
+// NOTE: The only data format supported as of now is CUDNN_NCHW
+void *create4DTensor(int data_type, int data_format, size_t dim1_size,
+                     size_t dim2_size, size_t dim3_size, size_t dim4_size);
+void initTensorData(void *tensor, void *data_ptr, size_t size_in_bytes);
+
+/********** Tensor Operation API ******/
+
+void **tensorSplit(void *tensor, int num_splits, int split_dim);
+void *tensorConcat(void **tensors, int num_splits, int split_dim);
+
+// NOTE: For conv_mode, only value '1' is supported
+void *tensorConvolution(void *input, void *filter, int vertical_pad,
+                        int horizontal_pad, int vertical_stride,
+                        int horizontal_stride, int conv_mode,
+                        int compute_precision);
+void *tensorHConvolution(void *input, void *filter, int vertical_pad,
+                         int horizontal_pad, int vertical_stride,
+                         int horizontal_stride, int conv_mode,
+                         int compute_precision);
+
+void *tensorPooling(void *input, int poolFunction, int window_height,
+                    int window_width, int vertical_pad, int horizontal_pad,
+                    int vertical_stride, int horizontal_stride);
+
+void *tensorLRN(void *input, unsigned int LRN_window, double LRN_alpha,
+                double LRN_beta, double LRN_k);
+
+/* 4 different Gemm versions */
+void *tensorGemm(void *lhs, void *rhs);
+void *tensorGemmCPU(void *lhs, void *rhs);
+void *tensorGemmGPU(void *lhs, void *rhs);
+void *tensorHgemm(void *lhs, void *rhs);
+
+// NOTE: In-place operation
+void *tensorGemmBias(void *input, void *bias);
+// NOTE: In-place operation
+void *tensorAdd(void *x, void *bias);
+// NOTE: In-place operation
+void *tensorRelu(void *input);
+// NOTE: In-place operation
+void *tensorSoftmax(void *input);
+
+/* Error injection API - used for accuracy tuning */
+void *tensorAddError(void *x_ptr);
 }
 
-
-
-void emptyFunction(){
-
-  void* initRT = (void*) &llvm_hpvm_initTensorRt;
-  void* cleanRT = (void*) &llvm_hpvm_cleanupTensorRt;
-  void* request_tensorPtr = (void*) &hpvm_request_tensor;
-  void* startProf = (void*) &startProfiling;
-  void* stopProf = (void*) &stopProfiling;
-  void* create2Dptr = (void*) &create2DTensor;
-  void* create3Dptr = (void*) &create3DTensor;
-  void* create4Dptr = (void*) &create4DTensor;
-  void* initTensorPtr = (void*) &initTensorData;
-  void* tensorSplitPtr = (void*) &tensorSplit;
-  void* tensorConcatPtr = (void*) &tensorConcat;
-  void* tensorConvPtr = (void*) &tensorConvolution;
-  void* tensorHConvPtr = (void*) &tensorHConvolution;
-  void* tensorPoolPtr = (void*) &tensorPooling;
-  void* tensorLRNPtr = (void*) &tensorLRN;
-  void* tensorGemmPr = (void*) &tensorGemm;
-  void* tensorGemmCPUPtr = (void*) &tensorGemmCPU;
-  void* tensorGemmGPUPtr = (void*) &tensorGemmGPU;
-  void* tensorHgemmPtr = (void*) &tensorHgemm;
-  void* tensorGemmBiasPtr = (void*) &tensorGemmBias;
-  void* tensorAddPtr = (void*) &tensorAdd;
-  void* tensorReluPtr = (void*) &tensorRelu;
-  void* tensorSoftmaxPtr = (void*) &tensorSoftmax;
-  void* tensorAddErrorPtr = (void*) &tensorAddError;
-    
+void emptyFunction() {
+
+  void *initRT = (void *)&llvm_hpvm_initTensorRt;
+  void *cleanRT = (void *)&llvm_hpvm_cleanupTensorRt;
+  void *request_tensorPtr = (void *)&hpvm_request_tensor;
+  void *startProf = (void *)&startProfiling;
+  void *stopProf = (void *)&stopProfiling;
+  void *create2Dptr = (void *)&create2DTensor;
+  void *create3Dptr = (void *)&create3DTensor;
+  void *create4Dptr = (void *)&create4DTensor;
+  void *initTensorPtr = (void *)&initTensorData;
+  void *tensorSplitPtr = (void *)&tensorSplit;
+  void *tensorConcatPtr = (void *)&tensorConcat;
+  void *tensorConvPtr = (void *)&tensorConvolution;
+  void *tensorHConvPtr = (void *)&tensorHConvolution;
+  void *tensorPoolPtr = (void *)&tensorPooling;
+  void *tensorLRNPtr = (void *)&tensorLRN;
+  void *tensorGemmPr = (void *)&tensorGemm;
+  void *tensorGemmCPUPtr = (void *)&tensorGemmCPU;
+  void *tensorGemmGPUPtr = (void *)&tensorGemmGPU;
+  void *tensorHgemmPtr = (void *)&tensorHgemm;
+  void *tensorGemmBiasPtr = (void *)&tensorGemmBias;
+  void *tensorAddPtr = (void *)&tensorAdd;
+  void *tensorReluPtr = (void *)&tensorRelu;
+  void *tensorSoftmaxPtr = (void *)&tensorSoftmax;
+  void *tensorAddErrorPtr = (void *)&tensorAddError;
 }
 
-
-
 #endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
index 8deddc88264bc4327bec8dad1709eac0d1a40322..abd89cc1ad76bab202d73db2e9cc576b98417564 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
@@ -3,244 +3,213 @@
 #ifndef CUDNN_HEADER
 #define CUDNN_HEADER
 
-
-#include <stdio.h>
-#include <cstdlib>
+#include "approx_api.h"
+#include "rt-controller-api.h"
+#include "tensor.h"
 #include <cmath>
+#include <cstdlib>
 #include <memory>
+#include <stdio.h>
 #include <string>
-#include "approx_api.h"
-#include "tensor.h"
-#include "rt-controller-api.h"
 
 #include "img_tensor_runtime.h"
 
+extern "C" {
+/****  Initialization Routine - Must be inserted at program start (in the
+ * backend)  ****/
+void llvm_hpvm_initTensorRt(int gpuid = 0);
+void llvm_hpvm_cleanupTensorRt();
 
+void llvm_hpvm_initApproxhpvmRt(int gpuid = 0);
+void llvm_hpvm_cleanupApproxhpvmRt();
 
-extern "C"{
-  /****  Initialization Routine - Must be inserted at program start (in the backend)  ****/
-  void llvm_hpvm_initTensorRt(int gpuid = 0);
-  void llvm_hpvm_cleanupTensorRt();
-
-  void llvm_hpvm_initApproxhpvmRt(int gpuid = 0);
-  void llvm_hpvm_cleanupApproxhpvmRt();
-
-  // Routine to moving tensor data (from and to GPU,CPU)
-  void hpvm_request_tensor(void* tensor, int destination);
-
-  /****** Profiling API - defines profiling scope */
-  void startProfiling();
-  void stopProfiling();
-
-  /****** Routines for tensor creation and initialization *******/
-  void* create2DTensor(int data_type, size_t dim1_size, size_t dim2_size);
-  void* create3DTensor(int data_type, size_t dim1_size, size_t dim2_size,
-		       size_t dim3_size);
-
-  // NOTE: Currently only using 4-D tensors - 2D and 3D tensors not supported for cuDNN operations
-  // NOTE: The only data format supported as of now is: CUDNN_NCHW
-  void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t dim2_size,
-		       size_t dim3_size, size_t dim4_size);
-  void initTensorData(void* tensor, void* data_ptr, size_t size_in_bytes);
-
-  void changeTensorPlacement(struct Tensor* tensor,
-			     data_location_t data_placement);
- 
-  void tensorCopy(void* srcTensor, void* dstTensor);
-  
-  void freeTensor(void*);
-
-  /********** Tensor Operation API ******/
-
-  void** tensorSplit(void* tensor, int num_splits, int split_dim);
-  void* tensorConcat(void** tensors, int num_splits, int split_dim);
-
-  // NOTE: For conv_mode, only value '1' is supported
-  void* tensorConvolution(void* input, void* filter,
-			  int vertical_pad, int horizontal_pad,
-			  int vertical_stride, int horizontal_stride,
-			  int conv_mode, int conv_groups);
-  void* tensorHalfConvolution(void* input, void* filter,
-			      int vertical_pad, int horizontal_pad,
-			      int vertical_stride, int horizontal_stride,
-			      int conv_mode, int conv_groups);
-
-  void* tensorPooling(void* input,
-		      int poolFunction,
-		      int window_height, int window_width,
-		      int vertical_pad, int horizontal_pad,
-		      int vertical_stride, int horizontal_stride);
-
-  void* tensorHalfPooling(void* input,
-			  int poolFunction,
-			  int window_height, int window_width,
-			  int vertical_pad, int horizontal_pad,
-			  int vertical_stride, int horizontal_stride);
-
-  
-  void* tensorLRN(void* input, unsigned int LRN_window,
-		  double LRN_alpha, double LRN_beta, double LRN_k);
-
-
-  /* 4 different Gemm versions */
-  void* tensorGemm(void* lhs, void* rhs);
-  void* tensorGemmCPU(void* lhs, void* rhs);
-  void* tensorGemmGPU(void* lhs, void* rhs); // , void* result_tensor = NULL);
-  void* tensorHalfGemmGPU(void* lhs, void* rhs);
-  void* tensorHalfGemm(void* lhs, void* rhs);
-
-  
-  // NOTE: In-place operation
-  void* tensorGemmBias(void* input, void* bias);
-  // NOTE: In place operation
-  void* tensorAdd(void* x, void* bias);
-  // NOTE: In place operation
-  void* tensorHalfAdd(void* x, void* bias);
-  // NOTE: In-place operation
-  void* tensorRelu(void* input);
-  // NOTE: In-place operation
-  void* tensorHalfRelu(void* input);
-  // NOTE: In-place operation
-  
-  void* tensorTanh(void* input);
-  // NOTE: In-place operation
-  void* tensorHalfTanh(void* input);
-
-  // NOTE: In-place operation
-  void* tensorRelu2(void* input, float min, float max);
-  // NOTE: In-place operation
-  void* tensorHalfRelu2(void* input, float min, float max);
-  // NOTE: In-place operation
-  void* tensorSoftmax(void* input);
-
-  // NOTE: In-place operation
-  void* tensorBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr,
-			void* mean_ptr, void* variance_ptr, double epsilon);
-
-  void* tensorHalfBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr,
-			    void* mean_ptr, void* variance_ptr, double epsilon);
-
-  
-  /* Error injection API - used for accuracy tuning */
-  void* tensorAddError(void* x_ptr, int error_scale);
-  
-  void* tensorGemmModel(void* lhs, void* rhs);
-
-  /*** Error Injection API End **/
-
-
-  /****  PROMISE API *****/
-
-  /*************  
-  --- Synopsys:
-
-  input:  input activation tensor
-  filter: filter tensor
-  bias:  bias tensor
-  conv_pad_h, conv_pad_w:  convolution padding in height and width
-  conv_stride_h, conv_stride_w: convolution stride - vertical and horizontal
-  pool_id: {0, 1}    0: max_pooling ,   1: avg_pooling
-  pool_size: Size of pooling window. Note: Pass '0' for *NO* Pooling
-  activation_id: {-1,0,1,2}   -1: NO Activation, 0: Tanh, 1: Relu, 2: ClippedRelu
-  Swing: PROMISE swing level
-  
-  *************/
-  
-  void* ConvLayer_PROMISE(void* input, float i_min, float i_max,
-			  void* filter, float w_min, float w_max,
-			  void* bias, float b_min, float b_max,
-			  int conv_pad_h, int conv_pad_w, int conv_stride_h, int conv_stride_w,
-			  int pool_id, int pool_size,
-			  int activation_id, // Relu, Tanh, ClipRelu
-			  float out_min, float out_max, int swing); // NOTE: min_val, max_val apply to 'ClippedRelu'
-
-  
-  void* ConvLayer_PROMISE2(void* input, float i_min, float i_max,
-  			   void* filter, float w_min, float w_max,
-			   void* bias, float b_min, float b_max,
-			   int conv_pad_h, int conv_pad_w,
-			   int conv_stride_h, int conv_stride_w,
-			   int pool_id, int pool_size, int pool_stride, 
-			   int activation_id, // Relu, Tanh, ClipRelu
-			   float out_min, float out_max, int swing);
-  
-
-
-  void* FCLayer_PROMISE(void* input, float i_min, float i_max,
-			void* weights, float w_min, float w_max,
-			void* bias, float b_min, float b_max,
-			int activation_id,
-			float out_min, float out_max, int swing); // NOTE: min_val, max_val apply to 'ClippedRelu'
-
-
-  /**** Wrapper Runtime API ***/
-  
-  void* wrapper_ConvLayer(const char* hpvm_node_id,
-			  void* input, 
-			  void* filter, 
-			  void* bias, 
-			  int conv_pad_h, int conv_pad_w,
-			  int conv_stride_h, int conv_stride_w,
-			  int pool_id, int pool_size,
-			  int activation_id, // Relu, Tanh, ClipRelu
-			  float out_min, float out_max);  
-
-
-  void* wrapper_FCLayer(const char* hpvm_node_id,
-			void* input, 
-		        void* weights, 
-		        void* bias, 
-		        int activation_id,
-		        float out_min, float out_max);
-
-  
-  void* wrapper_tensorGroupConvolution(const char* hpvm_node_id, void* input, void* filter,
-				       int vertical_pad, int horizontal_pad,
-				       int vertical_stride, int horizontal_stride,
-				       int conv_mode, int conv_groups);
-
-
-  void* wrapper_tensorRelu(const char* hpvm_node_id, void* input_ptr);
-  
-  void* wrapper_tensorTanh(const char* hpvm_node_id, void* input_ptr);
-  
-  void* wrapper_tensorBatchNorm(const char* hpvm_node_id,
-				void* input_ptr, void* gamma_ptr, void* beta_ptr,
-			        void* mean_ptr, void* variance_ptr, double epsilon);
-  
-  void* wrapper_tensorAdd(const char* hpvm_node_id, void* input_ptr, void* bias_ptr);
-  
-
-  void* wrapper_tensorPooling(const char* hpvm_node_id,
-			      void* input_ptr,
-			      int poolFunction,
-			      int window_height, int window_width,
-			      int vertical_pad, int horizontal_pad,
-			      int vertical_stride, int horizontal_stride);
-
-
-  void* wrapper_tensorSoftmax(const char* hpvm_node_id, void* input_ptr);
-
-
-
-
-  // Utilities
-  // TODO: separate utils in separate header
-  void dumpAccuracyNorms();
-  void readOpenTunerFlags(const char* file_name);
-  void clearOpCounter();
-  void clearTensorMap();
-  void startMemTracking();
-  void freeOutputTensors();
-  void freeBatchMemory();
-  void* quantizeTensorPromise(void* input_ptr, float min, float max);
-  void* addPromiseError(void* x_ptr, int error_scale);
-  void readSkipTensors(int* skip_tensor_ids, int op_count);
-  void convertToFP32(struct Tensor* tensor);
-
+// Routine for moving tensor data (to and from the GPU/CPU)
+void hpvm_request_tensor(void *tensor, int destination);
+
+/****** Profiling API - defines profiling scope */
+void startProfiling();
+void stopProfiling();
+
+/****** Routines for tensor creation and initialization *******/
+void *create2DTensor(int data_type, size_t dim1_size, size_t dim2_size);
+void *create3DTensor(int data_type, size_t dim1_size, size_t dim2_size,
+                     size_t dim3_size);
+
+// NOTE: Currently only using 4-D tensors - 2D and 3D tensors are not
+// supported for cuDNN operations
+// NOTE: The only data format supported as of now is CUDNN_NCHW
+void *create4DTensor(int data_type, int data_format, size_t dim1_size,
+                     size_t dim2_size, size_t dim3_size, size_t dim4_size);
+void initTensorData(void *tensor, void *data_ptr, size_t size_in_bytes);
+
+void changeTensorPlacement(struct Tensor *tensor,
+                           data_location_t data_placement);
+
+void tensorCopy(void *srcTensor, void *dstTensor);
+
+void freeTensor(void *);
+
+/********** Tensor Operation API ******/
+
+void **tensorSplit(void *tensor, int num_splits, int split_dim);
+void *tensorConcat(void **tensors, int num_splits, int split_dim);
+
+// NOTE: For conv_mode, only value '1' is supported
+void *tensorConvolution(void *input, void *filter, int vertical_pad,
+                        int horizontal_pad, int vertical_stride,
+                        int horizontal_stride, int conv_mode, int conv_groups);
+void *tensorHalfConvolution(void *input, void *filter, int vertical_pad,
+                            int horizontal_pad, int vertical_stride,
+                            int horizontal_stride, int conv_mode,
+                            int conv_groups);
+
+void *tensorPooling(void *input, int poolFunction, int window_height,
+                    int window_width, int vertical_pad, int horizontal_pad,
+                    int vertical_stride, int horizontal_stride);
+
+void *tensorHalfPooling(void *input, int poolFunction, int window_height,
+                        int window_width, int vertical_pad, int horizontal_pad,
+                        int vertical_stride, int horizontal_stride);
+
+void *tensorLRN(void *input, unsigned int LRN_window, double LRN_alpha,
+                double LRN_beta, double LRN_k);
+
+/* 4 different Gemm versions */
+void *tensorGemm(void *lhs, void *rhs);
+void *tensorGemmCPU(void *lhs, void *rhs);
+void *tensorGemmGPU(void *lhs, void *rhs); // , void* result_tensor = NULL);
+void *tensorHalfGemmGPU(void *lhs, void *rhs);
+void *tensorHalfGemm(void *lhs, void *rhs);
+
+// NOTE: In-place operation
+void *tensorGemmBias(void *input, void *bias);
+// NOTE: In-place operation
+void *tensorAdd(void *x, void *bias);
+// NOTE: In-place operation
+void *tensorHalfAdd(void *x, void *bias);
+// NOTE: In-place operation
+void *tensorRelu(void *input);
+// NOTE: In-place operation
+void *tensorHalfRelu(void *input);
+
+// NOTE: In-place operation
+void *tensorTanh(void *input);
+// NOTE: In-place operation
+void *tensorHalfTanh(void *input);
+
+// NOTE: In-place operation
+void *tensorRelu2(void *input, float min, float max);
+// NOTE: In-place operation
+void *tensorHalfRelu2(void *input, float min, float max);
+// NOTE: In-place operation
+void *tensorSoftmax(void *input);
+
+// NOTE: In-place operation
+void *tensorBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr,
+                      void *mean_ptr, void *variance_ptr, double epsilon);
+
+void *tensorHalfBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr,
+                          void *mean_ptr, void *variance_ptr, double epsilon);
+
+/* Error injection API - used for accuracy tuning */
+void *tensorAddError(void *x_ptr, int error_scale);
+
+void *tensorGemmModel(void *lhs, void *rhs);
+
+/*** Error Injection API End **/
+
+/****  PROMISE API *****/
+
+/*************
+--- Synopsis:
+
+input:  input activation tensor
+filter: filter tensor
+bias:  bias tensor
+conv_pad_h, conv_pad_w:  convolution padding in height and width
+conv_stride_h, conv_stride_w: convolution stride - vertical and horizontal
+pool_id: {0, 1}    0: max_pooling ,   1: avg_pooling
+pool_size: Size of pooling window. Note: Pass '0' for *NO* Pooling
+activation_id: {-1,0,1,2}   -1: NO Activation, 0: Tanh, 1: Relu, 2: ClippedRelu
+Swing: PROMISE swing level
+
+*************/
+
+void *
+ConvLayer_PROMISE(void *input, float i_min, float i_max, void *filter,
+                  float w_min, float w_max, void *bias, float b_min,
+                  float b_max, int conv_pad_h, int conv_pad_w,
+                  int conv_stride_h, int conv_stride_w, int pool_id,
+                  int pool_size,
+                  int activation_id, // Relu, Tanh, ClipRelu
+                  float out_min, float out_max,
+                  int swing); // NOTE: min_val, max_val apply to 'ClippedRelu'
+
+void *ConvLayer_PROMISE2(void *input, float i_min, float i_max, void *filter,
+                         float w_min, float w_max, void *bias, float b_min,
+                         float b_max, int conv_pad_h, int conv_pad_w,
+                         int conv_stride_h, int conv_stride_w, int pool_id,
+                         int pool_size, int pool_stride,
+                         int activation_id, // Relu, Tanh, ClipRelu
+                         float out_min, float out_max, int swing);
+
+void *
+FCLayer_PROMISE(void *input, float i_min, float i_max, void *weights,
+                float w_min, float w_max, void *bias, float b_min, float b_max,
+                int activation_id, float out_min, float out_max,
+                int swing); // NOTE: min_val, max_val apply to 'ClippedRelu'
+
+/**** Wrapper Runtime API ***/
+
+void *wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter,
+                        void *bias, int conv_pad_h, int conv_pad_w,
+                        int conv_stride_h, int conv_stride_w, int pool_id,
+                        int pool_size,
+                        int activation_id, // Relu, Tanh, ClipRelu
+                        float out_min, float out_max);
+
+void *wrapper_FCLayer(const char *hpvm_node_id, void *input, void *weights,
+                      void *bias, int activation_id, float out_min,
+                      float out_max);
+
+void *wrapper_tensorGroupConvolution(const char *hpvm_node_id, void *input,
+                                     void *filter, int vertical_pad,
+                                     int horizontal_pad, int vertical_stride,
+                                     int horizontal_stride, int conv_mode,
+                                     int conv_groups);
+
+void *wrapper_tensorRelu(const char *hpvm_node_id, void *input_ptr);
+
+void *wrapper_tensorTanh(const char *hpvm_node_id, void *input_ptr);
+
+void *wrapper_tensorBatchNorm(const char *hpvm_node_id, void *input_ptr,
+                              void *gamma_ptr, void *beta_ptr, void *mean_ptr,
+                              void *variance_ptr, double epsilon);
+
+void *wrapper_tensorAdd(const char *hpvm_node_id, void *input_ptr,
+                        void *bias_ptr);
+
+void *wrapper_tensorPooling(const char *hpvm_node_id, void *input_ptr,
+                            int poolFunction, int window_height,
+                            int window_width, int vertical_pad,
+                            int horizontal_pad, int vertical_stride,
+                            int horizontal_stride);
+
+void *wrapper_tensorSoftmax(const char *hpvm_node_id, void *input_ptr);
+
+// Utilities
+// TODO: move these utilities into a separate header
+void dumpAccuracyNorms();
+void readOpenTunerFlags(const char *file_name);
+void clearOpCounter();
+void clearTensorMap();
+void startMemTracking();
+void freeOutputTensors();
+void freeBatchMemory();
+void *quantizeTensorPromise(void *input_ptr, float min, float max);
+void *addPromiseError(void *x_ptr, int error_scale);
+void readSkipTensors(int *skip_tensor_ids, int op_count);
+void convertToFP32(struct Tensor *tensor);
 }
 
-
-
-
 #endif
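
A hedged call sketch for the PROMISE API, following the synopsis above: pool_id 0 selects max pooling, activation_id 1 selects ReLU, and the min/max ranges and swing level are placeholders that would normally come from calibration or autotuning.

    #include "tensor_runtime.h"

    void *promiseConvBlock(void *input, void *filter, void *bias) {
      return ConvLayer_PROMISE(input, /*i_min=*/-1.0f, /*i_max=*/1.0f,
                               filter, /*w_min=*/-0.3f, /*w_max=*/0.3f,
                               bias, /*b_min=*/-0.1f, /*b_max=*/0.1f,
                               /*conv_pad_h=*/1, /*conv_pad_w=*/1,
                               /*conv_stride_h=*/1, /*conv_stride_w=*/1,
                               /*pool_id=*/0, /*pool_size=*/2,
                               /*activation_id=*/1, // ReLU
                               /*out_min=*/0.0f, /*out_max=*/0.0f, // ClippedRelu only
                               /*swing=*/7);
    }
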
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc
index 61a895d63b9bcddcd18975eb54d6209771d645d0..19c385e27af0949cf3006c1947c34bb21d401017 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc
@@ -1,66 +1,65 @@
 
 #include "tensor_runtime.h"
 
+void dummyFunction() {
 
-void dummyFunction(){
+  void *initRT = (void *)&llvm_hpvm_initTensorRt;
+  void *cleanRT = (void *)&llvm_hpvm_cleanupTensorRt;
 
-  void* initRT = (void*) &llvm_hpvm_initTensorRt;
-  void* cleanRT = (void*) &llvm_hpvm_cleanupTensorRt;
+  void *initApproxRT = (void *)&llvm_hpvm_initApproxhpvmRt;
+  void *cleanApproxRT = (void *)&llvm_hpvm_cleanupApproxhpvmRt;
 
-  void* initApproxRT = (void*) &llvm_hpvm_initApproxhpvmRt;
-  void* cleanApproxRT = (void*) &llvm_hpvm_cleanupApproxhpvmRt;
+  void *initRTController = (void *)&llvm_hpvm_initializeRuntimeController;
+  void *cleanRTController = (void *)&llvm_hpvm_clearRuntimeController;
 
-  void* initRTController = (void*) &llvm_hpvm_initializeRuntimeController;
-  void* cleanRTController = (void*) &llvm_hpvm_clearRuntimeController;
-  
-  void* request_tensorPtr = (void*) &hpvm_request_tensor;
-  void* startProf = (void*) &startProfiling;
-  void* stopProf = (void*) &stopProfiling;
-  void* create2Dptr = (void*) &create2DTensor;
-  void* create3Dptr = (void*) &create3DTensor;
-  void* create4Dptr = (void*) &create4DTensor;
-  void* initTensorPtr = (void*) &initTensorData;
-  void* tensorSplitPtr = (void*) &tensorSplit;
-  void* tensorConcatPtr = (void*) &tensorConcat;
-  void* tensorConvPtr = (void*) &tensorConvolution;
-  void* tensorHConvPtr = (void*) &tensorHalfConvolution;
-  void* tensorPoolPtr = (void*) &tensorPooling;
-  void* tensorHalfPoolPtr = (void*) &tensorHalfPooling;
-  void* tensorLRNPtr = (void*) &tensorLRN;
-  void* tensorGemmPr = (void*) &tensorGemm;
-  void* tensorGemmCPUPtr = (void*) &tensorGemmCPU;
-  void* tensorGemmGPUPtr = (void*) &tensorGemmGPU;
-  void* tensorHgemmPtr = (void*) &tensorHalfGemm;
-  void* tensorGemmBiasPtr = (void*) &tensorGemmBias;
-  void* tensorAddPtr = (void*) &tensorAdd;
-  void* tensorHalfAddPtr = (void*) &tensorHalfAdd;
-  void* tensorReluPtr = (void*) &tensorRelu;
-  //FIXME: --void* tensorHalfReluPtr = (void*) &tensorHalfRelu;
-  void* tensorRelu2Ptr = (void*) &tensorRelu2;
-  void* tensorHalfRelu2Ptr = (void*) &tensorHalfRelu2;
-  void* tensorTanhPtr = (void*) &tensorTanh;
-  void* tensorHalfTanhPtr = (void*) &tensorHalfTanh;
-  void* tensorSoftmaxPtr = (void*) &tensorSoftmax;
-  void* tensorBatchNormPtr = (void*) &tensorBatchNorm;
-  void* tensorAddErrorPtr = (void*) &tensorAddError;
-  void* ConvLayer = (void*) &ConvLayer_PROMISE;
-  void* FCLayer = (void*) &FCLayer_PROMISE;
-  
-  void* ConvLayer2 = (void*) &wrapper_ConvLayer;
-  void* FCLayer2 = (void*) &wrapper_FCLayer;
-  void* AddWrapper = (void*) &wrapper_tensorAdd;
-  void* ReluWrapper = (void*) &wrapper_tensorRelu;    
-  void* TanhWrapper = (void*) &wrapper_tensorTanh;
-  void* BatchNormWrapper = (void*) &wrapper_tensorBatchNorm;    
-  void* PoolingWrapper = (void*) &wrapper_tensorPooling;    
-  void* softmaxWrapper = (void*) &wrapper_tensorSoftmax;    
+  void *request_tensorPtr = (void *)&hpvm_request_tensor;
+  void *startProf = (void *)&startProfiling;
+  void *stopProf = (void *)&stopProfiling;
+  void *create2Dptr = (void *)&create2DTensor;
+  void *create3Dptr = (void *)&create3DTensor;
+  void *create4Dptr = (void *)&create4DTensor;
+  void *initTensorPtr = (void *)&initTensorData;
+  void *tensorSplitPtr = (void *)&tensorSplit;
+  void *tensorConcatPtr = (void *)&tensorConcat;
+  void *tensorConvPtr = (void *)&tensorConvolution;
+  void *tensorHConvPtr = (void *)&tensorHalfConvolution;
+  void *tensorPoolPtr = (void *)&tensorPooling;
+  void *tensorHalfPoolPtr = (void *)&tensorHalfPooling;
+  void *tensorLRNPtr = (void *)&tensorLRN;
+  void *tensorGemmPr = (void *)&tensorGemm;
+  void *tensorGemmCPUPtr = (void *)&tensorGemmCPU;
+  void *tensorGemmGPUPtr = (void *)&tensorGemmGPU;
+  void *tensorHgemmPtr = (void *)&tensorHalfGemm;
+  void *tensorGemmBiasPtr = (void *)&tensorGemmBias;
+  void *tensorAddPtr = (void *)&tensorAdd;
+  void *tensorHalfAddPtr = (void *)&tensorHalfAdd;
+  void *tensorReluPtr = (void *)&tensorRelu;
+  // FIXME: --void* tensorHalfReluPtr = (void*) &tensorHalfRelu;
+  void *tensorRelu2Ptr = (void *)&tensorRelu2;
+  void *tensorHalfRelu2Ptr = (void *)&tensorHalfRelu2;
+  void *tensorTanhPtr = (void *)&tensorTanh;
+  void *tensorHalfTanhPtr = (void *)&tensorHalfTanh;
+  void *tensorSoftmaxPtr = (void *)&tensorSoftmax;
+  void *tensorBatchNormPtr = (void *)&tensorBatchNorm;
+  void *tensorAddErrorPtr = (void *)&tensorAddError;
+  void *ConvLayer = (void *)&ConvLayer_PROMISE;
+  void *FCLayer = (void *)&FCLayer_PROMISE;
 
-  void* tensorFft = (void *) &wrapper_tensorFft;
-  void* tensorReduce = (void *) &wrapper_tensorReduce;
-  void* tensorProjectiveT = (void *) &wrapper_tensorProjectiveT;
-  void* tensorMap1 = (void *) &wrapper_tensorMap1;
-  void* tensorMap2 = (void *) &wrapper_tensorMap2;
-  void* tensorMap3 = (void *) &wrapper_tensorMap3;
-  void* tensorStencil = (void *) &wrapper_tensorStencil;
-  void* tensorCosineT = (void *) &wrapper_tensorCosineT;
+  void *ConvLayer2 = (void *)&wrapper_ConvLayer;
+  void *FCLayer2 = (void *)&wrapper_FCLayer;
+  void *AddWrapper = (void *)&wrapper_tensorAdd;
+  void *ReluWrapper = (void *)&wrapper_tensorRelu;
+  void *TanhWrapper = (void *)&wrapper_tensorTanh;
+  void *BatchNormWrapper = (void *)&wrapper_tensorBatchNorm;
+  void *PoolingWrapper = (void *)&wrapper_tensorPooling;
+  void *softmaxWrapper = (void *)&wrapper_tensorSoftmax;
+
+  void *tensorFft = (void *)&wrapper_tensorFft;
+  void *tensorReduce = (void *)&wrapper_tensorReduce;
+  void *tensorProjectiveT = (void *)&wrapper_tensorProjectiveT;
+  void *tensorMap1 = (void *)&wrapper_tensorMap1;
+  void *tensorMap2 = (void *)&wrapper_tensorMap2;
+  void *tensorMap3 = (void *)&wrapper_tensorMap3;
+  void *tensorStencil = (void *)&wrapper_tensorStencil;
+  void *tensorCosineT = (void *)&wrapper_tensorCosineT;
 }
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.h
index 4204e50f99a192b523222d5aa6a54926c8032a92..f9a199eea2d5e80d3da9b238ac521409df1a1ac0 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.h
@@ -2,71 +2,64 @@
 #ifndef TENSOR_UTILS_HEADER
 #define TENSOR_UTILS_HEADER
 
-
-#include <vector>
 #include "tensor.h"
+#include <vector>
 
+extern "C" {
 
+void freeTensor(void *tensor_ptr);
 
-extern "C"{
-
-  void freeTensor(void* tensor_ptr);
-
-  // Returns the size of the target cudnn datatype
-  int getTypeSize(int data_type);
-
-  void setSizeInBytes(struct Tensor* tensor, int data_type, size_t num_elems);
-
-  // NOTE: Always allocates FP32 on Host, FP32/FP16 for Device (GPU)
-  void allocateMem(struct Tensor* tensor, int data_type, size_t num_elems);
-
-  void setCudnnDataFormat(struct Tensor* tensor, int data_format);
+// Returns the size of the target cudnn datatype
+int getTypeSize(int data_type);
 
+void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems);
 
-  void set4DFilterDescriptor(struct Tensor* tensor, int data_format, size_t dim1_size,
-			     size_t dim2_size, size_t dim3_size, size_t dim4_size);
+// NOTE: Always allocates FP32 on Host, FP32/FP16 for Device (GPU)
+void allocateMem(struct Tensor *tensor, int data_type, size_t num_elems);
 
-  void set4DTensorDescriptor(struct Tensor* tensor, int data_format, size_t dim1_size,
-			     size_t dim2_size, size_t dim3_size, size_t dim4_size);
+void setCudnnDataFormat(struct Tensor *tensor, int data_format);
 
-  // FIXIT: Striding still not working - hence 2D and 3D tensor support is missing
-  void setTensorDescriptor(struct Tensor* tensor, int num_dims,
-			   size_t* dim_sizes);
+void set4DFilterDescriptor(struct Tensor *tensor, int data_format,
+                           size_t dim1_size, size_t dim2_size, size_t dim3_size,
+                           size_t dim4_size);
 
+void set4DTensorDescriptor(struct Tensor *tensor, int data_format,
+                           size_t dim1_size, size_t dim2_size, size_t dim3_size,
+                           size_t dim4_size);
 
-  void* create2DTensor(int data_type, size_t dim1_size, size_t dim2_size);
+// FIXIT: Striding still not working - hence 2D and 3D tensor support is missing
+void setTensorDescriptor(struct Tensor *tensor, int num_dims,
+                         size_t *dim_sizes);
 
-  void* create3DTensor(int data_type, size_t dim1_size, size_t dim2_size,
-		       size_t dim3_size);
+void *create2DTensor(int data_type, size_t dim1_size, size_t dim2_size);
 
-  void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t dim2_size,
-		       size_t dim3_size, size_t dim4_size);
-  
-  void initTensorData(void* tensor_ptr, void* data_ptr, size_t size_in_bytes);
+void *create3DTensor(int data_type, size_t dim1_size, size_t dim2_size,
+                     size_t dim3_size);
 
-  void hostToDeviceCopy(struct Tensor* tensor);
-  
-  void deviceToHostCopy(struct Tensor* tensor);
+void *create4DTensor(int data_type, int data_format, size_t dim1_size,
+                     size_t dim2_size, size_t dim3_size, size_t dim4_size);
 
-  void tensorCopy(void* srcTensor_ptr, void* dstTensor_ptr);
+void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes);
 
-  void hpvm_request_tensor(void* tensor_ptr, int destination);
+void hostToDeviceCopy(struct Tensor *tensor);
 
+void deviceToHostCopy(struct Tensor *tensor);
 
+void tensorCopy(void *srcTensor_ptr, void *dstTensor_ptr);
 
-  void convertToFP16(struct Tensor* tensor);
+void hpvm_request_tensor(void *tensor_ptr, int destination);
 
-  void convertToFP32(struct Tensor* tensor);
+void convertToFP16(struct Tensor *tensor);
 
-  void convertToFP32_offline(struct Tensor* tensor);
+void convertToFP32(struct Tensor *tensor);
 
-  // Called from within the runtime to change the data placement
-  // This routine is required to change the output data placements from host to device
-  void changeTensorPlacement(struct Tensor* tensor, data_location_t data_placement);
+void convertToFP32_offline(struct Tensor *tensor);
 
+// Called from within the runtime to change the data placement
+// This routine is required to change the output data placements from host to
+// device
+void changeTensorPlacement(struct Tensor *tensor,
+                           data_location_t data_placement);
 }
 
-
 #endif
-
-
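
A sketch of the placement and precision helpers above, assuming convertToFP16/convertToFP32 maintain gpu_half_data/gpu_data as the live device copy and the copy routines move data across the host/device boundary; the exact semantics are not confirmed by this header.

    #include "tensor_utils.h"

    void fp16RoundTrip(struct Tensor *t) {
      hostToDeviceCopy(t);  // gpu_data becomes the live copy
      convertToFP16(t);     // assumed: populate gpu_half_data from gpu_data
      // ... run an FP16 (approximate) kernel here ...
      convertToFP32(t);     // assumed: convert results back to FP32
      deviceToHostCopy(t);  // pull results into host_data
    }
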
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc
index 8e13ca118621efb3769b620249c2bbb1bc40978f..9d7cb4976f30d1f16a90a1eb48524a95192888f5 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc
@@ -1,208 +1,192 @@
 
 
-#include <sstream>
 #include <fstream>
 #include <map>
-#include <vector>
+#include <sstream>
 #include <string.h>
+#include <vector>
 
 #include "approx_knob_utils.h"
 #include "debug.h"
 
-
-PerfParams::PerfParams(){
+PerfParams::PerfParams() {
   row = 1;
   col = 1;
   skip_offset = 0;
 }
-  
-PerfParams::PerfParams(int row1, int col1, int skip_offset1){
+
+PerfParams::PerfParams(int row1, int col1, int skip_offset1) {
   row = row1;
   col = col1;
   skip_offset = skip_offset1;
 }
- 		
 
+PerfParamSet::PerfParamSet() {
 
-PerfParamSet::PerfParamSet(){
-  
   char llvm_src_root[100];
-  char* env_str=  getenv("LLVM_SRC_ROOT");  
+  char *env_str = getenv("LLVM_SRC_ROOT");
 
-  if (env_str == NULL){
+  if (env_str == NULL) {
     ERROR("ERROR: SET LLVM_SRC_ROOT \n");
   }
 
-  strcpy(llvm_src_root, env_str);  
-  printf ("*LLVM_SRC_ROOT = %s", llvm_src_root);
+  strcpy(llvm_src_root, env_str);
+  printf("*LLVM_SRC_ROOT = %s", llvm_src_root);
+
+  char *knobs_file_path =
+      strcat(llvm_src_root,
+             "/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt");
+  printf("- knobs_file_path = %s \n", knobs_file_path);
 
-  char* knobs_file_path = strcat(llvm_src_root, "/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt");
-  printf ("- knobs_file_path = %s \n", knobs_file_path);
-  
   std::ifstream file(knobs_file_path);
 
   std::string line;
   std::string partial;
   std::vector<std::string> tokens;
 
-  while(std::getline(file, line)) {     // Read each line 
+  while (std::getline(file, line)) { // Read each line
 
-    //printf ("***** line === %s ", line);  
+    // printf ("***** line === %s ", line);
     std::istringstream iss(line);
     std::string token;
-    while(std::getline(iss, token, '\t')){   // Read each token in the line
+    while (std::getline(iss, token, '\t')) { // Read each token in the line
       tokens.push_back(token);
 
       int index = token.find("perf");
-      if (index != std::string::npos){
-
-	int index2 = token.find(",");
-	std::string knob_str = token.substr(index2 + 1);
-	int knob = atoi(knob_str.c_str());
-	  
-	std::getline(iss, token, '\t');
-	std::istringstream token_stream(token);
-
-	std::string tok;
-	  
-	std::getline(token_stream, tok, ',');	  
-	int row = atoi(tok.c_str());
-
-	std::getline(token_stream, tok, ',');
-	int col = atoi(tok.c_str());
-
-	std::getline(token_stream, tok, ',');
-	int offset = atoi(tok.c_str());
-
-	printf ("**** knob = %d, row = %d, col = %d, offset = %d \n\n", knob, row, col, offset); 
-	PerfParams params(row, col, offset);
-	perf_knob_map[knob] = params;
-	  
+      if (index != std::string::npos) {
+
+        int index2 = token.find(",");
+        std::string knob_str = token.substr(index2 + 1);
+        int knob = atoi(knob_str.c_str());
+
+        std::getline(iss, token, '\t');
+        std::istringstream token_stream(token);
+
+        std::string tok;
+
+        std::getline(token_stream, tok, ',');
+        int row = atoi(tok.c_str());
+
+        std::getline(token_stream, tok, ',');
+        int col = atoi(tok.c_str());
+
+        std::getline(token_stream, tok, ',');
+        int offset = atoi(tok.c_str());
+
+        printf("**** knob = %d, row = %d, col = %d, offset = %d \n\n", knob,
+               row, col, offset);
+        PerfParams params(row, col, offset);
+        perf_knob_map[knob] = params;
       }
-	
     }
   }
 
   file.close();
 }
 
-  
-PerfParams PerfParamSet::getPerfParams(int swing){
+PerfParams PerfParamSet::getPerfParams(int swing) {
 
-  if (swing >= 150){
+  if (swing >= 150) {
     swing = swing - 30;
   }
-  
+
   return perf_knob_map[swing];
-  
 }
 
-
-
-  
-SampParams::SampParams(){
-    skip_rate = 1;
-    skip_offset = 0;
-}
-  
-SampParams::SampParams(int skip_rate1, int skip_offset1, float interpolation_id1){
-    skip_rate = skip_rate1;
-    skip_offset = skip_offset1;
-    interpolation_id = interpolation_id1;
+SampParams::SampParams() {
+  skip_rate = 1;
+  skip_offset = 0;
 }
- 		
 
+SampParams::SampParams(int skip_rate1, int skip_offset1,
+                       float interpolation_id1) {
+  skip_rate = skip_rate1;
+  skip_offset = skip_offset1;
+  interpolation_id = interpolation_id1;
+}
 
-SampParamSet::SampParamSet(){
+SampParamSet::SampParamSet() {
 
   char llvm_src_root[100];
-  char* env_str=  getenv("LLVM_SRC_ROOT");  
+  char *env_str = getenv("LLVM_SRC_ROOT");
 
-  if (env_str == NULL){
+  if (env_str == NULL) {
     ERROR("ERROR: SET LLVM_SRC_ROOT \n");
   }
 
-  strcpy(llvm_src_root, env_str);  
-  printf ("* LLVM_SRC_ROOT = %s \n", llvm_src_root);
+  strcpy(llvm_src_root, env_str);
+  printf("* LLVM_SRC_ROOT = %s \n", llvm_src_root);
+
+  char *knobs_file_path =
+      strcat(llvm_src_root,
+             "/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt");
+  printf("- knobs_file_path = %s \n", knobs_file_path);
 
-  char* knobs_file_path = strcat(llvm_src_root, "/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt");
-  printf ("- knobs_file_path = %s \n", knobs_file_path);
- 
   std::ifstream file(knobs_file_path);
 
   std::string line;
   std::string partial;
   std::vector<std::string> tokens;
 
-  while(std::getline(file, line)) {     // Read each line 
+  while (std::getline(file, line)) { // Read each line
 
     std::istringstream iss(line);
     std::string token;
-    while(std::getline(iss, token, '\t')){   // Read each token in the line
+    while (std::getline(iss, token, '\t')) { // Read each token in the line
       tokens.push_back(token);
 
       int index = token.find("samp");
       int test_index = token.find("reduction");
-	
-      if (index != std::string::npos && test_index == std::string::npos){
-
-	int index2 = token.find(",");
-	std::string knob_str = token.substr(index2 + 1);
-	int knob = atoi(knob_str.c_str());
-	printf ("knob = %d \n", knob);
-	  
-	std::getline(iss, token, '\t');
-	std::istringstream token_stream(token);
-
-	std::string tok;
-	  
-	std::getline(token_stream, tok, ',');	  
-	int skip_every = atoi(tok.c_str());
-
-	std::getline(token_stream, tok, ',');
-	int offset = atoi(tok.c_str());
-
-	std::getline(token_stream, tok, ',');
-	float interpolation_id = atof(tok.c_str());
-
-	printf ("skip_every = %d, offset = %d \n", skip_every, offset);
-	SampParams params(skip_every, offset, interpolation_id);
-	samp_knob_map[knob] = params;
-	  
+
+      if (index != std::string::npos && test_index == std::string::npos) {
+
+        int index2 = token.find(",");
+        std::string knob_str = token.substr(index2 + 1);
+        int knob = atoi(knob_str.c_str());
+        printf("knob = %d \n", knob);
+
+        std::getline(iss, token, '\t');
+        std::istringstream token_stream(token);
+
+        std::string tok;
+
+        std::getline(token_stream, tok, ',');
+        int skip_every = atoi(tok.c_str());
+
+        std::getline(token_stream, tok, ',');
+        int offset = atoi(tok.c_str());
+
+        std::getline(token_stream, tok, ',');
+        float interpolation_id = atof(tok.c_str());
+
+        printf("skip_every = %d, offset = %d \n", skip_every, offset);
+        SampParams params(skip_every, offset, interpolation_id);
+        samp_knob_map[knob] = params;
       }
-	
     }
   }
 
-
   file.close();
-  
 }
 
+SampParams SampParamSet::getSampParams(int swing) {
 
-
-SampParams SampParamSet::getSampParams(int swing){
-
-  if (swing >= 260){
+  if (swing >= 260) {
     swing = swing - 30;
   }
 
-  return samp_knob_map[swing];  
+  return samp_knob_map[swing];
 }
 
-
-
-  
 RedSampParams::RedSampParams() {
   skip_ratio = 0.0f;
   is_half = false;
 }
-  
+
 RedSampParams::RedSampParams(float skip_ratio1, bool is_half1) {
   skip_ratio = skip_ratio1;
   is_half = is_half1;
 }
- 	       
 
 RedSampParams getRedSampParams(int swing) {
 
@@ -227,7 +211,4 @@ RedSampParams getRedSampParams(int swing) {
   red_samp_knob_map[46] = params46;
 
   return red_samp_knob_map[swing];
-  
 }
-
-
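Both constructors above tokenize global_knobs.txt by tabs and then by commas. Inferring from the parsing code alone (the actual file is not part of this patch), a perf and a samp entry would look roughly like this, with a single tab between the two fields:

    perf,121	2,1,0      -> PerfParams{row=2, col=1, skip_offset=0}
    samp,231	2,0,1.0    -> SampParams{skip_rate=2, skip_offset=0, interpolation_id=1.0}

A hypothetical lookup then goes through the remapping in the getters (perf knobs >= 150 and samp knobs >= 260 are shifted down by 30):

    PerfParamSet perfSet; // reads $LLVM_SRC_ROOT/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt
    PerfParams p = perfSet.getPerfParams(121);
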
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp
index 08607a90796836d2218c53355a142c9c1e11cf6f..0fe6c20ca848c1caf8180735db9d5cce2f3b2f82 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp
@@ -30,7 +30,7 @@ template <> half *convertAndGetGPUData<half>(Tensor *t) {
     return static_cast<half *>(t->gpu_half_data);
   }
   ERROR("Type %s is incompatible with target type half\n",
-         std::to_string(t->cur_type));
+        std::to_string(t->cur_type));
 }
 
 template <> float2 *convertAndGetGPUData<float2>(Tensor *t) {
@@ -85,8 +85,8 @@ std::vector<size_t> sizes(const Dimension &dim) {
 std::vector<size_t> sizes(Tensor *t) { return sizes(t->dims); }
 
 size_t num_elems(const std::vector<size_t> &dim_sizes) {
-  return std::accumulate(
-      dim_sizes.begin(), dim_sizes.end(), 1, std::multiplies<>());
+  return std::accumulate(dim_sizes.begin(), dim_sizes.end(), 1,
+                         std::multiplies<>());
 }
 
 size_t num_elems(const Dimension &dim) { return num_elems(sizes(dim)); }
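num_elems is a plain fold of the dimension sizes under multiplication; for example:

    std::vector<size_t> dims{2, 3, 4}; // a 2x3x4 tensor
    size_t n = num_elems(dims);        // 1 * 2 * 3 * 4 == 24
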
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
index 9efbea07c9a1ef31a87a3266de89cb9d10660621..517b7c7009645c43c7f2af6fa733b4205590efd8 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
@@ -12,8 +12,8 @@ bool NodeConfiguration::isGPUNodeConfiguration() {
   return NODE_CONFIGURATION_TARGET_ID == GPU;
 }
 
-void PROMISENodeConfiguration::pushNewApproximationChoice(
-    P_APPROX approx, int u) {
+void PROMISENodeConfiguration::pushNewApproximationChoice(P_APPROX approx,
+                                                          int u) {
   ApproxChoices.push_back(std::make_pair(approx, u));
 }
 
@@ -36,9 +36,8 @@ void GPUNodeConfiguration::pushNewTensorOperation(enum TENSOR_OP top) {
 void GPUNodeConfiguration::pushNewApproximationChoiceForOperation(
     G_APPROX approx, int u) {
   unsigned size = ApproxChoices.size();
-  CUSTOM_ASSERT(
-      size >= 1 &&
-      "Cannot apply approximation choice to non existent operation.");
+  CUSTOM_ASSERT(size >= 1 &&
+                "Cannot apply approximation choice to non existent operation.");
   ApproxChoices[size - 1].second.push_back(std::make_pair(approx, u));
 }
 
@@ -52,8 +51,8 @@ GPUNodeConfiguration::GPUNodeConfiguration() {
 }
 GPUNodeConfiguration::~GPUNodeConfiguration() {}
 
-Configuration::Configuration(
-    std::string &n, float f, float e, float a, float al)
+Configuration::Configuration(std::string &n, float f, float e, float a,
+                             float al)
     : name(n), speedup(f), energy(e), accuracy(a), accuracyLoss(al) {}
 
 float Configuration::getSpeedup() { return speedup; }
@@ -63,20 +62,20 @@ float Configuration::getEnergy() { return energy; }
 float Configuration::getAccuracy() { return accuracy; }
 
 float Configuration::getAccuracyLoss() { return accuracyLoss; }
-bool ConfigurationLessThan::
-operator()(const struct Configuration &a, const struct Configuration &b) const {
+bool ConfigurationLessThan::operator()(const struct Configuration &a,
+                                       const struct Configuration &b) const {
   return (a.accuracyLoss < b.accuracyLoss);
 }
-bool ConfigurationLessThan_AL::
-operator()(const struct Configuration *a, const float &b) const {
+bool ConfigurationLessThan_AL::operator()(const struct Configuration *a,
+                                          const float &b) const {
   return (a->accuracyLoss < b);
 }
-bool ConfigurationLessThan_SP::
-operator()(const struct Configuration *a, const float &b) const {
+bool ConfigurationLessThan_SP::operator()(const struct Configuration *a,
+                                          const float &b) const {
   return (a->speedup < b);
 }
-bool ConfigurationLessThan_E::
-operator()(const struct Configuration *a, const float &b) const {
+bool ConfigurationLessThan_E::operator()(const struct Configuration *a,
+                                         const float &b) const {
   return (a->energy < b);
 }
 
@@ -212,9 +211,8 @@ void GPUNodeConfiguration::print() {
 void Configuration::print() {
 
   printf("+++++\n");
-  printf(
-      "%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy,
-      accuracyLoss);
+  printf("%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy,
+         accuracyLoss);
   for (std::map<std::string, NodeConfiguration *>::const_iterator it =
            setup.begin();
        it != setup.end(); ++it) {
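The pointer comparators above exist so that std::lower_bound can search the (already sorted) configuration curves by a single float key. A sketch of the speedup case, mirroring the pattern used in RuntimeController::findTargetConfiguration later in this patch (assumes configuration.h is included):

    #include <algorithm>
    #include <vector>

    unsigned indexForSpeedup(std::vector<struct Configuration *> &sorted, float goal) {
      auto it = std::lower_bound(sorted.begin(), sorted.end() - 1, goal,
                                 ConfigurationLessThan_SP());
      return it - sorted.begin(); // first config with speedup >= goal, else the last one
    }
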
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc
index 3720d43f32065e14b27148753e505d874c771ec6..ebb7e73f2b5a019954e7390f3eb8fadc96a3719e 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc
@@ -3,20 +3,19 @@
 #ifndef RUNTIME_DEBUG
 #define RUNTIME_DEBUG
 
-#define LOG_DEBUG 1   // Sets the debug logging to true
-#define LOG_INFO 1 // Sets the info logging to true
+#define LOG_DEBUG 1 // Sets the debug logging to true
+#define LOG_INFO 1  // Sets the info logging to true
 #define ASSERT_FLAG // Sets assertions to true (opposite of NDEBUG macro)
 
-#include "tensor.h"
 #include "debug.h"
+#include "tensor.h"
+#include <sstream>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <sstream>
 
-
-void INFO(const char* format, ...){
-  if(!LOG_INFO) // Don't print if logging info is disabled
+void INFO(const char *format, ...) {
+  if (!LOG_INFO) // Don't print if info logging is disabled
     return;
   va_list args;
   va_start(args, format);
@@ -25,8 +24,8 @@ void INFO(const char* format, ...){
   va_end(args);
 }
 
-void DEBUG(const char* format, ...){
-  if(!LOG_DEBUG) // Don't print if logging info is disabled
+void DEBUG(const char *format, ...) {
+  if (!LOG_DEBUG) // Don't print if debug logging is disabled
     return;
   va_list args;
   va_start(args, format);
@@ -35,8 +34,8 @@ void DEBUG(const char* format, ...){
   va_end(args);
 }
 
-void ERROR(const char* format, ...){
-  if(!LOG_DEBUG) // Don't print if logging info is disabled
+void ERROR(const char *format, ...) {
+  if (!LOG_DEBUG) // Don't print if debug logging is disabled (also skips abort)
     return;
   va_list args;
   va_start(args, format);
@@ -47,39 +46,30 @@ void ERROR(const char* format, ...){
   abort();
 }
 
-
-
-void fillOnes(struct Tensor* tensor){
+void fillOnes(struct Tensor *tensor) {
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      data_arr[i] = 1.0;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = 1.0;
     }
   }
 }
 
-
-void printTensorDescInfo(struct Tensor* tensor){
+void printTensorDescInfo(struct Tensor *tensor) {
 
   cudnnDataType_t dType;
   int nStride, cStride, hStride, wStride;
   int size1, size2, size3, size4;
-  cudnnGetTensor4dDescriptor(tensor->tensor_desc,
-  			     &dType,
-  			     &size1, &size2, &size3, &size4,
-  			     &nStride, &cStride, &hStride, &wStride);
-
-  DEBUG("dType = %d, size1 = %d, size2 = %d, size3 = %d, size4 = %d \n",
-  	 dType, size1, size2, size3, size4);
-  
-  DEBUG("nStride = %d, cStride = %d, hStride = %d, wStride = %d \n",
-  	 nStride, cStride, hStride, wStride);
-  
-}
-
-
+  cudnnGetTensor4dDescriptor(tensor->tensor_desc, &dType, &size1, &size2,
+                             &size3, &size4, &nStride, &cStride, &hStride,
+                             &wStride);
 
+  DEBUG("dType = %d, size1 = %d, size2 = %d, size3 = %d, size4 = %d \n", dType,
+        size1, size2, size3, size4);
 
+  DEBUG("nStride = %d, cStride = %d, hStride = %d, wStride = %d \n", nStride,
+        cStride, hStride, wStride);
+}
 
 #endif
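All three loggers are printf-style varargs functions. Note that, as written, ERROR is gated on LOG_DEBUG, so building with LOG_DEBUG set to 0 silences errors and skips the abort() as well. Illustrative call sites (not from the patch):

    INFO("Initialized %d tensors \n", tensor_count);
    DEBUG("tensor %p placement = %d \n", (void *)tensor, placement);
    ERROR("Unsupported data type %d \n", data_type); // prints, then abort()
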
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cpp
index 8e163e7049fbe317624e934504d7dc9297032983..9bec84de77fc279547eaaba8410c0e25ba3f3cd0 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cpp
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cpp
@@ -1,130 +1,127 @@
+#include "debug.h"
 #include <cstdarg>
 #include <cstdio>
-#include <stdexcept>
 #include <cuda_runtime_api.h>
-#include "debug.h"
+#include <stdexcept>
 
 void throwError(const char *file, int line, const char *fmt, ...) {
-    char msg[2048];
-    va_list args;
-    /* vasprintf not standard */
-    /* vsnprintf: how to handle if does not exist? */
-    va_start(args, fmt);
-    int n = vsnprintf(msg, 2048, fmt, args);
-    va_end(args);
-    if (n < 2048) {
-        snprintf(msg + n, 2048 - n, " at %s:%d", file, line);
-    }
+  char msg[2048];
+  va_list args;
+  /* vasprintf is not standard */
+  /* vsnprintf: how to handle platforms where it does not exist? */
+  va_start(args, fmt);
+  int n = vsnprintf(msg, 2048, fmt, args);
+  va_end(args);
+  if (n < 2048) {
+    snprintf(msg + n, 2048 - n, " at %s:%d", file, line);
+  }
 
-    ERROR(msg);
+  ERROR(msg);
 }
 
-template<typename T, typename F>
-void checkCompareFlag(
-        T err, T success_const, F get_err_str, const char *error_kind, const char *file, int line
-) {
-    if (err != success_const) {
-        static int alreadyFailed = 0;
-        if (!alreadyFailed) {
-            fprintf(
-                    stderr, "%s Error file=%s line=%i error=%i : %s\n",
-                    error_kind, file, line, err,
-                    get_err_str(err)
-            );
-            alreadyFailed = 1;
-        }
-        throwError(
-                file, line, "%s Error error (%d) : %s", error_kind, err,
-                get_err_str(err)
-        );
+template <typename T, typename F>
+void checkCompareFlag(T err, T success_const, F get_err_str,
+                      const char *error_kind, const char *file, int line) {
+  if (err != success_const) {
+    static int alreadyFailed = 0;
+    if (!alreadyFailed) {
+      fprintf(stderr, "%s Error file=%s line=%i error=%i : %s\n", error_kind,
+              file, line, err, get_err_str(err));
+      alreadyFailed = 1;
     }
+    throwError(file, line, "%s Error error (%d) : %s", error_kind, err,
+               get_err_str(err));
+  }
 }
 
 void _checkCUDA(cudaError_t err, const char *file, int line) {
-    checkCompareFlag(err, cudaSuccess, cudaGetErrorString, "CUDA", file, line);
+  checkCompareFlag(err, cudaSuccess, cudaGetErrorString, "CUDA", file, line);
 }
 
 void _checkWarnCUDA(cudaError_t err, const char *file, int line) {
-    if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA Warning file=%s line=%i error=%i : %s\n", file, line, err,
-                cudaGetErrorString(err));
-    }
+  if (err != cudaSuccess) {
+    fprintf(stderr, "CUDA Warning file=%s line=%i error=%i : %s\n", file, line,
+            err, cudaGetErrorString(err));
+  }
 }
 
 void _checkCUDNN(cudnnStatus_t error, const char *file, int line) {
-    checkCompareFlag(error, CUDNN_STATUS_SUCCESS, cudnnGetErrorString, "CUDNN", file, line);
+  checkCompareFlag(error, CUDNN_STATUS_SUCCESS, cudnnGetErrorString, "CUDNN",
+                   file, line);
 }
 
 static const char *cublasGetErrorString(cublasStatus_t status) {
-    switch (status) {
-        case CUBLAS_STATUS_SUCCESS:
-            return "CUBLAS_STATUS_SUCCESS";
-        case CUBLAS_STATUS_NOT_INITIALIZED:
-            return "CUBLAS_STATUS_NOT_INITIALIZED";
-        case CUBLAS_STATUS_ALLOC_FAILED:
-            return "CUBLAS_STATUS_ALLOC_FAILED";
-        case CUBLAS_STATUS_INVALID_VALUE:
-            return "CUBLAS_STATUS_INVALID_VALUE";
-        case CUBLAS_STATUS_ARCH_MISMATCH:
-            return "CUBLAS_STATUS_ARCH_MISMATCH";
-        case CUBLAS_STATUS_MAPPING_ERROR:
-            return "CUBLAS_STATUS_MAPPING_ERROR";
-        case CUBLAS_STATUS_EXECUTION_FAILED:
-            return "CUBLAS_STATUS_EXECUTION_FAILED";
-        case CUBLAS_STATUS_INTERNAL_ERROR:
-            return "CUBLAS_STATUS_INTERNAL_ERROR";
-        case CUBLAS_STATUS_NOT_SUPPORTED:
-            return "CUBLAS_STATUS_NOT_SUPPORTED";
-        case CUBLAS_STATUS_LICENSE_ERROR:
-            return "CUBLAS_STATUS_LICENSE_ERROR";
-    }
-    return "unknown error";
+  switch (status) {
+  case CUBLAS_STATUS_SUCCESS:
+    return "CUBLAS_STATUS_SUCCESS";
+  case CUBLAS_STATUS_NOT_INITIALIZED:
+    return "CUBLAS_STATUS_NOT_INITIALIZED";
+  case CUBLAS_STATUS_ALLOC_FAILED:
+    return "CUBLAS_STATUS_ALLOC_FAILED";
+  case CUBLAS_STATUS_INVALID_VALUE:
+    return "CUBLAS_STATUS_INVALID_VALUE";
+  case CUBLAS_STATUS_ARCH_MISMATCH:
+    return "CUBLAS_STATUS_ARCH_MISMATCH";
+  case CUBLAS_STATUS_MAPPING_ERROR:
+    return "CUBLAS_STATUS_MAPPING_ERROR";
+  case CUBLAS_STATUS_EXECUTION_FAILED:
+    return "CUBLAS_STATUS_EXECUTION_FAILED";
+  case CUBLAS_STATUS_INTERNAL_ERROR:
+    return "CUBLAS_STATUS_INTERNAL_ERROR";
+  case CUBLAS_STATUS_NOT_SUPPORTED:
+    return "CUBLAS_STATUS_NOT_SUPPORTED";
+  case CUBLAS_STATUS_LICENSE_ERROR:
+    return "CUBLAS_STATUS_LICENSE_ERROR";
+  }
+  return "unknown error";
 }
 
 void _checkCUBLAS(cublasStatus_t error, const char *file, int line) {
-    checkCompareFlag(error, CUBLAS_STATUS_SUCCESS, cublasGetErrorString, "CUBLAS", file, line);
+  checkCompareFlag(error, CUBLAS_STATUS_SUCCESS, cublasGetErrorString, "CUBLAS",
+                   file, line);
 }
 
 static const char *cufftGetErrorString(cufftResult error) {
-    switch (error) {
-        case CUFFT_SUCCESS:
-            return "CUFFT_SUCCESS";
-        case CUFFT_INVALID_PLAN:
-            return "CUFFT_INVALID_PLAN";
-        case CUFFT_ALLOC_FAILED:
-            return "CUFFT_ALLOC_FAILED";
-        case CUFFT_INVALID_TYPE:
-            return "CUFFT_INVALID_TYPE";
-        case CUFFT_INVALID_VALUE:
-            return "CUFFT_INVALID_VALUE";
-        case CUFFT_INTERNAL_ERROR:
-            return "CUFFT_INTERNAL_ERROR";
-        case CUFFT_EXEC_FAILED:
-            return "CUFFT_EXEC_FAILED";
-        case CUFFT_SETUP_FAILED:
-            return "CUFFT_SETUP_FAILED";
-        case CUFFT_INVALID_SIZE:
-            return "CUFFT_INVALID_SIZE";
-        case CUFFT_UNALIGNED_DATA:
-            return "CUFFT_UNALIGNED_DATA";
-        case CUFFT_INCOMPLETE_PARAMETER_LIST:
-            return "CUFFT_INCOMPLETE_PARAMETER_LIST";
-        case CUFFT_INVALID_DEVICE:
-            return "CUFFT_INVALID_DEVICE";
-        case CUFFT_PARSE_ERROR:
-            return "CUFFT_PARSE_ERROR";
-        case CUFFT_NO_WORKSPACE:
-            return "CUFFT_NO_WORKSPACE";
-        case CUFFT_NOT_IMPLEMENTED:
-            return "CUFFT_NOT_IMPLEMENTED";
-        case CUFFT_LICENSE_ERROR:
-            return "CUFFT_LICENSE_ERROR";
-        case CUFFT_NOT_SUPPORTED:
-            return "CUFFT_NOT_SUPPORTED";
-    }
-    return "<unknown>";
+  switch (error) {
+  case CUFFT_SUCCESS:
+    return "CUFFT_SUCCESS";
+  case CUFFT_INVALID_PLAN:
+    return "CUFFT_INVALID_PLAN";
+  case CUFFT_ALLOC_FAILED:
+    return "CUFFT_ALLOC_FAILED";
+  case CUFFT_INVALID_TYPE:
+    return "CUFFT_INVALID_TYPE";
+  case CUFFT_INVALID_VALUE:
+    return "CUFFT_INVALID_VALUE";
+  case CUFFT_INTERNAL_ERROR:
+    return "CUFFT_INTERNAL_ERROR";
+  case CUFFT_EXEC_FAILED:
+    return "CUFFT_EXEC_FAILED";
+  case CUFFT_SETUP_FAILED:
+    return "CUFFT_SETUP_FAILED";
+  case CUFFT_INVALID_SIZE:
+    return "CUFFT_INVALID_SIZE";
+  case CUFFT_UNALIGNED_DATA:
+    return "CUFFT_UNALIGNED_DATA";
+  case CUFFT_INCOMPLETE_PARAMETER_LIST:
+    return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+  case CUFFT_INVALID_DEVICE:
+    return "CUFFT_INVALID_DEVICE";
+  case CUFFT_PARSE_ERROR:
+    return "CUFFT_PARSE_ERROR";
+  case CUFFT_NO_WORKSPACE:
+    return "CUFFT_NO_WORKSPACE";
+  case CUFFT_NOT_IMPLEMENTED:
+    return "CUFFT_NOT_IMPLEMENTED";
+  case CUFFT_LICENSE_ERROR:
+    return "CUFFT_LICENSE_ERROR";
+  case CUFFT_NOT_SUPPORTED:
+    return "CUFFT_NOT_SUPPORTED";
+  }
+  return "<unknown>";
 }
 
 void _checkCUFFT(cufftResult error, const char *file, int line) {
-    checkCompareFlag(error, CUFFT_SUCCESS, cufftGetErrorString, "CUFFT", file, line);
+  checkCompareFlag(error, CUFFT_SUCCESS, cufftGetErrorString, "CUFFT", file,
+                   line);
 }
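checkCompareFlag centralizes the compare/print/throw pattern for every CUDA-family status type, and the _check* wrappers are normally reached through call-site macros that splice in __FILE__ and __LINE__. The macro name below is illustrative; the real ones live in debug.h and are not shown in this patch:

    #define CHECK_CUDA(expr) _checkCUDA((expr), __FILE__, __LINE__)

    // usage sketch:
    //   float *d = nullptr;
    //   CHECK_CUDA(cudaMalloc(&d, 1024 * sizeof(float)));
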
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/freq_utils.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/freq_utils.cc
index 92d7499d539dfb8a2e9bde4f18daea8a292ac91e..333635fbda6959f6f456153c444dc363531734df 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/freq_utils.cc
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/freq_utils.cc
@@ -1,31 +1,28 @@
 
 
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 
+const char *available_freqs[] = {
+    "140250000",  "229500000",  "318750000",  "408000000", "497250000",
+    "586500000",  "675750000",  "765000000",  "854250000", "943500000",
+    "1032750000", "1122000000", "1211250000", "1300500000"};
 
+void updateJetsonGPUFreq(int freq_level) {
 
-
-const char* available_freqs[] = {"140250000", "229500000", "318750000", "408000000", "497250000",
-				 "586500000", "675750000", "765000000", "854250000",
-				 "943500000", "1032750000", "1122000000", "1211250000", "1300500000"};
-
-
-void updateJetsonGPUFreq(int freq_level){
-
-  if (freq_level < 0 || freq_level > 13){
+  if (freq_level < 0 || freq_level > 13) {
     printf("ERRROR: Provide freq level between {0, 13}  \n\n\n");
     abort();
   }
 
-  const char* freq_val = available_freqs[freq_level]; 
+  const char *freq_val = available_freqs[freq_level];
   printf("freq-val[0] = %s \n", freq_val);
-  
 
-  FILE* max_file = fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq", "w+");
+  FILE *max_file = fopen(
+      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq", "w+");
 
-  if (max_file == NULL){
+  if (max_file == NULL) {
     printf("Could not min_freq file \n");
   }
 
@@ -33,10 +30,10 @@ void updateJetsonGPUFreq(int freq_level){
 
   fclose(max_file);
 
-  
-  FILE* min_file = fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq", "w+");
+  FILE *min_file = fopen(
+      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq", "w+");
 
-  if (min_file == NULL){
+  if (min_file == NULL) {
     printf("Could not min_freq file \n");
     abort();
   }
@@ -44,23 +41,20 @@ void updateJetsonGPUFreq(int freq_level){
   fwrite(freq_val, strlen(freq_val), 1, min_file);
 
   fclose(min_file);
- 
 }
 
+unsigned long int readJetsonGPUFreq() {
 
+  FILE *cur_freq_file =
+      fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq", "r");
 
-unsigned long int readJetsonGPUFreq(){
- 
-  FILE* cur_freq_file = fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq", "r");
-
-  if (cur_freq_file == NULL){
+  if (cur_freq_file == NULL) {
     printf("Could not open cur_freq file \n");
   }
 
-
   char buf[50];
-  char* ptr;
-  
+  char *ptr;
+
   fread(buf, 50, 1, cur_freq_file);
 
   unsigned long cur_freq = strtoul(buf, &ptr, 10);
@@ -70,15 +64,13 @@ unsigned long int readJetsonGPUFreq(){
   return cur_freq;
 }
 
-
-
-int main(){
+int main() {
 
   updateJetsonGPUFreq(7);
 
   unsigned long int cur_freq = readJetsonGPUFreq();
 
   printf("** cur_freq = %lu \n\n", cur_freq);
-  
+
   return 0;
 }
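Pinning the Jetson GPU clock means writing the same value to both devfreq limit files, which is why updateJetsonGPUFreq writes max_freq and then min_freq. A condensed iostream sketch of the same operation (paths as in this file; error handling omitted, and root privileges are required):

    #include <fstream>

    void pinGpuFreq(const char *hz) {
      std::ofstream("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq") << hz;
      std::ofstream("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq") << hz;
    }
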
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc
index 516030a75b0a113ab80c78401785e363cf5a7a29..61e37ed9a3dd5a7a40b170660e164d7ae9965344 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc
@@ -1,22 +1,22 @@
-#include <stdio.h>
-#include <stdarg.h>
 #include <cstdio>
 #include <cstdlib>
+#include <stdarg.h>
+#include <stdio.h>
 
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
-
 #include <cublas_v2.h>
-#include <cudnn.h>
+// Must come after cublas_v2.h
 #include <cublas_api.h>
+#include <cudnn.h>
+
 #include <string>
 #include <unordered_map>
 #include <vector>
 
-#include "tensor.h"
-#include "global_data.h"
 #include "approx_knob_utils.h"
-
+#include "global_data.h"
+#include "tensor.h"
 
 /* Data declarations */
 cudnnHandle_t cudnnHandle;
@@ -26,17 +26,16 @@ bool runtime_initialized = false;
 // NOTE: Layers Mode is True or ApproxHPVM wrapper runtime mode
 bool approxhpvm_runtime_mode = false;
 
-
 int op_counter = 0;
 int total_ops = 0;
 // NOTE: Both vectors assume a linear CFG
 // FIXME: Each operation should have an ID passed to the runtime
 std::vector<int> op_accuracies;
-std::vector<Range*> quant_ranges;
+std::vector<Range *> quant_ranges;
 
-std::unordered_set<void*> tensors_ptr, host_ptr, obj_ptr;
+std::unordered_set<void *> tensors_ptr, host_ptr, obj_ptr;
 
-std::unordered_map<void*, int> tracked_tensors;
+std::unordered_map<void *, int> tracked_tensors;
 
 // Autotuning data
 std::unordered_map<int, int> skip_tensors;
@@ -45,7 +44,5 @@ std::unordered_map<int, int> skip_tensors;
 std::unordered_map<std::string, int> func_counters;
 std::string profile_data = "";
 
-
-
-PerfParamSet* perfParamSet;  
-SampParamSet* sampParamSet;
+PerfParamSet *perfParamSet;
+SampParamSet *sampParamSet;
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
index 9dba9ca15b2c267b505e6a3a72620b37e44c93e1..fd308a3409dc679a07b5374238e7150fb3c34beb 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
@@ -1,37 +1,35 @@
 
 #include "hpvm-rt-controller.h"
-#include "img_tensor_utils.h"
 #include "global_data.h"
+#include "img_tensor_utils.h"
 #include <fstream>
 
 //-------- Functionality to read and update frequency on Jetson board -------//
 /*const char* available_freqs[] = {"140250000", "229500000", "318750000",
-                                 "408000000", "497250000", "586500000", 
+                                 "408000000", "497250000", "586500000",
                                  "675750000", "765000000", "854250000",
                                  "943500000", "1032750000", "1122000000",
                                  "1211250000", "1300500000"};
 
 */
 
-
 const int available_freqs[] = {
-140250000, // 0
-229500000, // 1
-318750000, // 2
-408000000, // 3
-497250000, // 4
-586500000, // 5
-675750000, // 6
-765000000, // 7
-854250000, // 8
-943500000, // 9
-1032750000,// 10
-1122000000,// 11
-1211250000,// 12
-1300500000 // 13
+    140250000,  // 0
+    229500000,  // 1
+    318750000,  // 2
+    408000000,  // 3
+    497250000,  // 4
+    586500000,  // 5
+    675750000,  // 6
+    765000000,  // 7
+    854250000,  // 8
+    943500000,  // 9
+    1032750000, // 10
+    1122000000, // 11
+    1211250000, // 12
+    1300500000  // 13
 };
 
-
 /*void updateJetsonGPUFreq(int freq_level) {
 
   if (freq_level < 0 || freq_level > 13) {
@@ -39,7 +37,7 @@ const int available_freqs[] = {
     abort();
   }
 
-  const char* freq_val = available_freqs[freq_level]; 
+  const char* freq_val = available_freqs[freq_level];
   printf("freq-val[0] = %s \n", freq_val);
 
   FILE* max_file =
@@ -49,7 +47,7 @@ const int available_freqs[] = {
   }
   fwrite(freq_val, strlen(freq_val), 1, max_file);
   fclose(max_file);
-  
+
   FILE* min_file =
     fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq", "w+");
   if (min_file == NULL){
@@ -70,7 +68,7 @@ unsigned long int readJetsonGPUFreq() {
 
   char buf[50];
   char* ptr;
-  
+
   fread(buf, 50, 1, cur_freq_file);
   unsigned long cur_freq = strtoul(buf, &ptr, 10);
   fclose(cur_freq_file);
@@ -79,14 +77,15 @@ unsigned long int readJetsonGPUFreq() {
 
 */
 
-
 // Sets frequency
 void setFreq(unsigned freq_index) {
 
   unsigned target_freq = available_freqs[freq_index];
-  
-  const char * const min_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq";
-  const char * const max_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq";
+
+  const char *const min_freq_file =
+      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq";
+  const char *const max_freq_file =
+      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq";
 
   std::ofstream min_stream;
   std::ofstream max_stream;
@@ -105,7 +104,8 @@ void setFreq(unsigned freq_index) {
 unsigned recordFreq() {
 
   // Current frequency file
-  const char * const cur_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq";
+  const char *const cur_freq_file =
+      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq";
   std::ifstream cur_stream;
   cur_stream.open(cur_freq_file, std::ifstream::in);
 
@@ -118,10 +118,6 @@ unsigned recordFreq() {
   return cur_freq;
 }
 
-
-
-
-
 //---------------------------------------------------------------------------//
 
 /*
@@ -135,13 +131,13 @@ bool fileExists(const std::string &file) {
 
 // There will be no frequency request for the first batch
 // Therefore, we skip the first element by initializing to 1, not 0.
-FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf) :
-  idx_list(il), rep_factor(rf), count(1), idx(0) {}
+FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf)
+    : idx_list(il), rep_factor(rf), count(1), idx(0) {}
 
 unsigned FrequencyIndexList::getNextIndex() {
   if (count == rep_factor) {
     count = 0;
-    idx = (idx+1) % idx_list.size();
+    idx = (idx + 1) % idx_list.size();
   }
   count++;
   return idx_list[idx];
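Because count starts at 1, the very first index is served one repetition short; for example:

    // idx_list = {13, 12}, rep_factor = 3:
    // getNextIndex() yields 13, 13, 12, 12, 12, 13, 13, 13, 12, ...
    // (the leading 13 appears only twice: count = 1 consumes one repetition,
    //  matching the "no frequency request for the first batch" comment above)
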
@@ -208,7 +204,7 @@ void ProfileInfo::readIterationFrequency() {
   frequency_current_iteration = recordFreq();
 #else
   frequency_current_iteration = 0;
-#endif //JETSON_EXECUTION
+#endif // JETSON_EXECUTION
 }
 
 unsigned long ProfileInfo::getIterationFrequency() {
@@ -275,15 +271,14 @@ void ProfileInfo::printToFile() {
   // to have equal sizes, in outer and inner vectors both,
   // and all time_info and energy_info vectors must have the same size.
   unsigned iterations = tensor_time_info.size();
-  CUSTOM_ASSERT(
-      (tensor_time_info.size() == iterations) &&
-      (tensor_energy_info.size() == iterations) &&
-      (control_time_info.size() == iterations) &&
-      (control_energy_info.size() == iterations) &&
-      (config_time_info.size() == iterations) &&
-      (config_energy_info.size() == iterations) &&
-      (frequency_info.size() == iterations) &&
-      "time_info, energy_info, frequency_info size: \
+  CUSTOM_ASSERT((tensor_time_info.size() == iterations) &&
+                (tensor_energy_info.size() == iterations) &&
+                (control_time_info.size() == iterations) &&
+                (control_energy_info.size() == iterations) &&
+                (config_time_info.size() == iterations) &&
+                (config_energy_info.size() == iterations) &&
+                (frequency_info.size() == iterations) &&
+                "time_info, energy_info, frequency_info size: \
                    iteration number does not match.");
 
   for (unsigned i = 0; i < tensor_time_info.size(); i++) {
@@ -333,8 +328,8 @@ ProfileInfo::ProfileInfo()
       time_control_current_iteration(0.0), time_config_current_iteration(0.0),
       energy_compute_current_iteration(0.0),
       energy_control_current_iteration(0.0),
-      energy_config_current_iteration(0.0),
-      frequency_current_iteration(0), in_iteration(false) {}
+      energy_config_current_iteration(0.0), frequency_current_iteration(0),
+      in_iteration(false) {}
 
 Slowdowns::Slowdowns() {
   idx = 0;
@@ -376,37 +371,37 @@ void RuntimeController::stop_profiler() {
     profiler->stop_profiler();
 }
 // For testing purposes only - do not use widely
-std::vector<struct Configuration *> &RuntimeController::
-getSpeedupConfigurations() {
+std::vector<struct Configuration *> &
+RuntimeController::getSpeedupConfigurations() {
   return SpeedupConfigurations;
 }
 // For testing purposes only - do not use widely
-std::vector<struct Configuration *> &RuntimeController::
-getEnergyConfigurations() {
+std::vector<struct Configuration *> &
+RuntimeController::getEnergyConfigurations() {
   return EnergyConfigurations;
 }
 // For testing purposes only - do not use widely
-std::vector<struct Configuration *> &RuntimeController::
-getThreeDCurveConfigurations() {
+std::vector<struct Configuration *> &
+RuntimeController::getThreeDCurveConfigurations() {
   return ThreeDCurveConfigurations;
 }
 // For testing purposes only - do not use widely
 unsigned RuntimeController::getConfigurationIdx() { return configurationIdx; }
 
 double RuntimeController::getCurrentConfigurationSpeedup() {
-  return (double) (*Configurations)[configurationIdx]->speedup;
+  return (double)(*Configurations)[configurationIdx]->speedup;
 }
 
 double RuntimeController::getCurrentConfigurationEnergy() {
-  return (double) (*Configurations)[configurationIdx]->energy;
+  return (double)(*Configurations)[configurationIdx]->energy;
 }
 
 double RuntimeController::getCurrentConfigurationAccuracy() {
-  return (double) (*Configurations)[configurationIdx]->accuracy;
+  return (double)(*Configurations)[configurationIdx]->accuracy;
 }
 
 double RuntimeController::getCurrentConfigurationAccuracyLoss() {
-  return (double) (*Configurations)[configurationIdx]->accuracyLoss;
+  return (double)(*Configurations)[configurationIdx]->accuracyLoss;
 }
 
 std::vector<float> &RuntimeController::getQuantizationRanges(const char *data) {
@@ -448,8 +443,10 @@ void RuntimeController::init(const char *Cstr, const char *Qstr) {
   // Pseudo random variable (used when we ran only a few experiments)
   // or true random numbers for probabilistic control
   pseudo_rd = 0.0;
-  std::random_device rd;  //Will be used to obtain a seed for the random number engine
-  generator = std::mt19937 (rd()); //Standard mersenne_twister_engine seeded with rd()
+  std::random_device
+      rd; // Will be used to obtain a seed for the random number engine
+  generator =
+      std::mt19937(rd()); // Standard mersenne_twister_engine seeded with rd()
   distr = std::uniform_real_distribution<>(0.0, 1.0);
 
   g_freq = available_freqs[13];
@@ -471,8 +468,8 @@ void RuntimeController::end_iteration() {
     PI->end_iteration();
 }
 
-void RuntimeController::addToCurrentIterationComputeTime(
-    const char *s, double t) {
+void RuntimeController::addToCurrentIterationComputeTime(const char *s,
+                                                         double t) {
   if (PI)
     PI->addToCurrentIterationComputeTime(s, t);
 }
@@ -487,8 +484,8 @@ void RuntimeController::addToCurrentIterationConfigTime(double t) {
     PI->addToCurrentIterationConfigTime(t);
 }
 
-void RuntimeController::addToCurrentIterationComputeEnergy(
-    const char *s, double e) {
+void RuntimeController::addToCurrentIterationComputeEnergy(const char *s,
+                                                           double e) {
   if (PI)
     PI->addToCurrentIterationComputeEnergy(s, e);
 }
@@ -526,8 +523,8 @@ void RuntimeController::updateFrequency() {
   //--- updateJetsonGPUFreq(freq_idx);
 
   setFreq(freq_idx);
-  
-#endif //JETSON_EXECUTION
+
+#endif // JETSON_EXECUTION
 }
 
 void RuntimeController::writeProfileInfo() {
@@ -560,11 +557,9 @@ std::pair<double, double> RuntimeController::fc_profile(
     const unsigned num_rows_a, const unsigned num_cols_a,
     const unsigned num_rows_b, const unsigned num_cols_b,
     const unsigned voltage_swing, const unsigned patch_factor) {
-  return (
-      promise ? promise->fc_profile(
-                    num_rows_a, num_cols_a, num_rows_b, num_cols_b,
-                    voltage_swing, patch_factor)
-              : std::make_pair(0.0, 0.0));
+  return (promise ? promise->fc_profile(num_rows_a, num_cols_a, num_rows_b,
+                                        num_cols_b, voltage_swing, patch_factor)
+                  : std::make_pair(0.0, 0.0));
 }
 
 std::pair<double, double> RuntimeController::conv_profile(
@@ -572,17 +567,16 @@ std::pair<double, double> RuntimeController::conv_profile(
     const unsigned c_out, const unsigned c_in, const unsigned k_h,
     const unsigned k_w, const unsigned s_h, const unsigned s_w,
     const unsigned voltage_swing, const unsigned patch_factor) {
-  return (
-      promise ? promise->conv_profile(
-                    n, c, h, w, c_out, c_in, k_h, k_w, s_h, s_w, voltage_swing,
-                    patch_factor)
-              : std::make_pair(0.0, 0.0));
+  return (promise ? promise->conv_profile(n, c, h, w, c_out, c_in, k_h, k_w,
+                                          s_h, s_w, voltage_swing, patch_factor)
+                  : std::make_pair(0.0, 0.0));
 }
 
 // Constructor and destructor
 RuntimeController::RuntimeController() {
   configurationIdx = 0;
-  FIL = new FrequencyIndexList({13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, 10);
+  FIL = new FrequencyIndexList({13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+                               10);
 #ifdef ACTIVE_PROFILING
   PI = new ProfileInfo();
   profiler = new Profiler();
@@ -719,14 +713,13 @@ void RuntimeController::readConfigurationFile(const char *str) {
   std::getline(qin, first_line);
   DEBUG("first_line: %s\n", first_line.c_str());
 
-  try{
+  try {
     baseline_time = std::stod(first_line);
     DEBUG("Baseline time: %lf\n\n", baseline_time);
-  }
-  catch(...){
+  } catch (...) {
     ERROR("Please Add/Fix Baseline Time at Top of Config File.. ");
   }
-  
+
   for (std::string line; std::getline(qin, line);) {
     DEBUG("line: %s\n", line.c_str());
 
@@ -758,9 +751,9 @@ void RuntimeController::readConfigurationFile(const char *str) {
     if (readingFirstLine) {
       // Read first line, to create the new configuration struct
       readingFirstLine = false;
-      InitialConfigurations.push_back(Configuration(
-          tokens[0], std::stof(tokens[1]), std::stof(tokens[2]),
-          std::stof(tokens[3]), std::stof(tokens[4])));
+      InitialConfigurations.push_back(
+          Configuration(tokens[0], std::stof(tokens[1]), std::stof(tokens[2]),
+                        std::stof(tokens[3]), std::stof(tokens[4])));
       continue;
     }
 
@@ -768,8 +761,8 @@ void RuntimeController::readConfigurationFile(const char *str) {
       DEBUG("Found promise configuration\n");
 
       // There must be at least one approximation option
-      CUSTOM_ASSERT(
-          (tokens.size() >= 2) && "Not enough approximation options.");
+      CUSTOM_ASSERT((tokens.size() >= 2) &&
+                    "Not enough approximation options.");
 
       PROMISENodeConfiguration *NodeConf = new PROMISENodeConfiguration();
       InitialConfigurations.back().setup.insert(
@@ -792,9 +785,8 @@ void RuntimeController::readConfigurationFile(const char *str) {
       DEBUG("Found gpu configuration\n");
 
       // There must be at least one operation, with an approximation option
-      CUSTOM_ASSERT(
-          (tokens.size() >= 5) &&
-          "Not enough operations - approximation options.");
+      CUSTOM_ASSERT((tokens.size() >= 5) &&
+                    "Not enough operations - approximation options.");
 
       GPUNodeConfiguration *NodeConf = new GPUNodeConfiguration();
       InitialConfigurations.back().setup.insert(
@@ -968,9 +960,8 @@ void RuntimeController::computeParetoConfigurationPoints() {
 
   // Sort the configurations according to accuracy loss
   INFO("Sorting autotuner configurations...\n");
-  std::sort(
-      InitialConfigurations.begin() + 1, InitialConfigurations.end(),
-      ConfigurationLessThan());
+  std::sort(InitialConfigurations.begin() + 1, InitialConfigurations.end(),
+            ConfigurationLessThan());
   INFO("Done sorting.\n");
 
   for (unsigned start_idx = 1; start_idx < InitialConfigurations.size();) {
@@ -1004,14 +995,12 @@ void RuntimeController::computeParetoConfigurationPoints() {
         en_idx = i;
       }
     }
-    DEBUG(
-        "accuracy loss = %f, speedup = %f, at sp_idx = %d\n",
-        InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx);
+    DEBUG("accuracy loss = %f, speedup = %f, at sp_idx = %d\n",
+          InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx);
     // Found best speedup for this accuracy point (not dominated by any of
     // these).
-    DEBUG(
-        "accuracy loss = %f, energy = %f, at en_idx = %d\n",
-        InitialConfigurations[en_idx].accuracyLoss, en, en_idx);
+    DEBUG("accuracy loss = %f, energy = %f, at en_idx = %d\n",
+          InitialConfigurations[en_idx].accuracyLoss, en, en_idx);
     // Found best energy for this accuracy point (not dominated by any of
     // these).
 
@@ -1081,9 +1070,8 @@ void RuntimeController::compute3DParetoConfigurationPoints() {
 
   // Sort the configurations according to accuracy loss
   INFO("Sorting autotuner configurations...\n");
-  std::sort(
-      InitialConfigurations.begin(), InitialConfigurations.end(),
-      ConfigurationLessThan());
+  std::sort(InitialConfigurations.begin(), InitialConfigurations.end(),
+            ConfigurationLessThan());
   INFO("Done sorting.\n");
 
   for (unsigned start_idx = 0; start_idx < InitialConfigurations.size();) {
@@ -1117,11 +1105,10 @@ void RuntimeController::compute3DParetoConfigurationPoints() {
         }
       }
       if (!dominated) {
-        DEBUG(
-            "accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n",
-            InitialConfigurations[i].accuracyLoss,
-            InitialConfigurations[i].speedup, InitialConfigurations[i].energy,
-            i);
+        DEBUG("accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n",
+              InitialConfigurations[i].accuracyLoss,
+              InitialConfigurations[i].speedup, InitialConfigurations[i].energy,
+              i);
         Indices.push_back(i);
       }
     }
@@ -1180,31 +1167,22 @@ void RuntimeController::printConfigurations(
   }
 }
 
-unsigned long RuntimeController::getLastFrequency() {
-  return g_freq;
-}
+unsigned long RuntimeController::getLastFrequency() { return g_freq; }
 
-void RuntimeController::setLastFrequency(unsigned long f) {
-  g_freq = f;
-}
+void RuntimeController::setLastFrequency(unsigned long f) { g_freq = f; }
 
-double RuntimeController::getLastSpeedup() {
-  return g_speedup;
-}
+double RuntimeController::getLastSpeedup() { return g_speedup; }
 
-void RuntimeController::setLastSpeedup(double s) {
-  g_speedup = s;
-}
+void RuntimeController::setLastSpeedup(double s) { g_speedup = s; }
 
 void RuntimeController::findNextConfiguration() {
   configurationIdx = (configurationIdx + 1) % Configurations->size();
-  DEBUG(
-      "findNextConfiguration: Updated configurationIdx to %u.\n",
-      configurationIdx);
+  DEBUG("findNextConfiguration: Updated configurationIdx to %u.\n",
+        configurationIdx);
 }
 
-void RuntimeController::findTargetConfiguration(
-    float goal, enum SEARCH_KIND sk) {
+void RuntimeController::findTargetConfiguration(float goal,
+                                                enum SEARCH_KIND sk) {
   // We search in range begin(), end()-1 . It is OK to decrement end(), because
   // the configurations vector always points to one of the pareto curves, and
   // they are never empty - we have always pushed at least one configuration.
@@ -1214,25 +1192,25 @@ void RuntimeController::findTargetConfiguration(
   switch (sk) {
   case SPEEDUP: {
     Configurations = &SpeedupConfigurations;
-    low_it = std::lower_bound(
-        Configurations->begin(), Configurations->end() - 1, goal,
-        ConfigurationLessThan_SP());
+    low_it =
+        std::lower_bound(Configurations->begin(), Configurations->end() - 1,
+                         goal, ConfigurationLessThan_SP());
     configurationIdx = low_it - Configurations->begin();
     break;
   }
   case ENERGY: {
     Configurations = &EnergyConfigurations;
-    low_it = std::lower_bound(
-        Configurations->begin(), Configurations->end() - 1, goal,
-        ConfigurationLessThan_E());
+    low_it =
+        std::lower_bound(Configurations->begin(), Configurations->end() - 1,
+                         goal, ConfigurationLessThan_E());
     configurationIdx = low_it - Configurations->begin();
     break;
   }
   case ACCURACY_LOSS: {
     Configurations = &SpeedupConfigurations;
-    low_it = std::lower_bound(
-        Configurations->begin(), Configurations->end() - 1, goal,
-        ConfigurationLessThan_AL());
+    low_it =
+        std::lower_bound(Configurations->begin(), Configurations->end() - 1,
+                         goal, ConfigurationLessThan_AL());
     if ((*low_it)->accuracyLoss > goal)
       --low_it;
     configurationIdx = low_it - Configurations->begin();
@@ -1247,9 +1225,8 @@ void RuntimeController::findTargetConfiguration(
   // After search, low_it points to the Configuration element with the goal
   // value, or to the one with the immediately lower value if it does not exist
 
-  DEBUG(
-      "findTargetConfiguration: Updated configurationIdx to %u.\n",
-      configurationIdx);
+  DEBUG("findTargetConfiguration: Updated configurationIdx to %u.\n",
+        configurationIdx);
 }
 
 void RuntimeController::adjustTargetConfiguration(float goal) {
@@ -1260,8 +1237,8 @@ void RuntimeController::adjustTargetConfiguration(float goal) {
   // Find configuration before the selected one.
   // There is always one, unless goal is 1. Then, we would pick baseline, and
   //  both upper and lower should be the same configuration, at index 0.
-  unsigned prev_conf_idx = configurationIdx > 0 ? configurationIdx - 1
-                                                : configurationIdx;
+  unsigned prev_conf_idx =
+      configurationIdx > 0 ? configurationIdx - 1 : configurationIdx;
   // Get the two configurations' speedup, and compute the appropriate ranges
   float curr_conf_speedup = (*Configurations)[configurationIdx]->speedup;
   float prev_conf_speedup = (*Configurations)[prev_conf_idx]->speedup;
@@ -1280,32 +1257,32 @@ void RuntimeController::adjustTargetConfiguration(float goal) {
 
     //***--- Probability adjustment strategy 1 ---***//
     // No big adjustments at edges of probability range
-//    float adjust_val = 0.0;
-//    if (low_pb < high_pb) {
-//      adjust_val = low_pb * 0.2;
-//    } else {
-//      adjust_val = high_pb * 0.2;
-//    }
-//    low_pb -= adjust_val;
-//    high_pb += adjust_val;
+    //    float adjust_val = 0.0;
+    //    if (low_pb < high_pb) {
+    //      adjust_val = low_pb * 0.2;
+    //    } else {
+    //      adjust_val = high_pb * 0.2;
+    //    }
+    //    low_pb -= adjust_val;
+    //    high_pb += adjust_val;
     //***---                                   ---***//
 
     //***--- Probability adjustment strategy 2 ---***//
     // No big adjustment at high edge of probability range
-//    float adjust_val = high_pb * 0.2 > 0.1 ? 0.1 : high_pb * 0.2;
-//    low_pb -= adjust_val;
-//    high_pb += adjust_val;
+    //    float adjust_val = high_pb * 0.2 > 0.1 ? 0.1 : high_pb * 0.2;
+    //    low_pb -= adjust_val;
+    //    high_pb += adjust_val;
     //***---                                   ---***//
 
     //***--- Probability adjustment strategy 3 ---***//
-    //Similar to 2, but higher always increases, more significantly
-//    float adjust_val = low_pb * 0.5 > 0.1 ? 0.1 : low_pb * 0.5;
-//    low_pb -= adjust_val;
-//    high_pb += adjust_val;
+    // Similar to 2, but the high end always increases, more significantly
+    //    float adjust_val = low_pb * 0.5 > 0.1 ? 0.1 : low_pb * 0.5;
+    //    low_pb -= adjust_val;
+    //    high_pb += adjust_val;
     //***---                                   ---***//
 
     //***--- Probability adjustment strategy 4 ---***//
-    //Similar to 2, but higher always increases, more significantly
+    // Similar to 2, but the high end always increases, more significantly
     // Low end, high end a bit less aggressive than total range
     float adjust_val = low_pb * 0.6 > 0.2 ? 0.2 : low_pb * 0.6;
     adjust_val = adjust_val > high_pb ? high_pb : adjust_val;
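For concreteness, strategy 4 with illustrative numbers (the subsequent low_pb/high_pb shift follows the same pattern as the commented-out strategies above):

    // low_pb = 0.30, high_pb = 0.70
    // adjust_val = min(0.30 * 0.6, 0.2) = 0.18
    // adjust_val = min(0.18, high_pb)   = 0.18
    // after the shift: low_pb = 0.12, high_pb = 0.88
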
@@ -1314,20 +1291,18 @@ void RuntimeController::adjustTargetConfiguration(float goal) {
     //***---                                   ---***//
   }
 
-  DEBUG(
-      "**---- adjustTargetConfiguration: upper conf = %s with probability: "
-      "%f.\n",
-      ((*Configurations)[configurationIdx]->name).c_str(), high_pb);
-  DEBUG(
-      "**---- adjustTargetConfiguration: lower conf = %s with probability: "
-      "%f.\n\n",
-      ((*Configurations)[prev_conf_idx]->name).c_str(), low_pb);
+  DEBUG("**---- adjustTargetConfiguration: upper conf = %s with probability: "
+        "%f.\n",
+        ((*Configurations)[configurationIdx]->name).c_str(), high_pb);
+  DEBUG("**---- adjustTargetConfiguration: lower conf = %s with probability: "
+        "%f.\n\n",
+        ((*Configurations)[prev_conf_idx]->name).c_str(), low_pb);
 
   // Select a random number from 0 to 1
   // We assign the (0..low_pb) to the lower configuration, and the (low_pb..1)
   // to the upper
   // float rd = static_cast <float> (rand()) / static_cast <float> (RAND_MAX) ;
-  //float rd = pseudo_rd;
+  // float rd = pseudo_rd;
   float rd = distr(generator);
   if (rd < low_pb) {
     // If the probability is in the low range
@@ -1347,8 +1322,8 @@ double RuntimeController::getBaselineTime() { return baseline_time; }
 Slowdowns *RuntimeController::getSlowdowns() { return slowdowns; }
 
 // Functions to be inserted with initializeTensorRT and clearTensorRT
-extern "C" void llvm_hpvm_initializeRuntimeController(
-    const char *ConfigFile, const char *QRangeFile) {
+extern "C" void llvm_hpvm_initializeRuntimeController(const char *ConfigFile,
+                                                      const char *QRangeFile) {
   RC = new RuntimeController();
   RC->init(ConfigFile, QRangeFile);
   return;
@@ -1362,8 +1337,8 @@ extern "C" void llvm_hpvm_clearRuntimeController() {
 //*** Methods to compute accuracy of a tensor by the runtime controller   ***//
 uint32_t *labels_from_file = NULL;
 
-uint32_t *
-hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start, int end) {
+uint32_t *hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start,
+                                         int end) {
 
   // Initialize buffer
   if (!labels_from_file) {
@@ -1448,13 +1423,12 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
   return accuracy;
 }
 
-
 //#define llvm_hpvm_invokeRtControl_BASE llvm_hpvm_invokeRtControl
 #define llvm_hpvm_invokeRtControl_ADJUST_PR llvm_hpvm_invokeRtControl
 //#define llvm_hpvm_invokeRtControl_ADJUST llvm_hpvm_invokeRtControl
 
-extern "C" void llvm_hpvm_invokeRtControl_BASE(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_BASE(void *result, const char *str,
+                                               int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1471,16 +1445,15 @@ extern "C" void llvm_hpvm_invokeRtControl_BASE(
   RC->addToCurrentIterationControlTime(pinfo.first);
   RC->addToCurrentIterationControlEnergy(pinfo.second);
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n\n",
+       current_iteration_time, current_iteration_energy);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_ITERATE(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_ITERATE(void *result, const char *str,
+                                                  int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1504,16 +1477,15 @@ extern "C" void llvm_hpvm_invokeRtControl_ITERATE(
   RC->addToCurrentIterationControlTime(pinfo.first);
   RC->addToCurrentIterationControlEnergy(pinfo.second);
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n\n",
+       current_iteration_time, current_iteration_energy);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_ADJUST(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_ADJUST(void *result, const char *str,
+                                                 int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1556,17 +1528,17 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST(
   RC->addToCurrentIterationConfigEnergy(pinfo2.second);
   //*                                                                        */
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n",
+       current_iteration_time, current_iteration_energy);
   INFO("target speedup = %lf\n\n", target_speedup);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(void *result,
+                                                    const char *str, int start,
+                                                    int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1610,17 +1582,17 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(
   RC->addToCurrentIterationConfigEnergy(pinfo2.second);
   //*                                                                        */
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n",
+       current_iteration_time, current_iteration_energy);
   INFO("target speedup = %lf\n\n", target_speedup);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(void *result,
+                                                   const char *str, int start,
+                                                   int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1647,21 +1619,20 @@ extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(
   float next_conf_speedup =
       RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup;
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n",
+       current_iteration_time, current_iteration_energy);
   INFO("slowdown (target speedup) = %f\n", slowdown);
   INFO("Previous configuration: %s\n", prev_conf_name.c_str());
-  INFO(
-      "Swapping to next configuration: %s with speedup %f\n\n",
-      next_conf_name.c_str(), next_conf_speedup);
+  INFO("Swapping to next configuration: %s with speedup %f\n\n",
+       next_conf_name.c_str(), next_conf_speedup);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(void *result,
+                                                      const char *str,
+                                                      int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1689,21 +1660,19 @@ extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(
   float next_conf_speedup =
       RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup;
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n",
+       current_iteration_time, current_iteration_energy);
   INFO("slowdown (target speedup) = %f\n", slowdown);
   INFO("Previous configuration: %s\n", prev_conf_name.c_str());
-  INFO(
-      "Swapping to next configuration: %s with speedup %f\n\n",
-      next_conf_name.c_str(), next_conf_speedup);
+  INFO("Swapping to next configuration: %s with speedup %f\n\n",
+       next_conf_name.c_str(), next_conf_speedup);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_RAND(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_RAND(void *result, const char *str,
+                                               int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1721,9 +1690,8 @@ extern "C" void llvm_hpvm_invokeRtControl_RAND(
   RC->addToCurrentIterationControlTime(pinfo.first);
   RC->addToCurrentIterationControlEnergy(pinfo.second);
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n\n",
+       current_iteration_time, current_iteration_energy);
 
   // Note the end of iteration
   RC->end_iteration();
@@ -1734,12 +1702,13 @@ static void writeVectorToFile(const char *path, const std::vector<T> &vec) {
   std::ofstream of(path, std::ofstream::out | std::ofstream::app);
   if (!of.good())
     ERROR("Cannot write to %s file", path);
-  for (float f: vec)
+  for (const T &f : vec)
     of << f << ' ';
   of << '\n';
 }
 
-extern "C" void llvm_hpvm_imgInvokeRtControl(void* result, void *gold, int start, int end) {
+extern "C" void llvm_hpvm_imgInvokeRtControl(void *result, void *gold,
+                                             int start, int end) {
   RC->resume_profiler();
 
   if (gold != nullptr) {
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp
index 38ba3d4683cb60483d4ec5d56f8c21f8fd50a7fa..b4e9e3fea8a2f0638267f6386698d5434a6b91fc 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp
@@ -66,8 +66,8 @@ static Tensor *to_nhwc(Tensor *t) {
         for (int c0 = 0; c0 < c; c0++) {
           size_t nc = n0 * c + c0, nch = nc * h + h0, nchw_idx = nch * w + w0,
                  nchw_offset = nchw_idx * element_size;
-          std::memcpy(
-              out_data + nhwc_offset, in_data + nchw_offset, element_size);
+          std::memcpy(out_data + nhwc_offset, in_data + nchw_offset,
+                      element_size);
           nhwc_offset += element_size;
         }
   return out_tensor;
@@ -96,8 +96,8 @@ static Tensor *to_nchw(Tensor *t) {
         for (int w0 = 0; w0 < w; w0++) {
           size_t nh = n0 * h + h0, nhw = nh * w + w0, nhwc_idx = nhw * c + c0,
                  nhwc_offset = nhwc_idx * element_size;
-          std::memcpy(
-              out_data + nchw_offset, in_data + nhwc_offset, element_size);
+          std::memcpy(out_data + nchw_offset, in_data + nhwc_offset,
+                      element_size);
           nchw_offset += element_size;
         }
   return out_tensor;
@@ -116,8 +116,8 @@ static inline std::vector<std::string> listFiles(const std::string &folder) {
 
 // return in[start:start+count]
 template <typename T>
-std::vector<T>
-sliceVector(const std::vector<T> &in, size_t start, size_t count) {
+std::vector<T> sliceVector(const std::vector<T> &in, size_t start,
+                           size_t count) {
   auto slice_begin = in.begin() + start;
   if (slice_begin > in.end())
     slice_begin = in.end();
@@ -128,8 +128,8 @@ sliceVector(const std::vector<T> &in, size_t start, size_t count) {
 }
 
 // Read an image dataset from a folder with each image as a file.
-Tensor *
-readDataSet(const char *path, size_t start, size_t count, size_t n_color) {
+Tensor *readDataSet(const char *path, size_t start, size_t count,
+                    size_t n_color) {
   INFO("Loading image dataset from path %s\n", path);
   std::vector<std::string> filenames =
       sliceVector(listFiles(path), start, count);
@@ -141,10 +141,10 @@ readDataSet(const char *path, size_t start, size_t count, size_t n_color) {
   auto *first_image = (Tensor *)loadAsImage(filenames[0].c_str(), n_color);
   std::vector<size_t> sizes = ::sizes(first_image);
   size_t h = sizes[2], w = sizes[3];
-  DEBUG(
-      "Loading shape: (%lu, %lu, %lu, %lu)\n", filenames.size(), n_color, h, w);
-  auto *batch = (Tensor *)create4DTensor(
-      CUDNN_DATA_FLOAT, CUDNN_TENSOR_NHWC, filenames.size(), h, w, n_color);
+  DEBUG("Loading shape: (%lu, %lu, %lu, %lu)\n", filenames.size(), n_color, h,
+        w);
+  auto *batch = (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NHWC,
+                                         filenames.size(), h, w, n_color);
   size_t n_floats = n_color * h * w;
   auto *base_data = (float *)batch->host_data;
   for (const auto &path : filenames) {
@@ -181,8 +181,8 @@ static Tensor *complexToFloat(Tensor *batch) {
   }
 
   size_t *dims = batch->dims.dim_sizes;
-  auto *ret = (Tensor *)create4DTensor(
-      float_type, batch->data_format, dims[0], dims[1], dims[2], dims[3]);
+  auto *ret = (Tensor *)create4DTensor(float_type, batch->data_format, dims[0],
+                                       dims[1], dims[2], dims[3]);
   auto *out_data = (float *)ret->host_data;
   for (size_t i = 0; i < magnitudes.size(); i++) {
     float f = magnitudes[i];
@@ -192,8 +192,8 @@ static Tensor *complexToFloat(Tensor *batch) {
 }
 
 // Save an image tensor image-by-image to a folder.
-void saveDataSet(
-    const char *path, Tensor *batch, size_t start_idx, size_t write_n) {
+void saveDataSet(const char *path, Tensor *batch, size_t start_idx,
+                 size_t write_n) {
   INFO("Saving image dataset to path %s\n", path);
   Tensor *float_batch = batch;
   if (batch->data_type == float2_type || batch->data_type == half2_type)
@@ -268,8 +268,8 @@ void saveToImage(const char *filename, Tensor *tensor) {
 }
 
 // Make a conv2d filter from 2-dim data.
-void *createFilterFromData(
-    int data_type, void *data, size_t w, size_t h, size_t n_chan) {
+void *createFilterFromData(int data_type, void *data, size_t w, size_t h,
+                           size_t n_chan) {
   DEBUG("Creating filter from data\n");
   auto *tensor =
       (Tensor *)create4DTensor(data_type, CUDNN_TENSOR_NCHW, n_chan, 1, h, w);
@@ -312,8 +312,8 @@ float compute_variance(float *arr, int left, int right, float mean) {
   return sum / (right - left - 1);
 }
 
-float compute_covariance(
-    float *x, float *y, int left, int right, float x_mean, float y_mean) {
+float compute_covariance(float *x, float *y, int left, int right, float x_mean,
+                         float y_mean) {
   float sum = 0;
   for (int i = left; i < right; i++) {
     sum += (x[i] - x_mean) * (y[i] - y_mean);
@@ -394,8 +394,8 @@ std::vector<float> PSNR(void *gold_ptr, void *approx_ptr) {
   return std::vector<float>(float_data, float_data + batch_dim);
 }
 
-float violationRate(
-    const std::vector<float> &values, float threshold, bool higher_better) {
+float violationRate(const std::vector<float> &values, float threshold,
+                    bool higher_better) {
   if (values.empty())
     return 0.0f;
   size_t violation = 0;
@@ -422,9 +422,9 @@ float mean(const std::vector<float> &values) {
 void *sliceTensorInBatch(void *whole, size_t start, size_t end) {
   auto *whole_tensor = (Tensor *)whole;
   size_t *dim_sizes = whole_tensor->dims.dim_sizes;
-  auto *output = (Tensor *)create4DTensor(
-      CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, end - start, dim_sizes[1],
-      dim_sizes[2], dim_sizes[3]);
+  auto *output =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, end - start,
+                               dim_sizes[1], dim_sizes[2], dim_sizes[3]);
   size_t single_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3];
   auto *in_data = (float *)(whole_tensor->host_data) + start * single_size;
   memcpy(output->host_data, in_data, (end - start) * single_size);
@@ -440,6 +440,6 @@ void reshape(void *t, const std::vector<size_t> &shape) {
   free(tensor->dims.dim_sizes);
   tensor->dims.dim_sizes = (size_t *)malloc(sizeof(size_t) * shape.size());
   std::copy(shape.begin(), shape.end(), tensor->dims.dim_sizes);
-  set4DTensorDescriptor(
-      tensor, tensor->data_format, shape[0], shape[1], shape[2], shape[3]);
+  set4DTensorDescriptor(tensor, tensor->data_format, shape[0], shape[1],
+                        shape[2], shape[3]);
 }
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc
index b311f50f99bf6ffc8ec508300d3e92bd9b314796..284a75c444f54a0f3aa3412c8cd177d4ebad4e2e 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc
@@ -1,36 +1,36 @@
 
 
-#include <stdio.h>
-#include <stdarg.h>
 #include <cstdio>
 #include <cstdlib>
+#include <cublas_v2.h>
+// Must come after cublas_v2.h
+#include <cublas_api.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <cudnn.h>
 #include <iostream>
 #include <map>
 #include <sstream>
+#include <stdarg.h>
+#include <stdio.h>
 #include <string>
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-#include <cudnn.h>
-#include <cublas_api.h>
-#include <cuda_fp16.h>
 
 // Tensor runtime header files
-#include "tensor_runtime.h"
-#include "tensor_utils.h"
+#include "approx_simulation.h"
 #include "debug.h"
-#include "profiling.h"
-#include "global_data.h"
 #include "error.h"
-#include "tensor.h"
-#include "op_overheads.h"
-#include "approx_simulation.h"
+#include "global_data.h"
 #include "init_api.h"
+#include "op_overheads.h"
+#include "profiling.h"
+#include "tensor.h"
+#include "tensor_runtime.h"
+#include "tensor_utils.h"
 
+void llvm_hpvm_initTensorRt(int gpuid) {
 
-void llvm_hpvm_initTensorRt(int gpuid){
+  if (!runtime_initialized) {
 
-  if(!runtime_initialized){
-    
     printf("INITIALIZING GPU %d \n", gpuid);
     // NOTE: Setting the target GPU. Can we use multiple GPUs?
     checkCudaErrors(cudaSetDevice(gpuid));
@@ -40,10 +40,9 @@ void llvm_hpvm_initTensorRt(int gpuid){
 
     printf("CREATED HANDLES %d \n", gpuid);
 
-
 #ifdef PROMISE_TUNER_ENABLED
     //    readOpenTunerFlags("opentuner_flags");
-    
+
     readOpenTunerFlags("promise_flags");
     initializeAutotuner();
 
@@ -51,67 +50,52 @@ void llvm_hpvm_initTensorRt(int gpuid){
 
 #endif
 
-
 #ifdef ERROR_INJECTION_ENABLED
     readOpenTunerFlags("opentuner_flags");
 #endif
 
-    
     runtime_initialized = true;
   }
 
   printf("DONE INTIALIZING GPU %d \n", gpuid);
-
 }
 
-
-void llvm_hpvm_cleanupTensorRt(){
+void llvm_hpvm_cleanupTensorRt() {
   DEBUG("**** llvm_hpvm_cleanupTensorRt ***\n");
   dumpAccuracyNorms();
 }
 
-
-void llvm_hpvm_initApproxhpvmRt(int gpuid){
+void llvm_hpvm_initApproxhpvmRt(int gpuid) {
   llvm_hpvm_initTensorRt(gpuid);
   approxhpvm_runtime_mode = true;
 }
 
-void llvm_hpvm_cleanupApproxhpvmRt(){
-
-}
-
+void llvm_hpvm_cleanupApproxhpvmRt() {}
 
+void dumpAccuracyNorms() {
 
-void dumpAccuracyNorms(){
+#ifdef ERROR_INJECTION_ENABLED
 
-  #ifdef ERROR_INJECTION_ENABLED
-  
-  
-  #endif
+#endif
 
   dump_result("accuracy_summary");
-
 }
 
-
 // Returns the number of GPUs active on the platform
-unsigned int getGPUCount(){
+unsigned int getGPUCount() {
   int num_gpus;
   checkCudaErrors(cudaGetDeviceCount(&num_gpus));
   return num_gpus;
 }
 
-
-
-void clearTensorMap(){
+void clearTensorMap() {
   tensors_ptr.clear();
   host_ptr.clear();
   obj_ptr.clear();
   tracked_tensors.clear();
 }
 
-
-void startMemTracking(){
+void startMemTracking() {
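+  // Drop any previously tracked pointers (without freeing them) before a new
+  // round of allocations begins.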
   tensors_ptr.clear();
   host_ptr.clear();
   obj_ptr.clear();
@@ -119,33 +103,28 @@ void startMemTracking(){
   tracked_tensors.clear();
 }
 
-
-void freeOutputTensors(){
+void freeOutputTensors() {
 
   DEBUG("**** Freeing Ouput Tensors *** \n");
-  for (void *ptr: tensors_ptr)
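+  // tensors_ptr entries are device allocations (freed with cudaFree);
+  // host_ptr and obj_ptr are host-side allocations (freed with free).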
+  for (void *ptr : tensors_ptr)
     cudaFree(ptr);
 
-  for(void *ptr: host_ptr)
+  for (void *ptr : host_ptr)
     free(ptr);
-  
-  for(void *ptr: obj_ptr)
+
+  for (void *ptr : obj_ptr)
     free(ptr);
-  
+
   clearTensorMap();
 }
 
-
-
-void clearOpCounter(){
+void clearOpCounter() {
   total_ops = 0;
   op_counter = 0;
   op_accuracies.clear();
 }
 
-
-
-void freeBatchMemory(){
+void freeBatchMemory() {
   // Free allocated memory for the current mini-batch
   freeOutputTensors();
   // Reinitialize counter for OpenTuner flags - next mini-batch of execution
@@ -153,5 +132,3 @@ void freeBatchMemory(){
   // Clearing profiling data map
   func_counters.clear();
 }
-
-
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/op_overheads.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/op_overheads.cc
index ed827de49594520096b0b423ea96f82fdeaaef3d..40418dd74a400f748b7877258912222e7005a372 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/op_overheads.cc
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/op_overheads.cc
@@ -3,113 +3,105 @@
 #ifndef OP_OVERHEADS_HEADER
 #define OP_OVERHEADS_HEADER
 
-
-#include <math.h>
-#include <sstream>
-#include "tensor.h"
 #include "op_overheads.h"
 #include "debug.h"
+#include "tensor.h"
+#include <math.h>
+#include <sstream>
 
 float scale_down_factor = 10000.0;
 std::string result_str = "";
 
+extern "C" {
 
-extern "C"{
-
-static float scaleDownComps(double total_comps){
+static float scaleDownComps(double total_comps) {
 
   total_comps = total_comps / scale_down_factor;
   return total_comps;
 }
 
 // private function
-static float getScaledComps(double total_comps, int error_scale, int factor_type){
+static float getScaledComps(double total_comps, int error_scale,
+                            int factor_type) {
 
   double scaled_comps;
-  
+
   // Logarithmic error factor scaling - higher error, lower cost
-  if(factor_type == 1){   
-    float error_factor = log2((float) error_scale + 3);
+  if (factor_type == 1) {
+    float error_factor = log2((float)error_scale + 3);
     scaled_comps = total_comps / error_factor;
   }
   // Linear error factor scaling
-  if(factor_type == 2){
-    scaled_comps = total_comps / (error_scale + 1); 
+  if (factor_type == 2) {
+    scaled_comps = total_comps / (error_scale + 1);
   }
   // Quadratic error factor scaling (scaling down)
-  if(factor_type == 3){
+  if (factor_type == 3) {
     error_scale = (error_scale + 1) * (error_scale + 1);
-    scaled_comps = total_comps / error_scale; 
+    scaled_comps = total_comps / error_scale;
   }
 
-  
   return scaled_comps;
 }
 
-
-static void addNormToResult(float comps){
+static void addNormToResult(float comps) {
 
   std::ostringstream ss;
   ss << std::fixed << comps;
-  
-  result_str.append( std::string(ss.str()) );
+
+  result_str.append(std::string(ss.str()));
   result_str.append("\t");
 }
 
-
-
-static void addCompsToResult(float total_comps,
-			     float opt_comps1,
-			     float opt_comps2,
-			     float opt_comps3){
+static void addCompsToResult(float total_comps, float opt_comps1,
+                             float opt_comps2, float opt_comps3) {
 
   std::ostringstream ss;
   ss << std::fixed << total_comps;
-  result_str.append( std::string(ss.str()) );
+  result_str.append(std::string(ss.str()));
   result_str.append("\t");
 
   std::ostringstream ss2;
-  ss2 << std::fixed << opt_comps1;  
-  result_str.append( std::string(ss2.str()) );
+  ss2 << std::fixed << opt_comps1;
+  result_str.append(std::string(ss2.str()));
   result_str.append("\t");
-  
+
   std::ostringstream ss3;
   ss3 << std::fixed << opt_comps2;
-  result_str.append( std::string(ss3.str()) );
+  result_str.append(std::string(ss3.str()));
   result_str.append("\t");
 
   std::ostringstream ss4;
   ss4 << std::fixed << opt_comps3;
-  result_str.append( std::string(ss4.str()) );
+  result_str.append(std::string(ss4.str()));
   result_str.append("\n");
 }
 
-
-void dumpCompOverheads(double total_comps, int error_scale){
+void dumpCompOverheads(double total_comps, int error_scale) {
 
   total_comps = scaleDownComps(total_comps);
-  
-  float scaled_comps1 = getScaledComps(total_comps, error_scale, 1); // Log scaling
-  float scaled_comps2 = getScaledComps(total_comps, error_scale, 2); // Linear scaling
-  float scaled_comps3 = getScaledComps(total_comps, error_scale, 3); // Quadratic scaling
- 
-  //INFO("error_scale = %d, total_comps = %f, scaled_comps = %f \n",
-  //	 error_scale, total_comps, scaled_comps1);
 
-  addCompsToResult(total_comps, scaled_comps1, scaled_comps2, scaled_comps3); 
-}
+  float scaled_comps1 =
+      getScaledComps(total_comps, error_scale, 1); // Log scaling
+  float scaled_comps2 =
+      getScaledComps(total_comps, error_scale, 2); // Linear scaling
+  float scaled_comps3 =
+      getScaledComps(total_comps, error_scale, 3); // Quadratic scaling
 
+  // INFO("error_scale = %d, total_comps = %f, scaled_comps = %f \n",
+  //	 error_scale, total_comps, scaled_comps1);
 
+  addCompsToResult(total_comps, scaled_comps1, scaled_comps2, scaled_comps3);
+}
 
-void add_conv_overheads(void* input_ptr, void* filter_ptr,
-			int vertical_stride, int horizontal_stride,
-			int error_scale){
+void add_conv_overheads(void *input_ptr, void *filter_ptr, int vertical_stride,
+                        int horizontal_stride, int error_scale) {
 
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* filter = (Tensor*) filter_ptr;
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
 
   double kernel_comps = filter->dims.dim_sizes[0] * filter->dims.dim_sizes[1] *
-    filter->dims.dim_sizes[2] * filter->dims.dim_sizes[3];
+                        filter->dims.dim_sizes[2] * filter->dims.dim_sizes[3];
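+  // The overhead model below approximates output H/W as input size over
+  // stride (padding and kernel extent are ignored).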
 
   double H_in = input->dims.dim_sizes[2] / vertical_stride;
   double W_in = input->dims.dim_sizes[3] / horizontal_stride;
@@ -118,31 +110,29 @@ void add_conv_overheads(void* input_ptr, void* filter_ptr,
   double total_comps = N_in * H_in * W_in * kernel_comps;
 
   dumpCompOverheads(total_comps, error_scale);
-    
 }
 
+void add_gemm_overheads(void *lhs_ptr, void *rhs_ptr, int error_scale) {
 
-void add_gemm_overheads(void* lhs_ptr, void* rhs_ptr, int error_scale){
+  Tensor *lhs = (Tensor *)lhs_ptr;
+  Tensor *rhs = (Tensor *)rhs_ptr;
 
-  Tensor* lhs = (Tensor*) lhs_ptr;
-  Tensor* rhs = (Tensor*) rhs_ptr;
-    
   int m = lhs->dims.dim_sizes[0];
   // The rhs last dimension must contain the neurons
-  int n = rhs->dims.dim_sizes[rhs->dims.num_dims-1]; // output neurons
+  int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
   int k = 1;
-  
+
   // Flattening the dimensions after the batch dimension
-  for (int j = 1 ; j < lhs->dims.num_dims; j++){
+  for (int j = 1; j < lhs->dims.num_dims; j++) {
     k = k * lhs->dims.dim_sizes[j]; // input neurons
   }
 
-  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims-2];
+  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];
   // Dimension-note: Check if k is same across the two tensors
-  
-  //printf("m = %d, n = %d, k = %d \n", m, n, k);
-  
-  if(rhs_k != k){
+
+  // printf("m = %d, n = %d, k = %d \n", m, n, k);
+
+  if (rhs_k != k) {
     printf("rhs=%d and lhs=%d columns/rows don't match", rhs_k, k);
     abort();
   }
@@ -150,40 +140,35 @@ void add_gemm_overheads(void* lhs_ptr, void* rhs_ptr, int error_scale){
   double m_d = m;
   double n_d = n;
   double rhs_k_d = rhs_k;
-  
+
   double total_comps = m_d * n_d * rhs_k_d * 1.0;
   dumpCompOverheads(total_comps, error_scale);
-  
 }
 
+void add_bias_overheads(void *input_ptr, int error_scale) {
 
-void add_bias_overheads(void* input_ptr, int error_scale){
-
-  Tensor* input = (Tensor*) input_ptr;  
+  Tensor *input = (Tensor *)input_ptr;
   double total_comps = input->num_elems;
 
   dumpCompOverheads(total_comps, error_scale);
-
 }
 
+void add_relu_overheads(void *input_ptr, int error_scale) {
 
-void add_relu_overheads(void* input_ptr, int error_scale){
-  
-  Tensor* input = (Tensor*) input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
   double total_comps = input->num_elems;
 
   dumpCompOverheads(total_comps, error_scale);
 }
 
+void add_pool_overheads(void *input_ptr, int kernel_size, int stride_size,
+                        int error_scale) {
 
-void add_pool_overheads(void* input_ptr, int kernel_size,
-			 int stride_size, int error_scale){
+  Tensor *input = (Tensor *)input_ptr;
 
-  Tensor* input = (Tensor*) input_ptr;
-  
   int num_dims = input->dims.num_dims;
-  double H = input->dims.dim_sizes[num_dims-2];
-  double W = input->dims.dim_sizes[num_dims-1];
+  double H = input->dims.dim_sizes[num_dims - 2];
+  double W = input->dims.dim_sizes[num_dims - 1];
   double C = input->dims.dim_sizes[1]; // channel dimension
   double N = input->dims.dim_sizes[0]; // batch dimension
 
@@ -193,50 +178,42 @@ void add_pool_overheads(void* input_ptr, int kernel_size,
   double total_comps = N * C * H * W * kernel_size * kernel_size;
 
   dumpCompOverheads(total_comps, error_scale);
-
 }
 
-
-void add_norms(void* norms_ptr, char* op_name, int error_value){
+void add_norms(void *norms_ptr, char *op_name, int error_value) {
 
   // Print operation name - {tensorAdd, tensorPool, tensorGemm}
   result_str.append(op_name);
   result_str.append("\t");
-  
+
   addNormToResult(error_value);
-  
-  Norm_t* norms = (Norm_t*) norms_ptr;
+
+  Norm_t *norms = (Norm_t *)norms_ptr;
 
   addNormToResult(norms->mean_l1);
   addNormToResult(norms->mean_l2);
   addNormToResult(norms->orig_inf_norm);
-  
+
   addNormToResult(norms->l1_norm);
   addNormToResult(norms->l2_norm);
   addNormToResult(norms->inf_norm);
 }
 
+void dump_result(const char *file_name) {
 
-void dump_result(const char* file_name){
+  // printf ("DUMPING RESULT = %s \n", result_str.c_str());
+  // printf ("-- file name = %s \n", file_name);
 
-  //printf ("DUMPING RESULT = %s \n", result_str.c_str());
-  //printf ("-- file name = %s \n", file_name);
-  
-  FILE* fp = fopen(file_name, "w+");
-  if(fp != NULL){
+  FILE *fp = fopen(file_name, "w+");
+  if (fp != NULL) {
     fwrite(result_str.c_str(), 1, result_str.length(), fp);
     fclose(fp);
-  }
-  else{
+  } else {
     ERROR("Could not create file \n");
   }
 
-    
-  result_str = "";  
+  result_str = "";
 }
-
-
 }
 
-
 #endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc
index 70ae14ca82fcf4627b8f49a573b76bb407c91718..18ebcfe4ef7e532e4657303baef6ea585b402a18 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc
@@ -2,95 +2,90 @@
 #ifndef PROFILING_HEADER
 #define PROFILING_HEADER
 
-
-
-#include <stdio.h>
-#include <stdarg.h>
-#include <ctime>
 #include <chrono>
+#include <ctime>
+#include <cuda_runtime.h>
 #include <iostream>
 #include <map>
 #include <memory>
 #include <random>
+#include <stdarg.h>
+#include <stdio.h>
 #include <string>
 #include <unordered_map>
-#include <cuda_runtime.h>
 
-#include "global_data.h"
 #include "debug.h"
-
+#include "global_data.h"
 
 /***** Profiling routines ***/
 
-
 std::chrono::time_point<std::chrono::high_resolution_clock> start_time;
 // previous_time maintains time for the latest timed operation
 std::chrono::time_point<std::chrono::high_resolution_clock> previous_time;
 
-extern "C"{
+extern "C" {
 
-  void startProfiling(){
-    start_time = std::chrono::high_resolution_clock::now();
-  }
+void startProfiling() {
+  start_time = std::chrono::high_resolution_clock::now();
+}
+
+void stopProfiling() {
 
-  void stopProfiling(){
-    
-    FILE* fp = fopen("profile_data.txt", "w+");
-    if(fp != NULL){   
-      fwrite(profile_data.c_str(), 1, profile_data.length(), fp);
-      fclose(fp);
-    }
-    
-    profile_data = "";
-    func_counters.clear();
+  FILE *fp = fopen("profile_data.txt", "w+");
+  if (fp != NULL) {
+    fwrite(profile_data.c_str(), 1, profile_data.length(), fp);
+    fclose(fp);
   }
 
+  profile_data = "";
+  func_counters.clear();
+}
 
-  void profileEvent(const char* event_name, bool compare_previous = false){
+void profileEvent(const char *event_name, bool compare_previous = false) {
 
-    checkCudaErrors(cudaDeviceSynchronize());
+  checkCudaErrors(cudaDeviceSynchronize());
 
-    auto it = func_counters.find(event_name);
-    if(it == func_counters.end()){
-      func_counters[event_name] = 1; 
-    }
-    else{
-      int counter = func_counters[event_name];
-      counter++;
-      func_counters[event_name] = counter;
-    }
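+  // Each event keeps a per-name counter so repeated events get distinct
+  // labels in the profile log.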
+  auto it = func_counters.find(event_name);
+  if (it == func_counters.end()) {
+    func_counters[event_name] = 1;
+  } else {
+    int counter = func_counters[event_name];
+    counter++;
+    func_counters[event_name] = counter;
+  }
 
-    std::stringstream ss;
-    ss << func_counters[event_name];
-    std::string event_count = ss.str();
+  std::stringstream ss;
+  ss << func_counters[event_name];
+  std::string event_count = ss.str();
 
-  
-    std::chrono::time_point<std::chrono::high_resolution_clock> zero_time; 
-    std::chrono::time_point<std::chrono::high_resolution_clock> time_reading =
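+  // zero_time is default-constructed (the clock's epoch), so
+  // time_reading - zero_time below yields an absolute timestamp in seconds.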
+  std::chrono::time_point<std::chrono::high_resolution_clock> zero_time;
+  std::chrono::time_point<std::chrono::high_resolution_clock> time_reading =
       std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double, std::ratio<1>> current_time =
+  std::chrono::duration<double, std::ratio<1>> current_time =
       time_reading - zero_time;
-  
-    INFO("AbsoluteTime, Event = %s, Time = %f \n", event_name, current_time.count());  
-    profile_data.append(event_name);
-    profile_data.append(event_count);
+
+  INFO("AbsoluteTime, Event = %s, Time = %f \n", event_name,
+       current_time.count());
+  profile_data.append(event_name);
+  profile_data.append(event_count);
+  profile_data.append("\t");
+  profile_data.append(std::to_string(current_time.count()));
+
+  if (compare_previous) {
+    std::chrono::duration<double, std::ratio<1>> duration_time =
+        time_reading - previous_time;
+
     profile_data.append("\t");
-    profile_data.append(std::to_string(current_time.count()));
-  
-    if(compare_previous){
-      std::chrono::duration<double, std::ratio<1>> duration_time =
-	time_reading - previous_time;
-
-      profile_data.append("\t");
-      profile_data.append(std::to_string(duration_time.count()));
-      INFO("TimeDuration, Event = %s, Time = %f \n", event_name, duration_time.count());  
-    }
-
-    profile_data.append("\n");  
-  
-    previous_time = time_reading; // set the previous time reading to the current profiled time 
+    profile_data.append(std::to_string(duration_time.count()));
+    INFO("TimeDuration, Event = %s, Time = %f \n", event_name,
+         duration_time.count());
   }
 
+  profile_data.append("\n");
+
+  previous_time = time_reading; // set the previous time reading to the current
+                                // profiled time
+}
 }
 
 #endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
index c629ca73b66111c5e5ad4cd67973d16cc13871df..98fd30ba9ee0ec7e0b81cfcaa9b3a699ec8e57b0 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
@@ -1,10 +1,12 @@
-/* This file includes the API implementation of the HPVM tensor runtime built for CPU
+/* This file includes the API implementation of the HPVM tensor runtime built
+** for CPU
 **
 **  Author: Hashim Sharif
 **  Email: hsharif3@illinois.edu
 */
 
 #include <algorithm>
+#include <bits/stdc++.h>
 #include <cfloat>
 #include <cmath>
 #include <cstdio>
@@ -14,1101 +16,1152 @@
 #include <iostream>
 #include <limits>
 #include <map>
-#include <cmath>
+#include <math.h>
 #include <memory>
-#include <vector>
+#include <omp.h>
+#include <pthread.h>
 #include <sstream>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string>
 #include <vector>
-#include <math.h>
-#include<bits/stdc++.h>
-#include <pthread.h>
-#include <omp.h>
 
 // Tensor runtime header files
 #include "tensor_cpu.h"
 #include "tensor_cpu_runtime.h"
 
 void llvm_hpvm_initTensorRt(int) {
-    // NOTE: Do Nothing
+  // NOTE: Do Nothing
 }
 
 void llvm_hpvm_cleanupTensorRt() {
-    // NOTE: Do Nothing
+  // NOTE: Do Nothing
 }
 
 void hpvm_request_tensor(void *tensor, int destination) {
-    // NOTE: Do Nothing
+  // NOTE: Do Nothing
 }
-  
+
 std::vector<void *> PtrVect;
 
 void freeBatchMemory() {
-    for(auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) {
-        free(*it);
-    }
-    PtrVect.erase(PtrVect.begin(), PtrVect.end());
+  for (auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) {
+    free(*it);
+  }
+  PtrVect.erase(PtrVect.begin(), PtrVect.end());
 }
 
 inline int getTypeSize(int data_type) {
-    return (data_type == 0) ? 4 : ((data_type == 1) ? 2 : 1);
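+  // Element sizes: 0 -> 4-byte float, 1 -> 2-byte (half precision),
+  // anything else -> 1 byte.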
+  return (data_type == 0) ? 4 : ((data_type == 1) ? 2 : 1);
 }
 
-void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) __attribute__((always_inline));
-inline void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) {
-    int type_size = getTypeSize(data_type);
-    size_t size_in_bytes = type_size * num_elems;
-    tensor->size_in_bytes = size_in_bytes;
+void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems)
+    __attribute__((always_inline));
+inline void setSizeInBytes(struct Tensor *tensor, int data_type,
+                           size_t num_elems) {
+  int type_size = getTypeSize(data_type);
+  size_t size_in_bytes = type_size * num_elems;
+  tensor->size_in_bytes = size_in_bytes;
 }
 
-void allocateMemCPU(struct Tensor *tensor, int data_type, 
-                    size_t num_elems, bool freeMemory = true) __attribute__((always_inline));
-inline void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems, bool freeMemory) {
-    setSizeInBytes(tensor, data_type, num_elems);
-    tensor->data_type = data_type;
-    tensor->num_elems = num_elems;
-    tensor->host_data = (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host
-    if(freeMemory)
-        PtrVect.push_back(tensor->host_data);
+void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems,
+                    bool freeMemory = true) __attribute__((always_inline));
+inline void allocateMemCPU(struct Tensor *tensor, int data_type,
+                           size_t num_elems, bool freeMemory) {
+  setSizeInBytes(tensor, data_type, num_elems);
+  tensor->data_type = data_type;
+  tensor->num_elems = num_elems;
+  tensor->host_data =
+      (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host
+  if (freeMemory)
+    PtrVect.push_back(tensor->host_data);
 }
 
 void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
-    Tensor *tensor = (Tensor *)tensor_ptr;
-    if (tensor->size_in_bytes != size_in_bytes) {
-        printf("The destination and source sizes don't match");
-    }
-    memcpy(tensor->host_data, data_ptr, size_in_bytes); // Is this efficient enough?
+  Tensor *tensor = (Tensor *)tensor_ptr;
+  if (tensor->size_in_bytes != size_in_bytes) {
+    printf("The destination and source sizes don't match");
+  }
+  memcpy(tensor->host_data, data_ptr,
+         size_in_bytes); // Is this efficient enough?
 }
 
-
-//void *create4DTensor(int data_type, int data_format, size_t dim1_size,
-                    //  size_t dim2_size, size_t dim3_size, size_t dim4_size, 
-                    //bool freeMemory = true) __attribute__((always_inline));
-inline void *create4DTensor(int data_type, int data_format, size_t dim1_size,         
-                                    size_t dim2_size, size_t dim3_size, 
-                                    size_t dim4_size, bool freeMemory) {
-    struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
-    size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-    if(freeMemory)
-        PtrVect.push_back(tensor);
-    allocateMemCPU(tensor, data_type, num_elems, freeMemory);
-    
-    // Setting the tensor dimensions
-    size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4);
-    dim_sizes[0] = dim1_size;
-    dim_sizes[1] = dim2_size;
-    dim_sizes[2] = dim3_size;
-    dim_sizes[3] = dim4_size;
-    tensor->dims.dim_sizes = dim_sizes;
-    tensor->dims.num_dims = 4;
-    
-    return tensor;
+// void *create4DTensor(int data_type, int data_format, size_t dim1_size,
+//  size_t dim2_size, size_t dim3_size, size_t dim4_size,
+// bool freeMemory = true) __attribute__((always_inline));
+inline void *create4DTensor(int data_type, int data_format, size_t dim1_size,
+                            size_t dim2_size, size_t dim3_size,
+                            size_t dim4_size, bool freeMemory) {
+  struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
+  size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
+  if (freeMemory)
+    PtrVect.push_back(tensor);
+  allocateMemCPU(tensor, data_type, num_elems, freeMemory);
+
+  // Setting the tensor dimensions
+  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4);
+  dim_sizes[0] = dim1_size;
+  dim_sizes[1] = dim2_size;
+  dim_sizes[2] = dim3_size;
+  dim_sizes[3] = dim4_size;
+  tensor->dims.dim_sizes = dim_sizes;
+  tensor->dims.num_dims = 4;
+
+  return tensor;
 }
 
-void* tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
-                                    int horizontal_pad, int vertical_stride,
-                                    int horizontal_stride, int conv_mode,
-                                    int compute_precision) {
-    Tensor *input = (Tensor *)input_ptr;
-    Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-
-    int batch_size = input->dims.dim_sizes[0];
-    int channels = input->dims.dim_sizes[1];
-    int image_height = input->dims.dim_sizes[2];
-    int image_width = input->dims.dim_sizes[3];
-    int num_filters = filter->dims.dim_sizes[0];
-    int kernel_height = filter->dims.dim_sizes[2];
-    int kernel_width = filter->dims.dim_sizes[3];
-    int output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    int output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    int num_filter_elem = kernel_height * kernel_width * channels;
-    int output_size = output_width * output_height;
-    
-    Tensor *output = (Tensor *) create4DTensor(0, 0, batch_size, num_filters, 
-                                                    output_height, output_width);
-    float * __restrict__ output_data = (float *)output->host_data;
-    
-    long int conv_data_size = 
-        sizeof(float) * num_filter_elem * output_height * output_width * batch_size;
-    float *host_data = (float *) malloc(conv_data_size);
-    //printf("number of batches: %d\n", batch_size);
-    omp_set_num_threads(4);
-     #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-        for(int ch = 0; ch < channels; ch++) {
-            for(int h = 0; h < output_height; h++) {
-                for(int w = 0; w < output_width; w++) {
-                    const int inH = h * vertical_stride - vertical_pad;
-                    const int inW = w * horizontal_stride - horizontal_pad;
-                    for(int i = 0; i < kernel_height; i++) {
-                        for(int j = 0; j < kernel_width; j++) {
-                            const int filter_elem_num = (ch * kernel_height + i) * kernel_width + j;
-                            const int output_index = h * output_width + w;
-                            const int out_index = b * num_filter_elem * output_size 
-                                        + output_index * num_filter_elem + filter_elem_num;
-                            if(inH + i >= 0 && inH + i < image_height 
-                                && inW + j >= 0 && inW + j < image_width) {
-                                host_data[out_index] = 
-                                    host_image[((b * channels + ch) * image_height 
-                                        + (inH + i)) * image_width + (inW + j)];
-                            } else {
-                                host_data[out_index] = 0;
-                            }
-                        }
-                    }
-                }
+void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
+                                  int vertical_pad, int horizontal_pad,
+                                  int vertical_stride, int horizontal_stride,
+                                  int conv_mode, int compute_precision) {
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+  int image_height = input->dims.dim_sizes[2];
+  int image_width = input->dims.dim_sizes[3];
+  int num_filters = filter->dims.dim_sizes[0];
+  int kernel_height = filter->dims.dim_sizes[2];
+  int kernel_width = filter->dims.dim_sizes[3];
+  int output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  int output_width = 1 + ((image_width - kernel_width + 2 * horizontal_pad) /
+                          horizontal_stride);
+  int num_filter_elem = kernel_height * kernel_width * channels;
+  int output_size = output_width * output_height;
+
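+  // data_type 0 selects 4-byte float elements (see getTypeSize); the
+  // data_format argument is unused by this CPU implementation.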
+  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
+                                            output_height, output_width);
+  float *__restrict__ output_data = (float *)output->host_data;
+
+  long int conv_data_size = sizeof(float) * num_filter_elem * output_height *
+                            output_width * batch_size;
+  float *host_data = (float *)malloc(conv_data_size);
+  // printf("number of batches: %d\n", batch_size);
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
+      for (int h = 0; h < output_height; h++) {
+        for (int w = 0; w < output_width; w++) {
+          const int inH = h * vertical_stride - vertical_pad;
+          const int inW = w * horizontal_stride - horizontal_pad;
+          for (int i = 0; i < kernel_height; i++) {
+            for (int j = 0; j < kernel_width; j++) {
+              const int filter_elem_num =
+                  (ch * kernel_height + i) * kernel_width + j;
+              const int output_index = h * output_width + w;
+              const int out_index = b * num_filter_elem * output_size +
+                                    output_index * num_filter_elem +
+                                    filter_elem_num;
+              if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+                  inW + j < image_width) {
+                host_data[out_index] =
+                    host_image[((b * channels + ch) * image_height +
+                                (inH + i)) *
+                                   image_width +
+                               (inW + j)];
+              } else {
+                host_data[out_index] = 0;
+              }
             }
+          }
         }
-        for (int p = 0; p < num_filters; ++p) {
-             for (int m = 0; m < output_size; ++m) {
-                float sum = 0;
-                #pragma omp simd reduction(+:sum)
-                for (int k = 0; k < num_filter_elem; ++k) {
-                    int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size;
-                    sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
-                }
-                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
-            }
+      }
+    }
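+    // Dense multiply over the unrolled patches:
+    // output[b][p][m] = sum_k patch[b][m][k] * filter[p][k].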
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        float sum = 0;
+#pragma omp simd reduction(+ : sum)
+        for (int k = 0; k < num_filter_elem; ++k) {
+          int input_index =
+              k + num_filter_elem * m + b * num_filter_elem * output_size;
+          sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
         }
+        output_data[b * (output_size * num_filters) + p * output_size + m] =
+            sum;
+      }
     }
-    free(host_data);
-    printf("END: %p\n", output);
-    return output;
+  }
+  free(host_data);
+  printf("END: %p\n", output);
+  return output;
 }
 
-void* tensorRegularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, 
-                                                int vertical_pad, int horizontal_pad, 
-                                                int vertical_stride, int horizontal_stride, 
-                                                int conv_mode, int compute_precision, 
-                                                int skip_every, int start) {
-    Tensor *input = (Tensor *)input_ptr;
-    Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-
-    const int batch_size = input->dims.dim_sizes[0];
-    const int channels = input->dims.dim_sizes[1];
-    const int image_height = input->dims.dim_sizes[2];
-    const int image_width = input->dims.dim_sizes[3];
-    const int num_filters = filter->dims.dim_sizes[0];
-    const int kernel_height = filter->dims.dim_sizes[2];
-    const int kernel_width = filter->dims.dim_sizes[3];
-    const int output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    const int output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    const int num_filter_elem = kernel_height * kernel_width * channels;
-
-    const int remainder = ((num_filter_elem - start) % skip_every > 0);
-    const int reduced_num_filter_elem = 
-            num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
-    const int output_size = output_width * output_height;
-    
-    Tensor *output = (Tensor *) create4DTensor(0, 0, batch_size, num_filters, 
-                                                    output_height, output_width);
-    float * __restrict__ output_data = (float *)output->host_data;
-    
-    const long int host_data_size = sizeof(float) * reduced_num_filter_elem 
-                                    * output_height * output_width * batch_size;
-    float *host_data = (float *) malloc(host_data_size);
-   
-    const int reduced_filer_size = sizeof(float) * num_filters * reduced_num_filter_elem;
-    float *reduced_kernels = (float *) malloc(reduced_filer_size);
-   
-    float fac =  (((float) skip_every) / ((float) skip_every - 1));
-    int reduced_filter_dim = reduced_num_filter_elem / channels;
-
-    // Create reduced filter
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for(int f = 0; f < num_filters; f++) {
-        for(int i = 0; i < reduced_num_filter_elem; i++) {
-            int ch = i / reduced_filter_dim;
-            int offset  = (start + ch) % skip_every; 
-            int in_index;
-            if(i < offset) {
-                in_index = i;
-            } else {
-                in_index = ((i - offset + 1) * skip_every) / (skip_every - 1) 
-                        + (((i - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset -1;
-            }
-            reduced_kernels[f * reduced_num_filter_elem + i] = 
-                                fac * host_filter[num_filter_elem * f + in_index];
+void *tensorRegularFilterSamplingConvolutionCPU(
+    void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad,
+    int vertical_stride, int horizontal_stride, int conv_mode,
+    int compute_precision, int skip_every, int start) {
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  const int batch_size = input->dims.dim_sizes[0];
+  const int channels = input->dims.dim_sizes[1];
+  const int image_height = input->dims.dim_sizes[2];
+  const int image_width = input->dims.dim_sizes[3];
+  const int num_filters = filter->dims.dim_sizes[0];
+  const int kernel_height = filter->dims.dim_sizes[2];
+  const int kernel_width = filter->dims.dim_sizes[3];
+  const int output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  const int output_width =
+      1 +
+      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+  const int num_filter_elem = kernel_height * kernel_width * channels;
+
+  const int remainder = ((num_filter_elem - start) % skip_every > 0);
+  const int reduced_num_filter_elem =
+      num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
+  const int output_size = output_width * output_height;
+
+  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
+                                            output_height, output_width);
+  float *__restrict__ output_data = (float *)output->host_data;
+
+  const long int host_data_size = sizeof(float) * reduced_num_filter_elem *
+                                  output_height * output_width * batch_size;
+  float *host_data = (float *)malloc(host_data_size);
+
+  const int reduced_filter_size =
+      sizeof(float) * num_filters * reduced_num_filter_elem;
+  float *reduced_kernels = (float *)malloc(reduced_filter_size);
+
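+  // fac rescales the surviving weights by skip_every / (skip_every - 1) to
+  // compensate for the dropped filter elements.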
+  float fac = (((float)skip_every) / ((float)skip_every - 1));
+  int reduced_filter_dim = reduced_num_filter_elem / channels;
+
+  // Create reduced filter
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int f = 0; f < num_filters; f++) {
+    for (int i = 0; i < reduced_num_filter_elem; i++) {
+      int ch = i / reduced_filter_dim;
+      int offset = (start + ch) % skip_every;
+      int in_index;
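+      // Map kept-element index i back into the full filter: indices below
+      // 'offset' are unchanged; past it, every skip_every-th source element
+      // is skipped.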
+      if (i < offset) {
+        in_index = i;
+      } else {
+        in_index = ((i - offset + 1) * skip_every) / (skip_every - 1) +
+                   (((i - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+                   offset - 1;
+      }
+      reduced_kernels[f * reduced_num_filter_elem + i] =
+          fac * host_filter[num_filter_elem * f + in_index];
+    }
+  }
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int h = 0; h < output_height; h++) {
+      for (int w = 0; w < output_width; w++) {
+        const int inH = h * vertical_stride - vertical_pad;
+        const int inW = w * horizontal_stride - horizontal_pad;
+        for (int fi = 0; fi < reduced_num_filter_elem; fi++) {
+          int in_index;
+          const int ch = fi / reduced_filter_dim;
+          const int offset = (start + ch) % skip_every;
+          if (fi < offset) {
+            in_index = fi;
+          } else {
+            in_index =
+                ((fi - offset + 1) * skip_every) / (skip_every - 1) +
+                (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+                offset - 1;
+          }
+          const int i =
+              (in_index % (kernel_width * kernel_height)) / kernel_width;
+          const int j = in_index % kernel_width;
+          const int output_index = h * output_width + w;
+          const int out_index = b * reduced_num_filter_elem * output_size +
+                                output_index * reduced_num_filter_elem + fi;
+          if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+              inW + j < image_width) {
+            host_data[out_index] =
+                host_image[((b * channels + ch) * image_height + (inH + i)) *
+                               image_width +
+                           (inW + j)];
+          } else {
+            host_data[out_index] = 0;
+          }
         }
+      }
     }
 
-    omp_set_num_threads(4);   
-    #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-            for(int h = 0; h < output_height; h++) {
-                for(int w = 0; w < output_width; w++) {
-                    const int inH = h * vertical_stride - vertical_pad;
-                    const int inW = w * horizontal_stride - horizontal_pad;
-                    for(int fi = 0; fi < reduced_num_filter_elem; fi++) {
-                        int in_index;
-                        const int ch = fi / reduced_filter_dim;
-                        const int offset  = (start + ch) % skip_every;
-                        if(fi < offset) {
-                            in_index = fi;
-                        } else {
-                            in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) 
-                                + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-                        }
-                        const int i = (in_index % (kernel_width * kernel_height)) / kernel_width; 
-                        const int j = in_index % kernel_width;
-                        const int output_index = h * output_width + w;
-                        const int out_index = b * reduced_num_filter_elem * output_size 
-                                            + output_index * reduced_num_filter_elem + fi;
-                        if(inH + i >= 0 && inH + i < image_height 
-                        && inW + j >= 0 && inW + j < image_width) {
-                            host_data[out_index] = 
-                                host_image[((b * channels + ch) * image_height 
-                                            + (inH + i)) * image_width + (inW + j)];
-                        } else {
-                            host_data[out_index] = 0;
-                        }
-                }
-            }
+    // Tensor Multiply
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        float sum = 0;
+#pragma omp simd reduction(+ : sum)
+        for (int k = 0; k < reduced_num_filter_elem; ++k) {
+          int input_index = k + reduced_num_filter_elem * m +
+                            b * reduced_num_filter_elem * output_size;
+          sum += host_data[input_index] *
+                 reduced_kernels[p * reduced_num_filter_elem + k];
         }
+        output_data[b * (output_size * num_filters) + p * output_size + m] =
+            sum;
+      }
+    }
+  }
+  free(reduced_kernels);
+  free(host_data);
 
-         // Tensor Multiply
-        for (int p = 0; p < num_filters; ++p) {
-            for (int m = 0; m < output_size; ++m) {
-                float sum = 0;
-                #pragma omp simd reduction(+:sum)
-                for (int k = 0; k < reduced_num_filter_elem; ++k) {
-                    int input_index = k + reduced_num_filter_elem * m 
-                                    + b * reduced_num_filter_elem * output_size;
-                    sum += host_data[input_index] 
-                            * reduced_kernels[p * reduced_num_filter_elem + k];
-                }
-                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
-            }
+  return output;
+}
+
+void *tensorIrregularFilterSamplingConvolutionCPU(
+    void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad,
+    int vertical_stride, int horizontal_stride, int conv_mode,
+    int compute_precision, int skip_every, int start) {
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  const int batch_size = input->dims.dim_sizes[0];
+  const int channels = input->dims.dim_sizes[1];
+  const int image_height = input->dims.dim_sizes[2];
+  const int image_width = input->dims.dim_sizes[3];
+  const int num_filters = filter->dims.dim_sizes[0];
+  const int kernel_height = filter->dims.dim_sizes[2];
+  const int kernel_width = filter->dims.dim_sizes[3];
+  const int output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  const int output_width =
+      1 +
+      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+  const int num_filter_elem = kernel_height * kernel_width * channels;
+
+  const int remainder = ((num_filter_elem - start) % skip_every > 0);
+  const int reduced_num_filter_elem =
+      num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
+  const int output_size = output_width * output_height;
+
+  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
+                                            output_height, output_width);
+  float *__restrict__ output_data = (float *)output->host_data;
+
+  const long int host_data_size = sizeof(float) * reduced_num_filter_elem *
+                                  output_height * output_width * batch_size;
+  float *host_data = (float *)malloc(host_data_size);
+
+  const int reduced_filter_size =
+      sizeof(float) * num_filters * reduced_num_filter_elem;
+  float *reduced_kernels = (float *)malloc(reduced_filter_size);
+
+  float fac = (((float)skip_every) / ((float)skip_every - 1));
+  int reduced_filter_dim = reduced_num_filter_elem / channels;
+
+  // Create Reduced filter
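+  // (the first 'start' weights are copied verbatim; the tail is sampled
+  // every skip_every elements and rescaled by fac)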
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int f = 0; f < num_filters; f++) {
+    for (int i = 0; i < start; i++) {
+      reduced_kernels[f * reduced_num_filter_elem + i] =
+          host_filter[num_filter_elem * f + i];
+    }
+#pragma omp simd
+    for (int i = start; i < reduced_num_filter_elem; i++) {
+      int in_index = ((i - start + 1) * skip_every) / (skip_every - 1) +
+                     (((i - start + 1) * skip_every) % (skip_every - 1) > 0) +
+                     start - 1;
+      reduced_kernels[f * reduced_num_filter_elem + i] =
+          fac * host_filter[num_filter_elem * f + in_index];
+    }
+  }
+
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int h = 0; h < output_height; h++) {
+      for (int w = 0; w < output_width; w++) {
+        const int inH = h * vertical_stride - vertical_pad;
+        const int inW = w * horizontal_stride - horizontal_pad;
+        for (int fi = 0; fi < reduced_num_filter_elem; fi++) {
+          int in_index;
+          int offset = start;
+          if (fi < offset) {
+            in_index = fi;
+          } else {
+            in_index =
+                ((fi - offset + 1) * skip_every) / (skip_every - 1) +
+                (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+                offset - 1;
+          }
+          const int ch = in_index / (kernel_width * kernel_height);
+          const int i =
+              (in_index % (kernel_width * kernel_height)) / kernel_width;
+          const int j = in_index % kernel_width;
+          const int output_index = h * output_width + w;
+          const int out_index = b * reduced_num_filter_elem * output_size +
+                                output_index * reduced_num_filter_elem + fi;
+          if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+              inW + j < image_width) {
+            host_data[out_index] =
+                host_image[((b * channels + ch) * image_height + (inH + i)) *
+                               image_width +
+                           (inW + j)];
+          } else {
+            host_data[out_index] = 0;
+          }
         }
+      }
+    }
 
+    // Tensor Multiply
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        float sum = 0;
+#pragma omp simd reduction(+ : sum)
+        for (int k = 0; k < reduced_num_filter_elem; ++k) {
+          int input_index = k + reduced_num_filter_elem * m +
+                            b * reduced_num_filter_elem * output_size;
+          sum += host_data[input_index] *
+                 reduced_kernels[p * reduced_num_filter_elem + k];
+        }
+        output_data[b * (output_size * num_filters) + p * output_size + m] =
+            sum;
+      }
     }
-    free(reduced_kernels);
-    free(host_data);
-  
-    return output;
+  }
+  free(reduced_kernels);
+  free(host_data);
+
+  return output;
 }
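The in_index arithmetic above compacts the filter: positions below start are kept as-is, and past start each reduced position is mapped back to the next original element that survives dropping every skip_every-th one. A standalone sketch of the mapping with hypothetical sizes (not part of the runtime):

#include <cstdio>

int main() {
  const int skip_every = 3, start = 1, num_filter_elem = 10; // hypothetical
  const int remainder = ((num_filter_elem - start) % skip_every > 0);
  const int reduced_num_filter_elem =
      num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
  for (int i = 0; i < reduced_num_filter_elem; i++) {
    int in_index =
        (i < start)
            ? i
            : ((i - start + 1) * skip_every) / (skip_every - 1) +
                  (((i - start + 1) * skip_every) % (skip_every - 1) > 0) +
                  start - 1;
    printf("reduced %d -> original %d\n", i, in_index);
  }
  // Maps 0..6 onto 0,2,3,5,6,8,9: every third element after start is dropped.
  return 0;
}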
 
-void* tensorIrregularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, 
-                                                  int vertical_pad, int horizontal_pad, 
-                                                  int vertical_stride, int horizontal_stride, 
-                                                  int conv_mode, int compute_precision, 
-                                                  int skip_every, int start) {
-    Tensor *input = (Tensor *)input_ptr;
-    Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-
-    const int batch_size = input->dims.dim_sizes[0];
-    const int channels = input->dims.dim_sizes[1];
-    const int image_height = input->dims.dim_sizes[2];
-    const int image_width = input->dims.dim_sizes[3];
-    const int num_filters = filter->dims.dim_sizes[0];
-    const int kernel_height = filter->dims.dim_sizes[2];
-    const int kernel_width = filter->dims.dim_sizes[3];
-    const int output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    const int output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    const int num_filter_elem = kernel_height * kernel_width * channels;
-
-    const int remainder = ((num_filter_elem - start) % skip_every > 0);
-    const int reduced_num_filter_elem = 
-            num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
-    const int output_size = output_width * output_height;
-    
-    Tensor *output = (Tensor *) create4DTensor(0, 0, batch_size, num_filters, 
-                                                    output_height, output_width);
-    float * __restrict__ output_data = (float *)output->host_data;
-    
-    const long int host_data_size = sizeof(float) * reduced_num_filter_elem 
-                                    * output_height * output_width * batch_size;
-    float *host_data = (float *) malloc(host_data_size);
-   
-    const int reduced_filer_size = sizeof(float) * num_filters * reduced_num_filter_elem;
-    float *reduced_kernels = (float *) malloc(reduced_filer_size);
-   
-    float fac =  (((float) skip_every) / ((float) skip_every - 1));
-    int reduced_filter_dim = reduced_num_filter_elem / channels;
-
-    // Create Reduced filter
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for(int f = 0; f < num_filters; f++) {
-        for(int i = 0; i < start; i++) {
-            reduced_kernels[f * reduced_num_filter_elem + i] = 
-                                        host_filter[num_filter_elem * f + i];
+void *tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr,
+                                  int vertical_pad, int horizontal_pad,
+                                  int vertical_stride, int horizontal_stride,
+                                  int conv_mode, int compute_precision, int row,
+                                  int start) {
+
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+  int image_height = input->dims.dim_sizes[2];
+  int image_width = input->dims.dim_sizes[3];
+  int num_filters = filter->dims.dim_sizes[0];
+  int kernel_height = filter->dims.dim_sizes[2];
+  int kernel_width = filter->dims.dim_sizes[3];
+
+  int full_output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  int full_output_width =
+      1 +
+      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+  int num_filter_elem = kernel_height * kernel_width * channels;
+  int full_output_size = full_output_height * full_output_width;
+
+  Tensor *full_output = (Tensor *)create4DTensor(
+      0, 0, batch_size, num_filters, full_output_height, full_output_width);
+  float *__restrict__ full_output_data = (float *)full_output->host_data;
+
+  int remainder = (full_output_height - start) % row > 0;
+  int output_height =
+      full_output_height - ((full_output_height - start) / row) - remainder;
+
+  int output_width = full_output_width;
+  float *output_data = (float *)malloc(
+      sizeof(float) * batch_size * num_filters * output_height * output_width);
+  int output_size = output_width * output_height;
+  long int host_data_size = sizeof(float) * num_filter_elem * output_height *
+                            output_width * batch_size;
+  float *host_data = (float *)malloc(host_data_size);
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
+      for (int h = 0; h < output_height; h++) {
+        int inH;
+        if (h < start) {
+          inH = h * vertical_stride - vertical_pad;
+        } else {
+          int h_index = ((h - start + 1) * row) / (row - 1) +
+                        (((h - start + 1) * row) % (row - 1) > 0) + start - 1;
+          inH = h_index * vertical_stride - vertical_pad;
         }
-        #pragma omp simd
-        for(int i = start; i < reduced_num_filter_elem; i++) {
-            int in_index = ((i - start + 1) * skip_every) / (skip_every - 1)
-                    + (((i - start + 1) * skip_every) % (skip_every - 1) > 0) + start - 1;
-            reduced_kernels[f * reduced_num_filter_elem + i] = 
-                            fac * host_filter[num_filter_elem * f + in_index];
+        for (int w = 0; w < output_width; w++) {
+          int inW = w * horizontal_stride - horizontal_pad;
+          for (int i = 0; i < kernel_height; i++) {
+            for (int j = 0; j < kernel_width; j++) {
+              const int filter_elem_num =
+                  (ch * kernel_height + i) * kernel_width + j;
+              const int output_index = h * output_width + w;
+              const int out_index = b * num_filter_elem * output_size +
+                                    output_index * num_filter_elem +
+                                    filter_elem_num;
+              if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+                  inW + j < image_width) {
+                host_data[out_index] =
+                    host_image[((b * channels + ch) * image_height +
+                                (inH + i)) *
+                                   image_width +
+                               (inW + j)];
+              } else {
+                host_data[out_index] = 0;
+              }
+            }
+          }
         }
+      }
     }
 
-    #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-            for(int h = 0; h < output_height; h++) {
-                for(int w = 0; w < output_width; w++) {
-                    const int inH = h * vertical_stride - vertical_pad;
-                    const int inW = w * horizontal_stride - horizontal_pad;
-                    for(int fi = 0; fi < reduced_num_filter_elem; fi++) {
-                        int in_index;
-                        int offset = start;
-                        if(fi < offset) {
-                            in_index = fi;
-                        } else {
-                            in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) 
-                             + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-                        }
-                        const int ch = in_index / (kernel_width * kernel_height);
-                        const int i = (in_index % (kernel_width * kernel_height)) / kernel_width; 
-                        const int j = in_index % kernel_width;
-                        const int output_index = h * output_width + w;
-                        const int out_index = b * reduced_num_filter_elem * output_size 
-                                            + output_index * reduced_num_filter_elem + fi;
-                        if(inH + i >= 0 && inH + i < image_height 
-                        && inW + j >= 0 && inW + j < image_width) {
-                            host_data[out_index] = 
-                                host_image[((b * channels + ch) * image_height 
-                                            + (inH + i)) * image_width + (inW + j)];
-                        } else {
-                            host_data[out_index] = 0;
-                        }
-                }
-            }
+    // Tensor Multiply
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        float sum = 0;
+#pragma omp simd reduction(+ : sum)
+        for (int k = 0; k < num_filter_elem; ++k) {
+          int input_index =
+              k + num_filter_elem * m + b * num_filter_elem * output_size;
+          sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
         }
+        output_data[b * (output_size * num_filters) + p * output_size + m] =
+            sum;
+      }
+    }
 
-        // Tensor Multiply
-        for (int p = 0; p < num_filters; ++p) {
-            for (int m = 0; m < output_size; ++m) {
-                float sum = 0;
-                #pragma omp simd reduction(+:sum)
-                for (int k = 0; k < reduced_num_filter_elem; ++k) {
-                    int input_index = k + reduced_num_filter_elem * m 
-                                    + b * reduced_num_filter_elem * output_size;
-                    sum += host_data[input_index] 
-                                * reduced_kernels[p * reduced_num_filter_elem + k];
-                }
-                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
-            }
+    // Interpolate: reconstruct the skipped rows from neighboring computed rows
+    for (int p = 0; p < num_filters; ++p) {
+      for (int h = 0; h < full_output_height; h++) {
+        for (int w = 0; w < full_output_width; w++) {
+          int full_output_index = b * num_filters * full_output_size +
+                                  p * full_output_size + h * full_output_width +
+                                  w;
+          if (h < start) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + w;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if (h == full_output_height - 1) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               (output_height - 1) * output_width + w;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if (h == 0) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               0 * output_width + w;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if ((h - start) % row == 0) {
+            int row_index = h - ((h + 1 - start) / row);
+            int output_index = b * num_filters * output_size + p * output_size +
+                               row_index * output_width + w;
+            full_output_data[full_output_index] =
+                (output_data[output_index] +
+                 output_data[output_index - output_width]) /
+                2;
+          } else {
+            int remainder = ((h + 1 - start) % row) > 0;
+            int row_index = h - ((h + 1 - start) / row) - remainder;
+            int output_index = b * num_filters * output_size + p * output_size +
+                               row_index * output_width + w;
+            full_output_data[full_output_index] = output_data[output_index];
+          }
         }
-
+      }
     }
-    free(reduced_kernels);
-    free(host_data);
-  
-    return output;
-}
+  }
+  free(output_data);
+  free(host_data);
 
-void* tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
-                                int horizontal_pad, int vertical_stride, int horizontal_stride, 
-                                int conv_mode, int compute_precision, int row, int start) {
-    
-    Tensor *input = (Tensor *)input_ptr;
-    Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-
-    int batch_size = input->dims.dim_sizes[0];
-    int channels = input->dims.dim_sizes[1];
-    int image_height = input->dims.dim_sizes[2];
-    int image_width = input->dims.dim_sizes[3];
-    int num_filters = filter->dims.dim_sizes[0];
-    int kernel_height = filter->dims.dim_sizes[2];
-    int kernel_width = filter->dims.dim_sizes[3];
-
-    int full_output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    int full_output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    int num_filter_elem = kernel_height * kernel_width * channels;
-    int full_output_size = full_output_height * full_output_width;
-
-    Tensor *full_output = (Tensor *) create4DTensor(0, 0, batch_size, num_filters, 
-                                            full_output_height, full_output_width);
-    float * __restrict__ full_output_data = (float *)full_output->host_data;
-   
-    int remainder = (full_output_height - start) % row > 0;
-    int output_height = 
-            full_output_height - ((full_output_height - start) / row) - remainder;
-
-    int output_width = full_output_width;
-    float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters 
-                                                * output_height * output_width);   
-    int output_size = output_width * output_height;
-    long int host_data_size = sizeof(float) * num_filter_elem * output_height 
-                                                        * output_width * batch_size;
-    float *host_data = (float *) malloc(host_data_size);
+  return full_output;
+}
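Row perforation convolves only a subset of output rows and rebuilds the rest in the interpolation pass: rows that land on the perforation stride are averaged from their two computed neighbors, and the remaining rows are copied from the nearest computed row. A standalone sketch of the row bookkeeping, under hypothetical sizes:

#include <cstdio>

int main() {
  const int row = 2, start = 1, full_output_height = 6; // hypothetical
  for (int h = 0; h < full_output_height; h++) {
    if (h < start || h == 0 || h == full_output_height - 1) {
      printf("full row %d: copied from a directly computed row\n", h);
    } else if ((h - start) % row == 0) {
      int row_index = h - ((h + 1 - start) / row);
      printf("full row %d: average of computed rows %d and %d\n", h,
             row_index - 1, row_index);
    } else {
      int remainder = ((h + 1 - start) % row) > 0;
      int row_index = h - ((h + 1 - start) / row) - remainder;
      printf("full row %d: copied from computed row %d\n", h, row_index);
    }
  }
  return 0;
}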
 
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-        for(int ch = 0; ch < channels; ch++) {
-            for(int h = 0; h < output_height; h++) {
-                int inH;
-                if(h < start) {
-                    inH = h * vertical_stride - vertical_pad;
-                } else {
-                    int h_index = ((h - start + 1) * row) / (row - 1) 
-                                + (((h - start + 1) * row) % (row - 1) > 0) + start - 1;
-                    inH = h_index * vertical_stride - vertical_pad;
-                }
-                for(int w = 0; w < output_width; w++) {
-                    int inW = w * horizontal_stride - horizontal_pad;
-                    for(int i = 0; i < kernel_height; i++) {
-                        for(int j = 0; j < kernel_width; j++) {
-                            const int filter_elem_num = 
-                                        (ch * kernel_height + i) * kernel_width + j;
-                            const int output_index = h * output_width + w;
-                            const int out_index = b * num_filter_elem * output_size 
-                                    + output_index * num_filter_elem + filter_elem_num;
-                            if(inH + i >= 0 && inH + i < image_height 
-                            && inW + j >= 0 && inW + j < image_width) {
-                                host_data[out_index] = 
-                                    host_image[((b * channels + ch) * image_height 
-                                            + (inH + i)) * image_width + (inW + j)];
-                            } else {
-                                host_data[out_index] = 0;
-                            }
-                        }
-                    }
-                }
+void *tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr,
+                                  int vertical_pad, int horizontal_pad,
+                                  int vertical_stride, int horizontal_stride,
+                                  int conv_mode, int compute_precision, int col,
+                                  int start) {
+
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+  int image_height = input->dims.dim_sizes[2];
+  int image_width = input->dims.dim_sizes[3];
+  int num_filters = filter->dims.dim_sizes[0];
+  int kernel_height = filter->dims.dim_sizes[2];
+  int kernel_width = filter->dims.dim_sizes[3];
+  int full_output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  int full_output_width =
+      1 +
+      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+  int num_filter_elem = kernel_height * kernel_width * channels;
+  int full_output_size = full_output_height * full_output_width;
+
+  Tensor *full_output = (Tensor *)create4DTensor(
+      0, 0, batch_size, num_filters, full_output_height, full_output_width);
+  float *__restrict__ full_output_data = (float *)full_output->host_data;
+
+  int remainder = (full_output_width - start) % col > 0;
+  int output_width =
+      full_output_width - ((full_output_width - start) / col) - remainder;
+
+  int output_height = full_output_height;
+  float *output_data = (float *)malloc(
+      sizeof(float) * batch_size * num_filters * output_height * output_width);
+  int output_size = output_width * output_height;
+  long int host_data_size = sizeof(float) * num_filter_elem * output_height *
+                            output_width * batch_size;
+  float *host_data = (float *)malloc(host_data_size);
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
+      for (int h = 0; h < output_height; h++) {
+        int inH = h * vertical_stride - vertical_pad;
+        for (int w = 0; w < output_width; w++) {
+          int inW;
+          if (w < start) {
+            inW = w * horizontal_stride - horizontal_pad;
+          } else {
+            int w_index = ((w - start + 1) * col) / (col - 1) +
+                          (((w - start + 1) * col) % (col - 1) > 0) + start - 1;
+            inW = w_index * horizontal_stride - horizontal_pad;
+          }
+          for (int i = 0; i < kernel_height; i++) {
+            for (int j = 0; j < kernel_width; j++) {
+              const int filter_elem_num =
+                  (ch * kernel_height + i) * kernel_width + j;
+              const int output_index = h * output_width + w;
+              const int out_index = b * num_filter_elem * output_size +
+                                    output_index * num_filter_elem +
+                                    filter_elem_num;
+              if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+                  inW + j < image_width) {
+                host_data[out_index] =
+                    host_image[((b * channels + ch) * image_height +
+                                (inH + i)) *
+                                   image_width +
+                               (inW + j)];
+              } else {
+                host_data[out_index] = 0;
+              }
             }
+          }
         }
+      }
+    }
 
-        // Tensor Multiply
-        for (int p = 0; p < num_filters; ++p) {
-            for (int m = 0; m < output_size; ++m) {
-                float sum = 0;
-                #pragma omp simd reduction(+:sum)
-                for (int k = 0; k < num_filter_elem; ++k) {
-                    int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size;
-                    sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
-                }
-                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
-            }
+    // Tensor Multiply
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        float sum = 0;
+#pragma omp simd reduction(+ : sum)
+        for (int k = 0; k < num_filter_elem; ++k) {
+          int input_index =
+              k + num_filter_elem * m + b * num_filter_elem * output_size;
+          sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
         }
+        output_data[b * (output_size * num_filters) + p * output_size + m] =
+            sum;
+      }
+    }
 
-        // Interpolate
-        for (int p = 0; p < num_filters; ++p) {
-            for(int h = 0; h < full_output_height; h++) { 
-                for(int w = 0; w < full_output_width; w++) {
-                   int full_output_index = b * num_filters * full_output_size 
-                            + p * full_output_size + h * full_output_width  + w;
-                   if(h < start) {
-                       int output_index = b * num_filters * output_size 
-                                        + p * output_size + h * output_width  + w;
-                       full_output_data[full_output_index] = output_data[output_index];
-                   } else if(h == full_output_height - 1) {
-                       int output_index = b * num_filters * output_size + p * output_size 
-                                                + (output_height - 1) * output_width  + w;
-                       full_output_data[full_output_index] = output_data[output_index];
-                    } else if(h == 0) {
-                        int output_index = b * num_filters * output_size 
-                                            + p * output_size + 0 * output_width  + w;
-                        full_output_data[full_output_index] = output_data[output_index]; 
-                    } else if((h - start) % row == 0) {
-                        int row_index = h - ((h + 1 - start) / row); 
-                        int output_index = b * num_filters * output_size + p * output_size 
-                                                            + row_index * output_width + w;
-                        full_output_data[full_output_index] = 
-                            (output_data[output_index] + output_data[output_index - output_width]) / 2;
-                   } else {
-                       int remainder = ((h + 1 - start) % row) > 0;
-                       int row_index = h - ((h + 1 - start) / row) - remainder;
-                       int output_index = b * num_filters * output_size + p * output_size 
-                                                        + row_index * output_width + w;
-                       full_output_data[full_output_index] = output_data[output_index];
-                  }
-                }
-            }
-         }
+    // Interpolate: reconstruct the skipped columns from neighboring computed
+    // columns
+    for (int p = 0; p < num_filters; ++p) {
+      for (int h = 0; h < full_output_height; h++) {
+        for (int w = 0; w < full_output_width; w++) {
+          int full_output_index = b * num_filters * full_output_size +
+                                  p * full_output_size + h * full_output_width +
+                                  w;
+          if (w < start) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + w;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if (w == full_output_width - 1) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + output_width - 1;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if (w == 0) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + 0;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if ((w - start) % col == 0) {
+            int col_index = w - ((w + 1 - start) / col);
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + col_index;
+            full_output_data[full_output_index] =
+                (output_data[output_index] + output_data[output_index - 1]) / 2;
+          } else {
+            int remainder = ((w + 1 - start) % col) > 0;
+            int col_index = w - ((w + 1 - start) / col) - remainder;
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + col_index;
+            full_output_data[full_output_index] = output_data[output_index];
+          }
+        }
+      }
     }
-    free(output_data);
-    free(host_data);
+  }
+  free(output_data);
+  free(host_data);
 
-    return full_output;
+  return full_output;
 }
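Column perforation mirrors the row case along the width axis. As a worked example of the sizing arithmetic: with full_output_width = 8, col = 2, and start = 1, remainder = ((8 - 1) % 2 > 0) = 1 and output_width = 8 - ((8 - 1) / 2) - 1 = 4, so only half of the output columns are convolved and the interpolation pass reconstructs the other four.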
 
-void* tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
-                                int horizontal_pad, int vertical_stride, int horizontal_stride, 
-                                int conv_mode, int compute_precision, int col, int start) {
-    
+void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad,
+                       int horizontal_pad, int vertical_stride,
+                       int horizontal_stride, int conv_mode,
+                       int compute_precision, int row, int col, int skip_every,
+                       int start) {
+  if (row > 1) {
+    printf("ROW PERFORATION\n");
+    return tensorRowPerfConvolutionCPU(
+        input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
+        horizontal_stride, conv_mode, compute_precision, row, start);
+  }
+  if (col > 1) {
+    printf("COL PERFORATION\n");
+    return tensorColPerfConvolutionCPU(
+        input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
+        horizontal_stride, conv_mode, compute_precision, col, start);
+  }
+  if (skip_every > 1) {
+    printf("INPUT FILTERING\n");
     Tensor *input = (Tensor *)input_ptr;
     Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-    
-    int batch_size = input->dims.dim_sizes[0];
-    int channels = input->dims.dim_sizes[1];
-    int image_height = input->dims.dim_sizes[2];
-    int image_width = input->dims.dim_sizes[3];
-    int num_filters = filter->dims.dim_sizes[0];
-    int kernel_height = filter->dims.dim_sizes[2];
-    int kernel_width = filter->dims.dim_sizes[3];
-    int full_output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    int full_output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    int num_filter_elem = kernel_height * kernel_width * channels;
-    int full_output_size = full_output_height * full_output_width;
-
-    Tensor *full_output = (Tensor *) create4DTensor(0, 0, batch_size, num_filters, 
-                                                    full_output_height, full_output_width);
-    float * __restrict__ full_output_data = (float *)full_output->host_data;
-
-    int remainder = (full_output_width - start) % col > 0;
-    int output_width = full_output_width - ((full_output_width - start) / col) - remainder;
-
-    int output_height = full_output_height;
-    float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters 
-                                                    * output_height * output_width);
-    int output_size = output_width * output_height;
-    long int host_data_size = sizeof(float) * num_filter_elem * output_height 
-                                                        * output_width * batch_size;
-    float *host_data = (float *) malloc(host_data_size);
 
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-        for(int ch = 0; ch < channels; ch++) {
-            for(int h = 0; h < output_height; h++) {
-                int inH = h * vertical_stride - vertical_pad;
-                for(int w = 0; w < output_width; w++) {
-                    int inW;
-                    if(w < start) {
-                        inW = w * horizontal_stride - horizontal_pad;
-                    } else {
-                        int w_index = ((w - start + 1) * col) / (col - 1) 
-                                + (((w - start + 1) * col) % (col - 1) > 0) + start - 1;
-                        inW = w_index * horizontal_stride - horizontal_pad;
-                    }
-                    for(int i = 0; i < kernel_height; i++) {
-                        for(int j = 0; j < kernel_width; j++) {
-                            const int filter_elem_num = 
-                                        (ch * kernel_height + i) * kernel_width + j;
-                            const int output_index = h * output_width + w;
-                            const int out_index = b * num_filter_elem * output_size 
-                                    + output_index * num_filter_elem + filter_elem_num;
-                            if(inH + i >= 0 && inH + i < image_height 
-                            && inW + j >= 0 && inW + j < image_width) {
-                                host_data[out_index] = 
-                                    host_image[((b * channels + ch) * image_height 
-                                            + (inH + i)) * image_width + (inW + j)];
-                            } else {
-                                host_data[out_index] = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
+    const int kernel_height = filter->dims.dim_sizes[2];
+    const int kernel_width = filter->dims.dim_sizes[3];
 
-        // Tensor Multiply
-        for (int p = 0; p < num_filters; ++p) {
-            for (int m = 0; m < output_size; ++m) {
-                float sum = 0;
-                #pragma omp simd reduction(+:sum)
-                for (int k = 0; k < num_filter_elem; ++k) {
-                    int input_index = k + num_filter_elem * m 
-                                            + b * num_filter_elem * output_size;
-                    sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
-                }
-                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
-            }
-        }
+    // If the filter plane size divides evenly by skip_every, the sampling
+    // pattern is uniform and the regular sampling kernel applies.
+    if (!(kernel_height * kernel_width % skip_every)) {
+      return tensorRegularFilterSamplingConvolutionCPU(
+          input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
+          horizontal_stride, conv_mode, compute_precision, skip_every, start);
+    }
+    return tensorIrregularFilterSamplingConvolutionCPU(
+        input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
+        horizontal_stride, conv_mode, compute_precision, skip_every, start);
+  }
+  printf("REGULAR CONV\n");
+  return tensorRegularConvolutionCPU(
+      input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
+      horizontal_stride, conv_mode, compute_precision);
+}
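Hypothetical call sites for the dispatcher, as a sketch only: it assumes the runtime is linked and that input and filter are 4D tensors built elsewhere with create4DTensor. At most one knob is set above 1 per call; the argument order follows the signature above (pads, strides, conv_mode, compute_precision, row, col, skip_every, start).

void *run_variants(void *input, void *filter) {
  // row = col = skip_every = 1: falls through to the regular convolution.
  void *exact = tensorConvApprox(input, filter, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0);
  // row = 2: perforate rows; col = 2: perforate columns.
  void *rows = tensorConvApprox(input, filter, 1, 1, 1, 1, 0, 0, 2, 1, 1, 0);
  void *cols = tensorConvApprox(input, filter, 1, 1, 1, 1, 0, 0, 1, 2, 1, 0);
  // skip_every = 2: sample filter elements instead of perforating outputs.
  void *sampled = tensorConvApprox(input, filter, 1, 1, 1, 1, 0, 0, 1, 1, 2, 0);
  (void)exact; (void)rows; (void)cols;
  return sampled;
}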
 
-        // Interpolate
-        for (int p = 0; p < num_filters; ++p) {
-            for(int h = 0; h < full_output_height; h++) {
-                for(int w = 0; w < full_output_width; w++) {
-                    int full_output_index = b * num_filters * full_output_size 
-                                + p * full_output_size + h * full_output_width  + w;
-                     if(w < start) {
-                         int output_index = b * num_filters * output_size 
-                                        + p * output_size + h * output_width + w;
-                         full_output_data[full_output_index] = output_data[output_index];
-                    } else if(w == full_output_width - 1) {
-                        int output_index = b * num_filters * output_size + p * output_size 
-                                                    + h * output_width  + output_width - 1;
-                        full_output_data[full_output_index] = output_data[output_index];
-                    } else if(w == 0) {
-                        int output_index = b * num_filters * output_size + p * output_size 
-                                                                + h * output_width  + 0;
-                        full_output_data[full_output_index] = output_data[output_index];
-                    } else if((w - start) % col == 0) {
-                        int col_index = w - ((w + 1 - start) / col);
-                        int output_index = b * num_filters * output_size + p * output_size 
-                                                            + h * output_width + col_index;
-                        full_output_data[full_output_index] = 
-                            (output_data[output_index] + output_data[output_index - 1]) / 2;
-                    } else {
-                        int remainder = ((w + 1 - start) % col) > 0;
-                        int col_index = w - ((w + 1 - start) / col) - remainder;
-                        int output_index = b * num_filters * output_size + p * output_size 
-                                                            + h * output_width + col_index;
-                        full_output_data[full_output_index] = output_data[output_index];
-                    }
-                }
+void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
+                           int horizontal_pad, int vertical_stride,
+                           int horizontal_stride, int conv_mode,
+                           int conv_groups) {
+  // Note: conv_groups is accepted for API compatibility but is not used by
+  // this CPU reference implementation.
+
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+  int image_height = input->dims.dim_sizes[2];
+  int image_width = input->dims.dim_sizes[3];
+  int num_filters = filter->dims.dim_sizes[0];
+  int kernel_height = filter->dims.dim_sizes[2];
+  int kernel_width = filter->dims.dim_sizes[3];
+  int output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  int output_width = 1 + ((image_width - kernel_width + 2 * horizontal_pad) /
+                          horizontal_stride);
+  int num_filter_elem = kernel_height * kernel_width * channels;
+  int output_size = output_width * output_height;
+
+  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
+                                            output_height, output_width);
+  float *__restrict__ output_data = (float *)output->host_data;
+
+  long int conv_data_size = sizeof(float) * num_filter_elem * output_height *
+                            output_width * batch_size;
+  float *host_data = (float *)malloc(conv_data_size);
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
+      for (int h = 0; h < output_height; h++) {
+        for (int w = 0; w < output_width; w++) {
+          const int inH = h * vertical_stride - vertical_pad;
+          const int inW = w * horizontal_stride - horizontal_pad;
+          for (int i = 0; i < kernel_height; i++) {
+            for (int j = 0; j < kernel_width; j++) {
+              const int filter_elem_num =
+                  (ch * kernel_height + i) * kernel_width + j;
+              const int output_index = h * output_width + w;
+              const int out_index = b * num_filter_elem * output_size +
+                                    output_index * num_filter_elem +
+                                    filter_elem_num;
+              if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+                  inW + j < image_width) {
+                host_data[out_index] =
+                    host_image[((b * channels + ch) * image_height +
+                                (inH + i)) *
+                                   image_width +
+                               (inW + j)];
+              } else {
+                host_data[out_index] = 0;
+              }
             }
+          }
         }
+      }
     }
-    free(output_data);
-    free(host_data);
-
-    return full_output;
-}
-
-
-void* tensorConvApprox(void *input_ptr, void *filter_ptr, 
-                          int vertical_pad, int horizontal_pad, 
-                          int vertical_stride, int horizontal_stride, 
-                          int conv_mode, int compute_precision, 
-                          int row, int col, int skip_every, int start) {
-    if(row > 1) {
-        printf("ROW PERFORATION\n");
-        return tensorRowPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad,
-                        horizontal_pad, vertical_stride, horizontal_stride, conv_mode, 
-                        compute_precision, row, start);
-    } 
-    if(col > 1) {
-     printf("COL PERFORATION\n");
-     return tensorColPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad,
-                             horizontal_pad, vertical_stride, horizontal_stride, conv_mode, 
-                            compute_precision, col, start);
-    }  
-    if(skip_every > 1) {
-        printf("INPUT FILTERING\n");
-        Tensor *input = (Tensor *)input_ptr;
-        Tensor *filter = (Tensor *)filter_ptr;
-
-        const int kernel_height = filter->dims.dim_sizes[2];
-        const int kernel_width = filter->dims.dim_sizes[3];
-
-        if(!(kernel_height * kernel_width % skip_every)) {
-            return tensorRegularFilterSamplingConvolutionCPU(input_ptr, filter_ptr, 
-                                    vertical_pad, horizontal_pad, vertical_stride,
-                                    horizontal_stride, conv_mode, 
-                                    compute_precision, skip_every, start);
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        float sum = 0;
+#pragma omp simd reduction(+ : sum)
+        for (int k = 0; k < num_filter_elem; ++k) {
+          int input_index =
+              k + num_filter_elem * m + b * num_filter_elem * output_size;
+          sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
         }
-        return tensorIrregularFilterSamplingConvolutionCPU(input_ptr, filter_ptr, 
-                                    vertical_pad, horizontal_pad, vertical_stride, 
-                                    horizontal_stride, conv_mode, 
-                                    compute_precision, skip_every, start);
+        output_data[b * (output_size * num_filters) + p * output_size + m] =
+            sum;
+      }
     }
-    printf("REGULAR CONV\n");
-    return tensorRegularConvolutionCPU(input_ptr, filter_ptr, vertical_pad,
-                                 horizontal_pad, vertical_stride, 
-                                 horizontal_stride, conv_mode, compute_precision);
+  }
+  free(host_data);
+  return output;
 }
 
-void* tensorConvCutlassCPU(void* input_ptr, void* filter_ptr,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride,
-			int conv_mode, int conv_groups){
-	
-    Tensor *input = (Tensor *)input_ptr;
-    Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-
-    int batch_size = input->dims.dim_sizes[0];
-    int channels = input->dims.dim_sizes[1];
-    int image_height = input->dims.dim_sizes[2];
-    int image_width = input->dims.dim_sizes[3];
-    int num_filters = filter->dims.dim_sizes[0];
-    int kernel_height = filter->dims.dim_sizes[2];
-    int kernel_width = filter->dims.dim_sizes[3];
-    int output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    int output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    int num_filter_elem = kernel_height * kernel_width * channels;
-    int output_size = output_width * output_height;
-    
-    Tensor *output = (Tensor *) create4DTensor(0, 0, batch_size, num_filters, 
-                                                    output_height, output_width);
-    float * __restrict__ output_data = (float *)output->host_data;
-    
-    long int conv_data_size = 
-        sizeof(float) * num_filter_elem * output_height * output_width * batch_size;
-    float *host_data = (float *) malloc(conv_data_size);
-   
+void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
+  Tensor *x = (Tensor *)x_ptr;
+  Tensor *bias = (Tensor *)bias_ptr;
+
+  float *__restrict__ x_data = (float *)x->host_data;
+  float *__restrict__ bias_data = (float *)bias->host_data;
+  int n = x->dims.dim_sizes[0];
+  int c = x->dims.dim_sizes[1];
+  int h = x->dims.dim_sizes[2];
+  int w = x->dims.dim_sizes[3];
+
+  if (x->num_elems == bias->num_elems) {
+    int const1 = c * h * w;
+    int const2 = h * w;
     omp_set_num_threads(4);
-     #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-        for(int ch = 0; ch < channels; ch++) {
-            for(int h = 0; h < output_height; h++) {
-                for(int w = 0; w < output_width; w++) {
-                    const int inH = h * vertical_stride - vertical_pad;
-                    const int inW = w * horizontal_stride - horizontal_pad;
-                    for(int i = 0; i < kernel_height; i++) {
-                        for(int j = 0; j < kernel_width; j++) {
-                            const int filter_elem_num = (ch * kernel_height + i) * kernel_width + j;
-                            const int output_index = h * output_width + w;
-                            const int out_index = b * num_filter_elem * output_size 
-                                        + output_index * num_filter_elem + filter_elem_num;
-                            if(inH + i >= 0 && inH + i < image_height 
-                                && inW + j >= 0 && inW + j < image_width) {
-                                host_data[out_index] = 
-                                    host_image[((b * channels + ch) * image_height 
-                                        + (inH + i)) * image_width + (inW + j)];
-                            } else {
-                                host_data[out_index] = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-        for (int p = 0; p < num_filters; ++p) {
-             for (int m = 0; m < output_size; ++m) {
-                float sum = 0;
-                #pragma omp simd reduction(+:sum)
-                for (int k = 0; k < num_filter_elem; ++k) {
-                    int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size;
-                    sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
-                }
-                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
-            }
+#pragma omp parallel for
+    for (int i = 0; i < n; i++) {
+      for (int j = 0; j < c; j++) {
+#pragma omp simd collapse(2)
+        for (int k = 0; k < h; k++) {
+          for (int l = 0; l < w; l++) {
+            x_data[i * const1 + j * const2 + (k * w) + l] +=
+                bias_data[i * const1 + j * const2 + (k * w) + l];
+          }
         }
+      }
     }
-    free(host_data);
-    return output;
-}
-
-void* tensorAddCPU(void *x_ptr, void *bias_ptr) {
-    Tensor *x = (Tensor *)x_ptr;
-    Tensor *bias = (Tensor *)bias_ptr;
-    
-    float * __restrict__ x_data = (float *)x->host_data;
-    float * __restrict__ bias_data = (float *)bias->host_data;
-    int n = x->dims.dim_sizes[0];
-    int c = x->dims.dim_sizes[1];
-    int h = x->dims.dim_sizes[2];
-    int w = x->dims.dim_sizes[3];
-    
-    if(x->num_elems == bias->num_elems) {
-        int const1 = c * h * w;
-        int const2 = h * w;
-         omp_set_num_threads(4);
-        #pragma omp parallel for
-        for (int i = 0; i < n; i++) { 
-            for (int j = 0; j < c; j++) {
-                 #pragma omp simd collapse(2)
-                for (int k = 0; k < h; k++) {
-                    for (int l = 0; l < w; l++) {
-                        x_data[i * const1 + j * const2 + (k * w)  + l] += 
-                                bias_data[i * const1 + j * const2 + (k*w) + l];
-                    }
-                }
-            }
+  } else {
+    omp_set_num_threads(4);
+#pragma omp parallel for
+    for (int i = 0; i < n; i++) {
+      for (int j = 0; j < c; j++) {
+#pragma omp simd collapse(2)
+        for (int k = 0; k < h; k++) {
+          for (int l = 0; l < w; l++) {
+            x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j];
+          }
         }
-    } else {
-         omp_set_num_threads(4);
-        #pragma omp parallel for
-        for (int i = 0; i < n; i++) {
-            for (int j = 0; j < c; j++) {
-                #pragma omp simd collapse(2)
-                for (int k = 0; k < h; k++) {
-                    for (int l = 0; l < w; l++) {
-                        x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j];
-                    }
-                }
-            }
-        }   
+      }
     }
-    
-    return x;
+  }
+
+  return x;
 }
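tensorAddCPU picks its broadcast mode from the element counts: equal counts give an elementwise add, otherwise the bias is indexed per channel, so a [1, C, 1, 1] bias tensor is added to every (n, h, w) position of channel c. Hypothetical call sites (tensors assumed to be built elsewhere):

void *add_examples(void *conv_out, void *skip_branch, void *bias) {
  tensorAddCPU(conv_out, skip_branch); // same shape: elementwise add
  return tensorAddCPU(conv_out, bias); // [1, C, 1, 1] bias: per-channel add
  // Both calls mutate and return their first argument.
}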
 
 float maximum(float v1, float v2) __attribute__((always_inline));
-inline float maximum(float v1, float v2){
-    return (v1 < v2) ? v2 : v1;
-}
+inline float maximum(float v1, float v2) { return (v1 < v2) ? v2 : v1; }
 
 void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
-             int window_width, int vertical_pad, int horizontal_pad,
-                          int vertical_stride, int horizontal_stride) {
-   
-    Tensor *input = (Tensor *)input_ptr;
-    float * __restrict__ input_data = (float *)input->host_data;
-    
-    int batch_size = input->dims.dim_sizes[0];
-    int channels = input->dims.dim_sizes[1];
-    int image_height = input->dims.dim_sizes[2];
-    int image_width = input->dims.dim_sizes[3];
-    
-    int output_height = 
-        1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride);
-    int output_width = 
-        1 + ((image_width - window_width + 2 * horizontal_pad) / horizontal_stride);
-    
-    int center_x = (window_width - 1) / 2 - horizontal_pad;
-    int center_y = (window_height - 1) / 2 - vertical_pad;
-    int x_radius = (window_width - 1) / 2;
-    int y_radius = (window_height - 1) / 2;
-    
-    Tensor *output = (Tensor *) create4DTensor(0, 0, batch_size, channels, 
-                                                output_height, output_width);
-    float * __restrict__ output_data = (float *)output->host_data;
-   
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for (int b = 0; b < batch_size; b++) {
-        for (int ch = 0; ch < channels; ch++) {
-            int ii = 0, jj = 0;
-            for (int r = center_y; r < image_height + vertical_pad - y_radius; 
-                                                        r += vertical_stride) {
-                for (int c = center_x; c < image_width + horizontal_pad - x_radius; 
-                                                            c += horizontal_stride) {
-                    float val = (poolFunction == 0) ? -3.40282e+38 : 0;
-                    int y_radius_var = y_radius - r;
-                    int y_radius_var_max = y_radius_var + image_height;
-                    int x_radius_var = x_radius - c;
-                    int x_radius_var_max = x_radius_var + image_width;
-                    int ki_min = (y_radius_var > 0) ? 
-                        ((y_radius_var < window_height) ? y_radius_var : -1) : 0;
-                    int ki_max = (y_radius_var_max < window_height) ? 
-                                 ((y_radius_var_max >= 0) ?  y_radius_var_max : -1) : window_height;
-                    int kj_min = (x_radius_var > 0) ? 
-                                ((x_radius_var < window_width) ? x_radius_var : -1) : 0;
-                    int kj_max = (x_radius_var_max < window_width) ? 
-                                    ((x_radius_var_max >= 0) ?  x_radius_var_max : -1) : window_width;
-                                        
-                    if(ki_min != ki_max && kj_min != kj_max && ki_min != -1 
-                            && ki_max != -1 && kj_min != -1 && kj_max != -1) {
-                        if(!poolFunction) {
-                            for (int ki = 0; ki < window_height; ki++) {
-                                for (int kj = 0; kj < window_width; kj++) {
-                                    val = maximum(
-                                    val,
-                                    input_data[b * (channels * image_height * image_width) +
-                                    ch * (image_height * image_width) +
-                                    (r - y_radius + ki) * image_width + (c - x_radius + kj)]);
-                                }
-                            }
-                        } else {
-                            for (int ki = 0; ki < window_height; ki++) {
-                                for (int kj = 0; kj < window_width; kj++) {
-                                    val += input_data[b * (channels * image_height * image_width) 
-                                            + ch * (image_height * image_width) +
-                                            (r - y_radius + ki) * image_width + (c - x_radius + kj)];
-                                }
-                            }
-                        }
-                    }
-                    if (poolFunction == 1) {
-                        val /= window_height * window_width;
-                    }
-                    output_data[b * (channels * output_height * output_width) +
-                        ch * (output_height * output_width) + ii * output_width + jj] = val;
-                    jj++;
-                    if (jj == output_width) {
-                        jj = 0;
-                        ii++;
-                    }
+                       int window_width, int vertical_pad, int horizontal_pad,
+                       int vertical_stride, int horizontal_stride) {
+
+  Tensor *input = (Tensor *)input_ptr;
+  float *__restrict__ input_data = (float *)input->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+  int image_height = input->dims.dim_sizes[2];
+  int image_width = input->dims.dim_sizes[3];
+
+  int output_height =
+      1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride);
+  int output_width = 1 + ((image_width - window_width + 2 * horizontal_pad) /
+                          horizontal_stride);
+
+  int center_x = (window_width - 1) / 2 - horizontal_pad;
+  int center_y = (window_height - 1) / 2 - vertical_pad;
+  int x_radius = (window_width - 1) / 2;
+  int y_radius = (window_height - 1) / 2;
+
+  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, channels,
+                                            output_height, output_width);
+  float *__restrict__ output_data = (float *)output->host_data;
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
+      int ii = 0, jj = 0;
+      for (int r = center_y; r < image_height + vertical_pad - y_radius;
+           r += vertical_stride) {
+        for (int c = center_x; c < image_width + horizontal_pad - x_radius;
+             c += horizontal_stride) {
+          // Max pool seeds with ~-FLT_MAX; average pool accumulates from 0.
+          float val = (poolFunction == 0) ? -3.40282e+38 : 0;
+          int y_radius_var = y_radius - r;
+          int y_radius_var_max = y_radius_var + image_height;
+          int x_radius_var = x_radius - c;
+          int x_radius_var_max = x_radius_var + image_width;
+          int ki_min =
+              (y_radius_var > 0)
+                  ? ((y_radius_var < window_height) ? y_radius_var : -1)
+                  : 0;
+          int ki_max = (y_radius_var_max < window_height)
+                           ? ((y_radius_var_max >= 0) ? y_radius_var_max : -1)
+                           : window_height;
+          int kj_min = (x_radius_var > 0)
+                           ? ((x_radius_var < window_width) ? x_radius_var : -1)
+                           : 0;
+          int kj_max = (x_radius_var_max < window_width)
+                           ? ((x_radius_var_max >= 0) ? x_radius_var_max : -1)
+                           : window_width;
+
+          if (ki_min != ki_max && kj_min != kj_max && ki_min != -1 &&
+              ki_max != -1 && kj_min != -1 && kj_max != -1) {
+            if (!poolFunction) {
+              for (int ki = 0; ki < window_height; ki++) {
+                for (int kj = 0; kj < window_width; kj++) {
+                  val = maximum(
+                      val,
+                      input_data[b * (channels * image_height * image_width) +
+                                 ch * (image_height * image_width) +
+                                 (r - y_radius + ki) * image_width +
+                                 (c - x_radius + kj)]);
                 }
+              }
+            } else {
+              for (int ki = 0; ki < window_height; ki++) {
+                for (int kj = 0; kj < window_width; kj++) {
+                  val +=
+                      input_data[b * (channels * image_height * image_width) +
+                                 ch * (image_height * image_width) +
+                                 (r - y_radius + ki) * image_width +
+                                 (c - x_radius + kj)];
+                }
+              }
             }
+          }
+          if (poolFunction == 1) {
+            // Average pooling divides by the full window area, counting any
+            // padded (out-of-image) cells in the denominator.
+            val /= window_height * window_width;
+          }
+          output_data[b * (channels * output_height * output_width) +
+                      ch * (output_height * output_width) + ii * output_width +
+                      jj] = val;
+          jj++;
+          if (jj == output_width) {
+            jj = 0;
+            ii++;
+          }
         }
+      }
     }
-  
-    return output;
+  }
+
+  return output;
 }
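+
+// tensorTanhCPU applies tanhf element-wise, in place, and returns the input
+// tensor.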
 
 void *tensorTanhCPU(void *input_ptr) {
-    Tensor *input = (Tensor *)input_ptr;
-    
-    float *input_data = (float *)input->host_data;
-    size_t num_elems = input->num_elems;
-    
-     omp_set_num_threads(4);
-     #pragma omp parallel for
-    for (size_t i = 0; i < num_elems; i++) {
-        input_data[i] = tanhf(input_data[i]);
-    }
-   
-    return input;
+  Tensor *input = (Tensor *)input_ptr;
+
+  float *input_data = (float *)input->host_data;
+  size_t num_elems = input->num_elems;
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (size_t i = 0; i < num_elems; i++) {
+    input_data[i] = tanhf(input_data[i]);
+  }
+
+  return input;
 }
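+
+// tensorGemmCPU computes output = lhs x rhs, treating lhs as an (m x k)
+// matrix (all non-batch dims flattened into k) and rhs as (k x n). rhs is
+// first transposed into a scratch buffer so the inner dot product runs with
+// unit stride on both operands. Example (hypothetical shapes): lhs of dims
+// {32, 784, 1, 1} with rhs of dims {1, 1, 784, 10} yields a {32, 10, 1, 1}
+// output.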
 
 void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
-    Tensor *lhs = (Tensor *)lhs_ptr;
-    Tensor *rhs = (Tensor *)rhs_ptr;
-    //printf("GEMM lhs_ptr: %p\n", lhs_ptr);
-    //printf("GEMM rhs_ptr: %p\n", rhs_ptr);
-    
-    int m = lhs->dims.dim_sizes[0];
-    int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
-    int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];
-    
-    Tensor *output = (Tensor *)create4DTensor(0, 0, m, n, 1, 1);
-
-    float * __restrict__ lhs_arr = (float *)lhs->host_data;
-    float * __restrict__ rhs_arr = (float *)rhs->host_data;
-    float * __restrict__ output_arr = (float *)output->host_data;
-    
-    int k = 1;
-    #pragma unroll 4   // Can we unroll more???
-    for (int j = 1; j < lhs->dims.num_dims; j++) {
-        k = k * lhs->dims.dim_sizes[j]; // input neurons
-    }
-    //printf("unroll\n");
-    float *tran_rhs = (float *) malloc(sizeof(float) * k * n);
-   // printf("tran_rhs: %p\n", tran_rhs);
-   // printf("rhs_arr: %p\n", rhs_arr);
-   // printf("lhs_arr: %p\n", lhs_arr);
-    omp_set_num_threads(4);
-    #pragma omp parallel for simd
-    for (int l = 0; l < k; l++) {
-        for (int j = 0; j < n; j++) {
-            tran_rhs[j * k + l] = rhs_arr[l * n + j];
-        }   
+  Tensor *lhs = (Tensor *)lhs_ptr;
+  Tensor *rhs = (Tensor *)rhs_ptr;
+  // printf("GEMM lhs_ptr: %p\n", lhs_ptr);
+  // printf("GEMM rhs_ptr: %p\n", rhs_ptr);
+
+  int m = lhs->dims.dim_sizes[0];
+  int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
+  // rhs_k is the reduction dim of rhs; it should match k computed below.
+  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];
+
+  Tensor *output = (Tensor *)create4DTensor(0, 0, m, n, 1, 1);
+
+  float *__restrict__ lhs_arr = (float *)lhs->host_data;
+  float *__restrict__ rhs_arr = (float *)rhs->host_data;
+  float *__restrict__ output_arr = (float *)output->host_data;
+
+  int k = 1;
+  // Flatten all non-batch dims of lhs into the reduction length k.
+#pragma unroll 4
+  for (int j = 1; j < lhs->dims.num_dims; j++) {
+    k = k * lhs->dims.dim_sizes[j]; // input neurons
+  }
+  // printf("unroll\n");
+  float *tran_rhs = (float *)malloc(sizeof(float) * k * n);
+  // printf("tran_rhs: %p\n", tran_rhs);
+  // printf("rhs_arr: %p\n", rhs_arr);
+  // printf("lhs_arr: %p\n", lhs_arr);
+  omp_set_num_threads(4);
+#pragma omp parallel for simd
+  for (int l = 0; l < k; l++) {
+    for (int j = 0; j < n; j++) {
+      tran_rhs[j * k + l] = rhs_arr[l * n + j];
     }
-    //printf("TRANS\n");
-    #pragma omp parallel for
-    for (int i = 0; i < m; i++) {
-        for (int j = 0; j < n; j++) {
-           float sum = 0.0;
-          #pragma omp simd reduction(+:sum)
-           for (int l = 0; l < k; l++) {
-                sum += lhs_arr[i * k + l] * tran_rhs[j * k + l];
-            }
-            output_arr[i * n + j] = sum;
-        }
+  }
+// printf("TRANS\n");
+#pragma omp parallel for
+  for (int i = 0; i < m; i++) {
+    for (int j = 0; j < n; j++) {
+      float sum = 0.0;
+#pragma omp simd reduction(+ : sum)
+      for (int l = 0; l < k; l++) {
+        sum += lhs_arr[i * k + l] * tran_rhs[j * k + l];
+      }
+      output_arr[i * n + j] = sum;
     }
-    free(tran_rhs);
-    //printf("GEMM OUTPUT: %p\n", output);
-    return output;
+  }
+  free(tran_rhs);
+  // printf("GEMM OUTPUT: %p\n", output);
+  return output;
 }
 
 void *tensorSoftmaxCPU(void *input_ptr) {
-    Tensor *input = (Tensor *)input_ptr;
-    
-    float *logits = (float *)input->host_data;
-    int n = input->dims.dim_sizes[0];
-    int c = input->dims.dim_sizes[1];
-    
-     omp_set_num_threads(4);
-    #pragma omp parallel for
-    for (int i = 0; i < n; i++) {
-        float x = 0;
-        for(int j = i*c; j < c + i*c; j++) {
-            logits[j] = expf(logits[j]);
-        }
-       
-        #pragma omp simd reduction(+:x)
-        for(int j = i*c; j < i*c+c; j++) {
-            x += logits[j];
-        }
-        
-         #pragma omp simd
-        for(int j = i*c; j < i*c + c; j++) {
-            logits[j] /= x;
-        }                                                                                                                                                   
+  Tensor *input = (Tensor *)input_ptr;
+
+  float *logits = (float *)input->host_data;
+  int n = input->dims.dim_sizes[0];
+  int c = input->dims.dim_sizes[1];
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int i = 0; i < n; i++) {
+    float x = 0;
+    for (int j = i * c; j < c + i * c; j++) {
+      logits[j] = expf(logits[j]);
     }
 
-    return input;
-}
-
-void *tensorBatchNormCPU(void* input_ptr, void* gamma_ptr, void* beta_ptr,
-                         void* mean_ptr, void* variance_ptr, double epsilon) {
-    
-    Tensor* input = (Tensor*) input_ptr;
-    Tensor* gamma = (Tensor*) gamma_ptr;
-    Tensor* beta = (Tensor*) beta_ptr;
-    Tensor* mean = (Tensor*) mean_ptr;
-    Tensor* variance = (Tensor*) variance_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_beta = (float *)beta->host_data;
-    float * __restrict__ host_gamma = (float *)gamma->host_data;
-    float * __restrict__ host_mean = (float *)mean->host_data;
-    float * __restrict__ host_variance = (float *)variance->host_data;
-    
-    float alpha_val = 1.0f, beta_val = 0.0f;
-    size_t num_elems = input->num_elems;
-
-    int batch_size = input->dims.dim_sizes[0];
-    int channels = input->dims.dim_sizes[1];
-    int image_height = input->dims.dim_sizes[2];
-    int image_width = input->dims.dim_sizes[3];
-    int image_dim = image_height * image_width;
+#pragma omp simd reduction(+ : x)
+    for (int j = i * c; j < i * c + c; j++) {
+      x += logits[j];
+    }
 
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-        for(int ch = 0; ch < channels; ch++) {
-            float mean = 0;
-            #pragma omp simd reduction(+:mean)
-            for(int i = 0; i < image_dim; i++) {
-                int index = b * channels * image_dim + ch * image_dim + i;
-                mean += host_image[index];
-            }
-            mean = mean / channels;
-         
-            float variance = 0;
-            #pragma omp simd reduction(+:variance)
-            for(int i = 0; i < image_dim; i++) {
-                int index = b * channels * image_dim + ch * image_dim + i;
-                float tmp = host_image[index] - mean;
-                variance += (tmp * tmp);  
-            }
-            variance = variance / channels;
-            
-           #pragma omp simd 
-            for(int i = 0; i < image_dim; i++) {
-                int index = b * channels * image_dim + ch * image_dim + i;
-                host_image[index] = host_beta[ch] 
-                                  + (host_gamma[ch] * ((host_image[index] - mean) / sqrt(epsilon + variance)));
-            }
-        }
+#pragma omp simd
+    for (int j = i * c; j < i * c + c; j++) {
+      logits[j] /= x;
     }
-    return input;
+  }
+
+  return input;
 }
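+
+// tensorBatchNormCPU normalizes in place:
+//   y = gamma * (x - mu) / sqrt(var + epsilon) + beta,
+// where mu and var are recomputed per (batch, channel) over the spatial
+// dims. Note that the running statistics passed via mean_ptr/variance_ptr
+// are currently unused.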
 
- void *tensorReluCPU(void *input_ptr) {
-     Tensor *input = (Tensor *)input_ptr;
-     float *input_data = (float *)input->host_data;
-     size_t num_elems = input->num_elems;
-     
-     #pragma omp simd
-     for (size_t i = 0; i < num_elems; i++) {
-         input_data[i] = (input_data[i] < 0) ? 0 : input_data[i];
+void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr,
+                         void *mean_ptr, void *variance_ptr, double epsilon) {
+
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *gamma = (Tensor *)gamma_ptr;
+  Tensor *beta = (Tensor *)beta_ptr;
+  Tensor *mean = (Tensor *)mean_ptr;
+  Tensor *variance = (Tensor *)variance_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_beta = (float *)beta->host_data;
+  float *__restrict__ host_gamma = (float *)gamma->host_data;
+  float *__restrict__ host_mean = (float *)mean->host_data;
+  float *__restrict__ host_variance = (float *)variance->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+  int image_height = input->dims.dim_sizes[2];
+  int image_width = input->dims.dim_sizes[3];
+  int image_dim = image_height * image_width;
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
+      float mean = 0;
+#pragma omp simd reduction(+ : mean)
+      for (int i = 0; i < image_dim; i++) {
+        int index = b * channels * image_dim + ch * image_dim + i;
+        mean += host_image[index];
+      }
+      mean = mean / image_dim; // average over the spatial extent
+
+      float variance = 0;
+#pragma omp simd reduction(+ : variance)
+      for (int i = 0; i < image_dim; i++) {
+        int index = b * channels * image_dim + ch * image_dim + i;
+        float tmp = host_image[index] - mean;
+        variance += (tmp * tmp);
+      }
+      variance = variance / image_dim;
+
+#pragma omp simd
+      for (int i = 0; i < image_dim; i++) {
+        int index = b * channels * image_dim + ch * image_dim + i;
+        host_image[index] =
+            host_beta[ch] + (host_gamma[ch] * ((host_image[index] - mean) /
+                                               sqrt(epsilon + variance)));
+      }
     }
+  }
+  return input;
+}
+
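+// tensorReluCPU applies ReLU in place: negative entries are clamped to zero.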
+void *tensorReluCPU(void *input_ptr) {
+  Tensor *input = (Tensor *)input_ptr;
+  float *input_data = (float *)input->host_data;
+  size_t num_elems = input->num_elems;
 
-    return input;
+#pragma omp simd
+  for (size_t i = 0; i < num_elems; i++) {
+    input_data[i] = (input_data[i] < 0) ? 0 : input_data[i];
+  }
+
+  return input;
 }
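+
+// tensorRelu2CPU applies a clipped ReLU in place, clamping every element to
+// [min, max] (e.g. min = 0 and max = 6 gives ReLU6).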
 
 void *tensorRelu2CPU(void *input_ptr, float min, float max) {
-    Tensor *input = (Tensor *)input_ptr;
-    float *input_data = (float *)input->host_data;
-    size_t num_elems = input->num_elems;
-    
-    #pragma omp simd
-    for (size_t i = 0; i < num_elems; i++) {
-        input_data[i] = (input_data[i] < min) ? min : ((input_data[i] > max) ? 
-                                                        max : input_data[i]);
-    }       
-
-    return input;
-}         
\ No newline at end of file
+  Tensor *input = (Tensor *)input_ptr;
+  float *input_data = (float *)input->host_data;
+  size_t num_elems = input->num_elems;
+
+#pragma omp simd
+  for (size_t i = 0; i < num_elems; i++) {
+    input_data[i] = (input_data[i] < min)
+                        ? min
+                        : ((input_data[i] > max) ? max : input_data[i]);
+  }
+
+  return input;
+}
\ No newline at end of file