Apply clang-format to the whole source tree

2025-11-17 15:47:15 +00:00 · 2018-03-04 02:04:27 +01:00
parent 41571db894
commit 07b25ebb06
383 changed files with 8704 additions and 8229 deletions
--- a/src/algorithms/acquisition/gnuradio_blocks/galileo_pcps_8ms_acquisition_cc.cc
+++ b/src/algorithms/acquisition/gnuradio_blocks/galileo_pcps_8ms_acquisition_cc.cc
@@ -54,9 +54,10 @@ galileo_pcps_8ms_acquisition_cc::galileo_pcps_8ms_acquisition_cc(
    unsigned int sampled_ms, unsigned int max_dwells,
    unsigned int doppler_max, long freq, long fs_in,
    int samples_per_ms, int samples_per_code,
-    bool dump, std::string dump_filename) : gr::block("galileo_pcps_8ms_acquisition_cc",
-                                                gr::io_signature::make(1, 1, sizeof(gr_complex) * sampled_ms * samples_per_ms),
-                                                gr::io_signature::make(0, 0, sizeof(gr_complex) * sampled_ms * samples_per_ms))
+    bool dump,
+    std::string dump_filename) : gr::block("galileo_pcps_8ms_acquisition_cc",
+                                     gr::io_signature::make(1, 1, sizeof(gr_complex) * sampled_ms * samples_per_ms),
+                                     gr::io_signature::make(0, 0, sizeof(gr_complex) * sampled_ms * samples_per_ms))
 {
    this->message_port_register_out(pmt::mp("events"));
    d_sample_counter = 0;  // SAMPLE COUNTER
--- a/src/algorithms/acquisition/gnuradio_blocks/gps_pcps_acquisition_fpga_sc.cc
+++ b/src/algorithms/acquisition/gnuradio_blocks/gps_pcps_acquisition_fpga_sc.cc
@@ -71,9 +71,7 @@ gps_pcps_acquisition_fpga_sc::gps_pcps_acquisition_fpga_sc(
    int samples_per_code, int vector_length, unsigned int nsamples_total,
    bool bit_transition_flag, bool use_CFAR_algorithm_flag,
    unsigned int select_queue_Fpga, std::string device_name, bool dump,
-    std::string dump_filename) :
-
-                                 gr::block("pcps_acquisition_fpga_sc",
+    std::string dump_filename) : gr::block("pcps_acquisition_fpga_sc",
                                     gr::io_signature::make(0, 0, sizeof(lv_16sc_t)),
                                     gr::io_signature::make(0, 0, 0))
 {
--- a/src/algorithms/acquisition/gnuradio_blocks/pcps_acquisition.cc
+++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_acquisition.cc
@@ -65,9 +65,10 @@ pcps_acquisition::pcps_acquisition(
    int samples_per_ms, int samples_per_code,
    bool bit_transition_flag, bool use_CFAR_algorithm_flag,
    bool dump, bool blocking,
-    std::string dump_filename, size_t it_size) : gr::block("pcps_acquisition",
-                                                     gr::io_signature::make(1, 1, it_size * sampled_ms * samples_per_ms * (bit_transition_flag ? 2 : 1)),
-                                                     gr::io_signature::make(0, 0, it_size * sampled_ms * samples_per_ms * (bit_transition_flag ? 2 : 1)))
+    std::string dump_filename,
+    size_t it_size) : gr::block("pcps_acquisition",
+                          gr::io_signature::make(1, 1, it_size * sampled_ms * samples_per_ms * (bit_transition_flag ? 2 : 1)),
+                          gr::io_signature::make(0, 0, it_size * sampled_ms * samples_per_ms * (bit_transition_flag ? 2 : 1)))
 {
    this->message_port_register_out(pmt::mp("events"));

--- a/src/algorithms/acquisition/gnuradio_blocks/pcps_cccwsr_acquisition_cc.cc
+++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_cccwsr_acquisition_cc.cc
@@ -61,9 +61,10 @@ pcps_cccwsr_acquisition_cc::pcps_cccwsr_acquisition_cc(
    unsigned int sampled_ms, unsigned int max_dwells,
    unsigned int doppler_max, long freq, long fs_in,
    int samples_per_ms, int samples_per_code,
-    bool dump, std::string dump_filename) : gr::block("pcps_cccwsr_acquisition_cc",
-                                                gr::io_signature::make(1, 1, sizeof(gr_complex) * sampled_ms * samples_per_ms),
-                                                gr::io_signature::make(0, 0, sizeof(gr_complex) * sampled_ms * samples_per_ms))
+    bool dump,
+    std::string dump_filename) : gr::block("pcps_cccwsr_acquisition_cc",
+                                     gr::io_signature::make(1, 1, sizeof(gr_complex) * sampled_ms * samples_per_ms),
+                                     gr::io_signature::make(0, 0, sizeof(gr_complex) * sampled_ms * samples_per_ms))
 {
    this->message_port_register_out(pmt::mp("events"));
    d_sample_counter = 0;  // SAMPLE COUNTER
--- a/src/algorithms/acquisition/gnuradio_blocks/pcps_quicksync_acquisition_cc.cc
+++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_quicksync_acquisition_cc.cc
@@ -67,9 +67,10 @@ pcps_quicksync_acquisition_cc::pcps_quicksync_acquisition_cc(
    unsigned int doppler_max, long freq, long fs_in,
    int samples_per_ms, int samples_per_code,
    bool bit_transition_flag,
-    bool dump, std::string dump_filename) : gr::block("pcps_quicksync_acquisition_cc",
-                                                gr::io_signature::make(1, 1, (sizeof(gr_complex) * sampled_ms * samples_per_ms)),
-                                                gr::io_signature::make(0, 0, (sizeof(gr_complex) * sampled_ms * samples_per_ms)))
+    bool dump,
+    std::string dump_filename) : gr::block("pcps_quicksync_acquisition_cc",
+                                     gr::io_signature::make(1, 1, (sizeof(gr_complex) * sampled_ms * samples_per_ms)),
+                                     gr::io_signature::make(0, 0, (sizeof(gr_complex) * sampled_ms * samples_per_ms)))
 {
    this->message_port_register_out(pmt::mp("events"));
    d_sample_counter = 0;  // SAMPLE COUNTER
--- a/src/algorithms/acquisition/gnuradio_blocks/pcps_tong_acquisition_cc.cc
+++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_tong_acquisition_cc.cc
@@ -76,9 +76,10 @@ pcps_tong_acquisition_cc::pcps_tong_acquisition_cc(
    long freq, long fs_in, int samples_per_ms,
    int samples_per_code, unsigned int tong_init_val,
    unsigned int tong_max_val, unsigned int tong_max_dwells,
-    bool dump, std::string dump_filename) : gr::block("pcps_tong_acquisition_cc",
-                                                gr::io_signature::make(1, 1, sizeof(gr_complex) * sampled_ms * samples_per_ms),
-                                                gr::io_signature::make(0, 0, sizeof(gr_complex) * sampled_ms * samples_per_ms))
+    bool dump,
+    std::string dump_filename) : gr::block("pcps_tong_acquisition_cc",
+                                     gr::io_signature::make(1, 1, sizeof(gr_complex) * sampled_ms * samples_per_ms),
+                                     gr::io_signature::make(0, 0, sizeof(gr_complex) * sampled_ms * samples_per_ms))
 {
    this->message_port_register_out(pmt::mp("events"));
    d_sample_counter = 0;  // SAMPLE COUNTER
--- a/src/algorithms/libs/opencl/clFFT.h
+++ b/src/algorithms/libs/opencl/clFFT.h
@@ -50,85 +50,86 @@
 #define __CLFFT_H

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

 #include <stdio.h>

 #ifdef __APPLE__
-    #include <OpenCL/opencl.h>
+#include <OpenCL/opencl.h>
 #else
-    #include <CL/cl.h>
+#include <CL/cl.h>
 #endif

-// XForm type
-typedef enum 
-{
-	clFFT_Forward	= 	-1,
-	clFFT_Inverse	= 	 1
-	
-}clFFT_Direction;
+    // XForm type
+    typedef enum
+    {
+        clFFT_Forward = -1,
+        clFFT_Inverse = 1

-// XForm dimension
-typedef enum
-{
-	clFFT_1D	= 0,
-	clFFT_2D	= 1,
-	clFFT_3D	= 3
-	
-}clFFT_Dimension;
+    } clFFT_Direction;

-// XForm Data type
-typedef enum
-{
-	clFFT_SplitComplexFormat       = 0,
-	clFFT_InterleavedComplexFormat = 1
-}clFFT_DataFormat;
+    // XForm dimension
+    typedef enum
+    {
+        clFFT_1D = 0,
+        clFFT_2D = 1,
+        clFFT_3D = 3

-typedef struct
-{
-	unsigned int x;
-	unsigned int y;
-	unsigned int z;
-}clFFT_Dim3;	
-	
-typedef struct
-{
-	float *real;
-	float *imag;
-} clFFT_SplitComplex;
+    } clFFT_Dimension;

-typedef struct
-{
-	float real;
-	float imag;
-}clFFT_Complex;
+    // XForm Data type
+    typedef enum
+    {
+        clFFT_SplitComplexFormat = 0,
+        clFFT_InterleavedComplexFormat = 1
+    } clFFT_DataFormat;

-typedef void* clFFT_Plan;	
+    typedef struct
+    {
+        unsigned int x;
+        unsigned int y;
+        unsigned int z;
+    } clFFT_Dim3;

-clFFT_Plan clFFT_CreatePlan( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code );
+    typedef struct
+    {
+        float *real;
+        float *imag;
+    } clFFT_SplitComplex;

-void clFFT_DestroyPlan( clFFT_Plan plan );
+    typedef struct
+    {
+        float real;
+        float imag;
+    } clFFT_Complex;

-cl_int clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, 
-								 cl_mem data_in, cl_mem data_out,
-								 cl_int num_events, cl_event *event_list, cl_event *event );
+    typedef void *clFFT_Plan;

-cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, 
-							 cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
-							 cl_int num_events, cl_event *event_list, cl_event *event );
+    clFFT_Plan clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code);

-cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, 
-						        size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
-	
+    void clFFT_DestroyPlan(clFFT_Plan plan);

-cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, 
-					        size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
-	
-void clFFT_DumpPlan( clFFT_Plan plan, FILE *file);	
+    cl_int clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir,
+        cl_mem data_in, cl_mem data_out,
+        cl_int num_events, cl_event *event_list, cl_event *event);
+
+    cl_int clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir,
+        cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
+        cl_int num_events, cl_event *event_list, cl_event *event);
+
+    cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
+        size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
+
+
+    cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
+        size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
+
+    void clFFT_DumpPlan(clFFT_Plan plan, FILE *file);

 #ifdef __cplusplus
 }
 #endif

-#endif 
+#endif
--- a/src/algorithms/libs/opencl/fft_base_kernels.h
+++ b/src/algorithms/libs/opencl/fft_base_kernels.h
@@ -54,224 +54,220 @@
 using namespace std;

 static string baseKernels = string(
-                          "#ifndef M_PI\n"
-						  "#define M_PI 0x1.921fb54442d18p+1\n"
-						  "#endif\n"
-						  "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n"
-						  "#define conj(a) ((float2)((a).x, -(a).y))\n"
-						  "#define conjTransp(a) ((float2)(-(a).y, (a).x))\n"		   
-						  "\n"
-						  "#define fftKernel2(a,dir) \\\n"
-						  "{ \\\n"
-						  "    float2 c = (a)[0];    \\\n"
-						  "    (a)[0] = c + (a)[1];  \\\n"
-						  "    (a)[1] = c - (a)[1];  \\\n"
-						  "}\n"
-						  "\n"						  
-						  "#define fftKernel2S(d1,d2,dir) \\\n"
-						  "{ \\\n"
-						  "    float2 c = (d1);   \\\n"
-						  "    (d1) = c + (d2);   \\\n"
-						  "    (d2) = c - (d2);   \\\n"
-						  "}\n"
-						  "\n"						  
-						  "#define fftKernel4(a,dir) \\\n"
-						  "{ \\\n"
-						  "    fftKernel2S((a)[0], (a)[2], dir); \\\n"
-						  "    fftKernel2S((a)[1], (a)[3], dir); \\\n"
-						  "    fftKernel2S((a)[0], (a)[1], dir); \\\n"
-						  "    (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
-						  "    fftKernel2S((a)[2], (a)[3], dir); \\\n"
-						  "    float2 c = (a)[1]; \\\n"
-						  "    (a)[1] = (a)[2]; \\\n"
-						  "    (a)[2] = c; \\\n"
-						  "}\n"
-						  "\n"						  
-						  "#define fftKernel4s(a0,a1,a2,a3,dir) \\\n"
-						  "{ \\\n"
-						  "    fftKernel2S((a0), (a2), dir); \\\n"
-						  "    fftKernel2S((a1), (a3), dir); \\\n"
-						  "    fftKernel2S((a0), (a1), dir); \\\n"
-						  "    (a3) = (float2)(dir)*(conjTransp((a3))); \\\n"
-						  "    fftKernel2S((a2), (a3), dir); \\\n"
-						  "    float2 c = (a1); \\\n"
-						  "    (a1) = (a2); \\\n"
-						  "    (a2) = c; \\\n" 
-						  "}\n"
-						  "\n"						  
-						  "#define bitreverse8(a) \\\n"
-						  "{ \\\n"
-						  "    float2 c; \\\n"
-						  "    c = (a)[1]; \\\n"
-						  "    (a)[1] = (a)[4]; \\\n"
-						  "    (a)[4] = c; \\\n"
-						  "    c = (a)[3]; \\\n"
-						  "    (a)[3] = (a)[6]; \\\n"
-						  "    (a)[6] = c; \\\n"
-						  "}\n"
-						  "\n"						  
-						  "#define fftKernel8(a,dir) \\\n"
-						  "{ \\\n"
-						  "	const float2 w1  = (float2)(0x1.6a09e6p-1f,  dir*0x1.6a09e6p-1f);  \\\n"
-						  "	const float2 w3  = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f);  \\\n"
-						  "	float2 c; \\\n"
-						  "	fftKernel2S((a)[0], (a)[4], dir); \\\n"
-						  "	fftKernel2S((a)[1], (a)[5], dir); \\\n"
-						  "	fftKernel2S((a)[2], (a)[6], dir); \\\n"
-						  "	fftKernel2S((a)[3], (a)[7], dir); \\\n"
-						  "	(a)[5] = complexMul(w1, (a)[5]); \\\n"
-						  "	(a)[6] = (float2)(dir)*(conjTransp((a)[6])); \\\n"
-						  "	(a)[7] = complexMul(w3, (a)[7]); \\\n"
-						  "	fftKernel2S((a)[0], (a)[2], dir); \\\n"
-						  "	fftKernel2S((a)[1], (a)[3], dir); \\\n"
-						  "	fftKernel2S((a)[4], (a)[6], dir); \\\n"
-						  "	fftKernel2S((a)[5], (a)[7], dir); \\\n"
-						  "	(a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
-						  "	(a)[7] = (float2)(dir)*(conjTransp((a)[7])); \\\n"
-						  "	fftKernel2S((a)[0], (a)[1], dir); \\\n"
-						  "	fftKernel2S((a)[2], (a)[3], dir); \\\n"
-						  "	fftKernel2S((a)[4], (a)[5], dir); \\\n"
-						  "	fftKernel2S((a)[6], (a)[7], dir); \\\n"
-						  "	bitreverse8((a)); \\\n"
-						  "}\n"
-						  "\n"						  
-						  "#define bitreverse4x4(a) \\\n"
-						  "{ \\\n"
-						  "	float2 c; \\\n"
-						  "	c = (a)[1];  (a)[1]  = (a)[4];  (a)[4]  = c; \\\n"
-						  "	c = (a)[2];  (a)[2]  = (a)[8];  (a)[8]  = c; \\\n"
-						  "	c = (a)[3];  (a)[3]  = (a)[12]; (a)[12] = c; \\\n"
-						  "	c = (a)[6];  (a)[6]  = (a)[9];  (a)[9]  = c; \\\n"
-						  "	c = (a)[7];  (a)[7]  = (a)[13]; (a)[13] = c; \\\n"
-						  "	c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n"
-						  "}\n"
-						  "\n"						  
-						  "#define fftKernel16(a,dir) \\\n"
-						  "{ \\\n"
-						  "    const float w0 = 0x1.d906bcp-1f; \\\n"
-						  "    const float w1 = 0x1.87de2ap-2f; \\\n"
-						  "    const float w2 = 0x1.6a09e6p-1f; \\\n"
-						  "    fftKernel4s((a)[0], (a)[4], (a)[8],  (a)[12], dir); \\\n"
-						  "    fftKernel4s((a)[1], (a)[5], (a)[9],  (a)[13], dir); \\\n"
-						  "    fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \\\n"
-						  "    fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \\\n"
-						  "    (a)[5]  = complexMul((a)[5], (float2)(w0, dir*w1)); \\\n"
-						  "    (a)[6]  = complexMul((a)[6], (float2)(w2, dir*w2)); \\\n"
-						  "    (a)[7]  = complexMul((a)[7], (float2)(w1, dir*w0)); \\\n"
-						  "    (a)[9]  = complexMul((a)[9], (float2)(w2, dir*w2)); \\\n"
-						  "    (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \\\n"
-						  "    (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \\\n"
-						  "    (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \\\n"
-						  "    (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \\\n"
-						  "    (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \\\n"
-						  "    fftKernel4((a), dir); \\\n"
-						  "    fftKernel4((a) + 4, dir); \\\n"
-						  "    fftKernel4((a) + 8, dir); \\\n"
-						  "    fftKernel4((a) + 12, dir); \\\n"
-						  "    bitreverse4x4((a)); \\\n"
-						  "}\n"
-						  "\n"						  
-						  "#define bitreverse32(a) \\\n"
-						  "{ \\\n"
-						  "    float2 c1, c2; \\\n"
-						  "    c1 = (a)[2];   (a)[2] = (a)[1];   c2 = (a)[4];   (a)[4] = c1;   c1 = (a)[8];   (a)[8] = c2;    c2 = (a)[16];  (a)[16] = c1;   (a)[1] = c2; \\\n"
-						  "    c1 = (a)[6];   (a)[6] = (a)[3];   c2 = (a)[12];  (a)[12] = c1;  c1 = (a)[24];  (a)[24] = c2;   c2 = (a)[17];  (a)[17] = c1;   (a)[3] = c2; \\\n"
-						  "    c1 = (a)[10];  (a)[10] = (a)[5];  c2 = (a)[20];  (a)[20] = c1;  c1 = (a)[9];   (a)[9] = c2;    c2 = (a)[18];  (a)[18] = c1;   (a)[5] = c2; \\\n"
-						  "    c1 = (a)[14];  (a)[14] = (a)[7];  c2 = (a)[28];  (a)[28] = c1;  c1 = (a)[25];  (a)[25] = c2;   c2 = (a)[19];  (a)[19] = c1;   (a)[7] = c2; \\\n"
-						  "    c1 = (a)[22];  (a)[22] = (a)[11]; c2 = (a)[13];  (a)[13] = c1;  c1 = (a)[26];  (a)[26] = c2;   c2 = (a)[21];  (a)[21] = c1;   (a)[11] = c2; \\\n"
-						  "    c1 = (a)[30];  (a)[30] = (a)[15]; c2 = (a)[29];  (a)[29] = c1;  c1 = (a)[27];  (a)[27] = c2;   c2 = (a)[23];  (a)[23] = c1;   (a)[15] = c2; \\\n"
-						  "}\n"
-						  "\n"						  
-						  "#define fftKernel32(a,dir) \\\n"
-						  "{ \\\n"
-						  "    fftKernel2S((a)[0],  (a)[16], dir); \\\n"
-						  "    fftKernel2S((a)[1],  (a)[17], dir); \\\n"
-						  "    fftKernel2S((a)[2],  (a)[18], dir); \\\n"
-						  "    fftKernel2S((a)[3],  (a)[19], dir); \\\n"
-						  "    fftKernel2S((a)[4],  (a)[20], dir); \\\n"
-						  "    fftKernel2S((a)[5],  (a)[21], dir); \\\n"
-						  "    fftKernel2S((a)[6],  (a)[22], dir); \\\n"
-						  "    fftKernel2S((a)[7],  (a)[23], dir); \\\n"
-						  "    fftKernel2S((a)[8],  (a)[24], dir); \\\n"
-						  "    fftKernel2S((a)[9],  (a)[25], dir); \\\n"
-						  "    fftKernel2S((a)[10], (a)[26], dir); \\\n"
-						  "    fftKernel2S((a)[11], (a)[27], dir); \\\n"
-						  "    fftKernel2S((a)[12], (a)[28], dir); \\\n"
-						  "    fftKernel2S((a)[13], (a)[29], dir); \\\n"
-						  "    fftKernel2S((a)[14], (a)[30], dir); \\\n"
-						  "    fftKernel2S((a)[15], (a)[31], dir); \\\n"
-						  "    (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
-						  "    (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
-						  "    (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
-						  "    (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
-						  "    (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
-						  "    (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
-						  "    (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
-						  "    (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \\\n"
-						  "    (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
-						  "    (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
-						  "    (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
-						  "    (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
-						  "    (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
-						  "    (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
-						  "    (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
-						  "    fftKernel16((a), dir); \\\n"
-						  "    fftKernel16((a) + 16, dir); \\\n"
-						  "    bitreverse32((a)); \\\n"
-						  "}\n\n"
-						  );
+    "#ifndef M_PI\n"
+    "#define M_PI 0x1.921fb54442d18p+1\n"
+    "#endif\n"
+    "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n"
+    "#define conj(a) ((float2)((a).x, -(a).y))\n"
+    "#define conjTransp(a) ((float2)(-(a).y, (a).x))\n"
+    "\n"
+    "#define fftKernel2(a,dir) \\\n"
+    "{ \\\n"
+    "    float2 c = (a)[0];    \\\n"
+    "    (a)[0] = c + (a)[1];  \\\n"
+    "    (a)[1] = c - (a)[1];  \\\n"
+    "}\n"
+    "\n"
+    "#define fftKernel2S(d1,d2,dir) \\\n"
+    "{ \\\n"
+    "    float2 c = (d1);   \\\n"
+    "    (d1) = c + (d2);   \\\n"
+    "    (d2) = c - (d2);   \\\n"
+    "}\n"
+    "\n"
+    "#define fftKernel4(a,dir) \\\n"
+    "{ \\\n"
+    "    fftKernel2S((a)[0], (a)[2], dir); \\\n"
+    "    fftKernel2S((a)[1], (a)[3], dir); \\\n"
+    "    fftKernel2S((a)[0], (a)[1], dir); \\\n"
+    "    (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
+    "    fftKernel2S((a)[2], (a)[3], dir); \\\n"
+    "    float2 c = (a)[1]; \\\n"
+    "    (a)[1] = (a)[2]; \\\n"
+    "    (a)[2] = c; \\\n"
+    "}\n"
+    "\n"
+    "#define fftKernel4s(a0,a1,a2,a3,dir) \\\n"
+    "{ \\\n"
+    "    fftKernel2S((a0), (a2), dir); \\\n"
+    "    fftKernel2S((a1), (a3), dir); \\\n"
+    "    fftKernel2S((a0), (a1), dir); \\\n"
+    "    (a3) = (float2)(dir)*(conjTransp((a3))); \\\n"
+    "    fftKernel2S((a2), (a3), dir); \\\n"
+    "    float2 c = (a1); \\\n"
+    "    (a1) = (a2); \\\n"
+    "    (a2) = c; \\\n"
+    "}\n"
+    "\n"
+    "#define bitreverse8(a) \\\n"
+    "{ \\\n"
+    "    float2 c; \\\n"
+    "    c = (a)[1]; \\\n"
+    "    (a)[1] = (a)[4]; \\\n"
+    "    (a)[4] = c; \\\n"
+    "    c = (a)[3]; \\\n"
+    "    (a)[3] = (a)[6]; \\\n"
+    "    (a)[6] = c; \\\n"
+    "}\n"
+    "\n"
+    "#define fftKernel8(a,dir) \\\n"
+    "{ \\\n"
+    "	const float2 w1  = (float2)(0x1.6a09e6p-1f,  dir*0x1.6a09e6p-1f);  \\\n"
+    "	const float2 w3  = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f);  \\\n"
+    "	float2 c; \\\n"
+    "	fftKernel2S((a)[0], (a)[4], dir); \\\n"
+    "	fftKernel2S((a)[1], (a)[5], dir); \\\n"
+    "	fftKernel2S((a)[2], (a)[6], dir); \\\n"
+    "	fftKernel2S((a)[3], (a)[7], dir); \\\n"
+    "	(a)[5] = complexMul(w1, (a)[5]); \\\n"
+    "	(a)[6] = (float2)(dir)*(conjTransp((a)[6])); \\\n"
+    "	(a)[7] = complexMul(w3, (a)[7]); \\\n"
+    "	fftKernel2S((a)[0], (a)[2], dir); \\\n"
+    "	fftKernel2S((a)[1], (a)[3], dir); \\\n"
+    "	fftKernel2S((a)[4], (a)[6], dir); \\\n"
+    "	fftKernel2S((a)[5], (a)[7], dir); \\\n"
+    "	(a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
+    "	(a)[7] = (float2)(dir)*(conjTransp((a)[7])); \\\n"
+    "	fftKernel2S((a)[0], (a)[1], dir); \\\n"
+    "	fftKernel2S((a)[2], (a)[3], dir); \\\n"
+    "	fftKernel2S((a)[4], (a)[5], dir); \\\n"
+    "	fftKernel2S((a)[6], (a)[7], dir); \\\n"
+    "	bitreverse8((a)); \\\n"
+    "}\n"
+    "\n"
+    "#define bitreverse4x4(a) \\\n"
+    "{ \\\n"
+    "	float2 c; \\\n"
+    "	c = (a)[1];  (a)[1]  = (a)[4];  (a)[4]  = c; \\\n"
+    "	c = (a)[2];  (a)[2]  = (a)[8];  (a)[8]  = c; \\\n"
+    "	c = (a)[3];  (a)[3]  = (a)[12]; (a)[12] = c; \\\n"
+    "	c = (a)[6];  (a)[6]  = (a)[9];  (a)[9]  = c; \\\n"
+    "	c = (a)[7];  (a)[7]  = (a)[13]; (a)[13] = c; \\\n"
+    "	c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n"
+    "}\n"
+    "\n"
+    "#define fftKernel16(a,dir) \\\n"
+    "{ \\\n"
+    "    const float w0 = 0x1.d906bcp-1f; \\\n"
+    "    const float w1 = 0x1.87de2ap-2f; \\\n"
+    "    const float w2 = 0x1.6a09e6p-1f; \\\n"
+    "    fftKernel4s((a)[0], (a)[4], (a)[8],  (a)[12], dir); \\\n"
+    "    fftKernel4s((a)[1], (a)[5], (a)[9],  (a)[13], dir); \\\n"
+    "    fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \\\n"
+    "    fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \\\n"
+    "    (a)[5]  = complexMul((a)[5], (float2)(w0, dir*w1)); \\\n"
+    "    (a)[6]  = complexMul((a)[6], (float2)(w2, dir*w2)); \\\n"
+    "    (a)[7]  = complexMul((a)[7], (float2)(w1, dir*w0)); \\\n"
+    "    (a)[9]  = complexMul((a)[9], (float2)(w2, dir*w2)); \\\n"
+    "    (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \\\n"
+    "    (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \\\n"
+    "    (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \\\n"
+    "    (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \\\n"
+    "    (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \\\n"
+    "    fftKernel4((a), dir); \\\n"
+    "    fftKernel4((a) + 4, dir); \\\n"
+    "    fftKernel4((a) + 8, dir); \\\n"
+    "    fftKernel4((a) + 12, dir); \\\n"
+    "    bitreverse4x4((a)); \\\n"
+    "}\n"
+    "\n"
+    "#define bitreverse32(a) \\\n"
+    "{ \\\n"
+    "    float2 c1, c2; \\\n"
+    "    c1 = (a)[2];   (a)[2] = (a)[1];   c2 = (a)[4];   (a)[4] = c1;   c1 = (a)[8];   (a)[8] = c2;    c2 = (a)[16];  (a)[16] = c1;   (a)[1] = c2; \\\n"
+    "    c1 = (a)[6];   (a)[6] = (a)[3];   c2 = (a)[12];  (a)[12] = c1;  c1 = (a)[24];  (a)[24] = c2;   c2 = (a)[17];  (a)[17] = c1;   (a)[3] = c2; \\\n"
+    "    c1 = (a)[10];  (a)[10] = (a)[5];  c2 = (a)[20];  (a)[20] = c1;  c1 = (a)[9];   (a)[9] = c2;    c2 = (a)[18];  (a)[18] = c1;   (a)[5] = c2; \\\n"
+    "    c1 = (a)[14];  (a)[14] = (a)[7];  c2 = (a)[28];  (a)[28] = c1;  c1 = (a)[25];  (a)[25] = c2;   c2 = (a)[19];  (a)[19] = c1;   (a)[7] = c2; \\\n"
+    "    c1 = (a)[22];  (a)[22] = (a)[11]; c2 = (a)[13];  (a)[13] = c1;  c1 = (a)[26];  (a)[26] = c2;   c2 = (a)[21];  (a)[21] = c1;   (a)[11] = c2; \\\n"
+    "    c1 = (a)[30];  (a)[30] = (a)[15]; c2 = (a)[29];  (a)[29] = c1;  c1 = (a)[27];  (a)[27] = c2;   c2 = (a)[23];  (a)[23] = c1;   (a)[15] = c2; \\\n"
+    "}\n"
+    "\n"
+    "#define fftKernel32(a,dir) \\\n"
+    "{ \\\n"
+    "    fftKernel2S((a)[0],  (a)[16], dir); \\\n"
+    "    fftKernel2S((a)[1],  (a)[17], dir); \\\n"
+    "    fftKernel2S((a)[2],  (a)[18], dir); \\\n"
+    "    fftKernel2S((a)[3],  (a)[19], dir); \\\n"
+    "    fftKernel2S((a)[4],  (a)[20], dir); \\\n"
+    "    fftKernel2S((a)[5],  (a)[21], dir); \\\n"
+    "    fftKernel2S((a)[6],  (a)[22], dir); \\\n"
+    "    fftKernel2S((a)[7],  (a)[23], dir); \\\n"
+    "    fftKernel2S((a)[8],  (a)[24], dir); \\\n"
+    "    fftKernel2S((a)[9],  (a)[25], dir); \\\n"
+    "    fftKernel2S((a)[10], (a)[26], dir); \\\n"
+    "    fftKernel2S((a)[11], (a)[27], dir); \\\n"
+    "    fftKernel2S((a)[12], (a)[28], dir); \\\n"
+    "    fftKernel2S((a)[13], (a)[29], dir); \\\n"
+    "    fftKernel2S((a)[14], (a)[30], dir); \\\n"
+    "    fftKernel2S((a)[15], (a)[31], dir); \\\n"
+    "    (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
+    "    (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
+    "    (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
+    "    (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
+    "    (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
+    "    (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
+    "    (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
+    "    (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \\\n"
+    "    (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
+    "    (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
+    "    (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
+    "    (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
+    "    (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
+    "    (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
+    "    (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
+    "    fftKernel16((a), dir); \\\n"
+    "    fftKernel16((a) + 16, dir); \\\n"
+    "    bitreverse32((a)); \\\n"
+    "}\n\n");

 static string twistKernelInterleaved = string(
-											  "__kernel void \\\n"
-											  "clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
-											  "{ \\\n"
-											  "   float2 a, w; \\\n"
-											  "   float ang; \\\n"
-											  "   unsigned int j; \\\n"
-											  "	unsigned int i = get_global_id(0); \\\n"
-											  "	unsigned int startIndex = i; \\\n"
-											  "	 \\\n"
-											  "	if(i < numCols) \\\n"
-											  "	{ \\\n"
-											  "	    for(j = 0; j < numRowsToProcess; j++) \\\n"
-											  "	    { \\\n"
-											  "	        a = in[startIndex]; \\\n"
-											  "	        ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
-											  "	        w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
-											  "	        a = complexMul(a, w); \\\n"
-											  "	        in[startIndex] = a; \\\n"
-											  "	        startIndex += numCols; \\\n"
-											  "	    } \\\n"
-											  "	}	 \\\n"
-											  "} \\\n"
-											  );
+    "__kernel void \\\n"
+    "clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
+    "{ \\\n"
+    "   float2 a, w; \\\n"
+    "   float ang; \\\n"
+    "   unsigned int j; \\\n"
+    "	unsigned int i = get_global_id(0); \\\n"
+    "	unsigned int startIndex = i; \\\n"
+    "	 \\\n"
+    "	if(i < numCols) \\\n"
+    "	{ \\\n"
+    "	    for(j = 0; j < numRowsToProcess; j++) \\\n"
+    "	    { \\\n"
+    "	        a = in[startIndex]; \\\n"
+    "	        ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
+    "	        w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
+    "	        a = complexMul(a, w); \\\n"
+    "	        in[startIndex] = a; \\\n"
+    "	        startIndex += numCols; \\\n"
+    "	    } \\\n"
+    "	}	 \\\n"
+    "} \\\n");

 static string twistKernelPlannar = string(
-										  "__kernel void \\\n"
-										  "clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
-										  "{ \\\n"
-										  "    float2 a, w; \\\n"
-										  "    float ang; \\\n"
-										  "    unsigned int j; \\\n"
-										  "	unsigned int i = get_global_id(0); \\\n"
-										  "	unsigned int startIndex = i; \\\n"
-										  "	 \\\n"
-										  "	if(i < numCols) \\\n"
-										  "	{ \\\n"
-										  "	    for(j = 0; j < numRowsToProcess; j++) \\\n"
-										  "	    { \\\n"
-										  "	        a = (float2)(in_real[startIndex], in_imag[startIndex]); \\\n"
-										  "	        ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
-										  "	        w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
-										  "	        a = complexMul(a, w); \\\n"
-										  "	        in_real[startIndex] = a.x; \\\n"
-										  "	        in_imag[startIndex] = a.y; \\\n"
-										  "	        startIndex += numCols; \\\n"
-										  "	    } \\\n"
-										  "	}	 \\\n"
-										  "} \\\n"
-										  );										  
-
+    "__kernel void \\\n"
+    "clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
+    "{ \\\n"
+    "    float2 a, w; \\\n"
+    "    float ang; \\\n"
+    "    unsigned int j; \\\n"
+    "	unsigned int i = get_global_id(0); \\\n"
+    "	unsigned int startIndex = i; \\\n"
+    "	 \\\n"
+    "	if(i < numCols) \\\n"
+    "	{ \\\n"
+    "	    for(j = 0; j < numRowsToProcess; j++) \\\n"
+    "	    { \\\n"
+    "	        a = (float2)(in_real[startIndex], in_imag[startIndex]); \\\n"
+    "	        ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
+    "	        w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
+    "	        a = complexMul(a, w); \\\n"
+    "	        in_real[startIndex] = a.x; \\\n"
+    "	        in_imag[startIndex] = a.y; \\\n"
+    "	        startIndex += numCols; \\\n"
+    "	    } \\\n"
+    "	}	 \\\n"
+    "} \\\n");


 #endif
--- a/src/algorithms/libs/opencl/fft_execute.cc
+++ b/src/algorithms/libs/opencl/fft_execute.cc
@@ -52,354 +52,352 @@
 #include <stdio.h>
 #include <math.h>

-#define max(a,b) (((a)>(b)) ? (a) : (b))
-#define min(a,b) (((a)<(b)) ? (a) : (b))
+#define max(a, b) (((a) > (b)) ? (a) : (b))
+#define min(a, b) (((a) < (b)) ? (a) : (b))

 static cl_int
 allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize)
 {
-	cl_int err = CL_SUCCESS;
-	if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) 
-	{
-		plan->last_batch_size = batchSize; 
-		size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float);
-		
-		if(plan->tempmemobj)
-			clReleaseMemObject(plan->tempmemobj);
-			
-		plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
-	}
-	return err;	
+    cl_int err = CL_SUCCESS;
+    if (plan->temp_buffer_needed && plan->last_batch_size != batchSize)
+        {
+            plan->last_batch_size = batchSize;
+            size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float);
+
+            if (plan->tempmemobj)
+                clReleaseMemObject(plan->tempmemobj);
+
+            plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
+        }
+    return err;
 }

 static cl_int
 allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
 {
-	cl_int err = CL_SUCCESS;
-	cl_int terr;
-	if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) 
-	{
-		plan->last_batch_size = batchSize; 
-		size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float);
-		
-		if(plan->tempmemobj_real)
-			clReleaseMemObject(plan->tempmemobj_real);
+    cl_int err = CL_SUCCESS;
+    cl_int terr;
+    if (plan->temp_buffer_needed && plan->last_batch_size != batchSize)
+        {
+            plan->last_batch_size = batchSize;
+            size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float);

-		if(plan->tempmemobj_imag)
-			clReleaseMemObject(plan->tempmemobj_imag);			
-			
-		plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
-		plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &terr);
-		err |= terr;
- 	}	
-	return err;
+            if (plan->tempmemobj_real)
+                clReleaseMemObject(plan->tempmemobj_real);
+
+            if (plan->tempmemobj_imag)
+                clReleaseMemObject(plan->tempmemobj_imag);
+
+            plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
+            plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &terr);
+            err |= terr;
+        }
+    return err;
 }

-void
-getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems)
+void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems)
 {
-	*lWorkItems = kernelInfo->num_workitems_per_workgroup;
-	int numWorkGroups = kernelInfo->num_workgroups;
+    *lWorkItems = kernelInfo->num_workitems_per_workgroup;
+    int numWorkGroups = kernelInfo->num_workgroups;
    int numXFormsPerWG = kernelInfo->num_xforms_per_workgroup;
-	
-	switch(kernelInfo->dir)
-	{
-		case cl_fft_kernel_x:
+
+    switch (kernelInfo->dir)
+        {
+        case cl_fft_kernel_x:
            *batchSize *= (plan->n.y * plan->n.z);
-            numWorkGroups = (*batchSize % numXFormsPerWG) ? (*batchSize/numXFormsPerWG + 1) : (*batchSize/numXFormsPerWG);
+            numWorkGroups = (*batchSize % numXFormsPerWG) ? (*batchSize / numXFormsPerWG + 1) : (*batchSize / numXFormsPerWG);
            numWorkGroups *= kernelInfo->num_workgroups;
-			break;
-		case cl_fft_kernel_y:
-			*batchSize *= plan->n.z;
-			numWorkGroups *= *batchSize;
-			break;
-		case cl_fft_kernel_z:
-			numWorkGroups *= *batchSize;
-			break;
-	}
-	
-	*gWorkItems = numWorkGroups * *lWorkItems;
+            break;
+        case cl_fft_kernel_y:
+            *batchSize *= plan->n.z;
+            numWorkGroups *= *batchSize;
+            break;
+        case cl_fft_kernel_z:
+            numWorkGroups *= *batchSize;
+            break;
+        }
+
+    *gWorkItems = numWorkGroups * *lWorkItems;
 }

-cl_int 
-clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, 
-						 cl_mem data_in, cl_mem data_out, 
-						 cl_int num_events, cl_event *event_list, cl_event *event )
-{	
-	int s;
-	cl_fft_plan *plan = (cl_fft_plan *) Plan;
-	if(plan->format != clFFT_InterleavedComplexFormat)
-		return CL_INVALID_VALUE;
-	
-	cl_int err;
-	size_t gWorkItems, lWorkItems;
-	int inPlaceDone;
-	
-	cl_int isInPlace = data_in == data_out ? 1 : 0;
-	
-	if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS)
-		return err;	
-	
-	cl_mem memObj[3];
-	memObj[0] = data_in;
-	memObj[1] = data_out;
-	memObj[2] = plan->tempmemobj;
-	cl_fft_kernel_info *kernelInfo = plan->kernel_info;
-	int numKernels = plan->num_kernels;
-	
-	int numKernelsOdd = numKernels & 1;
-	int currRead  = 0;
-	int currWrite = 1;
-	
-	// at least one external dram shuffle (transpose) required
-	if(plan->temp_buffer_needed) 
-	{
-		// in-place transform
-		if(isInPlace) 
-		{
-			inPlaceDone = 0;
-			currRead  = 1;
-			currWrite = 2;
-		}
-		else
-		{
-			currWrite = (numKernels & 1) ? 1 : 2;
-		}
-		
-		while(kernelInfo) 
-		{
-			if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) 
-			{
-				currWrite = currRead;
-				inPlaceDone = 1;
-			}
-			
-			s = batchSize;
-			getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
-			err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
-			err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
-			err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
-			err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
-			
-			err |= clEnqueueNDRangeKernel(queue,  kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
-			if(err)
-				return err;
-			
-			currRead  = (currWrite == 1) ? 1 : 2;
-			currWrite = (currWrite == 1) ? 2 : 1; 
-			
-			kernelInfo = kernelInfo->next;
-		}			
-	}
-	// no dram shuffle (transpose required) transform
-	// all kernels can execute in-place.
-	else {
-		
-		while(kernelInfo)
-		{
-		    s = batchSize;
-		    getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
-		    err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
-		    err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
-		    err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
-		    err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
-		
-		    err |= clEnqueueNDRangeKernel(queue,  kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
-		    if(err)
-			    return err;		
-			
-			currRead  = 1;
-			currWrite = 1;
-			
-			kernelInfo = kernelInfo->next;
-		}
-	}
-	
-	return err;
-}
-
-cl_int 
-clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, 
-					  cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
-					  cl_int num_events, cl_event *event_list, cl_event *event)
-{	
-	int s;
-	cl_fft_plan *plan = (cl_fft_plan *) Plan;
-	
-	if(plan->format != clFFT_SplitComplexFormat)
-		return CL_INVALID_VALUE;
-	
-	cl_int err;
-	size_t gWorkItems, lWorkItems;
-	int inPlaceDone;
-	
-	cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0;
-	
-	if((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS)
-		return err;	
-	
-	cl_mem memObj_real[3];
-	cl_mem memObj_imag[3];
-	memObj_real[0] = data_in_real;
-	memObj_real[1] = data_out_real;
-	memObj_real[2] = plan->tempmemobj_real;
-	memObj_imag[0] = data_in_imag;
-	memObj_imag[1] = data_out_imag;
-	memObj_imag[2] = plan->tempmemobj_imag;
-		
-	cl_fft_kernel_info *kernelInfo = plan->kernel_info;
-	int numKernels = plan->num_kernels;
-	
-	int numKernelsOdd = numKernels & 1;
-	int currRead  = 0;
-	int currWrite = 1;
-	
-	// at least one external dram shuffle (transpose) required
-	if(plan->temp_buffer_needed) 
-	{
-		// in-place transform
-		if(isInPlace) 
-		{
-			inPlaceDone = 0;
-			currRead  = 1;
-			currWrite = 2;
-		}
-		else
-		{
-			currWrite = (numKernels & 1) ? 1 : 2;
-		}
-		
-		while(kernelInfo) 
-		{
-			if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) 
-			{
-				currWrite = currRead;
-				inPlaceDone = 1;
-			}
-			
-			s = batchSize;
-			getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
-			err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
-			err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
-			err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
-			err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
-			err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
-			err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
-			
-			err |= clEnqueueNDRangeKernel(queue,  kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
-			if(err)
-				return err;			
-			
-			currRead  = (currWrite == 1) ? 1 : 2;
-			currWrite = (currWrite == 1) ? 2 : 1; 
-			
-			kernelInfo = kernelInfo->next;
-		}			
-	}
-	// no dram shuffle (transpose required) transform
-	else {
-		
-		while(kernelInfo)
-		{
-		    s = batchSize;
-		    getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
-		    err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
-		    err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
-		    err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
-		    err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
-		    err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
-		    err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
-		
-		    err |= clEnqueueNDRangeKernel(queue,  kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
-		    if(err)
-			    return err;	
-			
-			currRead  = 1;
-			currWrite = 1;
-		
-			kernelInfo = kernelInfo->next;
-		}
-	}
-	
-	return err;
-}
-
-cl_int 
-clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, 
-						 unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
+cl_int
+clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
+    cl_mem data_in, cl_mem data_out,
+    cl_int num_events, cl_event *event_list, cl_event *event)
 {
-	cl_fft_plan *plan = (cl_fft_plan *) Plan;
-	
-	unsigned int N = numRows*numCols;
-	unsigned int nCols = numCols;
-	unsigned int sRow = startRow;
-	unsigned int rToProcess = rowsToProcess;
-	int d = dir;
-	int err = 0;
-	
-	cl_device_id device_id;
-	err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
-	if(err)
-	    return err;
-	
-	size_t gSize;
-	err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
-	if(err)
-	    return err;
-	      
-	gSize = min(128, gSize);
-	size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize };
-	size_t numLocalThreads[1]  = { gSize };
-	
-	err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array);
-	err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow);
-	err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols);
-	err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N);
-	err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess);
-	err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d);
-	
-	err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);            
-	
-	return err;	
+    int s;
+    cl_fft_plan *plan = (cl_fft_plan *)Plan;
+    if (plan->format != clFFT_InterleavedComplexFormat)
+        return CL_INVALID_VALUE;
+
+    cl_int err;
+    size_t gWorkItems, lWorkItems;
+    int inPlaceDone;
+
+    cl_int isInPlace = data_in == data_out ? 1 : 0;
+
+    if ((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS)
+        return err;
+
+    cl_mem memObj[3];
+    memObj[0] = data_in;
+    memObj[1] = data_out;
+    memObj[2] = plan->tempmemobj;
+    cl_fft_kernel_info *kernelInfo = plan->kernel_info;
+    int numKernels = plan->num_kernels;
+
+    int numKernelsOdd = numKernels & 1;
+    int currRead = 0;
+    int currWrite = 1;
+
+    // at least one external dram shuffle (transpose) required
+    if (plan->temp_buffer_needed)
+        {
+            // in-place transform
+            if (isInPlace)
+                {
+                    inPlaceDone = 0;
+                    currRead = 1;
+                    currWrite = 2;
+                }
+            else
+                {
+                    currWrite = (numKernels & 1) ? 1 : 2;
+                }
+
+            while (kernelInfo)
+                {
+                    if (isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible)
+                        {
+                            currWrite = currRead;
+                            inPlaceDone = 1;
+                        }
+
+                    s = batchSize;
+                    getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
+                    err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
+                    err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
+                    err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
+                    err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
+
+                    err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
+                    if (err)
+                        return err;
+
+                    currRead = (currWrite == 1) ? 1 : 2;
+                    currWrite = (currWrite == 1) ? 2 : 1;
+
+                    kernelInfo = kernelInfo->next;
+                }
+        }
+    // no dram shuffle (transpose required) transform
+    // all kernels can execute in-place.
+    else
+        {
+            while (kernelInfo)
+                {
+                    s = batchSize;
+                    getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
+                    err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
+                    err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
+                    err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
+                    err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
+
+                    err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
+                    if (err)
+                        return err;
+
+                    currRead = 1;
+                    currWrite = 1;
+
+                    kernelInfo = kernelInfo->next;
+                }
+        }
+
+    return err;
 }

-cl_int 
-clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, 
-					 unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
+cl_int
+clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
+    cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
+    cl_int num_events, cl_event *event_list, cl_event *event)
 {
-	cl_fft_plan *plan = (cl_fft_plan *) Plan;
-	
-	unsigned int N = numRows*numCols;
-	unsigned int nCols = numCols;
-	unsigned int sRow = startRow;
-	unsigned int rToProcess = rowsToProcess;
-	int d = dir;
-	int err = 0;
-	
-	cl_device_id device_id;
-	err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
-	if(err)
-	    return err;
-	
-	size_t gSize;
-	err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
-	if(err)
-	    return err;
-	      
-	gSize = min(128, gSize);
-	size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize };
-	size_t numLocalThreads[1]  = { gSize };
-	
-	err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real);
-	err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag);
-	err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow);
-	err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &nCols);
-	err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N);
-	err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess);
-	err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d);
-	
-	err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);            
-	
-	return err;	
+    int s;
+    cl_fft_plan *plan = (cl_fft_plan *)Plan;
+
+    if (plan->format != clFFT_SplitComplexFormat)
+        return CL_INVALID_VALUE;
+
+    cl_int err;
+    size_t gWorkItems, lWorkItems;
+    int inPlaceDone;
+
+    cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0;
+
+    if ((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS)
+        return err;
+
+    cl_mem memObj_real[3];
+    cl_mem memObj_imag[3];
+    memObj_real[0] = data_in_real;
+    memObj_real[1] = data_out_real;
+    memObj_real[2] = plan->tempmemobj_real;
+    memObj_imag[0] = data_in_imag;
+    memObj_imag[1] = data_out_imag;
+    memObj_imag[2] = plan->tempmemobj_imag;
+
+    cl_fft_kernel_info *kernelInfo = plan->kernel_info;
+    int numKernels = plan->num_kernels;
+
+    int numKernelsOdd = numKernels & 1;
+    int currRead = 0;
+    int currWrite = 1;
+
+    // at least one external dram shuffle (transpose) required
+    if (plan->temp_buffer_needed)
+        {
+            // in-place transform
+            if (isInPlace)
+                {
+                    inPlaceDone = 0;
+                    currRead = 1;
+                    currWrite = 2;
+                }
+            else
+                {
+                    currWrite = (numKernels & 1) ? 1 : 2;
+                }
+
+            while (kernelInfo)
+                {
+                    if (isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible)
+                        {
+                            currWrite = currRead;
+                            inPlaceDone = 1;
+                        }
+
+                    s = batchSize;
+                    getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
+                    err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
+                    err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
+                    err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
+                    err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
+                    err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
+                    err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
+
+                    err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
+                    if (err)
+                        return err;
+
+                    currRead = (currWrite == 1) ? 1 : 2;
+                    currWrite = (currWrite == 1) ? 2 : 1;
+
+                    kernelInfo = kernelInfo->next;
+                }
+        }
+    // no dram shuffle (transpose required) transform
+    else
+        {
+            while (kernelInfo)
+                {
+                    s = batchSize;
+                    getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
+                    err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
+                    err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
+                    err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
+                    err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
+                    err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
+                    err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
+
+                    err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
+                    if (err)
+                        return err;
+
+                    currRead = 1;
+                    currWrite = 1;
+
+                    kernelInfo = kernelInfo->next;
+                }
+        }
+
+    return err;
 }

+cl_int
+clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
+    unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
+{
+    cl_fft_plan *plan = (cl_fft_plan *)Plan;
+
+    unsigned int N = numRows * numCols;
+    unsigned int nCols = numCols;
+    unsigned int sRow = startRow;
+    unsigned int rToProcess = rowsToProcess;
+    int d = dir;
+    int err = 0;
+
+    cl_device_id device_id;
+    err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
+    if (err)
+        return err;
+
+    size_t gSize;
+    err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
+    if (err)
+        return err;
+
+    gSize = min(128, gSize);
+    size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize};
+    size_t numLocalThreads[1] = {gSize};
+
+    err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array);
+    err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow);
+    err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols);
+    err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N);
+    err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess);
+    err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d);
+
+    err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);
+
+    return err;
+}
+
+cl_int
+clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
+    unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
+{
+    cl_fft_plan *plan = (cl_fft_plan *)Plan;
+
+    unsigned int N = numRows * numCols;
+    unsigned int nCols = numCols;
+    unsigned int sRow = startRow;
+    unsigned int rToProcess = rowsToProcess;
+    int d = dir;
+    int err = 0;
+
+    cl_device_id device_id;
+    err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
+    if (err)
+        return err;
+
+    size_t gSize;
+    err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
+    if (err)
+        return err;
+
+    gSize = min(128, gSize);
+    size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize};
+    size_t numLocalThreads[1] = {gSize};
+
+    err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real);
+    err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag);
+    err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow);
+    err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &nCols);
+    err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N);
+    err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess);
+    err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d);
+
+    err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);
+
+    return err;
+}
--- a/src/algorithms/libs/opencl/fft_internal.h
+++ b/src/algorithms/libs/opencl/fft_internal.h
@@ -58,106 +58,106 @@ using namespace std;

 typedef enum kernel_dir_t
 {
-	cl_fft_kernel_x,
-	cl_fft_kernel_y,
-	cl_fft_kernel_z
-}cl_fft_kernel_dir;
+    cl_fft_kernel_x,
+    cl_fft_kernel_y,
+    cl_fft_kernel_z
+} cl_fft_kernel_dir;

 typedef struct kernel_info_t
 {
-	cl_kernel kernel;
-	char *kernel_name;
-	unsigned lmem_size;
-	unsigned num_workgroups;
+    cl_kernel kernel;
+    char *kernel_name;
+    unsigned lmem_size;
+    unsigned num_workgroups;
    unsigned num_xforms_per_workgroup;
-	unsigned num_workitems_per_workgroup;
-	cl_fft_kernel_dir dir;
-	int in_place_possible;
-	kernel_info_t *next;
-}cl_fft_kernel_info;
+    unsigned num_workitems_per_workgroup;
+    cl_fft_kernel_dir dir;
+    int in_place_possible;
+    kernel_info_t *next;
+} cl_fft_kernel_info;

-typedef struct 
+typedef struct
 {
-	// context in which fft resources are created and kernels are executed
-	cl_context              context;
-	
-	// size of signal
-	clFFT_Dim3              n;
-	
-	// dimension of transform ... must be either 1D, 2D or 3D
-	clFFT_Dimension			dim;
-	
-	// data format ... must be either interleaved or plannar
-	clFFT_DataFormat		format;
-	
-	// string containing kernel source. Generated at runtime based on
-	// n, dim, format and other parameters
-	string                  *kernel_string;
-	
-	// CL program containing source and kernel this particular 
-	// n, dim, data format
-	cl_program				program;
-	
-	// linked list of kernels which needs to be executed for this fft
-	cl_fft_kernel_info		*kernel_info;
-	
-	// number of kernels
-	int                     num_kernels;
-	
-	// twist kernel for virtualizing fft of very large sizes that do not
-	// fit in GPU global memory
-	cl_kernel				twist_kernel;
-	
-	// flag indicating if temporary intermediate buffer is needed or not.
-	// this depends on fft kernels being executed and if transform is 
-	// in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... 
-	// one that does not require global transpose do not need temporary buffer)
-	// 2D 1024x1024 out-of-place fft however do require intermediate buffer.
-	// If temp buffer is needed, its allocation is lazy i.e. its not allocated
-	// until its needed
-	cl_int                  temp_buffer_needed;
-	
-	// Batch size is runtime parameter and size of temporary buffer (if needed)
-	// depends on batch size. Allocation of temporary buffer is lazy i.e. its
-	// only created when needed. Once its created at first call of clFFT_Executexxx
-	// it is not allocated next time if next time clFFT_Executexxx is called with 
-	// batch size different than the first call. last_batch_size caches the last
-	// batch size with which this plan is used so that we dont keep allocating/deallocating
-	// temp buffer if same batch size is used again and again.
-	unsigned                  last_batch_size;
-	
-	// temporary buffer for interleaved plan
-	cl_mem   				tempmemobj;
-	
-	// temporary buffer for planner plan. Only one of tempmemobj or 
-	// (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending 
-	// data format of plan (plannar or interleaved)
-	cl_mem                  tempmemobj_real, tempmemobj_imag;
-	
-	// Maximum size of signal for which local memory transposed based
-	// fft is sufficient i.e. no global mem transpose (communication)
-	// is needed
-	unsigned					max_localmem_fft_size;
-	
-	// Maximum work items per work group allowed. This, along with max_radix below controls 
-	// maximum local memory being used by fft kernels of this plan. Set to 256 by default
-	unsigned                  max_work_item_per_workgroup;
-	
-	// Maximum base radix for local memory fft ... this controls the maximum register 
-	// space used by work items. Currently defaults to 16
-	unsigned                  max_radix;
-	
-	// Device depended parameter that tells how many work-items need to be read consecutive
-	// values to make sure global memory access by work-items of a work-group result in 
-	// coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
-	unsigned                  min_mem_coalesce_width;
-	
-	// Number of local memory banks. This is used to geneate kernel with local memory 
-	// transposes with appropriate padding to avoid bank conflicts to local memory
-	// e.g. on NVidia it is 16.
-	unsigned                  num_local_mem_banks;
-}cl_fft_plan;
+    // context in which fft resources are created and kernels are executed
+    cl_context context;
+
+    // size of signal
+    clFFT_Dim3 n;
+
+    // dimension of transform ... must be either 1D, 2D or 3D
+    clFFT_Dimension dim;
+
+    // data format ... must be either interleaved or plannar
+    clFFT_DataFormat format;
+
+    // string containing kernel source. Generated at runtime based on
+    // n, dim, format and other parameters
+    string *kernel_string;
+
+    // CL program containing source and kernel this particular
+    // n, dim, data format
+    cl_program program;
+
+    // linked list of kernels which needs to be executed for this fft
+    cl_fft_kernel_info *kernel_info;
+
+    // number of kernels
+    int num_kernels;
+
+    // twist kernel for virtualizing fft of very large sizes that do not
+    // fit in GPU global memory
+    cl_kernel twist_kernel;
+
+    // flag indicating if temporary intermediate buffer is needed or not.
+    // this depends on fft kernels being executed and if transform is
+    // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ...
+    // one that does not require global transpose do not need temporary buffer)
+    // 2D 1024x1024 out-of-place fft however do require intermediate buffer.
+    // If temp buffer is needed, its allocation is lazy i.e. its not allocated
+    // until its needed
+    cl_int temp_buffer_needed;
+
+    // Batch size is runtime parameter and size of temporary buffer (if needed)
+    // depends on batch size. Allocation of temporary buffer is lazy i.e. its
+    // only created when needed. Once its created at first call of clFFT_Executexxx
+    // it is not allocated next time if next time clFFT_Executexxx is called with
+    // batch size different than the first call. last_batch_size caches the last
+    // batch size with which this plan is used so that we dont keep allocating/deallocating
+    // temp buffer if same batch size is used again and again.
+    unsigned last_batch_size;
+
+    // temporary buffer for interleaved plan
+    cl_mem tempmemobj;
+
+    // temporary buffer for planner plan. Only one of tempmemobj or
+    // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending
+    // data format of plan (plannar or interleaved)
+    cl_mem tempmemobj_real, tempmemobj_imag;
+
+    // Maximum size of signal for which local memory transposed based
+    // fft is sufficient i.e. no global mem transpose (communication)
+    // is needed
+    unsigned max_localmem_fft_size;
+
+    // Maximum work items per work group allowed. This, along with max_radix below controls
+    // maximum local memory being used by fft kernels of this plan. Set to 256 by default
+    unsigned max_work_item_per_workgroup;
+
+    // Maximum base radix for local memory fft ... this controls the maximum register
+    // space used by work items. Currently defaults to 16
+    unsigned max_radix;
+
+    // Device depended parameter that tells how many work-items need to be read consecutive
+    // values to make sure global memory access by work-items of a work-group result in
+    // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
+    unsigned min_mem_coalesce_width;
+
+    // Number of local memory banks. This is used to geneate kernel with local memory
+    // transposes with appropriate padding to avoid bank conflicts to local memory
+    // e.g. on NVidia it is 16.
+    unsigned num_local_mem_banks;
+} cl_fft_plan;

 void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir);

-#endif  
+#endif
--- a/src/algorithms/libs/opencl/fft_kernelstring.cc
+++ b/src/algorithms/libs/opencl/fft_kernelstring.cc
--- a/src/algorithms/libs/opencl/fft_setup.cc
+++ b/src/algorithms/libs/opencl/fft_setup.cc
@@ -61,59 +61,59 @@ using namespace std;

 extern void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems);

-static void 
+static void
 getBlockConfigAndKernelString(cl_fft_plan *plan)
 {
-	plan->temp_buffer_needed = 0;
-	*plan->kernel_string += baseKernels;
-	
-	if(plan->format == clFFT_SplitComplexFormat)
-		*plan->kernel_string += twistKernelPlannar;
-	else
-		*plan->kernel_string += twistKernelInterleaved;
-	
-	switch(plan->dim) 
-	{
-		case clFFT_1D:
-			FFT1D(plan, cl_fft_kernel_x);
-			break;
-			
-		case clFFT_2D:
-			FFT1D(plan, cl_fft_kernel_x); 
-			FFT1D(plan, cl_fft_kernel_y);  
-			break;
-			
-		case clFFT_3D:
-			FFT1D(plan, cl_fft_kernel_x); 
-			FFT1D(plan, cl_fft_kernel_y); 
-			FFT1D(plan, cl_fft_kernel_z); 
-			break;
-			
-		default:
-			return;
-	}
-	
-	plan->temp_buffer_needed = 0;
-	cl_fft_kernel_info *kInfo = plan->kernel_info;
-	while(kInfo)
-	{
-		plan->temp_buffer_needed |= !kInfo->in_place_possible;
-		kInfo = kInfo->next;
-	}
+    plan->temp_buffer_needed = 0;
+    *plan->kernel_string += baseKernels;
+
+    if (plan->format == clFFT_SplitComplexFormat)
+        *plan->kernel_string += twistKernelPlannar;
+    else
+        *plan->kernel_string += twistKernelInterleaved;
+
+    switch (plan->dim)
+        {
+        case clFFT_1D:
+            FFT1D(plan, cl_fft_kernel_x);
+            break;
+
+        case clFFT_2D:
+            FFT1D(plan, cl_fft_kernel_x);
+            FFT1D(plan, cl_fft_kernel_y);
+            break;
+
+        case clFFT_3D:
+            FFT1D(plan, cl_fft_kernel_x);
+            FFT1D(plan, cl_fft_kernel_y);
+            FFT1D(plan, cl_fft_kernel_z);
+            break;
+
+        default:
+            return;
+        }
+
+    plan->temp_buffer_needed = 0;
+    cl_fft_kernel_info *kInfo = plan->kernel_info;
+    while (kInfo)
+        {
+            plan->temp_buffer_needed |= !kInfo->in_place_possible;
+            kInfo = kInfo->next;
+        }
 }

- 
+
 static void
 deleteKernelInfo(cl_fft_kernel_info *kInfo)
 {
-	if(kInfo)
-	{
-	    if(kInfo->kernel_name)
-		    free(kInfo->kernel_name);
-	    if(kInfo->kernel)
-		    clReleaseKernel(kInfo->kernel);
-		free(kInfo);
-	}	
+    if (kInfo)
+        {
+            if (kInfo->kernel_name)
+                free(kInfo->kernel_name);
+            if (kInfo->kernel)
+                clReleaseKernel(kInfo->kernel);
+            free(kInfo);
+        }
 }

 static void
@@ -121,282 +121,282 @@ destroy_plan(cl_fft_plan *Plan)
 {
    cl_fft_kernel_info *kernel_info = Plan->kernel_info;

-	while(kernel_info)
-	{
-		cl_fft_kernel_info *tmp = kernel_info->next;
-		deleteKernelInfo(kernel_info);
-		kernel_info = tmp;
-	}
-	
-	Plan->kernel_info = NULL;
-		
-	if(Plan->kernel_string)
-	{
-		delete Plan->kernel_string;
-		Plan->kernel_string = NULL;
-	}			
-	if(Plan->twist_kernel)
-	{
-		clReleaseKernel(Plan->twist_kernel);
-		Plan->twist_kernel = NULL;
-	}
-	if(Plan->program)
-	{
-		clReleaseProgram(Plan->program);
-		Plan->program = NULL;
-	}
-	if(Plan->tempmemobj) 
-	{
-		clReleaseMemObject(Plan->tempmemobj);
-		Plan->tempmemobj = NULL;
-	}
-	if(Plan->tempmemobj_real)
-	{
-		clReleaseMemObject(Plan->tempmemobj_real);
-		Plan->tempmemobj_real = NULL;
-	}
-	if(Plan->tempmemobj_imag)
-	{
-		clReleaseMemObject(Plan->tempmemobj_imag);
-		Plan->tempmemobj_imag = NULL;
-	}
+    while (kernel_info)
+        {
+            cl_fft_kernel_info *tmp = kernel_info->next;
+            deleteKernelInfo(kernel_info);
+            kernel_info = tmp;
+        }
+
+    Plan->kernel_info = NULL;
+
+    if (Plan->kernel_string)
+        {
+            delete Plan->kernel_string;
+            Plan->kernel_string = NULL;
+        }
+    if (Plan->twist_kernel)
+        {
+            clReleaseKernel(Plan->twist_kernel);
+            Plan->twist_kernel = NULL;
+        }
+    if (Plan->program)
+        {
+            clReleaseProgram(Plan->program);
+            Plan->program = NULL;
+        }
+    if (Plan->tempmemobj)
+        {
+            clReleaseMemObject(Plan->tempmemobj);
+            Plan->tempmemobj = NULL;
+        }
+    if (Plan->tempmemobj_real)
+        {
+            clReleaseMemObject(Plan->tempmemobj_real);
+            Plan->tempmemobj_real = NULL;
+        }
+    if (Plan->tempmemobj_imag)
+        {
+            clReleaseMemObject(Plan->tempmemobj_imag);
+            Plan->tempmemobj_imag = NULL;
+        }
 }

 static int
-createKernelList(cl_fft_plan *plan) 
+createKernelList(cl_fft_plan *plan)
 {
-	cl_program program = plan->program;
-	cl_fft_kernel_info *kernel_info = plan->kernel_info;
-	
-	cl_int err;
-	while(kernel_info)
-	{
-		kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err);
-		if(!kernel_info->kernel || err != CL_SUCCESS)
-			return err;
-		kernel_info = kernel_info->next;		
-	}
-	
-	if(plan->format == clFFT_SplitComplexFormat)
-		plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err);
-	else
-		plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err);
-	
-	if(!plan->twist_kernel || err)
-		return err;
+    cl_program program = plan->program;
+    cl_fft_kernel_info *kernel_info = plan->kernel_info;

-	return CL_SUCCESS;
+    cl_int err;
+    while (kernel_info)
+        {
+            kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err);
+            if (!kernel_info->kernel || err != CL_SUCCESS)
+                return err;
+            kernel_info = kernel_info->next;
+        }
+
+    if (plan->format == clFFT_SplitComplexFormat)
+        plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err);
+    else
+        plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err);
+
+    if (!plan->twist_kernel || err)
+        return err;
+
+    return CL_SUCCESS;
 }

 int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices)
-{	
+{
    int reg_needed = 0;
    *max_wg_size = std::numeric_limits<int>::max();
    int err;
    unsigned wg_size;
-    
-    unsigned int i;
-    for(i = 0; i < num_devices; i++)
-    {
-	    cl_fft_kernel_info *kInfo = plan->kernel_info;
-	    while(kInfo)
-	    {
-		    err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
-		    if(err != CL_SUCCESS)
-		        return -1;
-		        
-		    if(wg_size < kInfo->num_workitems_per_workgroup)
-		        reg_needed |= 1;
-		    
-		    if(*max_wg_size > wg_size)
-		        *max_wg_size = wg_size;
-		        
-		    kInfo = kInfo->next;
-	    }
-	}
-	
-	return reg_needed;
-}	

-#define ERR_MACRO(err) { \
-                         if( err != CL_SUCCESS) \
-                         { \
-                           if(error_code) \
-                               *error_code = err; \
-                           clFFT_DestroyPlan((clFFT_Plan) plan); \
-						   return (clFFT_Plan) NULL; \
-                         } \
-					   }
+    unsigned int i;
+    for (i = 0; i < num_devices; i++)
+        {
+            cl_fft_kernel_info *kInfo = plan->kernel_info;
+            while (kInfo)
+                {
+                    err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
+                    if (err != CL_SUCCESS)
+                        return -1;
+
+                    if (wg_size < kInfo->num_workitems_per_workgroup)
+                        reg_needed |= 1;
+
+                    if (*max_wg_size > wg_size)
+                        *max_wg_size = wg_size;
+
+                    kInfo = kInfo->next;
+                }
+        }
+
+    return reg_needed;
+}
+
+#define ERR_MACRO(err)                               \
+    {                                                \
+        if (err != CL_SUCCESS)                       \
+            {                                        \
+                if (error_code)                      \
+                    *error_code = err;               \
+                clFFT_DestroyPlan((clFFT_Plan)plan); \
+                return (clFFT_Plan)NULL;             \
+            }                                        \
+    }

 clFFT_Plan
-clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code )
+clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code)
 {
-	int i;
-	cl_int err;
-	int isPow2 = 1;
-	cl_fft_plan *plan = NULL;
-	ostringstream kString;
-	int num_devices;
-	int gpu_found = 0;
-	cl_device_id devices[16];
-	size_t ret_size;
-	cl_device_type device_type;
-	
-    if(!context)
-		ERR_MACRO(CL_INVALID_VALUE);
-	
-	isPow2 |= n.x && !( (n.x - 1) & n.x );
-	isPow2 |= n.y && !( (n.y - 1) & n.y );
-    isPow2 |= n.z && !( (n.z - 1) & n.z );
+    int i;
+    cl_int err;
+    int isPow2 = 1;
+    cl_fft_plan *plan = NULL;
+    ostringstream kString;
+    int num_devices;
+    int gpu_found = 0;
+    cl_device_id devices[16];
+    size_t ret_size;
+    cl_device_type device_type;

-	if(!isPow2)
-		ERR_MACRO(CL_INVALID_VALUE);
-	
-	if( (dim == clFFT_1D && (n.y != 1 || n.z != 1)) || (dim == clFFT_2D && n.z != 1) )
-		ERR_MACRO(CL_INVALID_VALUE);
+    if (!context)
+        ERR_MACRO(CL_INVALID_VALUE);

-	plan = (cl_fft_plan *) malloc(sizeof(cl_fft_plan));
-	if(!plan)
-		ERR_MACRO(CL_OUT_OF_RESOURCES);
-	
-	plan->context = context;
-	clRetainContext(context);
-	plan->n = n;
-	plan->dim = dim;
-	plan->format = dataFormat;
-	plan->kernel_info = 0;
-	plan->num_kernels = 0;
-	plan->twist_kernel = 0;
-	plan->program = 0;
-	plan->temp_buffer_needed = 0;
-	plan->last_batch_size = 0;
-	plan->tempmemobj = 0;
-	plan->tempmemobj_real = 0;
-	plan->tempmemobj_imag = 0;
-	plan->max_localmem_fft_size = 2048;
-	plan->max_work_item_per_workgroup = 256;
-	plan->max_radix = 16;
-	plan->min_mem_coalesce_width = 16;
-	plan->num_local_mem_banks = 16;	
-	
-patch_kernel_source:
+    isPow2 |= n.x && !((n.x - 1) & n.x);
+    isPow2 |= n.y && !((n.y - 1) & n.y);
+    isPow2 |= n.z && !((n.z - 1) & n.z);

-	plan->kernel_string = new string("");
-	if(!plan->kernel_string)
+    if (!isPow2)
+        ERR_MACRO(CL_INVALID_VALUE);
+
+    if ((dim == clFFT_1D && (n.y != 1 || n.z != 1)) || (dim == clFFT_2D && n.z != 1))
+        ERR_MACRO(CL_INVALID_VALUE);
+
+    plan = (cl_fft_plan *)malloc(sizeof(cl_fft_plan));
+    if (!plan)
        ERR_MACRO(CL_OUT_OF_RESOURCES);

-	getBlockConfigAndKernelString(plan);
-	
-	const char *source_str = plan->kernel_string->c_str();
-	plan->program = clCreateProgramWithSource(context, 1, (const char**) &source_str, NULL, &err);
+    plan->context = context;
+    clRetainContext(context);
+    plan->n = n;
+    plan->dim = dim;
+    plan->format = dataFormat;
+    plan->kernel_info = 0;
+    plan->num_kernels = 0;
+    plan->twist_kernel = 0;
+    plan->program = 0;
+    plan->temp_buffer_needed = 0;
+    plan->last_batch_size = 0;
+    plan->tempmemobj = 0;
+    plan->tempmemobj_real = 0;
+    plan->tempmemobj_imag = 0;
+    plan->max_localmem_fft_size = 2048;
+    plan->max_work_item_per_workgroup = 256;
+    plan->max_radix = 16;
+    plan->min_mem_coalesce_width = 16;
+    plan->num_local_mem_banks = 16;
+
+patch_kernel_source:
+
+    plan->kernel_string = new string("");
+    if (!plan->kernel_string)
+        ERR_MACRO(CL_OUT_OF_RESOURCES);
+
+    getBlockConfigAndKernelString(plan);
+
+    const char *source_str = plan->kernel_string->c_str();
+    plan->program = clCreateProgramWithSource(context, 1, (const char **)&source_str, NULL, &err);
    ERR_MACRO(err);

-	err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size);
-	ERR_MACRO(err);
-	
-	num_devices = (int)(ret_size / sizeof(cl_device_id));
-	
-	for(i = 0; i < num_devices; i++)
-	{
-		err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL);
-		ERR_MACRO(err);
-		
-		if(device_type == CL_DEVICE_TYPE_GPU)
-		{	
-			gpu_found = 1;
-	        err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable", NULL, NULL);
-	        if (err != CL_SUCCESS)
-	        {
-		        char *build_log;				
-				char devicename[200];
-		        size_t log_size;
-				
-		        err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
-				ERR_MACRO(err);
-				
-		        build_log = (char *) malloc(log_size + 1);
-				
-			    err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
-				ERR_MACRO(err);
-				
-				err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(devicename), devicename, NULL);
-				ERR_MACRO(err);
-				
-				fprintf(stdout, "FFT program build log on device %s\n", devicename);
-		        fprintf(stdout, "%s\n", build_log);
-		        free(build_log);
-				
-				ERR_MACRO(err);
-			}	
-		}	
-	}
-	
-	if(!gpu_found)
-		ERR_MACRO(CL_INVALID_CONTEXT);
-	
-	err = createKernelList(plan); 
+    err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size);
    ERR_MACRO(err);
-    
+
+    num_devices = (int)(ret_size / sizeof(cl_device_id));
+
+    for (i = 0; i < num_devices; i++)
+        {
+            err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL);
+            ERR_MACRO(err);
+
+            if (device_type == CL_DEVICE_TYPE_GPU)
+                {
+                    gpu_found = 1;
+                    err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable", NULL, NULL);
+                    if (err != CL_SUCCESS)
+                        {
+                            char *build_log;
+                            char devicename[200];
+                            size_t log_size;
+
+                            err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+                            ERR_MACRO(err);
+
+                            build_log = (char *)malloc(log_size + 1);
+
+                            err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
+                            ERR_MACRO(err);
+
+                            err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(devicename), devicename, NULL);
+                            ERR_MACRO(err);
+
+                            fprintf(stdout, "FFT program build log on device %s\n", devicename);
+                            fprintf(stdout, "%s\n", build_log);
+                            free(build_log);
+
+                            ERR_MACRO(err);
+                        }
+                }
+        }
+
+    if (!gpu_found)
+        ERR_MACRO(CL_INVALID_CONTEXT);
+
+    err = createKernelList(plan);
+    ERR_MACRO(err);
+
    // we created program and kernels based on "some max work group size (default 256)" ... this work group size
-    // may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source 
-    // setting this as limit i.e max group size and rebuild. 
-	unsigned int max_kernel_wg_size; 
-	int patching_req = getMaxKernelWorkGroupSize(plan, &max_kernel_wg_size, num_devices, devices);
-	if(patching_req == -1)
-	{
-	    ERR_MACRO(err);
-	}
-	
-	if(patching_req)
-	{
-	    destroy_plan(plan);
-	    plan->max_work_item_per_workgroup = max_kernel_wg_size;
-	    goto patch_kernel_source;
-	}
-	
-	cl_fft_kernel_info *kInfo = plan->kernel_info;
-	while(kInfo)
-	{
-		plan->num_kernels++;
-		kInfo = kInfo->next;
-	}
-	
-	if(error_code)
-		*error_code = CL_SUCCESS;
-			
-	return (clFFT_Plan) plan;
+    // may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source
+    // setting this as limit i.e max group size and rebuild.
+    unsigned int max_kernel_wg_size;
+    int patching_req = getMaxKernelWorkGroupSize(plan, &max_kernel_wg_size, num_devices, devices);
+    if (patching_req == -1)
+        {
+            ERR_MACRO(err);
+        }
+
+    if (patching_req)
+        {
+            destroy_plan(plan);
+            plan->max_work_item_per_workgroup = max_kernel_wg_size;
+            goto patch_kernel_source;
+        }
+
+    cl_fft_kernel_info *kInfo = plan->kernel_info;
+    while (kInfo)
+        {
+            plan->num_kernels++;
+            kInfo = kInfo->next;
+        }
+
+    if (error_code)
+        *error_code = CL_SUCCESS;
+
+    return (clFFT_Plan)plan;
 }

-void		 
-clFFT_DestroyPlan(clFFT_Plan plan)
+void clFFT_DestroyPlan(clFFT_Plan plan)
 {
-    cl_fft_plan *Plan = (cl_fft_plan *) plan;
-	if(Plan) 
-	{	
-		destroy_plan(Plan);	
-		clReleaseContext(Plan->context);
-		free(Plan);
-	}		
+    cl_fft_plan *Plan = (cl_fft_plan *)plan;
+    if (Plan)
+        {
+            destroy_plan(Plan);
+            clReleaseContext(Plan->context);
+            free(Plan);
+        }
 }

-void clFFT_DumpPlan( clFFT_Plan Plan, FILE *file)
+void clFFT_DumpPlan(clFFT_Plan Plan, FILE *file)
 {
-	size_t gDim, lDim;
-	FILE *out;
-	if(!file)
-		out = stdout;
-	else 
-		out = file;
-	
-	cl_fft_plan *plan = (cl_fft_plan *) Plan;
-	cl_fft_kernel_info *kInfo = plan->kernel_info;
-	
-	while(kInfo)
-	{
-		cl_int s = 1;
-		getKernelWorkDimensions(plan, kInfo, &s, &gDim, &lDim);
-		fprintf(out, "Run kernel %s with global dim = {%zd*BatchSize}, local dim={%zd}\n", kInfo->kernel_name, gDim, lDim);
-		kInfo = kInfo->next;
-	}
-	fprintf(out, "%s\n", plan->kernel_string->c_str());
+    size_t gDim, lDim;
+    FILE *out;
+    if (!file)
+        out = stdout;
+    else
+        out = file;
+
+    cl_fft_plan *plan = (cl_fft_plan *)Plan;
+    cl_fft_kernel_info *kInfo = plan->kernel_info;
+
+    while (kInfo)
+        {
+            cl_int s = 1;
+            getKernelWorkDimensions(plan, kInfo, &s, &gDim, &lDim);
+            fprintf(out, "Run kernel %s with global dim = {%zd*BatchSize}, local dim={%zd}\n", kInfo->kernel_name, gDim, lDim);
+            kInfo = kInfo->next;
+        }
+    fprintf(out, "%s\n", plan->kernel_string->c_str());
 }
--- a/src/algorithms/libs/rtklib/rtklib_rtkcmn.cc
+++ b/src/algorithms/libs/rtklib/rtklib_rtkcmn.cc
@@ -253,11 +253,12 @@ const unsigned int tbl_CRC24Q[] = {
    0x42FA2F, 0xC4B6D4, 0xC82F22, 0x4E63D9, 0xD11CCE, 0x575035, 0x5BC9C3, 0xDD8538};


-extern "C" {
-void dgemm_(char *, char *, int *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *);
-extern void dgetrf_(int *, int *, double *, int *, int *, int *);
-extern void dgetri_(int *, double *, int *, int *, double *, int *, int *);
-extern void dgetrs_(char *, int *, int *, double *, int *, int *, double *, int *, int *);
+extern "C"
+{
+    void dgemm_(char *, char *, int *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *);
+    extern void dgetrf_(int *, int *, double *, int *, int *, int *);
+    extern void dgetri_(int *, double *, int *, int *, double *, int *, int *);
+    extern void dgetrs_(char *, int *, int *, double *, int *, int *, double *, int *, int *);
 }


--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/cmake/msvc/config.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/cmake/msvc/config.h
@@ -1,15 +1,15 @@
-#ifndef _MSC_VER // [
+#ifndef _MSC_VER  // [
 #error "Use this header only with Microsoft Visual C++ compilers!"
-#endif // _MSC_VER ]
+#endif  // _MSC_VER ]

-#ifndef _MSC_CONFIG_H_ // [
+#ifndef _MSC_CONFIG_H_  // [
 #define _MSC_CONFIG_H_

 ////////////////////////////////////////////////////////////////////////
 // enable inline functions for C code
 ////////////////////////////////////////////////////////////////////////
 #ifndef __cplusplus
-#  define inline __inline
+#define inline __inline
 #endif

 ////////////////////////////////////////////////////////////////////////
@@ -23,12 +23,15 @@ typedef ptrdiff_t ssize_t;
 ////////////////////////////////////////////////////////////////////////
 #if _MSC_VER < 1800
 #include <math.h>
-static inline long lrint(double x){return (long)(x > 0.0 ? x + 0.5 : x - 0.5);}
-static inline long lrintf(float x){return (long)(x > 0.0f ? x + 0.5f : x - 0.5f);}
-static inline long long llrint(double x){return (long long)(x > 0.0 ? x + 0.5 : x - 0.5);}
-static inline long long llrintf(float x){return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);}
-static inline double rint(double x){return (x > 0.0)? floor(x + 0.5) : ceil(x - 0.5);}
-static inline float rintf(float x){return (x > 0.0f)? floorf(x + 0.5f) : ceilf(x - 0.5f);}
+static inline long lrint(double x)
+{
+    return (long)(x > 0.0 ? x + 0.5 : x - 0.5);
+}
+static inline long lrintf(float x) { return (long)(x > 0.0f ? x + 0.5f : x - 0.5f); }
+static inline long long llrint(double x) { return (long long)(x > 0.0 ? x + 0.5 : x - 0.5); }
+static inline long long llrintf(float x) { return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f); }
+static inline double rint(double x) { return (x > 0.0) ? floor(x + 0.5) : ceil(x - 0.5); }
+static inline float rintf(float x) { return (x > 0.0f) ? floorf(x + 0.5f) : ceilf(x - 0.5f); }
 #endif

 ////////////////////////////////////////////////////////////////////////
@@ -43,7 +46,10 @@ static inline float rintf(float x){return (x > 0.0f)? floorf(x + 0.5f) : ceilf(x
 // random and srandom
 ////////////////////////////////////////////////////////////////////////
 #include <stdlib.h>
-static inline long int random (void) { return rand(); }
-static inline void srandom (unsigned int seed) { srand(seed); }
+static inline long int random(void)
+{
+    return rand();
+}
+static inline void srandom(unsigned int seed) { srand(seed); }

-#endif // _MSC_CONFIG_H_ ]
+#endif  // _MSC_CONFIG_H_ ]
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.c
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.c
@@ -31,85 +31,97 @@ static intptr_t __alignment_mask = 0;

 struct volk_gnsssdr_machine *get_machine(void)
 {
-  extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
-  extern unsigned int n_volk_gnsssdr_machines;
-  static struct volk_gnsssdr_machine *machine = NULL;
+    extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
+    extern unsigned int n_volk_gnsssdr_machines;
+    static struct volk_gnsssdr_machine *machine = NULL;

-  if(machine != NULL)
-    return machine;
-  else {
-    unsigned int max_score = 0;
-    unsigned int i;
-    struct volk_gnsssdr_machine *max_machine = NULL;
-    for(i=0; i<n_volk_gnsssdr_machines; i++) {
-      if(!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) {
-        if(volk_gnsssdr_machines[i]->caps > max_score) {
-          max_score = volk_gnsssdr_machines[i]->caps;
-          max_machine = volk_gnsssdr_machines[i];
+    if (machine != NULL)
+        return machine;
+    else
+        {
+            unsigned int max_score = 0;
+            unsigned int i;
+            struct volk_gnsssdr_machine *max_machine = NULL;
+            for (i = 0; i < n_volk_gnsssdr_machines; i++)
+                {
+                    if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
+                        {
+                            if (volk_gnsssdr_machines[i]->caps > max_score)
+                                {
+                                    max_score = volk_gnsssdr_machines[i]->caps;
+                                    max_machine = volk_gnsssdr_machines[i];
+                                }
+                        }
+                }
+            machine = max_machine;
+            //printf("Using Volk machine: %s\n", machine->name);
+            __alignment = machine->alignment;
+            __alignment_mask = (intptr_t)(__alignment - 1);
+            return machine;
        }
-      }
-    }
-    machine = max_machine;
-    //printf("Using Volk machine: %s\n", machine->name);
-    __alignment = machine->alignment;
-    __alignment_mask = (intptr_t)(__alignment-1);
-    return machine;
-  }
 }

 void volk_gnsssdr_list_machines(void)
 {
-  extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
-  extern unsigned int n_volk_gnsssdr_machines;
+    extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
+    extern unsigned int n_volk_gnsssdr_machines;

-  unsigned int i;
-  for(i=0; i<n_volk_gnsssdr_machines; i++) {
-    if(!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) {
-        printf("%s;", volk_gnsssdr_machines[i]->name);
-    }
-  }
-  printf("\n");
+    unsigned int i;
+    for (i = 0; i < n_volk_gnsssdr_machines; i++)
+        {
+            if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
+                {
+                    printf("%s;", volk_gnsssdr_machines[i]->name);
+                }
+        }
+    printf("\n");
 }

-const char* volk_gnsssdr_get_machine(void)
+const char *volk_gnsssdr_get_machine(void)
 {
-  extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
-  extern unsigned int n_volk_gnsssdr_machines;
-  static struct volk_gnsssdr_machine *machine = NULL;
+    extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
+    extern unsigned int n_volk_gnsssdr_machines;
+    static struct volk_gnsssdr_machine *machine = NULL;

-  if(machine != NULL)
-    return machine->name;
-  else {
-    unsigned int max_score = 0;
-    unsigned int i;
-    struct volk_gnsssdr_machine *max_machine = NULL;
-    for(i=0; i<n_volk_gnsssdr_machines; i++) {
-      if(!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) {
-        if(volk_gnsssdr_machines[i]->caps > max_score) {
-          max_score = volk_gnsssdr_machines[i]->caps;
-          max_machine = volk_gnsssdr_machines[i];
+    if (machine != NULL)
+        return machine->name;
+    else
+        {
+            unsigned int max_score = 0;
+            unsigned int i;
+            struct volk_gnsssdr_machine *max_machine = NULL;
+            for (i = 0; i < n_volk_gnsssdr_machines; i++)
+                {
+                    if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
+                        {
+                            if (volk_gnsssdr_machines[i]->caps > max_score)
+                                {
+                                    max_score = volk_gnsssdr_machines[i]->caps;
+                                    max_machine = volk_gnsssdr_machines[i];
+                                }
+                        }
+                }
+            machine = max_machine;
+            return machine->name;
        }
-      }
-    }
-    machine = max_machine;
-    return machine->name;
-  }
 }

 size_t volk_gnsssdr_get_alignment(void)
 {
-    get_machine(); //ensures alignment is set
+    get_machine();  //ensures alignment is set
    return __alignment;
 }

 bool volk_gnsssdr_is_aligned(const void *ptr)
 {
-    return ((intptr_t)(ptr) & __alignment_mask) == 0;
+    return ((intptr_t)(ptr)&__alignment_mask) == 0;
 }

 #define LV_HAVE_GENERIC
 #define LV_HAVE_DISPATCHER

+// clang-format off
+
 %for kern in kernels:

 %if kern.has_dispatcher:
@@ -190,6 +202,8 @@ void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name)
    );
 }

+
+
 volk_gnsssdr_func_desc_t ${kern.name}_get_func_desc(void) {
    const char **impl_names = get_machine()->${kern.name}_impl_names;
    const int *impl_deps = get_machine()->${kern.name}_impl_deps;
@@ -205,3 +219,5 @@ volk_gnsssdr_func_desc_t ${kern.name}_get_func_desc(void) {
 }

 %endfor
+
+    // clang-format on
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h
@@ -42,7 +42,7 @@ typedef struct volk_gnsssdr_func_desc
 VOLK_API void volk_gnsssdr_list_machines(void);

 //! Returns the name of the machine this instance will use
-VOLK_API const char* volk_gnsssdr_get_machine(void);
+VOLK_API const char *volk_gnsssdr_get_machine(void);

 //! Get the machine alignment in bytes
 VOLK_API size_t volk_gnsssdr_get_alignment(void);
@@ -73,6 +73,7 @@ VOLK_API bool volk_gnsssdr_is_aligned(const void *ptr);
 //! A function pointer to the dispatcher implementation
 extern VOLK_API ${kern.pname} ${kern.name};

+// clang-format off
 //! A function pointer to the fastest aligned implementation
 extern VOLK_API ${kern.pname} ${kern.name}_a;

@@ -85,6 +86,7 @@ extern VOLK_API void ${kern.name}_manual(${kern.arglist_full}, const char* impl_
 //! Get description parameters for this kernel
 extern VOLK_API volk_gnsssdr_func_desc_t ${kern.name}_get_func_desc(void);
 %endfor
+// clang-format off

 __VOLK_DECL_END

--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h
@@ -19,10 +19,11 @@
 #ifndef INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H
 #define INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H

+// clang-format off
 %for i, arch in enumerate(archs):
 //#ifndef LV_${arch.name.upper()}
 #define LV_${arch.name.upper()} ${i}
 //#endif
 %endfor
-
+// clang-format on
 #endif /*INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED*/
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.c
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.c
@@ -24,50 +24,54 @@
 struct VOLK_CPU volk_gnsssdr_cpu;

 #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
-    #define VOLK_CPU_x86
+#define VOLK_CPU_x86
 #endif

 #if defined(VOLK_CPU_x86)

 //implement get cpuid for gcc compilers using a system or local copy of cpuid.h
 #if defined(__GNUC__)
-    #include <cpuid.h>
-    #define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r+0, (unsigned int *)r+1, (unsigned int *)r+2, (unsigned int *)r+3)
-    #define cpuid_x86_count(op, count, regs) __cpuid_count(op, count, *((unsigned int*)regs), *((unsigned int*)regs+1), *((unsigned int*)regs+2), *((unsigned int*)regs+3))
+#include <cpuid.h>
+#define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r + 0, (unsigned int *)r + 1, (unsigned int *)r + 2, (unsigned int *)r + 3)
+#define cpuid_x86_count(op, count, regs) __cpuid_count(op, count, *((unsigned int *)regs), *((unsigned int *)regs + 1), *((unsigned int *)regs + 2), *((unsigned int *)regs + 3))

-    /* Return Intel AVX extended CPU capabilities register.
+/* Return Intel AVX extended CPU capabilities register.
     * This function will bomb on non-AVX-capable machines, so
     * check for AVX capability before executing.
     */
-    #if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) && defined(HAVE_XGETBV)
-    static inline unsigned long long _xgetbv(unsigned int index){
-        unsigned int eax, edx;
-        __VOLK_ASM __VOLK_VOLATILE ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
-        return ((unsigned long long)edx << 32) | eax;
-    }
-    #define __xgetbv() _xgetbv(0)
-    #else
-    #define __xgetbv() 0
-    #endif
+#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) && defined(HAVE_XGETBV)
+static inline unsigned long long _xgetbv(unsigned int index)
+{
+    unsigned int eax, edx;
+    __VOLK_ASM __VOLK_VOLATILE("xgetbv"
+                               : "=a"(eax), "=d"(edx)
+                               : "c"(index));
+    return ((unsigned long long)edx << 32) | eax;
+}
+#define __xgetbv() _xgetbv(0)
+#else
+#define __xgetbv() 0
+#endif

 //implement get cpuid for MSVC compilers using __cpuid intrinsic
 #elif defined(_MSC_VER) && defined(HAVE_INTRIN_H)
-    #include <intrin.h>
-    #define cpuid_x86(op, r) __cpuid(((int*)r), op)
+#include <intrin.h>
+#define cpuid_x86(op, r) __cpuid(((int *)r), op)

-    #if defined(_XCR_XFEATURE_ENABLED_MASK)
-    #define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
-    #else
-    #define __xgetbv() 0
-    #endif
+#if defined(_XCR_XFEATURE_ENABLED_MASK)
+#define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
+#else
+#define __xgetbv() 0
+#endif

 #else
-    #error "A get cpuid for volk_gnsssdr is not available on this compiler..."
-#endif //defined(__GNUC__)
+#error "A get cpuid for volk_gnsssdr is not available on this compiler..."
+#endif  //defined(__GNUC__)

-#endif //defined(VOLK_CPU_x86)
+#endif  //defined(VOLK_CPU_x86)

-static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int count, unsigned int reg, unsigned int bit) {
+static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int count, unsigned int reg, unsigned int bit)
+{
 #if defined(VOLK_CPU_x86)
    unsigned int regs[4] = {0};
    cpuid_x86_count(level, count, regs);
@@ -77,10 +81,11 @@ static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int
 #endif
 }

-static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit) {
+static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit)
+{
 #if defined(VOLK_CPU_x86)
    unsigned int regs[4];
-    memset(regs, 0, sizeof(unsigned int)*4);
+    memset(regs, 0, sizeof(unsigned int) * 4);
    cpuid_x86(op, regs);
    return regs[reg] >> bit & 0x01;
 #else
@@ -88,10 +93,11 @@ static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsi
 #endif
 }

-static inline unsigned int check_extended_cpuid(unsigned int val) {
+static inline unsigned int check_extended_cpuid(unsigned int val)
+{
 #if defined(VOLK_CPU_x86)
    unsigned int regs[4];
-    memset(regs, 0, sizeof(unsigned int)*4);
+    memset(regs, 0, sizeof(unsigned int) * 4);
    cpuid_x86(0x80000000, regs);
    return regs[0] >= val;
 #else
@@ -99,7 +105,8 @@ static inline unsigned int check_extended_cpuid(unsigned int val) {
 #endif
 }

-static inline unsigned int get_avx_enabled(void) {
+static inline unsigned int get_avx_enabled(void)
+{
 #if defined(VOLK_CPU_x86)
    return __xgetbv() & 0x6;
 #else
@@ -107,7 +114,8 @@ static inline unsigned int get_avx_enabled(void) {
 #endif
 }

-static inline unsigned int get_avx2_enabled(void) {
+static inline unsigned int get_avx2_enabled(void)
+{
 #if defined(VOLK_CPU_x86)
    return __xgetbv() & 0x6;
 #else
@@ -117,28 +125,30 @@ static inline unsigned int get_avx2_enabled(void) {

 //neon detection is linux specific
 #if defined(__arm__) && defined(__linux__)
-    #include <asm/hwcap.h>
-    #include <linux/auxvec.h>
-    #include <stdio.h>
-    #define VOLK_CPU_ARM
+#include <asm/hwcap.h>
+#include <linux/auxvec.h>
+#include <stdio.h>
+#define VOLK_CPU_ARM
 #endif

-static int has_neon(void){
+static int has_neon(void)
+{
 #if defined(VOLK_CPU_ARM)
    FILE *auxvec_f;
    unsigned long auxvec[2];
    unsigned int found_neon = 0;
    auxvec_f = fopen("/proc/self/auxv", "rb");
-    if(!auxvec_f) return 0;
+    if (!auxvec_f) return 0;

    size_t r = 1;
    //so auxv is basically 32b of ID and 32b of value
    //so it goes like this
-    while(!found_neon && r) {
-      r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f);
-      if((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON))
-        found_neon = 1;
-    }
+    while (!found_neon && r)
+        {
+            r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f);
+            if ((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON))
+                found_neon = 1;
+        }

    fclose(auxvec_f);
    return found_neon;
@@ -146,6 +156,7 @@ static int has_neon(void){
    return 0;
 #endif
 }
+// clang-format off

 %for arch in archs:
 static int i_can_has_${arch.name} (void) {
@@ -195,3 +206,4 @@ unsigned int volk_gnsssdr_get_lvarch() {
    %endfor
    return retval;
 }
+// clang-format on
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h
@@ -23,16 +23,18 @@

 __VOLK_DECL_BEGIN

+// clang-format off
 struct VOLK_CPU {
    %for arch in archs:
    int (*has_${arch.name}) ();
    %endfor
 };
+// clang-format on

 extern struct VOLK_CPU volk_gnsssdr_cpu;

-void volk_gnsssdr_cpu_init ();
-unsigned int volk_gnsssdr_get_lvarch ();
+void volk_gnsssdr_cpu_init();
+unsigned int volk_gnsssdr_get_lvarch();

 __VOLK_DECL_END

--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machine_xxx.tmpl.c
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machine_xxx.tmpl.c
@@ -16,6 +16,8 @@
 * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
 */

+// clang-format off
+
 <% this_machine = machine_dict[args[0]] %>
 <% arch_names = this_machine.arch_names %>

@@ -31,6 +33,7 @@
 #include "config.h"
 #endif

+
 %for kern in kernels:
 #include <volk_gnsssdr/${kern.name}.h>
 %endfor
@@ -56,3 +59,4 @@ struct volk_gnsssdr_machine volk_gnsssdr_machine_${this_machine.name} = {
 <% len_impls = len(impls) %>    ${len_impls},
    %endfor
 };
+// clang-format on
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.c
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.c
@@ -20,6 +20,7 @@
 #include <volk_gnsssdr/volk_gnsssdr_typedefs.h>
 #include "volk_gnsssdr_machines.h"

+// clang-format off
 struct volk_gnsssdr_machine *volk_gnsssdr_machines[] = {
 %for machine in machines:
 #ifdef LV_MACHINE_${machine.name.upper()}
@@ -27,5 +28,5 @@ struct volk_gnsssdr_machine *volk_gnsssdr_machines[] = {
 #endif
 %endfor
 };
-
-unsigned int n_volk_gnsssdr_machines = sizeof(volk_gnsssdr_machines)/sizeof(*volk_gnsssdr_machines);
+// clang-format on
+unsigned int n_volk_gnsssdr_machines = sizeof(volk_gnsssdr_machines) / sizeof(*volk_gnsssdr_machines);
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h
@@ -27,6 +27,7 @@

 __VOLK_DECL_BEGIN

+// clang-format off
 struct volk_gnsssdr_machine {
    const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_gnsssdr_get_lvarch format)
    const char *name;
@@ -48,5 +49,6 @@ extern struct volk_gnsssdr_machine volk_gnsssdr_machine_${machine.name};
 %endfor

 __VOLK_DECL_END
+// clang-format on

-#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
+#endif  //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h
@@ -22,8 +22,10 @@
 #include <inttypes.h>
 #include <volk_gnsssdr/volk_gnsssdr_complex.h>

+// clang-format off
 %for kern in kernels:
 typedef void (*${kern.pname})(${kern.arglist_types});
 %endfor
+// clang-format on

 #endif /*INCLUDED_VOLK_GNSSSDR_TYPEDEFS*/
--- a/src/algorithms/telemetry_decoder/gnuradio_blocks/gps_l2c_telemetry_decoder_cc.h
+++ b/src/algorithms/telemetry_decoder/gnuradio_blocks/gps_l2c_telemetry_decoder_cc.h
@@ -45,7 +45,8 @@
 #include <vector>


-extern "C" {
+extern "C"
+{
 #include "cnav_msg.h"
 #include "edc.h"
 #include "bits.h"
--- a/src/algorithms/telemetry_decoder/gnuradio_blocks/gps_l5_telemetry_decoder_cc.h
+++ b/src/algorithms/telemetry_decoder/gnuradio_blocks/gps_l5_telemetry_decoder_cc.h
@@ -42,7 +42,8 @@
 #include <utility>
 #include <vector>

-extern "C" {
+extern "C"
+{
 #include "cnav_msg.h"
 #include "edc.h"
 #include "bits.h"
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.h
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h
@@ -76,9 +76,9 @@ struct GPU_Complex
    }
    CUDA_CALLABLE_MEMBER_DEVICE void multiply_acc(const GPU_Complex& a, const GPU_Complex& b)
    {
-    //c=a*b+c
-    //real part
-    //c.r=(a.r*b.r - a.i*b.i)+c.r
+        //c=a*b+c
+        //real part
+        //c.r=(a.r*b.r - a.i*b.i)+c.r
 #ifdef __CUDACC__
        r = __fmaf_rn(a.r, b.r, r);
        r = __fmaf_rn(-a.i, b.i, r);