mirror of
https://github.com/gnss-sdr/gnss-sdr
synced 2025-11-17 15:47:15 +00:00
Apply clang-format to the whole source tree
This commit is contained in:
@@ -54,9 +54,10 @@ galileo_pcps_8ms_acquisition_cc::galileo_pcps_8ms_acquisition_cc(
|
||||
unsigned int sampled_ms, unsigned int max_dwells,
|
||||
unsigned int doppler_max, long freq, long fs_in,
|
||||
int samples_per_ms, int samples_per_code,
|
||||
bool dump, std::string dump_filename) : gr::block("galileo_pcps_8ms_acquisition_cc",
|
||||
gr::io_signature::make(1, 1, sizeof(gr_complex) * sampled_ms * samples_per_ms),
|
||||
gr::io_signature::make(0, 0, sizeof(gr_complex) * sampled_ms * samples_per_ms))
|
||||
bool dump,
|
||||
std::string dump_filename) : gr::block("galileo_pcps_8ms_acquisition_cc",
|
||||
gr::io_signature::make(1, 1, sizeof(gr_complex) * sampled_ms * samples_per_ms),
|
||||
gr::io_signature::make(0, 0, sizeof(gr_complex) * sampled_ms * samples_per_ms))
|
||||
{
|
||||
this->message_port_register_out(pmt::mp("events"));
|
||||
d_sample_counter = 0; // SAMPLE COUNTER
|
||||
|
||||
@@ -71,9 +71,7 @@ gps_pcps_acquisition_fpga_sc::gps_pcps_acquisition_fpga_sc(
|
||||
int samples_per_code, int vector_length, unsigned int nsamples_total,
|
||||
bool bit_transition_flag, bool use_CFAR_algorithm_flag,
|
||||
unsigned int select_queue_Fpga, std::string device_name, bool dump,
|
||||
std::string dump_filename) :
|
||||
|
||||
gr::block("pcps_acquisition_fpga_sc",
|
||||
std::string dump_filename) : gr::block("pcps_acquisition_fpga_sc",
|
||||
gr::io_signature::make(0, 0, sizeof(lv_16sc_t)),
|
||||
gr::io_signature::make(0, 0, 0))
|
||||
{
|
||||
|
||||
@@ -65,9 +65,10 @@ pcps_acquisition::pcps_acquisition(
|
||||
int samples_per_ms, int samples_per_code,
|
||||
bool bit_transition_flag, bool use_CFAR_algorithm_flag,
|
||||
bool dump, bool blocking,
|
||||
std::string dump_filename, size_t it_size) : gr::block("pcps_acquisition",
|
||||
gr::io_signature::make(1, 1, it_size * sampled_ms * samples_per_ms * (bit_transition_flag ? 2 : 1)),
|
||||
gr::io_signature::make(0, 0, it_size * sampled_ms * samples_per_ms * (bit_transition_flag ? 2 : 1)))
|
||||
std::string dump_filename,
|
||||
size_t it_size) : gr::block("pcps_acquisition",
|
||||
gr::io_signature::make(1, 1, it_size * sampled_ms * samples_per_ms * (bit_transition_flag ? 2 : 1)),
|
||||
gr::io_signature::make(0, 0, it_size * sampled_ms * samples_per_ms * (bit_transition_flag ? 2 : 1)))
|
||||
{
|
||||
this->message_port_register_out(pmt::mp("events"));
|
||||
|
||||
|
||||
@@ -61,9 +61,10 @@ pcps_cccwsr_acquisition_cc::pcps_cccwsr_acquisition_cc(
|
||||
unsigned int sampled_ms, unsigned int max_dwells,
|
||||
unsigned int doppler_max, long freq, long fs_in,
|
||||
int samples_per_ms, int samples_per_code,
|
||||
bool dump, std::string dump_filename) : gr::block("pcps_cccwsr_acquisition_cc",
|
||||
gr::io_signature::make(1, 1, sizeof(gr_complex) * sampled_ms * samples_per_ms),
|
||||
gr::io_signature::make(0, 0, sizeof(gr_complex) * sampled_ms * samples_per_ms))
|
||||
bool dump,
|
||||
std::string dump_filename) : gr::block("pcps_cccwsr_acquisition_cc",
|
||||
gr::io_signature::make(1, 1, sizeof(gr_complex) * sampled_ms * samples_per_ms),
|
||||
gr::io_signature::make(0, 0, sizeof(gr_complex) * sampled_ms * samples_per_ms))
|
||||
{
|
||||
this->message_port_register_out(pmt::mp("events"));
|
||||
d_sample_counter = 0; // SAMPLE COUNTER
|
||||
|
||||
@@ -67,9 +67,10 @@ pcps_quicksync_acquisition_cc::pcps_quicksync_acquisition_cc(
|
||||
unsigned int doppler_max, long freq, long fs_in,
|
||||
int samples_per_ms, int samples_per_code,
|
||||
bool bit_transition_flag,
|
||||
bool dump, std::string dump_filename) : gr::block("pcps_quicksync_acquisition_cc",
|
||||
gr::io_signature::make(1, 1, (sizeof(gr_complex) * sampled_ms * samples_per_ms)),
|
||||
gr::io_signature::make(0, 0, (sizeof(gr_complex) * sampled_ms * samples_per_ms)))
|
||||
bool dump,
|
||||
std::string dump_filename) : gr::block("pcps_quicksync_acquisition_cc",
|
||||
gr::io_signature::make(1, 1, (sizeof(gr_complex) * sampled_ms * samples_per_ms)),
|
||||
gr::io_signature::make(0, 0, (sizeof(gr_complex) * sampled_ms * samples_per_ms)))
|
||||
{
|
||||
this->message_port_register_out(pmt::mp("events"));
|
||||
d_sample_counter = 0; // SAMPLE COUNTER
|
||||
|
||||
@@ -76,9 +76,10 @@ pcps_tong_acquisition_cc::pcps_tong_acquisition_cc(
|
||||
long freq, long fs_in, int samples_per_ms,
|
||||
int samples_per_code, unsigned int tong_init_val,
|
||||
unsigned int tong_max_val, unsigned int tong_max_dwells,
|
||||
bool dump, std::string dump_filename) : gr::block("pcps_tong_acquisition_cc",
|
||||
gr::io_signature::make(1, 1, sizeof(gr_complex) * sampled_ms * samples_per_ms),
|
||||
gr::io_signature::make(0, 0, sizeof(gr_complex) * sampled_ms * samples_per_ms))
|
||||
bool dump,
|
||||
std::string dump_filename) : gr::block("pcps_tong_acquisition_cc",
|
||||
gr::io_signature::make(1, 1, sizeof(gr_complex) * sampled_ms * samples_per_ms),
|
||||
gr::io_signature::make(0, 0, sizeof(gr_complex) * sampled_ms * samples_per_ms))
|
||||
{
|
||||
this->message_port_register_out(pmt::mp("events"));
|
||||
d_sample_counter = 0; // SAMPLE COUNTER
|
||||
|
||||
@@ -50,85 +50,86 @@
|
||||
#define __CLFFT_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/opencl.h>
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/cl.h>
|
||||
#include <CL/cl.h>
|
||||
#endif
|
||||
|
||||
// XForm type
|
||||
typedef enum
|
||||
{
|
||||
clFFT_Forward = -1,
|
||||
clFFT_Inverse = 1
|
||||
|
||||
}clFFT_Direction;
|
||||
// XForm type
|
||||
typedef enum
|
||||
{
|
||||
clFFT_Forward = -1,
|
||||
clFFT_Inverse = 1
|
||||
|
||||
// XForm dimension
|
||||
typedef enum
|
||||
{
|
||||
clFFT_1D = 0,
|
||||
clFFT_2D = 1,
|
||||
clFFT_3D = 3
|
||||
|
||||
}clFFT_Dimension;
|
||||
} clFFT_Direction;
|
||||
|
||||
// XForm Data type
|
||||
typedef enum
|
||||
{
|
||||
clFFT_SplitComplexFormat = 0,
|
||||
clFFT_InterleavedComplexFormat = 1
|
||||
}clFFT_DataFormat;
|
||||
// XForm dimension
|
||||
typedef enum
|
||||
{
|
||||
clFFT_1D = 0,
|
||||
clFFT_2D = 1,
|
||||
clFFT_3D = 3
|
||||
|
||||
typedef struct
|
||||
{
|
||||
unsigned int x;
|
||||
unsigned int y;
|
||||
unsigned int z;
|
||||
}clFFT_Dim3;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
float *real;
|
||||
float *imag;
|
||||
} clFFT_SplitComplex;
|
||||
} clFFT_Dimension;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
float real;
|
||||
float imag;
|
||||
}clFFT_Complex;
|
||||
// XForm Data type
|
||||
typedef enum
|
||||
{
|
||||
clFFT_SplitComplexFormat = 0,
|
||||
clFFT_InterleavedComplexFormat = 1
|
||||
} clFFT_DataFormat;
|
||||
|
||||
typedef void* clFFT_Plan;
|
||||
typedef struct
|
||||
{
|
||||
unsigned int x;
|
||||
unsigned int y;
|
||||
unsigned int z;
|
||||
} clFFT_Dim3;
|
||||
|
||||
clFFT_Plan clFFT_CreatePlan( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code );
|
||||
typedef struct
|
||||
{
|
||||
float *real;
|
||||
float *imag;
|
||||
} clFFT_SplitComplex;
|
||||
|
||||
void clFFT_DestroyPlan( clFFT_Plan plan );
|
||||
typedef struct
|
||||
{
|
||||
float real;
|
||||
float imag;
|
||||
} clFFT_Complex;
|
||||
|
||||
cl_int clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir,
|
||||
cl_mem data_in, cl_mem data_out,
|
||||
cl_int num_events, cl_event *event_list, cl_event *event );
|
||||
typedef void *clFFT_Plan;
|
||||
|
||||
cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir,
|
||||
cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
|
||||
cl_int num_events, cl_event *event_list, cl_event *event );
|
||||
clFFT_Plan clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code);
|
||||
|
||||
cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
|
||||
size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
|
||||
|
||||
void clFFT_DestroyPlan(clFFT_Plan plan);
|
||||
|
||||
cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
|
||||
size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
|
||||
|
||||
void clFFT_DumpPlan( clFFT_Plan plan, FILE *file);
|
||||
cl_int clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir,
|
||||
cl_mem data_in, cl_mem data_out,
|
||||
cl_int num_events, cl_event *event_list, cl_event *event);
|
||||
|
||||
cl_int clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir,
|
||||
cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
|
||||
cl_int num_events, cl_event *event_list, cl_event *event);
|
||||
|
||||
cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
|
||||
size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
|
||||
|
||||
|
||||
cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
|
||||
size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
|
||||
|
||||
void clFFT_DumpPlan(clFFT_Plan plan, FILE *file);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -54,224 +54,220 @@
|
||||
using namespace std;
|
||||
|
||||
static string baseKernels = string(
|
||||
"#ifndef M_PI\n"
|
||||
"#define M_PI 0x1.921fb54442d18p+1\n"
|
||||
"#endif\n"
|
||||
"#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n"
|
||||
"#define conj(a) ((float2)((a).x, -(a).y))\n"
|
||||
"#define conjTransp(a) ((float2)(-(a).y, (a).x))\n"
|
||||
"\n"
|
||||
"#define fftKernel2(a,dir) \\\n"
|
||||
"{ \\\n"
|
||||
" float2 c = (a)[0]; \\\n"
|
||||
" (a)[0] = c + (a)[1]; \\\n"
|
||||
" (a)[1] = c - (a)[1]; \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define fftKernel2S(d1,d2,dir) \\\n"
|
||||
"{ \\\n"
|
||||
" float2 c = (d1); \\\n"
|
||||
" (d1) = c + (d2); \\\n"
|
||||
" (d2) = c - (d2); \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define fftKernel4(a,dir) \\\n"
|
||||
"{ \\\n"
|
||||
" fftKernel2S((a)[0], (a)[2], dir); \\\n"
|
||||
" fftKernel2S((a)[1], (a)[3], dir); \\\n"
|
||||
" fftKernel2S((a)[0], (a)[1], dir); \\\n"
|
||||
" (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
|
||||
" fftKernel2S((a)[2], (a)[3], dir); \\\n"
|
||||
" float2 c = (a)[1]; \\\n"
|
||||
" (a)[1] = (a)[2]; \\\n"
|
||||
" (a)[2] = c; \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define fftKernel4s(a0,a1,a2,a3,dir) \\\n"
|
||||
"{ \\\n"
|
||||
" fftKernel2S((a0), (a2), dir); \\\n"
|
||||
" fftKernel2S((a1), (a3), dir); \\\n"
|
||||
" fftKernel2S((a0), (a1), dir); \\\n"
|
||||
" (a3) = (float2)(dir)*(conjTransp((a3))); \\\n"
|
||||
" fftKernel2S((a2), (a3), dir); \\\n"
|
||||
" float2 c = (a1); \\\n"
|
||||
" (a1) = (a2); \\\n"
|
||||
" (a2) = c; \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define bitreverse8(a) \\\n"
|
||||
"{ \\\n"
|
||||
" float2 c; \\\n"
|
||||
" c = (a)[1]; \\\n"
|
||||
" (a)[1] = (a)[4]; \\\n"
|
||||
" (a)[4] = c; \\\n"
|
||||
" c = (a)[3]; \\\n"
|
||||
" (a)[3] = (a)[6]; \\\n"
|
||||
" (a)[6] = c; \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define fftKernel8(a,dir) \\\n"
|
||||
"{ \\\n"
|
||||
" const float2 w1 = (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n"
|
||||
" const float2 w3 = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n"
|
||||
" float2 c; \\\n"
|
||||
" fftKernel2S((a)[0], (a)[4], dir); \\\n"
|
||||
" fftKernel2S((a)[1], (a)[5], dir); \\\n"
|
||||
" fftKernel2S((a)[2], (a)[6], dir); \\\n"
|
||||
" fftKernel2S((a)[3], (a)[7], dir); \\\n"
|
||||
" (a)[5] = complexMul(w1, (a)[5]); \\\n"
|
||||
" (a)[6] = (float2)(dir)*(conjTransp((a)[6])); \\\n"
|
||||
" (a)[7] = complexMul(w3, (a)[7]); \\\n"
|
||||
" fftKernel2S((a)[0], (a)[2], dir); \\\n"
|
||||
" fftKernel2S((a)[1], (a)[3], dir); \\\n"
|
||||
" fftKernel2S((a)[4], (a)[6], dir); \\\n"
|
||||
" fftKernel2S((a)[5], (a)[7], dir); \\\n"
|
||||
" (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
|
||||
" (a)[7] = (float2)(dir)*(conjTransp((a)[7])); \\\n"
|
||||
" fftKernel2S((a)[0], (a)[1], dir); \\\n"
|
||||
" fftKernel2S((a)[2], (a)[3], dir); \\\n"
|
||||
" fftKernel2S((a)[4], (a)[5], dir); \\\n"
|
||||
" fftKernel2S((a)[6], (a)[7], dir); \\\n"
|
||||
" bitreverse8((a)); \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define bitreverse4x4(a) \\\n"
|
||||
"{ \\\n"
|
||||
" float2 c; \\\n"
|
||||
" c = (a)[1]; (a)[1] = (a)[4]; (a)[4] = c; \\\n"
|
||||
" c = (a)[2]; (a)[2] = (a)[8]; (a)[8] = c; \\\n"
|
||||
" c = (a)[3]; (a)[3] = (a)[12]; (a)[12] = c; \\\n"
|
||||
" c = (a)[6]; (a)[6] = (a)[9]; (a)[9] = c; \\\n"
|
||||
" c = (a)[7]; (a)[7] = (a)[13]; (a)[13] = c; \\\n"
|
||||
" c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define fftKernel16(a,dir) \\\n"
|
||||
"{ \\\n"
|
||||
" const float w0 = 0x1.d906bcp-1f; \\\n"
|
||||
" const float w1 = 0x1.87de2ap-2f; \\\n"
|
||||
" const float w2 = 0x1.6a09e6p-1f; \\\n"
|
||||
" fftKernel4s((a)[0], (a)[4], (a)[8], (a)[12], dir); \\\n"
|
||||
" fftKernel4s((a)[1], (a)[5], (a)[9], (a)[13], dir); \\\n"
|
||||
" fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \\\n"
|
||||
" fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \\\n"
|
||||
" (a)[5] = complexMul((a)[5], (float2)(w0, dir*w1)); \\\n"
|
||||
" (a)[6] = complexMul((a)[6], (float2)(w2, dir*w2)); \\\n"
|
||||
" (a)[7] = complexMul((a)[7], (float2)(w1, dir*w0)); \\\n"
|
||||
" (a)[9] = complexMul((a)[9], (float2)(w2, dir*w2)); \\\n"
|
||||
" (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \\\n"
|
||||
" (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \\\n"
|
||||
" (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \\\n"
|
||||
" (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \\\n"
|
||||
" (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \\\n"
|
||||
" fftKernel4((a), dir); \\\n"
|
||||
" fftKernel4((a) + 4, dir); \\\n"
|
||||
" fftKernel4((a) + 8, dir); \\\n"
|
||||
" fftKernel4((a) + 12, dir); \\\n"
|
||||
" bitreverse4x4((a)); \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define bitreverse32(a) \\\n"
|
||||
"{ \\\n"
|
||||
" float2 c1, c2; \\\n"
|
||||
" c1 = (a)[2]; (a)[2] = (a)[1]; c2 = (a)[4]; (a)[4] = c1; c1 = (a)[8]; (a)[8] = c2; c2 = (a)[16]; (a)[16] = c1; (a)[1] = c2; \\\n"
|
||||
" c1 = (a)[6]; (a)[6] = (a)[3]; c2 = (a)[12]; (a)[12] = c1; c1 = (a)[24]; (a)[24] = c2; c2 = (a)[17]; (a)[17] = c1; (a)[3] = c2; \\\n"
|
||||
" c1 = (a)[10]; (a)[10] = (a)[5]; c2 = (a)[20]; (a)[20] = c1; c1 = (a)[9]; (a)[9] = c2; c2 = (a)[18]; (a)[18] = c1; (a)[5] = c2; \\\n"
|
||||
" c1 = (a)[14]; (a)[14] = (a)[7]; c2 = (a)[28]; (a)[28] = c1; c1 = (a)[25]; (a)[25] = c2; c2 = (a)[19]; (a)[19] = c1; (a)[7] = c2; \\\n"
|
||||
" c1 = (a)[22]; (a)[22] = (a)[11]; c2 = (a)[13]; (a)[13] = c1; c1 = (a)[26]; (a)[26] = c2; c2 = (a)[21]; (a)[21] = c1; (a)[11] = c2; \\\n"
|
||||
" c1 = (a)[30]; (a)[30] = (a)[15]; c2 = (a)[29]; (a)[29] = c1; c1 = (a)[27]; (a)[27] = c2; c2 = (a)[23]; (a)[23] = c1; (a)[15] = c2; \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define fftKernel32(a,dir) \\\n"
|
||||
"{ \\\n"
|
||||
" fftKernel2S((a)[0], (a)[16], dir); \\\n"
|
||||
" fftKernel2S((a)[1], (a)[17], dir); \\\n"
|
||||
" fftKernel2S((a)[2], (a)[18], dir); \\\n"
|
||||
" fftKernel2S((a)[3], (a)[19], dir); \\\n"
|
||||
" fftKernel2S((a)[4], (a)[20], dir); \\\n"
|
||||
" fftKernel2S((a)[5], (a)[21], dir); \\\n"
|
||||
" fftKernel2S((a)[6], (a)[22], dir); \\\n"
|
||||
" fftKernel2S((a)[7], (a)[23], dir); \\\n"
|
||||
" fftKernel2S((a)[8], (a)[24], dir); \\\n"
|
||||
" fftKernel2S((a)[9], (a)[25], dir); \\\n"
|
||||
" fftKernel2S((a)[10], (a)[26], dir); \\\n"
|
||||
" fftKernel2S((a)[11], (a)[27], dir); \\\n"
|
||||
" fftKernel2S((a)[12], (a)[28], dir); \\\n"
|
||||
" fftKernel2S((a)[13], (a)[29], dir); \\\n"
|
||||
" fftKernel2S((a)[14], (a)[30], dir); \\\n"
|
||||
" fftKernel2S((a)[15], (a)[31], dir); \\\n"
|
||||
" (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
|
||||
" (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
|
||||
" (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
|
||||
" (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
|
||||
" (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
|
||||
" (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
|
||||
" (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
|
||||
" (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \\\n"
|
||||
" (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
|
||||
" (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
|
||||
" (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
|
||||
" (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
|
||||
" (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
|
||||
" (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
|
||||
" (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
|
||||
" fftKernel16((a), dir); \\\n"
|
||||
" fftKernel16((a) + 16, dir); \\\n"
|
||||
" bitreverse32((a)); \\\n"
|
||||
"}\n\n"
|
||||
);
|
||||
"#ifndef M_PI\n"
|
||||
"#define M_PI 0x1.921fb54442d18p+1\n"
|
||||
"#endif\n"
|
||||
"#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n"
|
||||
"#define conj(a) ((float2)((a).x, -(a).y))\n"
|
||||
"#define conjTransp(a) ((float2)(-(a).y, (a).x))\n"
|
||||
"\n"
|
||||
"#define fftKernel2(a,dir) \\\n"
|
||||
"{ \\\n"
|
||||
" float2 c = (a)[0]; \\\n"
|
||||
" (a)[0] = c + (a)[1]; \\\n"
|
||||
" (a)[1] = c - (a)[1]; \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define fftKernel2S(d1,d2,dir) \\\n"
|
||||
"{ \\\n"
|
||||
" float2 c = (d1); \\\n"
|
||||
" (d1) = c + (d2); \\\n"
|
||||
" (d2) = c - (d2); \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define fftKernel4(a,dir) \\\n"
|
||||
"{ \\\n"
|
||||
" fftKernel2S((a)[0], (a)[2], dir); \\\n"
|
||||
" fftKernel2S((a)[1], (a)[3], dir); \\\n"
|
||||
" fftKernel2S((a)[0], (a)[1], dir); \\\n"
|
||||
" (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
|
||||
" fftKernel2S((a)[2], (a)[3], dir); \\\n"
|
||||
" float2 c = (a)[1]; \\\n"
|
||||
" (a)[1] = (a)[2]; \\\n"
|
||||
" (a)[2] = c; \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define fftKernel4s(a0,a1,a2,a3,dir) \\\n"
|
||||
"{ \\\n"
|
||||
" fftKernel2S((a0), (a2), dir); \\\n"
|
||||
" fftKernel2S((a1), (a3), dir); \\\n"
|
||||
" fftKernel2S((a0), (a1), dir); \\\n"
|
||||
" (a3) = (float2)(dir)*(conjTransp((a3))); \\\n"
|
||||
" fftKernel2S((a2), (a3), dir); \\\n"
|
||||
" float2 c = (a1); \\\n"
|
||||
" (a1) = (a2); \\\n"
|
||||
" (a2) = c; \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define bitreverse8(a) \\\n"
|
||||
"{ \\\n"
|
||||
" float2 c; \\\n"
|
||||
" c = (a)[1]; \\\n"
|
||||
" (a)[1] = (a)[4]; \\\n"
|
||||
" (a)[4] = c; \\\n"
|
||||
" c = (a)[3]; \\\n"
|
||||
" (a)[3] = (a)[6]; \\\n"
|
||||
" (a)[6] = c; \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define fftKernel8(a,dir) \\\n"
|
||||
"{ \\\n"
|
||||
" const float2 w1 = (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n"
|
||||
" const float2 w3 = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n"
|
||||
" float2 c; \\\n"
|
||||
" fftKernel2S((a)[0], (a)[4], dir); \\\n"
|
||||
" fftKernel2S((a)[1], (a)[5], dir); \\\n"
|
||||
" fftKernel2S((a)[2], (a)[6], dir); \\\n"
|
||||
" fftKernel2S((a)[3], (a)[7], dir); \\\n"
|
||||
" (a)[5] = complexMul(w1, (a)[5]); \\\n"
|
||||
" (a)[6] = (float2)(dir)*(conjTransp((a)[6])); \\\n"
|
||||
" (a)[7] = complexMul(w3, (a)[7]); \\\n"
|
||||
" fftKernel2S((a)[0], (a)[2], dir); \\\n"
|
||||
" fftKernel2S((a)[1], (a)[3], dir); \\\n"
|
||||
" fftKernel2S((a)[4], (a)[6], dir); \\\n"
|
||||
" fftKernel2S((a)[5], (a)[7], dir); \\\n"
|
||||
" (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
|
||||
" (a)[7] = (float2)(dir)*(conjTransp((a)[7])); \\\n"
|
||||
" fftKernel2S((a)[0], (a)[1], dir); \\\n"
|
||||
" fftKernel2S((a)[2], (a)[3], dir); \\\n"
|
||||
" fftKernel2S((a)[4], (a)[5], dir); \\\n"
|
||||
" fftKernel2S((a)[6], (a)[7], dir); \\\n"
|
||||
" bitreverse8((a)); \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define bitreverse4x4(a) \\\n"
|
||||
"{ \\\n"
|
||||
" float2 c; \\\n"
|
||||
" c = (a)[1]; (a)[1] = (a)[4]; (a)[4] = c; \\\n"
|
||||
" c = (a)[2]; (a)[2] = (a)[8]; (a)[8] = c; \\\n"
|
||||
" c = (a)[3]; (a)[3] = (a)[12]; (a)[12] = c; \\\n"
|
||||
" c = (a)[6]; (a)[6] = (a)[9]; (a)[9] = c; \\\n"
|
||||
" c = (a)[7]; (a)[7] = (a)[13]; (a)[13] = c; \\\n"
|
||||
" c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define fftKernel16(a,dir) \\\n"
|
||||
"{ \\\n"
|
||||
" const float w0 = 0x1.d906bcp-1f; \\\n"
|
||||
" const float w1 = 0x1.87de2ap-2f; \\\n"
|
||||
" const float w2 = 0x1.6a09e6p-1f; \\\n"
|
||||
" fftKernel4s((a)[0], (a)[4], (a)[8], (a)[12], dir); \\\n"
|
||||
" fftKernel4s((a)[1], (a)[5], (a)[9], (a)[13], dir); \\\n"
|
||||
" fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \\\n"
|
||||
" fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \\\n"
|
||||
" (a)[5] = complexMul((a)[5], (float2)(w0, dir*w1)); \\\n"
|
||||
" (a)[6] = complexMul((a)[6], (float2)(w2, dir*w2)); \\\n"
|
||||
" (a)[7] = complexMul((a)[7], (float2)(w1, dir*w0)); \\\n"
|
||||
" (a)[9] = complexMul((a)[9], (float2)(w2, dir*w2)); \\\n"
|
||||
" (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \\\n"
|
||||
" (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \\\n"
|
||||
" (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \\\n"
|
||||
" (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \\\n"
|
||||
" (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \\\n"
|
||||
" fftKernel4((a), dir); \\\n"
|
||||
" fftKernel4((a) + 4, dir); \\\n"
|
||||
" fftKernel4((a) + 8, dir); \\\n"
|
||||
" fftKernel4((a) + 12, dir); \\\n"
|
||||
" bitreverse4x4((a)); \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define bitreverse32(a) \\\n"
|
||||
"{ \\\n"
|
||||
" float2 c1, c2; \\\n"
|
||||
" c1 = (a)[2]; (a)[2] = (a)[1]; c2 = (a)[4]; (a)[4] = c1; c1 = (a)[8]; (a)[8] = c2; c2 = (a)[16]; (a)[16] = c1; (a)[1] = c2; \\\n"
|
||||
" c1 = (a)[6]; (a)[6] = (a)[3]; c2 = (a)[12]; (a)[12] = c1; c1 = (a)[24]; (a)[24] = c2; c2 = (a)[17]; (a)[17] = c1; (a)[3] = c2; \\\n"
|
||||
" c1 = (a)[10]; (a)[10] = (a)[5]; c2 = (a)[20]; (a)[20] = c1; c1 = (a)[9]; (a)[9] = c2; c2 = (a)[18]; (a)[18] = c1; (a)[5] = c2; \\\n"
|
||||
" c1 = (a)[14]; (a)[14] = (a)[7]; c2 = (a)[28]; (a)[28] = c1; c1 = (a)[25]; (a)[25] = c2; c2 = (a)[19]; (a)[19] = c1; (a)[7] = c2; \\\n"
|
||||
" c1 = (a)[22]; (a)[22] = (a)[11]; c2 = (a)[13]; (a)[13] = c1; c1 = (a)[26]; (a)[26] = c2; c2 = (a)[21]; (a)[21] = c1; (a)[11] = c2; \\\n"
|
||||
" c1 = (a)[30]; (a)[30] = (a)[15]; c2 = (a)[29]; (a)[29] = c1; c1 = (a)[27]; (a)[27] = c2; c2 = (a)[23]; (a)[23] = c1; (a)[15] = c2; \\\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"#define fftKernel32(a,dir) \\\n"
|
||||
"{ \\\n"
|
||||
" fftKernel2S((a)[0], (a)[16], dir); \\\n"
|
||||
" fftKernel2S((a)[1], (a)[17], dir); \\\n"
|
||||
" fftKernel2S((a)[2], (a)[18], dir); \\\n"
|
||||
" fftKernel2S((a)[3], (a)[19], dir); \\\n"
|
||||
" fftKernel2S((a)[4], (a)[20], dir); \\\n"
|
||||
" fftKernel2S((a)[5], (a)[21], dir); \\\n"
|
||||
" fftKernel2S((a)[6], (a)[22], dir); \\\n"
|
||||
" fftKernel2S((a)[7], (a)[23], dir); \\\n"
|
||||
" fftKernel2S((a)[8], (a)[24], dir); \\\n"
|
||||
" fftKernel2S((a)[9], (a)[25], dir); \\\n"
|
||||
" fftKernel2S((a)[10], (a)[26], dir); \\\n"
|
||||
" fftKernel2S((a)[11], (a)[27], dir); \\\n"
|
||||
" fftKernel2S((a)[12], (a)[28], dir); \\\n"
|
||||
" fftKernel2S((a)[13], (a)[29], dir); \\\n"
|
||||
" fftKernel2S((a)[14], (a)[30], dir); \\\n"
|
||||
" fftKernel2S((a)[15], (a)[31], dir); \\\n"
|
||||
" (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
|
||||
" (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
|
||||
" (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
|
||||
" (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
|
||||
" (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
|
||||
" (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
|
||||
" (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
|
||||
" (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \\\n"
|
||||
" (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
|
||||
" (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
|
||||
" (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
|
||||
" (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
|
||||
" (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
|
||||
" (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
|
||||
" (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
|
||||
" fftKernel16((a), dir); \\\n"
|
||||
" fftKernel16((a) + 16, dir); \\\n"
|
||||
" bitreverse32((a)); \\\n"
|
||||
"}\n\n");
|
||||
|
||||
static string twistKernelInterleaved = string(
|
||||
"__kernel void \\\n"
|
||||
"clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
|
||||
"{ \\\n"
|
||||
" float2 a, w; \\\n"
|
||||
" float ang; \\\n"
|
||||
" unsigned int j; \\\n"
|
||||
" unsigned int i = get_global_id(0); \\\n"
|
||||
" unsigned int startIndex = i; \\\n"
|
||||
" \\\n"
|
||||
" if(i < numCols) \\\n"
|
||||
" { \\\n"
|
||||
" for(j = 0; j < numRowsToProcess; j++) \\\n"
|
||||
" { \\\n"
|
||||
" a = in[startIndex]; \\\n"
|
||||
" ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
|
||||
" w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
|
||||
" a = complexMul(a, w); \\\n"
|
||||
" in[startIndex] = a; \\\n"
|
||||
" startIndex += numCols; \\\n"
|
||||
" } \\\n"
|
||||
" } \\\n"
|
||||
"} \\\n"
|
||||
);
|
||||
"__kernel void \\\n"
|
||||
"clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
|
||||
"{ \\\n"
|
||||
" float2 a, w; \\\n"
|
||||
" float ang; \\\n"
|
||||
" unsigned int j; \\\n"
|
||||
" unsigned int i = get_global_id(0); \\\n"
|
||||
" unsigned int startIndex = i; \\\n"
|
||||
" \\\n"
|
||||
" if(i < numCols) \\\n"
|
||||
" { \\\n"
|
||||
" for(j = 0; j < numRowsToProcess; j++) \\\n"
|
||||
" { \\\n"
|
||||
" a = in[startIndex]; \\\n"
|
||||
" ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
|
||||
" w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
|
||||
" a = complexMul(a, w); \\\n"
|
||||
" in[startIndex] = a; \\\n"
|
||||
" startIndex += numCols; \\\n"
|
||||
" } \\\n"
|
||||
" } \\\n"
|
||||
"} \\\n");
|
||||
|
||||
static string twistKernelPlannar = string(
|
||||
"__kernel void \\\n"
|
||||
"clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
|
||||
"{ \\\n"
|
||||
" float2 a, w; \\\n"
|
||||
" float ang; \\\n"
|
||||
" unsigned int j; \\\n"
|
||||
" unsigned int i = get_global_id(0); \\\n"
|
||||
" unsigned int startIndex = i; \\\n"
|
||||
" \\\n"
|
||||
" if(i < numCols) \\\n"
|
||||
" { \\\n"
|
||||
" for(j = 0; j < numRowsToProcess; j++) \\\n"
|
||||
" { \\\n"
|
||||
" a = (float2)(in_real[startIndex], in_imag[startIndex]); \\\n"
|
||||
" ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
|
||||
" w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
|
||||
" a = complexMul(a, w); \\\n"
|
||||
" in_real[startIndex] = a.x; \\\n"
|
||||
" in_imag[startIndex] = a.y; \\\n"
|
||||
" startIndex += numCols; \\\n"
|
||||
" } \\\n"
|
||||
" } \\\n"
|
||||
"} \\\n"
|
||||
);
|
||||
|
||||
"__kernel void \\\n"
|
||||
"clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
|
||||
"{ \\\n"
|
||||
" float2 a, w; \\\n"
|
||||
" float ang; \\\n"
|
||||
" unsigned int j; \\\n"
|
||||
" unsigned int i = get_global_id(0); \\\n"
|
||||
" unsigned int startIndex = i; \\\n"
|
||||
" \\\n"
|
||||
" if(i < numCols) \\\n"
|
||||
" { \\\n"
|
||||
" for(j = 0; j < numRowsToProcess; j++) \\\n"
|
||||
" { \\\n"
|
||||
" a = (float2)(in_real[startIndex], in_imag[startIndex]); \\\n"
|
||||
" ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
|
||||
" w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
|
||||
" a = complexMul(a, w); \\\n"
|
||||
" in_real[startIndex] = a.x; \\\n"
|
||||
" in_imag[startIndex] = a.y; \\\n"
|
||||
" startIndex += numCols; \\\n"
|
||||
" } \\\n"
|
||||
" } \\\n"
|
||||
"} \\\n");
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -52,354 +52,352 @@
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#define max(a,b) (((a)>(b)) ? (a) : (b))
|
||||
#define min(a,b) (((a)<(b)) ? (a) : (b))
|
||||
#define max(a, b) (((a) > (b)) ? (a) : (b))
|
||||
#define min(a, b) (((a) < (b)) ? (a) : (b))
|
||||
|
||||
static cl_int
|
||||
allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize)
|
||||
{
|
||||
cl_int err = CL_SUCCESS;
|
||||
if(plan->temp_buffer_needed && plan->last_batch_size != batchSize)
|
||||
{
|
||||
plan->last_batch_size = batchSize;
|
||||
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float);
|
||||
|
||||
if(plan->tempmemobj)
|
||||
clReleaseMemObject(plan->tempmemobj);
|
||||
|
||||
plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
|
||||
}
|
||||
return err;
|
||||
cl_int err = CL_SUCCESS;
|
||||
if (plan->temp_buffer_needed && plan->last_batch_size != batchSize)
|
||||
{
|
||||
plan->last_batch_size = batchSize;
|
||||
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float);
|
||||
|
||||
if (plan->tempmemobj)
|
||||
clReleaseMemObject(plan->tempmemobj);
|
||||
|
||||
plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static cl_int
|
||||
allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
|
||||
{
|
||||
cl_int err = CL_SUCCESS;
|
||||
cl_int terr;
|
||||
if(plan->temp_buffer_needed && plan->last_batch_size != batchSize)
|
||||
{
|
||||
plan->last_batch_size = batchSize;
|
||||
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float);
|
||||
|
||||
if(plan->tempmemobj_real)
|
||||
clReleaseMemObject(plan->tempmemobj_real);
|
||||
cl_int err = CL_SUCCESS;
|
||||
cl_int terr;
|
||||
if (plan->temp_buffer_needed && plan->last_batch_size != batchSize)
|
||||
{
|
||||
plan->last_batch_size = batchSize;
|
||||
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float);
|
||||
|
||||
if(plan->tempmemobj_imag)
|
||||
clReleaseMemObject(plan->tempmemobj_imag);
|
||||
|
||||
plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
|
||||
plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &terr);
|
||||
err |= terr;
|
||||
}
|
||||
return err;
|
||||
if (plan->tempmemobj_real)
|
||||
clReleaseMemObject(plan->tempmemobj_real);
|
||||
|
||||
if (plan->tempmemobj_imag)
|
||||
clReleaseMemObject(plan->tempmemobj_imag);
|
||||
|
||||
plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
|
||||
plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &terr);
|
||||
err |= terr;
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
void
|
||||
getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems)
|
||||
void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems)
|
||||
{
|
||||
*lWorkItems = kernelInfo->num_workitems_per_workgroup;
|
||||
int numWorkGroups = kernelInfo->num_workgroups;
|
||||
*lWorkItems = kernelInfo->num_workitems_per_workgroup;
|
||||
int numWorkGroups = kernelInfo->num_workgroups;
|
||||
int numXFormsPerWG = kernelInfo->num_xforms_per_workgroup;
|
||||
|
||||
switch(kernelInfo->dir)
|
||||
{
|
||||
case cl_fft_kernel_x:
|
||||
|
||||
switch (kernelInfo->dir)
|
||||
{
|
||||
case cl_fft_kernel_x:
|
||||
*batchSize *= (plan->n.y * plan->n.z);
|
||||
numWorkGroups = (*batchSize % numXFormsPerWG) ? (*batchSize/numXFormsPerWG + 1) : (*batchSize/numXFormsPerWG);
|
||||
numWorkGroups = (*batchSize % numXFormsPerWG) ? (*batchSize / numXFormsPerWG + 1) : (*batchSize / numXFormsPerWG);
|
||||
numWorkGroups *= kernelInfo->num_workgroups;
|
||||
break;
|
||||
case cl_fft_kernel_y:
|
||||
*batchSize *= plan->n.z;
|
||||
numWorkGroups *= *batchSize;
|
||||
break;
|
||||
case cl_fft_kernel_z:
|
||||
numWorkGroups *= *batchSize;
|
||||
break;
|
||||
}
|
||||
|
||||
*gWorkItems = numWorkGroups * *lWorkItems;
|
||||
break;
|
||||
case cl_fft_kernel_y:
|
||||
*batchSize *= plan->n.z;
|
||||
numWorkGroups *= *batchSize;
|
||||
break;
|
||||
case cl_fft_kernel_z:
|
||||
numWorkGroups *= *batchSize;
|
||||
break;
|
||||
}
|
||||
|
||||
*gWorkItems = numWorkGroups * *lWorkItems;
|
||||
}
|
||||
|
||||
cl_int
|
||||
clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
|
||||
cl_mem data_in, cl_mem data_out,
|
||||
cl_int num_events, cl_event *event_list, cl_event *event )
|
||||
{
|
||||
int s;
|
||||
cl_fft_plan *plan = (cl_fft_plan *) Plan;
|
||||
if(plan->format != clFFT_InterleavedComplexFormat)
|
||||
return CL_INVALID_VALUE;
|
||||
|
||||
cl_int err;
|
||||
size_t gWorkItems, lWorkItems;
|
||||
int inPlaceDone;
|
||||
|
||||
cl_int isInPlace = data_in == data_out ? 1 : 0;
|
||||
|
||||
if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS)
|
||||
return err;
|
||||
|
||||
cl_mem memObj[3];
|
||||
memObj[0] = data_in;
|
||||
memObj[1] = data_out;
|
||||
memObj[2] = plan->tempmemobj;
|
||||
cl_fft_kernel_info *kernelInfo = plan->kernel_info;
|
||||
int numKernels = plan->num_kernels;
|
||||
|
||||
int numKernelsOdd = numKernels & 1;
|
||||
int currRead = 0;
|
||||
int currWrite = 1;
|
||||
|
||||
// at least one external dram shuffle (transpose) required
|
||||
if(plan->temp_buffer_needed)
|
||||
{
|
||||
// in-place transform
|
||||
if(isInPlace)
|
||||
{
|
||||
inPlaceDone = 0;
|
||||
currRead = 1;
|
||||
currWrite = 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
currWrite = (numKernels & 1) ? 1 : 2;
|
||||
}
|
||||
|
||||
while(kernelInfo)
|
||||
{
|
||||
if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible)
|
||||
{
|
||||
currWrite = currRead;
|
||||
inPlaceDone = 1;
|
||||
}
|
||||
|
||||
s = batchSize;
|
||||
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
|
||||
|
||||
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
|
||||
if(err)
|
||||
return err;
|
||||
|
||||
currRead = (currWrite == 1) ? 1 : 2;
|
||||
currWrite = (currWrite == 1) ? 2 : 1;
|
||||
|
||||
kernelInfo = kernelInfo->next;
|
||||
}
|
||||
}
|
||||
// no dram shuffle (transpose required) transform
|
||||
// all kernels can execute in-place.
|
||||
else {
|
||||
|
||||
while(kernelInfo)
|
||||
{
|
||||
s = batchSize;
|
||||
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
|
||||
|
||||
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
|
||||
if(err)
|
||||
return err;
|
||||
|
||||
currRead = 1;
|
||||
currWrite = 1;
|
||||
|
||||
kernelInfo = kernelInfo->next;
|
||||
}
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
cl_int
|
||||
clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
|
||||
cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
|
||||
cl_int num_events, cl_event *event_list, cl_event *event)
|
||||
{
|
||||
int s;
|
||||
cl_fft_plan *plan = (cl_fft_plan *) Plan;
|
||||
|
||||
if(plan->format != clFFT_SplitComplexFormat)
|
||||
return CL_INVALID_VALUE;
|
||||
|
||||
cl_int err;
|
||||
size_t gWorkItems, lWorkItems;
|
||||
int inPlaceDone;
|
||||
|
||||
cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0;
|
||||
|
||||
if((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS)
|
||||
return err;
|
||||
|
||||
cl_mem memObj_real[3];
|
||||
cl_mem memObj_imag[3];
|
||||
memObj_real[0] = data_in_real;
|
||||
memObj_real[1] = data_out_real;
|
||||
memObj_real[2] = plan->tempmemobj_real;
|
||||
memObj_imag[0] = data_in_imag;
|
||||
memObj_imag[1] = data_out_imag;
|
||||
memObj_imag[2] = plan->tempmemobj_imag;
|
||||
|
||||
cl_fft_kernel_info *kernelInfo = plan->kernel_info;
|
||||
int numKernels = plan->num_kernels;
|
||||
|
||||
int numKernelsOdd = numKernels & 1;
|
||||
int currRead = 0;
|
||||
int currWrite = 1;
|
||||
|
||||
// at least one external dram shuffle (transpose) required
|
||||
if(plan->temp_buffer_needed)
|
||||
{
|
||||
// in-place transform
|
||||
if(isInPlace)
|
||||
{
|
||||
inPlaceDone = 0;
|
||||
currRead = 1;
|
||||
currWrite = 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
currWrite = (numKernels & 1) ? 1 : 2;
|
||||
}
|
||||
|
||||
while(kernelInfo)
|
||||
{
|
||||
if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible)
|
||||
{
|
||||
currWrite = currRead;
|
||||
inPlaceDone = 1;
|
||||
}
|
||||
|
||||
s = batchSize;
|
||||
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
|
||||
|
||||
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
|
||||
if(err)
|
||||
return err;
|
||||
|
||||
currRead = (currWrite == 1) ? 1 : 2;
|
||||
currWrite = (currWrite == 1) ? 2 : 1;
|
||||
|
||||
kernelInfo = kernelInfo->next;
|
||||
}
|
||||
}
|
||||
// no dram shuffle (transpose required) transform
|
||||
else {
|
||||
|
||||
while(kernelInfo)
|
||||
{
|
||||
s = batchSize;
|
||||
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
|
||||
|
||||
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
|
||||
if(err)
|
||||
return err;
|
||||
|
||||
currRead = 1;
|
||||
currWrite = 1;
|
||||
|
||||
kernelInfo = kernelInfo->next;
|
||||
}
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
cl_int
|
||||
clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
|
||||
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
|
||||
cl_int
|
||||
clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
|
||||
cl_mem data_in, cl_mem data_out,
|
||||
cl_int num_events, cl_event *event_list, cl_event *event)
|
||||
{
|
||||
cl_fft_plan *plan = (cl_fft_plan *) Plan;
|
||||
|
||||
unsigned int N = numRows*numCols;
|
||||
unsigned int nCols = numCols;
|
||||
unsigned int sRow = startRow;
|
||||
unsigned int rToProcess = rowsToProcess;
|
||||
int d = dir;
|
||||
int err = 0;
|
||||
|
||||
cl_device_id device_id;
|
||||
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
|
||||
if(err)
|
||||
return err;
|
||||
|
||||
size_t gSize;
|
||||
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
|
||||
if(err)
|
||||
return err;
|
||||
|
||||
gSize = min(128, gSize);
|
||||
size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize };
|
||||
size_t numLocalThreads[1] = { gSize };
|
||||
|
||||
err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d);
|
||||
|
||||
err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);
|
||||
|
||||
return err;
|
||||
int s;
|
||||
cl_fft_plan *plan = (cl_fft_plan *)Plan;
|
||||
if (plan->format != clFFT_InterleavedComplexFormat)
|
||||
return CL_INVALID_VALUE;
|
||||
|
||||
cl_int err;
|
||||
size_t gWorkItems, lWorkItems;
|
||||
int inPlaceDone;
|
||||
|
||||
cl_int isInPlace = data_in == data_out ? 1 : 0;
|
||||
|
||||
if ((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS)
|
||||
return err;
|
||||
|
||||
cl_mem memObj[3];
|
||||
memObj[0] = data_in;
|
||||
memObj[1] = data_out;
|
||||
memObj[2] = plan->tempmemobj;
|
||||
cl_fft_kernel_info *kernelInfo = plan->kernel_info;
|
||||
int numKernels = plan->num_kernels;
|
||||
|
||||
int numKernelsOdd = numKernels & 1;
|
||||
int currRead = 0;
|
||||
int currWrite = 1;
|
||||
|
||||
// at least one external dram shuffle (transpose) required
|
||||
if (plan->temp_buffer_needed)
|
||||
{
|
||||
// in-place transform
|
||||
if (isInPlace)
|
||||
{
|
||||
inPlaceDone = 0;
|
||||
currRead = 1;
|
||||
currWrite = 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
currWrite = (numKernels & 1) ? 1 : 2;
|
||||
}
|
||||
|
||||
while (kernelInfo)
|
||||
{
|
||||
if (isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible)
|
||||
{
|
||||
currWrite = currRead;
|
||||
inPlaceDone = 1;
|
||||
}
|
||||
|
||||
s = batchSize;
|
||||
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
|
||||
|
||||
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
currRead = (currWrite == 1) ? 1 : 2;
|
||||
currWrite = (currWrite == 1) ? 2 : 1;
|
||||
|
||||
kernelInfo = kernelInfo->next;
|
||||
}
|
||||
}
|
||||
// no dram shuffle (transpose required) transform
|
||||
// all kernels can execute in-place.
|
||||
else
|
||||
{
|
||||
while (kernelInfo)
|
||||
{
|
||||
s = batchSize;
|
||||
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
|
||||
|
||||
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
currRead = 1;
|
||||
currWrite = 1;
|
||||
|
||||
kernelInfo = kernelInfo->next;
|
||||
}
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
cl_int
|
||||
clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
|
||||
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
|
||||
cl_int
|
||||
clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
|
||||
cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
|
||||
cl_int num_events, cl_event *event_list, cl_event *event)
|
||||
{
|
||||
cl_fft_plan *plan = (cl_fft_plan *) Plan;
|
||||
|
||||
unsigned int N = numRows*numCols;
|
||||
unsigned int nCols = numCols;
|
||||
unsigned int sRow = startRow;
|
||||
unsigned int rToProcess = rowsToProcess;
|
||||
int d = dir;
|
||||
int err = 0;
|
||||
|
||||
cl_device_id device_id;
|
||||
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
|
||||
if(err)
|
||||
return err;
|
||||
|
||||
size_t gSize;
|
||||
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
|
||||
if(err)
|
||||
return err;
|
||||
|
||||
gSize = min(128, gSize);
|
||||
size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize };
|
||||
size_t numLocalThreads[1] = { gSize };
|
||||
|
||||
err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &nCols);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d);
|
||||
|
||||
err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);
|
||||
|
||||
return err;
|
||||
int s;
|
||||
cl_fft_plan *plan = (cl_fft_plan *)Plan;
|
||||
|
||||
if (plan->format != clFFT_SplitComplexFormat)
|
||||
return CL_INVALID_VALUE;
|
||||
|
||||
cl_int err;
|
||||
size_t gWorkItems, lWorkItems;
|
||||
int inPlaceDone;
|
||||
|
||||
cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0;
|
||||
|
||||
if ((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS)
|
||||
return err;
|
||||
|
||||
cl_mem memObj_real[3];
|
||||
cl_mem memObj_imag[3];
|
||||
memObj_real[0] = data_in_real;
|
||||
memObj_real[1] = data_out_real;
|
||||
memObj_real[2] = plan->tempmemobj_real;
|
||||
memObj_imag[0] = data_in_imag;
|
||||
memObj_imag[1] = data_out_imag;
|
||||
memObj_imag[2] = plan->tempmemobj_imag;
|
||||
|
||||
cl_fft_kernel_info *kernelInfo = plan->kernel_info;
|
||||
int numKernels = plan->num_kernels;
|
||||
|
||||
int numKernelsOdd = numKernels & 1;
|
||||
int currRead = 0;
|
||||
int currWrite = 1;
|
||||
|
||||
// at least one external dram shuffle (transpose) required
|
||||
if (plan->temp_buffer_needed)
|
||||
{
|
||||
// in-place transform
|
||||
if (isInPlace)
|
||||
{
|
||||
inPlaceDone = 0;
|
||||
currRead = 1;
|
||||
currWrite = 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
currWrite = (numKernels & 1) ? 1 : 2;
|
||||
}
|
||||
|
||||
while (kernelInfo)
|
||||
{
|
||||
if (isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible)
|
||||
{
|
||||
currWrite = currRead;
|
||||
inPlaceDone = 1;
|
||||
}
|
||||
|
||||
s = batchSize;
|
||||
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
|
||||
|
||||
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
currRead = (currWrite == 1) ? 1 : 2;
|
||||
currWrite = (currWrite == 1) ? 2 : 1;
|
||||
|
||||
kernelInfo = kernelInfo->next;
|
||||
}
|
||||
}
|
||||
// no dram shuffle (transpose required) transform
|
||||
else
|
||||
{
|
||||
while (kernelInfo)
|
||||
{
|
||||
s = batchSize;
|
||||
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
|
||||
err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
|
||||
|
||||
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
currRead = 1;
|
||||
currWrite = 1;
|
||||
|
||||
kernelInfo = kernelInfo->next;
|
||||
}
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
cl_int
|
||||
clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
|
||||
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
|
||||
{
|
||||
cl_fft_plan *plan = (cl_fft_plan *)Plan;
|
||||
|
||||
unsigned int N = numRows * numCols;
|
||||
unsigned int nCols = numCols;
|
||||
unsigned int sRow = startRow;
|
||||
unsigned int rToProcess = rowsToProcess;
|
||||
int d = dir;
|
||||
int err = 0;
|
||||
|
||||
cl_device_id device_id;
|
||||
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
size_t gSize;
|
||||
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
gSize = min(128, gSize);
|
||||
size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize};
|
||||
size_t numLocalThreads[1] = {gSize};
|
||||
|
||||
err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d);
|
||||
|
||||
err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
cl_int
|
||||
clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
|
||||
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
|
||||
{
|
||||
cl_fft_plan *plan = (cl_fft_plan *)Plan;
|
||||
|
||||
unsigned int N = numRows * numCols;
|
||||
unsigned int nCols = numCols;
|
||||
unsigned int sRow = startRow;
|
||||
unsigned int rToProcess = rowsToProcess;
|
||||
int d = dir;
|
||||
int err = 0;
|
||||
|
||||
cl_device_id device_id;
|
||||
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
size_t gSize;
|
||||
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
gSize = min(128, gSize);
|
||||
size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize};
|
||||
size_t numLocalThreads[1] = {gSize};
|
||||
|
||||
err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &nCols);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess);
|
||||
err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d);
|
||||
|
||||
err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
@@ -58,106 +58,106 @@ using namespace std;
|
||||
|
||||
typedef enum kernel_dir_t
|
||||
{
|
||||
cl_fft_kernel_x,
|
||||
cl_fft_kernel_y,
|
||||
cl_fft_kernel_z
|
||||
}cl_fft_kernel_dir;
|
||||
cl_fft_kernel_x,
|
||||
cl_fft_kernel_y,
|
||||
cl_fft_kernel_z
|
||||
} cl_fft_kernel_dir;
|
||||
|
||||
typedef struct kernel_info_t
|
||||
{
|
||||
cl_kernel kernel;
|
||||
char *kernel_name;
|
||||
unsigned lmem_size;
|
||||
unsigned num_workgroups;
|
||||
cl_kernel kernel;
|
||||
char *kernel_name;
|
||||
unsigned lmem_size;
|
||||
unsigned num_workgroups;
|
||||
unsigned num_xforms_per_workgroup;
|
||||
unsigned num_workitems_per_workgroup;
|
||||
cl_fft_kernel_dir dir;
|
||||
int in_place_possible;
|
||||
kernel_info_t *next;
|
||||
}cl_fft_kernel_info;
|
||||
unsigned num_workitems_per_workgroup;
|
||||
cl_fft_kernel_dir dir;
|
||||
int in_place_possible;
|
||||
kernel_info_t *next;
|
||||
} cl_fft_kernel_info;
|
||||
|
||||
typedef struct
|
||||
typedef struct
|
||||
{
|
||||
// context in which fft resources are created and kernels are executed
|
||||
cl_context context;
|
||||
|
||||
// size of signal
|
||||
clFFT_Dim3 n;
|
||||
|
||||
// dimension of transform ... must be either 1D, 2D or 3D
|
||||
clFFT_Dimension dim;
|
||||
|
||||
// data format ... must be either interleaved or plannar
|
||||
clFFT_DataFormat format;
|
||||
|
||||
// string containing kernel source. Generated at runtime based on
|
||||
// n, dim, format and other parameters
|
||||
string *kernel_string;
|
||||
|
||||
// CL program containing source and kernel this particular
|
||||
// n, dim, data format
|
||||
cl_program program;
|
||||
|
||||
// linked list of kernels which needs to be executed for this fft
|
||||
cl_fft_kernel_info *kernel_info;
|
||||
|
||||
// number of kernels
|
||||
int num_kernels;
|
||||
|
||||
// twist kernel for virtualizing fft of very large sizes that do not
|
||||
// fit in GPU global memory
|
||||
cl_kernel twist_kernel;
|
||||
|
||||
// flag indicating if temporary intermediate buffer is needed or not.
|
||||
// this depends on fft kernels being executed and if transform is
|
||||
// in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ...
|
||||
// one that does not require global transpose do not need temporary buffer)
|
||||
// 2D 1024x1024 out-of-place fft however do require intermediate buffer.
|
||||
// If temp buffer is needed, its allocation is lazy i.e. its not allocated
|
||||
// until its needed
|
||||
cl_int temp_buffer_needed;
|
||||
|
||||
// Batch size is runtime parameter and size of temporary buffer (if needed)
|
||||
// depends on batch size. Allocation of temporary buffer is lazy i.e. its
|
||||
// only created when needed. Once its created at first call of clFFT_Executexxx
|
||||
// it is not allocated next time if next time clFFT_Executexxx is called with
|
||||
// batch size different than the first call. last_batch_size caches the last
|
||||
// batch size with which this plan is used so that we dont keep allocating/deallocating
|
||||
// temp buffer if same batch size is used again and again.
|
||||
unsigned last_batch_size;
|
||||
|
||||
// temporary buffer for interleaved plan
|
||||
cl_mem tempmemobj;
|
||||
|
||||
// temporary buffer for planner plan. Only one of tempmemobj or
|
||||
// (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending
|
||||
// data format of plan (plannar or interleaved)
|
||||
cl_mem tempmemobj_real, tempmemobj_imag;
|
||||
|
||||
// Maximum size of signal for which local memory transposed based
|
||||
// fft is sufficient i.e. no global mem transpose (communication)
|
||||
// is needed
|
||||
unsigned max_localmem_fft_size;
|
||||
|
||||
// Maximum work items per work group allowed. This, along with max_radix below controls
|
||||
// maximum local memory being used by fft kernels of this plan. Set to 256 by default
|
||||
unsigned max_work_item_per_workgroup;
|
||||
|
||||
// Maximum base radix for local memory fft ... this controls the maximum register
|
||||
// space used by work items. Currently defaults to 16
|
||||
unsigned max_radix;
|
||||
|
||||
// Device depended parameter that tells how many work-items need to be read consecutive
|
||||
// values to make sure global memory access by work-items of a work-group result in
|
||||
// coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
|
||||
unsigned min_mem_coalesce_width;
|
||||
|
||||
// Number of local memory banks. This is used to geneate kernel with local memory
|
||||
// transposes with appropriate padding to avoid bank conflicts to local memory
|
||||
// e.g. on NVidia it is 16.
|
||||
unsigned num_local_mem_banks;
|
||||
}cl_fft_plan;
|
||||
// context in which fft resources are created and kernels are executed
|
||||
cl_context context;
|
||||
|
||||
// size of signal
|
||||
clFFT_Dim3 n;
|
||||
|
||||
// dimension of transform ... must be either 1D, 2D or 3D
|
||||
clFFT_Dimension dim;
|
||||
|
||||
// data format ... must be either interleaved or plannar
|
||||
clFFT_DataFormat format;
|
||||
|
||||
// string containing kernel source. Generated at runtime based on
|
||||
// n, dim, format and other parameters
|
||||
string *kernel_string;
|
||||
|
||||
// CL program containing source and kernel this particular
|
||||
// n, dim, data format
|
||||
cl_program program;
|
||||
|
||||
// linked list of kernels which needs to be executed for this fft
|
||||
cl_fft_kernel_info *kernel_info;
|
||||
|
||||
// number of kernels
|
||||
int num_kernels;
|
||||
|
||||
// twist kernel for virtualizing fft of very large sizes that do not
|
||||
// fit in GPU global memory
|
||||
cl_kernel twist_kernel;
|
||||
|
||||
// flag indicating if temporary intermediate buffer is needed or not.
|
||||
// this depends on fft kernels being executed and if transform is
|
||||
// in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ...
|
||||
// one that does not require global transpose do not need temporary buffer)
|
||||
// 2D 1024x1024 out-of-place fft however do require intermediate buffer.
|
||||
// If temp buffer is needed, its allocation is lazy i.e. its not allocated
|
||||
// until its needed
|
||||
cl_int temp_buffer_needed;
|
||||
|
||||
// Batch size is runtime parameter and size of temporary buffer (if needed)
|
||||
// depends on batch size. Allocation of temporary buffer is lazy i.e. its
|
||||
// only created when needed. Once its created at first call of clFFT_Executexxx
|
||||
// it is not allocated next time if next time clFFT_Executexxx is called with
|
||||
// batch size different than the first call. last_batch_size caches the last
|
||||
// batch size with which this plan is used so that we dont keep allocating/deallocating
|
||||
// temp buffer if same batch size is used again and again.
|
||||
unsigned last_batch_size;
|
||||
|
||||
// temporary buffer for interleaved plan
|
||||
cl_mem tempmemobj;
|
||||
|
||||
// temporary buffer for planner plan. Only one of tempmemobj or
|
||||
// (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending
|
||||
// data format of plan (plannar or interleaved)
|
||||
cl_mem tempmemobj_real, tempmemobj_imag;
|
||||
|
||||
// Maximum size of signal for which local memory transposed based
|
||||
// fft is sufficient i.e. no global mem transpose (communication)
|
||||
// is needed
|
||||
unsigned max_localmem_fft_size;
|
||||
|
||||
// Maximum work items per work group allowed. This, along with max_radix below controls
|
||||
// maximum local memory being used by fft kernels of this plan. Set to 256 by default
|
||||
unsigned max_work_item_per_workgroup;
|
||||
|
||||
// Maximum base radix for local memory fft ... this controls the maximum register
|
||||
// space used by work items. Currently defaults to 16
|
||||
unsigned max_radix;
|
||||
|
||||
// Device depended parameter that tells how many work-items need to be read consecutive
|
||||
// values to make sure global memory access by work-items of a work-group result in
|
||||
// coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
|
||||
unsigned min_mem_coalesce_width;
|
||||
|
||||
// Number of local memory banks. This is used to geneate kernel with local memory
|
||||
// transposes with appropriate padding to avoid bank conflicts to local memory
|
||||
// e.g. on NVidia it is 16.
|
||||
unsigned num_local_mem_banks;
|
||||
} cl_fft_plan;
|
||||
|
||||
void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir);
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -61,59 +61,59 @@ using namespace std;
|
||||
|
||||
extern void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems);
|
||||
|
||||
static void
|
||||
static void
|
||||
getBlockConfigAndKernelString(cl_fft_plan *plan)
|
||||
{
|
||||
plan->temp_buffer_needed = 0;
|
||||
*plan->kernel_string += baseKernels;
|
||||
|
||||
if(plan->format == clFFT_SplitComplexFormat)
|
||||
*plan->kernel_string += twistKernelPlannar;
|
||||
else
|
||||
*plan->kernel_string += twistKernelInterleaved;
|
||||
|
||||
switch(plan->dim)
|
||||
{
|
||||
case clFFT_1D:
|
||||
FFT1D(plan, cl_fft_kernel_x);
|
||||
break;
|
||||
|
||||
case clFFT_2D:
|
||||
FFT1D(plan, cl_fft_kernel_x);
|
||||
FFT1D(plan, cl_fft_kernel_y);
|
||||
break;
|
||||
|
||||
case clFFT_3D:
|
||||
FFT1D(plan, cl_fft_kernel_x);
|
||||
FFT1D(plan, cl_fft_kernel_y);
|
||||
FFT1D(plan, cl_fft_kernel_z);
|
||||
break;
|
||||
|
||||
default:
|
||||
return;
|
||||
}
|
||||
|
||||
plan->temp_buffer_needed = 0;
|
||||
cl_fft_kernel_info *kInfo = plan->kernel_info;
|
||||
while(kInfo)
|
||||
{
|
||||
plan->temp_buffer_needed |= !kInfo->in_place_possible;
|
||||
kInfo = kInfo->next;
|
||||
}
|
||||
plan->temp_buffer_needed = 0;
|
||||
*plan->kernel_string += baseKernels;
|
||||
|
||||
if (plan->format == clFFT_SplitComplexFormat)
|
||||
*plan->kernel_string += twistKernelPlannar;
|
||||
else
|
||||
*plan->kernel_string += twistKernelInterleaved;
|
||||
|
||||
switch (plan->dim)
|
||||
{
|
||||
case clFFT_1D:
|
||||
FFT1D(plan, cl_fft_kernel_x);
|
||||
break;
|
||||
|
||||
case clFFT_2D:
|
||||
FFT1D(plan, cl_fft_kernel_x);
|
||||
FFT1D(plan, cl_fft_kernel_y);
|
||||
break;
|
||||
|
||||
case clFFT_3D:
|
||||
FFT1D(plan, cl_fft_kernel_x);
|
||||
FFT1D(plan, cl_fft_kernel_y);
|
||||
FFT1D(plan, cl_fft_kernel_z);
|
||||
break;
|
||||
|
||||
default:
|
||||
return;
|
||||
}
|
||||
|
||||
plan->temp_buffer_needed = 0;
|
||||
cl_fft_kernel_info *kInfo = plan->kernel_info;
|
||||
while (kInfo)
|
||||
{
|
||||
plan->temp_buffer_needed |= !kInfo->in_place_possible;
|
||||
kInfo = kInfo->next;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void
|
||||
deleteKernelInfo(cl_fft_kernel_info *kInfo)
|
||||
{
|
||||
if(kInfo)
|
||||
{
|
||||
if(kInfo->kernel_name)
|
||||
free(kInfo->kernel_name);
|
||||
if(kInfo->kernel)
|
||||
clReleaseKernel(kInfo->kernel);
|
||||
free(kInfo);
|
||||
}
|
||||
if (kInfo)
|
||||
{
|
||||
if (kInfo->kernel_name)
|
||||
free(kInfo->kernel_name);
|
||||
if (kInfo->kernel)
|
||||
clReleaseKernel(kInfo->kernel);
|
||||
free(kInfo);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -121,282 +121,282 @@ destroy_plan(cl_fft_plan *Plan)
|
||||
{
|
||||
cl_fft_kernel_info *kernel_info = Plan->kernel_info;
|
||||
|
||||
while(kernel_info)
|
||||
{
|
||||
cl_fft_kernel_info *tmp = kernel_info->next;
|
||||
deleteKernelInfo(kernel_info);
|
||||
kernel_info = tmp;
|
||||
}
|
||||
|
||||
Plan->kernel_info = NULL;
|
||||
|
||||
if(Plan->kernel_string)
|
||||
{
|
||||
delete Plan->kernel_string;
|
||||
Plan->kernel_string = NULL;
|
||||
}
|
||||
if(Plan->twist_kernel)
|
||||
{
|
||||
clReleaseKernel(Plan->twist_kernel);
|
||||
Plan->twist_kernel = NULL;
|
||||
}
|
||||
if(Plan->program)
|
||||
{
|
||||
clReleaseProgram(Plan->program);
|
||||
Plan->program = NULL;
|
||||
}
|
||||
if(Plan->tempmemobj)
|
||||
{
|
||||
clReleaseMemObject(Plan->tempmemobj);
|
||||
Plan->tempmemobj = NULL;
|
||||
}
|
||||
if(Plan->tempmemobj_real)
|
||||
{
|
||||
clReleaseMemObject(Plan->tempmemobj_real);
|
||||
Plan->tempmemobj_real = NULL;
|
||||
}
|
||||
if(Plan->tempmemobj_imag)
|
||||
{
|
||||
clReleaseMemObject(Plan->tempmemobj_imag);
|
||||
Plan->tempmemobj_imag = NULL;
|
||||
}
|
||||
while (kernel_info)
|
||||
{
|
||||
cl_fft_kernel_info *tmp = kernel_info->next;
|
||||
deleteKernelInfo(kernel_info);
|
||||
kernel_info = tmp;
|
||||
}
|
||||
|
||||
Plan->kernel_info = NULL;
|
||||
|
||||
if (Plan->kernel_string)
|
||||
{
|
||||
delete Plan->kernel_string;
|
||||
Plan->kernel_string = NULL;
|
||||
}
|
||||
if (Plan->twist_kernel)
|
||||
{
|
||||
clReleaseKernel(Plan->twist_kernel);
|
||||
Plan->twist_kernel = NULL;
|
||||
}
|
||||
if (Plan->program)
|
||||
{
|
||||
clReleaseProgram(Plan->program);
|
||||
Plan->program = NULL;
|
||||
}
|
||||
if (Plan->tempmemobj)
|
||||
{
|
||||
clReleaseMemObject(Plan->tempmemobj);
|
||||
Plan->tempmemobj = NULL;
|
||||
}
|
||||
if (Plan->tempmemobj_real)
|
||||
{
|
||||
clReleaseMemObject(Plan->tempmemobj_real);
|
||||
Plan->tempmemobj_real = NULL;
|
||||
}
|
||||
if (Plan->tempmemobj_imag)
|
||||
{
|
||||
clReleaseMemObject(Plan->tempmemobj_imag);
|
||||
Plan->tempmemobj_imag = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
createKernelList(cl_fft_plan *plan)
|
||||
createKernelList(cl_fft_plan *plan)
|
||||
{
|
||||
cl_program program = plan->program;
|
||||
cl_fft_kernel_info *kernel_info = plan->kernel_info;
|
||||
|
||||
cl_int err;
|
||||
while(kernel_info)
|
||||
{
|
||||
kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err);
|
||||
if(!kernel_info->kernel || err != CL_SUCCESS)
|
||||
return err;
|
||||
kernel_info = kernel_info->next;
|
||||
}
|
||||
|
||||
if(plan->format == clFFT_SplitComplexFormat)
|
||||
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err);
|
||||
else
|
||||
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err);
|
||||
|
||||
if(!plan->twist_kernel || err)
|
||||
return err;
|
||||
cl_program program = plan->program;
|
||||
cl_fft_kernel_info *kernel_info = plan->kernel_info;
|
||||
|
||||
return CL_SUCCESS;
|
||||
cl_int err;
|
||||
while (kernel_info)
|
||||
{
|
||||
kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err);
|
||||
if (!kernel_info->kernel || err != CL_SUCCESS)
|
||||
return err;
|
||||
kernel_info = kernel_info->next;
|
||||
}
|
||||
|
||||
if (plan->format == clFFT_SplitComplexFormat)
|
||||
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err);
|
||||
else
|
||||
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err);
|
||||
|
||||
if (!plan->twist_kernel || err)
|
||||
return err;
|
||||
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices)
|
||||
{
|
||||
{
|
||||
int reg_needed = 0;
|
||||
*max_wg_size = std::numeric_limits<int>::max();
|
||||
int err;
|
||||
unsigned wg_size;
|
||||
|
||||
unsigned int i;
|
||||
for(i = 0; i < num_devices; i++)
|
||||
{
|
||||
cl_fft_kernel_info *kInfo = plan->kernel_info;
|
||||
while(kInfo)
|
||||
{
|
||||
err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
|
||||
if(err != CL_SUCCESS)
|
||||
return -1;
|
||||
|
||||
if(wg_size < kInfo->num_workitems_per_workgroup)
|
||||
reg_needed |= 1;
|
||||
|
||||
if(*max_wg_size > wg_size)
|
||||
*max_wg_size = wg_size;
|
||||
|
||||
kInfo = kInfo->next;
|
||||
}
|
||||
}
|
||||
|
||||
return reg_needed;
|
||||
}
|
||||
|
||||
#define ERR_MACRO(err) { \
|
||||
if( err != CL_SUCCESS) \
|
||||
{ \
|
||||
if(error_code) \
|
||||
*error_code = err; \
|
||||
clFFT_DestroyPlan((clFFT_Plan) plan); \
|
||||
return (clFFT_Plan) NULL; \
|
||||
} \
|
||||
}
|
||||
unsigned int i;
|
||||
for (i = 0; i < num_devices; i++)
|
||||
{
|
||||
cl_fft_kernel_info *kInfo = plan->kernel_info;
|
||||
while (kInfo)
|
||||
{
|
||||
err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
|
||||
if (err != CL_SUCCESS)
|
||||
return -1;
|
||||
|
||||
if (wg_size < kInfo->num_workitems_per_workgroup)
|
||||
reg_needed |= 1;
|
||||
|
||||
if (*max_wg_size > wg_size)
|
||||
*max_wg_size = wg_size;
|
||||
|
||||
kInfo = kInfo->next;
|
||||
}
|
||||
}
|
||||
|
||||
return reg_needed;
|
||||
}
|
||||
|
||||
#define ERR_MACRO(err) \
|
||||
{ \
|
||||
if (err != CL_SUCCESS) \
|
||||
{ \
|
||||
if (error_code) \
|
||||
*error_code = err; \
|
||||
clFFT_DestroyPlan((clFFT_Plan)plan); \
|
||||
return (clFFT_Plan)NULL; \
|
||||
} \
|
||||
}
|
||||
|
||||
clFFT_Plan
|
||||
clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code )
|
||||
clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code)
|
||||
{
|
||||
int i;
|
||||
cl_int err;
|
||||
int isPow2 = 1;
|
||||
cl_fft_plan *plan = NULL;
|
||||
ostringstream kString;
|
||||
int num_devices;
|
||||
int gpu_found = 0;
|
||||
cl_device_id devices[16];
|
||||
size_t ret_size;
|
||||
cl_device_type device_type;
|
||||
|
||||
if(!context)
|
||||
ERR_MACRO(CL_INVALID_VALUE);
|
||||
|
||||
isPow2 |= n.x && !( (n.x - 1) & n.x );
|
||||
isPow2 |= n.y && !( (n.y - 1) & n.y );
|
||||
isPow2 |= n.z && !( (n.z - 1) & n.z );
|
||||
int i;
|
||||
cl_int err;
|
||||
int isPow2 = 1;
|
||||
cl_fft_plan *plan = NULL;
|
||||
ostringstream kString;
|
||||
int num_devices;
|
||||
int gpu_found = 0;
|
||||
cl_device_id devices[16];
|
||||
size_t ret_size;
|
||||
cl_device_type device_type;
|
||||
|
||||
if(!isPow2)
|
||||
ERR_MACRO(CL_INVALID_VALUE);
|
||||
|
||||
if( (dim == clFFT_1D && (n.y != 1 || n.z != 1)) || (dim == clFFT_2D && n.z != 1) )
|
||||
ERR_MACRO(CL_INVALID_VALUE);
|
||||
if (!context)
|
||||
ERR_MACRO(CL_INVALID_VALUE);
|
||||
|
||||
plan = (cl_fft_plan *) malloc(sizeof(cl_fft_plan));
|
||||
if(!plan)
|
||||
ERR_MACRO(CL_OUT_OF_RESOURCES);
|
||||
|
||||
plan->context = context;
|
||||
clRetainContext(context);
|
||||
plan->n = n;
|
||||
plan->dim = dim;
|
||||
plan->format = dataFormat;
|
||||
plan->kernel_info = 0;
|
||||
plan->num_kernels = 0;
|
||||
plan->twist_kernel = 0;
|
||||
plan->program = 0;
|
||||
plan->temp_buffer_needed = 0;
|
||||
plan->last_batch_size = 0;
|
||||
plan->tempmemobj = 0;
|
||||
plan->tempmemobj_real = 0;
|
||||
plan->tempmemobj_imag = 0;
|
||||
plan->max_localmem_fft_size = 2048;
|
||||
plan->max_work_item_per_workgroup = 256;
|
||||
plan->max_radix = 16;
|
||||
plan->min_mem_coalesce_width = 16;
|
||||
plan->num_local_mem_banks = 16;
|
||||
|
||||
patch_kernel_source:
|
||||
isPow2 |= n.x && !((n.x - 1) & n.x);
|
||||
isPow2 |= n.y && !((n.y - 1) & n.y);
|
||||
isPow2 |= n.z && !((n.z - 1) & n.z);
|
||||
|
||||
plan->kernel_string = new string("");
|
||||
if(!plan->kernel_string)
|
||||
if (!isPow2)
|
||||
ERR_MACRO(CL_INVALID_VALUE);
|
||||
|
||||
if ((dim == clFFT_1D && (n.y != 1 || n.z != 1)) || (dim == clFFT_2D && n.z != 1))
|
||||
ERR_MACRO(CL_INVALID_VALUE);
|
||||
|
||||
plan = (cl_fft_plan *)malloc(sizeof(cl_fft_plan));
|
||||
if (!plan)
|
||||
ERR_MACRO(CL_OUT_OF_RESOURCES);
|
||||
|
||||
getBlockConfigAndKernelString(plan);
|
||||
|
||||
const char *source_str = plan->kernel_string->c_str();
|
||||
plan->program = clCreateProgramWithSource(context, 1, (const char**) &source_str, NULL, &err);
|
||||
plan->context = context;
|
||||
clRetainContext(context);
|
||||
plan->n = n;
|
||||
plan->dim = dim;
|
||||
plan->format = dataFormat;
|
||||
plan->kernel_info = 0;
|
||||
plan->num_kernels = 0;
|
||||
plan->twist_kernel = 0;
|
||||
plan->program = 0;
|
||||
plan->temp_buffer_needed = 0;
|
||||
plan->last_batch_size = 0;
|
||||
plan->tempmemobj = 0;
|
||||
plan->tempmemobj_real = 0;
|
||||
plan->tempmemobj_imag = 0;
|
||||
plan->max_localmem_fft_size = 2048;
|
||||
plan->max_work_item_per_workgroup = 256;
|
||||
plan->max_radix = 16;
|
||||
plan->min_mem_coalesce_width = 16;
|
||||
plan->num_local_mem_banks = 16;
|
||||
|
||||
patch_kernel_source:
|
||||
|
||||
plan->kernel_string = new string("");
|
||||
if (!plan->kernel_string)
|
||||
ERR_MACRO(CL_OUT_OF_RESOURCES);
|
||||
|
||||
getBlockConfigAndKernelString(plan);
|
||||
|
||||
const char *source_str = plan->kernel_string->c_str();
|
||||
plan->program = clCreateProgramWithSource(context, 1, (const char **)&source_str, NULL, &err);
|
||||
ERR_MACRO(err);
|
||||
|
||||
err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size);
|
||||
ERR_MACRO(err);
|
||||
|
||||
num_devices = (int)(ret_size / sizeof(cl_device_id));
|
||||
|
||||
for(i = 0; i < num_devices; i++)
|
||||
{
|
||||
err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL);
|
||||
ERR_MACRO(err);
|
||||
|
||||
if(device_type == CL_DEVICE_TYPE_GPU)
|
||||
{
|
||||
gpu_found = 1;
|
||||
err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable", NULL, NULL);
|
||||
if (err != CL_SUCCESS)
|
||||
{
|
||||
char *build_log;
|
||||
char devicename[200];
|
||||
size_t log_size;
|
||||
|
||||
err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
|
||||
ERR_MACRO(err);
|
||||
|
||||
build_log = (char *) malloc(log_size + 1);
|
||||
|
||||
err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
|
||||
ERR_MACRO(err);
|
||||
|
||||
err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(devicename), devicename, NULL);
|
||||
ERR_MACRO(err);
|
||||
|
||||
fprintf(stdout, "FFT program build log on device %s\n", devicename);
|
||||
fprintf(stdout, "%s\n", build_log);
|
||||
free(build_log);
|
||||
|
||||
ERR_MACRO(err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(!gpu_found)
|
||||
ERR_MACRO(CL_INVALID_CONTEXT);
|
||||
|
||||
err = createKernelList(plan);
|
||||
err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size);
|
||||
ERR_MACRO(err);
|
||||
|
||||
|
||||
num_devices = (int)(ret_size / sizeof(cl_device_id));
|
||||
|
||||
for (i = 0; i < num_devices; i++)
|
||||
{
|
||||
err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL);
|
||||
ERR_MACRO(err);
|
||||
|
||||
if (device_type == CL_DEVICE_TYPE_GPU)
|
||||
{
|
||||
gpu_found = 1;
|
||||
err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable", NULL, NULL);
|
||||
if (err != CL_SUCCESS)
|
||||
{
|
||||
char *build_log;
|
||||
char devicename[200];
|
||||
size_t log_size;
|
||||
|
||||
err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
|
||||
ERR_MACRO(err);
|
||||
|
||||
build_log = (char *)malloc(log_size + 1);
|
||||
|
||||
err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
|
||||
ERR_MACRO(err);
|
||||
|
||||
err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(devicename), devicename, NULL);
|
||||
ERR_MACRO(err);
|
||||
|
||||
fprintf(stdout, "FFT program build log on device %s\n", devicename);
|
||||
fprintf(stdout, "%s\n", build_log);
|
||||
free(build_log);
|
||||
|
||||
ERR_MACRO(err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!gpu_found)
|
||||
ERR_MACRO(CL_INVALID_CONTEXT);
|
||||
|
||||
err = createKernelList(plan);
|
||||
ERR_MACRO(err);
|
||||
|
||||
// we created program and kernels based on "some max work group size (default 256)" ... this work group size
|
||||
// may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source
|
||||
// setting this as limit i.e max group size and rebuild.
|
||||
unsigned int max_kernel_wg_size;
|
||||
int patching_req = getMaxKernelWorkGroupSize(plan, &max_kernel_wg_size, num_devices, devices);
|
||||
if(patching_req == -1)
|
||||
{
|
||||
ERR_MACRO(err);
|
||||
}
|
||||
|
||||
if(patching_req)
|
||||
{
|
||||
destroy_plan(plan);
|
||||
plan->max_work_item_per_workgroup = max_kernel_wg_size;
|
||||
goto patch_kernel_source;
|
||||
}
|
||||
|
||||
cl_fft_kernel_info *kInfo = plan->kernel_info;
|
||||
while(kInfo)
|
||||
{
|
||||
plan->num_kernels++;
|
||||
kInfo = kInfo->next;
|
||||
}
|
||||
|
||||
if(error_code)
|
||||
*error_code = CL_SUCCESS;
|
||||
|
||||
return (clFFT_Plan) plan;
|
||||
// may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source
|
||||
// setting this as limit i.e max group size and rebuild.
|
||||
unsigned int max_kernel_wg_size;
|
||||
int patching_req = getMaxKernelWorkGroupSize(plan, &max_kernel_wg_size, num_devices, devices);
|
||||
if (patching_req == -1)
|
||||
{
|
||||
ERR_MACRO(err);
|
||||
}
|
||||
|
||||
if (patching_req)
|
||||
{
|
||||
destroy_plan(plan);
|
||||
plan->max_work_item_per_workgroup = max_kernel_wg_size;
|
||||
goto patch_kernel_source;
|
||||
}
|
||||
|
||||
cl_fft_kernel_info *kInfo = plan->kernel_info;
|
||||
while (kInfo)
|
||||
{
|
||||
plan->num_kernels++;
|
||||
kInfo = kInfo->next;
|
||||
}
|
||||
|
||||
if (error_code)
|
||||
*error_code = CL_SUCCESS;
|
||||
|
||||
return (clFFT_Plan)plan;
|
||||
}
|
||||
|
||||
void
|
||||
clFFT_DestroyPlan(clFFT_Plan plan)
|
||||
void clFFT_DestroyPlan(clFFT_Plan plan)
|
||||
{
|
||||
cl_fft_plan *Plan = (cl_fft_plan *) plan;
|
||||
if(Plan)
|
||||
{
|
||||
destroy_plan(Plan);
|
||||
clReleaseContext(Plan->context);
|
||||
free(Plan);
|
||||
}
|
||||
cl_fft_plan *Plan = (cl_fft_plan *)plan;
|
||||
if (Plan)
|
||||
{
|
||||
destroy_plan(Plan);
|
||||
clReleaseContext(Plan->context);
|
||||
free(Plan);
|
||||
}
|
||||
}
|
||||
|
||||
void clFFT_DumpPlan( clFFT_Plan Plan, FILE *file)
|
||||
void clFFT_DumpPlan(clFFT_Plan Plan, FILE *file)
|
||||
{
|
||||
size_t gDim, lDim;
|
||||
FILE *out;
|
||||
if(!file)
|
||||
out = stdout;
|
||||
else
|
||||
out = file;
|
||||
|
||||
cl_fft_plan *plan = (cl_fft_plan *) Plan;
|
||||
cl_fft_kernel_info *kInfo = plan->kernel_info;
|
||||
|
||||
while(kInfo)
|
||||
{
|
||||
cl_int s = 1;
|
||||
getKernelWorkDimensions(plan, kInfo, &s, &gDim, &lDim);
|
||||
fprintf(out, "Run kernel %s with global dim = {%zd*BatchSize}, local dim={%zd}\n", kInfo->kernel_name, gDim, lDim);
|
||||
kInfo = kInfo->next;
|
||||
}
|
||||
fprintf(out, "%s\n", plan->kernel_string->c_str());
|
||||
size_t gDim, lDim;
|
||||
FILE *out;
|
||||
if (!file)
|
||||
out = stdout;
|
||||
else
|
||||
out = file;
|
||||
|
||||
cl_fft_plan *plan = (cl_fft_plan *)Plan;
|
||||
cl_fft_kernel_info *kInfo = plan->kernel_info;
|
||||
|
||||
while (kInfo)
|
||||
{
|
||||
cl_int s = 1;
|
||||
getKernelWorkDimensions(plan, kInfo, &s, &gDim, &lDim);
|
||||
fprintf(out, "Run kernel %s with global dim = {%zd*BatchSize}, local dim={%zd}\n", kInfo->kernel_name, gDim, lDim);
|
||||
kInfo = kInfo->next;
|
||||
}
|
||||
fprintf(out, "%s\n", plan->kernel_string->c_str());
|
||||
}
|
||||
|
||||
@@ -253,11 +253,12 @@ const unsigned int tbl_CRC24Q[] = {
|
||||
0x42FA2F, 0xC4B6D4, 0xC82F22, 0x4E63D9, 0xD11CCE, 0x575035, 0x5BC9C3, 0xDD8538};
|
||||
|
||||
|
||||
extern "C" {
|
||||
void dgemm_(char *, char *, int *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *);
|
||||
extern void dgetrf_(int *, int *, double *, int *, int *, int *);
|
||||
extern void dgetri_(int *, double *, int *, int *, double *, int *, int *);
|
||||
extern void dgetrs_(char *, int *, int *, double *, int *, int *, double *, int *, int *);
|
||||
extern "C"
|
||||
{
|
||||
void dgemm_(char *, char *, int *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *);
|
||||
extern void dgetrf_(int *, int *, double *, int *, int *, int *);
|
||||
extern void dgetri_(int *, double *, int *, int *, double *, int *, int *);
|
||||
extern void dgetrs_(char *, int *, int *, double *, int *, int *, double *, int *, int *);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
#ifndef _MSC_VER // [
|
||||
#ifndef _MSC_VER // [
|
||||
#error "Use this header only with Microsoft Visual C++ compilers!"
|
||||
#endif // _MSC_VER ]
|
||||
#endif // _MSC_VER ]
|
||||
|
||||
#ifndef _MSC_CONFIG_H_ // [
|
||||
#ifndef _MSC_CONFIG_H_ // [
|
||||
#define _MSC_CONFIG_H_
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// enable inline functions for C code
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
#ifndef __cplusplus
|
||||
# define inline __inline
|
||||
#define inline __inline
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
@@ -23,12 +23,15 @@ typedef ptrdiff_t ssize_t;
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
#if _MSC_VER < 1800
|
||||
#include <math.h>
|
||||
static inline long lrint(double x){return (long)(x > 0.0 ? x + 0.5 : x - 0.5);}
|
||||
static inline long lrintf(float x){return (long)(x > 0.0f ? x + 0.5f : x - 0.5f);}
|
||||
static inline long long llrint(double x){return (long long)(x > 0.0 ? x + 0.5 : x - 0.5);}
|
||||
static inline long long llrintf(float x){return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);}
|
||||
static inline double rint(double x){return (x > 0.0)? floor(x + 0.5) : ceil(x - 0.5);}
|
||||
static inline float rintf(float x){return (x > 0.0f)? floorf(x + 0.5f) : ceilf(x - 0.5f);}
|
||||
static inline long lrint(double x)
|
||||
{
|
||||
return (long)(x > 0.0 ? x + 0.5 : x - 0.5);
|
||||
}
|
||||
static inline long lrintf(float x) { return (long)(x > 0.0f ? x + 0.5f : x - 0.5f); }
|
||||
static inline long long llrint(double x) { return (long long)(x > 0.0 ? x + 0.5 : x - 0.5); }
|
||||
static inline long long llrintf(float x) { return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f); }
|
||||
static inline double rint(double x) { return (x > 0.0) ? floor(x + 0.5) : ceil(x - 0.5); }
|
||||
static inline float rintf(float x) { return (x > 0.0f) ? floorf(x + 0.5f) : ceilf(x - 0.5f); }
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
@@ -43,7 +46,10 @@ static inline float rintf(float x){return (x > 0.0f)? floorf(x + 0.5f) : ceilf(x
|
||||
// random and srandom
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
#include <stdlib.h>
|
||||
static inline long int random (void) { return rand(); }
|
||||
static inline void srandom (unsigned int seed) { srand(seed); }
|
||||
static inline long int random(void)
|
||||
{
|
||||
return rand();
|
||||
}
|
||||
static inline void srandom(unsigned int seed) { srand(seed); }
|
||||
|
||||
#endif // _MSC_CONFIG_H_ ]
|
||||
#endif // _MSC_CONFIG_H_ ]
|
||||
|
||||
@@ -31,85 +31,97 @@ static intptr_t __alignment_mask = 0;
|
||||
|
||||
struct volk_gnsssdr_machine *get_machine(void)
|
||||
{
|
||||
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
|
||||
extern unsigned int n_volk_gnsssdr_machines;
|
||||
static struct volk_gnsssdr_machine *machine = NULL;
|
||||
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
|
||||
extern unsigned int n_volk_gnsssdr_machines;
|
||||
static struct volk_gnsssdr_machine *machine = NULL;
|
||||
|
||||
if(machine != NULL)
|
||||
return machine;
|
||||
else {
|
||||
unsigned int max_score = 0;
|
||||
unsigned int i;
|
||||
struct volk_gnsssdr_machine *max_machine = NULL;
|
||||
for(i=0; i<n_volk_gnsssdr_machines; i++) {
|
||||
if(!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) {
|
||||
if(volk_gnsssdr_machines[i]->caps > max_score) {
|
||||
max_score = volk_gnsssdr_machines[i]->caps;
|
||||
max_machine = volk_gnsssdr_machines[i];
|
||||
if (machine != NULL)
|
||||
return machine;
|
||||
else
|
||||
{
|
||||
unsigned int max_score = 0;
|
||||
unsigned int i;
|
||||
struct volk_gnsssdr_machine *max_machine = NULL;
|
||||
for (i = 0; i < n_volk_gnsssdr_machines; i++)
|
||||
{
|
||||
if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
|
||||
{
|
||||
if (volk_gnsssdr_machines[i]->caps > max_score)
|
||||
{
|
||||
max_score = volk_gnsssdr_machines[i]->caps;
|
||||
max_machine = volk_gnsssdr_machines[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
machine = max_machine;
|
||||
//printf("Using Volk machine: %s\n", machine->name);
|
||||
__alignment = machine->alignment;
|
||||
__alignment_mask = (intptr_t)(__alignment - 1);
|
||||
return machine;
|
||||
}
|
||||
}
|
||||
}
|
||||
machine = max_machine;
|
||||
//printf("Using Volk machine: %s\n", machine->name);
|
||||
__alignment = machine->alignment;
|
||||
__alignment_mask = (intptr_t)(__alignment-1);
|
||||
return machine;
|
||||
}
|
||||
}
|
||||
|
||||
void volk_gnsssdr_list_machines(void)
|
||||
{
|
||||
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
|
||||
extern unsigned int n_volk_gnsssdr_machines;
|
||||
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
|
||||
extern unsigned int n_volk_gnsssdr_machines;
|
||||
|
||||
unsigned int i;
|
||||
for(i=0; i<n_volk_gnsssdr_machines; i++) {
|
||||
if(!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) {
|
||||
printf("%s;", volk_gnsssdr_machines[i]->name);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
unsigned int i;
|
||||
for (i = 0; i < n_volk_gnsssdr_machines; i++)
|
||||
{
|
||||
if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
|
||||
{
|
||||
printf("%s;", volk_gnsssdr_machines[i]->name);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
const char* volk_gnsssdr_get_machine(void)
|
||||
const char *volk_gnsssdr_get_machine(void)
|
||||
{
|
||||
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
|
||||
extern unsigned int n_volk_gnsssdr_machines;
|
||||
static struct volk_gnsssdr_machine *machine = NULL;
|
||||
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
|
||||
extern unsigned int n_volk_gnsssdr_machines;
|
||||
static struct volk_gnsssdr_machine *machine = NULL;
|
||||
|
||||
if(machine != NULL)
|
||||
return machine->name;
|
||||
else {
|
||||
unsigned int max_score = 0;
|
||||
unsigned int i;
|
||||
struct volk_gnsssdr_machine *max_machine = NULL;
|
||||
for(i=0; i<n_volk_gnsssdr_machines; i++) {
|
||||
if(!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) {
|
||||
if(volk_gnsssdr_machines[i]->caps > max_score) {
|
||||
max_score = volk_gnsssdr_machines[i]->caps;
|
||||
max_machine = volk_gnsssdr_machines[i];
|
||||
if (machine != NULL)
|
||||
return machine->name;
|
||||
else
|
||||
{
|
||||
unsigned int max_score = 0;
|
||||
unsigned int i;
|
||||
struct volk_gnsssdr_machine *max_machine = NULL;
|
||||
for (i = 0; i < n_volk_gnsssdr_machines; i++)
|
||||
{
|
||||
if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
|
||||
{
|
||||
if (volk_gnsssdr_machines[i]->caps > max_score)
|
||||
{
|
||||
max_score = volk_gnsssdr_machines[i]->caps;
|
||||
max_machine = volk_gnsssdr_machines[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
machine = max_machine;
|
||||
return machine->name;
|
||||
}
|
||||
}
|
||||
}
|
||||
machine = max_machine;
|
||||
return machine->name;
|
||||
}
|
||||
}
|
||||
|
||||
size_t volk_gnsssdr_get_alignment(void)
|
||||
{
|
||||
get_machine(); //ensures alignment is set
|
||||
get_machine(); //ensures alignment is set
|
||||
return __alignment;
|
||||
}
|
||||
|
||||
bool volk_gnsssdr_is_aligned(const void *ptr)
|
||||
{
|
||||
return ((intptr_t)(ptr) & __alignment_mask) == 0;
|
||||
return ((intptr_t)(ptr)&__alignment_mask) == 0;
|
||||
}
|
||||
|
||||
#define LV_HAVE_GENERIC
|
||||
#define LV_HAVE_DISPATCHER
|
||||
|
||||
// clang-format off
|
||||
|
||||
%for kern in kernels:
|
||||
|
||||
%if kern.has_dispatcher:
|
||||
@@ -190,6 +202,8 @@ void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name)
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
|
||||
volk_gnsssdr_func_desc_t ${kern.name}_get_func_desc(void) {
|
||||
const char **impl_names = get_machine()->${kern.name}_impl_names;
|
||||
const int *impl_deps = get_machine()->${kern.name}_impl_deps;
|
||||
@@ -205,3 +219,5 @@ volk_gnsssdr_func_desc_t ${kern.name}_get_func_desc(void) {
|
||||
}
|
||||
|
||||
%endfor
|
||||
|
||||
// clang-format on
|
||||
|
||||
@@ -42,7 +42,7 @@ typedef struct volk_gnsssdr_func_desc
|
||||
VOLK_API void volk_gnsssdr_list_machines(void);
|
||||
|
||||
//! Returns the name of the machine this instance will use
|
||||
VOLK_API const char* volk_gnsssdr_get_machine(void);
|
||||
VOLK_API const char *volk_gnsssdr_get_machine(void);
|
||||
|
||||
//! Get the machine alignment in bytes
|
||||
VOLK_API size_t volk_gnsssdr_get_alignment(void);
|
||||
@@ -73,6 +73,7 @@ VOLK_API bool volk_gnsssdr_is_aligned(const void *ptr);
|
||||
//! A function pointer to the dispatcher implementation
|
||||
extern VOLK_API ${kern.pname} ${kern.name};
|
||||
|
||||
// clang-format off
|
||||
//! A function pointer to the fastest aligned implementation
|
||||
extern VOLK_API ${kern.pname} ${kern.name}_a;
|
||||
|
||||
@@ -85,6 +86,7 @@ extern VOLK_API void ${kern.name}_manual(${kern.arglist_full}, const char* impl_
|
||||
//! Get description parameters for this kernel
|
||||
extern VOLK_API volk_gnsssdr_func_desc_t ${kern.name}_get_func_desc(void);
|
||||
%endfor
|
||||
// clang-format off
|
||||
|
||||
__VOLK_DECL_END
|
||||
|
||||
|
||||
@@ -19,10 +19,11 @@
|
||||
#ifndef INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H
|
||||
#define INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H
|
||||
|
||||
// clang-format off
|
||||
%for i, arch in enumerate(archs):
|
||||
//#ifndef LV_${arch.name.upper()}
|
||||
#define LV_${arch.name.upper()} ${i}
|
||||
//#endif
|
||||
%endfor
|
||||
|
||||
// clang-format on
|
||||
#endif /*INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED*/
|
||||
|
||||
@@ -24,50 +24,54 @@
|
||||
struct VOLK_CPU volk_gnsssdr_cpu;
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
|
||||
#define VOLK_CPU_x86
|
||||
#define VOLK_CPU_x86
|
||||
#endif
|
||||
|
||||
#if defined(VOLK_CPU_x86)
|
||||
|
||||
//implement get cpuid for gcc compilers using a system or local copy of cpuid.h
|
||||
#if defined(__GNUC__)
|
||||
#include <cpuid.h>
|
||||
#define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r+0, (unsigned int *)r+1, (unsigned int *)r+2, (unsigned int *)r+3)
|
||||
#define cpuid_x86_count(op, count, regs) __cpuid_count(op, count, *((unsigned int*)regs), *((unsigned int*)regs+1), *((unsigned int*)regs+2), *((unsigned int*)regs+3))
|
||||
#include <cpuid.h>
|
||||
#define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r + 0, (unsigned int *)r + 1, (unsigned int *)r + 2, (unsigned int *)r + 3)
|
||||
#define cpuid_x86_count(op, count, regs) __cpuid_count(op, count, *((unsigned int *)regs), *((unsigned int *)regs + 1), *((unsigned int *)regs + 2), *((unsigned int *)regs + 3))
|
||||
|
||||
/* Return Intel AVX extended CPU capabilities register.
|
||||
/* Return Intel AVX extended CPU capabilities register.
|
||||
* This function will bomb on non-AVX-capable machines, so
|
||||
* check for AVX capability before executing.
|
||||
*/
|
||||
#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) && defined(HAVE_XGETBV)
|
||||
static inline unsigned long long _xgetbv(unsigned int index){
|
||||
unsigned int eax, edx;
|
||||
__VOLK_ASM __VOLK_VOLATILE ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
|
||||
return ((unsigned long long)edx << 32) | eax;
|
||||
}
|
||||
#define __xgetbv() _xgetbv(0)
|
||||
#else
|
||||
#define __xgetbv() 0
|
||||
#endif
|
||||
#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) && defined(HAVE_XGETBV)
|
||||
static inline unsigned long long _xgetbv(unsigned int index)
|
||||
{
|
||||
unsigned int eax, edx;
|
||||
__VOLK_ASM __VOLK_VOLATILE("xgetbv"
|
||||
: "=a"(eax), "=d"(edx)
|
||||
: "c"(index));
|
||||
return ((unsigned long long)edx << 32) | eax;
|
||||
}
|
||||
#define __xgetbv() _xgetbv(0)
|
||||
#else
|
||||
#define __xgetbv() 0
|
||||
#endif
|
||||
|
||||
//implement get cpuid for MSVC compilers using __cpuid intrinsic
|
||||
#elif defined(_MSC_VER) && defined(HAVE_INTRIN_H)
|
||||
#include <intrin.h>
|
||||
#define cpuid_x86(op, r) __cpuid(((int*)r), op)
|
||||
#include <intrin.h>
|
||||
#define cpuid_x86(op, r) __cpuid(((int *)r), op)
|
||||
|
||||
#if defined(_XCR_XFEATURE_ENABLED_MASK)
|
||||
#define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
|
||||
#else
|
||||
#define __xgetbv() 0
|
||||
#endif
|
||||
#if defined(_XCR_XFEATURE_ENABLED_MASK)
|
||||
#define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
|
||||
#else
|
||||
#define __xgetbv() 0
|
||||
#endif
|
||||
|
||||
#else
|
||||
#error "A get cpuid for volk_gnsssdr is not available on this compiler..."
|
||||
#endif //defined(__GNUC__)
|
||||
#error "A get cpuid for volk_gnsssdr is not available on this compiler..."
|
||||
#endif //defined(__GNUC__)
|
||||
|
||||
#endif //defined(VOLK_CPU_x86)
|
||||
#endif //defined(VOLK_CPU_x86)
|
||||
|
||||
static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int count, unsigned int reg, unsigned int bit) {
|
||||
static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int count, unsigned int reg, unsigned int bit)
|
||||
{
|
||||
#if defined(VOLK_CPU_x86)
|
||||
unsigned int regs[4] = {0};
|
||||
cpuid_x86_count(level, count, regs);
|
||||
@@ -77,10 +81,11 @@ static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit) {
|
||||
static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit)
|
||||
{
|
||||
#if defined(VOLK_CPU_x86)
|
||||
unsigned int regs[4];
|
||||
memset(regs, 0, sizeof(unsigned int)*4);
|
||||
memset(regs, 0, sizeof(unsigned int) * 4);
|
||||
cpuid_x86(op, regs);
|
||||
return regs[reg] >> bit & 0x01;
|
||||
#else
|
||||
@@ -88,10 +93,11 @@ static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsi
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline unsigned int check_extended_cpuid(unsigned int val) {
|
||||
static inline unsigned int check_extended_cpuid(unsigned int val)
|
||||
{
|
||||
#if defined(VOLK_CPU_x86)
|
||||
unsigned int regs[4];
|
||||
memset(regs, 0, sizeof(unsigned int)*4);
|
||||
memset(regs, 0, sizeof(unsigned int) * 4);
|
||||
cpuid_x86(0x80000000, regs);
|
||||
return regs[0] >= val;
|
||||
#else
|
||||
@@ -99,7 +105,8 @@ static inline unsigned int check_extended_cpuid(unsigned int val) {
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline unsigned int get_avx_enabled(void) {
|
||||
static inline unsigned int get_avx_enabled(void)
|
||||
{
|
||||
#if defined(VOLK_CPU_x86)
|
||||
return __xgetbv() & 0x6;
|
||||
#else
|
||||
@@ -107,7 +114,8 @@ static inline unsigned int get_avx_enabled(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline unsigned int get_avx2_enabled(void) {
|
||||
static inline unsigned int get_avx2_enabled(void)
|
||||
{
|
||||
#if defined(VOLK_CPU_x86)
|
||||
return __xgetbv() & 0x6;
|
||||
#else
|
||||
@@ -117,28 +125,30 @@ static inline unsigned int get_avx2_enabled(void) {
|
||||
|
||||
//neon detection is linux specific
|
||||
#if defined(__arm__) && defined(__linux__)
|
||||
#include <asm/hwcap.h>
|
||||
#include <linux/auxvec.h>
|
||||
#include <stdio.h>
|
||||
#define VOLK_CPU_ARM
|
||||
#include <asm/hwcap.h>
|
||||
#include <linux/auxvec.h>
|
||||
#include <stdio.h>
|
||||
#define VOLK_CPU_ARM
|
||||
#endif
|
||||
|
||||
static int has_neon(void){
|
||||
static int has_neon(void)
|
||||
{
|
||||
#if defined(VOLK_CPU_ARM)
|
||||
FILE *auxvec_f;
|
||||
unsigned long auxvec[2];
|
||||
unsigned int found_neon = 0;
|
||||
auxvec_f = fopen("/proc/self/auxv", "rb");
|
||||
if(!auxvec_f) return 0;
|
||||
if (!auxvec_f) return 0;
|
||||
|
||||
size_t r = 1;
|
||||
//so auxv is basically 32b of ID and 32b of value
|
||||
//so it goes like this
|
||||
while(!found_neon && r) {
|
||||
r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f);
|
||||
if((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON))
|
||||
found_neon = 1;
|
||||
}
|
||||
while (!found_neon && r)
|
||||
{
|
||||
r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f);
|
||||
if ((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON))
|
||||
found_neon = 1;
|
||||
}
|
||||
|
||||
fclose(auxvec_f);
|
||||
return found_neon;
|
||||
@@ -146,6 +156,7 @@ static int has_neon(void){
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
// clang-format off
|
||||
|
||||
%for arch in archs:
|
||||
static int i_can_has_${arch.name} (void) {
|
||||
@@ -195,3 +206,4 @@ unsigned int volk_gnsssdr_get_lvarch() {
|
||||
%endfor
|
||||
return retval;
|
||||
}
|
||||
// clang-format on
|
||||
|
||||
@@ -23,16 +23,18 @@
|
||||
|
||||
__VOLK_DECL_BEGIN
|
||||
|
||||
// clang-format off
|
||||
struct VOLK_CPU {
|
||||
%for arch in archs:
|
||||
int (*has_${arch.name}) ();
|
||||
%endfor
|
||||
};
|
||||
// clang-format on
|
||||
|
||||
extern struct VOLK_CPU volk_gnsssdr_cpu;
|
||||
|
||||
void volk_gnsssdr_cpu_init ();
|
||||
unsigned int volk_gnsssdr_get_lvarch ();
|
||||
void volk_gnsssdr_cpu_init();
|
||||
unsigned int volk_gnsssdr_get_lvarch();
|
||||
|
||||
__VOLK_DECL_END
|
||||
|
||||
|
||||
@@ -16,6 +16,8 @@
|
||||
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
// clang-format off
|
||||
|
||||
<% this_machine = machine_dict[args[0]] %>
|
||||
<% arch_names = this_machine.arch_names %>
|
||||
|
||||
@@ -31,6 +33,7 @@
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
|
||||
%for kern in kernels:
|
||||
#include <volk_gnsssdr/${kern.name}.h>
|
||||
%endfor
|
||||
@@ -56,3 +59,4 @@ struct volk_gnsssdr_machine volk_gnsssdr_machine_${this_machine.name} = {
|
||||
<% len_impls = len(impls) %> ${len_impls},
|
||||
%endfor
|
||||
};
|
||||
// clang-format on
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
#include <volk_gnsssdr/volk_gnsssdr_typedefs.h>
|
||||
#include "volk_gnsssdr_machines.h"
|
||||
|
||||
// clang-format off
|
||||
struct volk_gnsssdr_machine *volk_gnsssdr_machines[] = {
|
||||
%for machine in machines:
|
||||
#ifdef LV_MACHINE_${machine.name.upper()}
|
||||
@@ -27,5 +28,5 @@ struct volk_gnsssdr_machine *volk_gnsssdr_machines[] = {
|
||||
#endif
|
||||
%endfor
|
||||
};
|
||||
|
||||
unsigned int n_volk_gnsssdr_machines = sizeof(volk_gnsssdr_machines)/sizeof(*volk_gnsssdr_machines);
|
||||
// clang-format on
|
||||
unsigned int n_volk_gnsssdr_machines = sizeof(volk_gnsssdr_machines) / sizeof(*volk_gnsssdr_machines);
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
|
||||
__VOLK_DECL_BEGIN
|
||||
|
||||
// clang-format off
|
||||
struct volk_gnsssdr_machine {
|
||||
const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_gnsssdr_get_lvarch format)
|
||||
const char *name;
|
||||
@@ -48,5 +49,6 @@ extern struct volk_gnsssdr_machine volk_gnsssdr_machine_${machine.name};
|
||||
%endfor
|
||||
|
||||
__VOLK_DECL_END
|
||||
// clang-format on
|
||||
|
||||
#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
|
||||
#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
|
||||
|
||||
@@ -22,8 +22,10 @@
|
||||
#include <inttypes.h>
|
||||
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
||||
|
||||
// clang-format off
|
||||
%for kern in kernels:
|
||||
typedef void (*${kern.pname})(${kern.arglist_types});
|
||||
%endfor
|
||||
// clang-format on
|
||||
|
||||
#endif /*INCLUDED_VOLK_GNSSSDR_TYPEDEFS*/
|
||||
|
||||
@@ -45,7 +45,8 @@
|
||||
#include <vector>
|
||||
|
||||
|
||||
extern "C" {
|
||||
extern "C"
|
||||
{
|
||||
#include "cnav_msg.h"
|
||||
#include "edc.h"
|
||||
#include "bits.h"
|
||||
|
||||
@@ -42,7 +42,8 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
extern "C" {
|
||||
extern "C"
|
||||
{
|
||||
#include "cnav_msg.h"
|
||||
#include "edc.h"
|
||||
#include "bits.h"
|
||||
|
||||
@@ -76,9 +76,9 @@ struct GPU_Complex
|
||||
}
|
||||
CUDA_CALLABLE_MEMBER_DEVICE void multiply_acc(const GPU_Complex& a, const GPU_Complex& b)
|
||||
{
|
||||
//c=a*b+c
|
||||
//real part
|
||||
//c.r=(a.r*b.r - a.i*b.i)+c.r
|
||||
//c=a*b+c
|
||||
//real part
|
||||
//c.r=(a.r*b.r - a.i*b.i)+c.r
|
||||
#ifdef __CUDACC__
|
||||
r = __fmaf_rn(a.r, b.r, r);
|
||||
r = __fmaf_rn(-a.i, b.i, r);
|
||||
|
||||
Reference in New Issue
Block a user