1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2025-11-09 19:53:04 +00:00

CI: make clang-tidy happy

This commit is contained in:
Carles Fernandez
2021-12-17 19:01:41 +01:00
parent 12ed230cd7
commit d9a516e0b8
5 changed files with 394 additions and 125 deletions

View File

@@ -224,7 +224,7 @@ void GpsL1CaPcpsOpenClAcquisition::reset()
} }
float GpsL1CaPcpsOpenClAcquisition::calculate_threshold(float pfa) float GpsL1CaPcpsOpenClAcquisition::calculate_threshold(float pfa) const
{ {
// Calculate the threshold // Calculate the threshold
unsigned int frequency_bins = 0; unsigned int frequency_bins = 0;

View File

@@ -146,7 +146,7 @@ public:
} }
private: private:
float calculate_threshold(float pfa); float calculate_threshold(float pfa) const;
const ConfigurationInterface* configuration_; const ConfigurationInterface* configuration_;
pcps_opencl_acquisition_cc_sptr acquisition_cc_; pcps_opencl_acquisition_cc_sptr acquisition_cc_;
gr::blocks::stream_to_vector::sptr stream_to_vector_; gr::blocks::stream_to_vector::sptr stream_to_vector_;

View File

@@ -28,13 +28,16 @@ allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize)
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float); size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float);
if (plan->tempmemobj) if (plan->tempmemobj)
{
clReleaseMemObject(plan->tempmemobj); clReleaseMemObject(plan->tempmemobj);
}
plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &err); plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &err);
} }
return err; return err;
} }
static cl_int static cl_int
allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize) allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
{ {
@@ -46,10 +49,14 @@ allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float); size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float);
if (plan->tempmemobj_real) if (plan->tempmemobj_real)
{
clReleaseMemObject(plan->tempmemobj_real); clReleaseMemObject(plan->tempmemobj_real);
}
if (plan->tempmemobj_imag) if (plan->tempmemobj_imag)
{
clReleaseMemObject(plan->tempmemobj_imag); clReleaseMemObject(plan->tempmemobj_imag);
}
plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &err); plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &err);
plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &terr); plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &terr);
@@ -58,6 +65,7 @@ allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
return err; return err;
} }
void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems) void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems)
{ {
*lWorkItems = kernelInfo->num_workitems_per_workgroup; *lWorkItems = kernelInfo->num_workitems_per_workgroup;
@@ -83,6 +91,7 @@ void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo,
*gWorkItems = numWorkGroups * *lWorkItems; *gWorkItems = numWorkGroups * *lWorkItems;
} }
cl_int cl_int
clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
cl_mem data_in, cl_mem data_out, cl_mem data_in, cl_mem data_out,
@@ -91,7 +100,9 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
int s; int s;
auto *plan = (cl_fft_plan *)Plan; auto *plan = (cl_fft_plan *)Plan;
if (plan->format != clFFT_InterleavedComplexFormat) if (plan->format != clFFT_InterleavedComplexFormat)
{
return CL_INVALID_VALUE; return CL_INVALID_VALUE;
}
cl_int err; cl_int err;
size_t gWorkItems; size_t gWorkItems;
@@ -101,7 +112,9 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
cl_int isInPlace = data_in == data_out ? 1 : 0; cl_int isInPlace = data_in == data_out ? 1 : 0;
if ((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS) if ((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS)
{
return err; return err;
}
cl_mem memObj[3]; cl_mem memObj[3];
memObj[0] = data_in; memObj[0] = data_in;
@@ -146,7 +159,9 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr); err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
if (err) if (err)
{
return err; return err;
}
currRead = (currWrite == 1) ? 1 : 2; currRead = (currWrite == 1) ? 1 : 2;
currWrite = (currWrite == 1) ? 2 : 1; currWrite = (currWrite == 1) ? 2 : 1;
@@ -169,7 +184,9 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr); err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
if (err) if (err)
{
return err; return err;
}
currRead = 1; currRead = 1;
currWrite = 1; currWrite = 1;
@@ -181,6 +198,7 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
return err; return err;
} }
cl_int cl_int
clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
@@ -190,7 +208,9 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
auto *plan = (cl_fft_plan *)Plan; auto *plan = (cl_fft_plan *)Plan;
if (plan->format != clFFT_SplitComplexFormat) if (plan->format != clFFT_SplitComplexFormat)
{
return CL_INVALID_VALUE; return CL_INVALID_VALUE;
}
cl_int err; cl_int err;
size_t gWorkItems; size_t gWorkItems;
@@ -200,7 +220,9 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0; cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0;
if ((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS) if ((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS)
{
return err; return err;
}
cl_mem memObj_real[3]; cl_mem memObj_real[3];
cl_mem memObj_imag[3]; cl_mem memObj_imag[3];
@@ -252,7 +274,9 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr); err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
if (err) if (err)
{
return err; return err;
}
currRead = (currWrite == 1) ? 1 : 2; currRead = (currWrite == 1) ? 1 : 2;
currWrite = (currWrite == 1) ? 2 : 1; currWrite = (currWrite == 1) ? 2 : 1;
@@ -276,7 +300,9 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr); err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
if (err) if (err)
{
return err; return err;
}
currRead = 1; currRead = 1;
currWrite = 1; currWrite = 1;
@@ -288,6 +314,7 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
return err; return err;
} }
cl_int cl_int
clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir) unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
@@ -304,12 +331,16 @@ clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
cl_device_id device_id; cl_device_id device_id;
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, nullptr); err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, nullptr);
if (err) if (err)
{
return err; return err;
}
size_t gSize; size_t gSize;
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, nullptr); err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, nullptr);
if (err) if (err)
{
return err; return err;
}
gSize = min(128, gSize); gSize = min(128, gSize);
size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize}; size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize};
@@ -327,6 +358,7 @@ clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
return err; return err;
} }
cl_int cl_int
clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir) unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
@@ -343,12 +375,16 @@ clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real,
cl_device_id device_id; cl_device_id device_id;
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, nullptr); err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, nullptr);
if (err) if (err)
{
return err; return err;
}
size_t gSize; size_t gSize;
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, nullptr); err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, nullptr);
if (err) if (err)
{
return err; return err;
}
gSize = min(128, gSize); gSize = min(128, gSize);
size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize}; size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize};

View File

@@ -30,7 +30,7 @@ num2str(int num)
{ {
char temp[200]; char temp[200];
snprintf(temp, sizeof(temp), "%d", num); snprintf(temp, sizeof(temp), "%d", num);
return string(temp); return {temp};
} }
// For any n, this function decomposes n into factors for local memory transpose // For any n, this function decomposes n into factors for local memory transpose
@@ -155,14 +155,20 @@ getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices
} }
} }
static void static void
insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat) insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat)
{ {
if (dataFormat == clFFT_SplitComplexFormat) if (dataFormat == clFFT_SplitComplexFormat)
{
kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n"); kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n");
}
else else
{
kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n"); kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n");
} }
}
static void static void
insertVariables(string &kStream, int maxRadix) insertVariables(string &kStream, int maxRadix)
@@ -177,11 +183,14 @@ insertVariables(string &kStream, int maxRadix)
kStream += string(" int groupId = get_group_id( 0 );\n"); kStream += string(" int groupId = get_group_id( 0 );\n");
} }
static void static void
formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat) formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
{ {
if (dataFormat == clFFT_InterleavedComplexFormat) if (dataFormat == clFFT_InterleavedComplexFormat)
{
kernelString += string(" a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n"); kernelString += string(" a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n");
}
else else
{ {
kernelString += string(" a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n"); kernelString += string(" a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n");
@@ -189,11 +198,14 @@ formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dat
} }
} }
static void static void
formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat) formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
{ {
if (dataFormat == clFFT_InterleavedComplexFormat) if (dataFormat == clFFT_InterleavedComplexFormat)
{
kernelString += string(" out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n"); kernelString += string(" out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n");
}
else else
{ {
kernelString += string(" out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n"); kernelString += string(" out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n");
@@ -201,6 +213,7 @@ formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat da
} }
} }
static int static int
insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat) insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat)
{ {
@@ -211,7 +224,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
int lMemSize = 0; int lMemSize = 0;
if (numXFormsPerWG > 1) if (numXFormsPerWG > 1)
{
kernelString += string(" s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n"); kernelString += string(" s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n");
}
if (numWorkItemsPerXForm >= mem_coalesce_width) if (numWorkItemsPerXForm >= mem_coalesce_width)
{ {
@@ -234,7 +249,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" out_imag += offset;\n"); kernelString += string(" out_imag += offset;\n");
} }
for (i = 0; i < R0; i++) for (i = 0; i < R0; i++)
{
formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat); formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat);
}
kernelString += string(" }\n"); kernelString += string(" }\n");
} }
else else
@@ -255,9 +272,11 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" out_imag += offset;\n"); kernelString += string(" out_imag += offset;\n");
} }
for (i = 0; i < R0; i++) for (i = 0; i < R0; i++)
{
formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat); formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat);
} }
} }
}
else if (N >= mem_coalesce_width) else if (N >= mem_coalesce_width)
{ {
int numInnerIter = N / mem_coalesce_width; int numInnerIter = N / mem_coalesce_width;
@@ -286,18 +305,24 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
{ {
kernelString += string(" if( jj < s ) {\n"); kernelString += string(" if( jj < s ) {\n");
for (j = 0; j < numInnerIter; j++) for (j = 0; j < numInnerIter; j++)
{
formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat); formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
}
kernelString += string(" }\n"); kernelString += string(" }\n");
if (i != numOuterIter - 1) if (i != numOuterIter - 1)
{
kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
} }
}
kernelString += string("}\n "); kernelString += string("}\n ");
kernelString += string("else {\n"); kernelString += string("else {\n");
for (i = 0; i < numOuterIter; i++) for (i = 0; i < numOuterIter; i++)
{ {
for (j = 0; j < numInnerIter; j++) for (j = 0; j < numInnerIter; j++)
{
formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat); formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
} }
}
kernelString += string("}\n"); kernelString += string("}\n");
kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n"); kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n");
@@ -315,7 +340,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < R0; i++) for (i = 0; i < R0; i++)
{
kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < numOuterIter; i++) for (i = 0; i < numOuterIter; i++)
@@ -329,7 +356,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < R0; i++) for (i = 0; i < R0; i++)
{
kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
@@ -360,8 +389,10 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" if(jj < s )\n"); kernelString += string(" if(jj < s )\n");
formattedLoad(kernelString, i, i * groupSize, dataFormat); formattedLoad(kernelString, i, i * groupSize, dataFormat);
if (i != R0 - 1) if (i != R0 - 1)
{
kernelString += string(" jj += ") + num2str(groupSize / N) + string(";\n"); kernelString += string(" jj += ") + num2str(groupSize / N) + string(";\n");
} }
}
kernelString += string("}\n"); kernelString += string("}\n");
kernelString += string("else {\n"); kernelString += string("else {\n");
for (i = 0; i < R0; i++) for (i = 0; i < R0; i++)
@@ -385,19 +416,27 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
for (i = 0; i < R0; i++) for (i = 0; i < R0; i++)
{
kernelString += string(" lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("] = a[") + num2str(i) + string("].x;\n"); kernelString += string(" lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("] = a[") + num2str(i) + string("].x;\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < R0; i++) for (i = 0; i < R0; i++)
{
kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < R0; i++) for (i = 0; i < R0; i++)
{
kernelString += string(" lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("] = a[") + num2str(i) + string("].y;\n"); kernelString += string(" lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("] = a[") + num2str(i) + string("].y;\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < R0; i++) for (i = 0; i < R0; i++)
{
kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
@@ -406,6 +445,7 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
return lMemSize; return lMemSize;
} }
static int static int
insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat) insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat)
{ {
@@ -433,8 +473,10 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
formattedStore(kernelString, ind, i * numWorkItemsPerXForm, dataFormat); formattedStore(kernelString, ind, i * numWorkItemsPerXForm, dataFormat);
} }
if (numXFormsPerWG > 1) if (numXFormsPerWG > 1)
{
kernelString += string(" }\n"); kernelString += string(" }\n");
} }
}
else if (N >= mem_coalesce_width) else if (N >= mem_coalesce_width)
{ {
int numInnerIter = N / mem_coalesce_width; int numInnerIter = N / mem_coalesce_width;
@@ -455,8 +497,12 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < numOuterIter; i++) for (i = 0; i < numOuterIter; i++)
{
for (j = 0; j < numInnerIter; j++) for (j = 0; j < numInnerIter; j++)
{
kernelString += string(" a[") + num2str(i * numInnerIter + j) + string("].x = lMemStore[") + num2str(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)) + string("];\n"); kernelString += string(" a[") + num2str(i * numInnerIter + j) + string("].x = lMemStore[") + num2str(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)) + string("];\n");
}
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < maxRadix; i++) for (i = 0; i < maxRadix; i++)
@@ -469,8 +515,12 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < numOuterIter; i++) for (i = 0; i < numOuterIter; i++)
{
for (j = 0; j < numInnerIter; j++) for (j = 0; j < numInnerIter; j++)
{
kernelString += string(" a[") + num2str(i * numInnerIter + j) + string("].y = lMemStore[") + num2str(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)) + string("];\n"); kernelString += string(" a[") + num2str(i * numInnerIter + j) + string("].y = lMemStore[") + num2str(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)) + string("];\n");
}
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
@@ -478,18 +528,24 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
{ {
kernelString += string(" if( jj < s ) {\n"); kernelString += string(" if( jj < s ) {\n");
for (j = 0; j < numInnerIter; j++) for (j = 0; j < numInnerIter; j++)
{
formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat); formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
}
kernelString += string(" }\n"); kernelString += string(" }\n");
if (i != numOuterIter - 1) if (i != numOuterIter - 1)
{
kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
} }
}
kernelString += string("}\n"); kernelString += string("}\n");
kernelString += string("else {\n"); kernelString += string("else {\n");
for (i = 0; i < numOuterIter; i++) for (i = 0; i < numOuterIter; i++)
{ {
for (j = 0; j < numInnerIter; j++) for (j = 0; j < numInnerIter; j++)
{
formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat); formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
} }
}
kernelString += string("}\n"); kernelString += string("}\n");
lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
@@ -512,7 +568,9 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < maxRadix; i++) for (i = 0; i < maxRadix; i++)
{
kernelString += string(" a[") + num2str(i) + string("].x = lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("];\n"); kernelString += string(" a[") + num2str(i) + string("].x = lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < maxRadix; i++) for (i = 0; i < maxRadix; i++)
@@ -525,7 +583,9 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < maxRadix; i++) for (i = 0; i < maxRadix; i++)
{
kernelString += string(" a[") + num2str(i) + string("].y = lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("];\n"); kernelString += string(" a[") + num2str(i) + string("].y = lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
@@ -535,8 +595,10 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
formattedStore(kernelString, i, i * groupSize, dataFormat); formattedStore(kernelString, i, i * groupSize, dataFormat);
kernelString += string(" }\n"); kernelString += string(" }\n");
if (i != maxRadix - 1) if (i != maxRadix - 1)
{
kernelString += string(" jj +=") + num2str(groupSize / N) + string(";\n"); kernelString += string(" jj +=") + num2str(groupSize / N) + string(";\n");
} }
}
kernelString += string("}\n"); kernelString += string("}\n");
kernelString += string("else {\n"); kernelString += string("else {\n");
for (i = 0; i < maxRadix; i++) for (i = 0; i < maxRadix; i++)
@@ -551,6 +613,7 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
return lMemSize; return lMemSize;
} }
static void static void
insertfftKernel(string &kernelString, int Nr, int numIter) insertfftKernel(string &kernelString, int Nr, int numIter)
{ {
@@ -561,6 +624,7 @@ insertfftKernel(string &kernelString, int Nr, int numIter)
} }
} }
static void static void
insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm) insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm)
{ {
@@ -573,17 +637,25 @@ insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int le
if (z == 0) if (z == 0)
{ {
if (Nprev > 1) if (Nprev > 1)
{
kernelString += string(" angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n"); kernelString += string(" angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n");
}
else else
{
kernelString += string(" angf = (float) ii;\n"); kernelString += string(" angf = (float) ii;\n");
} }
}
else else
{ {
if (Nprev > 1) if (Nprev > 1)
{
kernelString += string(" angf = (float) ((") + num2str(z * numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n"); kernelString += string(" angf = (float) ((") + num2str(z * numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n");
}
else else
{
kernelString += string(" angf = (float) (") + num2str(z * numWorkItemsPerXForm) + string(" + ii);\n"); kernelString += string(" angf = (float) (") + num2str(z * numWorkItemsPerXForm) + string(" + ii);\n");
} }
}
for (k = 1; k < Nr; k++) for (k = 1; k < Nr; k++)
{ {
@@ -596,31 +668,42 @@ insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int le
} }
} }
static int static int
getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad) getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad)
{ {
if ((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks)) if ((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks))
{
*offset = 0; *offset = 0;
}
else else
{ {
int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev; int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev;
int numColsReq = 1; int numColsReq = 1;
if (numRowsReq > Nr) if (numRowsReq > Nr)
{
numColsReq = numRowsReq / Nr; numColsReq = numRowsReq / Nr;
}
numColsReq = Nprev * numColsReq; numColsReq = Nprev * numColsReq;
*offset = numColsReq; *offset = numColsReq;
} }
if (numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1) if (numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1)
{
*midPad = 0; *midPad = 0;
}
else else
{ {
int bankNum = ((numWorkItemsReq + *offset) * Nr) & (numBanks - 1); int bankNum = ((numWorkItemsReq + *offset) * Nr) & (numBanks - 1);
if (bankNum >= numWorkItemsPerXForm) if (bankNum >= numWorkItemsPerXForm)
{
*midPad = 0; *midPad = 0;
}
else else
{
*midPad = numWorkItemsPerXForm - bankNum; *midPad = numWorkItemsPerXForm - bankNum;
} }
}
int lMemSize = (numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1); int lMemSize = (numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1);
return lMemSize; return lMemSize;
@@ -644,6 +727,7 @@ insertLocalStores(string &kernelString, int numIter, int Nr, int numWorkItemsPer
kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n");
} }
static void static void
insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp) insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp)
{ {
@@ -676,6 +760,7 @@ insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Nc
kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n");
} }
static void static void
insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad) insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad)
{ {
@@ -687,33 +772,52 @@ insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numW
if (Ncurr < numWorkItemsPerXForm) if (Ncurr < numWorkItemsPerXForm)
{ {
if (Nprev == 1) if (Nprev == 1)
{
kernelString += string(" j = ii & ") + num2str(Ncurr - 1) + string(";\n"); kernelString += string(" j = ii & ") + num2str(Ncurr - 1) + string(";\n");
}
else else
{
kernelString += string(" j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n"); kernelString += string(" j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n");
}
if (Nprev == 1) if (Nprev == 1)
{
kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n"); kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n");
}
else else
{
kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n"); kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n");
} }
}
else else
{ {
if (Nprev == 1) if (Nprev == 1)
{
kernelString += string(" j = ii;\n"); kernelString += string(" j = ii;\n");
}
else else
{
kernelString += string(" j = ii >> ") + num2str(logNprev) + string(";\n"); kernelString += string(" j = ii >> ") + num2str(logNprev) + string(";\n");
}
if (Nprev == 1) if (Nprev == 1)
{
kernelString += string(" i = 0;\n"); kernelString += string(" i = 0;\n");
}
else else
{
kernelString += string(" i = ii & ") + num2str(Nprev - 1) + string(";\n"); kernelString += string(" i = ii & ") + num2str(Nprev - 1) + string(";\n");
} }
}
if (numXFormsPerWG > 1) if (numXFormsPerWG > 1)
{
kernelString += string(" i = mad24(jj, ") + num2str(incr) + string(", i);\n"); kernelString += string(" i = mad24(jj, ") + num2str(incr) + string(", i);\n");
}
kernelString += string(" lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n"); kernelString += string(" lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n");
} }
static void static void
insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad) insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad)
{ {
@@ -742,7 +846,9 @@ createLocalMemfftKernelString(cl_fft_plan *plan)
assert(numRadix > 0 && "no radix array supplied\n"); assert(numRadix > 0 && "no radix array supplied\n");
if (n / radixArray[0] > plan->max_work_item_per_workgroup) if (n / radixArray[0] > plan->max_work_item_per_workgroup)
{
getRadixArray(n, radixArray, &numRadix, plan->max_radix); getRadixArray(n, radixArray, &numRadix, plan->max_radix);
}
assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n"); assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n");
assert(n / radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n"); assert(n / radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n");
@@ -839,11 +945,14 @@ createLocalMemfftKernelString(cl_fft_plan *plan)
insertHeader(*kernelString, kernelName, dataFormat); insertHeader(*kernelString, kernelName, dataFormat);
*kernelString += string("{\n"); *kernelString += string("{\n");
if ((*kInfo)->lmem_size) if ((*kInfo)->lmem_size)
{
*kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n"); *kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
}
*kernelString += localString; *kernelString += localString;
*kernelString += string("}\n"); *kernelString += string("}\n");
} }
// For n larger than what can be computed using local memory fft, global transposes and // For n larger than what can be computed using local memory fft, global transposes and
// multiple kernel launches are needed. For these sizes, n can be decomposed using // multiple kernel launches are needed. For these sizes, n can be decomposed using
// much larger base radices i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel // much larger base radices i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel
@@ -864,7 +973,6 @@ createLocalMemfftKernelString(cl_fft_plan *plan)
// in this example. Users can play with different base radices and different // in this example. Users can play with different base radices and different
// decompositions of base radices to generate different kernels and see which gives // decompositions of base radices to generate different kernels and see which gives
// best performance. The following function is just fixed to use 128 as the base radix // best performance. The following function is just fixed to use 128 as the base radix
void getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices) void getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
{ {
int baseRadix = min(n, 128); int baseRadix = min(n, 128);
@@ -878,7 +986,9 @@ void getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
} }
for (int i = 0; i < numR; i++) for (int i = 0; i < numR; i++)
{
radix[i] = baseRadix; radix[i] = baseRadix;
}
radix[numR] = N; radix[numR] = N;
numR++; numR++;
@@ -906,6 +1016,7 @@ void getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
} }
} }
static void static void
createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS) createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS)
{ {
@@ -960,12 +1071,18 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
int strideI = Rinit; int strideI = Rinit;
for (i = 0; i < numPasses; i++) for (i = 0; i < numPasses; i++)
{
if (i != passNum) if (i != passNum)
{
strideI *= radixArr[i]; strideI *= radixArr[i];
}
}
int strideO = Rinit; int strideO = Rinit;
for (i = 0; i < passNum; i++) for (i = 0; i < passNum; i++)
{
strideO *= radixArr[i]; strideO *= radixArr[i];
}
int threadsPerXForm = R2; int threadsPerXForm = R2;
batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize; batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize;
@@ -986,30 +1103,44 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
int numBlocksPerXForm = strideI / batchSize; int numBlocksPerXForm = strideI / batchSize;
int numBlocks = numBlocksPerXForm; int numBlocks = numBlocksPerXForm;
if (!vertical) if (!vertical)
{
numBlocks *= BS; numBlocks *= BS;
}
else else
{
numBlocks *= vertBS; numBlocks *= vertBS;
}
kernelName = string("fft") + num2str(kCount); kernelName = string("fft") + num2str(kCount);
*kInfo = (cl_fft_kernel_info *)malloc(sizeof(cl_fft_kernel_info)); *kInfo = (cl_fft_kernel_info *)malloc(sizeof(cl_fft_kernel_info));
(*kInfo)->kernel = nullptr; (*kInfo)->kernel = nullptr;
if (R2 == 1) if (R2 == 1)
{
(*kInfo)->lmem_size = 0; (*kInfo)->lmem_size = 0;
}
else else
{ {
if (strideO == 1) if (strideO == 1)
{
(*kInfo)->lmem_size = (radix + 1) * batchSize; (*kInfo)->lmem_size = (radix + 1) * batchSize;
}
else else
{
(*kInfo)->lmem_size = threadsPerBlock * R1; (*kInfo)->lmem_size = threadsPerBlock * R1;
} }
}
(*kInfo)->num_workgroups = numBlocks; (*kInfo)->num_workgroups = numBlocks;
(*kInfo)->num_xforms_per_workgroup = 1; (*kInfo)->num_xforms_per_workgroup = 1;
(*kInfo)->num_workitems_per_workgroup = threadsPerBlock; (*kInfo)->num_workitems_per_workgroup = threadsPerBlock;
(*kInfo)->dir = dir; (*kInfo)->dir = dir;
if ((passNum == (numPasses - 1)) && (numPasses & 1)) if ((passNum == (numPasses - 1)) && (numPasses & 1))
{
(*kInfo)->in_place_possible = 1; (*kInfo)->in_place_possible = 1;
}
else else
{
(*kInfo)->in_place_possible = 0; (*kInfo)->in_place_possible = 0;
}
(*kInfo)->next = nullptr; (*kInfo)->next = nullptr;
(*kInfo)->kernel_name = (char *)malloc(sizeof(char) * (kernelName.size() + 1)); (*kInfo)->kernel_name = (char *)malloc(sizeof(char) * (kernelName.size() + 1));
snprintf((*kInfo)->kernel_name, sizeof((*kInfo)->kernel_name), kernelName.c_str()); snprintf((*kInfo)->kernel_name, sizeof((*kInfo)->kernel_name), kernelName.c_str());
@@ -1026,7 +1157,9 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
int stride = radix * Rinit; int stride = radix * Rinit;
for (i = 0; i < passNum; i++) for (i = 0; i < passNum; i++)
{
stride *= radixArr[i]; stride *= radixArr[i];
}
localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int)log2(n * BS)) + string("));\n"); localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int)log2(n * BS)) + string("));\n");
localString += string("bNum = groupId;\n"); localString += string("bNum = groupId;\n");
} }
@@ -1041,7 +1174,9 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
int stride = radix * Rinit; int stride = radix * Rinit;
for (i = 0; i < passNum; i++) for (i = 0; i < passNum; i++)
{
stride *= radixArr[i]; stride *= radixArr[i];
}
localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n"); localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n");
localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n"); localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n");
localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n"); localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n");
@@ -1059,16 +1194,22 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("in_real += indexIn;\n"); localString += string("in_real += indexIn;\n");
localString += string("in_imag += indexIn;\n"); localString += string("in_imag += indexIn;\n");
for (j = 0; j < R1; j++) for (j = 0; j < R1; j++)
{
localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j * gInInc * strideI) + string("];\n"); localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j * gInInc * strideI) + string("];\n");
}
for (j = 0; j < R1; j++) for (j = 0; j < R1; j++)
{
localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j * gInInc * strideI) + string("];\n"); localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j * gInInc * strideI) + string("];\n");
} }
}
else else
{ {
localString += string("in += indexIn;\n"); localString += string("in += indexIn;\n");
for (j = 0; j < R1; j++) for (j = 0; j < R1; j++)
{
localString += string("a[") + num2str(j) + string("] = in[") + num2str(j * gInInc * strideI) + string("];\n"); localString += string("a[") + num2str(j) + string("] = in[") + num2str(j * gInInc * strideI) + string("];\n");
} }
}
localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n"); localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n");
@@ -1088,23 +1229,37 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("lMemStore = sMem + tid;\n"); localString += string("lMemStore = sMem + tid;\n");
localString += string("lMemLoad = sMem + indexIn;\n"); localString += string("lMemLoad = sMem + indexIn;\n");
for (k = 0; k < R1; k++) for (k = 0; k < R1; k++)
{
localString += string("lMemStore[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n"); localString += string("lMemStore[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for (k = 0; k < numIter; k++) for (k = 0; k < numIter; k++)
{
for (t = 0; t < R2; t++) for (t = 0; t < R2; t++)
{
localString += string("a[") + num2str(k * R2 + t) + string("].x = lMemLoad[") + num2str(t * batchSize + k * threadsPerBlock) + string("];\n"); localString += string("a[") + num2str(k * R2 + t) + string("].x = lMemLoad[") + num2str(t * batchSize + k * threadsPerBlock) + string("];\n");
}
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for (k = 0; k < R1; k++) for (k = 0; k < R1; k++)
{
localString += string("lMemStore[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n"); localString += string("lMemStore[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for (k = 0; k < numIter; k++) for (k = 0; k < numIter; k++)
{
for (t = 0; t < R2; t++) for (t = 0; t < R2; t++)
{
localString += string("a[") + num2str(k * R2 + t) + string("].y = lMemLoad[") + num2str(t * batchSize + k * threadsPerBlock) + string("];\n"); localString += string("a[") + num2str(k * R2 + t) + string("].y = lMemLoad[") + num2str(t * batchSize + k * threadsPerBlock) + string("];\n");
}
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for (j = 0; j < numIter; j++) for (j = 0; j < numIter; j++)
{
localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j * R2) + string(", dir);\n"); localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j * R2) + string(", dir);\n");
} }
}
// twiddle // twiddle
if (passNum < (numPasses - 1)) if (passNum < (numPasses - 1))
@@ -1127,41 +1282,61 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix + 1) + string(", tid & ") + num2str(radix - 1) + string(");\n"); localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix + 1) + string(", tid & ") + num2str(radix - 1) + string(");\n");
for (i = 0; i < R1 / R2; i++) for (i = 0; i < R1 / R2; i++)
{
for (j = 0; j < R2; j++) for (j = 0; j < R2; j++)
{
localString += string("lMemStore[ ") + num2str(i + j * R1) + string("] = a[") + num2str(i * R2 + j) + string("].x;\n"); localString += string("lMemStore[ ") + num2str(i + j * R1) + string("] = a[") + num2str(i * R2 + j) + string("].x;\n");
}
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
if (threadsPerBlock >= radix) if (threadsPerBlock >= radix)
{ {
for (i = 0; i < R1; i++) for (i = 0; i < R1; i++)
{
localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * (radix + 1) * (threadsPerBlock / radix)) + string("];\n"); localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * (radix + 1) * (threadsPerBlock / radix)) + string("];\n");
} }
}
else else
{ {
int innerIter = radix / threadsPerBlock; int innerIter = radix / threadsPerBlock;
int outerIter = R1 / innerIter; int outerIter = R1 / innerIter;
for (i = 0; i < outerIter; i++) for (i = 0; i < outerIter; i++)
{
for (j = 0; j < innerIter; j++) for (j = 0; j < innerIter; j++)
{
localString += string("a[") + num2str(i * innerIter + j) + string("].x = lMemLoad[") + num2str(j * threadsPerBlock + i * (radix + 1)) + string("];\n"); localString += string("a[") + num2str(i * innerIter + j) + string("].x = lMemLoad[") + num2str(j * threadsPerBlock + i * (radix + 1)) + string("];\n");
} }
}
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for (i = 0; i < R1 / R2; i++) for (i = 0; i < R1 / R2; i++)
{
for (j = 0; j < R2; j++) for (j = 0; j < R2; j++)
{
localString += string("lMemStore[ ") + num2str(i + j * R1) + string("] = a[") + num2str(i * R2 + j) + string("].y;\n"); localString += string("lMemStore[ ") + num2str(i + j * R1) + string("] = a[") + num2str(i * R2 + j) + string("].y;\n");
}
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
if (threadsPerBlock >= radix) if (threadsPerBlock >= radix)
{ {
for (i = 0; i < R1; i++) for (i = 0; i < R1; i++)
{
localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * (radix + 1) * (threadsPerBlock / radix)) + string("];\n"); localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * (radix + 1) * (threadsPerBlock / radix)) + string("];\n");
} }
}
else else
{ {
int innerIter = radix / threadsPerBlock; int innerIter = radix / threadsPerBlock;
int outerIter = R1 / innerIter; int outerIter = R1 / innerIter;
for (i = 0; i < outerIter; i++) for (i = 0; i < outerIter; i++)
{
for (j = 0; j < innerIter; j++) for (j = 0; j < innerIter; j++)
{
localString += string("a[") + num2str(i * innerIter + j) + string("].y = lMemLoad[") + num2str(j * threadsPerBlock + i * (radix + 1)) + string("];\n"); localString += string("a[") + num2str(i * innerIter + j) + string("].y = lMemLoad[") + num2str(j * threadsPerBlock + i * (radix + 1)) + string("];\n");
} }
}
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
localString += string("indexOut += tid;\n"); localString += string("indexOut += tid;\n");
@@ -1170,17 +1345,23 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("out_real += indexOut;\n"); localString += string("out_real += indexOut;\n");
localString += string("out_imag += indexOut;\n"); localString += string("out_imag += indexOut;\n");
for (k = 0; k < R1; k++) for (k = 0; k < R1; k++)
{
localString += string("out_real[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n"); localString += string("out_real[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
}
for (k = 0; k < R1; k++) for (k = 0; k < R1; k++)
{
localString += string("out_imag[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n"); localString += string("out_imag[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
} }
}
else else
{ {
localString += string("out += indexOut;\n"); localString += string("out += indexOut;\n");
for (k = 0; k < R1; k++) for (k = 0; k < R1; k++)
{
localString += string("out[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n"); localString += string("out[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n");
} }
} }
}
else else
{ {
localString += string("indexOut += mad24(j, ") + num2str(numIter * strideO) + string(", i);\n"); localString += string("indexOut += mad24(j, ") + num2str(numIter * strideO) + string(", i);\n");
@@ -1189,22 +1370,30 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("out_real += indexOut;\n"); localString += string("out_real += indexOut;\n");
localString += string("out_imag += indexOut;\n"); localString += string("out_imag += indexOut;\n");
for (k = 0; k < R1; k++) for (k = 0; k < R1; k++)
{
localString += string("out_real[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("].x;\n"); localString += string("out_real[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("].x;\n");
}
for (k = 0; k < R1; k++) for (k = 0; k < R1; k++)
{
localString += string("out_imag[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("].y;\n"); localString += string("out_imag[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("].y;\n");
} }
}
else else
{ {
localString += string("out += indexOut;\n"); localString += string("out += indexOut;\n");
for (k = 0; k < R1; k++) for (k = 0; k < R1; k++)
{
localString += string("out[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("];\n"); localString += string("out[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("];\n");
} }
} }
}
insertHeader(*kernelString, kernelName, dataFormat); insertHeader(*kernelString, kernelName, dataFormat);
*kernelString += string("{\n"); *kernelString += string("{\n");
if ((*kInfo)->lmem_size) if ((*kInfo)->lmem_size)
{
*kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n"); *kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
}
*kernelString += localString; *kernelString += localString;
*kernelString += string("}\n"); *kernelString += string("}\n");
@@ -1214,6 +1403,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
} }
} }
void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir) void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir)
{ {
unsigned int radixArray[10]; unsigned int radixArray[10];
@@ -1237,21 +1427,29 @@ void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir)
{ {
getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix); getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix);
if (plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup) if (plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup)
{
createLocalMemfftKernelString(plan); createLocalMemfftKernelString(plan);
}
else else
{
createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1); createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
} }
} }
}
break; break;
case cl_fft_kernel_y: case cl_fft_kernel_y:
if (plan->n.y > 1) if (plan->n.y > 1)
{
createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1); createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1);
}
break; break;
case cl_fft_kernel_z: case cl_fft_kernel_z:
if (plan->n.z > 1) if (plan->n.z > 1)
{
createGlobalFFTKernelString(plan, plan->n.z, plan->n.x * plan->n.y, cl_fft_kernel_z, 1); createGlobalFFTKernelString(plan, plan->n.z, plan->n.x * plan->n.y, cl_fft_kernel_z, 1);
}
default: default:
return; return;
} }

View File

@@ -31,9 +31,13 @@ getBlockConfigAndKernelString(cl_fft_plan *plan)
*plan->kernel_string += baseKernels; *plan->kernel_string += baseKernels;
if (plan->format == clFFT_SplitComplexFormat) if (plan->format == clFFT_SplitComplexFormat)
{
*plan->kernel_string += twistKernelPlannar; *plan->kernel_string += twistKernelPlannar;
}
else else
{
*plan->kernel_string += twistKernelInterleaved; *plan->kernel_string += twistKernelInterleaved;
}
switch (plan->dim) switch (plan->dim)
{ {
@@ -72,13 +76,18 @@ deleteKernelInfo(cl_fft_kernel_info *kInfo)
if (kInfo) if (kInfo)
{ {
if (kInfo->kernel_name) if (kInfo->kernel_name)
{
free(kInfo->kernel_name); free(kInfo->kernel_name);
}
if (kInfo->kernel) if (kInfo->kernel)
{
clReleaseKernel(kInfo->kernel); clReleaseKernel(kInfo->kernel);
}
free(kInfo); free(kInfo);
} }
} }
static void static void
destroy_plan(cl_fft_plan *Plan) destroy_plan(cl_fft_plan *Plan)
{ {
@@ -125,6 +134,7 @@ destroy_plan(cl_fft_plan *Plan)
} }
} }
static int static int
createKernelList(cl_fft_plan *plan) createKernelList(cl_fft_plan *plan)
{ {
@@ -136,21 +146,30 @@ createKernelList(cl_fft_plan *plan)
{ {
kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err); kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err);
if (!kernel_info->kernel || err != CL_SUCCESS) if (!kernel_info->kernel || err != CL_SUCCESS)
{
return err; return err;
}
kernel_info = kernel_info->next; kernel_info = kernel_info->next;
} }
if (plan->format == clFFT_SplitComplexFormat) if (plan->format == clFFT_SplitComplexFormat)
{
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err); plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err);
}
else else
{
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err); plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err);
}
if (!plan->twist_kernel || err) if (!plan->twist_kernel || err)
{
return err; return err;
}
return CL_SUCCESS; return CL_SUCCESS;
} }
int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices) int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices)
{ {
int reg_needed = 0; int reg_needed = 0;
@@ -166,13 +185,19 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi
{ {
err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, nullptr); err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, nullptr);
if (err != CL_SUCCESS) if (err != CL_SUCCESS)
{
return -1; return -1;
}
if (wg_size < kInfo->num_workitems_per_workgroup) if (wg_size < kInfo->num_workitems_per_workgroup)
{
reg_needed |= 1; reg_needed |= 1;
}
if (*max_wg_size > wg_size) if (*max_wg_size > wg_size)
{
*max_wg_size = wg_size; *max_wg_size = wg_size;
}
kInfo = kInfo->next; kInfo = kInfo->next;
} }
@@ -181,6 +206,7 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi
return reg_needed; return reg_needed;
} }
#define ERR_MACRO(err) \ #define ERR_MACRO(err) \
{ \ { \
if ((err) != CL_SUCCESS) \ if ((err) != CL_SUCCESS) \
@@ -192,6 +218,7 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi
} \ } \
} }
clFFT_Plan clFFT_Plan
clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code) clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code)
{ {
@@ -326,11 +353,14 @@ patch_kernel_source:
} }
if (error_code) if (error_code)
{
*error_code = CL_SUCCESS; *error_code = CL_SUCCESS;
}
return (clFFT_Plan)plan; return (clFFT_Plan)plan;
} }
void clFFT_DestroyPlan(clFFT_Plan plan) void clFFT_DestroyPlan(clFFT_Plan plan)
{ {
auto *Plan = (cl_fft_plan *)plan; auto *Plan = (cl_fft_plan *)plan;
@@ -342,15 +372,20 @@ void clFFT_DestroyPlan(clFFT_Plan plan)
} }
} }
void clFFT_DumpPlan(clFFT_Plan Plan, FILE *file) void clFFT_DumpPlan(clFFT_Plan Plan, FILE *file)
{ {
size_t gDim; size_t gDim;
size_t lDim; size_t lDim;
FILE *out; FILE *out;
if (!file) if (!file)
{
out = stdout; out = stdout;
}
else else
{
out = file; out = file;
}
auto *plan = (cl_fft_plan *)Plan; auto *plan = (cl_fft_plan *)Plan;
cl_fft_kernel_info *kInfo = plan->kernel_info; cl_fft_kernel_info *kInfo = plan->kernel_info;