CI: make clang-tidy happy

This commit is contained in:
Carles Fernandez 2021-12-17 19:01:41 +01:00
parent 12ed230cd7
commit d9a516e0b8
No known key found for this signature in database
GPG Key ID: 4C583C52B0C3877D
5 changed files with 394 additions and 125 deletions

View File

@ -224,7 +224,7 @@ void GpsL1CaPcpsOpenClAcquisition::reset()
}
float GpsL1CaPcpsOpenClAcquisition::calculate_threshold(float pfa)
float GpsL1CaPcpsOpenClAcquisition::calculate_threshold(float pfa) const
{
// Calculate the threshold
unsigned int frequency_bins = 0;

View File

@ -146,7 +146,7 @@ public:
}
private:
float calculate_threshold(float pfa);
float calculate_threshold(float pfa) const;
const ConfigurationInterface* configuration_;
pcps_opencl_acquisition_cc_sptr acquisition_cc_;
gr::blocks::stream_to_vector::sptr stream_to_vector_;

View File

@ -28,13 +28,16 @@ allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize)
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float);
if (plan->tempmemobj)
clReleaseMemObject(plan->tempmemobj);
{
clReleaseMemObject(plan->tempmemobj);
}
plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &err);
}
return err;
}
static cl_int
allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
{
@ -46,10 +49,14 @@ allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float);
if (plan->tempmemobj_real)
clReleaseMemObject(plan->tempmemobj_real);
{
clReleaseMemObject(plan->tempmemobj_real);
}
if (plan->tempmemobj_imag)
clReleaseMemObject(plan->tempmemobj_imag);
{
clReleaseMemObject(plan->tempmemobj_imag);
}
plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &err);
plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &terr);
@ -58,6 +65,7 @@ allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
return err;
}
void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems)
{
*lWorkItems = kernelInfo->num_workitems_per_workgroup;
@ -83,6 +91,7 @@ void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo,
*gWorkItems = numWorkGroups * *lWorkItems;
}
cl_int
clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
cl_mem data_in, cl_mem data_out,
@ -91,7 +100,9 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
int s;
auto *plan = (cl_fft_plan *)Plan;
if (plan->format != clFFT_InterleavedComplexFormat)
return CL_INVALID_VALUE;
{
return CL_INVALID_VALUE;
}
cl_int err;
size_t gWorkItems;
@ -101,7 +112,9 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
cl_int isInPlace = data_in == data_out ? 1 : 0;
if ((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS)
return err;
{
return err;
}
cl_mem memObj[3];
memObj[0] = data_in;
@ -146,7 +159,9 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
if (err)
return err;
{
return err;
}
currRead = (currWrite == 1) ? 1 : 2;
currWrite = (currWrite == 1) ? 2 : 1;
@ -169,7 +184,9 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
if (err)
return err;
{
return err;
}
currRead = 1;
currWrite = 1;
@ -181,6 +198,7 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
return err;
}
cl_int
clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
@ -190,7 +208,9 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
auto *plan = (cl_fft_plan *)Plan;
if (plan->format != clFFT_SplitComplexFormat)
return CL_INVALID_VALUE;
{
return CL_INVALID_VALUE;
}
cl_int err;
size_t gWorkItems;
@ -200,7 +220,9 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0;
if ((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS)
return err;
{
return err;
}
cl_mem memObj_real[3];
cl_mem memObj_imag[3];
@ -252,7 +274,9 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
if (err)
return err;
{
return err;
}
currRead = (currWrite == 1) ? 1 : 2;
currWrite = (currWrite == 1) ? 2 : 1;
@ -276,7 +300,9 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
if (err)
return err;
{
return err;
}
currRead = 1;
currWrite = 1;
@ -288,6 +314,7 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
return err;
}
cl_int
clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
@ -304,12 +331,16 @@ clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
cl_device_id device_id;
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, nullptr);
if (err)
return err;
{
return err;
}
size_t gSize;
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, nullptr);
if (err)
return err;
{
return err;
}
gSize = min(128, gSize);
size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize};
@ -327,6 +358,7 @@ clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
return err;
}
cl_int
clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
@ -343,12 +375,16 @@ clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real,
cl_device_id device_id;
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, nullptr);
if (err)
return err;
{
return err;
}
size_t gSize;
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, nullptr);
if (err)
return err;
{
return err;
}
gSize = min(128, gSize);
size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize};

View File

@ -30,7 +30,7 @@ num2str(int num)
{
char temp[200];
snprintf(temp, sizeof(temp), "%d", num);
return string(temp);
return {temp};
}
// For any n, this function decomposes n into factors for local memory transpose
@ -155,15 +155,21 @@ getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices
}
}
// Appends the OpenCL kernel signature line for kernelName to kernelString.
// When dataFormat is clFFT_SplitComplexFormat the signature takes separate
// real/imaginary input and output pointers; for any other format it takes
// float2 input and output pointers. Exactly one signature line is appended
// per call.
static void
insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat)
{
    if (dataFormat == clFFT_SplitComplexFormat)
    {
        kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n");
    }
    else
    {
        kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n");
    }
}
static void
insertVariables(string &kStream, int maxRadix)
{
@ -177,11 +183,14 @@ insertVariables(string &kStream, int maxRadix)
kStream += string(" int groupId = get_group_id( 0 );\n");
}
static void
formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
{
if (dataFormat == clFFT_InterleavedComplexFormat)
kernelString += string(" a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n");
{
kernelString += string(" a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n");
}
else
{
kernelString += string(" a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n");
@ -189,11 +198,14 @@ formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dat
}
}
static void
formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
{
if (dataFormat == clFFT_InterleavedComplexFormat)
kernelString += string(" out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n");
{
kernelString += string(" out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n");
}
else
{
kernelString += string(" out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n");
@ -201,6 +213,7 @@ formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat da
}
}
static int
insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat)
{
@ -211,7 +224,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
int lMemSize = 0;
if (numXFormsPerWG > 1)
kernelString += string(" s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n");
{
kernelString += string(" s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n");
}
if (numWorkItemsPerXForm >= mem_coalesce_width)
{
@ -234,7 +249,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" out_imag += offset;\n");
}
for (i = 0; i < R0; i++)
formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat);
{
formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat);
}
kernelString += string(" }\n");
}
else
@ -255,7 +272,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" out_imag += offset;\n");
}
for (i = 0; i < R0; i++)
formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat);
{
formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat);
}
}
}
else if (N >= mem_coalesce_width)
@ -286,17 +305,23 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
{
kernelString += string(" if( jj < s ) {\n");
for (j = 0; j < numInnerIter; j++)
formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
{
formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
}
kernelString += string(" }\n");
if (i != numOuterIter - 1)
kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
{
kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
}
}
kernelString += string("}\n ");
kernelString += string("else {\n");
for (i = 0; i < numOuterIter; i++)
{
for (j = 0; j < numInnerIter; j++)
formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
{
formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
}
}
kernelString += string("}\n");
@ -315,7 +340,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < R0; i++)
kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
{
kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < numOuterIter; i++)
@ -329,7 +356,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < R0; i++)
kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
{
kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
@ -360,7 +389,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" if(jj < s )\n");
formattedLoad(kernelString, i, i * groupSize, dataFormat);
if (i != R0 - 1)
kernelString += string(" jj += ") + num2str(groupSize / N) + string(";\n");
{
kernelString += string(" jj += ") + num2str(groupSize / N) + string(";\n");
}
}
kernelString += string("}\n");
kernelString += string("else {\n");
@ -385,19 +416,27 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
for (i = 0; i < R0; i++)
kernelString += string(" lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("] = a[") + num2str(i) + string("].x;\n");
{
kernelString += string(" lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("] = a[") + num2str(i) + string("].x;\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < R0; i++)
kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
{
kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < R0; i++)
kernelString += string(" lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("] = a[") + num2str(i) + string("].y;\n");
{
kernelString += string(" lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("] = a[") + num2str(i) + string("].y;\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < R0; i++)
kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
{
kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
@ -406,6 +445,7 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
return lMemSize;
}
static int
insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat)
{
@ -433,7 +473,9 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
formattedStore(kernelString, ind, i * numWorkItemsPerXForm, dataFormat);
}
if (numXFormsPerWG > 1)
kernelString += string(" }\n");
{
kernelString += string(" }\n");
}
}
else if (N >= mem_coalesce_width)
{
@ -455,8 +497,12 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < numOuterIter; i++)
for (j = 0; j < numInnerIter; j++)
kernelString += string(" a[") + num2str(i * numInnerIter + j) + string("].x = lMemStore[") + num2str(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)) + string("];\n");
{
for (j = 0; j < numInnerIter; j++)
{
kernelString += string(" a[") + num2str(i * numInnerIter + j) + string("].x = lMemStore[") + num2str(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)) + string("];\n");
}
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < maxRadix; i++)
@ -469,8 +515,12 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < numOuterIter; i++)
for (j = 0; j < numInnerIter; j++)
kernelString += string(" a[") + num2str(i * numInnerIter + j) + string("].y = lMemStore[") + num2str(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)) + string("];\n");
{
for (j = 0; j < numInnerIter; j++)
{
kernelString += string(" a[") + num2str(i * numInnerIter + j) + string("].y = lMemStore[") + num2str(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)) + string("];\n");
}
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
@ -478,17 +528,23 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
{
kernelString += string(" if( jj < s ) {\n");
for (j = 0; j < numInnerIter; j++)
formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
{
formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
}
kernelString += string(" }\n");
if (i != numOuterIter - 1)
kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
{
kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
}
}
kernelString += string("}\n");
kernelString += string("else {\n");
for (i = 0; i < numOuterIter; i++)
{
for (j = 0; j < numInnerIter; j++)
formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
{
formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
}
}
kernelString += string("}\n");
@ -512,7 +568,9 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < maxRadix; i++)
kernelString += string(" a[") + num2str(i) + string("].x = lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("];\n");
{
kernelString += string(" a[") + num2str(i) + string("].x = lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < maxRadix; i++)
@ -525,7 +583,9 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < maxRadix; i++)
kernelString += string(" a[") + num2str(i) + string("].y = lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("];\n");
{
kernelString += string(" a[") + num2str(i) + string("].y = lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
@ -535,7 +595,9 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
formattedStore(kernelString, i, i * groupSize, dataFormat);
kernelString += string(" }\n");
if (i != maxRadix - 1)
kernelString += string(" jj +=") + num2str(groupSize / N) + string(";\n");
{
kernelString += string(" jj +=") + num2str(groupSize / N) + string(";\n");
}
}
kernelString += string("}\n");
kernelString += string("else {\n");
@ -551,6 +613,7 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
return lMemSize;
}
static void
insertfftKernel(string &kernelString, int Nr, int numIter)
{
@ -561,6 +624,7 @@ insertfftKernel(string &kernelString, int Nr, int numIter)
}
}
static void
insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm)
{
@ -573,16 +637,24 @@ insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int le
if (z == 0)
{
if (Nprev > 1)
kernelString += string(" angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n");
{
kernelString += string(" angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n");
}
else
kernelString += string(" angf = (float) ii;\n");
{
kernelString += string(" angf = (float) ii;\n");
}
}
else
{
if (Nprev > 1)
kernelString += string(" angf = (float) ((") + num2str(z * numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n");
{
kernelString += string(" angf = (float) ((") + num2str(z * numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n");
}
else
kernelString += string(" angf = (float) (") + num2str(z * numWorkItemsPerXForm) + string(" + ii);\n");
{
kernelString += string(" angf = (float) (") + num2str(z * numWorkItemsPerXForm) + string(" + ii);\n");
}
}
for (k = 1; k < Nr; k++)
@ -596,30 +668,41 @@ insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int le
}
}
static int
getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad)
{
if ((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks))
*offset = 0;
{
*offset = 0;
}
else
{
int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev;
int numColsReq = 1;
if (numRowsReq > Nr)
numColsReq = numRowsReq / Nr;
{
numColsReq = numRowsReq / Nr;
}
numColsReq = Nprev * numColsReq;
*offset = numColsReq;
}
if (numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1)
*midPad = 0;
{
*midPad = 0;
}
else
{
int bankNum = ((numWorkItemsReq + *offset) * Nr) & (numBanks - 1);
if (bankNum >= numWorkItemsPerXForm)
*midPad = 0;
{
*midPad = 0;
}
else
*midPad = numWorkItemsPerXForm - bankNum;
{
*midPad = numWorkItemsPerXForm - bankNum;
}
}
int lMemSize = (numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1);
@ -644,6 +727,7 @@ insertLocalStores(string &kernelString, int numIter, int Nr, int numWorkItemsPer
kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n");
}
static void
insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp)
{
@ -676,6 +760,7 @@ insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Nc
kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n");
}
static void
insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad)
{
@ -687,33 +772,52 @@ insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numW
if (Ncurr < numWorkItemsPerXForm)
{
if (Nprev == 1)
kernelString += string(" j = ii & ") + num2str(Ncurr - 1) + string(";\n");
{
kernelString += string(" j = ii & ") + num2str(Ncurr - 1) + string(";\n");
}
else
kernelString += string(" j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n");
{
kernelString += string(" j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n");
}
if (Nprev == 1)
kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n");
{
kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n");
}
else
kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n");
{
kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n");
}
}
else
{
if (Nprev == 1)
kernelString += string(" j = ii;\n");
{
kernelString += string(" j = ii;\n");
}
else
kernelString += string(" j = ii >> ") + num2str(logNprev) + string(";\n");
{
kernelString += string(" j = ii >> ") + num2str(logNprev) + string(";\n");
}
if (Nprev == 1)
kernelString += string(" i = 0;\n");
{
kernelString += string(" i = 0;\n");
}
else
kernelString += string(" i = ii & ") + num2str(Nprev - 1) + string(";\n");
{
kernelString += string(" i = ii & ") + num2str(Nprev - 1) + string(";\n");
}
}
if (numXFormsPerWG > 1)
kernelString += string(" i = mad24(jj, ") + num2str(incr) + string(", i);\n");
{
kernelString += string(" i = mad24(jj, ") + num2str(incr) + string(", i);\n");
}
kernelString += string(" lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n");
}
static void
insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad)
{
@ -742,7 +846,9 @@ createLocalMemfftKernelString(cl_fft_plan *plan)
assert(numRadix > 0 && "no radix array supplied\n");
if (n / radixArray[0] > plan->max_work_item_per_workgroup)
getRadixArray(n, radixArray, &numRadix, plan->max_radix);
{
getRadixArray(n, radixArray, &numRadix, plan->max_radix);
}
assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n");
assert(n / radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n");
@ -839,11 +945,14 @@ createLocalMemfftKernelString(cl_fft_plan *plan)
insertHeader(*kernelString, kernelName, dataFormat);
*kernelString += string("{\n");
if ((*kInfo)->lmem_size)
*kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
{
*kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
}
*kernelString += localString;
*kernelString += string("}\n");
}
// For n larger than what can be computed using local memory fft, global transposes
// and multiple kernel launches are needed. For these sizes, n can be decomposed using
// much larger base radices i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel
@ -864,7 +973,6 @@ createLocalMemfftKernelString(cl_fft_plan *plan)
// in this example. Users can play with different base radices and different
// decompositions of base radices to generate different kernels and see which gives
// the best performance. The following function is fixed to use 128 as the base radix
void getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
{
int baseRadix = min(n, 128);
@ -878,7 +986,9 @@ void getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
}
for (int i = 0; i < numR; i++)
radix[i] = baseRadix;
{
radix[i] = baseRadix;
}
radix[numR] = N;
numR++;
@ -906,6 +1016,7 @@ void getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
}
}
static void
createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS)
{
@ -960,12 +1071,18 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
int strideI = Rinit;
for (i = 0; i < numPasses; i++)
if (i != passNum)
strideI *= radixArr[i];
{
if (i != passNum)
{
strideI *= radixArr[i];
}
}
int strideO = Rinit;
for (i = 0; i < passNum; i++)
strideO *= radixArr[i];
{
strideO *= radixArr[i];
}
int threadsPerXForm = R2;
batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize;
@ -986,30 +1103,44 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
int numBlocksPerXForm = strideI / batchSize;
int numBlocks = numBlocksPerXForm;
if (!vertical)
numBlocks *= BS;
{
numBlocks *= BS;
}
else
numBlocks *= vertBS;
{
numBlocks *= vertBS;
}
kernelName = string("fft") + num2str(kCount);
*kInfo = (cl_fft_kernel_info *)malloc(sizeof(cl_fft_kernel_info));
(*kInfo)->kernel = nullptr;
if (R2 == 1)
(*kInfo)->lmem_size = 0;
{
(*kInfo)->lmem_size = 0;
}
else
{
if (strideO == 1)
(*kInfo)->lmem_size = (radix + 1) * batchSize;
{
(*kInfo)->lmem_size = (radix + 1) * batchSize;
}
else
(*kInfo)->lmem_size = threadsPerBlock * R1;
{
(*kInfo)->lmem_size = threadsPerBlock * R1;
}
}
(*kInfo)->num_workgroups = numBlocks;
(*kInfo)->num_xforms_per_workgroup = 1;
(*kInfo)->num_workitems_per_workgroup = threadsPerBlock;
(*kInfo)->dir = dir;
if ((passNum == (numPasses - 1)) && (numPasses & 1))
(*kInfo)->in_place_possible = 1;
{
(*kInfo)->in_place_possible = 1;
}
else
(*kInfo)->in_place_possible = 0;
{
(*kInfo)->in_place_possible = 0;
}
(*kInfo)->next = nullptr;
(*kInfo)->kernel_name = (char *)malloc(sizeof(char) * (kernelName.size() + 1));
snprintf((*kInfo)->kernel_name, sizeof((*kInfo)->kernel_name), kernelName.c_str());
@ -1026,7 +1157,9 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
int stride = radix * Rinit;
for (i = 0; i < passNum; i++)
stride *= radixArr[i];
{
stride *= radixArr[i];
}
localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int)log2(n * BS)) + string("));\n");
localString += string("bNum = groupId;\n");
}
@ -1041,7 +1174,9 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
int stride = radix * Rinit;
for (i = 0; i < passNum; i++)
stride *= radixArr[i];
{
stride *= radixArr[i];
}
localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n");
localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n");
localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n");
@ -1059,15 +1194,21 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("in_real += indexIn;\n");
localString += string("in_imag += indexIn;\n");
for (j = 0; j < R1; j++)
localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j * gInInc * strideI) + string("];\n");
{
localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j * gInInc * strideI) + string("];\n");
}
for (j = 0; j < R1; j++)
localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j * gInInc * strideI) + string("];\n");
{
localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j * gInInc * strideI) + string("];\n");
}
}
else
{
localString += string("in += indexIn;\n");
for (j = 0; j < R1; j++)
localString += string("a[") + num2str(j) + string("] = in[") + num2str(j * gInInc * strideI) + string("];\n");
{
localString += string("a[") + num2str(j) + string("] = in[") + num2str(j * gInInc * strideI) + string("];\n");
}
}
localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n");
@ -1088,22 +1229,36 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("lMemStore = sMem + tid;\n");
localString += string("lMemLoad = sMem + indexIn;\n");
for (k = 0; k < R1; k++)
localString += string("lMemStore[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
{
localString += string("lMemStore[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for (k = 0; k < numIter; k++)
for (t = 0; t < R2; t++)
localString += string("a[") + num2str(k * R2 + t) + string("].x = lMemLoad[") + num2str(t * batchSize + k * threadsPerBlock) + string("];\n");
{
for (t = 0; t < R2; t++)
{
localString += string("a[") + num2str(k * R2 + t) + string("].x = lMemLoad[") + num2str(t * batchSize + k * threadsPerBlock) + string("];\n");
}
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for (k = 0; k < R1; k++)
localString += string("lMemStore[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
{
localString += string("lMemStore[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for (k = 0; k < numIter; k++)
for (t = 0; t < R2; t++)
localString += string("a[") + num2str(k * R2 + t) + string("].y = lMemLoad[") + num2str(t * batchSize + k * threadsPerBlock) + string("];\n");
{
for (t = 0; t < R2; t++)
{
localString += string("a[") + num2str(k * R2 + t) + string("].y = lMemLoad[") + num2str(t * batchSize + k * threadsPerBlock) + string("];\n");
}
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for (j = 0; j < numIter; j++)
localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j * R2) + string(", dir);\n");
{
localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j * R2) + string(", dir);\n");
}
}
// twiddle
@ -1127,40 +1282,60 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix + 1) + string(", tid & ") + num2str(radix - 1) + string(");\n");
for (i = 0; i < R1 / R2; i++)
for (j = 0; j < R2; j++)
localString += string("lMemStore[ ") + num2str(i + j * R1) + string("] = a[") + num2str(i * R2 + j) + string("].x;\n");
{
for (j = 0; j < R2; j++)
{
localString += string("lMemStore[ ") + num2str(i + j * R1) + string("] = a[") + num2str(i * R2 + j) + string("].x;\n");
}
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
if (threadsPerBlock >= radix)
{
for (i = 0; i < R1; i++)
localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * (radix + 1) * (threadsPerBlock / radix)) + string("];\n");
{
localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * (radix + 1) * (threadsPerBlock / radix)) + string("];\n");
}
}
else
{
int innerIter = radix / threadsPerBlock;
int outerIter = R1 / innerIter;
for (i = 0; i < outerIter; i++)
for (j = 0; j < innerIter; j++)
localString += string("a[") + num2str(i * innerIter + j) + string("].x = lMemLoad[") + num2str(j * threadsPerBlock + i * (radix + 1)) + string("];\n");
{
for (j = 0; j < innerIter; j++)
{
localString += string("a[") + num2str(i * innerIter + j) + string("].x = lMemLoad[") + num2str(j * threadsPerBlock + i * (radix + 1)) + string("];\n");
}
}
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for (i = 0; i < R1 / R2; i++)
for (j = 0; j < R2; j++)
localString += string("lMemStore[ ") + num2str(i + j * R1) + string("] = a[") + num2str(i * R2 + j) + string("].y;\n");
{
for (j = 0; j < R2; j++)
{
localString += string("lMemStore[ ") + num2str(i + j * R1) + string("] = a[") + num2str(i * R2 + j) + string("].y;\n");
}
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
if (threadsPerBlock >= radix)
{
for (i = 0; i < R1; i++)
localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * (radix + 1) * (threadsPerBlock / radix)) + string("];\n");
{
localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * (radix + 1) * (threadsPerBlock / radix)) + string("];\n");
}
}
else
{
int innerIter = radix / threadsPerBlock;
int outerIter = R1 / innerIter;
for (i = 0; i < outerIter; i++)
for (j = 0; j < innerIter; j++)
localString += string("a[") + num2str(i * innerIter + j) + string("].y = lMemLoad[") + num2str(j * threadsPerBlock + i * (radix + 1)) + string("];\n");
{
for (j = 0; j < innerIter; j++)
{
localString += string("a[") + num2str(i * innerIter + j) + string("].y = lMemLoad[") + num2str(j * threadsPerBlock + i * (radix + 1)) + string("];\n");
}
}
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
@ -1170,15 +1345,21 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("out_real += indexOut;\n");
localString += string("out_imag += indexOut;\n");
for (k = 0; k < R1; k++)
localString += string("out_real[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
{
localString += string("out_real[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
}
for (k = 0; k < R1; k++)
localString += string("out_imag[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
{
localString += string("out_imag[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
}
}
else
{
localString += string("out += indexOut;\n");
for (k = 0; k < R1; k++)
localString += string("out[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n");
{
localString += string("out[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n");
}
}
}
else
@ -1189,22 +1370,30 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("out_real += indexOut;\n");
localString += string("out_imag += indexOut;\n");
for (k = 0; k < R1; k++)
localString += string("out_real[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("].x;\n");
{
localString += string("out_real[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("].x;\n");
}
for (k = 0; k < R1; k++)
localString += string("out_imag[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("].y;\n");
{
localString += string("out_imag[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("].y;\n");
}
}
else
{
localString += string("out += indexOut;\n");
for (k = 0; k < R1; k++)
localString += string("out[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("];\n");
{
localString += string("out[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("];\n");
}
}
}
insertHeader(*kernelString, kernelName, dataFormat);
*kernelString += string("{\n");
if ((*kInfo)->lmem_size)
*kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
{
*kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
}
*kernelString += localString;
*kernelString += string("}\n");
@ -1214,6 +1403,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
}
}
void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir)
{
unsigned int radixArray[10];
@ -1237,21 +1427,29 @@ void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir)
{
getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix);
if (plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup)
createLocalMemfftKernelString(plan);
{
createLocalMemfftKernelString(plan);
}
else
createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
{
createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
}
}
}
break;
case cl_fft_kernel_y:
if (plan->n.y > 1)
createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1);
{
createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1);
}
break;
case cl_fft_kernel_z:
if (plan->n.z > 1)
createGlobalFFTKernelString(plan, plan->n.z, plan->n.x * plan->n.y, cl_fft_kernel_z, 1);
{
createGlobalFFTKernelString(plan, plan->n.z, plan->n.x * plan->n.y, cl_fft_kernel_z, 1);
}
default:
return;
}

View File

@ -31,9 +31,13 @@ getBlockConfigAndKernelString(cl_fft_plan *plan)
*plan->kernel_string += baseKernels;
if (plan->format == clFFT_SplitComplexFormat)
*plan->kernel_string += twistKernelPlannar;
{
*plan->kernel_string += twistKernelPlannar;
}
else
*plan->kernel_string += twistKernelInterleaved;
{
*plan->kernel_string += twistKernelInterleaved;
}
switch (plan->dim)
{
@ -72,13 +76,18 @@ deleteKernelInfo(cl_fft_kernel_info *kInfo)
if (kInfo)
{
if (kInfo->kernel_name)
free(kInfo->kernel_name);
{
free(kInfo->kernel_name);
}
if (kInfo->kernel)
clReleaseKernel(kInfo->kernel);
{
clReleaseKernel(kInfo->kernel);
}
free(kInfo);
}
}
static void
destroy_plan(cl_fft_plan *Plan)
{
@ -125,6 +134,7 @@ destroy_plan(cl_fft_plan *Plan)
}
}
static int
createKernelList(cl_fft_plan *plan)
{
@ -136,21 +146,30 @@ createKernelList(cl_fft_plan *plan)
{
kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err);
if (!kernel_info->kernel || err != CL_SUCCESS)
return err;
{
return err;
}
kernel_info = kernel_info->next;
}
if (plan->format == clFFT_SplitComplexFormat)
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err);
{
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err);
}
else
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err);
{
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err);
}
if (!plan->twist_kernel || err)
return err;
{
return err;
}
return CL_SUCCESS;
}
int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices)
{
int reg_needed = 0;
@ -166,13 +185,19 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi
{
err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, nullptr);
if (err != CL_SUCCESS)
return -1;
{
return -1;
}
if (wg_size < kInfo->num_workitems_per_workgroup)
reg_needed |= 1;
{
reg_needed |= 1;
}
if (*max_wg_size > wg_size)
*max_wg_size = wg_size;
{
*max_wg_size = wg_size;
}
kInfo = kInfo->next;
}
@ -181,6 +206,7 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi
return reg_needed;
}
#define ERR_MACRO(err) \
{ \
if ((err) != CL_SUCCESS) \
@ -192,6 +218,7 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi
} \
}
clFFT_Plan
clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code)
{
@ -326,11 +353,14 @@ patch_kernel_source:
}
if (error_code)
*error_code = CL_SUCCESS;
{
*error_code = CL_SUCCESS;
}
return (clFFT_Plan)plan;
}
void clFFT_DestroyPlan(clFFT_Plan plan)
{
auto *Plan = (cl_fft_plan *)plan;
@ -342,15 +372,20 @@ void clFFT_DestroyPlan(clFFT_Plan plan)
}
}
void clFFT_DumpPlan(clFFT_Plan Plan, FILE *file)
{
size_t gDim;
size_t lDim;
FILE *out;
if (!file)
out = stdout;
{
out = stdout;
}
else
out = file;
{
out = file;
}
auto *plan = (cl_fft_plan *)Plan;
cl_fft_kernel_info *kInfo = plan->kernel_info;