1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2025-07-23 20:33:05 +00:00

CI: make clang-tidy happy

This commit is contained in:
Carles Fernandez 2021-12-17 19:01:41 +01:00
parent 12ed230cd7
commit d9a516e0b8
No known key found for this signature in database
GPG Key ID: 4C583C52B0C3877D
5 changed files with 394 additions and 125 deletions

View File

@ -224,7 +224,7 @@ void GpsL1CaPcpsOpenClAcquisition::reset()
} }
float GpsL1CaPcpsOpenClAcquisition::calculate_threshold(float pfa) float GpsL1CaPcpsOpenClAcquisition::calculate_threshold(float pfa) const
{ {
// Calculate the threshold // Calculate the threshold
unsigned int frequency_bins = 0; unsigned int frequency_bins = 0;

View File

@ -146,7 +146,7 @@ public:
} }
private: private:
float calculate_threshold(float pfa); float calculate_threshold(float pfa) const;
const ConfigurationInterface* configuration_; const ConfigurationInterface* configuration_;
pcps_opencl_acquisition_cc_sptr acquisition_cc_; pcps_opencl_acquisition_cc_sptr acquisition_cc_;
gr::blocks::stream_to_vector::sptr stream_to_vector_; gr::blocks::stream_to_vector::sptr stream_to_vector_;

View File

@ -28,13 +28,16 @@ allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize)
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float); size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float);
if (plan->tempmemobj) if (plan->tempmemobj)
clReleaseMemObject(plan->tempmemobj); {
clReleaseMemObject(plan->tempmemobj);
}
plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &err); plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &err);
} }
return err; return err;
} }
static cl_int static cl_int
allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize) allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
{ {
@ -46,10 +49,14 @@ allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float); size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float);
if (plan->tempmemobj_real) if (plan->tempmemobj_real)
clReleaseMemObject(plan->tempmemobj_real); {
clReleaseMemObject(plan->tempmemobj_real);
}
if (plan->tempmemobj_imag) if (plan->tempmemobj_imag)
clReleaseMemObject(plan->tempmemobj_imag); {
clReleaseMemObject(plan->tempmemobj_imag);
}
plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &err); plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &err);
plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &terr); plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &terr);
@ -58,6 +65,7 @@ allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
return err; return err;
} }
void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems) void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems)
{ {
*lWorkItems = kernelInfo->num_workitems_per_workgroup; *lWorkItems = kernelInfo->num_workitems_per_workgroup;
@ -83,6 +91,7 @@ void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo,
*gWorkItems = numWorkGroups * *lWorkItems; *gWorkItems = numWorkGroups * *lWorkItems;
} }
cl_int cl_int
clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
cl_mem data_in, cl_mem data_out, cl_mem data_in, cl_mem data_out,
@ -91,7 +100,9 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
int s; int s;
auto *plan = (cl_fft_plan *)Plan; auto *plan = (cl_fft_plan *)Plan;
if (plan->format != clFFT_InterleavedComplexFormat) if (plan->format != clFFT_InterleavedComplexFormat)
return CL_INVALID_VALUE; {
return CL_INVALID_VALUE;
}
cl_int err; cl_int err;
size_t gWorkItems; size_t gWorkItems;
@ -101,7 +112,9 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
cl_int isInPlace = data_in == data_out ? 1 : 0; cl_int isInPlace = data_in == data_out ? 1 : 0;
if ((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS) if ((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS)
return err; {
return err;
}
cl_mem memObj[3]; cl_mem memObj[3];
memObj[0] = data_in; memObj[0] = data_in;
@ -146,7 +159,9 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr); err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
if (err) if (err)
return err; {
return err;
}
currRead = (currWrite == 1) ? 1 : 2; currRead = (currWrite == 1) ? 1 : 2;
currWrite = (currWrite == 1) ? 2 : 1; currWrite = (currWrite == 1) ? 2 : 1;
@ -169,7 +184,9 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr); err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
if (err) if (err)
return err; {
return err;
}
currRead = 1; currRead = 1;
currWrite = 1; currWrite = 1;
@ -181,6 +198,7 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
return err; return err;
} }
cl_int cl_int
clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
@ -190,7 +208,9 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
auto *plan = (cl_fft_plan *)Plan; auto *plan = (cl_fft_plan *)Plan;
if (plan->format != clFFT_SplitComplexFormat) if (plan->format != clFFT_SplitComplexFormat)
return CL_INVALID_VALUE; {
return CL_INVALID_VALUE;
}
cl_int err; cl_int err;
size_t gWorkItems; size_t gWorkItems;
@ -200,7 +220,9 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0; cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0;
if ((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS) if ((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS)
return err; {
return err;
}
cl_mem memObj_real[3]; cl_mem memObj_real[3];
cl_mem memObj_imag[3]; cl_mem memObj_imag[3];
@ -252,7 +274,9 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr); err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
if (err) if (err)
return err; {
return err;
}
currRead = (currWrite == 1) ? 1 : 2; currRead = (currWrite == 1) ? 1 : 2;
currWrite = (currWrite == 1) ? 2 : 1; currWrite = (currWrite == 1) ? 2 : 1;
@ -276,7 +300,9 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr); err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
if (err) if (err)
return err; {
return err;
}
currRead = 1; currRead = 1;
currWrite = 1; currWrite = 1;
@ -288,6 +314,7 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
return err; return err;
} }
cl_int cl_int
clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir) unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
@ -304,12 +331,16 @@ clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
cl_device_id device_id; cl_device_id device_id;
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, nullptr); err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, nullptr);
if (err) if (err)
return err; {
return err;
}
size_t gSize; size_t gSize;
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, nullptr); err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, nullptr);
if (err) if (err)
return err; {
return err;
}
gSize = min(128, gSize); gSize = min(128, gSize);
size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize}; size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize};
@ -327,6 +358,7 @@ clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
return err; return err;
} }
cl_int cl_int
clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir) unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
@ -343,12 +375,16 @@ clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real,
cl_device_id device_id; cl_device_id device_id;
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, nullptr); err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, nullptr);
if (err) if (err)
return err; {
return err;
}
size_t gSize; size_t gSize;
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, nullptr); err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, nullptr);
if (err) if (err)
return err; {
return err;
}
gSize = min(128, gSize); gSize = min(128, gSize);
size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize}; size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize};

View File

@ -30,7 +30,7 @@ num2str(int num)
{ {
char temp[200]; char temp[200];
snprintf(temp, sizeof(temp), "%d", num); snprintf(temp, sizeof(temp), "%d", num);
return string(temp); return {temp};
} }
// For any n, this function decomposes n into factors for loacal memory tranpose // For any n, this function decomposes n into factors for loacal memory tranpose
@ -155,15 +155,21 @@ getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices
} }
} }
static void static void
insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat) insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat)
{ {
if (dataFormat == clFFT_SplitComplexFormat) if (dataFormat == clFFT_SplitComplexFormat)
kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n"); {
kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n");
}
else else
kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n"); {
kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n");
}
} }
static void static void
insertVariables(string &kStream, int maxRadix) insertVariables(string &kStream, int maxRadix)
{ {
@ -177,11 +183,14 @@ insertVariables(string &kStream, int maxRadix)
kStream += string(" int groupId = get_group_id( 0 );\n"); kStream += string(" int groupId = get_group_id( 0 );\n");
} }
static void static void
formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat) formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
{ {
if (dataFormat == clFFT_InterleavedComplexFormat) if (dataFormat == clFFT_InterleavedComplexFormat)
kernelString += string(" a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n"); {
kernelString += string(" a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n");
}
else else
{ {
kernelString += string(" a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n"); kernelString += string(" a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n");
@ -189,11 +198,14 @@ formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dat
} }
} }
static void static void
formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat) formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
{ {
if (dataFormat == clFFT_InterleavedComplexFormat) if (dataFormat == clFFT_InterleavedComplexFormat)
kernelString += string(" out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n"); {
kernelString += string(" out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n");
}
else else
{ {
kernelString += string(" out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n"); kernelString += string(" out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n");
@ -201,6 +213,7 @@ formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat da
} }
} }
static int static int
insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat) insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat)
{ {
@ -211,7 +224,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
int lMemSize = 0; int lMemSize = 0;
if (numXFormsPerWG > 1) if (numXFormsPerWG > 1)
kernelString += string(" s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n"); {
kernelString += string(" s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n");
}
if (numWorkItemsPerXForm >= mem_coalesce_width) if (numWorkItemsPerXForm >= mem_coalesce_width)
{ {
@ -234,7 +249,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" out_imag += offset;\n"); kernelString += string(" out_imag += offset;\n");
} }
for (i = 0; i < R0; i++) for (i = 0; i < R0; i++)
formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat); {
formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat);
}
kernelString += string(" }\n"); kernelString += string(" }\n");
} }
else else
@ -255,7 +272,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" out_imag += offset;\n"); kernelString += string(" out_imag += offset;\n");
} }
for (i = 0; i < R0; i++) for (i = 0; i < R0; i++)
formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat); {
formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat);
}
} }
} }
else if (N >= mem_coalesce_width) else if (N >= mem_coalesce_width)
@ -286,17 +305,23 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
{ {
kernelString += string(" if( jj < s ) {\n"); kernelString += string(" if( jj < s ) {\n");
for (j = 0; j < numInnerIter; j++) for (j = 0; j < numInnerIter; j++)
formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat); {
formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
}
kernelString += string(" }\n"); kernelString += string(" }\n");
if (i != numOuterIter - 1) if (i != numOuterIter - 1)
kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); {
kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
}
} }
kernelString += string("}\n "); kernelString += string("}\n ");
kernelString += string("else {\n"); kernelString += string("else {\n");
for (i = 0; i < numOuterIter; i++) for (i = 0; i < numOuterIter; i++)
{ {
for (j = 0; j < numInnerIter; j++) for (j = 0; j < numInnerIter; j++)
formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat); {
formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
}
} }
kernelString += string("}\n"); kernelString += string("}\n");
@ -315,7 +340,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < R0; i++) for (i = 0; i < R0; i++)
kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); {
kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < numOuterIter; i++) for (i = 0; i < numOuterIter; i++)
@ -329,7 +356,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < R0; i++) for (i = 0; i < R0; i++)
kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); {
kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
@ -360,7 +389,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" if(jj < s )\n"); kernelString += string(" if(jj < s )\n");
formattedLoad(kernelString, i, i * groupSize, dataFormat); formattedLoad(kernelString, i, i * groupSize, dataFormat);
if (i != R0 - 1) if (i != R0 - 1)
kernelString += string(" jj += ") + num2str(groupSize / N) + string(";\n"); {
kernelString += string(" jj += ") + num2str(groupSize / N) + string(";\n");
}
} }
kernelString += string("}\n"); kernelString += string("}\n");
kernelString += string("else {\n"); kernelString += string("else {\n");
@ -385,19 +416,27 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
for (i = 0; i < R0; i++) for (i = 0; i < R0; i++)
kernelString += string(" lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("] = a[") + num2str(i) + string("].x;\n"); {
kernelString += string(" lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("] = a[") + num2str(i) + string("].x;\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < R0; i++) for (i = 0; i < R0; i++)
kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); {
kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < R0; i++) for (i = 0; i < R0; i++)
kernelString += string(" lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("] = a[") + num2str(i) + string("].y;\n"); {
kernelString += string(" lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("] = a[") + num2str(i) + string("].y;\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < R0; i++) for (i = 0; i < R0; i++)
kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); {
kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
@ -406,6 +445,7 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
return lMemSize; return lMemSize;
} }
static int static int
insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat) insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat)
{ {
@ -433,7 +473,9 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
formattedStore(kernelString, ind, i * numWorkItemsPerXForm, dataFormat); formattedStore(kernelString, ind, i * numWorkItemsPerXForm, dataFormat);
} }
if (numXFormsPerWG > 1) if (numXFormsPerWG > 1)
kernelString += string(" }\n"); {
kernelString += string(" }\n");
}
} }
else if (N >= mem_coalesce_width) else if (N >= mem_coalesce_width)
{ {
@ -455,8 +497,12 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < numOuterIter; i++) for (i = 0; i < numOuterIter; i++)
for (j = 0; j < numInnerIter; j++) {
kernelString += string(" a[") + num2str(i * numInnerIter + j) + string("].x = lMemStore[") + num2str(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)) + string("];\n"); for (j = 0; j < numInnerIter; j++)
{
kernelString += string(" a[") + num2str(i * numInnerIter + j) + string("].x = lMemStore[") + num2str(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)) + string("];\n");
}
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < maxRadix; i++) for (i = 0; i < maxRadix; i++)
@ -469,8 +515,12 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < numOuterIter; i++) for (i = 0; i < numOuterIter; i++)
for (j = 0; j < numInnerIter; j++) {
kernelString += string(" a[") + num2str(i * numInnerIter + j) + string("].y = lMemStore[") + num2str(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)) + string("];\n"); for (j = 0; j < numInnerIter; j++)
{
kernelString += string(" a[") + num2str(i * numInnerIter + j) + string("].y = lMemStore[") + num2str(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)) + string("];\n");
}
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
@ -478,17 +528,23 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
{ {
kernelString += string(" if( jj < s ) {\n"); kernelString += string(" if( jj < s ) {\n");
for (j = 0; j < numInnerIter; j++) for (j = 0; j < numInnerIter; j++)
formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat); {
formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
}
kernelString += string(" }\n"); kernelString += string(" }\n");
if (i != numOuterIter - 1) if (i != numOuterIter - 1)
kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); {
kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
}
} }
kernelString += string("}\n"); kernelString += string("}\n");
kernelString += string("else {\n"); kernelString += string("else {\n");
for (i = 0; i < numOuterIter; i++) for (i = 0; i < numOuterIter; i++)
{ {
for (j = 0; j < numInnerIter; j++) for (j = 0; j < numInnerIter; j++)
formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat); {
formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
}
} }
kernelString += string("}\n"); kernelString += string("}\n");
@ -512,7 +568,9 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < maxRadix; i++) for (i = 0; i < maxRadix; i++)
kernelString += string(" a[") + num2str(i) + string("].x = lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("];\n"); {
kernelString += string(" a[") + num2str(i) + string("].x = lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < maxRadix; i++) for (i = 0; i < maxRadix; i++)
@ -525,7 +583,9 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
for (i = 0; i < maxRadix; i++) for (i = 0; i < maxRadix; i++)
kernelString += string(" a[") + num2str(i) + string("].y = lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("];\n"); {
kernelString += string(" a[") + num2str(i) + string("].y = lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("];\n");
}
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
@ -535,7 +595,9 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
formattedStore(kernelString, i, i * groupSize, dataFormat); formattedStore(kernelString, i, i * groupSize, dataFormat);
kernelString += string(" }\n"); kernelString += string(" }\n");
if (i != maxRadix - 1) if (i != maxRadix - 1)
kernelString += string(" jj +=") + num2str(groupSize / N) + string(";\n"); {
kernelString += string(" jj +=") + num2str(groupSize / N) + string(";\n");
}
} }
kernelString += string("}\n"); kernelString += string("}\n");
kernelString += string("else {\n"); kernelString += string("else {\n");
@ -551,6 +613,7 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
return lMemSize; return lMemSize;
} }
static void static void
insertfftKernel(string &kernelString, int Nr, int numIter) insertfftKernel(string &kernelString, int Nr, int numIter)
{ {
@ -561,6 +624,7 @@ insertfftKernel(string &kernelString, int Nr, int numIter)
} }
} }
static void static void
insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm) insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm)
{ {
@ -573,16 +637,24 @@ insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int le
if (z == 0) if (z == 0)
{ {
if (Nprev > 1) if (Nprev > 1)
kernelString += string(" angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n"); {
kernelString += string(" angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n");
}
else else
kernelString += string(" angf = (float) ii;\n"); {
kernelString += string(" angf = (float) ii;\n");
}
} }
else else
{ {
if (Nprev > 1) if (Nprev > 1)
kernelString += string(" angf = (float) ((") + num2str(z * numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n"); {
kernelString += string(" angf = (float) ((") + num2str(z * numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n");
}
else else
kernelString += string(" angf = (float) (") + num2str(z * numWorkItemsPerXForm) + string(" + ii);\n"); {
kernelString += string(" angf = (float) (") + num2str(z * numWorkItemsPerXForm) + string(" + ii);\n");
}
} }
for (k = 1; k < Nr; k++) for (k = 1; k < Nr; k++)
@ -596,30 +668,41 @@ insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int le
} }
} }
static int static int
getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad) getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad)
{ {
if ((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks)) if ((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks))
*offset = 0; {
*offset = 0;
}
else else
{ {
int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev; int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev;
int numColsReq = 1; int numColsReq = 1;
if (numRowsReq > Nr) if (numRowsReq > Nr)
numColsReq = numRowsReq / Nr; {
numColsReq = numRowsReq / Nr;
}
numColsReq = Nprev * numColsReq; numColsReq = Nprev * numColsReq;
*offset = numColsReq; *offset = numColsReq;
} }
if (numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1) if (numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1)
*midPad = 0; {
*midPad = 0;
}
else else
{ {
int bankNum = ((numWorkItemsReq + *offset) * Nr) & (numBanks - 1); int bankNum = ((numWorkItemsReq + *offset) * Nr) & (numBanks - 1);
if (bankNum >= numWorkItemsPerXForm) if (bankNum >= numWorkItemsPerXForm)
*midPad = 0; {
*midPad = 0;
}
else else
*midPad = numWorkItemsPerXForm - bankNum; {
*midPad = numWorkItemsPerXForm - bankNum;
}
} }
int lMemSize = (numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1); int lMemSize = (numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1);
@ -644,6 +727,7 @@ insertLocalStores(string &kernelString, int numIter, int Nr, int numWorkItemsPer
kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n");
} }
static void static void
insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp) insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp)
{ {
@ -676,6 +760,7 @@ insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Nc
kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n");
} }
static void static void
insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad) insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad)
{ {
@ -687,33 +772,52 @@ insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numW
if (Ncurr < numWorkItemsPerXForm) if (Ncurr < numWorkItemsPerXForm)
{ {
if (Nprev == 1) if (Nprev == 1)
kernelString += string(" j = ii & ") + num2str(Ncurr - 1) + string(";\n"); {
kernelString += string(" j = ii & ") + num2str(Ncurr - 1) + string(";\n");
}
else else
kernelString += string(" j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n"); {
kernelString += string(" j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n");
}
if (Nprev == 1) if (Nprev == 1)
kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n"); {
kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n");
}
else else
kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n"); {
kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n");
}
} }
else else
{ {
if (Nprev == 1) if (Nprev == 1)
kernelString += string(" j = ii;\n"); {
kernelString += string(" j = ii;\n");
}
else else
kernelString += string(" j = ii >> ") + num2str(logNprev) + string(";\n"); {
kernelString += string(" j = ii >> ") + num2str(logNprev) + string(";\n");
}
if (Nprev == 1) if (Nprev == 1)
kernelString += string(" i = 0;\n"); {
kernelString += string(" i = 0;\n");
}
else else
kernelString += string(" i = ii & ") + num2str(Nprev - 1) + string(";\n"); {
kernelString += string(" i = ii & ") + num2str(Nprev - 1) + string(";\n");
}
} }
if (numXFormsPerWG > 1) if (numXFormsPerWG > 1)
kernelString += string(" i = mad24(jj, ") + num2str(incr) + string(", i);\n"); {
kernelString += string(" i = mad24(jj, ") + num2str(incr) + string(", i);\n");
}
kernelString += string(" lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n"); kernelString += string(" lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n");
} }
static void static void
insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad) insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad)
{ {
@ -742,7 +846,9 @@ createLocalMemfftKernelString(cl_fft_plan *plan)
assert(numRadix > 0 && "no radix array supplied\n"); assert(numRadix > 0 && "no radix array supplied\n");
if (n / radixArray[0] > plan->max_work_item_per_workgroup) if (n / radixArray[0] > plan->max_work_item_per_workgroup)
getRadixArray(n, radixArray, &numRadix, plan->max_radix); {
getRadixArray(n, radixArray, &numRadix, plan->max_radix);
}
assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n"); assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n");
assert(n / radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n"); assert(n / radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n");
@ -839,11 +945,14 @@ createLocalMemfftKernelString(cl_fft_plan *plan)
insertHeader(*kernelString, kernelName, dataFormat); insertHeader(*kernelString, kernelName, dataFormat);
*kernelString += string("{\n"); *kernelString += string("{\n");
if ((*kInfo)->lmem_size) if ((*kInfo)->lmem_size)
*kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n"); {
*kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
}
*kernelString += localString; *kernelString += localString;
*kernelString += string("}\n"); *kernelString += string("}\n");
} }
// For n larger than what can be computed using local memory fft, global transposes // For n larger than what can be computed using local memory fft, global transposes
// and multiple kernel launches are needed. For these sizes, n can be decomposed using // and multiple kernel launches are needed. For these sizes, n can be decomposed using
// much larger base radices i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel // much larger base radices i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel
@ -864,7 +973,6 @@ createLocalMemfftKernelString(cl_fft_plan *plan)
// in this example. Users can play with different base radices and different // in this example. Users can play with different base radices and different
// decompositions of base radices to generate different kernels and see which gives // decompositions of base radices to generate different kernels and see which gives
// the best performance. The following function is fixed to use 128 as the base radix // the best performance. The following function is fixed to use 128 as the base radix
void getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices) void getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
{ {
int baseRadix = min(n, 128); int baseRadix = min(n, 128);
@ -878,7 +986,9 @@ void getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
} }
for (int i = 0; i < numR; i++) for (int i = 0; i < numR; i++)
radix[i] = baseRadix; {
radix[i] = baseRadix;
}
radix[numR] = N; radix[numR] = N;
numR++; numR++;
@ -906,6 +1016,7 @@ void getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
} }
} }
static void static void
createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS) createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS)
{ {
@ -960,12 +1071,18 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
int strideI = Rinit; int strideI = Rinit;
for (i = 0; i < numPasses; i++) for (i = 0; i < numPasses; i++)
if (i != passNum) {
strideI *= radixArr[i]; if (i != passNum)
{
strideI *= radixArr[i];
}
}
int strideO = Rinit; int strideO = Rinit;
for (i = 0; i < passNum; i++) for (i = 0; i < passNum; i++)
strideO *= radixArr[i]; {
strideO *= radixArr[i];
}
int threadsPerXForm = R2; int threadsPerXForm = R2;
batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize; batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize;
@ -986,30 +1103,44 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
int numBlocksPerXForm = strideI / batchSize; int numBlocksPerXForm = strideI / batchSize;
int numBlocks = numBlocksPerXForm; int numBlocks = numBlocksPerXForm;
if (!vertical) if (!vertical)
numBlocks *= BS; {
numBlocks *= BS;
}
else else
numBlocks *= vertBS; {
numBlocks *= vertBS;
}
kernelName = string("fft") + num2str(kCount); kernelName = string("fft") + num2str(kCount);
*kInfo = (cl_fft_kernel_info *)malloc(sizeof(cl_fft_kernel_info)); *kInfo = (cl_fft_kernel_info *)malloc(sizeof(cl_fft_kernel_info));
(*kInfo)->kernel = nullptr; (*kInfo)->kernel = nullptr;
if (R2 == 1) if (R2 == 1)
(*kInfo)->lmem_size = 0; {
(*kInfo)->lmem_size = 0;
}
else else
{ {
if (strideO == 1) if (strideO == 1)
(*kInfo)->lmem_size = (radix + 1) * batchSize; {
(*kInfo)->lmem_size = (radix + 1) * batchSize;
}
else else
(*kInfo)->lmem_size = threadsPerBlock * R1; {
(*kInfo)->lmem_size = threadsPerBlock * R1;
}
} }
(*kInfo)->num_workgroups = numBlocks; (*kInfo)->num_workgroups = numBlocks;
(*kInfo)->num_xforms_per_workgroup = 1; (*kInfo)->num_xforms_per_workgroup = 1;
(*kInfo)->num_workitems_per_workgroup = threadsPerBlock; (*kInfo)->num_workitems_per_workgroup = threadsPerBlock;
(*kInfo)->dir = dir; (*kInfo)->dir = dir;
if ((passNum == (numPasses - 1)) && (numPasses & 1)) if ((passNum == (numPasses - 1)) && (numPasses & 1))
(*kInfo)->in_place_possible = 1; {
(*kInfo)->in_place_possible = 1;
}
else else
(*kInfo)->in_place_possible = 0; {
(*kInfo)->in_place_possible = 0;
}
(*kInfo)->next = nullptr; (*kInfo)->next = nullptr;
(*kInfo)->kernel_name = (char *)malloc(sizeof(char) * (kernelName.size() + 1)); (*kInfo)->kernel_name = (char *)malloc(sizeof(char) * (kernelName.size() + 1));
snprintf((*kInfo)->kernel_name, sizeof((*kInfo)->kernel_name), kernelName.c_str()); snprintf((*kInfo)->kernel_name, sizeof((*kInfo)->kernel_name), kernelName.c_str());
@ -1026,7 +1157,9 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
int stride = radix * Rinit; int stride = radix * Rinit;
for (i = 0; i < passNum; i++) for (i = 0; i < passNum; i++)
stride *= radixArr[i]; {
stride *= radixArr[i];
}
localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int)log2(n * BS)) + string("));\n"); localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int)log2(n * BS)) + string("));\n");
localString += string("bNum = groupId;\n"); localString += string("bNum = groupId;\n");
} }
@ -1041,7 +1174,9 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
int stride = radix * Rinit; int stride = radix * Rinit;
for (i = 0; i < passNum; i++) for (i = 0; i < passNum; i++)
stride *= radixArr[i]; {
stride *= radixArr[i];
}
localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n"); localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n");
localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n"); localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n");
localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n"); localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n");
@ -1059,15 +1194,21 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("in_real += indexIn;\n"); localString += string("in_real += indexIn;\n");
localString += string("in_imag += indexIn;\n"); localString += string("in_imag += indexIn;\n");
for (j = 0; j < R1; j++) for (j = 0; j < R1; j++)
localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j * gInInc * strideI) + string("];\n"); {
localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j * gInInc * strideI) + string("];\n");
}
for (j = 0; j < R1; j++) for (j = 0; j < R1; j++)
localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j * gInInc * strideI) + string("];\n"); {
localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j * gInInc * strideI) + string("];\n");
}
} }
else else
{ {
localString += string("in += indexIn;\n"); localString += string("in += indexIn;\n");
for (j = 0; j < R1; j++) for (j = 0; j < R1; j++)
localString += string("a[") + num2str(j) + string("] = in[") + num2str(j * gInInc * strideI) + string("];\n"); {
localString += string("a[") + num2str(j) + string("] = in[") + num2str(j * gInInc * strideI) + string("];\n");
}
} }
localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n"); localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n");
@ -1088,22 +1229,36 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("lMemStore = sMem + tid;\n"); localString += string("lMemStore = sMem + tid;\n");
localString += string("lMemLoad = sMem + indexIn;\n"); localString += string("lMemLoad = sMem + indexIn;\n");
for (k = 0; k < R1; k++) for (k = 0; k < R1; k++)
localString += string("lMemStore[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n"); {
localString += string("lMemStore[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for (k = 0; k < numIter; k++) for (k = 0; k < numIter; k++)
for (t = 0; t < R2; t++) {
localString += string("a[") + num2str(k * R2 + t) + string("].x = lMemLoad[") + num2str(t * batchSize + k * threadsPerBlock) + string("];\n"); for (t = 0; t < R2; t++)
{
localString += string("a[") + num2str(k * R2 + t) + string("].x = lMemLoad[") + num2str(t * batchSize + k * threadsPerBlock) + string("];\n");
}
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for (k = 0; k < R1; k++) for (k = 0; k < R1; k++)
localString += string("lMemStore[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n"); {
localString += string("lMemStore[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for (k = 0; k < numIter; k++) for (k = 0; k < numIter; k++)
for (t = 0; t < R2; t++) {
localString += string("a[") + num2str(k * R2 + t) + string("].y = lMemLoad[") + num2str(t * batchSize + k * threadsPerBlock) + string("];\n"); for (t = 0; t < R2; t++)
{
localString += string("a[") + num2str(k * R2 + t) + string("].y = lMemLoad[") + num2str(t * batchSize + k * threadsPerBlock) + string("];\n");
}
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for (j = 0; j < numIter; j++) for (j = 0; j < numIter; j++)
localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j * R2) + string(", dir);\n"); {
localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j * R2) + string(", dir);\n");
}
} }
// twiddle // twiddle
@ -1127,40 +1282,60 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix + 1) + string(", tid & ") + num2str(radix - 1) + string(");\n"); localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix + 1) + string(", tid & ") + num2str(radix - 1) + string(");\n");
for (i = 0; i < R1 / R2; i++) for (i = 0; i < R1 / R2; i++)
for (j = 0; j < R2; j++) {
localString += string("lMemStore[ ") + num2str(i + j * R1) + string("] = a[") + num2str(i * R2 + j) + string("].x;\n"); for (j = 0; j < R2; j++)
{
localString += string("lMemStore[ ") + num2str(i + j * R1) + string("] = a[") + num2str(i * R2 + j) + string("].x;\n");
}
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
if (threadsPerBlock >= radix) if (threadsPerBlock >= radix)
{ {
for (i = 0; i < R1; i++) for (i = 0; i < R1; i++)
localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * (radix + 1) * (threadsPerBlock / radix)) + string("];\n"); {
localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * (radix + 1) * (threadsPerBlock / radix)) + string("];\n");
}
} }
else else
{ {
int innerIter = radix / threadsPerBlock; int innerIter = radix / threadsPerBlock;
int outerIter = R1 / innerIter; int outerIter = R1 / innerIter;
for (i = 0; i < outerIter; i++) for (i = 0; i < outerIter; i++)
for (j = 0; j < innerIter; j++) {
localString += string("a[") + num2str(i * innerIter + j) + string("].x = lMemLoad[") + num2str(j * threadsPerBlock + i * (radix + 1)) + string("];\n"); for (j = 0; j < innerIter; j++)
{
localString += string("a[") + num2str(i * innerIter + j) + string("].x = lMemLoad[") + num2str(j * threadsPerBlock + i * (radix + 1)) + string("];\n");
}
}
} }
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for (i = 0; i < R1 / R2; i++) for (i = 0; i < R1 / R2; i++)
for (j = 0; j < R2; j++) {
localString += string("lMemStore[ ") + num2str(i + j * R1) + string("] = a[") + num2str(i * R2 + j) + string("].y;\n"); for (j = 0; j < R2; j++)
{
localString += string("lMemStore[ ") + num2str(i + j * R1) + string("] = a[") + num2str(i * R2 + j) + string("].y;\n");
}
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
if (threadsPerBlock >= radix) if (threadsPerBlock >= radix)
{ {
for (i = 0; i < R1; i++) for (i = 0; i < R1; i++)
localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * (radix + 1) * (threadsPerBlock / radix)) + string("];\n"); {
localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * (radix + 1) * (threadsPerBlock / radix)) + string("];\n");
}
} }
else else
{ {
int innerIter = radix / threadsPerBlock; int innerIter = radix / threadsPerBlock;
int outerIter = R1 / innerIter; int outerIter = R1 / innerIter;
for (i = 0; i < outerIter; i++) for (i = 0; i < outerIter; i++)
for (j = 0; j < innerIter; j++) {
localString += string("a[") + num2str(i * innerIter + j) + string("].y = lMemLoad[") + num2str(j * threadsPerBlock + i * (radix + 1)) + string("];\n"); for (j = 0; j < innerIter; j++)
{
localString += string("a[") + num2str(i * innerIter + j) + string("].y = lMemLoad[") + num2str(j * threadsPerBlock + i * (radix + 1)) + string("];\n");
}
}
} }
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
@ -1170,15 +1345,21 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("out_real += indexOut;\n"); localString += string("out_real += indexOut;\n");
localString += string("out_imag += indexOut;\n"); localString += string("out_imag += indexOut;\n");
for (k = 0; k < R1; k++) for (k = 0; k < R1; k++)
localString += string("out_real[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n"); {
localString += string("out_real[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
}
for (k = 0; k < R1; k++) for (k = 0; k < R1; k++)
localString += string("out_imag[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n"); {
localString += string("out_imag[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
}
} }
else else
{ {
localString += string("out += indexOut;\n"); localString += string("out += indexOut;\n");
for (k = 0; k < R1; k++) for (k = 0; k < R1; k++)
localString += string("out[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n"); {
localString += string("out[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n");
}
} }
} }
else else
@ -1189,22 +1370,30 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("out_real += indexOut;\n"); localString += string("out_real += indexOut;\n");
localString += string("out_imag += indexOut;\n"); localString += string("out_imag += indexOut;\n");
for (k = 0; k < R1; k++) for (k = 0; k < R1; k++)
localString += string("out_real[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("].x;\n"); {
localString += string("out_real[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("].x;\n");
}
for (k = 0; k < R1; k++) for (k = 0; k < R1; k++)
localString += string("out_imag[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("].y;\n"); {
localString += string("out_imag[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("].y;\n");
}
} }
else else
{ {
localString += string("out += indexOut;\n"); localString += string("out += indexOut;\n");
for (k = 0; k < R1; k++) for (k = 0; k < R1; k++)
localString += string("out[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("];\n"); {
localString += string("out[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("];\n");
}
} }
} }
insertHeader(*kernelString, kernelName, dataFormat); insertHeader(*kernelString, kernelName, dataFormat);
*kernelString += string("{\n"); *kernelString += string("{\n");
if ((*kInfo)->lmem_size) if ((*kInfo)->lmem_size)
*kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n"); {
*kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
}
*kernelString += localString; *kernelString += localString;
*kernelString += string("}\n"); *kernelString += string("}\n");
@ -1214,6 +1403,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
} }
} }
void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir) void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir)
{ {
unsigned int radixArray[10]; unsigned int radixArray[10];
@ -1237,21 +1427,29 @@ void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir)
{ {
getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix); getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix);
if (plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup) if (plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup)
createLocalMemfftKernelString(plan); {
createLocalMemfftKernelString(plan);
}
else else
createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1); {
createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
}
} }
} }
break; break;
case cl_fft_kernel_y: case cl_fft_kernel_y:
if (plan->n.y > 1) if (plan->n.y > 1)
createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1); {
createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1);
}
break; break;
case cl_fft_kernel_z: case cl_fft_kernel_z:
if (plan->n.z > 1) if (plan->n.z > 1)
createGlobalFFTKernelString(plan, plan->n.z, plan->n.x * plan->n.y, cl_fft_kernel_z, 1); {
createGlobalFFTKernelString(plan, plan->n.z, plan->n.x * plan->n.y, cl_fft_kernel_z, 1);
}
default: default:
return; return;
} }

View File

@ -31,9 +31,13 @@ getBlockConfigAndKernelString(cl_fft_plan *plan)
*plan->kernel_string += baseKernels; *plan->kernel_string += baseKernels;
if (plan->format == clFFT_SplitComplexFormat) if (plan->format == clFFT_SplitComplexFormat)
*plan->kernel_string += twistKernelPlannar; {
*plan->kernel_string += twistKernelPlannar;
}
else else
*plan->kernel_string += twistKernelInterleaved; {
*plan->kernel_string += twistKernelInterleaved;
}
switch (plan->dim) switch (plan->dim)
{ {
@ -72,13 +76,18 @@ deleteKernelInfo(cl_fft_kernel_info *kInfo)
if (kInfo) if (kInfo)
{ {
if (kInfo->kernel_name) if (kInfo->kernel_name)
free(kInfo->kernel_name); {
free(kInfo->kernel_name);
}
if (kInfo->kernel) if (kInfo->kernel)
clReleaseKernel(kInfo->kernel); {
clReleaseKernel(kInfo->kernel);
}
free(kInfo); free(kInfo);
} }
} }
static void static void
destroy_plan(cl_fft_plan *Plan) destroy_plan(cl_fft_plan *Plan)
{ {
@ -125,6 +134,7 @@ destroy_plan(cl_fft_plan *Plan)
} }
} }
static int static int
createKernelList(cl_fft_plan *plan) createKernelList(cl_fft_plan *plan)
{ {
@ -136,21 +146,30 @@ createKernelList(cl_fft_plan *plan)
{ {
kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err); kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err);
if (!kernel_info->kernel || err != CL_SUCCESS) if (!kernel_info->kernel || err != CL_SUCCESS)
return err; {
return err;
}
kernel_info = kernel_info->next; kernel_info = kernel_info->next;
} }
if (plan->format == clFFT_SplitComplexFormat) if (plan->format == clFFT_SplitComplexFormat)
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err); {
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err);
}
else else
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err); {
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err);
}
if (!plan->twist_kernel || err) if (!plan->twist_kernel || err)
return err; {
return err;
}
return CL_SUCCESS; return CL_SUCCESS;
} }
int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices) int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices)
{ {
int reg_needed = 0; int reg_needed = 0;
@ -166,13 +185,19 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi
{ {
err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, nullptr); err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, nullptr);
if (err != CL_SUCCESS) if (err != CL_SUCCESS)
return -1; {
return -1;
}
if (wg_size < kInfo->num_workitems_per_workgroup) if (wg_size < kInfo->num_workitems_per_workgroup)
reg_needed |= 1; {
reg_needed |= 1;
}
if (*max_wg_size > wg_size) if (*max_wg_size > wg_size)
*max_wg_size = wg_size; {
*max_wg_size = wg_size;
}
kInfo = kInfo->next; kInfo = kInfo->next;
} }
@ -181,6 +206,7 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi
return reg_needed; return reg_needed;
} }
#define ERR_MACRO(err) \ #define ERR_MACRO(err) \
{ \ { \
if ((err) != CL_SUCCESS) \ if ((err) != CL_SUCCESS) \
@ -192,6 +218,7 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi
} \ } \
} }
clFFT_Plan clFFT_Plan
clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code) clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code)
{ {
@ -326,11 +353,14 @@ patch_kernel_source:
} }
if (error_code) if (error_code)
*error_code = CL_SUCCESS; {
*error_code = CL_SUCCESS;
}
return (clFFT_Plan)plan; return (clFFT_Plan)plan;
} }
void clFFT_DestroyPlan(clFFT_Plan plan) void clFFT_DestroyPlan(clFFT_Plan plan)
{ {
auto *Plan = (cl_fft_plan *)plan; auto *Plan = (cl_fft_plan *)plan;
@ -342,15 +372,20 @@ void clFFT_DestroyPlan(clFFT_Plan plan)
} }
} }
void clFFT_DumpPlan(clFFT_Plan Plan, FILE *file) void clFFT_DumpPlan(clFFT_Plan Plan, FILE *file)
{ {
size_t gDim; size_t gDim;
size_t lDim; size_t lDim;
FILE *out; FILE *out;
if (!file) if (!file)
out = stdout; {
out = stdout;
}
else else
out = file; {
out = file;
}
auto *plan = (cl_fft_plan *)Plan; auto *plan = (cl_fft_plan *)Plan;
cl_fft_kernel_info *kInfo = plan->kernel_info; cl_fft_kernel_info *kInfo = plan->kernel_info;