mirror of
https://github.com/gnss-sdr/gnss-sdr
synced 2025-07-23 20:33:05 +00:00
CI: make clang-tidy happy
This commit is contained in:
parent
12ed230cd7
commit
d9a516e0b8
@ -224,7 +224,7 @@ void GpsL1CaPcpsOpenClAcquisition::reset()
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
float GpsL1CaPcpsOpenClAcquisition::calculate_threshold(float pfa)
|
float GpsL1CaPcpsOpenClAcquisition::calculate_threshold(float pfa) const
|
||||||
{
|
{
|
||||||
// Calculate the threshold
|
// Calculate the threshold
|
||||||
unsigned int frequency_bins = 0;
|
unsigned int frequency_bins = 0;
|
||||||
|
@ -146,7 +146,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
float calculate_threshold(float pfa);
|
float calculate_threshold(float pfa) const;
|
||||||
const ConfigurationInterface* configuration_;
|
const ConfigurationInterface* configuration_;
|
||||||
pcps_opencl_acquisition_cc_sptr acquisition_cc_;
|
pcps_opencl_acquisition_cc_sptr acquisition_cc_;
|
||||||
gr::blocks::stream_to_vector::sptr stream_to_vector_;
|
gr::blocks::stream_to_vector::sptr stream_to_vector_;
|
||||||
|
@ -28,13 +28,16 @@ allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize)
|
|||||||
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float);
|
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float);
|
||||||
|
|
||||||
if (plan->tempmemobj)
|
if (plan->tempmemobj)
|
||||||
clReleaseMemObject(plan->tempmemobj);
|
{
|
||||||
|
clReleaseMemObject(plan->tempmemobj);
|
||||||
|
}
|
||||||
|
|
||||||
plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &err);
|
plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &err);
|
||||||
}
|
}
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static cl_int
|
static cl_int
|
||||||
allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
|
allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
|
||||||
{
|
{
|
||||||
@ -46,10 +49,14 @@ allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
|
|||||||
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float);
|
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float);
|
||||||
|
|
||||||
if (plan->tempmemobj_real)
|
if (plan->tempmemobj_real)
|
||||||
clReleaseMemObject(plan->tempmemobj_real);
|
{
|
||||||
|
clReleaseMemObject(plan->tempmemobj_real);
|
||||||
|
}
|
||||||
|
|
||||||
if (plan->tempmemobj_imag)
|
if (plan->tempmemobj_imag)
|
||||||
clReleaseMemObject(plan->tempmemobj_imag);
|
{
|
||||||
|
clReleaseMemObject(plan->tempmemobj_imag);
|
||||||
|
}
|
||||||
|
|
||||||
plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &err);
|
plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &err);
|
||||||
plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &terr);
|
plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, nullptr, &terr);
|
||||||
@ -58,6 +65,7 @@ allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems)
|
void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems)
|
||||||
{
|
{
|
||||||
*lWorkItems = kernelInfo->num_workitems_per_workgroup;
|
*lWorkItems = kernelInfo->num_workitems_per_workgroup;
|
||||||
@ -83,6 +91,7 @@ void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo,
|
|||||||
*gWorkItems = numWorkGroups * *lWorkItems;
|
*gWorkItems = numWorkGroups * *lWorkItems;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
cl_int
|
cl_int
|
||||||
clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
|
clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
|
||||||
cl_mem data_in, cl_mem data_out,
|
cl_mem data_in, cl_mem data_out,
|
||||||
@ -91,7 +100,9 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
|
|||||||
int s;
|
int s;
|
||||||
auto *plan = (cl_fft_plan *)Plan;
|
auto *plan = (cl_fft_plan *)Plan;
|
||||||
if (plan->format != clFFT_InterleavedComplexFormat)
|
if (plan->format != clFFT_InterleavedComplexFormat)
|
||||||
return CL_INVALID_VALUE;
|
{
|
||||||
|
return CL_INVALID_VALUE;
|
||||||
|
}
|
||||||
|
|
||||||
cl_int err;
|
cl_int err;
|
||||||
size_t gWorkItems;
|
size_t gWorkItems;
|
||||||
@ -101,7 +112,9 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
|
|||||||
cl_int isInPlace = data_in == data_out ? 1 : 0;
|
cl_int isInPlace = data_in == data_out ? 1 : 0;
|
||||||
|
|
||||||
if ((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS)
|
if ((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS)
|
||||||
return err;
|
{
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
cl_mem memObj[3];
|
cl_mem memObj[3];
|
||||||
memObj[0] = data_in;
|
memObj[0] = data_in;
|
||||||
@ -146,7 +159,9 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
|
|||||||
|
|
||||||
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
|
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
|
||||||
if (err)
|
if (err)
|
||||||
return err;
|
{
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
currRead = (currWrite == 1) ? 1 : 2;
|
currRead = (currWrite == 1) ? 1 : 2;
|
||||||
currWrite = (currWrite == 1) ? 2 : 1;
|
currWrite = (currWrite == 1) ? 2 : 1;
|
||||||
@ -169,7 +184,9 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
|
|||||||
|
|
||||||
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
|
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
|
||||||
if (err)
|
if (err)
|
||||||
return err;
|
{
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
currRead = 1;
|
currRead = 1;
|
||||||
currWrite = 1;
|
currWrite = 1;
|
||||||
@ -181,6 +198,7 @@ clFFT_ExecuteInterleaved(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSi
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
cl_int
|
cl_int
|
||||||
clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
|
clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
|
||||||
cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
|
cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
|
||||||
@ -190,7 +208,9 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
|
|||||||
auto *plan = (cl_fft_plan *)Plan;
|
auto *plan = (cl_fft_plan *)Plan;
|
||||||
|
|
||||||
if (plan->format != clFFT_SplitComplexFormat)
|
if (plan->format != clFFT_SplitComplexFormat)
|
||||||
return CL_INVALID_VALUE;
|
{
|
||||||
|
return CL_INVALID_VALUE;
|
||||||
|
}
|
||||||
|
|
||||||
cl_int err;
|
cl_int err;
|
||||||
size_t gWorkItems;
|
size_t gWorkItems;
|
||||||
@ -200,7 +220,9 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
|
|||||||
cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0;
|
cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0;
|
||||||
|
|
||||||
if ((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS)
|
if ((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS)
|
||||||
return err;
|
{
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
cl_mem memObj_real[3];
|
cl_mem memObj_real[3];
|
||||||
cl_mem memObj_imag[3];
|
cl_mem memObj_imag[3];
|
||||||
@ -252,7 +274,9 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
|
|||||||
|
|
||||||
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
|
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
|
||||||
if (err)
|
if (err)
|
||||||
return err;
|
{
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
currRead = (currWrite == 1) ? 1 : 2;
|
currRead = (currWrite == 1) ? 1 : 2;
|
||||||
currWrite = (currWrite == 1) ? 2 : 1;
|
currWrite = (currWrite == 1) ? 2 : 1;
|
||||||
@ -276,7 +300,9 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
|
|||||||
|
|
||||||
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
|
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, nullptr, &gWorkItems, &lWorkItems, 0, nullptr, nullptr);
|
||||||
if (err)
|
if (err)
|
||||||
return err;
|
{
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
currRead = 1;
|
currRead = 1;
|
||||||
currWrite = 1;
|
currWrite = 1;
|
||||||
@ -288,6 +314,7 @@ clFFT_ExecutePlannar(cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
cl_int
|
cl_int
|
||||||
clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
|
clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
|
||||||
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
|
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
|
||||||
@ -304,12 +331,16 @@ clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
|
|||||||
cl_device_id device_id;
|
cl_device_id device_id;
|
||||||
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, nullptr);
|
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, nullptr);
|
||||||
if (err)
|
if (err)
|
||||||
return err;
|
{
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
size_t gSize;
|
size_t gSize;
|
||||||
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, nullptr);
|
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, nullptr);
|
||||||
if (err)
|
if (err)
|
||||||
return err;
|
{
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
gSize = min(128, gSize);
|
gSize = min(128, gSize);
|
||||||
size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize};
|
size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize};
|
||||||
@ -327,6 +358,7 @@ clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
cl_int
|
cl_int
|
||||||
clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
|
clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
|
||||||
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
|
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
|
||||||
@ -343,12 +375,16 @@ clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real,
|
|||||||
cl_device_id device_id;
|
cl_device_id device_id;
|
||||||
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, nullptr);
|
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, nullptr);
|
||||||
if (err)
|
if (err)
|
||||||
return err;
|
{
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
size_t gSize;
|
size_t gSize;
|
||||||
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, nullptr);
|
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, nullptr);
|
||||||
if (err)
|
if (err)
|
||||||
return err;
|
{
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
gSize = min(128, gSize);
|
gSize = min(128, gSize);
|
||||||
size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize};
|
size_t numGlobalThreads[1] = {max(numCols / gSize, 1) * gSize};
|
||||||
|
@ -30,7 +30,7 @@ num2str(int num)
|
|||||||
{
|
{
|
||||||
char temp[200];
|
char temp[200];
|
||||||
snprintf(temp, sizeof(temp), "%d", num);
|
snprintf(temp, sizeof(temp), "%d", num);
|
||||||
return string(temp);
|
return {temp};
|
||||||
}
|
}
|
||||||
|
|
||||||
// For any n, this function decomposes n into factors for loacal memory tranpose
|
// For any n, this function decomposes n into factors for loacal memory tranpose
|
||||||
@ -155,15 +155,21 @@ getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat)
|
insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat)
|
||||||
{
|
{
|
||||||
if (dataFormat == clFFT_SplitComplexFormat)
|
if (dataFormat == clFFT_SplitComplexFormat)
|
||||||
kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n");
|
{
|
||||||
|
kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n");
|
||||||
|
}
|
||||||
else
|
else
|
||||||
kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n");
|
{
|
||||||
|
kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
insertVariables(string &kStream, int maxRadix)
|
insertVariables(string &kStream, int maxRadix)
|
||||||
{
|
{
|
||||||
@ -177,11 +183,14 @@ insertVariables(string &kStream, int maxRadix)
|
|||||||
kStream += string(" int groupId = get_group_id( 0 );\n");
|
kStream += string(" int groupId = get_group_id( 0 );\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
|
formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
|
||||||
{
|
{
|
||||||
if (dataFormat == clFFT_InterleavedComplexFormat)
|
if (dataFormat == clFFT_InterleavedComplexFormat)
|
||||||
kernelString += string(" a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n");
|
{
|
||||||
|
kernelString += string(" a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n");
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
kernelString += string(" a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n");
|
kernelString += string(" a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n");
|
||||||
@ -189,11 +198,14 @@ formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dat
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
|
formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
|
||||||
{
|
{
|
||||||
if (dataFormat == clFFT_InterleavedComplexFormat)
|
if (dataFormat == clFFT_InterleavedComplexFormat)
|
||||||
kernelString += string(" out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n");
|
{
|
||||||
|
kernelString += string(" out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n");
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
kernelString += string(" out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n");
|
kernelString += string(" out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n");
|
||||||
@ -201,6 +213,7 @@ formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat da
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat)
|
insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat)
|
||||||
{
|
{
|
||||||
@ -211,7 +224,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
|
|||||||
int lMemSize = 0;
|
int lMemSize = 0;
|
||||||
|
|
||||||
if (numXFormsPerWG > 1)
|
if (numXFormsPerWG > 1)
|
||||||
kernelString += string(" s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n");
|
{
|
||||||
|
kernelString += string(" s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n");
|
||||||
|
}
|
||||||
|
|
||||||
if (numWorkItemsPerXForm >= mem_coalesce_width)
|
if (numWorkItemsPerXForm >= mem_coalesce_width)
|
||||||
{
|
{
|
||||||
@ -234,7 +249,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
|
|||||||
kernelString += string(" out_imag += offset;\n");
|
kernelString += string(" out_imag += offset;\n");
|
||||||
}
|
}
|
||||||
for (i = 0; i < R0; i++)
|
for (i = 0; i < R0; i++)
|
||||||
formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat);
|
{
|
||||||
|
formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat);
|
||||||
|
}
|
||||||
kernelString += string(" }\n");
|
kernelString += string(" }\n");
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -255,7 +272,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
|
|||||||
kernelString += string(" out_imag += offset;\n");
|
kernelString += string(" out_imag += offset;\n");
|
||||||
}
|
}
|
||||||
for (i = 0; i < R0; i++)
|
for (i = 0; i < R0; i++)
|
||||||
formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat);
|
{
|
||||||
|
formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (N >= mem_coalesce_width)
|
else if (N >= mem_coalesce_width)
|
||||||
@ -286,17 +305,23 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
|
|||||||
{
|
{
|
||||||
kernelString += string(" if( jj < s ) {\n");
|
kernelString += string(" if( jj < s ) {\n");
|
||||||
for (j = 0; j < numInnerIter; j++)
|
for (j = 0; j < numInnerIter; j++)
|
||||||
formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
|
{
|
||||||
|
formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
|
||||||
|
}
|
||||||
kernelString += string(" }\n");
|
kernelString += string(" }\n");
|
||||||
if (i != numOuterIter - 1)
|
if (i != numOuterIter - 1)
|
||||||
kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
|
{
|
||||||
|
kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
kernelString += string("}\n ");
|
kernelString += string("}\n ");
|
||||||
kernelString += string("else {\n");
|
kernelString += string("else {\n");
|
||||||
for (i = 0; i < numOuterIter; i++)
|
for (i = 0; i < numOuterIter; i++)
|
||||||
{
|
{
|
||||||
for (j = 0; j < numInnerIter; j++)
|
for (j = 0; j < numInnerIter; j++)
|
||||||
formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
|
{
|
||||||
|
formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
kernelString += string("}\n");
|
kernelString += string("}\n");
|
||||||
|
|
||||||
@ -315,7 +340,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
|
|||||||
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
||||||
|
|
||||||
for (i = 0; i < R0; i++)
|
for (i = 0; i < R0; i++)
|
||||||
kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
|
{
|
||||||
|
kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
|
||||||
|
}
|
||||||
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
||||||
|
|
||||||
for (i = 0; i < numOuterIter; i++)
|
for (i = 0; i < numOuterIter; i++)
|
||||||
@ -329,7 +356,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
|
|||||||
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
||||||
|
|
||||||
for (i = 0; i < R0; i++)
|
for (i = 0; i < R0; i++)
|
||||||
kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
|
{
|
||||||
|
kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
|
||||||
|
}
|
||||||
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
||||||
|
|
||||||
lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
|
lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
|
||||||
@ -360,7 +389,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
|
|||||||
kernelString += string(" if(jj < s )\n");
|
kernelString += string(" if(jj < s )\n");
|
||||||
formattedLoad(kernelString, i, i * groupSize, dataFormat);
|
formattedLoad(kernelString, i, i * groupSize, dataFormat);
|
||||||
if (i != R0 - 1)
|
if (i != R0 - 1)
|
||||||
kernelString += string(" jj += ") + num2str(groupSize / N) + string(";\n");
|
{
|
||||||
|
kernelString += string(" jj += ") + num2str(groupSize / N) + string(";\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
kernelString += string("}\n");
|
kernelString += string("}\n");
|
||||||
kernelString += string("else {\n");
|
kernelString += string("else {\n");
|
||||||
@ -385,19 +416,27 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
|
|||||||
|
|
||||||
|
|
||||||
for (i = 0; i < R0; i++)
|
for (i = 0; i < R0; i++)
|
||||||
kernelString += string(" lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("] = a[") + num2str(i) + string("].x;\n");
|
{
|
||||||
|
kernelString += string(" lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("] = a[") + num2str(i) + string("].x;\n");
|
||||||
|
}
|
||||||
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
||||||
|
|
||||||
for (i = 0; i < R0; i++)
|
for (i = 0; i < R0; i++)
|
||||||
kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
|
{
|
||||||
|
kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
|
||||||
|
}
|
||||||
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
||||||
|
|
||||||
for (i = 0; i < R0; i++)
|
for (i = 0; i < R0; i++)
|
||||||
kernelString += string(" lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("] = a[") + num2str(i) + string("].y;\n");
|
{
|
||||||
|
kernelString += string(" lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("] = a[") + num2str(i) + string("].y;\n");
|
||||||
|
}
|
||||||
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
||||||
|
|
||||||
for (i = 0; i < R0; i++)
|
for (i = 0; i < R0; i++)
|
||||||
kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
|
{
|
||||||
|
kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
|
||||||
|
}
|
||||||
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
||||||
|
|
||||||
lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
|
lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
|
||||||
@ -406,6 +445,7 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
|
|||||||
return lMemSize;
|
return lMemSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat)
|
insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat)
|
||||||
{
|
{
|
||||||
@ -433,7 +473,9 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
|
|||||||
formattedStore(kernelString, ind, i * numWorkItemsPerXForm, dataFormat);
|
formattedStore(kernelString, ind, i * numWorkItemsPerXForm, dataFormat);
|
||||||
}
|
}
|
||||||
if (numXFormsPerWG > 1)
|
if (numXFormsPerWG > 1)
|
||||||
kernelString += string(" }\n");
|
{
|
||||||
|
kernelString += string(" }\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (N >= mem_coalesce_width)
|
else if (N >= mem_coalesce_width)
|
||||||
{
|
{
|
||||||
@ -455,8 +497,12 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
|
|||||||
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
||||||
|
|
||||||
for (i = 0; i < numOuterIter; i++)
|
for (i = 0; i < numOuterIter; i++)
|
||||||
for (j = 0; j < numInnerIter; j++)
|
{
|
||||||
kernelString += string(" a[") + num2str(i * numInnerIter + j) + string("].x = lMemStore[") + num2str(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)) + string("];\n");
|
for (j = 0; j < numInnerIter; j++)
|
||||||
|
{
|
||||||
|
kernelString += string(" a[") + num2str(i * numInnerIter + j) + string("].x = lMemStore[") + num2str(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)) + string("];\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
||||||
|
|
||||||
for (i = 0; i < maxRadix; i++)
|
for (i = 0; i < maxRadix; i++)
|
||||||
@ -469,8 +515,12 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
|
|||||||
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
||||||
|
|
||||||
for (i = 0; i < numOuterIter; i++)
|
for (i = 0; i < numOuterIter; i++)
|
||||||
for (j = 0; j < numInnerIter; j++)
|
{
|
||||||
kernelString += string(" a[") + num2str(i * numInnerIter + j) + string("].y = lMemStore[") + num2str(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)) + string("];\n");
|
for (j = 0; j < numInnerIter; j++)
|
||||||
|
{
|
||||||
|
kernelString += string(" a[") + num2str(i * numInnerIter + j) + string("].y = lMemStore[") + num2str(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)) + string("];\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
||||||
|
|
||||||
kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
|
kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
|
||||||
@ -478,17 +528,23 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
|
|||||||
{
|
{
|
||||||
kernelString += string(" if( jj < s ) {\n");
|
kernelString += string(" if( jj < s ) {\n");
|
||||||
for (j = 0; j < numInnerIter; j++)
|
for (j = 0; j < numInnerIter; j++)
|
||||||
formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
|
{
|
||||||
|
formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
|
||||||
|
}
|
||||||
kernelString += string(" }\n");
|
kernelString += string(" }\n");
|
||||||
if (i != numOuterIter - 1)
|
if (i != numOuterIter - 1)
|
||||||
kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
|
{
|
||||||
|
kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
kernelString += string("}\n");
|
kernelString += string("}\n");
|
||||||
kernelString += string("else {\n");
|
kernelString += string("else {\n");
|
||||||
for (i = 0; i < numOuterIter; i++)
|
for (i = 0; i < numOuterIter; i++)
|
||||||
{
|
{
|
||||||
for (j = 0; j < numInnerIter; j++)
|
for (j = 0; j < numInnerIter; j++)
|
||||||
formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
|
{
|
||||||
|
formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
kernelString += string("}\n");
|
kernelString += string("}\n");
|
||||||
|
|
||||||
@ -512,7 +568,9 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
|
|||||||
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
||||||
|
|
||||||
for (i = 0; i < maxRadix; i++)
|
for (i = 0; i < maxRadix; i++)
|
||||||
kernelString += string(" a[") + num2str(i) + string("].x = lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("];\n");
|
{
|
||||||
|
kernelString += string(" a[") + num2str(i) + string("].x = lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("];\n");
|
||||||
|
}
|
||||||
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
||||||
|
|
||||||
for (i = 0; i < maxRadix; i++)
|
for (i = 0; i < maxRadix; i++)
|
||||||
@ -525,7 +583,9 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
|
|||||||
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
||||||
|
|
||||||
for (i = 0; i < maxRadix; i++)
|
for (i = 0; i < maxRadix; i++)
|
||||||
kernelString += string(" a[") + num2str(i) + string("].y = lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("];\n");
|
{
|
||||||
|
kernelString += string(" a[") + num2str(i) + string("].y = lMemStore[") + num2str(i * (groupSize / N) * (N + numWorkItemsPerXForm)) + string("];\n");
|
||||||
|
}
|
||||||
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
|
||||||
|
|
||||||
kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
|
kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
|
||||||
@ -535,7 +595,9 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
|
|||||||
formattedStore(kernelString, i, i * groupSize, dataFormat);
|
formattedStore(kernelString, i, i * groupSize, dataFormat);
|
||||||
kernelString += string(" }\n");
|
kernelString += string(" }\n");
|
||||||
if (i != maxRadix - 1)
|
if (i != maxRadix - 1)
|
||||||
kernelString += string(" jj +=") + num2str(groupSize / N) + string(";\n");
|
{
|
||||||
|
kernelString += string(" jj +=") + num2str(groupSize / N) + string(";\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
kernelString += string("}\n");
|
kernelString += string("}\n");
|
||||||
kernelString += string("else {\n");
|
kernelString += string("else {\n");
|
||||||
@ -551,6 +613,7 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
|
|||||||
return lMemSize;
|
return lMemSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
insertfftKernel(string &kernelString, int Nr, int numIter)
|
insertfftKernel(string &kernelString, int Nr, int numIter)
|
||||||
{
|
{
|
||||||
@ -561,6 +624,7 @@ insertfftKernel(string &kernelString, int Nr, int numIter)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm)
|
insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm)
|
||||||
{
|
{
|
||||||
@ -573,16 +637,24 @@ insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int le
|
|||||||
if (z == 0)
|
if (z == 0)
|
||||||
{
|
{
|
||||||
if (Nprev > 1)
|
if (Nprev > 1)
|
||||||
kernelString += string(" angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n");
|
{
|
||||||
|
kernelString += string(" angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n");
|
||||||
|
}
|
||||||
else
|
else
|
||||||
kernelString += string(" angf = (float) ii;\n");
|
{
|
||||||
|
kernelString += string(" angf = (float) ii;\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (Nprev > 1)
|
if (Nprev > 1)
|
||||||
kernelString += string(" angf = (float) ((") + num2str(z * numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n");
|
{
|
||||||
|
kernelString += string(" angf = (float) ((") + num2str(z * numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n");
|
||||||
|
}
|
||||||
else
|
else
|
||||||
kernelString += string(" angf = (float) (") + num2str(z * numWorkItemsPerXForm) + string(" + ii);\n");
|
{
|
||||||
|
kernelString += string(" angf = (float) (") + num2str(z * numWorkItemsPerXForm) + string(" + ii);\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (k = 1; k < Nr; k++)
|
for (k = 1; k < Nr; k++)
|
||||||
@ -596,30 +668,41 @@ insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int le
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad)
|
getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad)
|
||||||
{
|
{
|
||||||
if ((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks))
|
if ((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks))
|
||||||
*offset = 0;
|
{
|
||||||
|
*offset = 0;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev;
|
int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev;
|
||||||
int numColsReq = 1;
|
int numColsReq = 1;
|
||||||
if (numRowsReq > Nr)
|
if (numRowsReq > Nr)
|
||||||
numColsReq = numRowsReq / Nr;
|
{
|
||||||
|
numColsReq = numRowsReq / Nr;
|
||||||
|
}
|
||||||
numColsReq = Nprev * numColsReq;
|
numColsReq = Nprev * numColsReq;
|
||||||
*offset = numColsReq;
|
*offset = numColsReq;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1)
|
if (numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1)
|
||||||
*midPad = 0;
|
{
|
||||||
|
*midPad = 0;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
int bankNum = ((numWorkItemsReq + *offset) * Nr) & (numBanks - 1);
|
int bankNum = ((numWorkItemsReq + *offset) * Nr) & (numBanks - 1);
|
||||||
if (bankNum >= numWorkItemsPerXForm)
|
if (bankNum >= numWorkItemsPerXForm)
|
||||||
*midPad = 0;
|
{
|
||||||
|
*midPad = 0;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
*midPad = numWorkItemsPerXForm - bankNum;
|
{
|
||||||
|
*midPad = numWorkItemsPerXForm - bankNum;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int lMemSize = (numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1);
|
int lMemSize = (numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1);
|
||||||
@ -644,6 +727,7 @@ insertLocalStores(string &kernelString, int numIter, int Nr, int numWorkItemsPer
|
|||||||
kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n");
|
kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp)
|
insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp)
|
||||||
{
|
{
|
||||||
@ -676,6 +760,7 @@ insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Nc
|
|||||||
kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n");
|
kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad)
|
insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad)
|
||||||
{
|
{
|
||||||
@ -687,33 +772,52 @@ insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numW
|
|||||||
if (Ncurr < numWorkItemsPerXForm)
|
if (Ncurr < numWorkItemsPerXForm)
|
||||||
{
|
{
|
||||||
if (Nprev == 1)
|
if (Nprev == 1)
|
||||||
kernelString += string(" j = ii & ") + num2str(Ncurr - 1) + string(";\n");
|
{
|
||||||
|
kernelString += string(" j = ii & ") + num2str(Ncurr - 1) + string(";\n");
|
||||||
|
}
|
||||||
else
|
else
|
||||||
kernelString += string(" j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n");
|
{
|
||||||
|
kernelString += string(" j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n");
|
||||||
|
}
|
||||||
|
|
||||||
if (Nprev == 1)
|
if (Nprev == 1)
|
||||||
kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n");
|
{
|
||||||
|
kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n");
|
||||||
|
}
|
||||||
else
|
else
|
||||||
kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n");
|
{
|
||||||
|
kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (Nprev == 1)
|
if (Nprev == 1)
|
||||||
kernelString += string(" j = ii;\n");
|
{
|
||||||
|
kernelString += string(" j = ii;\n");
|
||||||
|
}
|
||||||
else
|
else
|
||||||
kernelString += string(" j = ii >> ") + num2str(logNprev) + string(";\n");
|
{
|
||||||
|
kernelString += string(" j = ii >> ") + num2str(logNprev) + string(";\n");
|
||||||
|
}
|
||||||
if (Nprev == 1)
|
if (Nprev == 1)
|
||||||
kernelString += string(" i = 0;\n");
|
{
|
||||||
|
kernelString += string(" i = 0;\n");
|
||||||
|
}
|
||||||
else
|
else
|
||||||
kernelString += string(" i = ii & ") + num2str(Nprev - 1) + string(";\n");
|
{
|
||||||
|
kernelString += string(" i = ii & ") + num2str(Nprev - 1) + string(";\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (numXFormsPerWG > 1)
|
if (numXFormsPerWG > 1)
|
||||||
kernelString += string(" i = mad24(jj, ") + num2str(incr) + string(", i);\n");
|
{
|
||||||
|
kernelString += string(" i = mad24(jj, ") + num2str(incr) + string(", i);\n");
|
||||||
|
}
|
||||||
|
|
||||||
kernelString += string(" lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n");
|
kernelString += string(" lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad)
|
insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad)
|
||||||
{
|
{
|
||||||
@ -742,7 +846,9 @@ createLocalMemfftKernelString(cl_fft_plan *plan)
|
|||||||
assert(numRadix > 0 && "no radix array supplied\n");
|
assert(numRadix > 0 && "no radix array supplied\n");
|
||||||
|
|
||||||
if (n / radixArray[0] > plan->max_work_item_per_workgroup)
|
if (n / radixArray[0] > plan->max_work_item_per_workgroup)
|
||||||
getRadixArray(n, radixArray, &numRadix, plan->max_radix);
|
{
|
||||||
|
getRadixArray(n, radixArray, &numRadix, plan->max_radix);
|
||||||
|
}
|
||||||
|
|
||||||
assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n");
|
assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n");
|
||||||
assert(n / radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n");
|
assert(n / radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n");
|
||||||
@ -839,11 +945,14 @@ createLocalMemfftKernelString(cl_fft_plan *plan)
|
|||||||
insertHeader(*kernelString, kernelName, dataFormat);
|
insertHeader(*kernelString, kernelName, dataFormat);
|
||||||
*kernelString += string("{\n");
|
*kernelString += string("{\n");
|
||||||
if ((*kInfo)->lmem_size)
|
if ((*kInfo)->lmem_size)
|
||||||
*kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
|
{
|
||||||
|
*kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
|
||||||
|
}
|
||||||
*kernelString += localString;
|
*kernelString += localString;
|
||||||
*kernelString += string("}\n");
|
*kernelString += string("}\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// For n larger than what can be computed using local memory fft, global transposes
|
// For n larger than what can be computed using local memory fft, global transposes
|
||||||
// multiple kernel launces is needed. For these sizes, n can be decomposed using
|
// multiple kernel launces is needed. For these sizes, n can be decomposed using
|
||||||
// much larger base radices i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel
|
// much larger base radices i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel
|
||||||
@ -864,7 +973,6 @@ createLocalMemfftKernelString(cl_fft_plan *plan)
|
|||||||
// in this example. Users can play with difference base radices and difference
|
// in this example. Users can play with difference base radices and difference
|
||||||
// decompositions of base radices to generates different kernels and see which gives
|
// decompositions of base radices to generates different kernels and see which gives
|
||||||
// best performance. Following function is just fixed to use 128 as base radix
|
// best performance. Following function is just fixed to use 128 as base radix
|
||||||
|
|
||||||
void getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
|
void getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
|
||||||
{
|
{
|
||||||
int baseRadix = min(n, 128);
|
int baseRadix = min(n, 128);
|
||||||
@ -878,7 +986,9 @@ void getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < numR; i++)
|
for (int i = 0; i < numR; i++)
|
||||||
radix[i] = baseRadix;
|
{
|
||||||
|
radix[i] = baseRadix;
|
||||||
|
}
|
||||||
|
|
||||||
radix[numR] = N;
|
radix[numR] = N;
|
||||||
numR++;
|
numR++;
|
||||||
@ -906,6 +1016,7 @@ void getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS)
|
createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS)
|
||||||
{
|
{
|
||||||
@ -960,12 +1071,18 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
|
|||||||
|
|
||||||
int strideI = Rinit;
|
int strideI = Rinit;
|
||||||
for (i = 0; i < numPasses; i++)
|
for (i = 0; i < numPasses; i++)
|
||||||
if (i != passNum)
|
{
|
||||||
strideI *= radixArr[i];
|
if (i != passNum)
|
||||||
|
{
|
||||||
|
strideI *= radixArr[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int strideO = Rinit;
|
int strideO = Rinit;
|
||||||
for (i = 0; i < passNum; i++)
|
for (i = 0; i < passNum; i++)
|
||||||
strideO *= radixArr[i];
|
{
|
||||||
|
strideO *= radixArr[i];
|
||||||
|
}
|
||||||
|
|
||||||
int threadsPerXForm = R2;
|
int threadsPerXForm = R2;
|
||||||
batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize;
|
batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize;
|
||||||
@ -986,30 +1103,44 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
|
|||||||
int numBlocksPerXForm = strideI / batchSize;
|
int numBlocksPerXForm = strideI / batchSize;
|
||||||
int numBlocks = numBlocksPerXForm;
|
int numBlocks = numBlocksPerXForm;
|
||||||
if (!vertical)
|
if (!vertical)
|
||||||
numBlocks *= BS;
|
{
|
||||||
|
numBlocks *= BS;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
numBlocks *= vertBS;
|
{
|
||||||
|
numBlocks *= vertBS;
|
||||||
|
}
|
||||||
|
|
||||||
kernelName = string("fft") + num2str(kCount);
|
kernelName = string("fft") + num2str(kCount);
|
||||||
*kInfo = (cl_fft_kernel_info *)malloc(sizeof(cl_fft_kernel_info));
|
*kInfo = (cl_fft_kernel_info *)malloc(sizeof(cl_fft_kernel_info));
|
||||||
(*kInfo)->kernel = nullptr;
|
(*kInfo)->kernel = nullptr;
|
||||||
if (R2 == 1)
|
if (R2 == 1)
|
||||||
(*kInfo)->lmem_size = 0;
|
{
|
||||||
|
(*kInfo)->lmem_size = 0;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (strideO == 1)
|
if (strideO == 1)
|
||||||
(*kInfo)->lmem_size = (radix + 1) * batchSize;
|
{
|
||||||
|
(*kInfo)->lmem_size = (radix + 1) * batchSize;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
(*kInfo)->lmem_size = threadsPerBlock * R1;
|
{
|
||||||
|
(*kInfo)->lmem_size = threadsPerBlock * R1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
(*kInfo)->num_workgroups = numBlocks;
|
(*kInfo)->num_workgroups = numBlocks;
|
||||||
(*kInfo)->num_xforms_per_workgroup = 1;
|
(*kInfo)->num_xforms_per_workgroup = 1;
|
||||||
(*kInfo)->num_workitems_per_workgroup = threadsPerBlock;
|
(*kInfo)->num_workitems_per_workgroup = threadsPerBlock;
|
||||||
(*kInfo)->dir = dir;
|
(*kInfo)->dir = dir;
|
||||||
if ((passNum == (numPasses - 1)) && (numPasses & 1))
|
if ((passNum == (numPasses - 1)) && (numPasses & 1))
|
||||||
(*kInfo)->in_place_possible = 1;
|
{
|
||||||
|
(*kInfo)->in_place_possible = 1;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
(*kInfo)->in_place_possible = 0;
|
{
|
||||||
|
(*kInfo)->in_place_possible = 0;
|
||||||
|
}
|
||||||
(*kInfo)->next = nullptr;
|
(*kInfo)->next = nullptr;
|
||||||
(*kInfo)->kernel_name = (char *)malloc(sizeof(char) * (kernelName.size() + 1));
|
(*kInfo)->kernel_name = (char *)malloc(sizeof(char) * (kernelName.size() + 1));
|
||||||
snprintf((*kInfo)->kernel_name, sizeof((*kInfo)->kernel_name), kernelName.c_str());
|
snprintf((*kInfo)->kernel_name, sizeof((*kInfo)->kernel_name), kernelName.c_str());
|
||||||
@ -1026,7 +1157,9 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
|
|||||||
localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
|
localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
|
||||||
int stride = radix * Rinit;
|
int stride = radix * Rinit;
|
||||||
for (i = 0; i < passNum; i++)
|
for (i = 0; i < passNum; i++)
|
||||||
stride *= radixArr[i];
|
{
|
||||||
|
stride *= radixArr[i];
|
||||||
|
}
|
||||||
localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int)log2(n * BS)) + string("));\n");
|
localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int)log2(n * BS)) + string("));\n");
|
||||||
localString += string("bNum = groupId;\n");
|
localString += string("bNum = groupId;\n");
|
||||||
}
|
}
|
||||||
@ -1041,7 +1174,9 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
|
|||||||
localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
|
localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
|
||||||
int stride = radix * Rinit;
|
int stride = radix * Rinit;
|
||||||
for (i = 0; i < passNum; i++)
|
for (i = 0; i < passNum; i++)
|
||||||
stride *= radixArr[i];
|
{
|
||||||
|
stride *= radixArr[i];
|
||||||
|
}
|
||||||
localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n");
|
localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n");
|
||||||
localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n");
|
localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n");
|
||||||
localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n");
|
localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n");
|
||||||
@ -1059,15 +1194,21 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
|
|||||||
localString += string("in_real += indexIn;\n");
|
localString += string("in_real += indexIn;\n");
|
||||||
localString += string("in_imag += indexIn;\n");
|
localString += string("in_imag += indexIn;\n");
|
||||||
for (j = 0; j < R1; j++)
|
for (j = 0; j < R1; j++)
|
||||||
localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j * gInInc * strideI) + string("];\n");
|
{
|
||||||
|
localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j * gInInc * strideI) + string("];\n");
|
||||||
|
}
|
||||||
for (j = 0; j < R1; j++)
|
for (j = 0; j < R1; j++)
|
||||||
localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j * gInInc * strideI) + string("];\n");
|
{
|
||||||
|
localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j * gInInc * strideI) + string("];\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
localString += string("in += indexIn;\n");
|
localString += string("in += indexIn;\n");
|
||||||
for (j = 0; j < R1; j++)
|
for (j = 0; j < R1; j++)
|
||||||
localString += string("a[") + num2str(j) + string("] = in[") + num2str(j * gInInc * strideI) + string("];\n");
|
{
|
||||||
|
localString += string("a[") + num2str(j) + string("] = in[") + num2str(j * gInInc * strideI) + string("];\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n");
|
localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n");
|
||||||
@ -1088,22 +1229,36 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
|
|||||||
localString += string("lMemStore = sMem + tid;\n");
|
localString += string("lMemStore = sMem + tid;\n");
|
||||||
localString += string("lMemLoad = sMem + indexIn;\n");
|
localString += string("lMemLoad = sMem + indexIn;\n");
|
||||||
for (k = 0; k < R1; k++)
|
for (k = 0; k < R1; k++)
|
||||||
localString += string("lMemStore[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
|
{
|
||||||
|
localString += string("lMemStore[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
|
||||||
|
}
|
||||||
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
|
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
|
||||||
for (k = 0; k < numIter; k++)
|
for (k = 0; k < numIter; k++)
|
||||||
for (t = 0; t < R2; t++)
|
{
|
||||||
localString += string("a[") + num2str(k * R2 + t) + string("].x = lMemLoad[") + num2str(t * batchSize + k * threadsPerBlock) + string("];\n");
|
for (t = 0; t < R2; t++)
|
||||||
|
{
|
||||||
|
localString += string("a[") + num2str(k * R2 + t) + string("].x = lMemLoad[") + num2str(t * batchSize + k * threadsPerBlock) + string("];\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
|
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
|
||||||
for (k = 0; k < R1; k++)
|
for (k = 0; k < R1; k++)
|
||||||
localString += string("lMemStore[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
|
{
|
||||||
|
localString += string("lMemStore[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
|
||||||
|
}
|
||||||
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
|
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
|
||||||
for (k = 0; k < numIter; k++)
|
for (k = 0; k < numIter; k++)
|
||||||
for (t = 0; t < R2; t++)
|
{
|
||||||
localString += string("a[") + num2str(k * R2 + t) + string("].y = lMemLoad[") + num2str(t * batchSize + k * threadsPerBlock) + string("];\n");
|
for (t = 0; t < R2; t++)
|
||||||
|
{
|
||||||
|
localString += string("a[") + num2str(k * R2 + t) + string("].y = lMemLoad[") + num2str(t * batchSize + k * threadsPerBlock) + string("];\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
|
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
|
||||||
|
|
||||||
for (j = 0; j < numIter; j++)
|
for (j = 0; j < numIter; j++)
|
||||||
localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j * R2) + string(", dir);\n");
|
{
|
||||||
|
localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j * R2) + string(", dir);\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// twiddle
|
// twiddle
|
||||||
@ -1127,40 +1282,60 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
|
|||||||
localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix + 1) + string(", tid & ") + num2str(radix - 1) + string(");\n");
|
localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix + 1) + string(", tid & ") + num2str(radix - 1) + string(");\n");
|
||||||
|
|
||||||
for (i = 0; i < R1 / R2; i++)
|
for (i = 0; i < R1 / R2; i++)
|
||||||
for (j = 0; j < R2; j++)
|
{
|
||||||
localString += string("lMemStore[ ") + num2str(i + j * R1) + string("] = a[") + num2str(i * R2 + j) + string("].x;\n");
|
for (j = 0; j < R2; j++)
|
||||||
|
{
|
||||||
|
localString += string("lMemStore[ ") + num2str(i + j * R1) + string("] = a[") + num2str(i * R2 + j) + string("].x;\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
|
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
|
||||||
if (threadsPerBlock >= radix)
|
if (threadsPerBlock >= radix)
|
||||||
{
|
{
|
||||||
for (i = 0; i < R1; i++)
|
for (i = 0; i < R1; i++)
|
||||||
localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * (radix + 1) * (threadsPerBlock / radix)) + string("];\n");
|
{
|
||||||
|
localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * (radix + 1) * (threadsPerBlock / radix)) + string("];\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
int innerIter = radix / threadsPerBlock;
|
int innerIter = radix / threadsPerBlock;
|
||||||
int outerIter = R1 / innerIter;
|
int outerIter = R1 / innerIter;
|
||||||
for (i = 0; i < outerIter; i++)
|
for (i = 0; i < outerIter; i++)
|
||||||
for (j = 0; j < innerIter; j++)
|
{
|
||||||
localString += string("a[") + num2str(i * innerIter + j) + string("].x = lMemLoad[") + num2str(j * threadsPerBlock + i * (radix + 1)) + string("];\n");
|
for (j = 0; j < innerIter; j++)
|
||||||
|
{
|
||||||
|
localString += string("a[") + num2str(i * innerIter + j) + string("].x = lMemLoad[") + num2str(j * threadsPerBlock + i * (radix + 1)) + string("];\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
|
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
|
||||||
|
|
||||||
for (i = 0; i < R1 / R2; i++)
|
for (i = 0; i < R1 / R2; i++)
|
||||||
for (j = 0; j < R2; j++)
|
{
|
||||||
localString += string("lMemStore[ ") + num2str(i + j * R1) + string("] = a[") + num2str(i * R2 + j) + string("].y;\n");
|
for (j = 0; j < R2; j++)
|
||||||
|
{
|
||||||
|
localString += string("lMemStore[ ") + num2str(i + j * R1) + string("] = a[") + num2str(i * R2 + j) + string("].y;\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
|
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
|
||||||
if (threadsPerBlock >= radix)
|
if (threadsPerBlock >= radix)
|
||||||
{
|
{
|
||||||
for (i = 0; i < R1; i++)
|
for (i = 0; i < R1; i++)
|
||||||
localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * (radix + 1) * (threadsPerBlock / radix)) + string("];\n");
|
{
|
||||||
|
localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * (radix + 1) * (threadsPerBlock / radix)) + string("];\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
int innerIter = radix / threadsPerBlock;
|
int innerIter = radix / threadsPerBlock;
|
||||||
int outerIter = R1 / innerIter;
|
int outerIter = R1 / innerIter;
|
||||||
for (i = 0; i < outerIter; i++)
|
for (i = 0; i < outerIter; i++)
|
||||||
for (j = 0; j < innerIter; j++)
|
{
|
||||||
localString += string("a[") + num2str(i * innerIter + j) + string("].y = lMemLoad[") + num2str(j * threadsPerBlock + i * (radix + 1)) + string("];\n");
|
for (j = 0; j < innerIter; j++)
|
||||||
|
{
|
||||||
|
localString += string("a[") + num2str(i * innerIter + j) + string("].y = lMemLoad[") + num2str(j * threadsPerBlock + i * (radix + 1)) + string("];\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
|
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
|
||||||
|
|
||||||
@ -1170,15 +1345,21 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
|
|||||||
localString += string("out_real += indexOut;\n");
|
localString += string("out_real += indexOut;\n");
|
||||||
localString += string("out_imag += indexOut;\n");
|
localString += string("out_imag += indexOut;\n");
|
||||||
for (k = 0; k < R1; k++)
|
for (k = 0; k < R1; k++)
|
||||||
localString += string("out_real[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
|
{
|
||||||
|
localString += string("out_real[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
|
||||||
|
}
|
||||||
for (k = 0; k < R1; k++)
|
for (k = 0; k < R1; k++)
|
||||||
localString += string("out_imag[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
|
{
|
||||||
|
localString += string("out_imag[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
localString += string("out += indexOut;\n");
|
localString += string("out += indexOut;\n");
|
||||||
for (k = 0; k < R1; k++)
|
for (k = 0; k < R1; k++)
|
||||||
localString += string("out[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n");
|
{
|
||||||
|
localString += string("out[") + num2str(k * threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -1189,22 +1370,30 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
|
|||||||
localString += string("out_real += indexOut;\n");
|
localString += string("out_real += indexOut;\n");
|
||||||
localString += string("out_imag += indexOut;\n");
|
localString += string("out_imag += indexOut;\n");
|
||||||
for (k = 0; k < R1; k++)
|
for (k = 0; k < R1; k++)
|
||||||
localString += string("out_real[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("].x;\n");
|
{
|
||||||
|
localString += string("out_real[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("].x;\n");
|
||||||
|
}
|
||||||
for (k = 0; k < R1; k++)
|
for (k = 0; k < R1; k++)
|
||||||
localString += string("out_imag[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("].y;\n");
|
{
|
||||||
|
localString += string("out_imag[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("].y;\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
localString += string("out += indexOut;\n");
|
localString += string("out += indexOut;\n");
|
||||||
for (k = 0; k < R1; k++)
|
for (k = 0; k < R1; k++)
|
||||||
localString += string("out[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("];\n");
|
{
|
||||||
|
localString += string("out[") + num2str(((k % R2) * R1 + (k / R2)) * strideO) + string("] = a[") + num2str(k) + string("];\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
insertHeader(*kernelString, kernelName, dataFormat);
|
insertHeader(*kernelString, kernelName, dataFormat);
|
||||||
*kernelString += string("{\n");
|
*kernelString += string("{\n");
|
||||||
if ((*kInfo)->lmem_size)
|
if ((*kInfo)->lmem_size)
|
||||||
*kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
|
{
|
||||||
|
*kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
|
||||||
|
}
|
||||||
*kernelString += localString;
|
*kernelString += localString;
|
||||||
*kernelString += string("}\n");
|
*kernelString += string("}\n");
|
||||||
|
|
||||||
@ -1214,6 +1403,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir)
|
void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir)
|
||||||
{
|
{
|
||||||
unsigned int radixArray[10];
|
unsigned int radixArray[10];
|
||||||
@ -1237,21 +1427,29 @@ void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir)
|
|||||||
{
|
{
|
||||||
getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix);
|
getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix);
|
||||||
if (plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup)
|
if (plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup)
|
||||||
createLocalMemfftKernelString(plan);
|
{
|
||||||
|
createLocalMemfftKernelString(plan);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
|
{
|
||||||
|
createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case cl_fft_kernel_y:
|
case cl_fft_kernel_y:
|
||||||
if (plan->n.y > 1)
|
if (plan->n.y > 1)
|
||||||
createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1);
|
{
|
||||||
|
createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case cl_fft_kernel_z:
|
case cl_fft_kernel_z:
|
||||||
if (plan->n.z > 1)
|
if (plan->n.z > 1)
|
||||||
createGlobalFFTKernelString(plan, plan->n.z, plan->n.x * plan->n.y, cl_fft_kernel_z, 1);
|
{
|
||||||
|
createGlobalFFTKernelString(plan, plan->n.z, plan->n.x * plan->n.y, cl_fft_kernel_z, 1);
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -31,9 +31,13 @@ getBlockConfigAndKernelString(cl_fft_plan *plan)
|
|||||||
*plan->kernel_string += baseKernels;
|
*plan->kernel_string += baseKernels;
|
||||||
|
|
||||||
if (plan->format == clFFT_SplitComplexFormat)
|
if (plan->format == clFFT_SplitComplexFormat)
|
||||||
*plan->kernel_string += twistKernelPlannar;
|
{
|
||||||
|
*plan->kernel_string += twistKernelPlannar;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
*plan->kernel_string += twistKernelInterleaved;
|
{
|
||||||
|
*plan->kernel_string += twistKernelInterleaved;
|
||||||
|
}
|
||||||
|
|
||||||
switch (plan->dim)
|
switch (plan->dim)
|
||||||
{
|
{
|
||||||
@ -72,13 +76,18 @@ deleteKernelInfo(cl_fft_kernel_info *kInfo)
|
|||||||
if (kInfo)
|
if (kInfo)
|
||||||
{
|
{
|
||||||
if (kInfo->kernel_name)
|
if (kInfo->kernel_name)
|
||||||
free(kInfo->kernel_name);
|
{
|
||||||
|
free(kInfo->kernel_name);
|
||||||
|
}
|
||||||
if (kInfo->kernel)
|
if (kInfo->kernel)
|
||||||
clReleaseKernel(kInfo->kernel);
|
{
|
||||||
|
clReleaseKernel(kInfo->kernel);
|
||||||
|
}
|
||||||
free(kInfo);
|
free(kInfo);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
destroy_plan(cl_fft_plan *Plan)
|
destroy_plan(cl_fft_plan *Plan)
|
||||||
{
|
{
|
||||||
@ -125,6 +134,7 @@ destroy_plan(cl_fft_plan *Plan)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
createKernelList(cl_fft_plan *plan)
|
createKernelList(cl_fft_plan *plan)
|
||||||
{
|
{
|
||||||
@ -136,21 +146,30 @@ createKernelList(cl_fft_plan *plan)
|
|||||||
{
|
{
|
||||||
kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err);
|
kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err);
|
||||||
if (!kernel_info->kernel || err != CL_SUCCESS)
|
if (!kernel_info->kernel || err != CL_SUCCESS)
|
||||||
return err;
|
{
|
||||||
|
return err;
|
||||||
|
}
|
||||||
kernel_info = kernel_info->next;
|
kernel_info = kernel_info->next;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (plan->format == clFFT_SplitComplexFormat)
|
if (plan->format == clFFT_SplitComplexFormat)
|
||||||
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err);
|
{
|
||||||
|
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err);
|
{
|
||||||
|
plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err);
|
||||||
|
}
|
||||||
|
|
||||||
if (!plan->twist_kernel || err)
|
if (!plan->twist_kernel || err)
|
||||||
return err;
|
{
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
return CL_SUCCESS;
|
return CL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices)
|
int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices)
|
||||||
{
|
{
|
||||||
int reg_needed = 0;
|
int reg_needed = 0;
|
||||||
@ -166,13 +185,19 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi
|
|||||||
{
|
{
|
||||||
err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, nullptr);
|
err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, nullptr);
|
||||||
if (err != CL_SUCCESS)
|
if (err != CL_SUCCESS)
|
||||||
return -1;
|
{
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
if (wg_size < kInfo->num_workitems_per_workgroup)
|
if (wg_size < kInfo->num_workitems_per_workgroup)
|
||||||
reg_needed |= 1;
|
{
|
||||||
|
reg_needed |= 1;
|
||||||
|
}
|
||||||
|
|
||||||
if (*max_wg_size > wg_size)
|
if (*max_wg_size > wg_size)
|
||||||
*max_wg_size = wg_size;
|
{
|
||||||
|
*max_wg_size = wg_size;
|
||||||
|
}
|
||||||
|
|
||||||
kInfo = kInfo->next;
|
kInfo = kInfo->next;
|
||||||
}
|
}
|
||||||
@ -181,6 +206,7 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi
|
|||||||
return reg_needed;
|
return reg_needed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#define ERR_MACRO(err) \
|
#define ERR_MACRO(err) \
|
||||||
{ \
|
{ \
|
||||||
if ((err) != CL_SUCCESS) \
|
if ((err) != CL_SUCCESS) \
|
||||||
@ -192,6 +218,7 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi
|
|||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
clFFT_Plan
|
clFFT_Plan
|
||||||
clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code)
|
clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code)
|
||||||
{
|
{
|
||||||
@ -326,11 +353,14 @@ patch_kernel_source:
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (error_code)
|
if (error_code)
|
||||||
*error_code = CL_SUCCESS;
|
{
|
||||||
|
*error_code = CL_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
return (clFFT_Plan)plan;
|
return (clFFT_Plan)plan;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void clFFT_DestroyPlan(clFFT_Plan plan)
|
void clFFT_DestroyPlan(clFFT_Plan plan)
|
||||||
{
|
{
|
||||||
auto *Plan = (cl_fft_plan *)plan;
|
auto *Plan = (cl_fft_plan *)plan;
|
||||||
@ -342,15 +372,20 @@ void clFFT_DestroyPlan(clFFT_Plan plan)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void clFFT_DumpPlan(clFFT_Plan Plan, FILE *file)
|
void clFFT_DumpPlan(clFFT_Plan Plan, FILE *file)
|
||||||
{
|
{
|
||||||
size_t gDim;
|
size_t gDim;
|
||||||
size_t lDim;
|
size_t lDim;
|
||||||
FILE *out;
|
FILE *out;
|
||||||
if (!file)
|
if (!file)
|
||||||
out = stdout;
|
{
|
||||||
|
out = stdout;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
out = file;
|
{
|
||||||
|
out = file;
|
||||||
|
}
|
||||||
|
|
||||||
auto *plan = (cl_fft_plan *)Plan;
|
auto *plan = (cl_fft_plan *)Plan;
|
||||||
cl_fft_kernel_info *kInfo = plan->kernel_info;
|
cl_fft_kernel_info *kInfo = plan->kernel_info;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user