// // File: fft_internal.h // // Version: <1.0> // // Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") // in consideration of your agreement to the following terms, and your use, // installation, modification or redistribution of this Apple software // constitutes acceptance of these terms. If you do not agree with these // terms, please do not use, install, modify or redistribute this Apple // software. // // In consideration of your agreement to abide by the following terms, and // subject to these terms, Apple grants you a personal, non - exclusive // license, under Apple's copyrights in this original Apple software ( the // "Apple Software" ), to use, reproduce, modify and redistribute the Apple // Software, with or without modifications, in source and / or binary forms; // provided that if you redistribute the Apple Software in its entirety and // without modifications, you must retain this notice and the following text // and disclaimers in all such redistributions of the Apple Software. Neither // the name, trademarks, service marks or logos of Apple Inc. may be used to // endorse or promote products derived from the Apple Software without specific // prior written permission from Apple. Except as expressly stated in this // notice, no other rights or licenses, express or implied, are granted by // Apple herein, including but not limited to any patent rights that may be // infringed by your derivative works or by other works in which the Apple // Software may be incorporated. // // The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO // WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED // WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A // PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION // ALONE OR IN COMBINATION WITH YOUR PRODUCTS. // // IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR // CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION // AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER // UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR // OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Copyright ( C ) 2008 Apple Inc. All Rights Reserved. // //////////////////////////////////////////////////////////////////////////////////////////////////// #ifndef __CLFFT_INTERNAL_H #define __CLFFT_INTERNAL_H #include "clFFT.h" #include #include #include using namespace std; typedef enum kernel_dir_t { cl_fft_kernel_x, cl_fft_kernel_y, cl_fft_kernel_z }cl_fft_kernel_dir; typedef struct kernel_info_t { cl_kernel kernel; char *kernel_name; unsigned lmem_size; unsigned num_workgroups; unsigned num_xforms_per_workgroup; unsigned num_workitems_per_workgroup; cl_fft_kernel_dir dir; int in_place_possible; kernel_info_t *next; }cl_fft_kernel_info; typedef struct { // context in which fft resources are created and kernels are executed cl_context context; // size of signal clFFT_Dim3 n; // dimension of transform ... must be either 1D, 2D or 3D clFFT_Dimension dim; // data format ... must be either interleaved or plannar clFFT_DataFormat format; // string containing kernel source. Generated at runtime based on // n, dim, format and other parameters string *kernel_string; // CL program containing source and kernel this particular // n, dim, data format cl_program program; // linked list of kernels which needs to be executed for this fft cl_fft_kernel_info *kernel_info; // number of kernels int num_kernels; // twist kernel for virtualizing fft of very large sizes that do not // fit in GPU global memory cl_kernel twist_kernel; // flag indicating if temporary intermediate buffer is needed or not. // this depends on fft kernels being executed and if transform is // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... // one that does not require global transpose do not need temporary buffer) // 2D 1024x1024 out-of-place fft however do require intermediate buffer. // If temp buffer is needed, its allocation is lazy i.e. its not allocated // until its needed cl_int temp_buffer_needed; // Batch size is runtime parameter and size of temporary buffer (if needed) // depends on batch size. Allocation of temporary buffer is lazy i.e. its // only created when needed. Once its created at first call of clFFT_Executexxx // it is not allocated next time if next time clFFT_Executexxx is called with // batch size different than the first call. last_batch_size caches the last // batch size with which this plan is used so that we dont keep allocating/deallocating // temp buffer if same batch size is used again and again. unsigned last_batch_size; // temporary buffer for interleaved plan cl_mem tempmemobj; // temporary buffer for planner plan. Only one of tempmemobj or // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending // data format of plan (plannar or interleaved) cl_mem tempmemobj_real, tempmemobj_imag; // Maximum size of signal for which local memory transposed based // fft is sufficient i.e. no global mem transpose (communication) // is needed unsigned max_localmem_fft_size; // Maximum work items per work group allowed. This, along with max_radix below controls // maximum local memory being used by fft kernels of this plan. Set to 256 by default unsigned max_work_item_per_workgroup; // Maximum base radix for local memory fft ... this controls the maximum register // space used by work items. Currently defaults to 16 unsigned max_radix; // Device depended parameter that tells how many work-items need to be read consecutive // values to make sure global memory access by work-items of a work-group result in // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16 unsigned min_mem_coalesce_width; // Number of local memory banks. This is used to geneate kernel with local memory // transposes with appropriate padding to avoid bank conflicts to local memory // e.g. on NVidia it is 16. unsigned num_local_mem_banks; }cl_fft_plan; void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir); #endif