mirror of https://github.com/gnss-sdr/gnss-sdr
synced 2025-01-15 11:45:47 +00:00

Merge branch 'next' of https://github.com/carlesfernandez/gnss-sdr into next

commit bf8ad2d37b
@@ -2340,7 +2340,7 @@ int valpos(rtk_t *rtk, const double *v, const double *R, const int *vflg,
             type = (vflg[i] >> 4) & 0xF;
             freq = vflg[i] & 0xF;
             stype = type == 0 ? 'L' : (type == 1 ? 'L' : 'C');
-            errmsg(rtk, "large residual (sat=%2d-%2d %s%d v=%6.3f sig=%.3f)\n",
+            errmsg(rtk, "large residual (sat=%2d-%2d %c%d v=%6.3f sig=%.3f)\n",
                 sat1, sat2, stype, freq + 1, v[i], std::sqrt(R[i + i * nv]));
         }
 #if 0 /* omitted v.2.4.0 */
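Note on the hunk above: stype holds a single character, so the old "%s%d" conversion passed a char where printf-style formatting expected a char*, which is undefined behavior; "%c%d" matches the argument. A minimal sketch of the difference, with hypothetical values:

    #include <cstdio>

    int main()
    {
        char stype = 'L';  // as set by the ternary in valpos()
        int freq = 0;
        // std::printf("%s%d\n", stype, freq + 1);  // wrong: "%s" expects a char*
        std::printf("%c%d\n", stype, freq + 1);  // prints "L1"
        return 0;
    }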
@@ -58,6 +58,7 @@
 #include <cerrno>
 #include <cstring>
 #include <fcntl.h>
+#include <inttypes.h>
 #include <netdb.h>
 #include <netinet/tcp.h>
 #include <string>
@@ -461,7 +462,7 @@ void closefile(file_t *file)
         {
             return;
         }
-    tracet(3, "closefile: fp=%d\n", file->fp);
+    tracet(3, "closefile: fpos=%" PRIdPTR "\n", file->fp);
     closefile_(file);
     free(file);
 }
@@ -472,7 +473,7 @@ void swapfile(file_t *file, gtime_t time, char *msg)
 {
     char openpath[MAXSTRPATH];
 
-    tracet(3, "swapfile: fp=%d time=%s\n", file->fp, time_str(time, 0));
+    tracet(3, "swapfile: fpos=%" PRIdPTR "\n time=%s\n", file->fp, time_str(time, 0));
 
     /* return if old swap file open */
     if (file->fp_tmp || file->fp_tag_tmp)
@@ -500,7 +501,7 @@ void swapfile(file_t *file, gtime_t time, char *msg)
 /* close old swap file -------------------------------------------------------*/
 void swapclose(file_t *file)
 {
-    tracet(3, "swapclose: fp_tmp=%d\n", file->fp_tmp);
+    tracet(3, "swapclose: fp_tmp=%" PRIdPTR "\n", file->fp_tmp);
     if (file->fp_tmp)
         {
             fclose(file->fp_tmp);
@@ -534,7 +535,7 @@ int readfile(file_t *file, unsigned char *buff, int nmax, char *msg)
         {
             return 0;
         }
-    tracet(4, "readfile: fp=%d nmax=%d\n", file->fp, nmax);
+    tracet(4, "readfile: fp=%zd nmax=%d\n", file->fp, nmax);
 
     if (file->fp == stdin)
         {
@@ -617,7 +618,7 @@ int readfile(file_t *file, unsigned char *buff, int nmax, char *msg)
                     sprintf(msg, "end");
                 }
         }
-    tracet(5, "readfile: fp=%d nr=%d fpos=%d\n", file->fp, nr, file->fpos);
+    tracet(5, "readfile: fpos=%" PRIdPTR "\n nr=%d fpos=%zd\n", file->fp, nr, file->fpos);
     return nr;
 }
 
@@ -640,7 +641,7 @@ int writefile(file_t *file, unsigned char *buff, int n, char *msg)
         {
             return 0;
         }
-    tracet(3, "writefile: fp=%d n=%d\n", file->fp, n);
+    tracet(3, "writefile: fpos=%" PRIdPTR "\n n=%d\n", file->fp, n);
 
     wtime = utc2gpst(timeget()); /* write time in gpst */
 
@@ -693,7 +694,7 @@ int writefile(file_t *file, unsigned char *buff, int n, char *msg)
                     fflush(file->fp_tag_tmp);
                 }
         }
-    tracet(5, "writefile: fp=%d ns=%d tick=%5d fpos=%d\n", file->fp, ns, tick, fpos);
+    tracet(5, "writefile: fpos=%" PRIdPTR "\n ns=%d tick=%5d fpos=%zd\n", file->fp, ns, tick, fpos);
 
     return static_cast<int>(ns);
 }
@@ -1497,7 +1498,7 @@ int reqntrip_s(ntrip_t *ntrip, char *msg)
             return 0;
         }
 
-    tracet(2, "reqntrip_s: send request state=%d ns=%d\n", ntrip->state, p - buff);
+    tracet(2, "reqntrip_s: send request state=%d ns=%" PRIdPTR "\n", ntrip->state, p - buff);
     tracet(5, "reqntrip_s: n=%d buff=\n%s\n", p - buff, buff);
     ntrip->state = 1;
     return 1;
@@ -1535,7 +1536,7 @@ int reqntrip_c(ntrip_t *ntrip, char *msg)
             return 0;
         }
 
-    tracet(2, "reqntrip_c: send request state=%d ns=%d\n", ntrip->state, p - buff);
+    tracet(2, "reqntrip_c: send request state=%d ns=%" PRIdPTR "\n", ntrip->state, p - buff);
     tracet(5, "reqntrip_c: n=%d buff=\n%s\n", p - buff, buff);
     ntrip->state = 1;
     return 1;
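Note: the recurring change in the hunks above (and the new #include <inttypes.h>) replaces "%d" with the width-correct format macros when printing pointer-sized quantities such as the pointer difference p - buff. PRIdPTR formally expects an intptr_t, so strictly portable code casts first; a minimal sketch of the idiom, with hypothetical values:

    #include <cinttypes>  // PRIdPTR
    #include <cstdint>    // intptr_t
    #include <cstdio>

    int main()
    {
        char buff[64];
        char *p = buff + 10;
        // p - buff is a ptrdiff_t; the cast keeps the format string portable
        // across 32- and 64-bit targets.
        std::printf("ns=%" PRIdPTR "\n", (intptr_t)(p - buff));
        return 0;
    }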
@@ -9,9 +9,9 @@
 #define NOMINMAX
 #endif
 
-//http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668
+// http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668
 #include < time.h >
-#include <windows.h> //I've omitted this line.
+#include <windows.h> // I've omitted this line.
 #if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
 #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64
 #else
@@ -51,7 +51,7 @@ static inline int gettimeofday(struct timeval *tv, struct timezone *tz)
     tmpres <<= 32;
     tmpres |= ft.dwLowDateTime;
 
-    /*converting file time to unix epoch*/
+    /* converting file time to unix epoch*/
     tmpres -= DELTA_EPOCH_IN_MICROSECS;
     tv->tv_sec = (long)(tmpres / 1000000UL);
     tv->tv_usec = (long)(tmpres % 1000000UL);
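For context, this gettimeofday shim assembles the two 32-bit halves of a Windows FILETIME (100-ns ticks since 1601-01-01) and subtracts the 1601-to-1970 offset; the division by 10 that turns ticks into microseconds is part of the usual shim but falls outside the hunk shown. A self-contained sketch of the same arithmetic, with hypothetical FILETIME values:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // Hypothetical FILETIME halves (100-ns ticks since 1601-01-01).
        uint32_t dwHighDateTime = 0x01DB6700u;
        uint32_t dwLowDateTime = 0x12345678u;

        uint64_t ticks = ((uint64_t)dwHighDateTime << 32) | dwLowDateTime;
        uint64_t micros = ticks / 10u;       // 100-ns ticks -> microseconds
        micros -= 11644473600000000ULL;      // 1601-01-01 -> 1970-01-01 offset
        long tv_sec = (long)(micros / 1000000UL);
        long tv_usec = (long)(micros % 1000000UL);
        std::printf("tv_sec=%ld tv_usec=%ld\n", tv_sec, tv_usec);
        return 0;
    }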
@@ -24,9 +24,9 @@
 #ifndef INCLUDED_LIBVOLK_GNSSSDR_COMMON_H
 #define INCLUDED_LIBVOLK_GNSSSDR_COMMON_H
 
-////////////////////////////////////////////////////////////////////////
+//
 // Cross-platform attribute macros not included in VOLK
-////////////////////////////////////////////////////////////////////////
+//
 #if defined __GNUC__
 #define __VOLK_GNSSSDR_PREFETCH(addr) __builtin_prefetch(addr)
 #define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) __builtin_prefetch(addr, rw, locality)
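These macros wrap GCC's __builtin_prefetch; the second argument is read/write intent (0 = read, 1 = write) and the third is temporal locality (0 = no reuse expected, 3 = high reuse). A hedged usage sketch that compiles to a no-op on non-GCC compilers:

    #include <cstddef>

    #if defined __GNUC__
    #define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) __builtin_prefetch(addr, rw, locality)
    #else
    #define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality)
    #endif

    float sum(const float* x, size_t n)
    {
        float acc = 0.0f;
        for (size_t i = 0; i < n; i++)
            {
                // Hint: x[i + 64] will be read (rw = 0), streamed once (locality 0).
                __VOLK_GNSSSDR_PREFETCH_LOCALITY(&x[i + 64], 0, 0);
                acc += x[i];
            }
        return acc;
    }

    int main() { float v[4] = {1, 2, 3, 4}; return sum(v, 4) == 10.0f ? 0 : 1; }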
@@ -41,9 +41,9 @@
 #ifndef INCLUDED_LIBVOLK_COMMON_H
 #define INCLUDED_LIBVOLK_COMMON_H
 
-////////////////////////////////////////////////////////////////////////
+//
 // Cross-platform attribute macros
-////////////////////////////////////////////////////////////////////////
+//
 #if defined __GNUC__
 #define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
 #define __VOLK_ATTR_UNUSED __attribute__((unused))
@@ -78,18 +78,18 @@
 #define __VOLK_VOLATILE __volatile__
 #endif
 
-////////////////////////////////////////////////////////////////////////
+//
 // Ignore annoying warnings in MSVC
-////////////////////////////////////////////////////////////////////////
+//
 #if defined(_MSC_VER)
-#pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data
-#pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2'
+#pragma warning(disable : 4244) // 'conversion' conversion from 'type1' to 'type2', possible loss of data
+#pragma warning(disable : 4305) // 'identifier' : truncation from 'type1' to 'type2'
 #endif
 
-////////////////////////////////////////////////////////////////////////
+//
 // C-linkage declaration macros
 // FIXME: due to the usage of complex.h, require gcc for c-linkage
-////////////////////////////////////////////////////////////////////////
+//
 #if defined(__cplusplus) && (__GNUC__)
 #define __VOLK_DECL_BEGIN \
     extern "C" \
@@ -100,19 +100,19 @@
 #define __VOLK_DECL_END
 #endif
 
-////////////////////////////////////////////////////////////////////////
+//
 // Define VOLK_API for library symbols
 // http://gcc.gnu.org/wiki/Visibility
-////////////////////////////////////////////////////////////////////////
+//
 #ifdef volk_gnsssdr_EXPORTS
 #define VOLK_API __VOLK_ATTR_EXPORT
 #else
 #define VOLK_API __VOLK_ATTR_IMPORT
 #endif
 
-////////////////////////////////////////////////////////////////////////
+//
 // The bit128 union used by some
-////////////////////////////////////////////////////////////////////////
+//
 #include <inttypes.h>
 
 #ifdef LV_HAVE_SSE
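The VOLK_API block above is the standard shared-library visibility pattern: export symbols while building the library (the build system defines volk_gnsssdr_EXPORTS), import them otherwise. A compilable sketch of the same structure in its GCC visibility flavor, using hypothetical names throughout:

    // Hypothetical names; same structure as the VOLK_API block above.
    #define MYLIB_EXPORT __attribute__((visibility("default")))
    #define MYLIB_IMPORT

    #ifdef mylib_EXPORTS  // set by the build system only while compiling the library
    #define MYLIB_API MYLIB_EXPORT
    #else
    #define MYLIB_API MYLIB_IMPORT
    #endif

    MYLIB_API int mylib_answer(void) { return 42; }

    int main() { return mylib_answer() == 42 ? 0 : 1; }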
@@ -84,7 +84,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_generic(int16_t** result
         {
             // resample code for current tap
             local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-            //Take into account that in multitap correlators, the shifts can be negative!
+            // Take into account that in multitap correlators, the shifts can be negative!
             if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1);
             local_code_chip_index = local_code_chip_index % code_length_chips;
             result[current_correlator_tap][n] = local_code[local_code_chip_index];
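The wrap logic that recurs in all of these kernels maps a possibly negative chip index into [0, code_length_chips): it first adds enough whole code periods to make the index non-negative (C's % keeps the sign of the dividend), then takes the modulo. A standalone sketch with hypothetical values:

    #include <cstdio>
    #include <cstdlib>

    int main()
    {
        const int code_length_chips = 1023;  // e.g., one GPS C/A code period
        int local_code_chip_index = -2050;   // hypothetical negative index from a multitap shift

        // Add (|index| / length + 1) code periods so the value becomes non-negative.
        if (local_code_chip_index < 0)
            {
                local_code_chip_index += code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1);
            }
        local_code_chip_index = local_code_chip_index % code_length_chips;
        std::printf("%d\n", local_code_chip_index);  // prints 1019
        return 0;
    }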
@@ -150,7 +150,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul
         {
             // resample code for current tap
             local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-            //Take into account that in multitap correlators, the shifts can be negative!
+            // Take into account that in multitap correlators, the shifts can be negative!
             if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
             local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
             _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -217,7 +217,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul
         {
             // resample code for current tap
             local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-            //Take into account that in multitap correlators, the shifts can be negative!
+            // Take into account that in multitap correlators, the shifts can be negative!
             if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
             local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
             _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -288,7 +288,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result,
         {
             // resample code for current tap
             local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-            //Take into account that in multitap correlators, the shifts can be negative!
+            // Take into account that in multitap correlators, the shifts can be negative!
             if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
             local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
             _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -359,7 +359,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result,
         {
             // resample code for current tap
             local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-            //Take into account that in multitap correlators, the shifts can be negative!
+            // Take into account that in multitap correlators, the shifts can be negative!
             if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
             local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
             _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -437,7 +437,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result,
         {
             // resample code for current tap
             local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-            //Take into account that in multitap correlators, the shifts can be negative!
+            // Take into account that in multitap correlators, the shifts can be negative!
             if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
             local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
             _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -515,7 +515,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
         {
             // resample code for current tap
             local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-            //Take into account that in multitap correlators, the shifts can be negative!
+            // Take into account that in multitap correlators, the shifts can be negative!
            if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
             local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
             _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -568,7 +568,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
     aux = vmulq_f32(code_phase_step_chips_reg, indexn);
     aux = vaddq_f32(aux, aux2);
 
-    //floor
+    // floor
     i = vcvtq_s32_f32(aux);
     fi = vcvtq_f32_s32(i);
     igx = vcgtq_f32(fi, aux);
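ARMv7 NEON has no vector floor, so the kernel emulates it: vcvtq_s32_f32 truncates toward zero, which rounds up for negative inputs; round-tripping the result and comparing it against the original flags exactly those lanes so 1 can be subtracted. A scalar model of the trick, assuming round-to-zero conversion:

    #include <cstdio>

    // Scalar model of the NEON floor emulation used in the kernel above.
    int floor_via_truncation(float x)
    {
        int i = (int)x;       // truncates toward zero (vcvtq_s32_f32)
        float fi = (float)i;  // convert back (vcvtq_f32_s32)
        if (fi > x)           // true only for negative non-integers (vcgtq_f32)
            {
                i -= 1;       // correct the lane down to floor(x)
            }
        return i;
    }

    int main()
    {
        std::printf("%d %d %d\n", floor_via_truncation(2.7f),
            floor_via_truncation(-2.7f), floor_via_truncation(-3.0f));
        // prints: 2 -3 -3
        return 0;
    }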
@@ -600,7 +600,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
             __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
             // resample code for current tap
             local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-            //Take into account that in multitap correlators, the shifts can be negative!
+            // Take into account that in multitap correlators, the shifts can be negative!
             if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
             local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
             _result[current_correlator_tap][n] = local_code[local_code_chip_index_];

(File diff suppressed because it is too large.)

@@ -131,36 +131,6 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3(lv_
 #endif // SSE3
 
 
-//#ifdef LV_HAVE_SSE3
-//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
-//{
-//// phases must be normalized. Phase rotator expects a complex exponential input!
-//float rem_carrier_phase_in_rad = 0.345;
-//float phase_step_rad = 0.1;
-//lv_32fc_t phase[1];
-//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
-//lv_32fc_t phase_inc[1];
-//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
-//int n;
-//int num_a_vectors = 3;
-//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
-//for(n = 0; n < num_a_vectors; n++)
-//{
-//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
-//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
-//}
-
-//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
-
-//for(n = 0; n < num_a_vectors; n++)
-//{
-//volk_gnsssdr_free(in_a[n]);
-//}
-//volk_gnsssdr_free(in_a);
-//}
-
-//#endif // SSE3
-
-
 #ifdef LV_HAVE_SSE3
 static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
@@ -224,36 +194,6 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2(lv_
 #endif // AVX2
 
 
-//#ifdef LV_HAVE_AVX2
-//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
-//{
-//// phases must be normalized. Phase rotator expects a complex exponential input!
-//float rem_carrier_phase_in_rad = 0.345;
-//float phase_step_rad = 0.1;
-//lv_32fc_t phase[1];
-//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
-//lv_32fc_t phase_inc[1];
-//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
-//int n;
-//int num_a_vectors = 3;
-//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
-//for(n = 0; n < num_a_vectors; n++)
-//{
-//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
-//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
-//}
-
-//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
-
-//for(n = 0; n < num_a_vectors; n++)
-//{
-//volk_gnsssdr_free(in_a[n]);
-//}
-//volk_gnsssdr_free(in_a);
-//}
-
-//#endif // AVX2
-
-
 #ifdef LV_HAVE_AVX2
 static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
@@ -286,96 +226,4 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_
 #endif // AVX2
 
 
-//#ifdef LV_HAVE_AVX2
-//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
-//{
-//// phases must be normalized. Phase rotator expects a complex exponential input!
-//float rem_carrier_phase_in_rad = 0.345;
-//float phase_step_rad = 0.1;
-//lv_32fc_t phase[1];
-//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
-//lv_32fc_t phase_inc[1];
-//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
-//int n;
-//int num_a_vectors = 3;
-//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
-//for(n = 0; n < num_a_vectors; n++)
-//{
-//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
-//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
-//}
-
-//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
-
-//for(n = 0; n < num_a_vectors; n++)
-//{
-//volk_gnsssdr_free(in_a[n]);
-//}
-//volk_gnsssdr_free(in_a);
-//}
-
-//#endif // AVX2
-
-
-//#ifdef LV_HAVE_NEONV7
-//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
-//{
-//// phases must be normalized. Phase rotator expects a complex exponential input!
-//float rem_carrier_phase_in_rad = 0.345;
-//float phase_step_rad = 0.1;
-//lv_32fc_t phase[1];
-//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
-//lv_32fc_t phase_inc[1];
-//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
-//int n;
-//int num_a_vectors = 3;
-//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
-//for(n = 0; n < num_a_vectors; n++)
-//{
-//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
-//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
-//}
-
-//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
-
-//for(n = 0; n < num_a_vectors; n++)
-//{
-//volk_gnsssdr_free(in_a[n]);
-//}
-//volk_gnsssdr_free(in_a);
-//}
-
-//#endif // NEON
-
-
-//#ifdef LV_HAVE_NEONV7
-//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
-//{
-//// phases must be normalized. Phase rotator expects a complex exponential input!
-//float rem_carrier_phase_in_rad = 0.345;
-//float phase_step_rad = 0.1;
-//lv_32fc_t phase[1];
-//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
-//lv_32fc_t phase_inc[1];
-//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
-//int n;
-//int num_a_vectors = 3;
-//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
-//for(n = 0; n < num_a_vectors; n++)
-//{
-//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
-//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
-//}
-
-//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
-
-//for(n = 0; n < num_a_vectors; n++)
-//{
-//volk_gnsssdr_free(in_a[n]);
-//}
-//volk_gnsssdr_free(in_a);
-//}
-
-//#endif // NEON
-
 #endif // INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H
@@ -200,34 +200,4 @@ static inline void volk_gnsssdr_16ic_conjugate_16ic_u_avx2(lv_16sc_t* cVector, c
 }
 #endif /* LV_HAVE_AVX2 */
 
-//
-//
-//#ifdef LV_HAVE_NEONV7
-//#include <arm_neon.h>
-//
-//static inline void volk_gnsssdr_16ic_conjugate_16ic_neon(lv_16sc_t* cVector, const lv_16sc_t* aVector, unsigned int num_points)
-//{
-//    const unsigned int sse_iters = num_points / 4;
-//    unsigned int i;
-//    lv_16sc_t* c = cVector;
-//    const lv_16sc_t* a = aVector;
-//    int16x4x2_t a_val;
-//
-//    for (i = 0; i < sse_iters; ++i)
-//        {
-//            a_val = vld2_s16((const int16_t*)a);
-//            __VOLK_GNSSSDR_PREFETCH(a + 4);
-//            a_val.val[1] = vneg_s16(a_val.val[1]);
-//            vst2_s16((int16_t*)c, a_val);
-//            a += 4;
-//            c += 4;
-//        }
-//
-//    for (i = sse_iters * 4; i < num_points; ++i)
-//        {
-//            *c++ = lv_conj(*a++);
-//        }
-//}
-//#endif /* LV_HAVE_NEONV7 */
-
 #endif /* INCLUDED_volk_gnsssdr_16ic_conjugate_16ic_H */
@@ -72,7 +72,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu
 {
     int local_code_chip_index;
     unsigned int n;
-    //fesetround(FE_TONEAREST);
+    // fesetround(FE_TONEAREST);
     for (n = 0; n < num_output_samples; n++)
         {
             // resample code for current tap
@@ -83,7 +83,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu
         }
 }
 
-#endif /*LV_HAVE_GENERIC*/
+#endif /* LV_HAVE_GENERIC */
 
 
 #ifdef LV_HAVE_SSE2
@@ -91,7 +91,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu
 
 static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float)
 {
-    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);  //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
+    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);  // _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
     unsigned int number;
     const unsigned int quarterPoints = num_output_samples / 4;
 
@@ -104,8 +104,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul
     __m128 _code_phase_out, _code_phase_out_with_offset;
     rem_code_phase_chips = rem_code_phase_chips - 0.5f;
 
-    _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips);  //load float to all four float values in m128 register
-    _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips);  //load float to all four float values in m128 register
+    _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips);  // load float to all four float values in m128 register
+    _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips);  // load float to all four float values in m128 register
     __VOLK_ATTR_ALIGNED(16)
     int four_times_code_length_chips_minus1[4];
     four_times_code_length_chips_minus1[0] = code_length_chips - 1;
@@ -120,8 +120,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul
     four_times_code_length_chips[2] = code_length_chips;
     four_times_code_length_chips[3] = code_length_chips;
 
-    _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips);  //load float to all four float values in m128 register
-    _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1);  //load float to all four float values in m128 register
+    _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips);  // load float to all four float values in m128 register
+    _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1);  // load float to all four float values in m128 register
 
     __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
 
@@ -136,21 +136,21 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul
 
     for (number = 0; number < quarterPoints; number++)
         {
-            _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index);  //compute the code phase point with the phase step
-            _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase);  //add the phase offset
-            _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset);  //convert to integer
+            _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index);  // compute the code phase point with the phase step
+            _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase);  // add the phase offset
+            _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset);  // convert to integer
 
-            negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero);  //test for negative values
-            _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips);  //the negative values branch
+            negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero);  // test for negative values
+            _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips);  // the negative values branch
             _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
 
-            overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1);  //test for overflow values
-            _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips);  //the negative values branch
+            overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1);  // test for overflow values
+            _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips);  // the negative values branch
             _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
 
             _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back
 
-            //todo: optimize the local code lookup table with intrinsics, if possible
+            // todo: optimize the local code lookup table with intrinsics, if possible
             *_result++ = local_code[local_code_chip_index[0]];
             *_result++ = local_code[local_code_chip_index[1]];
             *_result++ = local_code[local_code_chip_index[2]];
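The xor/and pattern in this hunk is a branchless SIMD select: per lane, mask ? b : a is computed as a ^ (mask & (b ^ a)), with the all-ones comparison mask choosing the wrapped index. A scalar sketch of the identity:

    #include <cstdint>
    #include <cstdio>

    // Branchless select: returns b where mask is all ones, a where mask is zero.
    // This is the scalar identity behind the _mm_xor_si128/_mm_and_si128 blend.
    int32_t select(int32_t mask, int32_t a, int32_t b)
    {
        return a ^ (mask & (b ^ a));
    }

    int main()
    {
        const int32_t all_ones = -1;  // a "true" comparison lane (0xFFFFFFFF)
        std::printf("%d %d\n", select(all_ones, 7, 42), select(0, 7, 42));
        // prints: 42 7
        return 0;
    }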
@@ -176,7 +176,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul
 
 static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float)
 {
-    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);  //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
+    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);  // _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
     unsigned int number;
     const unsigned int quarterPoints = num_output_samples / 4;
 
@@ -189,8 +189,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul
     __m128 _code_phase_out, _code_phase_out_with_offset;
     rem_code_phase_chips = rem_code_phase_chips - 0.5f;
 
-    _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips);  //load float to all four float values in m128 register
-    _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips);  //load float to all four float values in m128 register
+    _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips);  // load float to all four float values in m128 register
+    _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips);  // load float to all four float values in m128 register
     __VOLK_ATTR_ALIGNED(16)
     int four_times_code_length_chips_minus1[4];
     four_times_code_length_chips_minus1[0] = code_length_chips - 1;
@@ -205,8 +205,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul
     four_times_code_length_chips[2] = code_length_chips;
     four_times_code_length_chips[3] = code_length_chips;
 
-    _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips);  //load float to all four float values in m128 register
-    _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1);  //load float to all four float values in m128 register
+    _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips);  // load float to all four float values in m128 register
+    _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1);  // load float to all four float values in m128 register
 
     __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
 
@@ -221,21 +221,21 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul
 
     for (number = 0; number < quarterPoints; number++)
         {
-            _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index);  //compute the code phase point with the phase step
-            _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase);  //add the phase offset
-            _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset);  //convert to integer
+            _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index);  // compute the code phase point with the phase step
+            _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase);  // add the phase offset
+            _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset);  // convert to integer
 
-            negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero);  //test for negative values
-            _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips);  //the negative values branch
+            negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero);  // test for negative values
+            _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips);  // the negative values branch
             _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
 
-            overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1);  //test for overflow values
-            _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips);  //the negative values branch
+            overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1);  // test for overflow values
+            _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips);  // the negative values branch
             _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
 
             _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back
 
-            //todo: optimize the local code lookup table with intrinsics, if possible
+            // todo: optimize the local code lookup table with intrinsics, if possible
             *_result++ = local_code[local_code_chip_index[0]];
             *_result++ = local_code[local_code_chip_index[1]];
             *_result++ = local_code[local_code_chip_index[2]];
@@ -275,8 +275,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
     rem_code_phase_chips = rem_code_phase_chips - 0.5f;
     float32x4_t sign, PlusHalf, Round;
 
-    _rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips);  //load float to all four float values in m128 register
-    _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips);  //load float to all four float values in m128 register
+    _rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips);  // load float to all four float values in m128 register
+    _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips);  // load float to all four float values in m128 register
     __VOLK_ATTR_ALIGNED(16)
     int four_times_code_length_chips_minus1[4];
     four_times_code_length_chips_minus1[0] = code_length_chips - 1;
@@ -291,8 +291,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
     four_times_code_length_chips[2] = code_length_chips;
     four_times_code_length_chips[3] = code_length_chips;
 
-    _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips);  //load float to all four float values in m128 register
-    _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1);  //load float to all four float values in m128 register
+    _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips);  // load float to all four float values in m128 register
+    _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1);  // load float to all four float values in m128 register
 
     int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
     uint32x4_t negative_indexes, overflow_indexes;
@@ -307,24 +307,24 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
 
     for (number = 0; number < quarterPoints; number++)
         {
-            _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index);  //compute the code phase point with the phase step
-            _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase);  //add the phase offset
+            _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index);  // compute the code phase point with the phase step
+            _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase);  // add the phase offset
             sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31)));
             PlusHalf = vaddq_f32(_code_phase_out_with_offset, half);
             Round = vsubq_f32(PlusHalf, sign);
             _code_phase_out_int = vcvtq_s32_f32(Round);
 
-            negative_indexes = vcltq_s32(_code_phase_out_int, zero);  //test for negative values
-            _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips);  //the negative values branch
+            negative_indexes = vcltq_s32(_code_phase_out_int, zero);  // test for negative values
+            _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips);  // the negative values branch
             _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int)));
 
-            overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1);  //test for overflow values
-            _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips);  //the negative values branch
+            overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1);  // test for overflow values
+            _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips);  // the negative values branch
             _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg)));
 
             vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back
 
-            //todo: optimize the local code lookup table with intrinsics, if possible
+            // todo: optimize the local code lookup table with intrinsics, if possible
             *_result++ = local_code[local_code_chip_index[0]];
             *_result++ = local_code[local_code_chip_index[1]];
             *_result++ = local_code[local_code_chip_index[2]];
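The sign/PlusHalf/Round sequence above emulates round-half-away-from-zero on NEON: the lane's sign bit (0 or 1 after the 31-bit shift) is subtracted from x + 0.5 before the truncating conversion, so negative values round symmetrically. A scalar model, assuming truncation toward zero on conversion:

    #include <cstdio>

    // Scalar model of the NEON round-to-nearest emulation in the kernel above.
    int round_nearest(float x)
    {
        float sign = (x < 0.0f) ? 1.0f : 0.0f;  // the vshrq_n_u32(..., 31) lane
        float plus_half = x + 0.5f;
        float rounded = plus_half - sign;       // shifts negatives down by one
        return (int)rounded;                    // truncation (vcvtq_s32_f32)
    }

    int main()
    {
        std::printf("%d %d %d %d\n", round_nearest(2.3f), round_nearest(2.7f),
            round_nearest(-2.3f), round_nearest(-2.7f));
        // prints: 2 3 -2 -3
        return 0;
    }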
@@ -344,4 +344,4 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
 
 #endif /* LV_HAVE_NEONV7 */
 
-#endif /*INCLUDED_volk_gnsssdr_16ic_resampler_fast_16ic_H*/
+#endif /* INCLUDED_volk_gnsssdr_16ic_resampler_fast_16ic_H */
@@ -79,13 +79,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic(lv_16sc_t* ou
             // Regenerate phase
             if (i % 512 == 0)
                 {
-                    //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
+                    // printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
 #ifdef __cplusplus
                     (*phase) /= std::abs((*phase));
 #else
                     (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
 #endif
-                    //printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
+                    // printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
                 }
         }
 }
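The periodic renormalization in this rotator compensates for accumulated floating-point drift: repeatedly multiplying by the unit phasor phase_inc lets |phase| wander away from 1, so every 512 samples the accumulator is divided by its modulus. A minimal sketch of the idea:

    #include <complex>
    #include <cstdio>

    int main()
    {
        const std::complex<float> phase_inc = std::polar(1.0f, 0.1f);  // unit phasor
        std::complex<float> phase(1.0f, 0.0f);

        for (int i = 0; i < 100000; i++)
            {
                phase *= phase_inc;  // recursive rotation accumulates rounding error
                if (i % 512 == 0)
                    {
                        phase /= std::abs(phase);  // pull the modulus back to 1
                    }
            }
        std::printf("modulus after 100000 steps: %f\n", std::abs(phase));
        return 0;
    }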
@@ -112,13 +112,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic_reload(lv_16s
                     (*phase) *= phase_inc;
                 }
             // Regenerate phase
-            //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
+            // printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
 #ifdef __cplusplus
             (*phase) /= std::abs((*phase));
 #else
             (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
 #endif
-            //printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
+            // printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
         }
     for (j = 0; j < num_points % ROTATOR_RELOAD; j++)
         {
@@ -161,8 +161,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
 
     for (number = 0; number < sse_iters; number++)
         {
-            a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
-            //complex 32fc multiplication b=a*two_phase_acc_reg
+            a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+            // complex 32fc multiplication b=a*two_phase_acc_reg
             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr
             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di
             tmp1 = _mm_mul_ps(a, yl);  // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
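The moveldup/movehdup/addsub triad here is the classic SSE3 complex multiply for interleaved (re, im) pairs: tmp1 carries the products with the real parts, tmp2 the products with the imaginary parts against the swapped operand, and _mm_addsub_ps fuses the subtract-real/add-imag step. A self-contained sketch multiplying two complex floats per register (compile with -msse3):

    #include <pmmintrin.h>  // SSE3
    #include <cstdio>

    int main()
    {
        // Two complex operands per register, interleaved as re0, im0, re1, im1.
        __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);  // (1+2i), (3+4i)
        __m128 b = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);  // (5+6i), (7+8i)

        __m128 yl = _mm_moveldup_ps(b);            // cr, cr, dr, dr
        __m128 yh = _mm_movehdup_ps(b);            // ci, ci, di, di
        __m128 tmp1 = _mm_mul_ps(a, yl);           // ar*cr, ai*cr, br*dr, bi*dr
        __m128 tmp3 = _mm_shuffle_ps(a, a, 0xB1);  // swap re/im: ai, ar, bi, br
        __m128 tmp2 = _mm_mul_ps(tmp3, yh);        // ai*ci, ar*ci, bi*di, br*di
        __m128 c = _mm_addsub_ps(tmp1, tmp2);      // ar*cr-ai*ci, ai*cr+ar*ci, ...

        float out[4];
        _mm_storeu_ps(out, c);
        std::printf("(%g,%g) (%g,%g)\n", out[0], out[1], out[2], out[3]);
        // prints: (-7,16) (-11,52)
        return 0;
    }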
@ -171,7 +171,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
|
|||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
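Note: the yl/yh/addsub sequence in these hunks is the standard SSE3 recipe for multiplying packed complex floats. A self-contained sketch of the idiom; the function name and layout comments are mine:

#include <pmmintrin.h> /* SSE3 */

/* Multiply two __m128 registers holding two complex floats each,
   interleaved as [ar, ai, br, bi] * [cr, ci, dr, di]. */
static inline __m128 complex_mul_sse3(__m128 x, __m128 y)
{
    __m128 yl = _mm_moveldup_ps(y);         /* cr, cr, dr, dr */
    __m128 yh = _mm_movehdup_ps(y);         /* ci, ci, di, di */
    __m128 t1 = _mm_mul_ps(x, yl);          /* ar*cr, ai*cr, br*dr, bi*dr */
    __m128 xs = _mm_shuffle_ps(x, x, 0xB1); /* ai, ar, bi, br */
    __m128 t2 = _mm_mul_ps(xs, yh);         /* ai*ci, ar*ci, bi*di, br*di */
    return _mm_addsub_ps(t1, t2);           /* ar*cr-ai*ci, ai*cr+ar*ci, ... */
}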
@@ -179,11 +179,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
 tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

-//next two samples
+// next two samples
 _in += 2;
-a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in + 8);
-//complex 32fc multiplication b=a*two_phase_acc_reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -192,7 +192,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
 b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -214,7 +214,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
 two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
 }

-//next two samples
+// next two samples
 _in += 2;
 _out += 4;
 }
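Note: the _mm_div_ps line above is the vector counterpart of the scalar renormalization: the register holding two complex phases is divided by their moduli. One way to build that divisor, assuming SSE3; the exact shuffle bookkeeping inside the kernel may differ from this sketch:

#include <pmmintrin.h>

/* Renormalize a register holding two complex floats [ar, ai, br, bi]
   so that both regain modulus 1. */
static inline __m128 renormalize_two_phases(__m128 acc)
{
    __m128 sq = _mm_mul_ps(acc, acc);          /* ar^2, ai^2, br^2, bi^2 */
    __m128 sum = _mm_hadd_ps(sq, sq);          /* |a|^2, |b|^2, |a|^2, |b|^2 */
    __m128 mag2 = _mm_unpacklo_ps(sum, sum);   /* |a|^2, |a|^2, |b|^2, |b|^2 */
    return _mm_div_ps(acc, _mm_sqrt_ps(mag2)); /* divide by the moduli */
}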
@@ -268,8 +268,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
 {
 for (j = 0; j < ROTATOR_RELOAD; j++)
 {
-a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
-//complex 32fc multiplication b=a*two_phase_acc_reg
+a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -278,7 +278,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
 b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -286,11 +286,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
 tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

-//next two samples
+// next two samples
 _in += 2;
-a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in + 8);
-//complex 32fc multiplication b=a*two_phase_acc_reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -299,7 +299,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
 b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -311,7 +311,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
 result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
 _mm_store_si128((__m128i*)_out, result);

-//next two samples
+// next two samples
 _in += 2;
 _out += 4;
 }

@@ -325,8 +325,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc

 for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
 {
-a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
-//complex 32fc multiplication b=a*two_phase_acc_reg
+a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -335,7 +335,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
 b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -343,11 +343,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
 tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

-//next two samples
+// next two samples
 _in += 2;
-a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in + 8);
-//complex 32fc multiplication b=a*two_phase_acc_reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -356,7 +356,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
 b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -368,7 +368,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
 result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
 _mm_store_si128((__m128i*)_out, result);

-//next two samples
+// next two samples
 _in += 2;
 _out += 4;
 }
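Note: these hunks end each iteration by converting the rotated floats back to the 16-bit integer complex type: _mm_cvtps_epi32 rounds float32 to int32, and _mm_packs_epi32 narrows int32 to int16 with saturation. A minimal sketch of that conversion pair (the helper name is mine):

#include <emmintrin.h> /* SSE2 */

/* Convert eight float results (two registers of four) into eight
   saturated int16 values: round to int32, then pack with saturation. */
static inline __m128i pack_floats_to_16ic(__m128 lo, __m128 hi)
{
    __m128i c1 = _mm_cvtps_epi32(lo); /* 4 x float -> 4 x int32, round-to-nearest by default */
    __m128i c2 = _mm_cvtps_epi32(hi);
    return _mm_packs_epi32(c1, c2);   /* 8 x int32 -> 8 x int16, clamped at the int16 limits */
}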
@@ -418,8 +418,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out

 for (number = 0; number < sse_iters; number++)
 {
-a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
-//complex 32fc multiplication b=a*two_phase_acc_reg
+a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -428,7 +428,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
 b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -436,11 +436,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
 tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

-//next two samples
+// next two samples
 _in += 2;
-a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in + 8);
-//complex 32fc multiplication b=a*two_phase_acc_reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -449,7 +449,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
 b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -471,7 +471,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
 two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
 }

-//next two samples
+// next two samples
 _in += 2;
 _out += 4;
 }
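Note: the only difference between the _a_sse3 and _u_sse3 paths is the alignment contract. The aligned variants use _mm_store_si128, which requires 16-byte aligned pointers; the unaligned variants use _mm_storeu_si128. A small sketch of the distinction; the helper and its flag are illustrative:

#include <emmintrin.h>

/* aligned != 0 promises ptr % 16 == 0; the aligned store is the faster
   path on older hardware but faults on a misaligned address. */
static inline void store_result(void* ptr, __m128i v, int aligned)
{
    if (aligned)
        {
            _mm_store_si128((__m128i*)ptr, v);
        }
    else
        {
            _mm_storeu_si128((__m128i*)ptr, v);
        }
}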
@@ -525,8 +525,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
 {
 for (j = 0; j < ROTATOR_RELOAD; j++)
 {
-a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
-//complex 32fc multiplication b=a*two_phase_acc_reg
+a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -535,7 +535,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
 b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -543,11 +543,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
 tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

-//next two samples
+// next two samples
 _in += 2;
-a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in + 8);
-//complex 32fc multiplication b=a*two_phase_acc_reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -556,7 +556,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
 b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -568,7 +568,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
 result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
 _mm_storeu_si128((__m128i*)_out, result);

-//next two samples
+// next two samples
 _in += 2;
 _out += 4;
 }

@@ -582,8 +582,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc

 for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
 {
-a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
-//complex 32fc multiplication b=a*two_phase_acc_reg
+a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -592,7 +592,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
 b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -600,11 +600,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
 tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

-//next two samples
+// next two samples
 _in += 2;
-a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in + 8);
-//complex 32fc multiplication b=a*two_phase_acc_reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -613,7 +613,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
 b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

@@ -625,7 +625,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
 result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
 _mm_storeu_si128((__m128i*)_out, result);

-//next two samples
+// next two samples
 _in += 2;
 _out += 4;
 }
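Note: the _reload kernels process the input in chunks of ROTATOR_RELOAD iterations and renormalize the phase accumulator once per chunk, then mop up the remainder. A scalar sketch of that loop shape, with illustrative names:

#include <complex.h>
#include <math.h>

#define ROTATOR_RELOAD 512

static void rotate_chunked(float complex *phase, float complex inc, unsigned int num_points)
{
    unsigned int n, j;
    for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
        {
            for (j = 0; j < ROTATOR_RELOAD; j++)
                {
                    (*phase) *= inc; /* hot loop: no normalization */
                }
            (*phase) /= hypotf(crealf(*phase), cimagf(*phase)); /* once per chunk */
        }
    for (j = 0; j < num_points % ROTATOR_RELOAD; j++)
        {
            (*phase) *= inc; /* leftover samples */
        }
}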
@@ -746,9 +746,9 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
 // Regenerate phase
 if ((i % 512) == 0)
 {
-//printf("Computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0]))));
+// printf("Computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0]))));
 phase_est = arg_phase0 + (i + 1) * 4 * arg_phase_inc;
-//printf("Estimated phase: %f\n\n", cos(phase_est));
+// printf("Estimated phase: %f\n\n", cos(phase_est));

 *phase = lv_cmake(cos(phase_est), sin(phase_est));
 phase2 = (lv_32fc_t)(*phase) * phase_inc;

@@ -887,9 +887,9 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
 _out += 4;
 }
 // Regenerate phase
-//printf("Computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0]))));
+// printf("Computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0]))));
 phase_est = arg_phase0 + (n + 1) * ROTATOR_RELOAD * 4 * arg_phase_inc;
-//printf("Estimated phase: %f\n\n", cos(phase_est));
+// printf("Estimated phase: %f\n\n", cos(phase_est));
 *phase = lv_cmake(cos(phase_est), sin(phase_est));
 phase2 = (lv_32fc_t)(*phase) * phase_inc;
 phase3 = phase2 * phase_inc;
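Note: the NEON kernels regenerate phase differently from the SSE paths. Instead of normalizing the accumulated product, they recompute the phase exactly from the starting angle and the number of steps taken. A minimal sketch of that idea, assuming arg_phase0 and arg_phase_inc were precomputed as angles (e.g. with cargf):

#include <complex.h>
#include <math.h>

/* Re-derive the exact phase after k rotation steps, putting the
   accumulator back on the unit circle with no cumulative error. */
static float complex regenerate_phase(float arg_phase0, float arg_phase_inc, unsigned int k)
{
    float phase_est = arg_phase0 + (float)k * arg_phase_inc;
    return cosf(phase_est) + sinf(phase_est) * I;
}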
@@ -28,7 +28,7 @@
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
-* along with GNSS-SDR. If not, see <https://www.gnu.org/licenses/>.
+* along with GNSS-SDR. If not, see <https:// www.gnu.org/licenses/>.
 *
 * -------------------------------------------------------------------------
 */
@@ -77,7 +77,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
 }
 }

-#endif /*LV_HAVE_GENERIC*/
+#endif /* LV_HAVE_GENERIC */


 #ifdef LV_HAVE_SSE2

@@ -108,7 +108,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
 for (number = 0; number < sse_iters; number++)
 {
 // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
-a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+a = _mm_load_si128((__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in_a + 8);
 b = _mm_load_si128((__m128i*)_in_b);
 __VOLK_GNSSSDR_PREFETCH(_in_b + 8);

@@ -123,7 +123,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
 imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
 imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

-imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
+imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!

 realcacc = _mm_adds_epi16(realcacc, real);
 imagcacc = _mm_adds_epi16(imagcacc, imag);
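Note: "with saturation arithmetic" refers to _mm_adds_epi16, a 16-bit addition that clamps at the int16 limits instead of wrapping around. A scalar model of it; this is the same role the kernels' sat_adds16i helper plays:

#include <stdint.h>

static int16_t sat_add_i16(int16_t x, int16_t y)
{
    int32_t res = (int32_t)x + (int32_t)y; /* widen so the sum cannot overflow */
    if (res > INT16_MAX) return INT16_MAX; /* clamp high */
    if (res < INT16_MIN) return INT16_MIN; /* clamp low */
    return (int16_t)res;
}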
@@ -186,10 +186,10 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con

 for (number = 0; number < sse_iters; number++)
 {
-//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
-//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
+// std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
+// imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
 // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
-a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+a = _mm_loadu_si128((__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in_a + 8);
 b = _mm_loadu_si128((__m128i*)_in_b);
 __VOLK_GNSSSDR_PREFETCH(_in_b + 8);

@@ -204,7 +204,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con
 imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
 imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

-imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
+imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!

 realcacc = _mm_adds_epi16(realcacc, real);
 imagcacc = _mm_adds_epi16(imagcacc, imag);

@@ -281,7 +281,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
 imag1 = _mm256_mullo_epi16(a, b_sl);
 imag2 = _mm256_mullo_epi16(b, a_sl);

-imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
+imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!

 realcacc = _mm256_adds_epi16(realcacc, real);
 imagcacc = _mm256_adds_epi16(imagcacc, imag);

@@ -359,7 +359,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
 imag1 = _mm256_mullo_epi16(a, b_sl);
 imag2 = _mm256_mullo_epi16(b, a_sl);

-imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
+imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!

 realcacc = _mm256_adds_epi16(realcacc, real);
 imagcacc = _mm256_adds_epi16(imagcacc, imag);

@@ -571,4 +571,4 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out

 #endif /* LV_HAVE_NEONV7 */

-#endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_H*/
+#endif /* INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_H */
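Note: the "std::complex<T> memory structure" comments rely on the interleaved layout of complex samples: when a complex buffer is viewed as a flat array of its scalar type, the real part of element i sits at [2*i] and the imaginary part at [2*i + 1]. A tiny C demonstration of that view; the struct stands in for lv_16sc_t:

#include <stdint.h>
#include <stdio.h>

typedef struct { int16_t r; int16_t i; } sc16;

int main(void)
{
    sc16 buf[2] = {{1, 2}, {3, 4}};
    const int16_t* flat = (const int16_t*)buf; /* same view the kernels take */
    printf("%d %d %d %d\n", flat[0], flat[1], flat[2], flat[3]); /* 1 2 3 4 */
    return 0;
}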
@@ -77,15 +77,15 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(lv_16sc_t* resu
 result[n_vec] = lv_cmake(0, 0);
 for (n = 0; n < num_points; n++)
 {
-//r*a.r - i*a.i, i*a.r + r*a.i
-//result[n_vec]+=in_common[n]*in_a[n_vec][n];
+// r*a.r - i*a.i, i*a.r + r*a.i
+// result[n_vec]+=in_common[n]*in_a[n_vec][n];
 lv_16sc_t tmp = in_common[n] * in_a[n_vec][n];
 result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp)));
 }
 }
 }

-#endif /*LV_HAVE_GENERIC*/
+#endif /* LV_HAVE_GENERIC */


 #ifdef LV_HAVE_GENERIC
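Note: the _xn kernels compute one common input against num_a_vectors local replicas at once, so the common samples are touched a single time. A reference sketch of that loop shape, with float complex standing in for lv_16sc_t and without the saturation bookkeeping:

#include <complex.h>

static void dot_prod_xn(float complex* result, const float complex* in_common,
    const float complex** in_a, int num_a_vectors, unsigned int num_points)
{
    int n_vec;
    unsigned int n;
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
            result[n_vec] = 0;
            for (n = 0; n < num_points; n++)
                {
                    result[n_vec] += in_common[n] * in_a[n_vec][n];
                }
        }
}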
@@ -106,7 +106,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(lv_16sc_t*
 }
 }

-#endif /*LV_HAVE_GENERIC*/
+#endif /* LV_HAVE_GENERIC */


 #ifdef LV_HAVE_SSE2

@@ -145,11 +145,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
 for (index = 0; index < sse_iters; index++)
 {
 // b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
-b = _mm_load_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+b = _mm_load_si128((__m128i*)_in_common); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
 for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
 {
-a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg

 c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....


@@ -240,11 +240,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
 for (index = 0; index < sse_iters; index++)
 {
 // b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
-b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+b = _mm_loadu_si128((__m128i*)_in_common); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
 for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
 {
-a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg

 c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....


@@ -522,12 +522,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,

 for (index = 0; index < neon_iters; index++)
 {
-b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+b_val = vld2_s16((int16_t*)_in_common); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
 for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
 {
-a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
-//__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][index*4] + 8);
+a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+// __VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][index*4] + 8);

 // multiply the real*real and imag*imag to get real result
 // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
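Note: on NEON, vld2_s16 both loads four interleaved complex int16 samples and deinterleaves them in one instruction, which is why the NEON kernels need no shuffle step. A minimal sketch; the wrapper name is mine:

#include <arm_neon.h>

/* .val[0] receives the four real parts, .val[1] the four imaginary
   parts: r0 r1 r2 r3 | i0 i1 i2 i3. */
static inline int16x4x2_t load_4_complex(const int16_t* interleaved)
{
    return vld2_s16(interleaved);
}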
@@ -609,7 +609,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res

 for (index = 0; index < neon_iters; index++)
 {
-b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+b_val = vld2_s16((int16_t*)_in_common); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
 for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
 {

@@ -690,7 +690,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*

 for (index = 0; index < neon_iters; index++)
 {
-b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+b_val = vld2_s16((int16_t*)_in_common); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
 for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
 {

@@ -738,4 +738,4 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
 }
 #endif /* LV_HAVE_NEONV7 */

-#endif /*INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_H*/
+#endif /* INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_H */
@@ -68,12 +68,12 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_generic(lv_16sc_t* result,
 unsigned int n;
 for (n = 0; n < num_points; n++)
 {
-//r*a.r - i*a.i, i*a.r + r*a.i
+// r*a.r - i*a.i, i*a.r + r*a.i
 result[n] = in_a[n] * in_b[n];
 }
 }

-#endif /*LV_HAVE_GENERIC*/
+#endif /* LV_HAVE_GENERIC */


 #ifdef LV_HAVE_SSE2
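Note: the "r*a.r - i*a.i, i*a.r + r*a.i" comment is the complex product written out in parts. A scalar sketch of one lv_16sc_t multiply, with no saturation (like the generic kernel's plain operator, the 16-bit result simply wraps on overflow); the struct and function name are illustrative:

#include <stdint.h>

typedef struct { int16_t r; int16_t i; } sc16;

static sc16 complex_mul_16ic(sc16 a, sc16 b)
{
    sc16 out;
    out.r = (int16_t)(a.r * b.r - a.i * b.i); /* real part */
    out.i = (int16_t)(a.r * b.i + a.i * b.r); /* imaginary part */
    return out;
}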
@@ -93,10 +93,10 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, con
 lv_16sc_t* _out = out;
 for (number = 0; number < sse_iters; number++)
 {
-//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
-//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
+// std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
+// imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
 // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
-a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+a = _mm_load_si128((__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
 b = _mm_load_si128((__m128i*)_in_b);
 c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

@@ -147,10 +147,10 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, con
 lv_16sc_t* _out = out;
 for (number = 0; number < sse_iters; number++)
 {
-//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
-//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
+// std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
+// imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
 // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
-a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+a = _mm_loadu_si128((__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
 b = _mm_loadu_si128((__m128i*)_in_b);
 c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

@@ -340,4 +340,4 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const
 }
 #endif /* LV_HAVE_NEONV7*/

-#endif /*INCLUDED_volk_gnsssdr_16ic_x2_multiply_16ic_H*/
+#endif /* INCLUDED_volk_gnsssdr_16ic_x2_multiply_16ic_H */
@@ -89,33 +89,33 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(lv_16sc
 }
 for (n = 0; n < num_points; n++)
 {
-tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
+tmp16 = *in_common++; // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
 tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
 tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));

 // Regenerate phase
 if (n % 256 == 0)
 {
-//printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
+// printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
 #ifdef __cplusplus
 (*phase) /= std::abs((*phase));
 #else
 (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
 #endif
-//printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
+// printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
 }

 (*phase) *= phase_inc;
 for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
 {
 lv_16sc_t tmp = tmp16 * in_a[n_vec][n];
-//lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
+// lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
 result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp)));
 }
 }
 }

-#endif /*LV_HAVE_GENERIC*/
+#endif /* LV_HAVE_GENERIC */


 #ifdef LV_HAVE_GENERIC
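Note: after rotating each sample in float, the kernel quantizes back to int16 with rintf, i.e. round-to-nearest, rather than C's truncating cast. A one-function illustration of the difference:

#include <math.h>
#include <stdint.h>

static int16_t quantize(float x)
{
    /* rintf: 2.7f -> 3, -2.7f -> -3; a plain (int16_t)x would give 2 / -2 */
    return (int16_t)rintf(x);
}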
@@ -137,14 +137,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(
 {
 for (j = 0; j < ROTATOR_RELOAD; j++)
 {
-tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
+tmp16 = *in_common++; // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
 tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
 tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
 (*phase) *= phase_inc;
 for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
 {
 lv_16sc_t tmp = tmp16 * in_a[n_vec][n * ROTATOR_RELOAD + j];
-//lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
+// lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
 result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp)));
 }
 }

@@ -152,27 +152,27 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(
 #ifdef __cplusplus
 (*phase) /= std::abs((*phase));
 #else
-//(*phase) /= cabsf((*phase));
+// (*phase) /= cabsf((*phase));
 (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
 #endif
 }

 for (j = 0; j < num_points % ROTATOR_RELOAD; j++)
 {
-tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
+tmp16 = *in_common++; // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
 tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
 tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
 (*phase) *= phase_inc;
 for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
 {
 lv_16sc_t tmp = tmp16 * in_a[n_vec][(num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j];
-//lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
+// lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
 result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp)));
 }
 }
 }

-#endif /*LV_HAVE_GENERIC*/
+#endif /* LV_HAVE_GENERIC */


 #ifdef LV_HAVE_SSE3
@@ -228,9 +228,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
 for (number = 0; number < sse_iters; number++)
 {
 // Phase rotation on operand in_common starts here:
-//printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase));
-pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
-//complex 32fc multiplication b=a*two_phase_acc_reg
+// printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase));
+pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -239,7 +239,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
 pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -247,11 +247,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
 tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

-//next two samples
+// next two samples
 _in_common += 2;
-pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
-//complex 32fc multiplication b=a*two_phase_acc_reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -260,7 +260,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
 pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -271,12 +271,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
 // store four rotated in_common samples in the register b
 b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic

-//next two samples
+// next two samples
 _in_common += 2;

 for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
 {
-a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg

 c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

@@ -331,12 +331,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
 two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);

 _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
-//(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]);
+// (*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]);
 (*phase) = two_phase_acc[0];

 for (n = sse_iters * 4; n < num_points; n++)
 {
-tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
+tmp16 = in_common[n]; // printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
 tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
 tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
 (*phase) *= phase_inc;
@@ -344,7 +344,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
 for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
 {
 lv_16sc_t tmp = tmp16 * in_a[n_vec][n];
-//lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
+// lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
 _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
 sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
 }
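All the SSE3 hunks above repeat one idiom: a 128-bit register holds two complex floats (re, im, re, im), moveldup/movehdup broadcast the real and imaginary parts of the multiplier, and a single addsub fuses (ar*cr - ai*ci) with (ai*cr + ar*ci). A minimal self-contained sketch follows; the shuffle that produces the swapped copy (the tmp3 of the diff, whose construction is outside these hunks) is an assumption here.

/* Sketch of the SSE3 complex-product idiom used throughout these hunks. */
#include <pmmintrin.h> /* SSE3: moveldup, movehdup, addsub */

static inline __m128 complex_mul_sse3(__m128 a, __m128 b)
{
    __m128 yl = _mm_moveldup_ps(b);         /* cr,cr,dr,dr */
    __m128 yh = _mm_movehdup_ps(b);         /* ci,ci,di,di */
    __m128 tmp1 = _mm_mul_ps(a, yl);        /* ar*cr,ai*cr,br*dr,bi*dr */
    __m128 tmp3 = _mm_shuffle_ps(a, a, 0xB1); /* swap re/im: ai,ar,bi,br (assumed) */
    __m128 tmp2 = _mm_mul_ps(tmp3, yh);     /* ai*ci,ar*ci,bi*di,br*di */
    return _mm_addsub_ps(tmp1, tmp2);       /* ar*cr-ai*ci, ai*cr+ar*ci, ... */
}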
@@ -411,9 +411,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
 for (j = 0; j < ROTATOR_RELOAD; j++)
 {
 // Phase rotation on operand in_common starts here:
-//printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase));
-pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
-//complex 32fc multiplication b=a*two_phase_acc_reg
+// printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase));
+pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -422,7 +422,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
 pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -430,11 +430,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
 tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

-//next two samples
+// next two samples
 _in_common += 2;
-pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
-//complex 32fc multiplication b=a*two_phase_acc_reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -443,7 +443,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
 pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -454,12 +454,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
 // store four rotated in_common samples in the register b
 b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic

-//next two samples
+// next two samples
 _in_common += 2;

 for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
 {
-a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg

 c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

@@ -488,8 +488,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l

 for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
 {
-pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
-//complex 32fc multiplication b=a*two_phase_acc_reg
+pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -498,7 +498,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
 pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -506,11 +506,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
 tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

-//next two samples
+// next two samples
 _in_common += 2;
-pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
-//complex 32fc multiplication b=a*two_phase_acc_reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -519,7 +519,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
 pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -530,12 +530,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
 // store four rotated in_common samples in the register b
 b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic

-//next two samples
+// next two samples
 _in_common += 2;

 for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
 {
-a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg

 c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

@@ -582,12 +582,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
 two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);

 _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
-//(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]);
+// (*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]);
 (*phase) = two_phase_acc[0];

 for (n = sse_iters * 4; n < num_points; n++)
 {
-tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
+tmp16 = in_common[n]; // printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
 tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
 tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
 (*phase) *= phase_inc;
@@ -595,7 +595,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
 for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
 {
 lv_16sc_t tmp = tmp16 * in_a[n_vec][n];
-//lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
+// lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
 _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
 sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
 }
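The _reload variants above exist because repeated complex multiplication lets the magnitude of the phase accumulator drift away from 1.0f, so the rotator is periodically pulled back onto the unit circle (the _mm_div_ps by the squared-magnitude register in the @@ -582 hunk). A scalar sketch of the idea, with an illustrative ROTATOR_RELOAD value that is not taken from this file:

/* Sketch only: scalar renormalization of a drifting phase rotator.
 * The constant 256 is illustrative, not the kernel's value. */
#include <complex.h>
#include <math.h>

#define ROTATOR_RELOAD 256

static void advance_phase(float complex* phase, float complex phase_inc,
    unsigned int iterations)
{
    for (unsigned int i = 1; i <= iterations; i++)
        {
            (*phase) *= phase_inc;
            if ((i % ROTATOR_RELOAD) == 0)
                {
                    /* pull the rotator back onto the unit circle */
                    (*phase) /= cabsf(*phase);
                }
        }
}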
@@ -657,9 +657,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
 for (number = 0; number < sse_iters; number++)
 {
 // Phase rotation on operand in_common starts here:
-pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
-//complex 32fc multiplication b=a*two_phase_acc_reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -668,7 +668,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
 pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -676,11 +676,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
 tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

-//next two samples
+// next two samples
 _in_common += 2;
-pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
-//complex 32fc multiplication b=a*two_phase_acc_reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -689,7 +689,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
 pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -700,12 +700,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
 // store four rotated in_common samples in the register b
 b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic

-//next two samples
+// next two samples
 _in_common += 2;

 for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
 {
-a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg

 c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

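The u_sse3 hunks that end here mirror the aligned a_sse3 kernel almost line for line; the substantive difference is the load instruction, which is why both protokernels exist. A two-line sketch of the contrast (the wrapper names are hypothetical):

/* _mm_load_si128 requires 16-byte-aligned memory; _mm_loadu_si128 does not. */
#include <emmintrin.h>

static __m128i load_aligned(const void* ptr) { return _mm_load_si128((const __m128i*)ptr); }
static __m128i load_unaligned(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }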
@ -824,8 +824,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
|
|||||||
|
|
||||||
for (number = 0; number < avx2_iters; number++)
|
for (number = 0; number < avx2_iters; number++)
|
||||||
{
|
{
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
// complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -834,7 +834,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
|
|||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -842,11 +842,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
|
|||||||
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
|
|
||||||
//next two samples
|
// next two samples
|
||||||
_in_common += 2;
|
_in_common += 2;
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
|
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
// complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -855,7 +855,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
|
|||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -866,8 +866,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
|
|||||||
// store four output samples
|
// store four output samples
|
||||||
result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
|
result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
|
||||||
_in_common += 2;
|
_in_common += 2;
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
// complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -876,7 +876,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
|
|||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -884,11 +884,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
|
|||||||
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
|
|
||||||
//next two samples
|
// next two samples
|
||||||
_in_common += 2;
|
_in_common += 2;
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in_common + 16);
|
__VOLK_GNSSSDR_PREFETCH(_in_common + 16);
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
// complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -897,7 +897,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
|
|||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -1036,8 +1036,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
|
|||||||
{
|
{
|
||||||
for (j = 0; j < ROTATOR_RELOAD; j++)
|
for (j = 0; j < ROTATOR_RELOAD; j++)
|
||||||
{
|
{
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
// complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -1046,7 +1046,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
|
|||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -1054,11 +1054,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
|
|||||||
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
|
|
||||||
//next two samples
|
// next two samples
|
||||||
_in_common += 2;
|
_in_common += 2;
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
|
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
// complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -1067,7 +1067,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
|
|||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -1078,8 +1078,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
|
|||||||
// store four output samples
|
// store four output samples
|
||||||
result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
|
result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
|
||||||
_in_common += 2;
|
_in_common += 2;
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
// complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -1088,7 +1088,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
|
|||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -1096,11 +1096,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
|
|||||||
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
|
|
||||||
//next two samples
|
// next two samples
|
||||||
_in_common += 2;
|
_in_common += 2;
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in_common + 16);
|
__VOLK_GNSSSDR_PREFETCH(_in_common + 16);
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
// complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -1109,7 +1109,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
|
|||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -1152,8 +1152,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
|
|||||||
|
|
||||||
for (j = 0; j < avx2_iters % ROTATOR_RELOAD; j++)
|
for (j = 0; j < avx2_iters % ROTATOR_RELOAD; j++)
|
||||||
{
|
{
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
// complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -1162,7 +1162,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
|
|||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -1170,11 +1170,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
|
|||||||
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
|
|
||||||
//next two samples
|
// next two samples
|
||||||
_in_common += 2;
|
_in_common += 2;
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
|
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
// complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -1183,7 +1183,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
|
|||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
@ -1194,8 +1194,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
|
|||||||
// store four output samples
|
// store four output samples
|
||||||
result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
|
result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
|
||||||
_in_common += 2;
|
_in_common += 2;
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
 a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
-//complex 32fc multiplication b=a*two_phase_acc_reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -1204,7 +1204,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
 b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -1212,11 +1212,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
 tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

-//next two samples
+// next two samples
 _in_common += 2;
-a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
 __VOLK_GNSSSDR_PREFETCH(_in_common + 16);
-//complex 32fc multiplication b=a*two_phase_acc_reg
+// complex 32fc multiplication b=a*two_phase_acc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -1225,7 +1225,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
 b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic

-//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+// complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
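The yl/yh/tmp1/tmp2/addsub sequence recurring through these hunks is the standard SSE3 idiom for multiplying packed complex floats. A minimal scalar sketch of what it computes per complex pair, with illustrative names that are not part of this patch:

    /* Sketch: the SSE3 complex-product idiom written out in scalar C.
       For a = ar + i*ai and c = cr + i*ci the kernel builds:
         yl = (cr, cr)   via _mm_moveldup_ps
         yh = (ci, ci)   via _mm_movehdup_ps
         tmp1 = a * yl   -> (ar*cr, ai*cr)
         tmp2 = swap(a) * yh -> (ai*ci, ar*ci)
         addsub(tmp1, tmp2)  -> (ar*cr - ai*ci, ai*cr + ar*ci), the product. */
    #include <complex.h>
    #include <stdio.h>

    static float complex complex_mul_sse3_style(float complex a, float complex c)
    {
        float ar = crealf(a), ai = cimagf(a);
        float cr = crealf(c), ci = cimagf(c);
        float re = ar * cr - ai * ci; /* the "subtract" lane of _mm_addsub_ps */
        float im = ai * cr + ar * ci; /* the "add" lane of _mm_addsub_ps */
        return re + im * I;
    }

    int main(void)
    {
        float complex r = complex_mul_sse3_style(1.0f + 2.0f * I, 3.0f - 1.0f * I);
        printf("%f%+fi\n", crealf(r), cimagf(r)); /* expected 5.000000+5.000000i */
        return 0;
    }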
@@ -1414,8 +1414,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t*

 for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
 {
-a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
-//__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][number*4] + 8);
+a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+// __VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][number*4] + 8);

 // multiply the real*real and imag*imag to get real result
 // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
@@ -1474,7 +1474,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t*

 for (n = neon_iters * 4; n < num_points; n++)
 {
-tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
+tmp16_ = in_common[n]; // printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
 tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
 tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_)));
 (*phase) *= phase_inc;
@@ -1513,7 +1513,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
 float arg_phase0 = cargf(*phase);
 float arg_phase_inc = cargf(phase_inc);
 float phase_est;
-//printf("arg phase0: %f", arg_phase0);
+// printf("arg phase0: %f", arg_phase0);
 lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
 __VOLK_ATTR_ALIGNED(16)
 float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)};
@@ -1605,9 +1605,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
 // Regenerate phase
 if ((number % 256) == 0)
 {
-//printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0]))));
+// printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0]))));
 phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc;
-//printf("Estimated phase: %f\n\n", cos(phase_est));
+// printf("Estimated phase: %f\n\n", cos(phase_est));

 *phase = lv_cmake(cos(phase_est), sin(phase_est));
 phase2 = (lv_32fc_t)(*phase) * phase_inc;
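The "Regenerate phase" block above bounds the rounding error that accumulates when a unit phasor is advanced by repeated finite-precision multiplies: every 256 iterations the phasor is rebuilt from the analytically known angle. A hedged scalar sketch of the same recovery strategy (function and constant names here are illustrative, assuming four samples per iteration as in the kernel):

    /* Sketch: periodic phase re-synthesis for an NCO advanced by complex multiplies. */
    #include <complex.h>
    #include <math.h>

    #define RESYNC_PERIOD 256 /* matches (number % 256) == 0 in the kernel */

    static float complex advance_phase(float complex phase, float arg0, float arg_inc,
                                       unsigned int iteration)
    {
        phase *= cexpf(arg_inc * 4.0f * I); /* normal path: one multiply per 4 samples */
        if ((iteration % RESYNC_PERIOD) == 0)
            {
                /* rebuild from the exact accumulated angle, killing rounding drift */
                float phase_est = arg0 + (iteration + 1) * 4 * arg_inc;
                phase = cosf(phase_est) + sinf(phase_est) * I;
            }
        return phase;
    }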
@@ -1624,14 +1624,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s

 // Round = vmulq_f32(_phase_real, _phase_real);
 // Round = vmlaq_f32(Round, _phase_imag, _phase_imag);
-// Round = vsqrtq_f32(Round);//printf("sqrt: %f \n", Round[0]);
-//Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]);
-//Round = vrecpeq_f32((Round);
+// Round = vsqrtq_f32(Round);// printf("sqrt: %f \n", Round[0]);
+// Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]);
+// Round = vrecpeq_f32((Round);
 // _phase_real = vdivq_f32(_phase_real, Round);
 // _phase_imag = vdivq_f32(_phase_imag, Round);
-//_phase_real = vmulq_f32(_phase_real, Round);
-//_phase_imag = vmulq_f32(_phase_imag, Round);
-//printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0]));
+// _phase_real = vmulq_f32(_phase_real, Round);
+// _phase_imag = vmulq_f32(_phase_imag, Round);
+// printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0]));
 }

 for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
@@ -1671,7 +1671,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s

 for (n = neon_iters * 4; n < num_points; n++)
 {
-tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
+tmp16_ = in_common[n]; // printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
 tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
 tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_)));
 (*phase) *= phase_inc;
@@ -1804,9 +1804,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_
 // Regenerate phase
 if ((number % 256) == 0)
 {
-//printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0]))));
+// printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0]))));
 phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc;
-//printf("Estimated phase: %f\n\n", cos(phase_est));
+// printf("Estimated phase: %f\n\n", cos(phase_est));

 *phase = lv_cmake(cos(phase_est), sin(phase_est));
 phase2 = (lv_32fc_t)(*phase) * phase_inc;
@@ -1860,7 +1860,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_

 for (n = neon_iters * 4; n < num_points; n++)
 {
-tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
+tmp16_ = in_common[n]; // printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
 tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
 tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_)));
 (*phase) *= phase_inc;
@@ -1874,4 +1874,4 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_

 #endif /* LV_HAVE_NEONV7 */

-#endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_H*/
+#endif /* INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_H */

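The tail loops above (n = neon_iters * 4 onward) fall back to one scalar rotate-and-round step per leftover sample. A compact, hedged restatement of that per-sample step in plain C, with the lv_16sc_t/lv_32fc_t pair simplified into stand-in types:

    /* Sketch: scalar tail of the rotator dot product. Each leftover 16-bit
       complex sample is rotated by the current phasor in float precision,
       rounded back to int16, and the phasor is advanced. */
    #include <complex.h>
    #include <math.h>
    #include <stdint.h>

    typedef struct { int16_t r, i; } sc16_t; /* stand-in for lv_16sc_t */

    static sc16_t rotate_sample(sc16_t in, float complex *phase, float complex phase_inc)
    {
        float complex tmp32 = (in.r + in.i * I) * (*phase); /* rotate in 32-bit float */
        sc16_t out = { (int16_t)rintf(crealf(tmp32)), (int16_t)rintf(cimagf(tmp32)) };
        *phase *= phase_inc; /* advance the local oscillator */
        return out;
    }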
@@ -82,7 +82,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(lv_16sc_t** re
 {
 // resample code for current tap
 local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-//Take into account that in multitap correlators, the shifts can be negative!
+// Take into account that in multitap correlators, the shifts can be negative!
 if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1);
 local_code_chip_index = local_code_chip_index % code_length_chips;
 result[current_correlator_tap][n] = local_code[local_code_chip_index];
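The negative-shift handling above is a modulo reduction into [0, code_length_chips): adding enough whole code periods first guarantees the operand of % is non-negative, so C's remainder behaves as a true modulo. A self-contained sketch of that index wrap (the helper name is illustrative):

    /* Sketch: wrap a possibly negative chip index into [0, code_length). */
    #include <stdio.h>
    #include <stdlib.h>

    static int wrap_chip_index(int idx, int code_length)
    {
        if (idx < 0) idx += code_length * (abs(idx) / code_length + 1);
        return idx % code_length;
    }

    int main(void)
    {
        printf("%d\n", wrap_chip_index(-3, 1023));    /* 1020 */
        printf("%d\n", wrap_chip_index(-2047, 1023)); /* 1022 */
        printf("%d\n", wrap_chip_index(1024, 1023));  /* 1 */
        return 0;
    }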
@@ -90,7 +90,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(lv_16sc_t** re
 }
 }

-#endif /*LV_HAVE_GENERIC*/
+#endif /* LV_HAVE_GENERIC */


 #ifdef LV_HAVE_SSE4_1
@@ -149,7 +149,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r
 {
 // resample code for current tap
 local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-//Take into account that in multitap correlators, the shifts can be negative!
+// Take into account that in multitap correlators, the shifts can be negative!
 if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
 local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
 _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -216,7 +216,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r
 {
 // resample code for current tap
 local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-//Take into account that in multitap correlators, the shifts can be negative!
+// Take into account that in multitap correlators, the shifts can be negative!
 if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
 local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
 _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -287,7 +287,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res
 {
 // resample code for current tap
 local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-//Take into account that in multitap correlators, the shifts can be negative!
+// Take into account that in multitap correlators, the shifts can be negative!
 if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
 local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
 _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -358,7 +358,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res
 {
 // resample code for current tap
 local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-//Take into account that in multitap correlators, the shifts can be negative!
+// Take into account that in multitap correlators, the shifts can be negative!
 if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
 local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
 _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -436,7 +436,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
 {
 // resample code for current tap
 local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-//Take into account that in multitap correlators, the shifts can be negative!
+// Take into account that in multitap correlators, the shifts can be negative!
 if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
 local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
 _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -514,7 +514,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
 {
 // resample code for current tap
 local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-//Take into account that in multitap correlators, the shifts can be negative!
+// Take into account that in multitap correlators, the shifts can be negative!
 if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
 local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
 _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -567,7 +567,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
 aux = vmulq_f32(code_phase_step_chips_reg, indexn);
 aux = vaddq_f32(aux, aux2);

-//floor
+// floor
 i = vcvtq_s32_f32(aux);
 fi = vcvtq_f32_s32(i);
 igx = vcgtq_f32(fi, aux);
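NEONv7 has no vector floor, so the kernel truncates with vcvtq_s32_f32, converts back, and subtracts one wherever truncation overshot a negative input (the vcgtq_f32 mask). The same trick in scalar form, as a hedged sketch:

    /* Sketch: floor() emulated with truncate-and-correct, mirroring the
       vcvtq_s32_f32 / vcvtq_f32_s32 / vcgtq_f32 sequence above. */
    #include <stdio.h>

    static int floor_via_truncate(float x)
    {
        int i = (int)x;      /* truncates toward zero, like vcvtq_s32_f32 */
        float fi = (float)i; /* back to float, like vcvtq_f32_s32 */
        if (fi > x) i -= 1;  /* truncation overshot: only happens for negative x */
        return i;
    }

    int main(void)
    {
        printf("%d %d %d\n", floor_via_truncate(2.7f),
            floor_via_truncate(-2.7f), floor_via_truncate(-3.0f)); /* 2 -3 -3 */
        return 0;
    }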
@@ -599,7 +599,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
 __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
 // resample code for current tap
 local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-//Take into account that in multitap correlators, the shifts can be negative!
+// Take into account that in multitap correlators, the shifts can be negative!
 if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
 local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
 _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -611,4 +611,4 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
 #endif


-#endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H*/
+#endif /* INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H */

@@ -73,7 +73,7 @@
 static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
 {
 int local_code_chip_index;
-//fesetround(FE_TONEAREST);
+// fesetround(FE_TONEAREST);
 int current_vector;
 unsigned int n;
 for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
@@ -89,7 +89,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t
 }
 }

-#endif /*LV_HAVE_GENERIC*/
+#endif /* LV_HAVE_GENERIC */


 #ifdef LV_HAVE_SSE2
@@ -97,7 +97,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t

 static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
 {
-_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
+_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); // _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
 unsigned int number;
 const unsigned int quarterPoints = num_output_samples / 4;

@@ -109,7 +109,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t*
 __m128i _code_length_chips, _code_length_chips_minus1;
 __m128 _code_phase_out, _code_phase_out_with_offset;

-_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
+_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); // load float to all four float values in m128 register
 __VOLK_ATTR_ALIGNED(16)
 int four_times_code_length_chips_minus1[4];
 four_times_code_length_chips_minus1[0] = code_length_chips - 1;
@@ -124,8 +124,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t*
 four_times_code_length_chips[2] = code_length_chips;
 four_times_code_length_chips[3] = code_length_chips;

-_code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
-_code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
+_code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); // load float to all four float values in m128 register
+_code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); // load float to all four float values in m128 register

 __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;

@@ -142,29 +142,29 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t*
 int sample_idx = 0;
 for (number = 0; number < quarterPoints; number++)
 {
-//common to all outputs
-_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
+// common to all outputs
+_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); // compute the code phase point with the phase step

-//output vector dependent (different code phase offset)
+// output vector dependent (different code phase offset)
 for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
 {
 tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
-_rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register
+_rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); // load float to all four float values in m128 register

-_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
-_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
+_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); // add the phase offset
+_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); // convert to integer

-negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
-_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
+negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); // test for negative values
+_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); // the negative values branch
 _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));

-overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
-_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
+overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); // test for overflow values
+_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); // the negative values branch
 _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));

 _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back

-//todo: optimize the local code lookup table with intrinsics, if possible
+// todo: optimize the local code lookup table with intrinsics, if possible
 _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
 _result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]];
 _result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]];
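The xor/and chains above are the classic branchless select: SSE2 has no blend instruction, so r = a XOR (mask AND (b XOR a)) picks b where the comparison mask is all ones and keeps a elsewhere. A minimal sketch of the identity on plain integers (it works lane-wise on SIMD registers the same way):

    /* Sketch: branchless select, the identity behind the _mm_xor_si128 /
       _mm_and_si128 chains. mask = ~0 yields b; mask = 0 yields a. */
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t select_by_mask(uint32_t a, uint32_t b, uint32_t mask)
    {
        return a ^ (mask & (b ^ a));
    }

    int main(void)
    {
        printf("%u\n", select_by_mask(7, 42, 0xFFFFFFFFu)); /* 42: mask set, take b */
        printf("%u\n", select_by_mask(7, 42, 0x00000000u)); /* 7: mask clear, keep a */
        return 0;
    }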
@@ -193,7 +193,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t*

 static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
 {
-_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
+_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); // _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
 unsigned int number;
 const unsigned int quarterPoints = num_output_samples / 4;

@@ -205,7 +205,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t*
 __m128i _code_length_chips, _code_length_chips_minus1;
 __m128 _code_phase_out, _code_phase_out_with_offset;

-_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
+_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); // load float to all four float values in m128 register
 __VOLK_ATTR_ALIGNED(16)
 int four_times_code_length_chips_minus1[4];
 four_times_code_length_chips_minus1[0] = code_length_chips - 1;
@@ -220,8 +220,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t*
 four_times_code_length_chips[2] = code_length_chips;
 four_times_code_length_chips[3] = code_length_chips;

-_code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
-_code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
+_code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); // load float to all four float values in m128 register
+_code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); // load float to all four float values in m128 register

 __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;

@@ -238,29 +238,29 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t*
 int sample_idx = 0;
 for (number = 0; number < quarterPoints; number++)
 {
-//common to all outputs
-_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
+// common to all outputs
+_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); // compute the code phase point with the phase step

-//output vector dependent (different code phase offset)
+// output vector dependent (different code phase offset)
 for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
 {
 tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
-_rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register
+_rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); // load float to all four float values in m128 register

-_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
-_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
+_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); // add the phase offset
+_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); // convert to integer

-negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
-_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
+negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); // test for negative values
+_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); // the negative values branch
 _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));

-overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
-_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
+overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); // test for overflow values
+_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); // the negative values branch
 _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));

 _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back

-//todo: optimize the local code lookup table with intrinsics, if possible
+// todo: optimize the local code lookup table with intrinsics, if possible
 _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
 _result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]];
 _result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]];
@@ -303,7 +303,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t**
 float32x4_t _code_phase_out, _code_phase_out_with_offset;
 float32x4_t sign, PlusHalf, Round;

-_code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in float32x4_t register
+_code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); // load float to all four float values in float32x4_t register
 __VOLK_ATTR_ALIGNED(16)
 int four_times_code_length_chips_minus1[4];
 four_times_code_length_chips_minus1[0] = code_length_chips - 1;
@@ -318,8 +318,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t**
 four_times_code_length_chips[2] = code_length_chips;
 four_times_code_length_chips[3] = code_length_chips;

-_code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in float32x4_t register
-_code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in float32x4_t register
+_code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); // load float to all four float values in float32x4_t register
+_code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); // load float to all four float values in float32x4_t register

 int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
 uint32x4_t negative_indexes, overflow_indexes;
@@ -336,33 +336,33 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t**
 int sample_idx = 0;
 for (number = 0; number < quarterPoints; number++)
 {
-//common to all outputs
-_code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
+// common to all outputs
+_code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); // compute the code phase point with the phase step

-//output vector dependent (different code phase offset)
+// output vector dependent (different code phase offset)
 for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
 {
 tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
-_rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips); //load float to all four float values in float32x4_t register
+_rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips); // load float to all four float values in float32x4_t register

-_code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset
-//_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
+_code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); // add the phase offset
+// _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); // convert to integer
 sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31)));
 PlusHalf = vaddq_f32(_code_phase_out_with_offset, half);
 Round = vsubq_f32(PlusHalf, sign);
 _code_phase_out_int = vcvtq_s32_f32(Round);

-negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values
-_code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch
+negative_indexes = vcltq_s32(_code_phase_out_int, zero); // test for negative values
+_code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); // the negative values branch
 _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int)));

-overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
-_code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
+overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); // test for overflow values
+_code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); // the negative values branch
 _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg)));

 vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back

-//todo: optimize the local code lookup table with intrinsics, if possible
+// todo: optimize the local code lookup table with intrinsics, if possible
 _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
 _result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]];
 _result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]];
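Lacking a NEON round-to-nearest conversion, the sign/PlusHalf/Round triple above rounds half away from zero before truncating: the sign bit shifted down converts to exactly 1.0f for negative inputs and 0.0f otherwise. A hedged scalar restatement:

    /* Sketch: round-to-nearest built from truncation, mirroring the NEON
       sign / PlusHalf / Round sequence: add 0.5, subtract 1.0 when the
       input was negative, then truncate toward zero. */
    #include <stdio.h>

    static int round_neon_style(float x)
    {
        float sign = (x < 0.0f) ? 1.0f : 0.0f; /* vshrq_n_u32(..., 31) + convert */
        float plus_half = x + 0.5f;
        return (int)(plus_half - sign);        /* vcvtq_s32_f32 truncates */
    }

    int main(void)
    {
        printf("%d %d %d %d\n", round_neon_style(1.4f), round_neon_style(1.6f),
            round_neon_style(-1.4f), round_neon_style(-1.6f)); /* 1 2 -1 -2 */
        return 0;
    }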
@@ -386,4 +386,4 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t**

 #endif /* LV_HAVE_NEONV7 */

-#endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_H*/
+#endif /* INCLUDED_volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_H */

@@ -256,33 +256,6 @@ static inline void volk_gnsssdr_32f_high_dynamics_resamplerxnpuppet_32f_u_avx(fl
 volk_gnsssdr_free(result_aux);
 }
 #endif
-//
-//#ifdef LV_HAVE_NEONV7
-//static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_neon(float* result, const float* local_code, unsigned int num_points)
-//{
-// int code_length_chips = 2046;
-// float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
-// int num_out_vectors = 3;
-// float rem_code_phase_chips = -0.234;
-// int n;
-// float shifts_chips[3] = {-0.1, 0.0, 0.1};
-//
-// float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
-// for (n = 0; n < num_out_vectors; n++)
-// {
-// result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
-// }
-//
-// volk_gnsssdr_32f_xn_resampler_32f_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
-//
-// memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
-//
-// for (n = 0; n < num_out_vectors; n++)
-// {
-// volk_gnsssdr_free(result_aux[n]);
-// }
-// volk_gnsssdr_free(result_aux);
-//}
-//#endif

 #endif // INCLUDED_volk_gnsssdr_32f_high_dynamics_resamplerpuppet_32f_H

@@ -52,6 +52,8 @@
 * \b Outputs
 * \li out: Vector of the form lv_32fc_t out[n] = lv_cmake(cos(in[n]), sin(in[n]))
 *
+ * Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier
+ * Based on algorithms from the cephes library http://www.netlib.org/cephes/
 */

 #ifndef INCLUDED_volk_gnsssdr_32f_sincos_32fc_H
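The header documents a kernel that maps a phase vector to unit phasors. A minimal hedged sketch of that documented contract, computed here with plain libm rather than the dispatched SIMD kernel:

    /* Sketch of the documented contract of volk_gnsssdr_32f_sincos_32fc:
       out[n] = cos(in[n]) + i*sin(in[n]). This reference loop is what the
       SSE2/SSE4.1/NEON branches approximate with polynomial evaluation. */
    #include <complex.h>
    #include <math.h>
    #include <stdio.h>

    static void sincos_32fc_reference(float complex* out, const float* in, unsigned int num_points)
    {
        for (unsigned int n = 0; n < num_points; n++)
            {
                out[n] = cosf(in[n]) + sinf(in[n]) * I;
            }
    }

    int main(void)
    {
        float in[3] = {0.0f, 1.5707963f, 3.1415927f};
        float complex out[3];
        sincos_32fc_reference(out, in, 3);
        printf("%f%+fi\n", crealf(out[1]), cimagf(out[1])); /* ~0+1i */
        return 0;
    }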
@@ -251,8 +253,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f

 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>
-/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
-/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
 static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const float* in, unsigned int num_points)
 {
 lv_32fc_t* bPtr = out;
@@ -421,8 +422,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo

 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>
-/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
-/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
 static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const float* in, unsigned int num_points)
 {
 lv_32fc_t* bPtr = out;
@@ -644,8 +644,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con

 #ifdef LV_HAVE_NEONV7
 #include <arm_neon.h>
-/* Adapted from http://gruntthepeon.free.fr/ssemath/neon_mathfun.h, original code from Julien Pommier */
-/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
 static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float* in, unsigned int num_points)
 {
 lv_32fc_t* bPtr = out;

@@ -83,7 +83,7 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_generic(fl
 int local_code_chip_index;
 int current_correlator_tap;
 unsigned int n;
-//first correlator
+// first correlator
 for (n = 0; n < num_points; n++)
 {
 // resample code for first tap
@@ -94,7 +94,7 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_generic(fl
 result[0][n] = local_code[local_code_chip_index];
 }

-//adjacent correlators
+// adjacent correlators
 unsigned int shift_samples = 0;
 for (current_correlator_tap = 1; current_correlator_tap < num_out_vectors; current_correlator_tap++)
 {
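The generic high-dynamics kernel above resamples the code once for the first tap and then, judging from the shift_samples variable, appears to serve the remaining correlator taps as sample-shifted copies of it. A heavily hedged sketch of that assumed two-stage structure (shift_from_chips() is a hypothetical helper, not the kernel's exact code):

    /* Sketch (assumed structure): reuse tap 0's resampled code for the other
       taps, offset by a whole-sample shift derived from the chip spacing. */
    #include <math.h>

    static unsigned int shift_from_chips(float shift_chips, float code_phase_step_chips)
    {
        return (unsigned int)roundf(shift_chips / code_phase_step_chips);
    }

    static void high_dynamics_resample_sketch(float** result, const float* tap0,
        const float* shifts_chips, float code_phase_step_chips,
        int num_out_vectors, unsigned int num_points)
    {
        for (int tap = 1; tap < num_out_vectors; tap++)
            {
                unsigned int shift_samples =
                    shift_from_chips(shifts_chips[tap] - shifts_chips[0], code_phase_step_chips);
                for (unsigned int n = 0; n < num_points; n++)
                    {
                        /* reuse tap 0, clamped at the end of the buffer */
                        unsigned int m = (n + shift_samples < num_points) ? n + shift_samples : num_points - 1;
                        result[tap][n] = tap0[m];
                    }
            }
    }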
@@ -618,11 +618,11 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_u_avx(floa
 #endif
 //
 //
-//#ifdef LV_HAVE_NEONV7
-//#include <arm_neon.h>
+// #ifdef LV_HAVE_NEONV7
+// #include <arm_neon.h>
 //
-//static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_neon(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
-//{
+// static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_neon(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
+// {
 // float** _result = result;
 // const unsigned int neon_iters = num_points / 4;
 // int current_correlator_tap;
@@ -662,7 +662,7 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_u_avx(floa
 // aux = vmulq_f32(code_phase_step_chips_reg, indexn);
 // aux = vaddq_f32(aux, aux2);
 //
-// //floor
+// // floor
 // i = vcvtq_s32_f32(aux);
 // fi = vcvtq_f32_s32(i);
 // igx = vcgtq_f32(fi, aux);
@@ -700,8 +700,8 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_u_avx(floa
 // _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
 // }
 // }
-//}
+// }
 //
-//#endif
+// #endif

-#endif /*INCLUDED_volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_H*/
+#endif /* INCLUDED_volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_H */

@@ -85,7 +85,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_generic(float** result,
 {
 // resample code for current tap
 local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-//Take into account that in multitap correlators, the shifts can be negative!
+// Take into account that in multitap correlators, the shifts can be negative!
 if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1);
 local_code_chip_index = local_code_chip_index % code_length_chips;
 result[current_correlator_tap][n] = local_code[local_code_chip_index];
@@ -93,7 +93,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_generic(float** result,
 }
 }

-#endif /*LV_HAVE_GENERIC*/
+#endif /* LV_HAVE_GENERIC */


 #ifdef LV_HAVE_SSE3
@@ -156,7 +156,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c
 {
 // resample code for current tap
 local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-//Take into account that in multitap correlators, the shifts can be negative!
+// Take into account that in multitap correlators, the shifts can be negative!
 if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
 local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
 _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -227,7 +227,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c
 {
 // resample code for current tap
 local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-//Take into account that in multitap correlators, the shifts can be negative!
+// Take into account that in multitap correlators, the shifts can be negative!
 if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
 local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
 _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -293,7 +293,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result,
 {
 // resample code for current tap
 local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-//Take into account that in multitap correlators, the shifts can be negative!
+// Take into account that in multitap correlators, the shifts can be negative!
 if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
 local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
 _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -360,7 +360,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result,
 {
 // resample code for current tap
 local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-//Take into account that in multitap correlators, the shifts can be negative!
+// Take into account that in multitap correlators, the shifts can be negative!
 if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
 local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
 _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -438,7 +438,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co
 {
 // resample code for current tap
 local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-//Take into account that in multitap correlators, the shifts can be negative!
+// Take into account that in multitap correlators, the shifts can be negative!
 if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
 local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
 _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -516,7 +516,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
 {
 // resample code for current tap
 local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
-//Take into account that in multitap correlators, the shifts can be negative!
+// Take into account that in multitap correlators, the shifts can be negative!
 if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
 local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
 _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
@ -571,7 +571,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
|
|||||||
aux = vmulq_f32(code_phase_step_chips_reg, indexn);
|
aux = vmulq_f32(code_phase_step_chips_reg, indexn);
|
||||||
aux = vaddq_f32(aux, aux2);
|
aux = vaddq_f32(aux, aux2);
|
||||||
|
|
||||||
//floor
|
// floor
|
||||||
i = vcvtq_s32_f32(aux);
|
i = vcvtq_s32_f32(aux);
|
||||||
fi = vcvtq_f32_s32(i);
|
fi = vcvtq_f32_s32(i);
|
||||||
igx = vcgtq_f32(fi, aux);
|
igx = vcgtq_f32(fi, aux);
|
||||||
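Note: NEON has no single floor instruction for float32x4, so this kernel floors by truncating (vcvtq_s32_f32), converting back (vcvtq_f32_s32), and subtracting 1 wherever the truncated value overshot a negative input (the vcgtq_f32 mask). The same correction in scalar C, as a sketch:

    /* floor(x) built from truncation, mirroring the vcvt/vcgt sequence above */
    int floor_via_truncate(float x)
    {
        int i = (int)x; /* (int) truncates toward zero, like vcvtq_s32_f32 */
        if ((float)i > x) /* true only for negative non-integer x */
            {
                i -= 1;
            }
        return i;
    }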
@ -603,7 +603,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
|
|||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
// Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
|
@ -74,7 +74,7 @@
|
|||||||
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
||||||
#include <volk_gnsssdr/volk_gnsssdr_malloc.h>
|
#include <volk_gnsssdr/volk_gnsssdr_malloc.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
//#include <stdio.h>
|
// #include <stdio.h>
|
||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
|
|
||||||
@ -89,18 +89,18 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f
|
|||||||
}
|
}
|
||||||
for (n = 0; n < num_points; n++)
|
for (n = 0; n < num_points; n++)
|
||||||
{
|
{
|
||||||
tmp32_1 = *in_common++ * (*phase); //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
|
tmp32_1 = *in_common++ * (*phase); // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
|
||||||
|
|
||||||
// Regenerate phase
|
// Regenerate phase
|
||||||
if (n % 256 == 0)
|
if (n % 256 == 0)
|
||||||
{
|
{
|
||||||
//printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
|
// printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
(*phase) /= std::abs((*phase));
|
(*phase) /= std::abs((*phase));
|
||||||
#else
|
#else
|
||||||
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
|
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
|
||||||
#endif
|
#endif
|
||||||
//printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
|
// printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
|
||||||
}
|
}
|
||||||
|
|
||||||
(*phase) *= phase_inc;
|
(*phase) *= phase_inc;
|
||||||
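Note: these generic rotator kernels multiply a running phasor by phase_inc once per sample, so rounding error slowly pulls its modulus away from 1; every 256 samples the code divides the phasor by its current modulus to pull it back onto the unit circle. A compact sketch of that loop, assuming C99 complex in place of lv_32fc_t:

    #include <complex.h>
    #include <math.h>

    /* Rotate n samples while keeping the running phasor on the unit circle. */
    void rotate(float complex *out, const float complex *in,
                float complex phase_inc, float complex *phase, unsigned int n)
    {
        unsigned int k;
        for (k = 0; k < n; k++)
            {
                out[k] = in[k] * (*phase);
                if (k % 256 == 0) /* periodic renormalization against rounding drift */
                    {
                        (*phase) /= hypotf(crealf(*phase), cimagf(*phase));
                    }
                (*phase) *= phase_inc;
            }
    }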
@ -112,7 +112,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /*LV_HAVE_GENERIC*/
|
#endif /* LV_HAVE_GENERIC */
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
@ -145,7 +145,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
|
|||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
(*phase) /= std::abs((*phase));
|
(*phase) /= std::abs((*phase));
|
||||||
#else
|
#else
|
||||||
//(*phase) /= cabsf((*phase));
|
// (*phase) /= cabsf((*phase));
|
||||||
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
|
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -162,7 +162,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /*LV_HAVE_GENERIC*/
|
#endif /* LV_HAVE_GENERIC */
|
||||||
|
|
||||||
#ifdef LV_HAVE_AVX
|
#ifdef LV_HAVE_AVX
|
||||||
#include <volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h>
|
#include <volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h>
|
||||||
|
@ -179,7 +179,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector
|
|||||||
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
||||||
float aux;
|
float aux;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
|
const float min_val = (float)SHRT_MIN; /// todo Something off here, compiler does not perform right cast
|
||||||
const float max_val = (float)SHRT_MAX;
|
const float max_val = (float)SHRT_MAX;
|
||||||
|
|
||||||
__m256 inputVal1, inputVal2;
|
__m256 inputVal1, inputVal2;
|
||||||
@ -342,7 +342,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector
|
|||||||
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
||||||
float aux;
|
float aux;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
|
const float min_val = (float)SHRT_MIN; /// todo Something off here, compiler does not perform right cast
|
||||||
const float max_val = (float)SHRT_MAX;
|
const float max_val = (float)SHRT_MAX;
|
||||||
|
|
||||||
__m256 inputVal1, inputVal2;
|
__m256 inputVal1, inputVal2;
|
||||||
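Note: both AVX2 conversion kernels clamp to [SHRT_MIN, SHRT_MAX] while still in float, before converting to integers, which is likely what the "/// todo" remark about casting is circling: in C, converting a float whose value is out of range for the target integer type is undefined behavior. A scalar sketch of the saturating conversion (rounding mode assumed to be round-to-nearest, the MXCSR default used by the cvtps intrinsics):

    #include <limits.h>
    #include <math.h>
    #include <stdint.h>

    /* Saturating float -> int16_t: clamp in float first, then cast. */
    int16_t float_to_int16_sat(float x)
    {
        const float min_val = (float)SHRT_MIN;
        const float max_val = (float)SHRT_MAX;
        float r = rintf(x);
        if (r > max_val) r = max_val;
        if (r < min_val) r = min_val;
        return (int16_t)r;
    }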
|
@ -89,18 +89,18 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic(lv_32fc
|
|||||||
}
|
}
|
||||||
for (n = 0; n < num_points; n++)
|
for (n = 0; n < num_points; n++)
|
||||||
{
|
{
|
||||||
tmp32_1 = *in_common++ * (*phase); //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
|
tmp32_1 = *in_common++ * (*phase); // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
|
||||||
|
|
||||||
// Regenerate phase
|
// Regenerate phase
|
||||||
if (n % 256 == 0)
|
if (n % 256 == 0)
|
||||||
{
|
{
|
||||||
//printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
|
// printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
(*phase) /= std::abs((*phase));
|
(*phase) /= std::abs((*phase));
|
||||||
#else
|
#else
|
||||||
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
|
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
|
||||||
#endif
|
#endif
|
||||||
//printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
|
// printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
|
||||||
}
|
}
|
||||||
|
|
||||||
(*phase) *= phase_inc;
|
(*phase) *= phase_inc;
|
||||||
@ -145,7 +145,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
|
|||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
(*phase) /= std::abs((*phase));
|
(*phase) /= std::abs((*phase));
|
||||||
#else
|
#else
|
||||||
//(*phase) /= cabsf((*phase));
|
// (*phase) /= cabsf((*phase));
|
||||||
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
|
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -225,7 +225,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
|
|||||||
yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(z1);
|
yh = _mm_movehdup_ps(z1);
|
||||||
|
|
||||||
//next two samples
|
// next two samples
|
||||||
_in_common += 2;
|
_in_common += 2;
|
||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
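Note: the moveldup/movehdup pair in these rotator hunks is the standard SSE3 complex-multiply idiom: duplicate the real parts of one operand (cr,cr,dr,dr), duplicate the imaginary parts, and combine the two partial products with addsub. Sketched in isolation for two complex floats; this is the idiom, not the kernel itself:

    #include <pmmintrin.h> /* SSE3 */

    /* Multiply two pairs of complex floats stored as (re, im, re, im). */
    __m128 complex_mul_sse3(__m128 x, __m128 y)
    {
        __m128 yl = _mm_moveldup_ps(y);         /* (cr, cr, dr, dr) */
        __m128 yh = _mm_movehdup_ps(y);         /* (ci, ci, di, di) */
        __m128 t1 = _mm_mul_ps(x, yl);          /* (ar*cr, ai*cr, ...) */
        __m128 xs = _mm_shuffle_ps(x, x, 0xB1); /* swap re/im: (ai, ar, ...) */
        __m128 t2 = _mm_mul_ps(xs, yh);         /* (ai*ci, ar*ci, ...) */
        return _mm_addsub_ps(t1, t2);           /* (ar*cr - ai*ci, ai*cr + ar*ci, ...) */
    }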
@ -337,7 +337,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
|
|||||||
yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(z1);
|
yh = _mm_movehdup_ps(z1);
|
||||||
|
|
||||||
//next two samples
|
// next two samples
|
||||||
_in_common += 2;
|
_in_common += 2;
|
||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
@ -457,7 +457,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
|
|||||||
yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr
|
yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm256_movehdup_ps(z);
|
yh = _mm256_movehdup_ps(z);
|
||||||
|
|
||||||
//next two samples
|
// next two samples
|
||||||
_in_common += 4;
|
_in_common += 4;
|
||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
@ -587,7 +587,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
|
|||||||
yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr
|
yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm256_movehdup_ps(z);
|
yh = _mm256_movehdup_ps(z);
|
||||||
|
|
||||||
//next two samples
|
// next two samples
|
||||||
_in_common += 4;
|
_in_common += 4;
|
||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
|
@ -82,7 +82,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(lv_32fc_t** re
|
|||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
// Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1);
|
if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1);
|
||||||
local_code_chip_index = local_code_chip_index % code_length_chips;
|
local_code_chip_index = local_code_chip_index % code_length_chips;
|
||||||
result[current_correlator_tap][n] = local_code[local_code_chip_index];
|
result[current_correlator_tap][n] = local_code[local_code_chip_index];
|
||||||
@ -90,7 +90,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(lv_32fc_t** re
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /*LV_HAVE_GENERIC*/
|
#endif /* LV_HAVE_GENERIC */
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE3
|
#ifdef LV_HAVE_SSE3
|
||||||
@ -153,7 +153,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
|
|||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
// Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
@ -224,7 +224,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res
|
|||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
// Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
@ -290,7 +290,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
|
|||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
// Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
@ -357,7 +357,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r
|
|||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
// Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
@ -435,7 +435,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
|
|||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
// Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
@ -513,7 +513,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
|
|||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
// Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
@ -558,13 +558,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
|
|||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
||||||
aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2);
|
aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2);
|
||||||
//aux = _mm256_add_ps(aux, aux2);
|
// aux = _mm256_add_ps(aux, aux2);
|
||||||
// floor
|
// floor
|
||||||
aux = _mm256_floor_ps(aux);
|
aux = _mm256_floor_ps(aux);
|
||||||
|
|
||||||
// fmod
|
// fmod
|
||||||
c = _mm256_div_ps(aux, code_length_chips_reg_f);
|
c = _mm256_div_ps(aux, code_length_chips_reg_f);
|
||||||
//_mm_fmsub_ps(c, code_length_chips_reg_f, aux)
|
// _mm_fmsub_ps(c, code_length_chips_reg_f, aux)
|
||||||
i = _mm256_cvttps_epi32(c);
|
i = _mm256_cvttps_epi32(c);
|
||||||
cTrunc = _mm256_cvtepi32_ps(i);
|
cTrunc = _mm256_cvtepi32_ps(i);
|
||||||
base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux);
|
base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux);
|
||||||
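Note: the div / cvttps / fnmadd sequence in these AVX2 resamplers is a vectorized fmod: divide by the code length, truncate the quotient toward zero, then subtract quotient*length back out in one fused negated multiply-add. The scalar equivalent, as a plain C99 sketch:

    #include <math.h>

    /* base = aux - trunc(aux / len) * len, mirroring div + cvttps + fnmadd above */
    float fmod_via_trunc(float aux, float len)
    {
        float c = aux / len;
        float c_trunc = truncf(c); /* _mm256_cvttps_epi32 also rounds toward zero */
        return aux - c_trunc * len;
    }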
@ -592,7 +592,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
|
|||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
// Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
@ -637,8 +637,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
|
|||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
||||||
aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2);
|
aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2);
|
||||||
//aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn);
|
// aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn);
|
||||||
//aux = _mm256_add_ps(aux, aux2);
|
// aux = _mm256_add_ps(aux, aux2);
|
||||||
// floor
|
// floor
|
||||||
aux = _mm256_floor_ps(aux);
|
aux = _mm256_floor_ps(aux);
|
||||||
|
|
||||||
@ -671,7 +671,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
|
|||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
// Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
@ -726,7 +726,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
|
|||||||
aux = vmulq_f32(code_phase_step_chips_reg, indexn);
|
aux = vmulq_f32(code_phase_step_chips_reg, indexn);
|
||||||
aux = vaddq_f32(aux, aux2);
|
aux = vaddq_f32(aux, aux2);
|
||||||
|
|
||||||
//floor
|
// floor
|
||||||
i = vcvtq_s32_f32(aux);
|
i = vcvtq_s32_f32(aux);
|
||||||
fi = vcvtq_f32_s32(i);
|
fi = vcvtq_f32_s32(i);
|
||||||
igx = vcgtq_f32(fi, aux);
|
igx = vcgtq_f32(fi, aux);
|
||||||
@ -758,7 +758,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
|
|||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
// Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
@ -769,4 +769,4 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#endif /*INCLUDED_volk_gnsssdr_32fc_xn_resampler_32fc_xn_H*/
|
#endif /* INCLUDED_volk_gnsssdr_32fc_xn_resampler_32fc_xn_H */
|
||||||
|
@ -121,7 +121,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /*LV_HAVE_AVX2*/
|
#endif /* LV_HAVE_AVX2 */
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_AVX
|
#ifdef LV_HAVE_AVX
|
||||||
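Note: the operation all of these index-max kernels accelerate is an argmax over signed bytes. For reference, a generic scalar version (a sketch, not the library's generic protokernel verbatim; the kernels use plain char, whose signedness is implementation-defined):

    /* Write the index of the first maximum of src[0..num_points) to *target. */
    void index_max_8i(unsigned int *target, const char *src, unsigned int num_points)
    {
        char max = src[0];
        unsigned int index = 0;
        unsigned int i;
        for (i = 1; i < num_points; i++)
            {
                if (src[i] > max)
                    {
                        max = src[i];
                        index = i;
                    }
            }
        *target = index;
    }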
@ -156,7 +156,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
|
|||||||
compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
|
compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
|
||||||
compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
|
compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
|
||||||
|
|
||||||
//compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
|
// compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
|
||||||
compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1);
|
compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1);
|
||||||
|
|
||||||
if (!_mm256_testc_si256(compareResults, ones))
|
if (!_mm256_testc_si256(compareResults, ones))
|
||||||
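Note: the insertf128 expression used here in place of _mm256_set_m128i (which, as the comment says, is missing from some immintrin.h versions) can be wrapped once instead of repeated inline. A sketch of such a fallback helper:

    #include <immintrin.h>

    /* Fallback for _mm256_set_m128i(hi, lo) on headers that lack it. */
    static inline __m256i set_m128i_fallback(__m128i hi, __m128i lo)
    {
        return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
    }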
@ -437,7 +437,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
|
|||||||
compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
|
compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
|
||||||
compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
|
compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
|
||||||
|
|
||||||
//compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
|
// compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
|
||||||
compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1);
|
compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1);
|
||||||
|
|
||||||
if (!_mm256_testc_si256(compareResults, ones))
|
if (!_mm256_testc_si256(compareResults, ones))
|
||||||
|
@ -313,7 +313,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0
|
|||||||
{
|
{
|
||||||
currentValues = _mm256_load_si256((__m256i*)inputPtr);
|
currentValues = _mm256_load_si256((__m256i*)inputPtr);
|
||||||
compareResults = _mm256_max_epi8(maxValues, currentValues);
|
compareResults = _mm256_max_epi8(maxValues, currentValues);
|
||||||
maxValues = compareResults; //_mm256_blendv_epi8(currentValues, maxValues, compareResults);
|
maxValues = compareResults; // _mm256_blendv_epi8(currentValues, maxValues, compareResults);
|
||||||
inputPtr += 32;
|
inputPtr += 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -113,7 +113,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const
|
|||||||
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
|
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
|
||||||
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
|
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
|
||||||
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
|
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
|
||||||
//tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
|
// tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
|
||||||
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1));
|
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1));
|
||||||
_mm256_storeu_ps((float*)c, tmp);
|
_mm256_storeu_ps((float*)c, tmp);
|
||||||
|
|
||||||
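Note: these AVX hunks conjugate packed 8-bit complex samples; whatever the lane-shuffling details, the scalar semantics are simply "keep I, negate Q". A reference sketch of those semantics (not the SIMD mechanism):

    /* Conjugate interleaved 8-bit complex samples: keep I, negate Q. */
    void conjugate_8ic(signed char *out, const signed char *in, unsigned int num_points)
    {
        unsigned int i;
        for (i = 0; i < num_points; i++)
            {
                out[2 * i] = in[2 * i];
                out[2 * i + 1] = (signed char)(-in[2 * i + 1]); /* beware: -(-128) wraps back to -128 */
            }
    }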
@ -230,7 +230,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const
|
|||||||
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
|
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
|
||||||
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
|
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
|
||||||
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
|
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
|
||||||
//tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
|
// tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
|
||||||
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1));
|
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1));
|
||||||
_mm256_store_ps((float*)c, tmp);
|
_mm256_store_ps((float*)c, tmp);
|
||||||
|
|
||||||
|
@ -114,46 +114,6 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeV
|
|||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSSE3 */
|
#endif /* LV_HAVE_SSSE3 */
|
||||||
|
|
||||||
//#ifdef LV_HAVE_SSE
|
|
||||||
//#include <xmmintrin.h>
|
|
||||||
//
|
|
||||||
//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
|
|
||||||
// unsigned int number = 0;
|
|
||||||
// const unsigned int quarterPoints = num_points / 4;
|
|
||||||
//
|
|
||||||
// const float* complexVectorPtr = (float*)complexVector;
|
|
||||||
// float* magnitudeVectorPtr = magnitudeVector;
|
|
||||||
//
|
|
||||||
// __m128 cplxValue1, cplxValue2, iValue, qValue, result;
|
|
||||||
// for(;number < quarterPoints; number++){
|
|
||||||
// cplxValue1 = _mm_loadu_ps(complexVectorPtr);
|
|
||||||
// complexVectorPtr += 4;
|
|
||||||
//
|
|
||||||
// cplxValue2 = _mm_loadu_ps(complexVectorPtr);
|
|
||||||
// complexVectorPtr += 4;
|
|
||||||
//
|
|
||||||
// // Arrange in i1i2i3i4 format
|
|
||||||
// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
|
|
||||||
// // Arrange in q1q2q3q4 format
|
|
||||||
// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
|
|
||||||
//
|
|
||||||
// iValue = _mm_mul_ps(iValue, iValue); // Square the I values
|
|
||||||
// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
|
|
||||||
//
|
|
||||||
// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
|
|
||||||
//
|
|
||||||
// _mm_storeu_ps(magnitudeVectorPtr, result);
|
|
||||||
// magnitudeVectorPtr += 4;
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// number = quarterPoints * 4;
|
|
||||||
// for(; number < num_points; number++){
|
|
||||||
// float val1Real = *complexVectorPtr++;
|
|
||||||
// float val1Imag = *complexVectorPtr++;
|
|
||||||
// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
|
|
||||||
// }
|
|
||||||
//}
|
|
||||||
//#endif /* LV_HAVE_SSE */
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
|
|
||||||
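Note: this hunk drops a long-dead commented-out SSE variant whose float signature never matched this file's 8-bit kernels. The semantics the live kernels implement, as a scalar sketch (i*i + q*q can exceed the 8-bit output range; the sketch keeps the same truncating cast):

    /* |z|^2 for interleaved 8-bit complex samples, truncated into a char output. */
    void magnitude_squared_8ic(char *magnitude, const signed char *complex_in, unsigned int num_points)
    {
        unsigned int i;
        for (i = 0; i < num_points; i++)
            {
                int re = complex_in[2 * i];
                int im = complex_in[2 * i + 1];
                magnitude[i] = (char)(re * re + im * im);
            }
    }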
@ -228,46 +188,6 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeV
|
|||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSSE3 */
|
#endif /* LV_HAVE_SSSE3 */
|
||||||
|
|
||||||
//#ifdef LV_HAVE_SSE
|
|
||||||
//#include <xmmintrin.h>
|
|
||||||
//
|
|
||||||
//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
|
|
||||||
// unsigned int number = 0;
|
|
||||||
// const unsigned int quarterPoints = num_points / 4;
|
|
||||||
//
|
|
||||||
// const float* complexVectorPtr = (float*)complexVector;
|
|
||||||
// float* magnitudeVectorPtr = magnitudeVector;
|
|
||||||
//
|
|
||||||
// __m128 cplxValue1, cplxValue2, iValue, qValue, result;
|
|
||||||
// for(;number < quarterPoints; number++){
|
|
||||||
// cplxValue1 = _mm_load_ps(complexVectorPtr);
|
|
||||||
// complexVectorPtr += 4;
|
|
||||||
//
|
|
||||||
// cplxValue2 = _mm_load_ps(complexVectorPtr);
|
|
||||||
// complexVectorPtr += 4;
|
|
||||||
//
|
|
||||||
// // Arrange in i1i2i3i4 format
|
|
||||||
// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
|
|
||||||
// // Arrange in q1q2q3q4 format
|
|
||||||
// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
|
|
||||||
//
|
|
||||||
// iValue = _mm_mul_ps(iValue, iValue); // Square the I values
|
|
||||||
// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
|
|
||||||
//
|
|
||||||
// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
|
|
||||||
//
|
|
||||||
// _mm_store_ps(magnitudeVectorPtr, result);
|
|
||||||
// magnitudeVectorPtr += 4;
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// number = quarterPoints * 4;
|
|
||||||
// for(; number < num_points; number++){
|
|
||||||
// float val1Real = *complexVectorPtr++;
|
|
||||||
// float val1Imag = *complexVectorPtr++;
|
|
||||||
// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
|
|
||||||
// }
|
|
||||||
//}
|
|
||||||
//#endif /* LV_HAVE_SSE */
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_ORC
|
#ifdef LV_HAVE_ORC
|
||||||
|
@ -56,6 +56,8 @@
|
|||||||
* \li out: Vector of the form lv_32fc_t out[n] = lv_cmake(cos(in[n]), sin(in[n]))
|
* \li out: Vector of the form lv_32fc_t out[n] = lv_cmake(cos(in[n]), sin(in[n]))
|
||||||
* \li phase: Pointer to a float containing the final phase, in radians.
|
* \li phase: Pointer to a float containing the final phase, in radians.
|
||||||
*
|
*
|
||||||
|
* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier
|
||||||
|
* Based on algorithms from the cephes library http://www.netlib.org/cephes/
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
@ -69,8 +71,7 @@
|
|||||||
|
|
||||||
#ifdef LV_HAVE_SSE2
|
#ifdef LV_HAVE_SSE2
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
|
|
||||||
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
|
|
||||||
static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
|
static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_32fc_t *bPtr = out;
|
lv_32fc_t *bPtr = out;
|
||||||
@ -225,8 +226,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t *out, const fl
|
|||||||
|
|
||||||
#ifdef LV_HAVE_SSE2
|
#ifdef LV_HAVE_SSE2
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
|
|
||||||
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
|
|
||||||
static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
|
static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_32fc_t *bPtr = out;
|
lv_32fc_t *bPtr = out;
|
||||||
@ -459,8 +459,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t *out, co
|
|||||||
|
|
||||||
#ifdef LV_HAVE_AVX2
|
#ifdef LV_HAVE_AVX2
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/
|
|
||||||
* Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/
|
|
||||||
static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
|
static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_32fc_t *bPtr = out;
|
lv_32fc_t *bPtr = out;
|
||||||
@ -647,8 +646,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const fl
|
|||||||
|
|
||||||
#ifdef LV_HAVE_AVX2
|
#ifdef LV_HAVE_AVX2
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/
|
|
||||||
* Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/
|
|
||||||
static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
|
static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_32fc_t *bPtr = out;
|
lv_32fc_t *bPtr = out;
|
||||||
@ -835,8 +833,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const fl
|
|||||||
|
|
||||||
#ifdef LV_HAVE_NEONV7
|
#ifdef LV_HAVE_NEONV7
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
/* Adapted from http://gruntthepeon.free.fr/ssemath/neon_mathfun.h, original code from Julien Pommier */
|
|
||||||
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
|
|
||||||
static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
|
static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_32fc_t *bPtr = out;
|
lv_32fc_t *bPtr = out;
|
||||||
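Note: all the sincos variants above implement the contract stated in the header comment, out[n] = lv_cmake(cos(phase), sin(phase)) with the phase accumulating by phase_inc and handed back through *phase; the SIMD paths evaluate the cephes polynomial approximations (per the Pommier attribution being moved here) instead of calling libm. A plain-C99 reference of the contract, treating lv_32fc_t as float complex:

    #include <complex.h>
    #include <math.h>

    void sincos_32fc_reference(float complex *out, const float phase_inc,
                               float *phase, unsigned int num_points)
    {
        float p = *phase;
        unsigned int i;
        for (i = 0; i < num_points; i++)
            {
                out[i] = cosf(p) + sinf(p) * I;
                p += phase_inc;
            }
        *phase = p; /* final phase handed back, as documented */
    }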
|
@ -7,6 +7,7 @@
|
|||||||
#include <INTEGER.h>
|
#include <INTEGER.h>
|
||||||
#include <asn_codecs_prim.h> /* Encoder and decoder of a primitive type */
|
#include <asn_codecs_prim.h> /* Encoder and decoder of a primitive type */
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
|
#include <inttypes.h>
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* INTEGER basic type description.
|
* INTEGER basic type description.
|
||||||
@ -146,7 +147,7 @@ INTEGER__dump(asn_TYPE_descriptor_t *td, const INTEGER_t *st, asn_app_consume_by
|
|||||||
scr = (char *)alloca(scrsize);
|
scr = (char *)alloca(scrsize);
|
||||||
if(plainOrXER == 0)
|
if(plainOrXER == 0)
|
||||||
ret = snprintf(scr, scrsize,
|
ret = snprintf(scr, scrsize,
|
||||||
"%lld (%s)", accum, el->enum_name);
|
"%+"PRId64"(%s)", accum, el->enum_name);
|
||||||
else
|
else
|
||||||
ret = snprintf(scr, scrsize,
|
ret = snprintf(scr, scrsize,
|
||||||
"<%s/>", el->enum_name);
|
"<%s/>", el->enum_name);
|
||||||
@ -160,7 +161,7 @@ INTEGER__dump(asn_TYPE_descriptor_t *td, const INTEGER_t *st, asn_app_consume_by
|
|||||||
scr = scratch;
|
scr = scratch;
|
||||||
ret = snprintf(scr, scrsize,
|
ret = snprintf(scr, scrsize,
|
||||||
(specs && specs->field_unsigned)
|
(specs && specs->field_unsigned)
|
||||||
?"%llu":"%lld", accum);
|
?"%"PRIu64:"%+"PRId64, accum);
|
||||||
}
|
}
|
||||||
assert(ret > 0 && (size_t)ret < scrsize);
|
assert(ret > 0 && (size_t)ret < scrsize);
|
||||||
return (cb(scr, ret, app_key) < 0) ? -1 : ret;
|
return (cb(scr, ret, app_key) < 0) ? -1 : ret;
|
||||||
|
@ -7,6 +7,7 @@
|
|||||||
#include <INTEGER.h>
|
#include <INTEGER.h>
|
||||||
#include <asn_codecs_prim.h> /* Encoder and decoder of a primitive type */
|
#include <asn_codecs_prim.h> /* Encoder and decoder of a primitive type */
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
|
#include <inttypes.h>
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* INTEGER basic type description.
|
* INTEGER basic type description.
|
||||||
@ -146,7 +147,7 @@ INTEGER__dump(asn_TYPE_descriptor_t *td, const INTEGER_t *st, asn_app_consume_by
|
|||||||
scr = (char *)alloca(scrsize);
|
scr = (char *)alloca(scrsize);
|
||||||
if(plainOrXER == 0)
|
if(plainOrXER == 0)
|
||||||
ret = snprintf(scr, scrsize,
|
ret = snprintf(scr, scrsize,
|
||||||
"%lld (%s)", accum, el->enum_name);
|
"%+"PRId64"(%s)", accum, el->enum_name);
|
||||||
else
|
else
|
||||||
ret = snprintf(scr, scrsize,
|
ret = snprintf(scr, scrsize,
|
||||||
"<%s/>", el->enum_name);
|
"<%s/>", el->enum_name);
|
||||||
@ -160,7 +161,7 @@ INTEGER__dump(asn_TYPE_descriptor_t *td, const INTEGER_t *st, asn_app_consume_by
|
|||||||
scr = scratch;
|
scr = scratch;
|
||||||
ret = snprintf(scr, scrsize,
|
ret = snprintf(scr, scrsize,
|
||||||
(specs && specs->field_unsigned)
|
(specs && specs->field_unsigned)
|
||||||
?"%llu":"%lld", accum);
|
?"%"PRIu64:"%+"PRId64, accum);
|
||||||
}
|
}
|
||||||
assert(ret > 0 && (size_t)ret < scrsize);
|
assert(ret > 0 && (size_t)ret < scrsize);
|
||||||
return (cb(scr, ret, app_key) < 0) ? -1 : ret;
|
return (cb(scr, ret, app_key) < 0) ? -1 : ret;
|
||||||
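Note: the motivation behind both INTEGER.c hunks is portability of the format strings: "%lld"/"%llu" assume the argument is a long long, while the <inttypes.h> macros expand to the correct conversion specifier for fixed-width 64-bit types on every platform (assuming the 64-bit accumulator the patched strings imply). A self-contained illustration:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
        int64_t accum = -1234567890123LL;
        uint64_t uaccum = 1234567890123ULL;
        printf("%+" PRId64 "\n", accum); /* '+' forces an explicit sign, as in the hunk */
        printf("%" PRIu64 "\n", uaccum);
        return 0;
    }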
|