1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2025-01-15 11:45:47 +00:00
This commit is contained in:
Carles Fernandez 2019-08-19 13:14:26 +02:00
commit bf8ad2d37b
31 changed files with 469 additions and 1563 deletions

View File

@ -2340,7 +2340,7 @@ int valpos(rtk_t *rtk, const double *v, const double *R, const int *vflg,
type = (vflg[i] >> 4) & 0xF; type = (vflg[i] >> 4) & 0xF;
freq = vflg[i] & 0xF; freq = vflg[i] & 0xF;
stype = type == 0 ? 'L' : (type == 1 ? 'L' : 'C'); stype = type == 0 ? 'L' : (type == 1 ? 'L' : 'C');
errmsg(rtk, "large residual (sat=%2d-%2d %s%d v=%6.3f sig=%.3f)\n", errmsg(rtk, "large residual (sat=%2d-%2d %c%d v=%6.3f sig=%.3f)\n",
sat1, sat2, stype, freq + 1, v[i], std::sqrt(R[i + i * nv])); sat1, sat2, stype, freq + 1, v[i], std::sqrt(R[i + i * nv]));
} }
#if 0 /* omitted v.2.4.0 */ #if 0 /* omitted v.2.4.0 */

View File

@ -58,6 +58,7 @@
#include <cerrno> #include <cerrno>
#include <cstring> #include <cstring>
#include <fcntl.h> #include <fcntl.h>
#include <inttypes.h>
#include <netdb.h> #include <netdb.h>
#include <netinet/tcp.h> #include <netinet/tcp.h>
#include <string> #include <string>
@ -461,7 +462,7 @@ void closefile(file_t *file)
{ {
return; return;
} }
tracet(3, "closefile: fp=%d\n", file->fp); tracet(3, "closefile: fpos=%" PRIdPTR "\n", file->fp);
closefile_(file); closefile_(file);
free(file); free(file);
} }
@ -472,7 +473,7 @@ void swapfile(file_t *file, gtime_t time, char *msg)
{ {
char openpath[MAXSTRPATH]; char openpath[MAXSTRPATH];
tracet(3, "swapfile: fp=%d time=%s\n", file->fp, time_str(time, 0)); tracet(3, "swapfile: fpos=%" PRIdPTR "\n time=%s\n", file->fp, time_str(time, 0));
/* return if old swap file open */ /* return if old swap file open */
if (file->fp_tmp || file->fp_tag_tmp) if (file->fp_tmp || file->fp_tag_tmp)
@ -500,7 +501,7 @@ void swapfile(file_t *file, gtime_t time, char *msg)
/* close old swap file -------------------------------------------------------*/ /* close old swap file -------------------------------------------------------*/
void swapclose(file_t *file) void swapclose(file_t *file)
{ {
tracet(3, "swapclose: fp_tmp=%d\n", file->fp_tmp); tracet(3, "swapclose: fp_tmp=%" PRIdPTR "\n", file->fp_tmp);
if (file->fp_tmp) if (file->fp_tmp)
{ {
fclose(file->fp_tmp); fclose(file->fp_tmp);
@ -534,7 +535,7 @@ int readfile(file_t *file, unsigned char *buff, int nmax, char *msg)
{ {
return 0; return 0;
} }
tracet(4, "readfile: fp=%d nmax=%d\n", file->fp, nmax); tracet(4, "readfile: fp=%zd nmax=%d\n", file->fp, nmax);
if (file->fp == stdin) if (file->fp == stdin)
{ {
@ -617,7 +618,7 @@ int readfile(file_t *file, unsigned char *buff, int nmax, char *msg)
sprintf(msg, "end"); sprintf(msg, "end");
} }
} }
tracet(5, "readfile: fp=%d nr=%d fpos=%d\n", file->fp, nr, file->fpos); tracet(5, "readfile: fpos=%" PRIdPTR "\n nr=%d fpos=%zd\n", file->fp, nr, file->fpos);
return nr; return nr;
} }
@ -640,7 +641,7 @@ int writefile(file_t *file, unsigned char *buff, int n, char *msg)
{ {
return 0; return 0;
} }
tracet(3, "writefile: fp=%d n=%d\n", file->fp, n); tracet(3, "writefile: fpos=%" PRIdPTR "\n n=%d\n", file->fp, n);
wtime = utc2gpst(timeget()); /* write time in gpst */ wtime = utc2gpst(timeget()); /* write time in gpst */
@ -693,7 +694,7 @@ int writefile(file_t *file, unsigned char *buff, int n, char *msg)
fflush(file->fp_tag_tmp); fflush(file->fp_tag_tmp);
} }
} }
tracet(5, "writefile: fp=%d ns=%d tick=%5d fpos=%d\n", file->fp, ns, tick, fpos); tracet(5, "writefile: fpos=%" PRIdPTR "\n ns=%d tick=%5d fpos=%zd\n", file->fp, ns, tick, fpos);
return static_cast<int>(ns); return static_cast<int>(ns);
} }
@ -1497,7 +1498,7 @@ int reqntrip_s(ntrip_t *ntrip, char *msg)
return 0; return 0;
} }
tracet(2, "reqntrip_s: send request state=%d ns=%d\n", ntrip->state, p - buff); tracet(2, "reqntrip_s: send request state=%d ns=%" PRIdPTR "\n", ntrip->state, p - buff);
tracet(5, "reqntrip_s: n=%d buff=\n%s\n", p - buff, buff); tracet(5, "reqntrip_s: n=%d buff=\n%s\n", p - buff, buff);
ntrip->state = 1; ntrip->state = 1;
return 1; return 1;
@ -1535,7 +1536,7 @@ int reqntrip_c(ntrip_t *ntrip, char *msg)
return 0; return 0;
} }
tracet(2, "reqntrip_c: send request state=%d ns=%d\n", ntrip->state, p - buff); tracet(2, "reqntrip_c: send request state=%d ns=%" PRIdPTR "\n", ntrip->state, p - buff);
tracet(5, "reqntrip_c: n=%d buff=\n%s\n", p - buff, buff); tracet(5, "reqntrip_c: n=%d buff=\n%s\n", p - buff, buff);
ntrip->state = 1; ntrip->state = 1;
return 1; return 1;

View File

@ -9,9 +9,9 @@
#define NOMINMAX #define NOMINMAX
#endif #endif
//http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668 // http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668
#include < time.h > #include < time.h >
#include <windows.h> //I've omitted this line. #include <windows.h> // I've omitted this line.
#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) #if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64
#else #else
@ -51,7 +51,7 @@ static inline int gettimeofday(struct timeval *tv, struct timezone *tz)
tmpres <<= 32; tmpres <<= 32;
tmpres |= ft.dwLowDateTime; tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/ /* converting file time to unix epoch*/
tmpres -= DELTA_EPOCH_IN_MICROSECS; tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL);

View File

@ -24,9 +24,9 @@
#ifndef INCLUDED_LIBVOLK_GNSSSDR_COMMON_H #ifndef INCLUDED_LIBVOLK_GNSSSDR_COMMON_H
#define INCLUDED_LIBVOLK_GNSSSDR_COMMON_H #define INCLUDED_LIBVOLK_GNSSSDR_COMMON_H
//////////////////////////////////////////////////////////////////////// //
// Cross-platform attribute macros not included in VOLK // Cross-platform attribute macros not included in VOLK
//////////////////////////////////////////////////////////////////////// //
#if defined __GNUC__ #if defined __GNUC__
#define __VOLK_GNSSSDR_PREFETCH(addr) __builtin_prefetch(addr) #define __VOLK_GNSSSDR_PREFETCH(addr) __builtin_prefetch(addr)
#define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) __builtin_prefetch(addr, rw, locality) #define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) __builtin_prefetch(addr, rw, locality)
@ -41,9 +41,9 @@
#ifndef INCLUDED_LIBVOLK_COMMON_H #ifndef INCLUDED_LIBVOLK_COMMON_H
#define INCLUDED_LIBVOLK_COMMON_H #define INCLUDED_LIBVOLK_COMMON_H
//////////////////////////////////////////////////////////////////////// //
// Cross-platform attribute macros // Cross-platform attribute macros
//////////////////////////////////////////////////////////////////////// //
#if defined __GNUC__ #if defined __GNUC__
#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) #define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
#define __VOLK_ATTR_UNUSED __attribute__((unused)) #define __VOLK_ATTR_UNUSED __attribute__((unused))
@ -78,18 +78,18 @@
#define __VOLK_VOLATILE __volatile__ #define __VOLK_VOLATILE __volatile__
#endif #endif
//////////////////////////////////////////////////////////////////////// //
// Ignore annoying warnings in MSVC // Ignore annoying warnings in MSVC
//////////////////////////////////////////////////////////////////////// //
#if defined(_MSC_VER) #if defined(_MSC_VER)
#pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data #pragma warning(disable : 4244) // 'conversion' conversion from 'type1' to 'type2', possible loss of data
#pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2' #pragma warning(disable : 4305) // 'identifier' : truncation from 'type1' to 'type2'
#endif #endif
//////////////////////////////////////////////////////////////////////// //
// C-linkage declaration macros // C-linkage declaration macros
// FIXME: due to the usage of complex.h, require gcc for c-linkage // FIXME: due to the usage of complex.h, require gcc for c-linkage
//////////////////////////////////////////////////////////////////////// //
#if defined(__cplusplus) && (__GNUC__) #if defined(__cplusplus) && (__GNUC__)
#define __VOLK_DECL_BEGIN \ #define __VOLK_DECL_BEGIN \
extern "C" \ extern "C" \
@ -100,19 +100,19 @@
#define __VOLK_DECL_END #define __VOLK_DECL_END
#endif #endif
//////////////////////////////////////////////////////////////////////// //
// Define VOLK_API for library symbols // Define VOLK_API for library symbols
// http://gcc.gnu.org/wiki/Visibility // http://gcc.gnu.org/wiki/Visibility
//////////////////////////////////////////////////////////////////////// //
#ifdef volk_gnsssdr_EXPORTS #ifdef volk_gnsssdr_EXPORTS
#define VOLK_API __VOLK_ATTR_EXPORT #define VOLK_API __VOLK_ATTR_EXPORT
#else #else
#define VOLK_API __VOLK_ATTR_IMPORT #define VOLK_API __VOLK_ATTR_IMPORT
#endif #endif
//////////////////////////////////////////////////////////////////////// //
// The bit128 union used by some // The bit128 union used by some
//////////////////////////////////////////////////////////////////////// //
#include <inttypes.h> #include <inttypes.h>
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE

View File

@ -84,7 +84,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_generic(int16_t** result
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1);
local_code_chip_index = local_code_chip_index % code_length_chips; local_code_chip_index = local_code_chip_index % code_length_chips;
result[current_correlator_tap][n] = local_code[local_code_chip_index]; result[current_correlator_tap][n] = local_code[local_code_chip_index];
@ -150,7 +150,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -217,7 +217,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -288,7 +288,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result,
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -359,7 +359,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result,
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -437,7 +437,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result,
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -515,7 +515,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -568,7 +568,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
aux = vmulq_f32(code_phase_step_chips_reg, indexn); aux = vmulq_f32(code_phase_step_chips_reg, indexn);
aux = vaddq_f32(aux, aux2); aux = vaddq_f32(aux, aux2);
//floor // floor
i = vcvtq_s32_f32(aux); i = vcvtq_s32_f32(aux);
fi = vcvtq_f32_s32(i); fi = vcvtq_f32_s32(i);
igx = vcgtq_f32(fi, aux); igx = vcgtq_f32(fi, aux);
@ -600,7 +600,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];

View File

@ -131,36 +131,6 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3(lv_
#endif // SSE3 #endif // SSE3
//#ifdef LV_HAVE_SSE3
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
//{
//// phases must be normalized. Phase rotator expects a complex exponential input!
//float rem_carrier_phase_in_rad = 0.345;
//float phase_step_rad = 0.1;
//lv_32fc_t phase[1];
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
//lv_32fc_t phase_inc[1];
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
//int n;
//int num_a_vectors = 3;
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
//for(n = 0; n < num_a_vectors; n++)
//{
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
//}
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
//for(n = 0; n < num_a_vectors; n++)
//{
//volk_gnsssdr_free(in_a[n]);
//}
//volk_gnsssdr_free(in_a);
//}
//#endif // SSE3
#ifdef LV_HAVE_SSE3 #ifdef LV_HAVE_SSE3
static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
@ -224,36 +194,6 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2(lv_
#endif // AVX2 #endif // AVX2
//#ifdef LV_HAVE_AVX2
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
//{
//// phases must be normalized. Phase rotator expects a complex exponential input!
//float rem_carrier_phase_in_rad = 0.345;
//float phase_step_rad = 0.1;
//lv_32fc_t phase[1];
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
//lv_32fc_t phase_inc[1];
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
//int n;
//int num_a_vectors = 3;
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
//for(n = 0; n < num_a_vectors; n++)
//{
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
//}
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
//for(n = 0; n < num_a_vectors; n++)
//{
//volk_gnsssdr_free(in_a[n]);
//}
//volk_gnsssdr_free(in_a);
//}
//#endif // AVX2
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
@ -286,96 +226,4 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_
#endif // AVX2 #endif // AVX2
//#ifdef LV_HAVE_AVX2
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
//{
//// phases must be normalized. Phase rotator expects a complex exponential input!
//float rem_carrier_phase_in_rad = 0.345;
//float phase_step_rad = 0.1;
//lv_32fc_t phase[1];
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
//lv_32fc_t phase_inc[1];
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
//int n;
//int num_a_vectors = 3;
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
//for(n = 0; n < num_a_vectors; n++)
//{
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
//}
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
//for(n = 0; n < num_a_vectors; n++)
//{
//volk_gnsssdr_free(in_a[n]);
//}
//volk_gnsssdr_free(in_a);
//}
//#endif // AVX2
//#ifdef LV_HAVE_NEONV7
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
//{
//// phases must be normalized. Phase rotator expects a complex exponential input!
//float rem_carrier_phase_in_rad = 0.345;
//float phase_step_rad = 0.1;
//lv_32fc_t phase[1];
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
//lv_32fc_t phase_inc[1];
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
//int n;
//int num_a_vectors = 3;
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
//for(n = 0; n < num_a_vectors; n++)
//{
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
//}
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
//for(n = 0; n < num_a_vectors; n++)
//{
//volk_gnsssdr_free(in_a[n]);
//}
//volk_gnsssdr_free(in_a);
//}
//#endif // NEON
//#ifdef LV_HAVE_NEONV7
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
//{
//// phases must be normalized. Phase rotator expects a complex exponential input!
//float rem_carrier_phase_in_rad = 0.345;
//float phase_step_rad = 0.1;
//lv_32fc_t phase[1];
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
//lv_32fc_t phase_inc[1];
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
//int n;
//int num_a_vectors = 3;
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
//for(n = 0; n < num_a_vectors; n++)
//{
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
//}
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
//for(n = 0; n < num_a_vectors; n++)
//{
//volk_gnsssdr_free(in_a[n]);
//}
//volk_gnsssdr_free(in_a);
//}
//#endif // NEON
#endif // INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H #endif // INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H

View File

@ -200,34 +200,4 @@ static inline void volk_gnsssdr_16ic_conjugate_16ic_u_avx2(lv_16sc_t* cVector, c
} }
#endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX2 */
//
//
//#ifdef LV_HAVE_NEONV7
//#include <arm_neon.h>
//
//static inline void volk_gnsssdr_16ic_conjugate_16ic_neon(lv_16sc_t* cVector, const lv_16sc_t* aVector, unsigned int num_points)
//{
// const unsigned int sse_iters = num_points / 4;
// unsigned int i;
// lv_16sc_t* c = cVector;
// const lv_16sc_t* a = aVector;
// int16x4x2_t a_val;
//
// for (i = 0; i < sse_iters; ++i)
// {
// a_val = vld2_s16((const int16_t*)a);
// __VOLK_GNSSSDR_PREFETCH(a + 4);
// a_val.val[1] = vneg_s16(a_val.val[1]);
// vst2_s16((int16_t*)c, a_val);
// a += 4;
// c += 4;
// }
//
// for (i = sse_iters * 4; i < num_points; ++i)
// {
// *c++ = lv_conj(*a++);
// }
//}
//#endif /* LV_HAVE_NEONV7 */
#endif /* INCLUDED_volk_gnsssdr_16ic_conjugate_16ic_H */ #endif /* INCLUDED_volk_gnsssdr_16ic_conjugate_16ic_H */

View File

@ -72,7 +72,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu
{ {
int local_code_chip_index; int local_code_chip_index;
unsigned int n; unsigned int n;
//fesetround(FE_TONEAREST); // fesetround(FE_TONEAREST);
for (n = 0; n < num_output_samples; n++) for (n = 0; n < num_output_samples; n++)
{ {
// resample code for current tap // resample code for current tap
@ -83,7 +83,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu
} }
} }
#endif /*LV_HAVE_GENERIC*/ #endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE2 #ifdef LV_HAVE_SSE2
@ -91,7 +91,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu
static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float) static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float)
{ {
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); // _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
unsigned int number; unsigned int number;
const unsigned int quarterPoints = num_output_samples / 4; const unsigned int quarterPoints = num_output_samples / 4;
@ -104,8 +104,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul
__m128 _code_phase_out, _code_phase_out_with_offset; __m128 _code_phase_out, _code_phase_out_with_offset;
rem_code_phase_chips = rem_code_phase_chips - 0.5f; rem_code_phase_chips = rem_code_phase_chips - 0.5f;
_rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); // load float to all four float values in m128 register
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); // load float to all four float values in m128 register
__VOLK_ATTR_ALIGNED(16) __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4]; int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[0] = code_length_chips - 1;
@ -120,8 +120,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul
four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips; four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); // load float to all four float values in m128 register
_code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); // load float to all four float values in m128 register
__m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
@ -136,21 +136,21 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul
for (number = 0; number < quarterPoints; number++) for (number = 0; number < quarterPoints; number++)
{ {
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); // compute the code phase point with the phase step
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); // add the phase offset
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); // convert to integer
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); // test for negative values
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); // the negative values branch
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); // test for overflow values
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); // the negative values branch
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
_mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible // todo: optimize the local code lookup table with intrinsics, if possible
*_result++ = local_code[local_code_chip_index[0]]; *_result++ = local_code[local_code_chip_index[0]];
*_result++ = local_code[local_code_chip_index[1]]; *_result++ = local_code[local_code_chip_index[1]];
*_result++ = local_code[local_code_chip_index[2]]; *_result++ = local_code[local_code_chip_index[2]];
@ -176,7 +176,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul
static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float) static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float)
{ {
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); // _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
unsigned int number; unsigned int number;
const unsigned int quarterPoints = num_output_samples / 4; const unsigned int quarterPoints = num_output_samples / 4;
@ -189,8 +189,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul
__m128 _code_phase_out, _code_phase_out_with_offset; __m128 _code_phase_out, _code_phase_out_with_offset;
rem_code_phase_chips = rem_code_phase_chips - 0.5f; rem_code_phase_chips = rem_code_phase_chips - 0.5f;
_rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); // load float to all four float values in m128 register
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); // load float to all four float values in m128 register
__VOLK_ATTR_ALIGNED(16) __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4]; int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[0] = code_length_chips - 1;
@ -205,8 +205,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul
four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips; four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); // load float to all four float values in m128 register
_code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); // load float to all four float values in m128 register
__m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
@ -221,21 +221,21 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul
for (number = 0; number < quarterPoints; number++) for (number = 0; number < quarterPoints; number++)
{ {
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); // compute the code phase point with the phase step
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); // add the phase offset
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); // convert to integer
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); // test for negative values
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); // the negative values branch
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); // test for overflow values
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); // the negative values branch
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
_mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible // todo: optimize the local code lookup table with intrinsics, if possible
*_result++ = local_code[local_code_chip_index[0]]; *_result++ = local_code[local_code_chip_index[0]];
*_result++ = local_code[local_code_chip_index[1]]; *_result++ = local_code[local_code_chip_index[1]];
*_result++ = local_code[local_code_chip_index[2]]; *_result++ = local_code[local_code_chip_index[2]];
@ -275,8 +275,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
rem_code_phase_chips = rem_code_phase_chips - 0.5f; rem_code_phase_chips = rem_code_phase_chips - 0.5f;
float32x4_t sign, PlusHalf, Round; float32x4_t sign, PlusHalf, Round;
_rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips); //load float to all four float values in m128 register _rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips); // load float to all four float values in m128 register
_code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in m128 register _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); // load float to all four float values in m128 register
__VOLK_ATTR_ALIGNED(16) __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4]; int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[0] = code_length_chips - 1;
@ -291,8 +291,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips; four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in m128 register _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); // load float to all four float values in m128 register
_code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); // load float to all four float values in m128 register
int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
uint32x4_t negative_indexes, overflow_indexes; uint32x4_t negative_indexes, overflow_indexes;
@ -307,24 +307,24 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
for (number = 0; number < quarterPoints; number++) for (number = 0; number < quarterPoints; number++)
{ {
_code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); // compute the code phase point with the phase step
_code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); // add the phase offset
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31))); sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31)));
PlusHalf = vaddq_f32(_code_phase_out_with_offset, half); PlusHalf = vaddq_f32(_code_phase_out_with_offset, half);
Round = vsubq_f32(PlusHalf, sign); Round = vsubq_f32(PlusHalf, sign);
_code_phase_out_int = vcvtq_s32_f32(Round); _code_phase_out_int = vcvtq_s32_f32(Round);
negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values negative_indexes = vcltq_s32(_code_phase_out_int, zero); // test for negative values
_code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); // the negative values branch
_code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int))); _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); // test for overflow values
_code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); // the negative values branch
_code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg))); _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg)));
vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible // todo: optimize the local code lookup table with intrinsics, if possible
*_result++ = local_code[local_code_chip_index[0]]; *_result++ = local_code[local_code_chip_index[0]];
*_result++ = local_code[local_code_chip_index[1]]; *_result++ = local_code[local_code_chip_index[1]];
*_result++ = local_code[local_code_chip_index[2]]; *_result++ = local_code[local_code_chip_index[2]];
@ -344,4 +344,4 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
#endif /* LV_HAVE_NEONV7 */ #endif /* LV_HAVE_NEONV7 */
#endif /*INCLUDED_volk_gnsssdr_16ic_resampler_fast_16ic_H*/ #endif /* INCLUDED_volk_gnsssdr_16ic_resampler_fast_16ic_H */

View File

@ -79,13 +79,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic(lv_16sc_t* ou
// Regenerate phase // Regenerate phase
if (i % 512 == 0) if (i % 512 == 0)
{ {
//printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); // printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
#ifdef __cplusplus #ifdef __cplusplus
(*phase) /= std::abs((*phase)); (*phase) /= std::abs((*phase));
#else #else
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
#endif #endif
//printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); // printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
} }
} }
} }
@ -112,13 +112,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic_reload(lv_16s
(*phase) *= phase_inc; (*phase) *= phase_inc;
} }
// Regenerate phase // Regenerate phase
//printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); // printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
#ifdef __cplusplus #ifdef __cplusplus
(*phase) /= std::abs((*phase)); (*phase) /= std::abs((*phase));
#else #else
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
#endif #endif
//printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); // printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
} }
for (j = 0; j < num_points % ROTATOR_RELOAD; j++) for (j = 0; j < num_points % ROTATOR_RELOAD; j++)
{ {
@ -161,8 +161,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
for (number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -171,7 +171,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -179,11 +179,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples // next two samples
_in += 2; _in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8); __VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -192,7 +192,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -214,7 +214,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
} }
//next two samples // next two samples
_in += 2; _in += 2;
_out += 4; _out += 4;
} }
@ -268,8 +268,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
{ {
for (j = 0; j < ROTATOR_RELOAD; j++) for (j = 0; j < ROTATOR_RELOAD; j++)
{ {
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -278,7 +278,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -286,11 +286,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples // next two samples
_in += 2; _in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8); __VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -299,7 +299,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -311,7 +311,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_mm_store_si128((__m128i*)_out, result); _mm_store_si128((__m128i*)_out, result);
//next two samples // next two samples
_in += 2; _in += 2;
_out += 4; _out += 4;
} }
@ -325,8 +325,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
{ {
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -335,7 +335,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -343,11 +343,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples // next two samples
_in += 2; _in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8); __VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -356,7 +356,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -368,7 +368,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_mm_store_si128((__m128i*)_out, result); _mm_store_si128((__m128i*)_out, result);
//next two samples // next two samples
_in += 2; _in += 2;
_out += 4; _out += 4;
} }
@ -418,8 +418,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
for (number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -428,7 +428,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -436,11 +436,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples // next two samples
_in += 2; _in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8); __VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -449,7 +449,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -471,7 +471,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
} }
//next two samples // next two samples
_in += 2; _in += 2;
_out += 4; _out += 4;
} }
@ -525,8 +525,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
{ {
for (j = 0; j < ROTATOR_RELOAD; j++) for (j = 0; j < ROTATOR_RELOAD; j++)
{ {
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -535,7 +535,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -543,11 +543,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples // next two samples
_in += 2; _in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8); __VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -556,7 +556,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -568,7 +568,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_mm_storeu_si128((__m128i*)_out, result); _mm_storeu_si128((__m128i*)_out, result);
//next two samples // next two samples
_in += 2; _in += 2;
_out += 4; _out += 4;
} }
@ -582,8 +582,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
{ {
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -592,7 +592,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -600,11 +600,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples // next two samples
_in += 2; _in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8); __VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -613,7 +613,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -625,7 +625,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_mm_storeu_si128((__m128i*)_out, result); _mm_storeu_si128((__m128i*)_out, result);
//next two samples // next two samples
_in += 2; _in += 2;
_out += 4; _out += 4;
} }
@ -746,9 +746,9 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
// Regenerate phase // Regenerate phase
if ((i % 512) == 0) if ((i % 512) == 0)
{ {
//printf("Computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); // printf("Computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0]))));
phase_est = arg_phase0 + (i + 1) * 4 * arg_phase_inc; phase_est = arg_phase0 + (i + 1) * 4 * arg_phase_inc;
//printf("Estimated phase: %f\n\n", cos(phase_est)); // printf("Estimated phase: %f\n\n", cos(phase_est));
*phase = lv_cmake(cos(phase_est), sin(phase_est)); *phase = lv_cmake(cos(phase_est), sin(phase_est));
phase2 = (lv_32fc_t)(*phase) * phase_inc; phase2 = (lv_32fc_t)(*phase) * phase_inc;
@ -887,9 +887,9 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
_out += 4; _out += 4;
} }
// Regenerate phase // Regenerate phase
//printf("Computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); // printf("Computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0]))));
phase_est = arg_phase0 + (n + 1) * ROTATOR_RELOAD * 4 * arg_phase_inc; phase_est = arg_phase0 + (n + 1) * ROTATOR_RELOAD * 4 * arg_phase_inc;
//printf("Estimated phase: %f\n\n", cos(phase_est)); // printf("Estimated phase: %f\n\n", cos(phase_est));
*phase = lv_cmake(cos(phase_est), sin(phase_est)); *phase = lv_cmake(cos(phase_est), sin(phase_est));
phase2 = (lv_32fc_t)(*phase) * phase_inc; phase2 = (lv_32fc_t)(*phase) * phase_inc;
phase3 = phase2 * phase_inc; phase3 = phase2 * phase_inc;

View File

@ -28,7 +28,7 @@
* GNU General Public License for more details. * GNU General Public License for more details.
* *
* You should have received a copy of the GNU General Public License * You should have received a copy of the GNU General Public License
 * along with GNSS-SDR. If not, see <https://www.gnu.org/licenses/>. * along with GNSS-SDR. If not, see <https://www.gnu.org/licenses/>.
* *
* ------------------------------------------------------------------------- * -------------------------------------------------------------------------
*/ */
@ -77,7 +77,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
} }
} }
#endif /*LV_HAVE_GENERIC*/ #endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE2 #ifdef LV_HAVE_SSE2
@ -108,7 +108,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
for (number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_load_si128((__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_a + 8); __VOLK_GNSSSDR_PREFETCH(_in_a + 8);
b = _mm_load_si128((__m128i*)_in_b); b = _mm_load_si128((__m128i*)_in_b);
__VOLK_GNSSSDR_PREFETCH(_in_b + 8); __VOLK_GNSSSDR_PREFETCH(_in_b + 8);
@ -123,7 +123,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!
realcacc = _mm_adds_epi16(realcacc, real); realcacc = _mm_adds_epi16(realcacc, real);
imagcacc = _mm_adds_epi16(imagcacc, imag); imagcacc = _mm_adds_epi16(imagcacc, imag);
@ -186,10 +186,10 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con
for (number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i] // std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1] // imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_loadu_si128((__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_a + 8); __VOLK_GNSSSDR_PREFETCH(_in_a + 8);
b = _mm_loadu_si128((__m128i*)_in_b); b = _mm_loadu_si128((__m128i*)_in_b);
__VOLK_GNSSSDR_PREFETCH(_in_b + 8); __VOLK_GNSSSDR_PREFETCH(_in_b + 8);
@ -204,7 +204,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!
realcacc = _mm_adds_epi16(realcacc, real); realcacc = _mm_adds_epi16(realcacc, real);
imagcacc = _mm_adds_epi16(imagcacc, imag); imagcacc = _mm_adds_epi16(imagcacc, imag);
@ -281,7 +281,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
imag1 = _mm256_mullo_epi16(a, b_sl); imag1 = _mm256_mullo_epi16(a, b_sl);
imag2 = _mm256_mullo_epi16(b, a_sl); imag2 = _mm256_mullo_epi16(b, a_sl);
imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!
realcacc = _mm256_adds_epi16(realcacc, real); realcacc = _mm256_adds_epi16(realcacc, real);
imagcacc = _mm256_adds_epi16(imagcacc, imag); imagcacc = _mm256_adds_epi16(imagcacc, imag);
@ -359,7 +359,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
imag1 = _mm256_mullo_epi16(a, b_sl); imag1 = _mm256_mullo_epi16(a, b_sl);
imag2 = _mm256_mullo_epi16(b, a_sl); imag2 = _mm256_mullo_epi16(b, a_sl);
imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!
realcacc = _mm256_adds_epi16(realcacc, real); realcacc = _mm256_adds_epi16(realcacc, real);
imagcacc = _mm256_adds_epi16(imagcacc, imag); imagcacc = _mm256_adds_epi16(imagcacc, imag);
@ -571,4 +571,4 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out
#endif /* LV_HAVE_NEONV7 */ #endif /* LV_HAVE_NEONV7 */
#endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_H*/ #endif /* INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_H */

View File

@ -77,15 +77,15 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(lv_16sc_t* resu
result[n_vec] = lv_cmake(0, 0); result[n_vec] = lv_cmake(0, 0);
for (n = 0; n < num_points; n++) for (n = 0; n < num_points; n++)
{ {
//r*a.r - i*a.i, i*a.r + r*a.i // r*a.r - i*a.i, i*a.r + r*a.i
//result[n_vec]+=in_common[n]*in_a[n_vec][n]; // result[n_vec]+=in_common[n]*in_a[n_vec][n];
lv_16sc_t tmp = in_common[n] * in_a[n_vec][n]; lv_16sc_t tmp = in_common[n] * in_a[n_vec][n];
result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp)));
} }
} }
} }
#endif /*LV_HAVE_GENERIC*/ #endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
@ -106,7 +106,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(lv_16sc_t*
} }
} }
#endif /*LV_HAVE_GENERIC*/ #endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE2 #ifdef LV_HAVE_SSE2
@ -145,11 +145,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
for (index = 0; index < sse_iters; index++) for (index = 0; index < sse_iters; index++)
{ {
// b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] // b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
b = _mm_load_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg b = _mm_load_si128((__m128i*)_in_common); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
@ -240,11 +240,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
for (index = 0; index < sse_iters; index++) for (index = 0; index < sse_iters; index++)
{ {
// b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] // b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg b = _mm_loadu_si128((__m128i*)_in_common); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
@ -522,12 +522,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
for (index = 0; index < neon_iters; index++) for (index = 0; index < neon_iters; index++)
{ {
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg b_val = vld2_s16((int16_t*)_in_common); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
//__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][index*4] + 8); // __VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][index*4] + 8);
// multiply the real*real and imag*imag to get real result // multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
@ -609,7 +609,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
for (index = 0; index < neon_iters; index++) for (index = 0; index < neon_iters; index++)
{ {
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg b_val = vld2_s16((int16_t*)_in_common); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
@ -690,7 +690,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
for (index = 0; index < neon_iters; index++) for (index = 0; index < neon_iters; index++)
{ {
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg b_val = vld2_s16((int16_t*)_in_common); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
@ -738,4 +738,4 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
} }
#endif /* LV_HAVE_NEONV7 */ #endif /* LV_HAVE_NEONV7 */
#endif /*INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_H*/ #endif /* INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_H */

View File

@ -68,12 +68,12 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_generic(lv_16sc_t* result,
unsigned int n; unsigned int n;
for (n = 0; n < num_points; n++) for (n = 0; n < num_points; n++)
{ {
//r*a.r - i*a.i, i*a.r + r*a.i // r*a.r - i*a.i, i*a.r + r*a.i
result[n] = in_a[n] * in_b[n]; result[n] = in_a[n] * in_b[n];
} }
} }
#endif /*LV_HAVE_GENERIC*/ #endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE2 #ifdef LV_HAVE_SSE2
@ -93,10 +93,10 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, con
lv_16sc_t* _out = out; lv_16sc_t* _out = out;
for (number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i] // std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1] // imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_load_si128((__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
b = _mm_load_si128((__m128i*)_in_b); b = _mm_load_si128((__m128i*)_in_b);
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
@ -147,10 +147,10 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, con
lv_16sc_t* _out = out; lv_16sc_t* _out = out;
for (number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i] // std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1] // imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_loadu_si128((__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
b = _mm_loadu_si128((__m128i*)_in_b); b = _mm_loadu_si128((__m128i*)_in_b);
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
@ -340,4 +340,4 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const
} }
#endif /* LV_HAVE_NEONV7*/ #endif /* LV_HAVE_NEONV7*/
#endif /*INCLUDED_volk_gnsssdr_16ic_x2_multiply_16ic_H*/ #endif /* INCLUDED_volk_gnsssdr_16ic_x2_multiply_16ic_H */

View File

@ -89,33 +89,33 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(lv_16sc
} }
for (n = 0; n < num_points; n++) for (n = 0; n < num_points; n++)
{ {
tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp16 = *in_common++; // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
// Regenerate phase // Regenerate phase
if (n % 256 == 0) if (n % 256 == 0)
{ {
//printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); // printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
#ifdef __cplusplus #ifdef __cplusplus
(*phase) /= std::abs((*phase)); (*phase) /= std::abs((*phase));
#else #else
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
#endif #endif
//printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); // printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
} }
(*phase) *= phase_inc; (*phase) *= phase_inc;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; lv_16sc_t tmp = tmp16 * in_a[n_vec][n];
//lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp)));
} }
} }
} }
#endif /*LV_HAVE_GENERIC*/ #endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
@ -137,14 +137,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(
{ {
for (j = 0; j < ROTATOR_RELOAD; j++) for (j = 0; j < ROTATOR_RELOAD; j++)
{ {
tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp16 = *in_common++; // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc; (*phase) *= phase_inc;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
lv_16sc_t tmp = tmp16 * in_a[n_vec][n * ROTATOR_RELOAD + j]; lv_16sc_t tmp = tmp16 * in_a[n_vec][n * ROTATOR_RELOAD + j];
//lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp)));
} }
} }
@ -152,27 +152,27 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(
#ifdef __cplusplus #ifdef __cplusplus
(*phase) /= std::abs((*phase)); (*phase) /= std::abs((*phase));
#else #else
//(*phase) /= cabsf((*phase)); // (*phase) /= cabsf((*phase));
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
#endif #endif
} }
for (j = 0; j < num_points % ROTATOR_RELOAD; j++) for (j = 0; j < num_points % ROTATOR_RELOAD; j++)
{ {
tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp16 = *in_common++; // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc; (*phase) *= phase_inc;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
lv_16sc_t tmp = tmp16 * in_a[n_vec][(num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j]; lv_16sc_t tmp = tmp16 * in_a[n_vec][(num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j];
//lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp)));
} }
} }
} }
#endif /*LV_HAVE_GENERIC*/ #endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE3 #ifdef LV_HAVE_SSE3
@ -228,9 +228,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
for (number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
// Phase rotation on operand in_common starts here: // Phase rotation on operand in_common starts here:
//printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); // printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase));
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -239,7 +239,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -247,11 +247,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples // next two samples
_in_common += 2; _in_common += 2;
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -260,7 +260,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -271,12 +271,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
// store four rotated in_common samples in the register b // store four rotated in_common samples in the register b
b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic
//next two samples // next two samples
_in_common += 2; _in_common += 2;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
@ -331,12 +331,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
//(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); // (*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]);
(*phase) = two_phase_acc[0]; (*phase) = two_phase_acc[0];
for (n = sse_iters * 4; n < num_points; n++) for (n = sse_iters * 4; n < num_points; n++)
{ {
tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp16 = in_common[n]; // printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc; (*phase) *= phase_inc;
@ -344,7 +344,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; lv_16sc_t tmp = tmp16 * in_a[n_vec][n];
//lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
} }
@ -411,9 +411,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
for (j = 0; j < ROTATOR_RELOAD; j++) for (j = 0; j < ROTATOR_RELOAD; j++)
{ {
// Phase rotation on operand in_common starts here: // Phase rotation on operand in_common starts here:
//printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); // printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase));
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -422,7 +422,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -430,11 +430,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples // next two samples
_in_common += 2; _in_common += 2;
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -443,7 +443,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -454,12 +454,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
// store four rotated in_common samples in the register b // store four rotated in_common samples in the register b
b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic
//next two samples // next two samples
_in_common += 2; _in_common += 2;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
@ -488,8 +488,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
{ {
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -498,7 +498,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -506,11 +506,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples // next two samples
_in_common += 2; _in_common += 2;
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -519,7 +519,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -530,12 +530,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
// store four rotated in_common samples in the register b // store four rotated in_common samples in the register b
b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic
//next two samples // next two samples
_in_common += 2; _in_common += 2;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
@ -582,12 +582,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
//(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); // (*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]);
(*phase) = two_phase_acc[0]; (*phase) = two_phase_acc[0];
for (n = sse_iters * 4; n < num_points; n++) for (n = sse_iters * 4; n < num_points; n++)
{ {
tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp16 = in_common[n]; // printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc; (*phase) *= phase_inc;
@ -595,7 +595,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; lv_16sc_t tmp = tmp16 * in_a[n_vec][n];
//lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
} }
@ -657,9 +657,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
for (number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
// Phase rotation on operand in_common starts here: // Phase rotation on operand in_common starts here:
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -668,7 +668,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -676,11 +676,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples // next two samples
_in_common += 2; _in_common += 2;
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -689,7 +689,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -700,12 +700,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
// store four rotated in_common samples in the register b // store four rotated in_common samples in the register b
b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic
//next two samples // next two samples
_in_common += 2; _in_common += 2;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
@ -824,8 +824,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
for (number = 0; number < avx2_iters; number++) for (number = 0; number < avx2_iters; number++)
{ {
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -834,7 +834,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -842,11 +842,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples // next two samples
_in_common += 2; _in_common += 2;
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -855,7 +855,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -866,8 +866,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
// store four output samples // store four output samples
result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_in_common += 2; _in_common += 2;
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -876,7 +876,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -884,11 +884,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples // next two samples
_in_common += 2; _in_common += 2;
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 16); __VOLK_GNSSSDR_PREFETCH(_in_common + 16);
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -897,7 +897,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -1036,8 +1036,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
{ {
for (j = 0; j < ROTATOR_RELOAD; j++) for (j = 0; j < ROTATOR_RELOAD; j++)
{ {
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -1046,7 +1046,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -1054,11 +1054,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples // next two samples
_in_common += 2; _in_common += 2;
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -1067,7 +1067,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -1078,8 +1078,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
// store four output samples // store four output samples
result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_in_common += 2; _in_common += 2;
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -1088,7 +1088,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -1096,11 +1096,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples // next two samples
_in_common += 2; _in_common += 2;
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 16); __VOLK_GNSSSDR_PREFETCH(_in_common + 16);
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -1109,7 +1109,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -1152,8 +1152,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
for (j = 0; j < avx2_iters % ROTATOR_RELOAD; j++) for (j = 0; j < avx2_iters % ROTATOR_RELOAD; j++)
{ {
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -1162,7 +1162,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -1170,11 +1170,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples // next two samples
_in_common += 2; _in_common += 2;
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -1183,7 +1183,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -1194,8 +1194,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
// store four output samples // store four output samples
result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_in_common += 2; _in_common += 2;
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -1204,7 +1204,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -1212,11 +1212,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples // next two samples
_in_common += 2; _in_common += 2;
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 16); __VOLK_GNSSSDR_PREFETCH(_in_common + 16);
//complex 32fc multiplication b=a*two_phase_acc_reg // complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -1225,7 +1225,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@ -1414,8 +1414,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t*
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
//__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][number*4] + 8); // __VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][number*4] + 8);
// multiply the real*real and imag*imag to get real result // multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
@ -1474,7 +1474,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t*
for (n = neon_iters * 4; n < num_points; n++) for (n = neon_iters * 4; n < num_points; n++)
{ {
tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp16_ = in_common[n]; // printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_)));
(*phase) *= phase_inc; (*phase) *= phase_inc;
@ -1513,7 +1513,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
float arg_phase0 = cargf(*phase); float arg_phase0 = cargf(*phase);
float arg_phase_inc = cargf(phase_inc); float arg_phase_inc = cargf(phase_inc);
float phase_est; float phase_est;
//printf("arg phase0: %f", arg_phase0); // printf("arg phase0: %f", arg_phase0);
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
__VOLK_ATTR_ALIGNED(16) __VOLK_ATTR_ALIGNED(16)
float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)}; float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)};
@ -1605,9 +1605,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
// Regenerate phase // Regenerate phase
if ((number % 256) == 0) if ((number % 256) == 0)
{ {
//printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); // printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0]))));
phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc;
//printf("Estimated phase: %f\n\n", cos(phase_est)); // printf("Estimated phase: %f\n\n", cos(phase_est));
*phase = lv_cmake(cos(phase_est), sin(phase_est)); *phase = lv_cmake(cos(phase_est), sin(phase_est));
phase2 = (lv_32fc_t)(*phase) * phase_inc; phase2 = (lv_32fc_t)(*phase) * phase_inc;
@ -1624,14 +1624,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
// Round = vmulq_f32(_phase_real, _phase_real); // Round = vmulq_f32(_phase_real, _phase_real);
// Round = vmlaq_f32(Round, _phase_imag, _phase_imag); // Round = vmlaq_f32(Round, _phase_imag, _phase_imag);
// Round = vsqrtq_f32(Round);//printf("sqrt: %f \n", Round[0]); // Round = vsqrtq_f32(Round);// printf("sqrt: %f \n", Round[0]);
//Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]); // Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]);
//Round = vrecpeq_f32((Round); // Round = vrecpeq_f32((Round);
// _phase_real = vdivq_f32(_phase_real, Round); // _phase_real = vdivq_f32(_phase_real, Round);
// _phase_imag = vdivq_f32(_phase_imag, Round); // _phase_imag = vdivq_f32(_phase_imag, Round);
//_phase_real = vmulq_f32(_phase_real, Round); // _phase_real = vmulq_f32(_phase_real, Round);
//_phase_imag = vmulq_f32(_phase_imag, Round); // _phase_imag = vmulq_f32(_phase_imag, Round);
//printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0])); // printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0]));
} }
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
@ -1671,7 +1671,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
for (n = neon_iters * 4; n < num_points; n++) for (n = neon_iters * 4; n < num_points; n++)
{ {
tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp16_ = in_common[n]; // printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_)));
(*phase) *= phase_inc; (*phase) *= phase_inc;
@ -1804,9 +1804,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_
// Regenerate phase // Regenerate phase
if ((number % 256) == 0) if ((number % 256) == 0)
{ {
//printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); // printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0]))));
phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc;
//printf("Estimated phase: %f\n\n", cos(phase_est)); // printf("Estimated phase: %f\n\n", cos(phase_est));
*phase = lv_cmake(cos(phase_est), sin(phase_est)); *phase = lv_cmake(cos(phase_est), sin(phase_est));
phase2 = (lv_32fc_t)(*phase) * phase_inc; phase2 = (lv_32fc_t)(*phase) * phase_inc;
@ -1860,7 +1860,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_
for (n = neon_iters * 4; n < num_points; n++) for (n = neon_iters * 4; n < num_points; n++)
{ {
tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp16_ = in_common[n]; // printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_)));
(*phase) *= phase_inc; (*phase) *= phase_inc;
@ -1874,4 +1874,4 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_
#endif /* LV_HAVE_NEONV7 */ #endif /* LV_HAVE_NEONV7 */
#endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_H*/ #endif /* INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_H */

View File

@ -82,7 +82,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(lv_16sc_t** re
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1);
local_code_chip_index = local_code_chip_index % code_length_chips; local_code_chip_index = local_code_chip_index % code_length_chips;
result[current_correlator_tap][n] = local_code[local_code_chip_index]; result[current_correlator_tap][n] = local_code[local_code_chip_index];
@ -90,7 +90,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(lv_16sc_t** re
} }
} }
#endif /*LV_HAVE_GENERIC*/ #endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE4_1 #ifdef LV_HAVE_SSE4_1
@ -149,7 +149,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -216,7 +216,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -287,7 +287,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -358,7 +358,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -436,7 +436,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -514,7 +514,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -567,7 +567,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
aux = vmulq_f32(code_phase_step_chips_reg, indexn); aux = vmulq_f32(code_phase_step_chips_reg, indexn);
aux = vaddq_f32(aux, aux2); aux = vaddq_f32(aux, aux2);
//floor // floor
i = vcvtq_s32_f32(aux); i = vcvtq_s32_f32(aux);
fi = vcvtq_f32_s32(i); fi = vcvtq_f32_s32(i);
igx = vcgtq_f32(fi, aux); igx = vcgtq_f32(fi, aux);
@ -599,7 +599,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -611,4 +611,4 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
#endif #endif
#endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H*/ #endif /* INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H */

View File

@ -73,7 +73,7 @@
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
{ {
int local_code_chip_index; int local_code_chip_index;
//fesetround(FE_TONEAREST); // fesetround(FE_TONEAREST);
int current_vector; int current_vector;
unsigned int n; unsigned int n;
for (current_vector = 0; current_vector < num_out_vectors; current_vector++) for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
@ -89,7 +89,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t
} }
} }
#endif /*LV_HAVE_GENERIC*/ #endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE2 #ifdef LV_HAVE_SSE2
@ -97,7 +97,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
{ {
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); // _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
unsigned int number; unsigned int number;
const unsigned int quarterPoints = num_output_samples / 4; const unsigned int quarterPoints = num_output_samples / 4;
@ -109,7 +109,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t*
__m128i _code_length_chips, _code_length_chips_minus1; __m128i _code_length_chips, _code_length_chips_minus1;
__m128 _code_phase_out, _code_phase_out_with_offset; __m128 _code_phase_out, _code_phase_out_with_offset;
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); // load float to all four float values in m128 register
__VOLK_ATTR_ALIGNED(16) __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4]; int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[0] = code_length_chips - 1;
@ -124,8 +124,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t*
four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips; four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); // load float to all four float values in m128 register
_code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); // load float to all four float values in m128 register
__m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
@ -142,29 +142,29 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t*
int sample_idx = 0; int sample_idx = 0;
for (number = 0; number < quarterPoints; number++) for (number = 0; number < quarterPoints; number++)
{ {
//common to all outputs // common to all outputs
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); // compute the code phase point with the phase step
//output vector dependent (different code phase offset) // output vector dependent (different code phase offset)
for (current_vector = 0; current_vector < num_out_vectors; current_vector++) for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
{ {
tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
_rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); // load float to all four float values in m128 register
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); // add the phase offset
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); // convert to integer
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); // test for negative values
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); // the negative values branch
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); // test for overflow values
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); // the negative values branch
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
_mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible // todo: optimize the local code lookup table with intrinsics, if possible
_result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
_result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]]; _result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]];
_result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]]; _result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]];
@ -193,7 +193,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t*
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
{ {
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); // _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
unsigned int number; unsigned int number;
const unsigned int quarterPoints = num_output_samples / 4; const unsigned int quarterPoints = num_output_samples / 4;
@ -205,7 +205,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t*
__m128i _code_length_chips, _code_length_chips_minus1; __m128i _code_length_chips, _code_length_chips_minus1;
__m128 _code_phase_out, _code_phase_out_with_offset; __m128 _code_phase_out, _code_phase_out_with_offset;
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); // load float to all four float values in m128 register
__VOLK_ATTR_ALIGNED(16) __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4]; int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[0] = code_length_chips - 1;
@ -220,8 +220,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t*
four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips; four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); // load float to all four float values in m128 register
_code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); // load float to all four float values in m128 register
__m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
@ -238,29 +238,29 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t*
int sample_idx = 0; int sample_idx = 0;
for (number = 0; number < quarterPoints; number++) for (number = 0; number < quarterPoints; number++)
{ {
//common to all outputs // common to all outputs
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); // compute the code phase point with the phase step
//output vector dependent (different code phase offset) // output vector dependent (different code phase offset)
for (current_vector = 0; current_vector < num_out_vectors; current_vector++) for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
{ {
tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
_rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); // load float to all four float values in m128 register
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); // add the phase offset
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); // convert to integer
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); // test for negative values
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); // the negative values branch
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); // test for overflow values
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); // the negative values branch
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
_mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible // todo: optimize the local code lookup table with intrinsics, if possible
_result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
_result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]]; _result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]];
_result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]]; _result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]];
@ -303,7 +303,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t**
float32x4_t _code_phase_out, _code_phase_out_with_offset; float32x4_t _code_phase_out, _code_phase_out_with_offset;
float32x4_t sign, PlusHalf, Round; float32x4_t sign, PlusHalf, Round;
_code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in float32x4_t register _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); // load float to all four float values in float32x4_t register
__VOLK_ATTR_ALIGNED(16) __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4]; int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[0] = code_length_chips - 1;
@ -318,8 +318,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t**
four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips; four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in float32x4_t register _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); // load float to all four float values in float32x4_t register
_code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in float32x4_t register _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); // load float to all four float values in float32x4_t register
int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
uint32x4_t negative_indexes, overflow_indexes; uint32x4_t negative_indexes, overflow_indexes;
@ -336,33 +336,33 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t**
int sample_idx = 0; int sample_idx = 0;
for (number = 0; number < quarterPoints; number++) for (number = 0; number < quarterPoints; number++)
{ {
//common to all outputs // common to all outputs
_code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); // compute the code phase point with the phase step
//output vector dependent (different code phase offset) // output vector dependent (different code phase offset)
for (current_vector = 0; current_vector < num_out_vectors; current_vector++) for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
{ {
tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
_rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips); //load float to all four float values in float32x4_t register _rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips); // load float to all four float values in float32x4_t register
_code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); // add the phase offset
//_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer // _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); // convert to integer
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31))); sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31)));
PlusHalf = vaddq_f32(_code_phase_out_with_offset, half); PlusHalf = vaddq_f32(_code_phase_out_with_offset, half);
Round = vsubq_f32(PlusHalf, sign); Round = vsubq_f32(PlusHalf, sign);
_code_phase_out_int = vcvtq_s32_f32(Round); _code_phase_out_int = vcvtq_s32_f32(Round);
negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values negative_indexes = vcltq_s32(_code_phase_out_int, zero); // test for negative values
_code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); // the negative values branch
_code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int))); _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); // test for overflow values
_code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); // the negative values branch
_code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg))); _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg)));
vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible // todo: optimize the local code lookup table with intrinsics, if possible
_result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
_result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]]; _result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]];
_result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]]; _result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]];
@ -386,4 +386,4 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t**
#endif /* LV_HAVE_NEONV7 */ #endif /* LV_HAVE_NEONV7 */
#endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_H*/ #endif /* INCLUDED_volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_H */

View File

@ -256,33 +256,6 @@ static inline void volk_gnsssdr_32f_high_dynamics_resamplerxnpuppet_32f_u_avx(fl
volk_gnsssdr_free(result_aux); volk_gnsssdr_free(result_aux);
} }
#endif #endif
//
//#ifdef LV_HAVE_NEONV7
//static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_neon(float* result, const float* local_code, unsigned int num_points)
//{
// int code_length_chips = 2046;
// float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
// int num_out_vectors = 3;
// float rem_code_phase_chips = -0.234;
// int n;
// float shifts_chips[3] = {-0.1, 0.0, 0.1};
//
// float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
// for (n = 0; n < num_out_vectors; n++)
// {
// result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
// }
//
// volk_gnsssdr_32f_xn_resampler_32f_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
//
// memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
//
// for (n = 0; n < num_out_vectors; n++)
// {
// volk_gnsssdr_free(result_aux[n]);
// }
// volk_gnsssdr_free(result_aux);
//}
//#endif
#endif // INCLUDED_volk_gnsssdr_32f_high_dynamics_resamplerpuppet_32f_H #endif // INCLUDED_volk_gnsssdr_32f_high_dynamics_resamplerpuppet_32f_H

View File

@ -52,6 +52,8 @@
* \b Outputs * \b Outputs
* \li out: Vector of the form lv_32fc_t out[n] = lv_cmake(cos(in[n]), sin(in[n])) * \li out: Vector of the form lv_32fc_t out[n] = lv_cmake(cos(in[n]), sin(in[n]))
* *
* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier
* Based on algorithms from the cephes library http://www.netlib.org/cephes/
*/ */
#ifndef INCLUDED_volk_gnsssdr_32f_sincos_32fc_H #ifndef INCLUDED_volk_gnsssdr_32f_sincos_32fc_H
@ -251,8 +253,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f
#ifdef LV_HAVE_SSE2 #ifdef LV_HAVE_SSE2
#include <emmintrin.h> #include <emmintrin.h>
/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const float* in, unsigned int num_points) static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const float* in, unsigned int num_points)
{ {
lv_32fc_t* bPtr = out; lv_32fc_t* bPtr = out;
@ -421,8 +422,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
#ifdef LV_HAVE_SSE2 #ifdef LV_HAVE_SSE2
#include <emmintrin.h> #include <emmintrin.h>
/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const float* in, unsigned int num_points) static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const float* in, unsigned int num_points)
{ {
lv_32fc_t* bPtr = out; lv_32fc_t* bPtr = out;
@ -644,8 +644,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con
#ifdef LV_HAVE_NEONV7 #ifdef LV_HAVE_NEONV7
#include <arm_neon.h> #include <arm_neon.h>
/* Adapted from http://gruntthepeon.free.fr/ssemath/neon_mathfun.h, original code from Julien Pommier */
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float* in, unsigned int num_points) static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float* in, unsigned int num_points)
{ {
lv_32fc_t* bPtr = out; lv_32fc_t* bPtr = out;

View File

@ -83,7 +83,7 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_generic(fl
int local_code_chip_index; int local_code_chip_index;
int current_correlator_tap; int current_correlator_tap;
unsigned int n; unsigned int n;
//first correlator // first correlator
for (n = 0; n < num_points; n++) for (n = 0; n < num_points; n++)
{ {
// resample code for first tap // resample code for first tap
@ -94,7 +94,7 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_generic(fl
result[0][n] = local_code[local_code_chip_index]; result[0][n] = local_code[local_code_chip_index];
} }
//adjacent correlators // adjacent correlators
unsigned int shift_samples = 0; unsigned int shift_samples = 0;
for (current_correlator_tap = 1; current_correlator_tap < num_out_vectors; current_correlator_tap++) for (current_correlator_tap = 1; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{ {
@ -618,11 +618,11 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_u_avx(floa
#endif #endif
// //
// //
//#ifdef LV_HAVE_NEONV7 // #ifdef LV_HAVE_NEONV7
//#include <arm_neon.h> // #include <arm_neon.h>
// //
//static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_neon(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) // static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_neon(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
//{ // {
// float** _result = result; // float** _result = result;
// const unsigned int neon_iters = num_points / 4; // const unsigned int neon_iters = num_points / 4;
// int current_correlator_tap; // int current_correlator_tap;
@ -662,7 +662,7 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_u_avx(floa
// aux = vmulq_f32(code_phase_step_chips_reg, indexn); // aux = vmulq_f32(code_phase_step_chips_reg, indexn);
// aux = vaddq_f32(aux, aux2); // aux = vaddq_f32(aux, aux2);
// //
// //floor // // floor
// i = vcvtq_s32_f32(aux); // i = vcvtq_s32_f32(aux);
// fi = vcvtq_f32_s32(i); // fi = vcvtq_f32_s32(i);
// igx = vcgtq_f32(fi, aux); // igx = vcgtq_f32(fi, aux);
@ -700,8 +700,8 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_u_avx(floa
// _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; // _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
// } // }
// } // }
//} // }
// //
//#endif // #endif
#endif /*INCLUDED_volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_H*/ #endif /* INCLUDED_volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_H */

View File

@ -85,7 +85,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_generic(float** result,
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1);
local_code_chip_index = local_code_chip_index % code_length_chips; local_code_chip_index = local_code_chip_index % code_length_chips;
result[current_correlator_tap][n] = local_code[local_code_chip_index]; result[current_correlator_tap][n] = local_code[local_code_chip_index];
@ -93,7 +93,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_generic(float** result,
} }
} }
#endif /*LV_HAVE_GENERIC*/ #endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE3 #ifdef LV_HAVE_SSE3
@ -156,7 +156,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -227,7 +227,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -293,7 +293,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result,
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -360,7 +360,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result,
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -438,7 +438,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -516,7 +516,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -571,7 +571,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
aux = vmulq_f32(code_phase_step_chips_reg, indexn); aux = vmulq_f32(code_phase_step_chips_reg, indexn);
aux = vaddq_f32(aux, aux2); aux = vaddq_f32(aux, aux2);
//floor // floor
i = vcvtq_s32_f32(aux); i = vcvtq_s32_f32(aux);
fi = vcvtq_f32_s32(i); fi = vcvtq_f32_s32(i);
igx = vcgtq_f32(fi, aux); igx = vcgtq_f32(fi, aux);
@ -603,7 +603,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];

View File

@ -74,7 +74,7 @@
#include <volk_gnsssdr/volk_gnsssdr_complex.h> #include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <volk_gnsssdr/volk_gnsssdr_malloc.h> #include <volk_gnsssdr/volk_gnsssdr_malloc.h>
#include <math.h> #include <math.h>
//#include <stdio.h> // #include <stdio.h>
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
@ -89,18 +89,18 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f
} }
for (n = 0; n < num_points; n++) for (n = 0; n < num_points; n++)
{ {
tmp32_1 = *in_common++ * (*phase); //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32_1 = *in_common++ * (*phase); // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
// Regenerate phase // Regenerate phase
if (n % 256 == 0) if (n % 256 == 0)
{ {
//printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); // printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
#ifdef __cplusplus #ifdef __cplusplus
(*phase) /= std::abs((*phase)); (*phase) /= std::abs((*phase));
#else #else
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
#endif #endif
//printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); // printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
} }
(*phase) *= phase_inc; (*phase) *= phase_inc;
@ -112,7 +112,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f
} }
} }
#endif /*LV_HAVE_GENERIC*/ #endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
@ -145,7 +145,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
#ifdef __cplusplus #ifdef __cplusplus
(*phase) /= std::abs((*phase)); (*phase) /= std::abs((*phase));
#else #else
//(*phase) /= cabsf((*phase)); // (*phase) /= cabsf((*phase));
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
#endif #endif
} }
@ -162,7 +162,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
} }
} }
#endif /*LV_HAVE_GENERIC*/ #endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX
#include <volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h> #include <volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h>

View File

@ -179,7 +179,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector
int16_t* outputVectorPtr = (int16_t*)outputVector; int16_t* outputVectorPtr = (int16_t*)outputVector;
float aux; float aux;
unsigned int i; unsigned int i;
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast const float min_val = (float)SHRT_MIN; /// todo Something off here, compiler does not perform right cast
const float max_val = (float)SHRT_MAX; const float max_val = (float)SHRT_MAX;
__m256 inputVal1, inputVal2; __m256 inputVal1, inputVal2;
@ -342,7 +342,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector
int16_t* outputVectorPtr = (int16_t*)outputVector; int16_t* outputVectorPtr = (int16_t*)outputVector;
float aux; float aux;
unsigned int i; unsigned int i;
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast const float min_val = (float)SHRT_MIN; /// todo Something off here, compiler does not perform right cast
const float max_val = (float)SHRT_MAX; const float max_val = (float)SHRT_MAX;
__m256 inputVal1, inputVal2; __m256 inputVal1, inputVal2;

View File

@ -89,18 +89,18 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic(lv_32fc
} }
for (n = 0; n < num_points; n++) for (n = 0; n < num_points; n++)
{ {
tmp32_1 = *in_common++ * (*phase); //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32_1 = *in_common++ * (*phase); // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
// Regenerate phase // Regenerate phase
if (n % 256 == 0) if (n % 256 == 0)
{ {
//printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); // printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
#ifdef __cplusplus #ifdef __cplusplus
(*phase) /= std::abs((*phase)); (*phase) /= std::abs((*phase));
#else #else
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
#endif #endif
//printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); // printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
} }
(*phase) *= phase_inc; (*phase) *= phase_inc;
@ -145,7 +145,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
#ifdef __cplusplus #ifdef __cplusplus
(*phase) /= std::abs((*phase)); (*phase) /= std::abs((*phase));
#else #else
//(*phase) /= cabsf((*phase)); // (*phase) /= cabsf((*phase));
(*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
#endif #endif
} }
@ -225,7 +225,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(z1); yh = _mm_movehdup_ps(z1);
//next two samples // next two samples
_in_common += 2; _in_common += 2;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
@ -337,7 +337,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(z1); yh = _mm_movehdup_ps(z1);
//next two samples // next two samples
_in_common += 2; _in_common += 2;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
@ -457,7 +457,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(z); yh = _mm256_movehdup_ps(z);
//next four samples // next four samples
_in_common += 4; _in_common += 4;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
@ -587,7 +587,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(z); yh = _mm256_movehdup_ps(z);
//next four samples // next four samples
_in_common += 4; _in_common += 4;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)

View File

@ -82,7 +82,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(lv_32fc_t** re
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1);
local_code_chip_index = local_code_chip_index % code_length_chips; local_code_chip_index = local_code_chip_index % code_length_chips;
result[current_correlator_tap][n] = local_code[local_code_chip_index]; result[current_correlator_tap][n] = local_code[local_code_chip_index];
@ -90,7 +90,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(lv_32fc_t** re
} }
} }
#endif /*LV_HAVE_GENERIC*/ #endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE3 #ifdef LV_HAVE_SSE3
@ -153,7 +153,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -224,7 +224,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -290,7 +290,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -357,7 +357,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -435,7 +435,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -513,7 +513,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -558,13 +558,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2); aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2);
//aux = _mm256_add_ps(aux, aux2); // aux = _mm256_add_ps(aux, aux2);
// floor // floor
aux = _mm256_floor_ps(aux); aux = _mm256_floor_ps(aux);
// fmod // fmod
c = _mm256_div_ps(aux, code_length_chips_reg_f); c = _mm256_div_ps(aux, code_length_chips_reg_f);
//_mm_fmsub_ps(c, code_length_chips_reg_f, aux) // _mm_fmsub_ps(c, code_length_chips_reg_f, aux)
i = _mm256_cvttps_epi32(c); i = _mm256_cvttps_epi32(c);
cTrunc = _mm256_cvtepi32_ps(i); cTrunc = _mm256_cvtepi32_ps(i);
base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux); base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux);
@ -592,7 +592,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -637,8 +637,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2); aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2);
//aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn); // aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn);
//aux = _mm256_add_ps(aux, aux2); // aux = _mm256_add_ps(aux, aux2);
// floor // floor
aux = _mm256_floor_ps(aux); aux = _mm256_floor_ps(aux);
@ -671,7 +671,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -726,7 +726,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
aux = vmulq_f32(code_phase_step_chips_reg, indexn); aux = vmulq_f32(code_phase_step_chips_reg, indexn);
aux = vaddq_f32(aux, aux2); aux = vaddq_f32(aux, aux2);
//floor // floor
i = vcvtq_s32_f32(aux); i = vcvtq_s32_f32(aux);
fi = vcvtq_f32_s32(i); fi = vcvtq_f32_s32(i);
igx = vcgtq_f32(fi, aux); igx = vcgtq_f32(fi, aux);
@ -758,7 +758,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! // Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@ -769,4 +769,4 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
#endif #endif
#endif /*INCLUDED_volk_gnsssdr_32fc_xn_resampler_32fc_xn_H*/ #endif /* INCLUDED_volk_gnsssdr_32fc_xn_resampler_32fc_xn_H */

View File

@ -121,7 +121,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co
} }
} }
#endif /*LV_HAVE_AVX2*/ #endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX
@ -156,7 +156,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
compareResultslo = _mm_cmpgt_epi8(maxValues, lo); compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
compareResultshi = _mm_cmpgt_epi8(maxValues, hi); compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
//compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h // compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1); compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1);
if (!_mm256_testc_si256(compareResults, ones)) if (!_mm256_testc_si256(compareResults, ones))
@ -437,7 +437,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
compareResultslo = _mm_cmpgt_epi8(maxValues, lo); compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
compareResultshi = _mm_cmpgt_epi8(maxValues, hi); compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
//compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h // compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1); compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1);
if (!_mm256_testc_si256(compareResults, ones)) if (!_mm256_testc_si256(compareResults, ones))

View File

@ -313,7 +313,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0
{ {
currentValues = _mm256_load_si256((__m256i*)inputPtr); currentValues = _mm256_load_si256((__m256i*)inputPtr);
compareResults = _mm256_max_epi8(maxValues, currentValues); compareResults = _mm256_max_epi8(maxValues, currentValues);
maxValues = compareResults; //_mm256_blendv_epi8(currentValues, maxValues, compareResults); maxValues = compareResults; // _mm256_blendv_epi8(currentValues, maxValues, compareResults);
inputPtr += 32; inputPtr += 32;
} }

View File

@ -113,7 +113,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1); tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
//tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h // tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1)); tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1));
_mm256_storeu_ps((float*)c, tmp); _mm256_storeu_ps((float*)c, tmp);
@ -230,7 +230,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1); tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
//tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h // tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1)); tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1));
_mm256_store_ps((float*)c, tmp); _mm256_store_ps((float*)c, tmp);

View File

@ -114,46 +114,6 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeV
} }
#endif /* LV_HAVE_SSSE3 */ #endif /* LV_HAVE_SSSE3 */
//#ifdef LV_HAVE_SSE
//#include <xmmintrin.h>
//
//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
// unsigned int number = 0;
// const unsigned int quarterPoints = num_points / 4;
//
// const float* complexVectorPtr = (float*)complexVector;
// float* magnitudeVectorPtr = magnitudeVector;
//
// __m128 cplxValue1, cplxValue2, iValue, qValue, result;
// for(;number < quarterPoints; number++){
// cplxValue1 = _mm_loadu_ps(complexVectorPtr);
// complexVectorPtr += 4;
//
// cplxValue2 = _mm_loadu_ps(complexVectorPtr);
// complexVectorPtr += 4;
//
// // Arrange in i1i2i3i4 format
// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
// // Arrange in q1q2q3q4 format
// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
//
// iValue = _mm_mul_ps(iValue, iValue); // Square the I values
// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
//
// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
//
// _mm_storeu_ps(magnitudeVectorPtr, result);
// magnitudeVectorPtr += 4;
// }
//
// number = quarterPoints * 4;
// for(; number < num_points; number++){
// float val1Real = *complexVectorPtr++;
// float val1Imag = *complexVectorPtr++;
// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
// }
//}
//#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
@ -228,46 +188,6 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeV
} }
#endif /* LV_HAVE_SSSE3 */ #endif /* LV_HAVE_SSSE3 */
//#ifdef LV_HAVE_SSE
//#include <xmmintrin.h>
//
//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
// unsigned int number = 0;
// const unsigned int quarterPoints = num_points / 4;
//
// const float* complexVectorPtr = (float*)complexVector;
// float* magnitudeVectorPtr = magnitudeVector;
//
// __m128 cplxValue1, cplxValue2, iValue, qValue, result;
// for(;number < quarterPoints; number++){
// cplxValue1 = _mm_load_ps(complexVectorPtr);
// complexVectorPtr += 4;
//
// cplxValue2 = _mm_load_ps(complexVectorPtr);
// complexVectorPtr += 4;
//
// // Arrange in i1i2i3i4 format
// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
// // Arrange in q1q2q3q4 format
// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
//
// iValue = _mm_mul_ps(iValue, iValue); // Square the I values
// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
//
// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
//
// _mm_store_ps(magnitudeVectorPtr, result);
// magnitudeVectorPtr += 4;
// }
//
// number = quarterPoints * 4;
// for(; number < num_points; number++){
// float val1Real = *complexVectorPtr++;
// float val1Imag = *complexVectorPtr++;
// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
// }
//}
//#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_ORC #ifdef LV_HAVE_ORC

View File

@ -56,6 +56,8 @@
* \li out: Vector of the form lv_32fc_t out[n] = lv_cmake(cos(in[n]), sin(in[n])) * \li out: Vector of the form lv_32fc_t out[n] = lv_cmake(cos(in[n]), sin(in[n]))
* \li phase: Pointer to a float containing the final phase, in radians. * \li phase: Pointer to a float containing the final phase, in radians.
* *
* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier
* Based on algorithms from the cephes library http://www.netlib.org/cephes/
*/ */
@ -69,8 +71,7 @@
#ifdef LV_HAVE_SSE2 #ifdef LV_HAVE_SSE2
#include <emmintrin.h> #include <emmintrin.h>
/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{ {
lv_32fc_t *bPtr = out; lv_32fc_t *bPtr = out;
@ -225,8 +226,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t *out, const fl
#ifdef LV_HAVE_SSE2 #ifdef LV_HAVE_SSE2
#include <emmintrin.h> #include <emmintrin.h>
/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{ {
lv_32fc_t *bPtr = out; lv_32fc_t *bPtr = out;
@ -459,8 +459,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t *out, co
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
#include <immintrin.h> #include <immintrin.h>
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/
* Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/
static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{ {
lv_32fc_t *bPtr = out; lv_32fc_t *bPtr = out;
@ -647,8 +646,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const fl
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
#include <immintrin.h> #include <immintrin.h>
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/
* Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/
static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{ {
lv_32fc_t *bPtr = out; lv_32fc_t *bPtr = out;
@ -835,8 +833,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const fl
#ifdef LV_HAVE_NEONV7 #ifdef LV_HAVE_NEONV7
#include <arm_neon.h> #include <arm_neon.h>
/* Adapted from http://gruntthepeon.free.fr/ssemath/neon_mathfun.h, original code from Julien Pommier */
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{ {
lv_32fc_t *bPtr = out; lv_32fc_t *bPtr = out;

View File

@ -7,6 +7,7 @@
#include <INTEGER.h> #include <INTEGER.h>
#include <asn_codecs_prim.h> /* Encoder and decoder of a primitive type */ #include <asn_codecs_prim.h> /* Encoder and decoder of a primitive type */
#include <errno.h> #include <errno.h>
#include <inttypes.h>
/* /*
* INTEGER basic type description. * INTEGER basic type description.
@ -146,7 +147,7 @@ INTEGER__dump(asn_TYPE_descriptor_t *td, const INTEGER_t *st, asn_app_consume_by
scr = (char *)alloca(scrsize); scr = (char *)alloca(scrsize);
if(plainOrXER == 0) if(plainOrXER == 0)
ret = snprintf(scr, scrsize, ret = snprintf(scr, scrsize,
"%lld (%s)", accum, el->enum_name); "%+"PRId64"(%s)", accum, el->enum_name);
else else
ret = snprintf(scr, scrsize, ret = snprintf(scr, scrsize,
"<%s/>", el->enum_name); "<%s/>", el->enum_name);
@ -160,7 +161,7 @@ INTEGER__dump(asn_TYPE_descriptor_t *td, const INTEGER_t *st, asn_app_consume_by
scr = scratch; scr = scratch;
ret = snprintf(scr, scrsize, ret = snprintf(scr, scrsize,
(specs && specs->field_unsigned) (specs && specs->field_unsigned)
?"%llu":"%lld", accum); ?"%"PRIu64:"%+"PRId64, accum);
} }
assert(ret > 0 && (size_t)ret < scrsize); assert(ret > 0 && (size_t)ret < scrsize);
return (cb(scr, ret, app_key) < 0) ? -1 : ret; return (cb(scr, ret, app_key) < 0) ? -1 : ret;

View File

@ -7,6 +7,7 @@
#include <INTEGER.h> #include <INTEGER.h>
#include <asn_codecs_prim.h> /* Encoder and decoder of a primitive type */ #include <asn_codecs_prim.h> /* Encoder and decoder of a primitive type */
#include <errno.h> #include <errno.h>
#include <inttypes.h>
/* /*
* INTEGER basic type description. * INTEGER basic type description.
@ -146,7 +147,7 @@ INTEGER__dump(asn_TYPE_descriptor_t *td, const INTEGER_t *st, asn_app_consume_by
scr = (char *)alloca(scrsize); scr = (char *)alloca(scrsize);
if(plainOrXER == 0) if(plainOrXER == 0)
ret = snprintf(scr, scrsize, ret = snprintf(scr, scrsize,
"%lld (%s)", accum, el->enum_name); "%+"PRId64"(%s)", accum, el->enum_name);
else else
ret = snprintf(scr, scrsize, ret = snprintf(scr, scrsize,
"<%s/>", el->enum_name); "<%s/>", el->enum_name);
@ -160,7 +161,7 @@ INTEGER__dump(asn_TYPE_descriptor_t *td, const INTEGER_t *st, asn_app_consume_by
scr = scratch; scr = scratch;
ret = snprintf(scr, scrsize, ret = snprintf(scr, scrsize,
(specs && specs->field_unsigned) (specs && specs->field_unsigned)
?"%llu":"%lld", accum); ?"%"PRIu64:"%+"PRId64, accum);
} }
assert(ret > 0 && (size_t)ret < scrsize); assert(ret > 0 && (size_t)ret < scrsize);
return (cb(scr, ret, app_key) < 0) ? -1 : ret; return (cb(scr, ret, app_key) < 0) ? -1 : ret;