mirror of
				https://github.com/gnss-sdr/gnss-sdr
				synced 2025-10-26 13:07:39 +00:00 
			
		
		
		
	Merge branch 'next' of https://github.com/carlesfernandez/gnss-sdr into next
This commit is contained in:
		| @@ -2340,7 +2340,7 @@ int valpos(rtk_t *rtk, const double *v, const double *R, const int *vflg, | |||||||
|             type = (vflg[i] >> 4) & 0xF; |             type = (vflg[i] >> 4) & 0xF; | ||||||
|             freq = vflg[i] & 0xF; |             freq = vflg[i] & 0xF; | ||||||
|             stype = type == 0 ? 'L' : (type == 1 ? 'L' : 'C'); |             stype = type == 0 ? 'L' : (type == 1 ? 'L' : 'C'); | ||||||
|             errmsg(rtk, "large residual (sat=%2d-%2d %s%d v=%6.3f sig=%.3f)\n", |             errmsg(rtk, "large residual (sat=%2d-%2d %c%d v=%6.3f sig=%.3f)\n", | ||||||
|                 sat1, sat2, stype, freq + 1, v[i], std::sqrt(R[i + i * nv])); |                 sat1, sat2, stype, freq + 1, v[i], std::sqrt(R[i + i * nv])); | ||||||
|         } |         } | ||||||
| #if 0 /* omitted v.2.4.0 */ | #if 0 /* omitted v.2.4.0 */ | ||||||
|   | |||||||
| @@ -58,6 +58,7 @@ | |||||||
| #include <cerrno> | #include <cerrno> | ||||||
| #include <cstring> | #include <cstring> | ||||||
| #include <fcntl.h> | #include <fcntl.h> | ||||||
|  | #include <inttypes.h> | ||||||
| #include <netdb.h> | #include <netdb.h> | ||||||
| #include <netinet/tcp.h> | #include <netinet/tcp.h> | ||||||
| #include <string> | #include <string> | ||||||
| @@ -461,7 +462,7 @@ void closefile(file_t *file) | |||||||
|         { |         { | ||||||
|             return; |             return; | ||||||
|         } |         } | ||||||
|     tracet(3, "closefile: fp=%d\n", file->fp); |     tracet(3, "closefile: fpos=%" PRIdPTR "\n", file->fp); | ||||||
|     closefile_(file); |     closefile_(file); | ||||||
|     free(file); |     free(file); | ||||||
| } | } | ||||||
| @@ -472,7 +473,7 @@ void swapfile(file_t *file, gtime_t time, char *msg) | |||||||
| { | { | ||||||
|     char openpath[MAXSTRPATH]; |     char openpath[MAXSTRPATH]; | ||||||
|  |  | ||||||
|     tracet(3, "swapfile: fp=%d time=%s\n", file->fp, time_str(time, 0)); |     tracet(3, "swapfile: fpos=%" PRIdPTR "\n time=%s\n", file->fp, time_str(time, 0)); | ||||||
|  |  | ||||||
|     /* return if old swap file open */ |     /* return if old swap file open */ | ||||||
|     if (file->fp_tmp || file->fp_tag_tmp) |     if (file->fp_tmp || file->fp_tag_tmp) | ||||||
| @@ -500,7 +501,7 @@ void swapfile(file_t *file, gtime_t time, char *msg) | |||||||
| /* close old swap file -------------------------------------------------------*/ | /* close old swap file -------------------------------------------------------*/ | ||||||
| void swapclose(file_t *file) | void swapclose(file_t *file) | ||||||
| { | { | ||||||
|     tracet(3, "swapclose: fp_tmp=%d\n", file->fp_tmp); |     tracet(3, "swapclose: fp_tmp=%" PRIdPTR "\n", file->fp_tmp); | ||||||
|     if (file->fp_tmp) |     if (file->fp_tmp) | ||||||
|         { |         { | ||||||
|             fclose(file->fp_tmp); |             fclose(file->fp_tmp); | ||||||
| @@ -534,7 +535,7 @@ int readfile(file_t *file, unsigned char *buff, int nmax, char *msg) | |||||||
|         { |         { | ||||||
|             return 0; |             return 0; | ||||||
|         } |         } | ||||||
|     tracet(4, "readfile: fp=%d nmax=%d\n", file->fp, nmax); |     tracet(4, "readfile: fp=%zd nmax=%d\n", file->fp, nmax); | ||||||
|  |  | ||||||
|     if (file->fp == stdin) |     if (file->fp == stdin) | ||||||
|         { |         { | ||||||
| @@ -617,7 +618,7 @@ int readfile(file_t *file, unsigned char *buff, int nmax, char *msg) | |||||||
|                     sprintf(msg, "end"); |                     sprintf(msg, "end"); | ||||||
|                 } |                 } | ||||||
|         } |         } | ||||||
|     tracet(5, "readfile: fp=%d nr=%d fpos=%d\n", file->fp, nr, file->fpos); |     tracet(5, "readfile: fpos=%" PRIdPTR "\n nr=%d fpos=%zd\n", file->fp, nr, file->fpos); | ||||||
|     return nr; |     return nr; | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -640,7 +641,7 @@ int writefile(file_t *file, unsigned char *buff, int n, char *msg) | |||||||
|         { |         { | ||||||
|             return 0; |             return 0; | ||||||
|         } |         } | ||||||
|     tracet(3, "writefile: fp=%d n=%d\n", file->fp, n); |     tracet(3, "writefile: fpos=%" PRIdPTR "\n n=%d\n", file->fp, n); | ||||||
|  |  | ||||||
|     wtime = utc2gpst(timeget()); /* write time in gpst */ |     wtime = utc2gpst(timeget()); /* write time in gpst */ | ||||||
|  |  | ||||||
| @@ -693,7 +694,7 @@ int writefile(file_t *file, unsigned char *buff, int n, char *msg) | |||||||
|                     fflush(file->fp_tag_tmp); |                     fflush(file->fp_tag_tmp); | ||||||
|                 } |                 } | ||||||
|         } |         } | ||||||
|     tracet(5, "writefile: fp=%d ns=%d tick=%5d fpos=%d\n", file->fp, ns, tick, fpos); |     tracet(5, "writefile: fpos=%" PRIdPTR "\n ns=%d tick=%5d fpos=%zd\n", file->fp, ns, tick, fpos); | ||||||
|  |  | ||||||
|     return static_cast<int>(ns); |     return static_cast<int>(ns); | ||||||
| } | } | ||||||
| @@ -1497,7 +1498,7 @@ int reqntrip_s(ntrip_t *ntrip, char *msg) | |||||||
|             return 0; |             return 0; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     tracet(2, "reqntrip_s: send request state=%d ns=%d\n", ntrip->state, p - buff); |     tracet(2, "reqntrip_s: send request state=%d ns=%" PRIdPTR "\n", ntrip->state, p - buff); | ||||||
|     tracet(5, "reqntrip_s: n=%d buff=\n%s\n", p - buff, buff); |     tracet(5, "reqntrip_s: n=%d buff=\n%s\n", p - buff, buff); | ||||||
|     ntrip->state = 1; |     ntrip->state = 1; | ||||||
|     return 1; |     return 1; | ||||||
| @@ -1535,7 +1536,7 @@ int reqntrip_c(ntrip_t *ntrip, char *msg) | |||||||
|             return 0; |             return 0; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     tracet(2, "reqntrip_c: send request state=%d ns=%d\n", ntrip->state, p - buff); |     tracet(2, "reqntrip_c: send request state=%d ns=%" PRIdPTR "\n", ntrip->state, p - buff); | ||||||
|     tracet(5, "reqntrip_c: n=%d buff=\n%s\n", p - buff, buff); |     tracet(5, "reqntrip_c: n=%d buff=\n%s\n", p - buff, buff); | ||||||
|     ntrip->state = 1; |     ntrip->state = 1; | ||||||
|     return 1; |     return 1; | ||||||
|   | |||||||
| @@ -9,9 +9,9 @@ | |||||||
| #define NOMINMAX | #define NOMINMAX | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| //http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668 | // http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668 | ||||||
| #include < time.h > | #include < time.h > | ||||||
| #include <windows.h>  //I've omitted this line. | #include <windows.h>  // I've omitted this line. | ||||||
| #if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) | #if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) | ||||||
| #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 | #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 | ||||||
| #else | #else | ||||||
| @@ -51,7 +51,7 @@ static inline int gettimeofday(struct timeval *tv, struct timezone *tz) | |||||||
|             tmpres <<= 32; |             tmpres <<= 32; | ||||||
|             tmpres |= ft.dwLowDateTime; |             tmpres |= ft.dwLowDateTime; | ||||||
|  |  | ||||||
|             /*converting file time to unix epoch*/ |             /* converting file time to unix epoch*/ | ||||||
|             tmpres -= DELTA_EPOCH_IN_MICROSECS; |             tmpres -= DELTA_EPOCH_IN_MICROSECS; | ||||||
|             tv->tv_sec = (long)(tmpres / 1000000UL); |             tv->tv_sec = (long)(tmpres / 1000000UL); | ||||||
|             tv->tv_usec = (long)(tmpres % 1000000UL); |             tv->tv_usec = (long)(tmpres % 1000000UL); | ||||||
|   | |||||||
| @@ -24,9 +24,9 @@ | |||||||
| #ifndef INCLUDED_LIBVOLK_GNSSSDR_COMMON_H | #ifndef INCLUDED_LIBVOLK_GNSSSDR_COMMON_H | ||||||
| #define INCLUDED_LIBVOLK_GNSSSDR_COMMON_H | #define INCLUDED_LIBVOLK_GNSSSDR_COMMON_H | ||||||
|  |  | ||||||
| //////////////////////////////////////////////////////////////////////// | // | ||||||
| // Cross-platform attribute macros not included in VOLK | // Cross-platform attribute macros not included in VOLK | ||||||
| //////////////////////////////////////////////////////////////////////// | // | ||||||
| #if defined __GNUC__ | #if defined __GNUC__ | ||||||
| #define __VOLK_GNSSSDR_PREFETCH(addr) __builtin_prefetch(addr) | #define __VOLK_GNSSSDR_PREFETCH(addr) __builtin_prefetch(addr) | ||||||
| #define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) __builtin_prefetch(addr, rw, locality) | #define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) __builtin_prefetch(addr, rw, locality) | ||||||
| @@ -41,9 +41,9 @@ | |||||||
| #ifndef INCLUDED_LIBVOLK_COMMON_H | #ifndef INCLUDED_LIBVOLK_COMMON_H | ||||||
| #define INCLUDED_LIBVOLK_COMMON_H | #define INCLUDED_LIBVOLK_COMMON_H | ||||||
|  |  | ||||||
| //////////////////////////////////////////////////////////////////////// | // | ||||||
| // Cross-platform attribute macros | // Cross-platform attribute macros | ||||||
| //////////////////////////////////////////////////////////////////////// | // | ||||||
| #if defined __GNUC__ | #if defined __GNUC__ | ||||||
| #define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) | #define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) | ||||||
| #define __VOLK_ATTR_UNUSED __attribute__((unused)) | #define __VOLK_ATTR_UNUSED __attribute__((unused)) | ||||||
| @@ -78,18 +78,18 @@ | |||||||
| #define __VOLK_VOLATILE __volatile__ | #define __VOLK_VOLATILE __volatile__ | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| //////////////////////////////////////////////////////////////////////// | // | ||||||
| // Ignore annoying warnings in MSVC | // Ignore annoying warnings in MSVC | ||||||
| //////////////////////////////////////////////////////////////////////// | // | ||||||
| #if defined(_MSC_VER) | #if defined(_MSC_VER) | ||||||
| #pragma warning(disable : 4244)  //'conversion' conversion from 'type1' to 'type2', possible loss of data | #pragma warning(disable : 4244)  // 'conversion' conversion from 'type1' to 'type2', possible loss of data | ||||||
| #pragma warning(disable : 4305)  //'identifier' : truncation from 'type1' to 'type2' | #pragma warning(disable : 4305)  // 'identifier' : truncation from 'type1' to 'type2' | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| //////////////////////////////////////////////////////////////////////// | // | ||||||
| // C-linkage declaration macros | // C-linkage declaration macros | ||||||
| // FIXME: due to the usage of complex.h, require gcc for c-linkage | // FIXME: due to the usage of complex.h, require gcc for c-linkage | ||||||
| //////////////////////////////////////////////////////////////////////// | // | ||||||
| #if defined(__cplusplus) && (__GNUC__) | #if defined(__cplusplus) && (__GNUC__) | ||||||
| #define __VOLK_DECL_BEGIN \ | #define __VOLK_DECL_BEGIN \ | ||||||
|     extern "C"            \ |     extern "C"            \ | ||||||
| @@ -100,19 +100,19 @@ | |||||||
| #define __VOLK_DECL_END | #define __VOLK_DECL_END | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| //////////////////////////////////////////////////////////////////////// | // | ||||||
| // Define VOLK_API for library symbols | // Define VOLK_API for library symbols | ||||||
| // http://gcc.gnu.org/wiki/Visibility | // http://gcc.gnu.org/wiki/Visibility | ||||||
| //////////////////////////////////////////////////////////////////////// | // | ||||||
| #ifdef volk_gnsssdr_EXPORTS | #ifdef volk_gnsssdr_EXPORTS | ||||||
| #define VOLK_API __VOLK_ATTR_EXPORT | #define VOLK_API __VOLK_ATTR_EXPORT | ||||||
| #else | #else | ||||||
| #define VOLK_API __VOLK_ATTR_IMPORT | #define VOLK_API __VOLK_ATTR_IMPORT | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| //////////////////////////////////////////////////////////////////////// | // | ||||||
| // The bit128 union used by some | // The bit128 union used by some | ||||||
| //////////////////////////////////////////////////////////////////////// | // | ||||||
| #include <inttypes.h> | #include <inttypes.h> | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE | #ifdef LV_HAVE_SSE | ||||||
|   | |||||||
| @@ -84,7 +84,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_generic(int16_t** result | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); |                     if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); | ||||||
|                     local_code_chip_index = local_code_chip_index % code_length_chips; |                     local_code_chip_index = local_code_chip_index % code_length_chips; | ||||||
|                     result[current_correlator_tap][n] = local_code[local_code_chip_index]; |                     result[current_correlator_tap][n] = local_code[local_code_chip_index]; | ||||||
| @@ -150,7 +150,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -217,7 +217,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -288,7 +288,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result, | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -359,7 +359,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result, | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -437,7 +437,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result, | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -515,7 +515,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result, | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -568,7 +568,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c | |||||||
|                     aux = vmulq_f32(code_phase_step_chips_reg, indexn); |                     aux = vmulq_f32(code_phase_step_chips_reg, indexn); | ||||||
|                     aux = vaddq_f32(aux, aux2); |                     aux = vaddq_f32(aux, aux2); | ||||||
|  |  | ||||||
|                     //floor |                     // floor | ||||||
|                     i = vcvtq_s32_f32(aux); |                     i = vcvtq_s32_f32(aux); | ||||||
|                     fi = vcvtq_f32_s32(i); |                     fi = vcvtq_f32_s32(i); | ||||||
|                     igx = vcgtq_f32(fi, aux); |                     igx = vcgtq_f32(fi, aux); | ||||||
| @@ -600,7 +600,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c | |||||||
|                     __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); |                     __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
|   | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -131,36 +131,6 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3(lv_ | |||||||
| #endif  // SSE3 | #endif  // SSE3 | ||||||
|  |  | ||||||
|  |  | ||||||
| //#ifdef LV_HAVE_SSE3 |  | ||||||
| //static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) |  | ||||||
| //{ |  | ||||||
| //// phases must be normalized. Phase rotator expects a complex exponential input! |  | ||||||
| //float rem_carrier_phase_in_rad = 0.345; |  | ||||||
| //float phase_step_rad = 0.1; |  | ||||||
| //lv_32fc_t phase[1]; |  | ||||||
| //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); |  | ||||||
| //lv_32fc_t phase_inc[1]; |  | ||||||
| //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); |  | ||||||
| //int n; |  | ||||||
| //int num_a_vectors = 3; |  | ||||||
| //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); |  | ||||||
| //for(n = 0; n < num_a_vectors; n++) |  | ||||||
| //{ |  | ||||||
| //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); |  | ||||||
| //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); |  | ||||||
| //} |  | ||||||
|  |  | ||||||
| //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); |  | ||||||
|  |  | ||||||
| //for(n = 0; n < num_a_vectors; n++) |  | ||||||
| //{ |  | ||||||
| //volk_gnsssdr_free(in_a[n]); |  | ||||||
| //} |  | ||||||
| //volk_gnsssdr_free(in_a); |  | ||||||
| //} |  | ||||||
|  |  | ||||||
| //#endif // SSE3 |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE3 | #ifdef LV_HAVE_SSE3 | ||||||
| static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) | static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) | ||||||
| @@ -224,36 +194,6 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2(lv_ | |||||||
| #endif  // AVX2 | #endif  // AVX2 | ||||||
|  |  | ||||||
|  |  | ||||||
| //#ifdef LV_HAVE_AVX2 |  | ||||||
| //static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) |  | ||||||
| //{ |  | ||||||
| //// phases must be normalized. Phase rotator expects a complex exponential input! |  | ||||||
| //float rem_carrier_phase_in_rad = 0.345; |  | ||||||
| //float phase_step_rad = 0.1; |  | ||||||
| //lv_32fc_t phase[1]; |  | ||||||
| //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); |  | ||||||
| //lv_32fc_t phase_inc[1]; |  | ||||||
| //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); |  | ||||||
| //int n; |  | ||||||
| //int num_a_vectors = 3; |  | ||||||
| //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); |  | ||||||
| //for(n = 0; n < num_a_vectors; n++) |  | ||||||
| //{ |  | ||||||
| //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); |  | ||||||
| //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); |  | ||||||
| //} |  | ||||||
|  |  | ||||||
| //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); |  | ||||||
|  |  | ||||||
| //for(n = 0; n < num_a_vectors; n++) |  | ||||||
| //{ |  | ||||||
| //volk_gnsssdr_free(in_a[n]); |  | ||||||
| //} |  | ||||||
| //volk_gnsssdr_free(in_a); |  | ||||||
| //} |  | ||||||
|  |  | ||||||
| //#endif // AVX2 |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_AVX2 | #ifdef LV_HAVE_AVX2 | ||||||
| static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) | static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) | ||||||
| @@ -286,96 +226,4 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_ | |||||||
| #endif  // AVX2 | #endif  // AVX2 | ||||||
|  |  | ||||||
|  |  | ||||||
| //#ifdef LV_HAVE_AVX2 |  | ||||||
| //static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) |  | ||||||
| //{ |  | ||||||
| //// phases must be normalized. Phase rotator expects a complex exponential input! |  | ||||||
| //float rem_carrier_phase_in_rad = 0.345; |  | ||||||
| //float phase_step_rad = 0.1; |  | ||||||
| //lv_32fc_t phase[1]; |  | ||||||
| //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); |  | ||||||
| //lv_32fc_t phase_inc[1]; |  | ||||||
| //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); |  | ||||||
| //int n; |  | ||||||
| //int num_a_vectors = 3; |  | ||||||
| //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); |  | ||||||
| //for(n = 0; n < num_a_vectors; n++) |  | ||||||
| //{ |  | ||||||
| //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); |  | ||||||
| //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); |  | ||||||
| //} |  | ||||||
|  |  | ||||||
| //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); |  | ||||||
|  |  | ||||||
| //for(n = 0; n < num_a_vectors; n++) |  | ||||||
| //{ |  | ||||||
| //volk_gnsssdr_free(in_a[n]); |  | ||||||
| //} |  | ||||||
| //volk_gnsssdr_free(in_a); |  | ||||||
| //} |  | ||||||
|  |  | ||||||
| //#endif // AVX2 |  | ||||||
|  |  | ||||||
|  |  | ||||||
| //#ifdef LV_HAVE_NEONV7 |  | ||||||
| //static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) |  | ||||||
| //{ |  | ||||||
| //// phases must be normalized. Phase rotator expects a complex exponential input! |  | ||||||
| //float rem_carrier_phase_in_rad = 0.345; |  | ||||||
| //float phase_step_rad = 0.1; |  | ||||||
| //lv_32fc_t phase[1]; |  | ||||||
| //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); |  | ||||||
| //lv_32fc_t phase_inc[1]; |  | ||||||
| //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); |  | ||||||
| //int n; |  | ||||||
| //int num_a_vectors = 3; |  | ||||||
| //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); |  | ||||||
| //for(n = 0; n < num_a_vectors; n++) |  | ||||||
| //{ |  | ||||||
| //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); |  | ||||||
| //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); |  | ||||||
| //} |  | ||||||
|  |  | ||||||
| //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); |  | ||||||
|  |  | ||||||
| //for(n = 0; n < num_a_vectors; n++) |  | ||||||
| //{ |  | ||||||
| //volk_gnsssdr_free(in_a[n]); |  | ||||||
| //} |  | ||||||
| //volk_gnsssdr_free(in_a); |  | ||||||
| //} |  | ||||||
|  |  | ||||||
| //#endif // NEON |  | ||||||
|  |  | ||||||
|  |  | ||||||
| //#ifdef LV_HAVE_NEONV7 |  | ||||||
| //static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) |  | ||||||
| //{ |  | ||||||
| //// phases must be normalized. Phase rotator expects a complex exponential input! |  | ||||||
| //float rem_carrier_phase_in_rad = 0.345; |  | ||||||
| //float phase_step_rad = 0.1; |  | ||||||
| //lv_32fc_t phase[1]; |  | ||||||
| //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); |  | ||||||
| //lv_32fc_t phase_inc[1]; |  | ||||||
| //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); |  | ||||||
| //int n; |  | ||||||
| //int num_a_vectors = 3; |  | ||||||
| //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); |  | ||||||
| //for(n = 0; n < num_a_vectors; n++) |  | ||||||
| //{ |  | ||||||
| //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); |  | ||||||
| //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); |  | ||||||
| //} |  | ||||||
|  |  | ||||||
| //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); |  | ||||||
|  |  | ||||||
| //for(n = 0; n < num_a_vectors; n++) |  | ||||||
| //{ |  | ||||||
| //volk_gnsssdr_free(in_a[n]); |  | ||||||
| //} |  | ||||||
| //volk_gnsssdr_free(in_a); |  | ||||||
| //} |  | ||||||
|  |  | ||||||
| //#endif // NEON |  | ||||||
|  |  | ||||||
| #endif  // INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H | #endif  // INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H | ||||||
|   | |||||||
| @@ -200,34 +200,4 @@ static inline void volk_gnsssdr_16ic_conjugate_16ic_u_avx2(lv_16sc_t* cVector, c | |||||||
| } | } | ||||||
| #endif /* LV_HAVE_AVX2 */ | #endif /* LV_HAVE_AVX2 */ | ||||||
|  |  | ||||||
| // |  | ||||||
| // |  | ||||||
| //#ifdef LV_HAVE_NEONV7 |  | ||||||
| //#include <arm_neon.h> |  | ||||||
| // |  | ||||||
| //static inline void volk_gnsssdr_16ic_conjugate_16ic_neon(lv_16sc_t* cVector, const lv_16sc_t* aVector, unsigned int num_points) |  | ||||||
| //{ |  | ||||||
| //    const unsigned int sse_iters = num_points / 4; |  | ||||||
| //    unsigned int i; |  | ||||||
| //    lv_16sc_t* c = cVector; |  | ||||||
| //    const lv_16sc_t* a = aVector; |  | ||||||
| //    int16x4x2_t a_val; |  | ||||||
| // |  | ||||||
| //    for (i = 0; i < sse_iters; ++i) |  | ||||||
| //        { |  | ||||||
| //            a_val = vld2_s16((const int16_t*)a); |  | ||||||
| //            __VOLK_GNSSSDR_PREFETCH(a + 4); |  | ||||||
| //            a_val.val[1] = vneg_s16(a_val.val[1]); |  | ||||||
| //            vst2_s16((int16_t*)c, a_val); |  | ||||||
| //            a += 4; |  | ||||||
| //            c += 4; |  | ||||||
| //        } |  | ||||||
| // |  | ||||||
| //    for (i = sse_iters * 4; i < num_points; ++i) |  | ||||||
| //        { |  | ||||||
| //            *c++ = lv_conj(*a++); |  | ||||||
| //        } |  | ||||||
| //} |  | ||||||
| //#endif /* LV_HAVE_NEONV7 */ |  | ||||||
|  |  | ||||||
| #endif /* INCLUDED_volk_gnsssdr_16ic_conjugate_16ic_H */ | #endif /* INCLUDED_volk_gnsssdr_16ic_conjugate_16ic_H */ | ||||||
|   | |||||||
| @@ -72,7 +72,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu | |||||||
| { | { | ||||||
|     int local_code_chip_index; |     int local_code_chip_index; | ||||||
|     unsigned int n; |     unsigned int n; | ||||||
|     //fesetround(FE_TONEAREST); |     // fesetround(FE_TONEAREST); | ||||||
|     for (n = 0; n < num_output_samples; n++) |     for (n = 0; n < num_output_samples; n++) | ||||||
|         { |         { | ||||||
|             // resample code for current tap |             // resample code for current tap | ||||||
| @@ -83,7 +83,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu | |||||||
|         } |         } | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_GENERIC*/ | #endif /* LV_HAVE_GENERIC */ | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE2 | #ifdef LV_HAVE_SSE2 | ||||||
| @@ -91,7 +91,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu | |||||||
|  |  | ||||||
| static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)  //, int* scratch_buffer, float* scratch_buffer_float) | static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)  //, int* scratch_buffer, float* scratch_buffer_float) | ||||||
| { | { | ||||||
|     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);  //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO |     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);  // _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO | ||||||
|     unsigned int number; |     unsigned int number; | ||||||
|     const unsigned int quarterPoints = num_output_samples / 4; |     const unsigned int quarterPoints = num_output_samples / 4; | ||||||
|  |  | ||||||
| @@ -104,8 +104,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul | |||||||
|     __m128 _code_phase_out, _code_phase_out_with_offset; |     __m128 _code_phase_out, _code_phase_out_with_offset; | ||||||
|     rem_code_phase_chips = rem_code_phase_chips - 0.5f; |     rem_code_phase_chips = rem_code_phase_chips - 0.5f; | ||||||
|  |  | ||||||
|     _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips);          //load float to all four float values in m128 register |     _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips);          // load float to all four float values in m128 register | ||||||
|     _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips);  //load float to all four float values in m128 register |     _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips);  // load float to all four float values in m128 register | ||||||
|     __VOLK_ATTR_ALIGNED(16) |     __VOLK_ATTR_ALIGNED(16) | ||||||
|     int four_times_code_length_chips_minus1[4]; |     int four_times_code_length_chips_minus1[4]; | ||||||
|     four_times_code_length_chips_minus1[0] = code_length_chips - 1; |     four_times_code_length_chips_minus1[0] = code_length_chips - 1; | ||||||
| @@ -120,8 +120,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul | |||||||
|     four_times_code_length_chips[2] = code_length_chips; |     four_times_code_length_chips[2] = code_length_chips; | ||||||
|     four_times_code_length_chips[3] = code_length_chips; |     four_times_code_length_chips[3] = code_length_chips; | ||||||
|  |  | ||||||
|     _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips);                //load float to all four float values in m128 register |     _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips);                // load the four int values into an m128i register | ||||||
|     _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1);  //load float to all four float values in m128 register |     _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1);  // load the four int values into an m128i register | ||||||
|  |  | ||||||
|     __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; |     __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; | ||||||
|  |  | ||||||
| @@ -136,21 +136,21 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul | |||||||
|  |  | ||||||
|     for (number = 0; number < quarterPoints; number++) |     for (number = 0; number < quarterPoints; number++) | ||||||
|         { |         { | ||||||
|             _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index);        //compute the code phase point with the phase step |             _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index);        // compute the code phase point with the phase step | ||||||
|             _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase);  //add the phase offset |             _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase);  // add the phase offset | ||||||
|             _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset);          //convert to integer |             _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset);          // convert to integer | ||||||
|  |  | ||||||
|             negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero);                     //test for negative values |             negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero);                     // test for negative values | ||||||
|             _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips);  //the negative values branch |             _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips);  // the negative values branch | ||||||
|             _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); |             _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); | ||||||
|  |  | ||||||
|             overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1);  //test for overflow values |             overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1);  // test for overflow values | ||||||
|             _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips);   //the negative values branch |             _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips);   // the overflow values branch | ||||||
|             _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); |             _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); | ||||||
|  |  | ||||||
|             _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back |             _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back | ||||||
|  |  | ||||||
|             //todo: optimize the local code lookup table with intrinsics, if possible |             // todo: optimize the local code lookup table with intrinsics, if possible | ||||||
|             *_result++ = local_code[local_code_chip_index[0]]; |             *_result++ = local_code[local_code_chip_index[0]]; | ||||||
|             *_result++ = local_code[local_code_chip_index[1]]; |             *_result++ = local_code[local_code_chip_index[1]]; | ||||||
|             *_result++ = local_code[local_code_chip_index[2]]; |             *_result++ = local_code[local_code_chip_index[2]]; | ||||||
| @@ -176,7 +176,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul | |||||||
|  |  | ||||||
| static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)  //, int* scratch_buffer, float* scratch_buffer_float) | static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)  //, int* scratch_buffer, float* scratch_buffer_float) | ||||||
| { | { | ||||||
|     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);  //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO |     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);  // _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO | ||||||
|     unsigned int number; |     unsigned int number; | ||||||
|     const unsigned int quarterPoints = num_output_samples / 4; |     const unsigned int quarterPoints = num_output_samples / 4; | ||||||
|  |  | ||||||
| @@ -189,8 +189,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul | |||||||
|     __m128 _code_phase_out, _code_phase_out_with_offset; |     __m128 _code_phase_out, _code_phase_out_with_offset; | ||||||
|     rem_code_phase_chips = rem_code_phase_chips - 0.5f; |     rem_code_phase_chips = rem_code_phase_chips - 0.5f; | ||||||
|  |  | ||||||
|     _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips);          //load float to all four float values in m128 register |     _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips);          // load float to all four float values in m128 register | ||||||
|     _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips);  //load float to all four float values in m128 register |     _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips);  // load float to all four float values in m128 register | ||||||
|     __VOLK_ATTR_ALIGNED(16) |     __VOLK_ATTR_ALIGNED(16) | ||||||
|     int four_times_code_length_chips_minus1[4]; |     int four_times_code_length_chips_minus1[4]; | ||||||
|     four_times_code_length_chips_minus1[0] = code_length_chips - 1; |     four_times_code_length_chips_minus1[0] = code_length_chips - 1; | ||||||
| @@ -205,8 +205,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul | |||||||
|     four_times_code_length_chips[2] = code_length_chips; |     four_times_code_length_chips[2] = code_length_chips; | ||||||
|     four_times_code_length_chips[3] = code_length_chips; |     four_times_code_length_chips[3] = code_length_chips; | ||||||
|  |  | ||||||
|     _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips);                //load float to all four float values in m128 register |     _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips);                // load the four int values into an m128i register | ||||||
|     _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1);  //load float to all four float values in m128 register |     _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1);  // load the four int values into an m128i register | ||||||
|  |  | ||||||
|     __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; |     __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; | ||||||
|  |  | ||||||
| @@ -221,21 +221,21 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul | |||||||
|  |  | ||||||
|     for (number = 0; number < quarterPoints; number++) |     for (number = 0; number < quarterPoints; number++) | ||||||
|         { |         { | ||||||
|             _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index);        //compute the code phase point with the phase step |             _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index);        // compute the code phase point with the phase step | ||||||
|             _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase);  //add the phase offset |             _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase);  // add the phase offset | ||||||
|             _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset);          //convert to integer |             _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset);          // convert to integer | ||||||
|  |  | ||||||
|             negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero);                     //test for negative values |             negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero);                     // test for negative values | ||||||
|             _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips);  //the negative values branch |             _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips);  // the negative values branch | ||||||
|             _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); |             _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); | ||||||
|  |  | ||||||
|             overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1);  //test for overflow values |             overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1);  // test for overflow values | ||||||
|             _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips);   //the negative values branch |             _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips);   // the overflow values branch | ||||||
|             _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); |             _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); | ||||||
|  |  | ||||||
|             _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back |             _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back | ||||||
|  |  | ||||||
|             //todo: optimize the local code lookup table with intrinsics, if possible |             // todo: optimize the local code lookup table with intrinsics, if possible | ||||||
|             *_result++ = local_code[local_code_chip_index[0]]; |             *_result++ = local_code[local_code_chip_index[0]]; | ||||||
|             *_result++ = local_code[local_code_chip_index[1]]; |             *_result++ = local_code[local_code_chip_index[1]]; | ||||||
|             *_result++ = local_code[local_code_chip_index[2]]; |             *_result++ = local_code[local_code_chip_index[2]]; | ||||||
| @@ -275,8 +275,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, | |||||||
|     rem_code_phase_chips = rem_code_phase_chips - 0.5f; |     rem_code_phase_chips = rem_code_phase_chips - 0.5f; | ||||||
|     float32x4_t sign, PlusHalf, Round; |     float32x4_t sign, PlusHalf, Round; | ||||||
|  |  | ||||||
|     _rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips);          //load float to all four float values in m128 register |     _rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips);          // load float to all four float values in m128 register | ||||||
|     _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips);  //load float to all four float values in m128 register |     _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips);  // load float to all four float values in m128 register | ||||||
|     __VOLK_ATTR_ALIGNED(16) |     __VOLK_ATTR_ALIGNED(16) | ||||||
|     int four_times_code_length_chips_minus1[4]; |     int four_times_code_length_chips_minus1[4]; | ||||||
|     four_times_code_length_chips_minus1[0] = code_length_chips - 1; |     four_times_code_length_chips_minus1[0] = code_length_chips - 1; | ||||||
| @@ -291,8 +291,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, | |||||||
|     four_times_code_length_chips[2] = code_length_chips; |     four_times_code_length_chips[2] = code_length_chips; | ||||||
|     four_times_code_length_chips[3] = code_length_chips; |     four_times_code_length_chips[3] = code_length_chips; | ||||||
|  |  | ||||||
|     _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips);                //load float to all four float values in m128 register |     _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips);                // load the four int32 values into a 128-bit NEON register | ||||||
|     _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1);  //load float to all four float values in m128 register |     _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1);  // load the four int32 values into a 128-bit NEON register | ||||||
|  |  | ||||||
|     int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; |     int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; | ||||||
|     uint32x4_t negative_indexes, overflow_indexes; |     uint32x4_t negative_indexes, overflow_indexes; | ||||||
| @@ -307,24 +307,24 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, | |||||||
|  |  | ||||||
|     for (number = 0; number < quarterPoints; number++) |     for (number = 0; number < quarterPoints; number++) | ||||||
|         { |         { | ||||||
|             _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index);        //compute the code phase point with the phase step |             _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index);        // compute the code phase point with the phase step | ||||||
|             _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase);  //add the phase offset |             _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase);  // add the phase offset | ||||||
|             sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31))); |             sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31))); | ||||||
|             PlusHalf = vaddq_f32(_code_phase_out_with_offset, half); |             PlusHalf = vaddq_f32(_code_phase_out_with_offset, half); | ||||||
|             Round = vsubq_f32(PlusHalf, sign); |             Round = vsubq_f32(PlusHalf, sign); | ||||||
|             _code_phase_out_int = vcvtq_s32_f32(Round); |             _code_phase_out_int = vcvtq_s32_f32(Round); | ||||||
|  |  | ||||||
|             negative_indexes = vcltq_s32(_code_phase_out_int, zero);                       //test for negative values |             negative_indexes = vcltq_s32(_code_phase_out_int, zero);                       // test for negative values | ||||||
|             _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips);  //the negative values branch |             _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips);  // the negative values branch | ||||||
|             _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int))); |             _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int))); | ||||||
|  |  | ||||||
|             overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1);   //test for overflow values |             overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1);   // test for overflow values | ||||||
|             _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips);  //the negative values branch |             _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips);  // the overflow values branch | ||||||
|             _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg))); |             _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg))); | ||||||
|  |  | ||||||
|             vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back |             vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back | ||||||
|  |  | ||||||
|             //todo: optimize the local code lookup table with intrinsics, if possible |             // todo: optimize the local code lookup table with intrinsics, if possible | ||||||
|             *_result++ = local_code[local_code_chip_index[0]]; |             *_result++ = local_code[local_code_chip_index[0]]; | ||||||
|             *_result++ = local_code[local_code_chip_index[1]]; |             *_result++ = local_code[local_code_chip_index[1]]; | ||||||
|             *_result++ = local_code[local_code_chip_index[2]]; |             *_result++ = local_code[local_code_chip_index[2]]; | ||||||
| @@ -344,4 +344,4 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, | |||||||
|  |  | ||||||
| #endif /* LV_HAVE_NEONV7 */ | #endif /* LV_HAVE_NEONV7 */ | ||||||
|  |  | ||||||
| #endif /*INCLUDED_volk_gnsssdr_16ic_resampler_fast_16ic_H*/ | #endif /* INCLUDED_volk_gnsssdr_16ic_resampler_fast_16ic_H */ | ||||||
|   | |||||||
| @@ -79,13 +79,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic(lv_16sc_t* ou | |||||||
|             // Regenerate phase |             // Regenerate phase | ||||||
|             if (i % 512 == 0) |             if (i % 512 == 0) | ||||||
|                 { |                 { | ||||||
|                     //printf("Phase before regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); |                     // printf("Phase before regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
|                     (*phase) /= std::abs((*phase)); |                     (*phase) /= std::abs((*phase)); | ||||||
| #else | #else | ||||||
|                     (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); |                     (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); | ||||||
| #endif | #endif | ||||||
|                     //printf("Phase after regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); |                     // printf("Phase after regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); | ||||||
|                 } |                 } | ||||||
|         } |         } | ||||||
| } | } | ||||||
| @@ -112,13 +112,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic_reload(lv_16s | |||||||
|                     (*phase) *= phase_inc; |                     (*phase) *= phase_inc; | ||||||
|                 } |                 } | ||||||
|                 // Regenerate phase |                 // Regenerate phase | ||||||
|                 //printf("Phase before regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); |                 // printf("Phase before regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
|             (*phase) /= std::abs((*phase)); |             (*phase) /= std::abs((*phase)); | ||||||
| #else | #else | ||||||
|             (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); |             (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); | ||||||
| #endif | #endif | ||||||
|             //printf("Phase after regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); |             // printf("Phase after regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); | ||||||
|         } |         } | ||||||
|     for (j = 0; j < num_points % ROTATOR_RELOAD; j++) |     for (j = 0; j < num_points % ROTATOR_RELOAD; j++) | ||||||
|         { |         { | ||||||
| @@ -161,8 +161,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out | |||||||
|  |  | ||||||
|     for (number = 0; number < sse_iters; number++) |     for (number = 0; number < sse_iters; number++) | ||||||
|         { |         { | ||||||
|             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -171,7 +171,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out | |||||||
|             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |             c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -179,11 +179,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out | |||||||
|             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di |             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in += 2; |             _in += 2; | ||||||
|             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             __VOLK_GNSSSDR_PREFETCH(_in + 8); |             __VOLK_GNSSSDR_PREFETCH(_in + 8); | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -192,7 +192,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out | |||||||
|             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |             c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -214,7 +214,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out | |||||||
|                     two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); |                     two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in += 2; |             _in += 2; | ||||||
|             _out += 4; |             _out += 4; | ||||||
|         } |         } | ||||||
| @@ -268,8 +268,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc | |||||||
|         { |         { | ||||||
|             for (j = 0; j < ROTATOR_RELOAD; j++) |             for (j = 0; j < ROTATOR_RELOAD; j++) | ||||||
|                 { |                 { | ||||||
|                     a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |                     a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|                     //complex 32fc multiplication b=a*two_phase_acc_reg |                     // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -278,7 +278,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc | |||||||
|                     b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |                     b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|                     c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |                     c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|                     //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |                     // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -286,11 +286,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc | |||||||
|                     tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di |                     tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|                     two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |                     two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|                     //next two samples |                     // next two samples | ||||||
|                     _in += 2; |                     _in += 2; | ||||||
|                     a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |                     a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|                     __VOLK_GNSSSDR_PREFETCH(_in + 8); |                     __VOLK_GNSSSDR_PREFETCH(_in + 8); | ||||||
|                     //complex 32fc multiplication b=a*two_phase_acc_reg |                     // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -299,7 +299,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc | |||||||
|                     b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |                     b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|                     c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |                     c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|                     //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |                     // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -311,7 +311,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc | |||||||
|                     result = _mm_packs_epi32(c1, c2);  // convert from 32ic to 16ic |                     result = _mm_packs_epi32(c1, c2);  // convert from 32ic to 16ic | ||||||
|                     _mm_store_si128((__m128i*)_out, result); |                     _mm_store_si128((__m128i*)_out, result); | ||||||
|  |  | ||||||
|                     //next two samples |                     // next two samples | ||||||
|                     _in += 2; |                     _in += 2; | ||||||
|                     _out += 4; |                     _out += 4; | ||||||
|                 } |                 } | ||||||
| @@ -325,8 +325,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc | |||||||
|  |  | ||||||
|     for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) |     for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) | ||||||
|         { |         { | ||||||
|             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -335,7 +335,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc | |||||||
|             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |             c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -343,11 +343,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc | |||||||
|             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di |             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in += 2; |             _in += 2; | ||||||
|             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             __VOLK_GNSSSDR_PREFETCH(_in + 8); |             __VOLK_GNSSSDR_PREFETCH(_in + 8); | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -356,7 +356,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc | |||||||
|             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |             c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -368,7 +368,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc | |||||||
|             result = _mm_packs_epi32(c1, c2);  // convert from 32ic to 16ic |             result = _mm_packs_epi32(c1, c2);  // convert from 32ic to 16ic | ||||||
|             _mm_store_si128((__m128i*)_out, result); |             _mm_store_si128((__m128i*)_out, result); | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in += 2; |             _in += 2; | ||||||
|             _out += 4; |             _out += 4; | ||||||
|         } |         } | ||||||
| @@ -418,8 +418,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out | |||||||
|  |  | ||||||
|     for (number = 0; number < sse_iters; number++) |     for (number = 0; number < sse_iters; number++) | ||||||
|         { |         { | ||||||
|             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -428,7 +428,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out | |||||||
|             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |             c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -436,11 +436,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out | |||||||
|             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di |             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in += 2; |             _in += 2; | ||||||
|             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             __VOLK_GNSSSDR_PREFETCH(_in + 8); |             __VOLK_GNSSSDR_PREFETCH(_in + 8); | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -449,7 +449,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out | |||||||
|             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |             c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -471,7 +471,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out | |||||||
|                     two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); |                     two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in += 2; |             _in += 2; | ||||||
|             _out += 4; |             _out += 4; | ||||||
|         } |         } | ||||||
| @@ -525,8 +525,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc | |||||||
|         { |         { | ||||||
|             for (j = 0; j < ROTATOR_RELOAD; j++) |             for (j = 0; j < ROTATOR_RELOAD; j++) | ||||||
|                 { |                 { | ||||||
|                     a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |                     a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|                     //complex 32fc multiplication b=a*two_phase_acc_reg |                     // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -535,7 +535,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc | |||||||
|                     b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |                     b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|                     c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |                     c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|                     //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |                     // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -543,11 +543,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc | |||||||
|                     tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di |                     tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|                     two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |                     two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|                     //next two samples |                     // next two samples | ||||||
|                     _in += 2; |                     _in += 2; | ||||||
|                     a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |                     a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|                     __VOLK_GNSSSDR_PREFETCH(_in + 8); |                     __VOLK_GNSSSDR_PREFETCH(_in + 8); | ||||||
|                     //complex 32fc multiplication b=a*two_phase_acc_reg |                     // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -556,7 +556,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc | |||||||
|                     b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |                     b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|                     c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |                     c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|                     //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |                     // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -568,7 +568,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc | |||||||
|                     result = _mm_packs_epi32(c1, c2);  // convert from 32ic to 16ic |                     result = _mm_packs_epi32(c1, c2);  // convert from 32ic to 16ic | ||||||
|                     _mm_storeu_si128((__m128i*)_out, result); |                     _mm_storeu_si128((__m128i*)_out, result); | ||||||
|  |  | ||||||
|                     //next two samples |                     // next two samples | ||||||
|                     _in += 2; |                     _in += 2; | ||||||
|                     _out += 4; |                     _out += 4; | ||||||
|                 } |                 } | ||||||
| @@ -582,8 +582,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc | |||||||
|  |  | ||||||
|     for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) |     for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) | ||||||
|         { |         { | ||||||
|             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -592,7 +592,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc | |||||||
|             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |             c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -600,11 +600,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc | |||||||
|             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di |             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in += 2; |             _in += 2; | ||||||
|             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             __VOLK_GNSSSDR_PREFETCH(_in + 8); |             __VOLK_GNSSSDR_PREFETCH(_in + 8); | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -613,7 +613,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc | |||||||
|             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |             c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -625,7 +625,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc | |||||||
|             result = _mm_packs_epi32(c1, c2);  // convert from 32ic to 16ic |             result = _mm_packs_epi32(c1, c2);  // convert from 32ic to 16ic | ||||||
|             _mm_storeu_si128((__m128i*)_out, result); |             _mm_storeu_si128((__m128i*)_out, result); | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in += 2; |             _in += 2; | ||||||
|             _out += 4; |             _out += 4; | ||||||
|         } |         } | ||||||
| @@ -746,9 +746,9 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe | |||||||
|                     // Regenerate phase |                     // Regenerate phase | ||||||
|                     if ((i % 512) == 0) |                     if ((i % 512) == 0) | ||||||
|                         { |                         { | ||||||
|                             //printf("Computed phase:  %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); |                             // printf("Computed phase:  %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); | ||||||
|                             phase_est = arg_phase0 + (i + 1) * 4 * arg_phase_inc; |                             phase_est = arg_phase0 + (i + 1) * 4 * arg_phase_inc; | ||||||
|                             //printf("Estimated phase: %f\n\n", cos(phase_est)); |                             // printf("Estimated phase: %f\n\n", cos(phase_est)); | ||||||
|  |  | ||||||
|                             *phase = lv_cmake(cos(phase_est), sin(phase_est)); |                             *phase = lv_cmake(cos(phase_est), sin(phase_est)); | ||||||
|                             phase2 = (lv_32fc_t)(*phase) * phase_inc; |                             phase2 = (lv_32fc_t)(*phase) * phase_inc; | ||||||
| @@ -887,9 +887,9 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t | |||||||
|                             _out += 4; |                             _out += 4; | ||||||
|                         } |                         } | ||||||
|                     // Regenerate phase |                     // Regenerate phase | ||||||
|                     //printf("Computed phase:  %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); |                     // printf("Computed phase:  %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); | ||||||
|                     phase_est = arg_phase0 + (n + 1) * ROTATOR_RELOAD * 4 * arg_phase_inc; |                     phase_est = arg_phase0 + (n + 1) * ROTATOR_RELOAD * 4 * arg_phase_inc; | ||||||
|                     //printf("Estimated phase: %f\n\n", cos(phase_est)); |                     // printf("Estimated phase: %f\n\n", cos(phase_est)); | ||||||
|                     *phase = lv_cmake(cos(phase_est), sin(phase_est)); |                     *phase = lv_cmake(cos(phase_est), sin(phase_est)); | ||||||
|                     phase2 = (lv_32fc_t)(*phase) * phase_inc; |                     phase2 = (lv_32fc_t)(*phase) * phase_inc; | ||||||
|                     phase3 = phase2 * phase_inc; |                     phase3 = phase2 * phase_inc; | ||||||
|   | |||||||
| @@ -28,7 +28,7 @@ | |||||||
|  * GNU General Public License for more details. |  * GNU General Public License for more details. | ||||||
|  * |  * | ||||||
|  * You should have received a copy of the GNU General Public License |  * You should have received a copy of the GNU General Public License | ||||||
|  * along with GNSS-SDR. If not, see <https://www.gnu.org/licenses/>. |  * along with GNSS-SDR. If not, see <https://www.gnu.org/licenses/>. | ||||||
|  * |  * | ||||||
|  * ------------------------------------------------------------------------- |  * ------------------------------------------------------------------------- | ||||||
|  */ |  */ | ||||||
| @@ -77,7 +77,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, | |||||||
|         } |         } | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_GENERIC*/ | #endif /* LV_HAVE_GENERIC */ | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE2 | #ifdef LV_HAVE_SSE2 | ||||||
| @@ -108,7 +108,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con | |||||||
|             for (number = 0; number < sse_iters; number++) |             for (number = 0; number < sse_iters; number++) | ||||||
|                 { |                 { | ||||||
|                     // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] |                     // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] | ||||||
|                     a = _mm_load_si128((__m128i*)_in_a);  //load (2 byte imag, 2 byte real) x 4 into 128 bits reg |                     a = _mm_load_si128((__m128i*)_in_a);  // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|                     __VOLK_GNSSSDR_PREFETCH(_in_a + 8); |                     __VOLK_GNSSSDR_PREFETCH(_in_a + 8); | ||||||
|                     b = _mm_load_si128((__m128i*)_in_b); |                     b = _mm_load_si128((__m128i*)_in_b); | ||||||
|                     __VOLK_GNSSSDR_PREFETCH(_in_b + 8); |                     __VOLK_GNSSSDR_PREFETCH(_in_b + 8); | ||||||
| @@ -123,7 +123,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con | |||||||
|                     imag1 = _mm_mullo_epi16(a, b_sl);  // a3.i*b3.r, .... |                     imag1 = _mm_mullo_epi16(a, b_sl);  // a3.i*b3.r, .... | ||||||
|                     imag2 = _mm_mullo_epi16(b, a_sl);  // b3.i*a3.r, .... |                     imag2 = _mm_mullo_epi16(b, a_sl);  // b3.i*a3.r, .... | ||||||
|  |  | ||||||
|                     imag = _mm_adds_epi16(imag1, imag2);  //with saturation arithmetic! |                     imag = _mm_adds_epi16(imag1, imag2);  // with saturation arithmetic! | ||||||
|  |  | ||||||
|                     realcacc = _mm_adds_epi16(realcacc, real); |                     realcacc = _mm_adds_epi16(realcacc, real); | ||||||
|                     imagcacc = _mm_adds_epi16(imagcacc, imag); |                     imagcacc = _mm_adds_epi16(imagcacc, imag); | ||||||
| @@ -186,10 +186,10 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con | |||||||
|  |  | ||||||
|             for (number = 0; number < sse_iters; number++) |             for (number = 0; number < sse_iters; number++) | ||||||
|                 { |                 { | ||||||
|                     //std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i] |                     // std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i] | ||||||
|                     //imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1] |                     // imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1] | ||||||
|                     // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] |                     // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] | ||||||
|                     a = _mm_loadu_si128((__m128i*)_in_a);  //load (2 byte imag, 2 byte real) x 4 into 128 bits reg |                     a = _mm_loadu_si128((__m128i*)_in_a);  // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|                     __VOLK_GNSSSDR_PREFETCH(_in_a + 8); |                     __VOLK_GNSSSDR_PREFETCH(_in_a + 8); | ||||||
|                     b = _mm_loadu_si128((__m128i*)_in_b); |                     b = _mm_loadu_si128((__m128i*)_in_b); | ||||||
|                     __VOLK_GNSSSDR_PREFETCH(_in_b + 8); |                     __VOLK_GNSSSDR_PREFETCH(_in_b + 8); | ||||||
| @@ -204,7 +204,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con | |||||||
|                     imag1 = _mm_mullo_epi16(a, b_sl);  // a3.i*b3.r, .... |                     imag1 = _mm_mullo_epi16(a, b_sl);  // a3.i*b3.r, .... | ||||||
|                     imag2 = _mm_mullo_epi16(b, a_sl);  // b3.i*a3.r, .... |                     imag2 = _mm_mullo_epi16(b, a_sl);  // b3.i*a3.r, .... | ||||||
|  |  | ||||||
|                     imag = _mm_adds_epi16(imag1, imag2);  //with saturation arithmetic! |                     imag = _mm_adds_epi16(imag1, imag2);  // with saturation arithmetic! | ||||||
|  |  | ||||||
|                     realcacc = _mm_adds_epi16(realcacc, real); |                     realcacc = _mm_adds_epi16(realcacc, real); | ||||||
|                     imagcacc = _mm_adds_epi16(imagcacc, imag); |                     imagcacc = _mm_adds_epi16(imagcacc, imag); | ||||||
| @@ -281,7 +281,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con | |||||||
|                     imag1 = _mm256_mullo_epi16(a, b_sl); |                     imag1 = _mm256_mullo_epi16(a, b_sl); | ||||||
|                     imag2 = _mm256_mullo_epi16(b, a_sl); |                     imag2 = _mm256_mullo_epi16(b, a_sl); | ||||||
|  |  | ||||||
|                     imag = _mm256_adds_epi16(imag1, imag2);  //with saturation arithmetic! |                     imag = _mm256_adds_epi16(imag1, imag2);  // with saturation arithmetic! | ||||||
|  |  | ||||||
|                     realcacc = _mm256_adds_epi16(realcacc, real); |                     realcacc = _mm256_adds_epi16(realcacc, real); | ||||||
|                     imagcacc = _mm256_adds_epi16(imagcacc, imag); |                     imagcacc = _mm256_adds_epi16(imagcacc, imag); | ||||||
| @@ -359,7 +359,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con | |||||||
|                     imag1 = _mm256_mullo_epi16(a, b_sl); |                     imag1 = _mm256_mullo_epi16(a, b_sl); | ||||||
|                     imag2 = _mm256_mullo_epi16(b, a_sl); |                     imag2 = _mm256_mullo_epi16(b, a_sl); | ||||||
|  |  | ||||||
|                     imag = _mm256_adds_epi16(imag1, imag2);  //with saturation arithmetic! |                     imag = _mm256_adds_epi16(imag1, imag2);  // with saturation arithmetic! | ||||||
|  |  | ||||||
|                     realcacc = _mm256_adds_epi16(realcacc, real); |                     realcacc = _mm256_adds_epi16(realcacc, real); | ||||||
|                     imagcacc = _mm256_adds_epi16(imagcacc, imag); |                     imagcacc = _mm256_adds_epi16(imagcacc, imag); | ||||||
| @@ -571,4 +571,4 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out | |||||||
|  |  | ||||||
| #endif /* LV_HAVE_NEONV7 */ | #endif /* LV_HAVE_NEONV7 */ | ||||||
|  |  | ||||||
| #endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_H*/ | #endif /* INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_H */ | ||||||
|   | |||||||
| @@ -77,15 +77,15 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(lv_16sc_t* resu | |||||||
|             result[n_vec] = lv_cmake(0, 0); |             result[n_vec] = lv_cmake(0, 0); | ||||||
|             for (n = 0; n < num_points; n++) |             for (n = 0; n < num_points; n++) | ||||||
|                 { |                 { | ||||||
|                     //r*a.r - i*a.i, i*a.r + r*a.i |                     // r*a.r - i*a.i, i*a.r + r*a.i | ||||||
|                     //result[n_vec]+=in_common[n]*in_a[n_vec][n]; |                     // result[n_vec]+=in_common[n]*in_a[n_vec][n]; | ||||||
|                     lv_16sc_t tmp = in_common[n] * in_a[n_vec][n]; |                     lv_16sc_t tmp = in_common[n] * in_a[n_vec][n]; | ||||||
|                     result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); |                     result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); | ||||||
|                 } |                 } | ||||||
|         } |         } | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_GENERIC*/ | #endif /* LV_HAVE_GENERIC */ | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC | #ifdef LV_HAVE_GENERIC | ||||||
| @@ -106,7 +106,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(lv_16sc_t* | |||||||
|         } |         } | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_GENERIC*/ | #endif /* LV_HAVE_GENERIC */ | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE2 | #ifdef LV_HAVE_SSE2 | ||||||
| @@ -145,11 +145,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul | |||||||
|             for (index = 0; index < sse_iters; index++) |             for (index = 0; index < sse_iters; index++) | ||||||
|                 { |                 { | ||||||
|                     // b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] |                     // b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] | ||||||
|                     b = _mm_load_si128((__m128i*)_in_common);  //load (2 byte imag, 2 byte real) x 4 into 128 bits reg |                     b = _mm_load_si128((__m128i*)_in_common);  // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|                     __VOLK_GNSSSDR_PREFETCH(_in_common + 8); |                     __VOLK_GNSSSDR_PREFETCH(_in_common + 8); | ||||||
|                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|                         { |                         { | ||||||
|                             a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index * 4]));  //load (2 byte imag, 2 byte real) x 4 into 128 bits reg |                             a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index * 4]));  // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|  |  | ||||||
|                             c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, .... |                             c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, .... | ||||||
|  |  | ||||||
| @@ -240,11 +240,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul | |||||||
|             for (index = 0; index < sse_iters; index++) |             for (index = 0; index < sse_iters; index++) | ||||||
|                 { |                 { | ||||||
|                     // b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] |                     // b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] | ||||||
|                     b = _mm_loadu_si128((__m128i*)_in_common);  //load (2 byte imag, 2 byte real) x 4 into 128 bits reg |                     b = _mm_loadu_si128((__m128i*)_in_common);  // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|                     __VOLK_GNSSSDR_PREFETCH(_in_common + 8); |                     __VOLK_GNSSSDR_PREFETCH(_in_common + 8); | ||||||
|                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|                         { |                         { | ||||||
|                             a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index * 4]));  //load (2 byte imag, 2 byte real) x 4 into 128 bits reg |                             a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index * 4]));  // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|  |  | ||||||
|                             c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, .... |                             c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, .... | ||||||
|  |  | ||||||
| @@ -522,12 +522,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, | |||||||
|  |  | ||||||
|             for (index = 0; index < neon_iters; index++) |             for (index = 0; index < neon_iters; index++) | ||||||
|                 { |                 { | ||||||
|                     b_val = vld2_s16((int16_t*)_in_common);  //load (2 byte imag, 2 byte real) x 4 into 128 bits reg |                     b_val = vld2_s16((int16_t*)_in_common);  // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|                     __VOLK_GNSSSDR_PREFETCH(_in_common + 8); |                     __VOLK_GNSSSDR_PREFETCH(_in_common + 8); | ||||||
|                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|                         { |                         { | ||||||
|                             a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4]));  //load (2 byte imag, 2 byte real) x 4 into 128 bits reg |                             a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4]));  // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|                             //__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][index*4] + 8); |                             // __VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][index*4] + 8); | ||||||
|  |  | ||||||
|                             // multiply the real*real and imag*imag to get real result |                             // multiply the real*real and imag*imag to get real result | ||||||
|                             // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r |                             // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r | ||||||
| @@ -609,7 +609,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res | |||||||
|  |  | ||||||
|             for (index = 0; index < neon_iters; index++) |             for (index = 0; index < neon_iters; index++) | ||||||
|                 { |                 { | ||||||
|                     b_val = vld2_s16((int16_t*)_in_common);  //load (2 byte imag, 2 byte real) x 4 into 128 bits reg |                     b_val = vld2_s16((int16_t*)_in_common);  // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|                     __VOLK_GNSSSDR_PREFETCH(_in_common + 8); |                     __VOLK_GNSSSDR_PREFETCH(_in_common + 8); | ||||||
|                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|                         { |                         { | ||||||
| @@ -690,7 +690,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* | |||||||
|  |  | ||||||
|             for (index = 0; index < neon_iters; index++) |             for (index = 0; index < neon_iters; index++) | ||||||
|                 { |                 { | ||||||
|                     b_val = vld2_s16((int16_t*)_in_common);  //load (2 byte imag, 2 byte real) x 4 into 128 bits reg |                     b_val = vld2_s16((int16_t*)_in_common);  // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|                     __VOLK_GNSSSDR_PREFETCH(_in_common + 8); |                     __VOLK_GNSSSDR_PREFETCH(_in_common + 8); | ||||||
|                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|                         { |                         { | ||||||
| @@ -738,4 +738,4 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* | |||||||
| } | } | ||||||
| #endif /* LV_HAVE_NEONV7 */ | #endif /* LV_HAVE_NEONV7 */ | ||||||
|  |  | ||||||
| #endif /*INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_H*/ | #endif /* INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_H */ | ||||||
|   | |||||||
| @@ -68,12 +68,12 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, | |||||||
|     unsigned int n; |     unsigned int n; | ||||||
|     for (n = 0; n < num_points; n++) |     for (n = 0; n < num_points; n++) | ||||||
|         { |         { | ||||||
|             //r*a.r - i*a.i, i*a.r + r*a.i |             // r*a.r - i*a.i, i*a.r + r*a.i | ||||||
|             result[n] = in_a[n] * in_b[n]; |             result[n] = in_a[n] * in_b[n]; | ||||||
|         } |         } | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_GENERIC*/ | #endif /* LV_HAVE_GENERIC */ | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE2 | #ifdef LV_HAVE_SSE2 | ||||||
| @@ -93,10 +93,10 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, con | |||||||
|     lv_16sc_t* _out = out; |     lv_16sc_t* _out = out; | ||||||
|     for (number = 0; number < sse_iters; number++) |     for (number = 0; number < sse_iters; number++) | ||||||
|         { |         { | ||||||
|             //std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i] |             // std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i] | ||||||
|             //imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1] |             // imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1] | ||||||
|             // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] |             // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] | ||||||
|             a = _mm_load_si128((__m128i*)_in_a);  //load (2 byte imag, 2 byte real) x 4 into 128 bits reg |             a = _mm_load_si128((__m128i*)_in_a);  // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|             b = _mm_load_si128((__m128i*)_in_b); |             b = _mm_load_si128((__m128i*)_in_b); | ||||||
|             c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, .... |             c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, .... | ||||||
|  |  | ||||||
| @@ -147,10 +147,10 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, con | |||||||
|     lv_16sc_t* _out = out; |     lv_16sc_t* _out = out; | ||||||
|     for (number = 0; number < sse_iters; number++) |     for (number = 0; number < sse_iters; number++) | ||||||
|         { |         { | ||||||
|             //std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i] |             // std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i] | ||||||
|             //imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1] |             // imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1] | ||||||
|             // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] |             // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] | ||||||
|             a = _mm_loadu_si128((__m128i*)_in_a);  //load (2 byte imag, 2 byte real) x 4 into 128 bits reg |             a = _mm_loadu_si128((__m128i*)_in_a);  // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|             b = _mm_loadu_si128((__m128i*)_in_b); |             b = _mm_loadu_si128((__m128i*)_in_b); | ||||||
|             c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, .... |             c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, .... | ||||||
|  |  | ||||||
| @@ -340,4 +340,4 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const | |||||||
| } | } | ||||||
| #endif /* LV_HAVE_NEONV7*/ | #endif /* LV_HAVE_NEONV7*/ | ||||||
|  |  | ||||||
| #endif /*INCLUDED_volk_gnsssdr_16ic_x2_multiply_16ic_H*/ | #endif /* INCLUDED_volk_gnsssdr_16ic_x2_multiply_16ic_H */ | ||||||
|   | |||||||
| @@ -89,33 +89,33 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(lv_16sc | |||||||
|         } |         } | ||||||
|     for (n = 0; n < num_points; n++) |     for (n = 0; n < num_points; n++) | ||||||
|         { |         { | ||||||
|             tmp16 = *in_common++;  //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); |             tmp16 = *in_common++;  // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); | ||||||
|             tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); |             tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); | ||||||
|             tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); |             tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); | ||||||
|  |  | ||||||
|             // Regenerate phase |             // Regenerate phase | ||||||
|             if (n % 256 == 0) |             if (n % 256 == 0) | ||||||
|                 { |                 { | ||||||
|                     //printf("Phase before regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); |                     // printf("Phase before regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
|                     (*phase) /= std::abs((*phase)); |                     (*phase) /= std::abs((*phase)); | ||||||
| #else | #else | ||||||
|                     (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); |                     (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); | ||||||
| #endif | #endif | ||||||
|                     //printf("Phase after regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); |                     // printf("Phase after regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|             (*phase) *= phase_inc; |             (*phase) *= phase_inc; | ||||||
|             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|                 { |                 { | ||||||
|                     lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; |                     lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; | ||||||
|                     //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); |                     // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); | ||||||
|                     result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); |                     result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); | ||||||
|                 } |                 } | ||||||
|         } |         } | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_GENERIC*/ | #endif /* LV_HAVE_GENERIC */ | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC | #ifdef LV_HAVE_GENERIC | ||||||
| @@ -137,14 +137,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload( | |||||||
|         { |         { | ||||||
|             for (j = 0; j < ROTATOR_RELOAD; j++) |             for (j = 0; j < ROTATOR_RELOAD; j++) | ||||||
|                 { |                 { | ||||||
|                     tmp16 = *in_common++;  //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); |                     tmp16 = *in_common++;  // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); | ||||||
|                     tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); |                     tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); | ||||||
|                     tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); |                     tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); | ||||||
|                     (*phase) *= phase_inc; |                     (*phase) *= phase_inc; | ||||||
|                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|                         { |                         { | ||||||
|                             lv_16sc_t tmp = tmp16 * in_a[n_vec][n * ROTATOR_RELOAD + j]; |                             lv_16sc_t tmp = tmp16 * in_a[n_vec][n * ROTATOR_RELOAD + j]; | ||||||
|                             //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); |                             // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); | ||||||
|                             result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); |                             result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); | ||||||
|                         } |                         } | ||||||
|                 } |                 } | ||||||
| @@ -152,27 +152,27 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload( | |||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
|             (*phase) /= std::abs((*phase)); |             (*phase) /= std::abs((*phase)); | ||||||
| #else | #else | ||||||
|             //(*phase) /= cabsf((*phase)); |             // (*phase) /= cabsf((*phase)); | ||||||
|             (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); |             (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); | ||||||
| #endif | #endif | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     for (j = 0; j < num_points % ROTATOR_RELOAD; j++) |     for (j = 0; j < num_points % ROTATOR_RELOAD; j++) | ||||||
|         { |         { | ||||||
|             tmp16 = *in_common++;  //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); |             tmp16 = *in_common++;  // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); | ||||||
|             tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); |             tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); | ||||||
|             tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); |             tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); | ||||||
|             (*phase) *= phase_inc; |             (*phase) *= phase_inc; | ||||||
|             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|                 { |                 { | ||||||
|                     lv_16sc_t tmp = tmp16 * in_a[n_vec][(num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j]; |                     lv_16sc_t tmp = tmp16 * in_a[n_vec][(num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j]; | ||||||
|                     //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); |                     // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); | ||||||
|                     result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); |                     result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); | ||||||
|                 } |                 } | ||||||
|         } |         } | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_GENERIC*/ | #endif /* LV_HAVE_GENERIC */ | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE3 | #ifdef LV_HAVE_SSE3 | ||||||
| @@ -228,9 +228,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ | |||||||
|     for (number = 0; number < sse_iters; number++) |     for (number = 0; number < sse_iters; number++) | ||||||
|         { |         { | ||||||
|             // Phase rotation on operand in_common starts here: |             // Phase rotation on operand in_common starts here: | ||||||
|             //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); |             // printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); | ||||||
|             pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(pa, yl);                // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(pa, yl);                // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -239,7 +239,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ | |||||||
|             pb = _mm_addsub_ps(tmp1, tmp2);           // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             pb = _mm_addsub_ps(tmp1, tmp2);           // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             pc1 = _mm_cvtps_epi32(pb);                // convert from 32fc to 32ic |             pc1 = _mm_cvtps_epi32(pb);                // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -247,11 +247,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ | |||||||
|             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di |             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in_common += 2; |             _in_common += 2; | ||||||
|             pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             __VOLK_GNSSSDR_PREFETCH(_in_common + 8); |             __VOLK_GNSSSDR_PREFETCH(_in_common + 8); | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(pa, yl);                // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(pa, yl);                // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -260,7 +260,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ | |||||||
|             pb = _mm_addsub_ps(tmp1, tmp2);           // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             pb = _mm_addsub_ps(tmp1, tmp2);           // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             pc2 = _mm_cvtps_epi32(pb);                // convert from 32fc to 32ic |             pc2 = _mm_cvtps_epi32(pb);                // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -271,12 +271,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ | |||||||
|             // store four rotated in_common samples in the register b |             // store four rotated in_common samples in the register b | ||||||
|             b = _mm_packs_epi32(pc1, pc2);  // convert from 32ic to 16ic |             b = _mm_packs_epi32(pc1, pc2);  // convert from 32ic to 16ic | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in_common += 2; |             _in_common += 2; | ||||||
|  |  | ||||||
|             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|                 { |                 { | ||||||
|                     a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number * 4]));  //load (2 byte imag, 2 byte real) x 4 into 128 bits reg |                     a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number * 4]));  // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|  |  | ||||||
|                     c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, .... |                     c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, .... | ||||||
|  |  | ||||||
| @@ -331,12 +331,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ | |||||||
|     two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); |     two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); | ||||||
|  |  | ||||||
|     _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); |     _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); | ||||||
|     //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); |     // (*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); | ||||||
|     (*phase) = two_phase_acc[0]; |     (*phase) = two_phase_acc[0]; | ||||||
|  |  | ||||||
|     for (n = sse_iters * 4; n < num_points; n++) |     for (n = sse_iters * 4; n < num_points; n++) | ||||||
|         { |         { | ||||||
|             tmp16 = in_common[n];  //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); |             tmp16 = in_common[n];  // printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); | ||||||
|             tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); |             tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); | ||||||
|             tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); |             tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); | ||||||
|             (*phase) *= phase_inc; |             (*phase) *= phase_inc; | ||||||
| @@ -344,7 +344,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ | |||||||
|             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|                 { |                 { | ||||||
|                     lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; |                     lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; | ||||||
|                     //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); |                     // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); | ||||||
|                     _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), |                     _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), | ||||||
|                         sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); |                         sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); | ||||||
|                 } |                 } | ||||||
| @@ -411,9 +411,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l | |||||||
|             for (j = 0; j < ROTATOR_RELOAD; j++) |             for (j = 0; j < ROTATOR_RELOAD; j++) | ||||||
|                 { |                 { | ||||||
|                     // Phase rotation on operand in_common starts here: |                     // Phase rotation on operand in_common starts here: | ||||||
|                     //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); |                     // printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); | ||||||
|                     pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |                     pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|                     //complex 32fc multiplication b=a*two_phase_acc_reg |                     // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(pa, yl);                // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(pa, yl);                // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -422,7 +422,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l | |||||||
|                     pb = _mm_addsub_ps(tmp1, tmp2);           // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |                     pb = _mm_addsub_ps(tmp1, tmp2);           // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|                     pc1 = _mm_cvtps_epi32(pb);                // convert from 32fc to 32ic |                     pc1 = _mm_cvtps_epi32(pb);                // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|                     //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |                     // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -430,11 +430,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l | |||||||
|                     tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di |                     tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|                     two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |                     two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|                     //next two samples |                     // next two samples | ||||||
|                     _in_common += 2; |                     _in_common += 2; | ||||||
|                     pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |                     pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|                     __VOLK_GNSSSDR_PREFETCH(_in_common + 8); |                     __VOLK_GNSSSDR_PREFETCH(_in_common + 8); | ||||||
|                     //complex 32fc multiplication b=a*two_phase_acc_reg |                     // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(pa, yl);                // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(pa, yl);                // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -443,7 +443,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l | |||||||
|                     pb = _mm_addsub_ps(tmp1, tmp2);           // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |                     pb = _mm_addsub_ps(tmp1, tmp2);           // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|                     pc2 = _mm_cvtps_epi32(pb);                // convert from 32fc to 32ic |                     pc2 = _mm_cvtps_epi32(pb);                // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|                     //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |                     // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -454,12 +454,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l | |||||||
|                     // store four rotated in_common samples in the register b |                     // store four rotated in_common samples in the register b | ||||||
|                     b = _mm_packs_epi32(pc1, pc2);  // convert from 32ic to 16ic |                     b = _mm_packs_epi32(pc1, pc2);  // convert from 32ic to 16ic | ||||||
|  |  | ||||||
|                     //next two samples |                     // next two samples | ||||||
|                     _in_common += 2; |                     _in_common += 2; | ||||||
|  |  | ||||||
|                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|                         { |                         { | ||||||
|                             a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4]));  //load (2 byte imag, 2 byte real) x 4 into 128 bits reg |                             a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4]));  // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|  |  | ||||||
|                             c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, .... |                             c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, .... | ||||||
|  |  | ||||||
| @@ -488,8 +488,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l | |||||||
|  |  | ||||||
|     for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) |     for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) | ||||||
|         { |         { | ||||||
|             pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(pa, yl);                // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(pa, yl);                // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -498,7 +498,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l | |||||||
|             pb = _mm_addsub_ps(tmp1, tmp2);           // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             pb = _mm_addsub_ps(tmp1, tmp2);           // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             pc1 = _mm_cvtps_epi32(pb);                // convert from 32fc to 32ic |             pc1 = _mm_cvtps_epi32(pb);                // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -506,11 +506,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l | |||||||
|             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di |             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in_common += 2; |             _in_common += 2; | ||||||
|             pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             __VOLK_GNSSSDR_PREFETCH(_in_common + 8); |             __VOLK_GNSSSDR_PREFETCH(_in_common + 8); | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(pa, yl);                // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(pa, yl);                // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -519,7 +519,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l | |||||||
|             pb = _mm_addsub_ps(tmp1, tmp2);           // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             pb = _mm_addsub_ps(tmp1, tmp2);           // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             pc2 = _mm_cvtps_epi32(pb);                // convert from 32fc to 32ic |             pc2 = _mm_cvtps_epi32(pb);                // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -530,12 +530,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l | |||||||
|             // store four rotated in_common samples in the register b |             // store four rotated in_common samples in the register b | ||||||
|             b = _mm_packs_epi32(pc1, pc2);  // convert from 32ic to 16ic |             b = _mm_packs_epi32(pc1, pc2);  // convert from 32ic to 16ic | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in_common += 2; |             _in_common += 2; | ||||||
|  |  | ||||||
|             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|                 { |                 { | ||||||
|                     a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4]));  //load (2 byte imag, 2 byte real) x 4 into 128 bits reg |                     a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4]));  // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|  |  | ||||||
|                     c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, .... |                     c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, .... | ||||||
|  |  | ||||||
| @@ -582,12 +582,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l | |||||||
|     two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); |     two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); | ||||||
|  |  | ||||||
|     _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); |     _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); | ||||||
|     //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); |     // (*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); | ||||||
|     (*phase) = two_phase_acc[0]; |     (*phase) = two_phase_acc[0]; | ||||||
|  |  | ||||||
|     for (n = sse_iters * 4; n < num_points; n++) |     for (n = sse_iters * 4; n < num_points; n++) | ||||||
|         { |         { | ||||||
|             tmp16 = in_common[n];  //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); |             tmp16 = in_common[n];  // printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); | ||||||
|             tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); |             tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); | ||||||
|             tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); |             tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); | ||||||
|             (*phase) *= phase_inc; |             (*phase) *= phase_inc; | ||||||
| @@ -595,7 +595,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l | |||||||
|             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|                 { |                 { | ||||||
|                     lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; |                     lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; | ||||||
|                     //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); |                     // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); | ||||||
|                     _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), |                     _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), | ||||||
|                         sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); |                         sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); | ||||||
|                 } |                 } | ||||||
| @@ -657,9 +657,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ | |||||||
|     for (number = 0; number < sse_iters; number++) |     for (number = 0; number < sse_iters; number++) | ||||||
|         { |         { | ||||||
|             // Phase rotation on operand in_common starts here: |             // Phase rotation on operand in_common starts here: | ||||||
|             pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             __VOLK_GNSSSDR_PREFETCH(_in_common + 8); |             __VOLK_GNSSSDR_PREFETCH(_in_common + 8); | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(pa, yl);                // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(pa, yl);                // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -668,7 +668,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ | |||||||
|             pb = _mm_addsub_ps(tmp1, tmp2);           // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             pb = _mm_addsub_ps(tmp1, tmp2);           // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             pc1 = _mm_cvtps_epi32(pb);                // convert from 32fc to 32ic |             pc1 = _mm_cvtps_epi32(pb);                // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -676,11 +676,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ | |||||||
|             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di |             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in_common += 2; |             _in_common += 2; | ||||||
|             pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             __VOLK_GNSSSDR_PREFETCH(_in_common + 8); |             __VOLK_GNSSSDR_PREFETCH(_in_common + 8); | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(pa, yl);                // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(pa, yl);                // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -689,7 +689,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ | |||||||
|             pb = _mm_addsub_ps(tmp1, tmp2);           // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             pb = _mm_addsub_ps(tmp1, tmp2);           // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             pc2 = _mm_cvtps_epi32(pb);                // convert from 32fc to 32ic |             pc2 = _mm_cvtps_epi32(pb);                // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -700,12 +700,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ | |||||||
|             // store four rotated in_common samples in the register b |             // store four rotated in_common samples in the register b | ||||||
|             b = _mm_packs_epi32(pc1, pc2);  // convert from 32ic to 16ic |             b = _mm_packs_epi32(pc1, pc2);  // convert from 32ic to 16ic | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in_common += 2; |             _in_common += 2; | ||||||
|  |  | ||||||
|             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|                 { |                 { | ||||||
|                     a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number * 4]));  //load (2 byte imag, 2 byte real) x 4 into 128 bits reg |                     a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number * 4]));  // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|  |  | ||||||
|                     c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, .... |                     c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, .... | ||||||
|  |  | ||||||
| @@ -824,8 +824,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ | |||||||
|  |  | ||||||
|     for (number = 0; number < avx2_iters; number++) |     for (number = 0; number < avx2_iters; number++) | ||||||
|         { |         { | ||||||
|             a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -834,7 +834,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ | |||||||
|             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |             c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -842,11 +842,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ | |||||||
|             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di |             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in_common += 2; |             _in_common += 2; | ||||||
|             a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|  |  | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -855,7 +855,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ | |||||||
|             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |             c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -866,8 +866,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ | |||||||
|             // store four output samples |             // store four output samples | ||||||
|             result1 = _mm_packs_epi32(c1, c2);  // convert from 32ic to 16ic |             result1 = _mm_packs_epi32(c1, c2);  // convert from 32ic to 16ic | ||||||
|             _in_common += 2; |             _in_common += 2; | ||||||
|             a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -876,7 +876,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ | |||||||
|             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |             c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -884,11 +884,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ | |||||||
|             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di |             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in_common += 2; |             _in_common += 2; | ||||||
|             a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             __VOLK_GNSSSDR_PREFETCH(_in_common + 16); |             __VOLK_GNSSSDR_PREFETCH(_in_common + 16); | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -897,7 +897,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ | |||||||
|             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |             c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -1036,8 +1036,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l | |||||||
|         { |         { | ||||||
|             for (j = 0; j < ROTATOR_RELOAD; j++) |             for (j = 0; j < ROTATOR_RELOAD; j++) | ||||||
|                 { |                 { | ||||||
|                     a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |                     a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|                     //complex 32fc multiplication b=a*two_phase_acc_reg |                     // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -1046,7 +1046,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l | |||||||
|                     b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |                     b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|                     c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |                     c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|                     //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |                     // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -1054,11 +1054,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l | |||||||
|                     tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di |                     tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|                     two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |                     two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|                     //next two samples |                     // next two samples | ||||||
|                     _in_common += 2; |                     _in_common += 2; | ||||||
|                     a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |                     a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|  |  | ||||||
|                     //complex 32fc multiplication b=a*two_phase_acc_reg |                     // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -1067,7 +1067,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l | |||||||
|                     b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |                     b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|                     c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |                     c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|                     //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |                     // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -1078,8 +1078,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l | |||||||
|                     // store four output samples |                     // store four output samples | ||||||
|                     result1 = _mm_packs_epi32(c1, c2);  // convert from 32ic to 16ic |                     result1 = _mm_packs_epi32(c1, c2);  // convert from 32ic to 16ic | ||||||
|                     _in_common += 2; |                     _in_common += 2; | ||||||
|                     a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |                     a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|                     //complex 32fc multiplication b=a*two_phase_acc_reg |                     // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -1088,7 +1088,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l | |||||||
|                     b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |                     b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|                     c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |                     c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|                     //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |                     // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -1096,11 +1096,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l | |||||||
|                     tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di |                     tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|                     two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |                     two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|                     //next two samples |                     // next two samples | ||||||
|                     _in_common += 2; |                     _in_common += 2; | ||||||
|                     a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |                     a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|                     __VOLK_GNSSSDR_PREFETCH(_in_common + 16); |                     __VOLK_GNSSSDR_PREFETCH(_in_common + 16); | ||||||
|                     //complex 32fc multiplication b=a*two_phase_acc_reg |                     // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -1109,7 +1109,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l | |||||||
|                     b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |                     b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|                     c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |                     c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|                     //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |                     // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |                     yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |                     yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -1152,8 +1152,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l | |||||||
|  |  | ||||||
|     for (j = 0; j < avx2_iters % ROTATOR_RELOAD; j++) |     for (j = 0; j < avx2_iters % ROTATOR_RELOAD; j++) | ||||||
|         { |         { | ||||||
|             a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -1162,7 +1162,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l | |||||||
|             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |             c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -1170,11 +1170,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l | |||||||
|             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di |             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in_common += 2; |             _in_common += 2; | ||||||
|             a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|  |  | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -1183,7 +1183,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l | |||||||
|             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |             c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -1194,8 +1194,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l | |||||||
|             // store four output samples |             // store four output samples | ||||||
|             result1 = _mm_packs_epi32(c1, c2);  // convert from 32ic to 16ic |             result1 = _mm_packs_epi32(c1, c2);  // convert from 32ic to 16ic | ||||||
|             _in_common += 2; |             _in_common += 2; | ||||||
|             a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -1204,7 +1204,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l | |||||||
|             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |             c1 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -1212,11 +1212,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l | |||||||
|             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di |             tmp2 = _mm_mul_ps(tmp3, yh);                                        // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2);                      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in_common += 2; |             _in_common += 2; | ||||||
|             a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg |             a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0])));  // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg | ||||||
|             __VOLK_GNSSSDR_PREFETCH(_in_common + 16); |             __VOLK_GNSSSDR_PREFETCH(_in_common + 16); | ||||||
|             //complex 32fc multiplication b=a*two_phase_acc_reg |             // complex 32fc multiplication b=a*two_phase_acc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);  // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(a, yl);                 // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -1225,7 +1225,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l | |||||||
|             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |             b = _mm_addsub_ps(tmp1, tmp2);            // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|             c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic |             c2 = _mm_cvtps_epi32(b);                  // convert from 32fc to 32ic | ||||||
|  |  | ||||||
|             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg |             // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg | ||||||
|             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(two_phase_acc_reg);                            // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di |             yh = _mm_movehdup_ps(two_phase_acc_reg);                            // Load yh with ci,ci,di,di | ||||||
|             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl);                           // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
| @@ -1414,8 +1414,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* | |||||||
|  |  | ||||||
|                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|                         { |                         { | ||||||
|                             a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number * 4]));  //load (2 byte imag, 2 byte real) x 4 into 128 bits reg |                             a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number * 4]));  // load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|                             //__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][number*4] + 8); |                             // __VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][number*4] + 8); | ||||||
|  |  | ||||||
|                             // multiply the real*real and imag*imag to get real result |                             // multiply the real*real and imag*imag to get real result | ||||||
|                             // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r |                             // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r | ||||||
| @@ -1474,7 +1474,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* | |||||||
|  |  | ||||||
|     for (n = neon_iters * 4; n < num_points; n++) |     for (n = neon_iters * 4; n < num_points; n++) | ||||||
|         { |         { | ||||||
|             tmp16_ = in_common[n];  //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); |             tmp16_ = in_common[n];  // printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); | ||||||
|             tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); |             tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); | ||||||
|             tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); |             tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); | ||||||
|             (*phase) *= phase_inc; |             (*phase) *= phase_inc; | ||||||
| @@ -1513,7 +1513,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s | |||||||
|             float arg_phase0 = cargf(*phase); |             float arg_phase0 = cargf(*phase); | ||||||
|             float arg_phase_inc = cargf(phase_inc); |             float arg_phase_inc = cargf(phase_inc); | ||||||
|             float phase_est; |             float phase_est; | ||||||
|             //printf("arg phase0: %f", arg_phase0); |             // printf("arg phase0: %f", arg_phase0); | ||||||
|             lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; |             lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; | ||||||
|             __VOLK_ATTR_ALIGNED(16) |             __VOLK_ATTR_ALIGNED(16) | ||||||
|             float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)}; |             float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)}; | ||||||
| @@ -1605,9 +1605,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s | |||||||
|                     // Regenerate phase |                     // Regenerate phase | ||||||
|                     if ((number % 256) == 0) |                     if ((number % 256) == 0) | ||||||
|                         { |                         { | ||||||
|                             //printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); |                             // printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); | ||||||
|                             phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; |                             phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; | ||||||
|                             //printf("Estimated phase: %f\n\n", cos(phase_est)); |                             // printf("Estimated phase: %f\n\n", cos(phase_est)); | ||||||
|  |  | ||||||
|                             *phase = lv_cmake(cos(phase_est), sin(phase_est)); |                             *phase = lv_cmake(cos(phase_est), sin(phase_est)); | ||||||
|                             phase2 = (lv_32fc_t)(*phase) * phase_inc; |                             phase2 = (lv_32fc_t)(*phase) * phase_inc; | ||||||
| @@ -1624,14 +1624,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s | |||||||
|  |  | ||||||
|                             // Round = vmulq_f32(_phase_real, _phase_real); |                             // Round = vmulq_f32(_phase_real, _phase_real); | ||||||
|                             // Round = vmlaq_f32(Round, _phase_imag, _phase_imag); |                             // Round = vmlaq_f32(Round, _phase_imag, _phase_imag); | ||||||
|                             //               Round = vsqrtq_f32(Round);//printf("sqrt: %f \n", Round[0]); |                             //               Round = vsqrtq_f32(Round);// printf("sqrt: %f \n", Round[0]); | ||||||
|                             //Round = vrsqrteq_f32(Round);printf("1/sqtr: %f  \n",Round[0]); |                             // Round = vrsqrteq_f32(Round);printf("1/sqtr: %f  \n",Round[0]); | ||||||
|                             //Round = vrecpeq_f32((Round); |                             // Round = vrecpeq_f32((Round); | ||||||
|                             //              _phase_real = vdivq_f32(_phase_real, Round); |                             //              _phase_real = vdivq_f32(_phase_real, Round); | ||||||
|                             //              _phase_imag = vdivq_f32(_phase_imag, Round); |                             //              _phase_imag = vdivq_f32(_phase_imag, Round); | ||||||
|                             //_phase_real = vmulq_f32(_phase_real, Round); |                             // _phase_real = vmulq_f32(_phase_real, Round); | ||||||
|                             //_phase_imag = vmulq_f32(_phase_imag, Round); |                             // _phase_imag = vmulq_f32(_phase_imag, Round); | ||||||
|                             //printf("After  %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0])); |                             // printf("After  %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0])); | ||||||
|                         } |                         } | ||||||
|  |  | ||||||
|                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |                     for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
| @@ -1671,7 +1671,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s | |||||||
|  |  | ||||||
|     for (n = neon_iters * 4; n < num_points; n++) |     for (n = neon_iters * 4; n < num_points; n++) | ||||||
|         { |         { | ||||||
|             tmp16_ = in_common[n];  //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); |             tmp16_ = in_common[n];  // printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); | ||||||
|             tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); |             tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); | ||||||
|             tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); |             tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); | ||||||
|             (*phase) *= phase_inc; |             (*phase) *= phase_inc; | ||||||
| @@ -1804,9 +1804,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ | |||||||
|                     // Regenerate phase |                     // Regenerate phase | ||||||
|                     if ((number % 256) == 0) |                     if ((number % 256) == 0) | ||||||
|                         { |                         { | ||||||
|                             //printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); |                             // printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); | ||||||
|                             phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; |                             phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; | ||||||
|                             //printf("Estimated phase: %f\n\n", cos(phase_est)); |                             // printf("Estimated phase: %f\n\n", cos(phase_est)); | ||||||
|  |  | ||||||
|                             *phase = lv_cmake(cos(phase_est), sin(phase_est)); |                             *phase = lv_cmake(cos(phase_est), sin(phase_est)); | ||||||
|                             phase2 = (lv_32fc_t)(*phase) * phase_inc; |                             phase2 = (lv_32fc_t)(*phase) * phase_inc; | ||||||
| @@ -1860,7 +1860,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ | |||||||
|  |  | ||||||
|     for (n = neon_iters * 4; n < num_points; n++) |     for (n = neon_iters * 4; n < num_points; n++) | ||||||
|         { |         { | ||||||
|             tmp16_ = in_common[n];  //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); |             tmp16_ = in_common[n];  // printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); | ||||||
|             tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); |             tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); | ||||||
|             tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); |             tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); | ||||||
|             (*phase) *= phase_inc; |             (*phase) *= phase_inc; | ||||||
| @@ -1874,4 +1874,4 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ | |||||||
|  |  | ||||||
| #endif /* LV_HAVE_NEONV7 */ | #endif /* LV_HAVE_NEONV7 */ | ||||||
|  |  | ||||||
| #endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_H*/ | #endif /* INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_H */ | ||||||
|   | |||||||
| @@ -82,7 +82,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(lv_16sc_t** re | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); |                     if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); | ||||||
|                     local_code_chip_index = local_code_chip_index % code_length_chips; |                     local_code_chip_index = local_code_chip_index % code_length_chips; | ||||||
|                     result[current_correlator_tap][n] = local_code[local_code_chip_index]; |                     result[current_correlator_tap][n] = local_code[local_code_chip_index]; | ||||||
| @@ -90,7 +90,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(lv_16sc_t** re | |||||||
|         } |         } | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_GENERIC*/ | #endif /* LV_HAVE_GENERIC */ | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE4_1 | #ifdef LV_HAVE_SSE4_1 | ||||||
| @@ -149,7 +149,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -216,7 +216,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -287,7 +287,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -358,7 +358,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -436,7 +436,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -514,7 +514,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -567,7 +567,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul | |||||||
|                     aux = vmulq_f32(code_phase_step_chips_reg, indexn); |                     aux = vmulq_f32(code_phase_step_chips_reg, indexn); | ||||||
|                     aux = vaddq_f32(aux, aux2); |                     aux = vaddq_f32(aux, aux2); | ||||||
|  |  | ||||||
|                     //floor |                     // floor | ||||||
|                     i = vcvtq_s32_f32(aux); |                     i = vcvtq_s32_f32(aux); | ||||||
|                     fi = vcvtq_f32_s32(i); |                     fi = vcvtq_f32_s32(i); | ||||||
|                     igx = vcgtq_f32(fi, aux); |                     igx = vcgtq_f32(fi, aux); | ||||||
| @@ -599,7 +599,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul | |||||||
|                     __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); |                     __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -611,4 +611,4 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul | |||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H*/ | #endif /* INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H */ | ||||||
|   | |||||||
| @@ -73,7 +73,7 @@ | |||||||
| static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) | static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) | ||||||
| { | { | ||||||
|     int local_code_chip_index; |     int local_code_chip_index; | ||||||
|     //fesetround(FE_TONEAREST); |     // fesetround(FE_TONEAREST); | ||||||
|     int current_vector; |     int current_vector; | ||||||
|     unsigned int n; |     unsigned int n; | ||||||
|     for (current_vector = 0; current_vector < num_out_vectors; current_vector++) |     for (current_vector = 0; current_vector < num_out_vectors; current_vector++) | ||||||
| @@ -89,7 +89,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t | |||||||
|         } |         } | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_GENERIC*/ | #endif /* LV_HAVE_GENERIC */ | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE2 | #ifdef LV_HAVE_SSE2 | ||||||
| @@ -97,7 +97,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t | |||||||
|  |  | ||||||
| static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) | static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) | ||||||
| { | { | ||||||
|     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);  //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO |     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);  // _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO | ||||||
|     unsigned int number; |     unsigned int number; | ||||||
|     const unsigned int quarterPoints = num_output_samples / 4; |     const unsigned int quarterPoints = num_output_samples / 4; | ||||||
|  |  | ||||||
| @@ -109,7 +109,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t* | |||||||
|     __m128i _code_length_chips, _code_length_chips_minus1; |     __m128i _code_length_chips, _code_length_chips_minus1; | ||||||
|     __m128 _code_phase_out, _code_phase_out_with_offset; |     __m128 _code_phase_out, _code_phase_out_with_offset; | ||||||
|  |  | ||||||
|     _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips);  //load float to all four float values in m128 register |     _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips);  // load float to all four float values in m128 register | ||||||
|     __VOLK_ATTR_ALIGNED(16) |     __VOLK_ATTR_ALIGNED(16) | ||||||
|     int four_times_code_length_chips_minus1[4]; |     int four_times_code_length_chips_minus1[4]; | ||||||
|     four_times_code_length_chips_minus1[0] = code_length_chips - 1; |     four_times_code_length_chips_minus1[0] = code_length_chips - 1; | ||||||
| @@ -124,8 +124,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t* | |||||||
|     four_times_code_length_chips[2] = code_length_chips; |     four_times_code_length_chips[2] = code_length_chips; | ||||||
|     four_times_code_length_chips[3] = code_length_chips; |     four_times_code_length_chips[3] = code_length_chips; | ||||||
|  |  | ||||||
|     _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips);                //load float to all four float values in m128 register |     _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips);                // load float to all four float values in m128 register | ||||||
|     _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1);  //load float to all four float values in m128 register |     _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1);  // load float to all four float values in m128 register | ||||||
|  |  | ||||||
|     __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; |     __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; | ||||||
|  |  | ||||||
| @@ -142,29 +142,29 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t* | |||||||
|     int sample_idx = 0; |     int sample_idx = 0; | ||||||
|     for (number = 0; number < quarterPoints; number++) |     for (number = 0; number < quarterPoints; number++) | ||||||
|         { |         { | ||||||
|             //common to all outputs |             // common to all outputs | ||||||
|             _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index);  //compute the code phase point with the phase step |             _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index);  // compute the code phase point with the phase step | ||||||
|  |  | ||||||
|             //output vector dependent (different code phase offset) |             // output vector dependent (different code phase offset) | ||||||
|             for (current_vector = 0; current_vector < num_out_vectors; current_vector++) |             for (current_vector = 0; current_vector < num_out_vectors; current_vector++) | ||||||
|                 { |                 { | ||||||
|                     tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f;  // adjust offset to perform correct rounding (chip transition at 0) |                     tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f;  // adjust offset to perform correct rounding (chip transition at 0) | ||||||
|                     _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips);               //load float to all four float values in m128 register |                     _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips);               // load float to all four float values in m128 register | ||||||
|  |  | ||||||
|                     _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase);  //add the phase offset |                     _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase);  // add the phase offset | ||||||
|                     _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset);          //convert to integer |                     _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset);          // convert to integer | ||||||
|  |  | ||||||
|                     negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero);                     //test for negative values |                     negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero);                     // test for negative values | ||||||
|                     _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips);  //the negative values branch |                     _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips);  // the negative values branch | ||||||
|                     _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); |                     _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); | ||||||
|  |  | ||||||
|                     overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1);  //test for overflow values |                     overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1);  // test for overflow values | ||||||
|                     _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips);   //the negative values branch |                     _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips);   // the negative values branch | ||||||
|                     _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); |                     _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); | ||||||
|  |  | ||||||
|                     _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back |                     _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back | ||||||
|  |  | ||||||
|                     //todo: optimize the local code lookup table with intrinsics, if possible |                     // todo: optimize the local code lookup table with intrinsics, if possible | ||||||
|                     _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; |                     _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; | ||||||
|                     _result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]]; |                     _result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]]; | ||||||
|                     _result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]]; |                     _result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]]; | ||||||
| @@ -193,7 +193,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t* | |||||||
|  |  | ||||||
| static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) | static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) | ||||||
| { | { | ||||||
|     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);  //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO |     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);  // _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO | ||||||
|     unsigned int number; |     unsigned int number; | ||||||
|     const unsigned int quarterPoints = num_output_samples / 4; |     const unsigned int quarterPoints = num_output_samples / 4; | ||||||
|  |  | ||||||
| @@ -205,7 +205,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t* | |||||||
|     __m128i _code_length_chips, _code_length_chips_minus1; |     __m128i _code_length_chips, _code_length_chips_minus1; | ||||||
|     __m128 _code_phase_out, _code_phase_out_with_offset; |     __m128 _code_phase_out, _code_phase_out_with_offset; | ||||||
|  |  | ||||||
|     _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips);  //load float to all four float values in m128 register |     _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips);  // load float to all four float values in m128 register | ||||||
|     __VOLK_ATTR_ALIGNED(16) |     __VOLK_ATTR_ALIGNED(16) | ||||||
|     int four_times_code_length_chips_minus1[4]; |     int four_times_code_length_chips_minus1[4]; | ||||||
|     four_times_code_length_chips_minus1[0] = code_length_chips - 1; |     four_times_code_length_chips_minus1[0] = code_length_chips - 1; | ||||||
| @@ -220,8 +220,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t* | |||||||
|     four_times_code_length_chips[2] = code_length_chips; |     four_times_code_length_chips[2] = code_length_chips; | ||||||
|     four_times_code_length_chips[3] = code_length_chips; |     four_times_code_length_chips[3] = code_length_chips; | ||||||
|  |  | ||||||
|     _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips);                //load float to all four float values in m128 register |     _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips);                // load float to all four float values in m128 register | ||||||
|     _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1);  //load float to all four float values in m128 register |     _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1);  // load float to all four float values in m128 register | ||||||
|  |  | ||||||
|     __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; |     __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; | ||||||
|  |  | ||||||
| @@ -238,29 +238,29 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t* | |||||||
|     int sample_idx = 0; |     int sample_idx = 0; | ||||||
|     for (number = 0; number < quarterPoints; number++) |     for (number = 0; number < quarterPoints; number++) | ||||||
|         { |         { | ||||||
|             //common to all outputs |             // common to all outputs | ||||||
|             _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index);  //compute the code phase point with the phase step |             _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index);  // compute the code phase point with the phase step | ||||||
|  |  | ||||||
|             //output vector dependent (different code phase offset) |             // output vector dependent (different code phase offset) | ||||||
|             for (current_vector = 0; current_vector < num_out_vectors; current_vector++) |             for (current_vector = 0; current_vector < num_out_vectors; current_vector++) | ||||||
|                 { |                 { | ||||||
|                     tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f;  // adjust offset to perform correct rounding (chip transition at 0) |                     tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f;  // adjust offset to perform correct rounding (chip transition at 0) | ||||||
|                     _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips);               //load float to all four float values in m128 register |                     _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips);               // load float to all four float values in m128 register | ||||||
|  |  | ||||||
|                     _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase);  //add the phase offset |                     _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase);  // add the phase offset | ||||||
|                     _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset);          //convert to integer |                     _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset);          // convert to integer | ||||||
|  |  | ||||||
|                     negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero);                     //test for negative values |                     negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero);                     // test for negative values | ||||||
|                     _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips);  //the negative values branch |                     _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips);  // the negative values branch | ||||||
|                     _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); |                     _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); | ||||||
|  |  | ||||||
|                     overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1);  //test for overflow values |                     overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1);  // test for overflow values | ||||||
|                     _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips);   //the negative values branch |                     _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips);   // the negative values branch | ||||||
|                     _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); |                     _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); | ||||||
|  |  | ||||||
|                     _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back |                     _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back | ||||||
|  |  | ||||||
|                     //todo: optimize the local code lookup table with intrinsics, if possible |                     // todo: optimize the local code lookup table with intrinsics, if possible | ||||||
|                     _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; |                     _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; | ||||||
|                     _result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]]; |                     _result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]]; | ||||||
|                     _result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]]; |                     _result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]]; | ||||||
| @@ -303,7 +303,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** | |||||||
|     float32x4_t _code_phase_out, _code_phase_out_with_offset; |     float32x4_t _code_phase_out, _code_phase_out_with_offset; | ||||||
|     float32x4_t sign, PlusHalf, Round; |     float32x4_t sign, PlusHalf, Round; | ||||||
|  |  | ||||||
|     _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips);  //load float to all four float values in float32x4_t register |     _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips);  // load float to all four float values in float32x4_t register | ||||||
|     __VOLK_ATTR_ALIGNED(16) |     __VOLK_ATTR_ALIGNED(16) | ||||||
|     int four_times_code_length_chips_minus1[4]; |     int four_times_code_length_chips_minus1[4]; | ||||||
|     four_times_code_length_chips_minus1[0] = code_length_chips - 1; |     four_times_code_length_chips_minus1[0] = code_length_chips - 1; | ||||||
| @@ -318,8 +318,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** | |||||||
|     four_times_code_length_chips[2] = code_length_chips; |     four_times_code_length_chips[2] = code_length_chips; | ||||||
|     four_times_code_length_chips[3] = code_length_chips; |     four_times_code_length_chips[3] = code_length_chips; | ||||||
|  |  | ||||||
|     _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips);                //load four 32-bit integers into an int32x4_t register |     _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips);                // load four 32-bit integers into an int32x4_t register | ||||||
|     _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1);  //load four 32-bit integers into an int32x4_t register |     _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1);  // load four 32-bit integers into an int32x4_t register | ||||||
|  |  | ||||||
|     int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; |     int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; | ||||||
|     uint32x4_t negative_indexes, overflow_indexes; |     uint32x4_t negative_indexes, overflow_indexes; | ||||||
| @@ -336,33 +336,33 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** | |||||||
|     int sample_idx = 0; |     int sample_idx = 0; | ||||||
|     for (number = 0; number < quarterPoints; number++) |     for (number = 0; number < quarterPoints; number++) | ||||||
|         { |         { | ||||||
|             //common to all outputs |             // common to all outputs | ||||||
|             _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index);  //compute the code phase point with the phase step |             _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index);  // compute the code phase point with the phase step | ||||||
|  |  | ||||||
|             //output vector dependent (different code phase offset) |             // output vector dependent (different code phase offset) | ||||||
|             for (current_vector = 0; current_vector < num_out_vectors; current_vector++) |             for (current_vector = 0; current_vector < num_out_vectors; current_vector++) | ||||||
|                 { |                 { | ||||||
|                     tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f;  // adjust offset to perform correct rounding (chip transition at 0) |                     tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f;  // adjust offset to perform correct rounding (chip transition at 0) | ||||||
|                     _rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips);              //load float to all four float values in float32x4_t register |                     _rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips);              // load float to all four float values in float32x4_t register | ||||||
|  |  | ||||||
|                     _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase);  //add the phase offset |                     _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase);  // add the phase offset | ||||||
|                     //_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer |                     // _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); // convert to integer | ||||||
|                     sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31))); |                     sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31))); | ||||||
|                     PlusHalf = vaddq_f32(_code_phase_out_with_offset, half); |                     PlusHalf = vaddq_f32(_code_phase_out_with_offset, half); | ||||||
|                     Round = vsubq_f32(PlusHalf, sign); |                     Round = vsubq_f32(PlusHalf, sign); | ||||||
|                     _code_phase_out_int = vcvtq_s32_f32(Round); |                     _code_phase_out_int = vcvtq_s32_f32(Round); | ||||||
|  |  | ||||||
|                     negative_indexes = vcltq_s32(_code_phase_out_int, zero);                       //test for negative values |                     negative_indexes = vcltq_s32(_code_phase_out_int, zero);                       // test for negative values | ||||||
|                     _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips);  //the negative values branch |                     _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips);  // the negative values branch | ||||||
|                     _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int))); |                     _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int))); | ||||||
|  |  | ||||||
|                     overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1);   //test for overflow values |                     overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1);   // test for overflow values | ||||||
|                     _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips);  //the overflow values branch |                     _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips);  // the overflow values branch | ||||||
|                     _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg))); |                     _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg))); | ||||||
|  |  | ||||||
|                     vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back |                     vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back | ||||||
|  |  | ||||||
|                     //todo: optimize the local code lookup table with intrinsics, if possible |                     // todo: optimize the local code lookup table with intrinsics, if possible | ||||||
|                     _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; |                     _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; | ||||||
|                     _result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]]; |                     _result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]]; | ||||||
|                     _result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]]; |                     _result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]]; | ||||||
| @@ -386,4 +386,4 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** | |||||||
|  |  | ||||||
| #endif /* LV_HAVE_NEONV7 */ | #endif /* LV_HAVE_NEONV7 */ | ||||||
|  |  | ||||||
| #endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_H*/ | #endif /* INCLUDED_volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_H */ | ||||||
|   | |||||||
| @@ -256,33 +256,6 @@ static inline void volk_gnsssdr_32f_high_dynamics_resamplerxnpuppet_32f_u_avx(fl | |||||||
|     volk_gnsssdr_free(result_aux); |     volk_gnsssdr_free(result_aux); | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
| // |  | ||||||
| //#ifdef LV_HAVE_NEONV7 |  | ||||||
| //static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_neon(float* result, const float* local_code, unsigned int num_points) |  | ||||||
| //{ |  | ||||||
| //    int code_length_chips = 2046; |  | ||||||
| //    float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); |  | ||||||
| //    int num_out_vectors = 3; |  | ||||||
| //    float rem_code_phase_chips = -0.234; |  | ||||||
| //    int n; |  | ||||||
| //    float shifts_chips[3] = {-0.1, 0.0, 0.1}; |  | ||||||
| // |  | ||||||
| //    float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); |  | ||||||
| //    for (n = 0; n < num_out_vectors; n++) |  | ||||||
| //        { |  | ||||||
| //            result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); |  | ||||||
| //        } |  | ||||||
| // |  | ||||||
| //    volk_gnsssdr_32f_xn_resampler_32f_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); |  | ||||||
| // |  | ||||||
| //    memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); |  | ||||||
| // |  | ||||||
| //    for (n = 0; n < num_out_vectors; n++) |  | ||||||
| //        { |  | ||||||
| //            volk_gnsssdr_free(result_aux[n]); |  | ||||||
| //        } |  | ||||||
| //    volk_gnsssdr_free(result_aux); |  | ||||||
| //} |  | ||||||
| //#endif |  | ||||||
|  |  | ||||||
| #endif  // INCLUDED_volk_gnsssdr_32f_high_dynamics_resamplerpuppet_32f_H | #endif  // INCLUDED_volk_gnsssdr_32f_high_dynamics_resamplerpuppet_32f_H | ||||||
|   | |||||||
| @@ -52,6 +52,8 @@ | |||||||
|  * \b Outputs |  * \b Outputs | ||||||
|  * \li out:            Vector of the form lv_32fc_t out[n] = lv_cmake(cos(in[n]), sin(in[n])) |  * \li out:            Vector of the form lv_32fc_t out[n] = lv_cmake(cos(in[n]), sin(in[n])) | ||||||
|  * |  * | ||||||
|  |  * Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier | ||||||
|  |  * Based on algorithms from the cephes library http://www.netlib.org/cephes/ | ||||||
|  */ |  */ | ||||||
|  |  | ||||||
| #ifndef INCLUDED_volk_gnsssdr_32f_sincos_32fc_H | #ifndef INCLUDED_volk_gnsssdr_32f_sincos_32fc_H | ||||||
| @@ -251,8 +253,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f | |||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE2 | #ifdef LV_HAVE_SSE2 | ||||||
| #include <emmintrin.h> | #include <emmintrin.h> | ||||||
| /* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier  */ |  | ||||||
| /* Based on algorithms from the cephes library http://www.netlib.org/cephes/   */ |  | ||||||
| static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const float* in, unsigned int num_points) | static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const float* in, unsigned int num_points) | ||||||
| { | { | ||||||
|     lv_32fc_t* bPtr = out; |     lv_32fc_t* bPtr = out; | ||||||
| @@ -421,8 +422,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo | |||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE2 | #ifdef LV_HAVE_SSE2 | ||||||
| #include <emmintrin.h> | #include <emmintrin.h> | ||||||
| /* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier  */ |  | ||||||
| /* Based on algorithms from the cephes library http://www.netlib.org/cephes/   */ |  | ||||||
| static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const float* in, unsigned int num_points) | static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const float* in, unsigned int num_points) | ||||||
| { | { | ||||||
|     lv_32fc_t* bPtr = out; |     lv_32fc_t* bPtr = out; | ||||||
| @@ -644,8 +644,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con | |||||||
|  |  | ||||||
| #ifdef LV_HAVE_NEONV7 | #ifdef LV_HAVE_NEONV7 | ||||||
| #include <arm_neon.h> | #include <arm_neon.h> | ||||||
| /* Adapted from http://gruntthepeon.free.fr/ssemath/neon_mathfun.h, original code from Julien Pommier  */ |  | ||||||
| /* Based on algorithms from the cephes library http://www.netlib.org/cephes/   */ |  | ||||||
| static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float* in, unsigned int num_points) | static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float* in, unsigned int num_points) | ||||||
| { | { | ||||||
|     lv_32fc_t* bPtr = out; |     lv_32fc_t* bPtr = out; | ||||||
|   | |||||||
| @@ -83,7 +83,7 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_generic(fl | |||||||
|     int local_code_chip_index; |     int local_code_chip_index; | ||||||
|     int current_correlator_tap; |     int current_correlator_tap; | ||||||
|     unsigned int n; |     unsigned int n; | ||||||
|     //first correlator |     // first correlator | ||||||
|     for (n = 0; n < num_points; n++) |     for (n = 0; n < num_points; n++) | ||||||
|         { |         { | ||||||
|             // resample code for first tap |             // resample code for first tap | ||||||
| @@ -94,7 +94,7 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_generic(fl | |||||||
|             result[0][n] = local_code[local_code_chip_index]; |             result[0][n] = local_code[local_code_chip_index]; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     //adjacent correlators |     // adjacent correlators | ||||||
|     unsigned int shift_samples = 0; |     unsigned int shift_samples = 0; | ||||||
|     for (current_correlator_tap = 1; current_correlator_tap < num_out_vectors; current_correlator_tap++) |     for (current_correlator_tap = 1; current_correlator_tap < num_out_vectors; current_correlator_tap++) | ||||||
|         { |         { | ||||||
| @@ -618,11 +618,11 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_u_avx(floa | |||||||
| #endif | #endif | ||||||
| // | // | ||||||
| // | // | ||||||
| //#ifdef LV_HAVE_NEONV7 | // #ifdef LV_HAVE_NEONV7 | ||||||
| //#include <arm_neon.h> | // #include <arm_neon.h> | ||||||
| // | // | ||||||
| //static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_neon(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) | // static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_neon(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) | ||||||
| //{ | // { | ||||||
| //    float** _result = result; | //    float** _result = result; | ||||||
| //    const unsigned int neon_iters = num_points / 4; | //    const unsigned int neon_iters = num_points / 4; | ||||||
| //    int current_correlator_tap; | //    int current_correlator_tap; | ||||||
| @@ -662,7 +662,7 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_u_avx(floa | |||||||
| //                    aux = vmulq_f32(code_phase_step_chips_reg, indexn); | //                    aux = vmulq_f32(code_phase_step_chips_reg, indexn); | ||||||
| //                    aux = vaddq_f32(aux, aux2); | //                    aux = vaddq_f32(aux, aux2); | ||||||
| // | // | ||||||
| //                    //floor | //                    // floor | ||||||
| //                    i = vcvtq_s32_f32(aux); | //                    i = vcvtq_s32_f32(aux); | ||||||
| //                    fi = vcvtq_f32_s32(i); | //                    fi = vcvtq_f32_s32(i); | ||||||
| //                    igx = vcgtq_f32(fi, aux); | //                    igx = vcgtq_f32(fi, aux); | ||||||
| @@ -700,8 +700,8 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_u_avx(floa | |||||||
| //                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | //                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| //                } | //                } | ||||||
| //        } | //        } | ||||||
| //} | // } | ||||||
| // | // | ||||||
| //#endif | // #endif | ||||||
|  |  | ||||||
| #endif /*INCLUDED_volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_H*/ | #endif /* INCLUDED_volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_H */ | ||||||
|   | |||||||
| @@ -85,7 +85,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_generic(float** result, | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); |                     if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); | ||||||
|                     local_code_chip_index = local_code_chip_index % code_length_chips; |                     local_code_chip_index = local_code_chip_index % code_length_chips; | ||||||
|                     result[current_correlator_tap][n] = local_code[local_code_chip_index]; |                     result[current_correlator_tap][n] = local_code[local_code_chip_index]; | ||||||
| @@ -93,7 +93,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_generic(float** result, | |||||||
|         } |         } | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_GENERIC*/ | #endif /* LV_HAVE_GENERIC */ | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE3 | #ifdef LV_HAVE_SSE3 | ||||||
| @@ -156,7 +156,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -227,7 +227,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -293,7 +293,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result, | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -360,7 +360,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result, | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -438,7 +438,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -516,7 +516,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -571,7 +571,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con | |||||||
|                     aux = vmulq_f32(code_phase_step_chips_reg, indexn); |                     aux = vmulq_f32(code_phase_step_chips_reg, indexn); | ||||||
|                     aux = vaddq_f32(aux, aux2); |                     aux = vaddq_f32(aux, aux2); | ||||||
|  |  | ||||||
|                     //floor |                     // floor | ||||||
|                     i = vcvtq_s32_f32(aux); |                     i = vcvtq_s32_f32(aux); | ||||||
|                     fi = vcvtq_f32_s32(i); |                     fi = vcvtq_f32_s32(i); | ||||||
|                     igx = vcgtq_f32(fi, aux); |                     igx = vcgtq_f32(fi, aux); | ||||||
| @@ -603,7 +603,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con | |||||||
|                     __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); |                     __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
|   | |||||||
| @@ -74,7 +74,7 @@ | |||||||
| #include <volk_gnsssdr/volk_gnsssdr_complex.h> | #include <volk_gnsssdr/volk_gnsssdr_complex.h> | ||||||
| #include <volk_gnsssdr/volk_gnsssdr_malloc.h> | #include <volk_gnsssdr/volk_gnsssdr_malloc.h> | ||||||
| #include <math.h> | #include <math.h> | ||||||
| //#include <stdio.h> | // #include <stdio.h> | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC | #ifdef LV_HAVE_GENERIC | ||||||
|  |  | ||||||
| @@ -89,18 +89,18 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f | |||||||
|         } |         } | ||||||
|     for (n = 0; n < num_points; n++) |     for (n = 0; n < num_points; n++) | ||||||
|         { |         { | ||||||
|             tmp32_1 = *in_common++ * (*phase);  //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); |             tmp32_1 = *in_common++ * (*phase);  // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); | ||||||
|  |  | ||||||
|             // Regenerate phase |             // Regenerate phase | ||||||
|             if (n % 256 == 0) |             if (n % 256 == 0) | ||||||
|                 { |                 { | ||||||
|                     //printf("Phase before regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); |                     // printf("Phase before regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
|                     (*phase) /= std::abs((*phase)); |                     (*phase) /= std::abs((*phase)); | ||||||
| #else | #else | ||||||
|                     (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); |                     (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); | ||||||
| #endif | #endif | ||||||
|                     //printf("Phase after regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); |                     // printf("Phase after regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|             (*phase) *= phase_inc; |             (*phase) *= phase_inc; | ||||||
| @@ -112,7 +112,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f | |||||||
|         } |         } | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_GENERIC*/ | #endif /* LV_HAVE_GENERIC */ | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC | #ifdef LV_HAVE_GENERIC | ||||||
| @@ -145,7 +145,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload | |||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
|             (*phase) /= std::abs((*phase)); |             (*phase) /= std::abs((*phase)); | ||||||
| #else | #else | ||||||
|             //(*phase) /= cabsf((*phase)); |             // (*phase) /= cabsf((*phase)); | ||||||
|             (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); |             (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); | ||||||
| #endif | #endif | ||||||
|         } |         } | ||||||
| @@ -162,7 +162,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload | |||||||
|         } |         } | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_GENERIC*/ | #endif /* LV_HAVE_GENERIC */ | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_AVX | #ifdef LV_HAVE_AVX | ||||||
| #include <volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h> | #include <volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h> | ||||||
|   | |||||||
| @@ -179,7 +179,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector | |||||||
|     int16_t* outputVectorPtr = (int16_t*)outputVector; |     int16_t* outputVectorPtr = (int16_t*)outputVector; | ||||||
|     float aux; |     float aux; | ||||||
|     unsigned int i; |     unsigned int i; | ||||||
|     const float min_val = (float)SHRT_MIN;  ///todo Something off here, compiler does not perform right cast |     const float min_val = (float)SHRT_MIN;  /// todo Something off here, compiler does not perform right cast | ||||||
|     const float max_val = (float)SHRT_MAX; |     const float max_val = (float)SHRT_MAX; | ||||||
|  |  | ||||||
|     __m256 inputVal1, inputVal2; |     __m256 inputVal1, inputVal2; | ||||||
| @@ -342,7 +342,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector | |||||||
|     int16_t* outputVectorPtr = (int16_t*)outputVector; |     int16_t* outputVectorPtr = (int16_t*)outputVector; | ||||||
|     float aux; |     float aux; | ||||||
|     unsigned int i; |     unsigned int i; | ||||||
|     const float min_val = (float)SHRT_MIN;  ///todo Something off here, compiler does not perform right cast |     const float min_val = (float)SHRT_MIN;  /// todo Something off here, compiler does not perform right cast | ||||||
|     const float max_val = (float)SHRT_MAX; |     const float max_val = (float)SHRT_MAX; | ||||||
|  |  | ||||||
|     __m256 inputVal1, inputVal2; |     __m256 inputVal1, inputVal2; | ||||||
|   | |||||||
| @@ -89,18 +89,18 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic(lv_32fc | |||||||
|         } |         } | ||||||
|     for (n = 0; n < num_points; n++) |     for (n = 0; n < num_points; n++) | ||||||
|         { |         { | ||||||
|             tmp32_1 = *in_common++ * (*phase);  //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); |             tmp32_1 = *in_common++ * (*phase);  // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); | ||||||
|  |  | ||||||
|             // Regenerate phase |             // Regenerate phase | ||||||
|             if (n % 256 == 0) |             if (n % 256 == 0) | ||||||
|                 { |                 { | ||||||
|                     //printf("Phase before regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); |                     // printf("Phase before regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
|                     (*phase) /= std::abs((*phase)); |                     (*phase) /= std::abs((*phase)); | ||||||
| #else | #else | ||||||
|                     (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); |                     (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); | ||||||
| #endif | #endif | ||||||
|                     //printf("Phase after regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); |                     // printf("Phase after regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|             (*phase) *= phase_inc; |             (*phase) *= phase_inc; | ||||||
| @@ -145,7 +145,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload( | |||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
|             (*phase) /= std::abs((*phase)); |             (*phase) /= std::abs((*phase)); | ||||||
| #else | #else | ||||||
|             //(*phase) /= cabsf((*phase)); |             // (*phase) /= cabsf((*phase)); | ||||||
|             (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); |             (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); | ||||||
| #endif | #endif | ||||||
|         } |         } | ||||||
| @@ -225,7 +225,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ | |||||||
|             yl = _mm_moveldup_ps(z1);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(z1);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(z1); |             yh = _mm_movehdup_ps(z1); | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in_common += 2; |             _in_common += 2; | ||||||
|  |  | ||||||
|             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
| @@ -337,7 +337,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ | |||||||
|             yl = _mm_moveldup_ps(z1);  // Load yl with cr,cr,dr,dr |             yl = _mm_moveldup_ps(z1);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm_movehdup_ps(z1); |             yh = _mm_movehdup_ps(z1); | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in_common += 2; |             _in_common += 2; | ||||||
|  |  | ||||||
|             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
| @@ -457,7 +457,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t | |||||||
|             yl = _mm256_moveldup_ps(z);  // Load yl with cr,cr,dr,dr |             yl = _mm256_moveldup_ps(z);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm256_movehdup_ps(z); |             yh = _mm256_movehdup_ps(z); | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in_common += 4; |             _in_common += 4; | ||||||
|  |  | ||||||
|             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
| @@ -587,7 +587,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t | |||||||
|             yl = _mm256_moveldup_ps(z);  // Load yl with cr,cr,dr,dr |             yl = _mm256_moveldup_ps(z);  // Load yl with cr,cr,dr,dr | ||||||
|             yh = _mm256_movehdup_ps(z); |             yh = _mm256_movehdup_ps(z); | ||||||
|  |  | ||||||
|             //next two samples |             // next two samples | ||||||
|             _in_common += 4; |             _in_common += 4; | ||||||
|  |  | ||||||
|             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) |             for (n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|   | |||||||
| @@ -82,7 +82,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(lv_32fc_t** re | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); |                     if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); | ||||||
|                     local_code_chip_index = local_code_chip_index % code_length_chips; |                     local_code_chip_index = local_code_chip_index % code_length_chips; | ||||||
|                     result[current_correlator_tap][n] = local_code[local_code_chip_index]; |                     result[current_correlator_tap][n] = local_code[local_code_chip_index]; | ||||||
| @@ -90,7 +90,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(lv_32fc_t** re | |||||||
|         } |         } | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_GENERIC*/ | #endif /* LV_HAVE_GENERIC */ | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE3 | #ifdef LV_HAVE_SSE3 | ||||||
| @@ -153,7 +153,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -224,7 +224,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -290,7 +290,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -357,7 +357,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -435,7 +435,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -513,7 +513,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -558,13 +558,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res | |||||||
|                     __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); |                     __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); | ||||||
|                     __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); |                     __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); | ||||||
|                     aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2); |                     aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2); | ||||||
|                     //aux = _mm256_add_ps(aux, aux2); |                     // aux = _mm256_add_ps(aux, aux2); | ||||||
|                     // floor |                     // floor | ||||||
|                     aux = _mm256_floor_ps(aux); |                     aux = _mm256_floor_ps(aux); | ||||||
|  |  | ||||||
|                     // fmod |                     // fmod | ||||||
|                     c = _mm256_div_ps(aux, code_length_chips_reg_f); |                     c = _mm256_div_ps(aux, code_length_chips_reg_f); | ||||||
|                     //_mm_fmsub_ps(c, code_length_chips_reg_f, aux) |                     // _mm_fmsub_ps(c, code_length_chips_reg_f, aux) | ||||||
|                     i = _mm256_cvttps_epi32(c); |                     i = _mm256_cvttps_epi32(c); | ||||||
|                     cTrunc = _mm256_cvtepi32_ps(i); |                     cTrunc = _mm256_cvtepi32_ps(i); | ||||||
|                     base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux); |                     base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux); | ||||||
| @@ -592,7 +592,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -637,8 +637,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res | |||||||
|                     __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); |                     __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); | ||||||
|                     __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); |                     __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); | ||||||
|                     aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2); |                     aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2); | ||||||
|                     //aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn); |                     // aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn); | ||||||
|                     //aux = _mm256_add_ps(aux, aux2); |                     // aux = _mm256_add_ps(aux, aux2); | ||||||
|                     // floor |                     // floor | ||||||
|                     aux = _mm256_floor_ps(aux); |                     aux = _mm256_floor_ps(aux); | ||||||
|  |  | ||||||
| @@ -671,7 +671,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res | |||||||
|                 { |                 { | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -726,7 +726,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul | |||||||
|                     aux = vmulq_f32(code_phase_step_chips_reg, indexn); |                     aux = vmulq_f32(code_phase_step_chips_reg, indexn); | ||||||
|                     aux = vaddq_f32(aux, aux2); |                     aux = vaddq_f32(aux, aux2); | ||||||
|  |  | ||||||
|                     //floor |                     // floor | ||||||
|                     i = vcvtq_s32_f32(aux); |                     i = vcvtq_s32_f32(aux); | ||||||
|                     fi = vcvtq_f32_s32(i); |                     fi = vcvtq_f32_s32(i); | ||||||
|                     igx = vcgtq_f32(fi, aux); |                     igx = vcgtq_f32(fi, aux); | ||||||
| @@ -758,7 +758,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul | |||||||
|                     __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); |                     __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); | ||||||
|                     // resample code for current tap |                     // resample code for current tap | ||||||
|                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); |                     local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); | ||||||
|                     //Take into account that in multitap correlators, the shifts can be negative! |                     // Take into account that in multitap correlators, the shifts can be negative! | ||||||
|                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); |                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); | ||||||
|                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; |                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips; | ||||||
|                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; |                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; | ||||||
| @@ -769,4 +769,4 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul | |||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif /*INCLUDED_volk_gnsssdr_32fc_xn_resampler_32fc_xn_H*/ | #endif /* INCLUDED_volk_gnsssdr_32fc_xn_resampler_32fc_xn_H */ | ||||||
|   | |||||||
| @@ -121,7 +121,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co | |||||||
|         } |         } | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_AVX2*/ | #endif /* LV_HAVE_AVX2 */ | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_AVX | #ifdef LV_HAVE_AVX | ||||||
| @@ -156,7 +156,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con | |||||||
|                     compareResultslo = _mm_cmpgt_epi8(maxValues, lo); |                     compareResultslo = _mm_cmpgt_epi8(maxValues, lo); | ||||||
|                     compareResultshi = _mm_cmpgt_epi8(maxValues, hi); |                     compareResultshi = _mm_cmpgt_epi8(maxValues, hi); | ||||||
|  |  | ||||||
|                     //compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h |                     // compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h | ||||||
|                     compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1); |                     compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1); | ||||||
|  |  | ||||||
|                     if (!_mm256_testc_si256(compareResults, ones)) |                     if (!_mm256_testc_si256(compareResults, ones)) | ||||||
| @@ -437,7 +437,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con | |||||||
|                     compareResultslo = _mm_cmpgt_epi8(maxValues, lo); |                     compareResultslo = _mm_cmpgt_epi8(maxValues, lo); | ||||||
|                     compareResultshi = _mm_cmpgt_epi8(maxValues, hi); |                     compareResultshi = _mm_cmpgt_epi8(maxValues, hi); | ||||||
|  |  | ||||||
|                     //compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h |                     // compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h | ||||||
|                     compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1); |                     compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1); | ||||||
|  |  | ||||||
|                     if (!_mm256_testc_si256(compareResults, ones)) |                     if (!_mm256_testc_si256(compareResults, ones)) | ||||||
|   | |||||||
| @@ -313,7 +313,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0 | |||||||
|                 { |                 { | ||||||
|                     currentValues = _mm256_load_si256((__m256i*)inputPtr); |                     currentValues = _mm256_load_si256((__m256i*)inputPtr); | ||||||
|                     compareResults = _mm256_max_epi8(maxValues, currentValues); |                     compareResults = _mm256_max_epi8(maxValues, currentValues); | ||||||
|                     maxValues = compareResults;  //_mm256_blendv_epi8(currentValues, maxValues, compareResults); |                     maxValues = compareResults;  // _mm256_blendv_epi8(currentValues, maxValues, compareResults); | ||||||
|                     inputPtr += 32; |                     inputPtr += 32; | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -113,7 +113,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const | |||||||
|             tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); |             tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); | ||||||
|             tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1); |             tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1); | ||||||
|             tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); |             tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); | ||||||
|             //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h |             // tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h | ||||||
|             tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1)); |             tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1)); | ||||||
|             _mm256_storeu_ps((float*)c, tmp); |             _mm256_storeu_ps((float*)c, tmp); | ||||||
|  |  | ||||||
| @@ -230,7 +230,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const | |||||||
|             tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); |             tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); | ||||||
|             tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1); |             tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1); | ||||||
|             tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); |             tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); | ||||||
|             //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h |             // tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h | ||||||
|             tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1)); |             tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1)); | ||||||
|             _mm256_store_ps((float*)c, tmp); |             _mm256_store_ps((float*)c, tmp); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -114,46 +114,6 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeV | |||||||
| } | } | ||||||
| #endif /* LV_HAVE_SSSE3 */ | #endif /* LV_HAVE_SSSE3 */ | ||||||
|  |  | ||||||
| //#ifdef LV_HAVE_SSE |  | ||||||
| //#include <xmmintrin.h> |  | ||||||
| // |  | ||||||
| //static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ |  | ||||||
| //    unsigned int number = 0; |  | ||||||
| //    const unsigned int quarterPoints = num_points / 4; |  | ||||||
| // |  | ||||||
| //    const float* complexVectorPtr = (float*)complexVector; |  | ||||||
| //    float* magnitudeVectorPtr = magnitudeVector; |  | ||||||
| // |  | ||||||
| //    __m128 cplxValue1, cplxValue2, iValue, qValue, result; |  | ||||||
| //    for(;number < quarterPoints; number++){ |  | ||||||
| //      cplxValue1 = _mm_loadu_ps(complexVectorPtr); |  | ||||||
| //      complexVectorPtr += 4; |  | ||||||
| // |  | ||||||
| //      cplxValue2 = _mm_loadu_ps(complexVectorPtr); |  | ||||||
| //      complexVectorPtr += 4; |  | ||||||
| // |  | ||||||
| //      // Arrange in i1i2i3i4 format |  | ||||||
| //      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); |  | ||||||
| //      // Arrange in q1q2q3q4 format |  | ||||||
| //      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); |  | ||||||
| // |  | ||||||
| //      iValue = _mm_mul_ps(iValue, iValue); // Square the I values |  | ||||||
| //      qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values |  | ||||||
| // |  | ||||||
| //      result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values |  | ||||||
| // |  | ||||||
| //      _mm_storeu_ps(magnitudeVectorPtr, result); |  | ||||||
| //      magnitudeVectorPtr += 4; |  | ||||||
| //    } |  | ||||||
| // |  | ||||||
| //    number = quarterPoints * 4; |  | ||||||
| //    for(; number < num_points; number++){ |  | ||||||
| //       float val1Real = *complexVectorPtr++; |  | ||||||
| //       float val1Imag = *complexVectorPtr++; |  | ||||||
| //      *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); |  | ||||||
| //    } |  | ||||||
| //} |  | ||||||
| //#endif /* LV_HAVE_SSE */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC | #ifdef LV_HAVE_GENERIC | ||||||
|  |  | ||||||
| @@ -228,46 +188,6 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeV | |||||||
| } | } | ||||||
| #endif /* LV_HAVE_SSSE3 */ | #endif /* LV_HAVE_SSSE3 */ | ||||||
|  |  | ||||||
| //#ifdef LV_HAVE_SSE |  | ||||||
| //#include <xmmintrin.h> |  | ||||||
| // |  | ||||||
| //static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ |  | ||||||
| //    unsigned int number = 0; |  | ||||||
| //    const unsigned int quarterPoints = num_points / 4; |  | ||||||
| // |  | ||||||
| //    const float* complexVectorPtr = (float*)complexVector; |  | ||||||
| //    float* magnitudeVectorPtr = magnitudeVector; |  | ||||||
| // |  | ||||||
| //    __m128 cplxValue1, cplxValue2, iValue, qValue, result; |  | ||||||
| //    for(;number < quarterPoints; number++){ |  | ||||||
| //      cplxValue1 = _mm_load_ps(complexVectorPtr); |  | ||||||
| //      complexVectorPtr += 4; |  | ||||||
| // |  | ||||||
| //      cplxValue2 = _mm_load_ps(complexVectorPtr); |  | ||||||
| //      complexVectorPtr += 4; |  | ||||||
| // |  | ||||||
| //      // Arrange in i1i2i3i4 format |  | ||||||
| //      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); |  | ||||||
| //      // Arrange in q1q2q3q4 format |  | ||||||
| //      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); |  | ||||||
| // |  | ||||||
| //      iValue = _mm_mul_ps(iValue, iValue); // Square the I values |  | ||||||
| //      qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values |  | ||||||
| // |  | ||||||
| //      result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values |  | ||||||
| // |  | ||||||
| //      _mm_store_ps(magnitudeVectorPtr, result); |  | ||||||
| //      magnitudeVectorPtr += 4; |  | ||||||
| //    } |  | ||||||
| // |  | ||||||
| //    number = quarterPoints * 4; |  | ||||||
| //    for(; number < num_points; number++){ |  | ||||||
| //       float val1Real = *complexVectorPtr++; |  | ||||||
| //       float val1Imag = *complexVectorPtr++; |  | ||||||
| //      *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); |  | ||||||
| //    } |  | ||||||
| //} |  | ||||||
| //#endif /* LV_HAVE_SSE */ |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_ORC | #ifdef LV_HAVE_ORC | ||||||
|   | |||||||
| @@ -56,6 +56,8 @@ | |||||||
|  * \li out:            Vector of the form lv_32fc_t out[n] = lv_cmake(cos(in[n]), sin(in[n])) |  * \li out:            Vector of the form lv_32fc_t out[n] = lv_cmake(cos(in[n]), sin(in[n])) | ||||||
|  * \li phase:          Pointer to a float containing the final phase, in radians. |  * \li phase:          Pointer to a float containing the final phase, in radians. | ||||||
|  * |  * | ||||||
|  |  * Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier | ||||||
|  |  * Based on algorithms from the cephes library http://www.netlib.org/cephes/ | ||||||
|  */ |  */ | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -69,8 +71,7 @@ | |||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE2 | #ifdef LV_HAVE_SSE2 | ||||||
| #include <emmintrin.h> | #include <emmintrin.h> | ||||||
| /* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier  */ |  | ||||||
| /* Based on algorithms from the cephes library http://www.netlib.org/cephes/   */ |  | ||||||
| static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) | static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) | ||||||
| { | { | ||||||
|     lv_32fc_t *bPtr = out; |     lv_32fc_t *bPtr = out; | ||||||
| @@ -225,8 +226,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t *out, const fl | |||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE2 | #ifdef LV_HAVE_SSE2 | ||||||
| #include <emmintrin.h> | #include <emmintrin.h> | ||||||
| /* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier  */ |  | ||||||
| /* Based on algorithms from the cephes library http://www.netlib.org/cephes/   */ |  | ||||||
| static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) | static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) | ||||||
| { | { | ||||||
|     lv_32fc_t *bPtr = out; |     lv_32fc_t *bPtr = out; | ||||||
| @@ -459,8 +459,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t *out, co | |||||||
|  |  | ||||||
| #ifdef LV_HAVE_AVX2 | #ifdef LV_HAVE_AVX2 | ||||||
| #include <immintrin.h> | #include <immintrin.h> | ||||||
| /* Based on algorithms from the cephes library http://www.netlib.org/cephes/ |  | ||||||
|  * Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/ |  | ||||||
| static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) | static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) | ||||||
| { | { | ||||||
|     lv_32fc_t *bPtr = out; |     lv_32fc_t *bPtr = out; | ||||||
| @@ -647,8 +646,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const fl | |||||||
|  |  | ||||||
| #ifdef LV_HAVE_AVX2 | #ifdef LV_HAVE_AVX2 | ||||||
| #include <immintrin.h> | #include <immintrin.h> | ||||||
| /* Based on algorithms from the cephes library http://www.netlib.org/cephes/ |  | ||||||
|  * Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/ |  | ||||||
| static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) | static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) | ||||||
| { | { | ||||||
|     lv_32fc_t *bPtr = out; |     lv_32fc_t *bPtr = out; | ||||||
| @@ -835,8 +833,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const fl | |||||||
|  |  | ||||||
| #ifdef LV_HAVE_NEONV7 | #ifdef LV_HAVE_NEONV7 | ||||||
| #include <arm_neon.h> | #include <arm_neon.h> | ||||||
| /* Adapted from http://gruntthepeon.free.fr/ssemath/neon_mathfun.h, original code from Julien Pommier  */ |  | ||||||
| /* Based on algorithms from the cephes library http://www.netlib.org/cephes/   */ |  | ||||||
| static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) | static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) | ||||||
| { | { | ||||||
|     lv_32fc_t *bPtr = out; |     lv_32fc_t *bPtr = out; | ||||||
|   | |||||||
| @@ -7,6 +7,7 @@ | |||||||
| #include <INTEGER.h> | #include <INTEGER.h> | ||||||
| #include <asn_codecs_prim.h>	/* Encoder and decoder of a primitive type */ | #include <asn_codecs_prim.h>	/* Encoder and decoder of a primitive type */ | ||||||
| #include <errno.h> | #include <errno.h> | ||||||
|  | #include <inttypes.h> | ||||||
|  |  | ||||||
| /* | /* | ||||||
|  * INTEGER basic type description. |  * INTEGER basic type description. | ||||||
| @@ -146,7 +147,7 @@ INTEGER__dump(asn_TYPE_descriptor_t *td, const INTEGER_t *st, asn_app_consume_by | |||||||
| 			scr = (char *)alloca(scrsize); | 			scr = (char *)alloca(scrsize); | ||||||
| 			if(plainOrXER == 0) | 			if(plainOrXER == 0) | ||||||
| 				ret = snprintf(scr, scrsize, | 				ret = snprintf(scr, scrsize, | ||||||
|                                "%lld (%s)", accum, el->enum_name); |               "%+"PRId64"(%s)", accum, el->enum_name); | ||||||
| 			else | 			else | ||||||
| 				ret = snprintf(scr, scrsize, | 				ret = snprintf(scr, scrsize, | ||||||
| 					"<%s/>", el->enum_name); | 					"<%s/>", el->enum_name); | ||||||
| @@ -160,7 +161,7 @@ INTEGER__dump(asn_TYPE_descriptor_t *td, const INTEGER_t *st, asn_app_consume_by | |||||||
| 			scr = scratch; | 			scr = scratch; | ||||||
| 			ret = snprintf(scr, scrsize, | 			ret = snprintf(scr, scrsize, | ||||||
| 				(specs && specs->field_unsigned) | 				(specs && specs->field_unsigned) | ||||||
|                            ?"%llu":"%lld", accum); |         ?"%"PRIu64:"%+"PRId64, accum); | ||||||
| 		} | 		} | ||||||
| 		assert(ret > 0 && (size_t)ret < scrsize); | 		assert(ret > 0 && (size_t)ret < scrsize); | ||||||
| 		return (cb(scr, ret, app_key) < 0) ? -1 : ret; | 		return (cb(scr, ret, app_key) < 0) ? -1 : ret; | ||||||
|   | |||||||
| @@ -7,6 +7,7 @@ | |||||||
| #include <INTEGER.h> | #include <INTEGER.h> | ||||||
| #include <asn_codecs_prim.h>	/* Encoder and decoder of a primitive type */ | #include <asn_codecs_prim.h>	/* Encoder and decoder of a primitive type */ | ||||||
| #include <errno.h> | #include <errno.h> | ||||||
|  | #include <inttypes.h> | ||||||
|  |  | ||||||
| /* | /* | ||||||
|  * INTEGER basic type description. |  * INTEGER basic type description. | ||||||
| @@ -146,7 +147,7 @@ INTEGER__dump(asn_TYPE_descriptor_t *td, const INTEGER_t *st, asn_app_consume_by | |||||||
| 			scr = (char *)alloca(scrsize); | 			scr = (char *)alloca(scrsize); | ||||||
| 			if(plainOrXER == 0) | 			if(plainOrXER == 0) | ||||||
| 				ret = snprintf(scr, scrsize, | 				ret = snprintf(scr, scrsize, | ||||||
| 					"%lld (%s)", accum, el->enum_name); | 					"%+"PRId64"(%s)", accum, el->enum_name); | ||||||
| 			else | 			else | ||||||
| 				ret = snprintf(scr, scrsize, | 				ret = snprintf(scr, scrsize, | ||||||
| 					"<%s/>", el->enum_name); | 					"<%s/>", el->enum_name); | ||||||
| @@ -160,7 +161,7 @@ INTEGER__dump(asn_TYPE_descriptor_t *td, const INTEGER_t *st, asn_app_consume_by | |||||||
| 			scr = scratch; | 			scr = scratch; | ||||||
| 			ret = snprintf(scr, scrsize, | 			ret = snprintf(scr, scrsize, | ||||||
| 				(specs && specs->field_unsigned) | 				(specs && specs->field_unsigned) | ||||||
| 				?"%llu":"%lld", accum); | 				?"%"PRIu64:"%+"PRId64, accum); | ||||||
| 		} | 		} | ||||||
| 		assert(ret > 0 && (size_t)ret < scrsize); | 		assert(ret > 0 && (size_t)ret < scrsize); | ||||||
| 		return (cb(scr, ret, app_key) < 0) ? -1 : ret; | 		return (cb(scr, ret, app_key) < 0) ? -1 : ret; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Carles Fernandez
					Carles Fernandez