mirror of
https://github.com/gnss-sdr/gnss-sdr
synced 2025-02-24 06:50:08 +00:00
Merge branch 'next' of https://github.com/gnss-sdr/gnss-sdr into next
This commit is contained in:
commit
e5b16f47ae
@ -735,8 +735,8 @@ if(NOT ARMADILLO_FOUND)
|
||||
message(STATUS " Armadillo will be downloaded and built automatically ")
|
||||
message(STATUS " when doing 'make'. ")
|
||||
|
||||
set(armadillo_RELEASE 6.400.3)
|
||||
set(armadillo_MD5 "b4c18561f080c710dc5bcd89533e68f8")
|
||||
set(armadillo_RELEASE 6.500.4)
|
||||
set(armadillo_MD5 "a39f27197d24b3d25437fab6bb1d118f")
|
||||
|
||||
ExternalProject_Add(
|
||||
armadillo-${armadillo_RELEASE}
|
||||
|
56
README.md
56
README.md
@ -37,52 +37,30 @@ $ sudo apt-get install build-essential cmake git libboost-dev libboost-date-time
|
||||
libarmadillo-dev libgflags-dev libgoogle-glog-dev libgnutls-openssl-dev libgtest-dev
|
||||
~~~~~~
|
||||
|
||||
Once you have installed these packages, you can jump directly to [how to download the source code and build GNSS-SDR](#download-and-build-linux). Alternatively, if you need to manually install those libraries, please keep reading.
|
||||
Once you have installed these packages, you can jump directly to [how to download the source code and build GNSS-SDR](#download-and-build-linux). Alternatively, if you need to manually build and install those libraries, please keep reading.
|
||||
|
||||
Note for Ubuntu 14.04 LTS "trusty" users: you will need to build from source and install GNU Radio manually, as explained below, since GNSS-SDR requires gnuradio-dev >= 3.7.3, and Ubuntu 14.04 came with 3.7.2. Install all the packages above BUT EXCEPT ```libuhd-dev```, ```gnuradio-dev``` and ```gr-osmosdr``` (and remove them if they are already installed in your machine), and install those dependencies using PyBOMBS.
|
||||
Note for Ubuntu 14.04 LTS "trusty" users: you will need to build from source and install GNU Radio manually, as explained below, since GNSS-SDR requires gnuradio-dev >= 3.7.3, and Ubuntu 14.04 came with 3.7.2. Install all the packages above EXCEPT ```libuhd-dev```, ```gnuradio-dev``` and ```gr-osmosdr``` (and remove them if they are already installed in your machine), and install those dependencies using PyBOMBS.
|
||||
|
||||
### Manual installation of GNU Radio
|
||||
|
||||
Downloading, building and installing [GNU Radio](http://gnuradio.org/redmine/projects/gnuradio/wiki "GNU Radio's Homepage") and all its dependencies is not a simple task. We recommend to use [PyBOMBS](http://gnuradio.org/redmine/projects/pybombs/wiki "Python Build Overlay Managed Bundle System wiki") (Python Build Overlay Managed Bundle System), the GNU Radio install management system that automatically does all the work for you. In a terminal, type:
|
||||
Downloading, building and installing [GNU Radio](http://gnuradio.org/redmine/projects/gnuradio/wiki "GNU Radio's Homepage") and all its dependencies is not a simple task. We recommend using [PyBOMBS](http://gnuradio.org/pybombs "Python Build Overlay Managed Bundle System wiki") (Python Build Overlay Managed Bundle System), the GNU Radio install management system that automatically does all the work for you. In a terminal, type:
|
||||
|
||||
|
||||
~~~~~~
|
||||
$ git clone --recursive https://github.com/pybombs/pybombs
|
||||
~~~~~~
|
||||
$ git clone https://github.com/gnuradio/pybombs.git
|
||||
$ cd pybombs
|
||||
$ sudo python setup.py install
|
||||
$ pybombs recipes add gr-recipes https://github.com/gnuradio/gr-recipes.git
|
||||
$ pybombs recipes add gr-etcetera https://github.com/gnuradio/gr-etcetera.git
|
||||
$ sudo pybombs prefix init /usr/local -a myprefix
|
||||
$ pybombs config default_prefix myprefix
|
||||
$ sudo pybombs install gnuradio gr-osmosdr armadillo glog
|
||||
~~~~~~
|
||||
|
||||
Configure PyBOMBS:
|
||||
Other installation and configuration options are available from https://github.com/gnuradio/pybombs
|
||||
|
||||
~~~~~~
|
||||
$ ./pybombs config
|
||||
~~~~~~
|
||||
The last step can take some time (up to two hours to complete, depending on your system), and it downloads, builds and installs the latest versions of GNU Radio, related drivers and dependencies in your system.
|
||||
|
||||
You can safely accept the default options but for ```prefix```. We recommend to put ```/usr/local``` there. After the configuration, you should get something similar to:
|
||||
|
||||
~~~~~~
|
||||
gituser = username
|
||||
prefix = /usr/local
|
||||
satisfy_order = deb,src # For Debian/Ubuntu/LinuxMint
|
||||
satisfy_order = rpm,src # For Fedora/CentOS/RHEL/openSUSE
|
||||
forcepkgs =
|
||||
forcebuild = gnuradio,uhd,gr-osmosdr,rtl-sdr
|
||||
timeout = 30
|
||||
cmakebuildtype = RelWithDebInfo
|
||||
builddocs = OFF
|
||||
cc = gcc
|
||||
cxx = g++
|
||||
makewidth = 4
|
||||
~~~~~~
|
||||
|
||||
|
||||
Then, you are ready to download and install [UHD](http://files.ettus.com/uhd_docs/manual/html/) (the Universal Hardware Driver), GNU Radio and all their required dependencies by doing:
|
||||
|
||||
~~~~~~
|
||||
$ sudo ./pybombs install uhd gnuradio
|
||||
~~~~~~
|
||||
|
||||
This can take some time (up to two hours to complete, depending on your system), and downloads, builds and installs the latest versions of the Universal Hardware Driver (UHD) and GNU Radio in your system, including all their dependencies.
|
||||
In case you do not want to use PyBOMBS and prefer to build and install GNU Radio step by step, follow instructions at the [GNU Radio Build Guide](http://gnuradio.org/redmine/projects/gnuradio/wiki/BuildGuide).
|
||||
In case you do not want to use PyBOMBS and prefer to build and install GNU Radio step by step, follow instructions at the [GNU Radio Build Guide](http://gnuradio.org/redmine/projects/gnuradio/wiki/BuildGuide). Other GNSS-SDR dependencies can be built and installed manually as explained below.
|
||||
|
||||
|
||||
|
||||
@ -94,9 +72,9 @@ In case you do not want to use PyBOMBS and prefer to build and install GNU Radio
|
||||
$ sudo apt-get install libopenblas-dev liblapack-dev # For Debian/Ubuntu/LinuxMint
|
||||
$ sudo yum install lapack-devel blas-devel # For Fedora/CentOS/RHEL
|
||||
$ sudo zypper install lapack-devel blas-devel # For OpenSUSE
|
||||
$ wget http://sourceforge.net/projects/arma/files/armadillo-6.400.3.tar.gz
|
||||
$ tar xvfz armadillo-6.400.3.tar.gz
|
||||
$ cd armadillo-6.400.3
|
||||
$ wget http://sourceforge.net/projects/arma/files/armadillo-6.500.4.tar.gz
|
||||
$ tar xvfz armadillo-6.500.4.tar.gz
|
||||
$ cd armadillo-6.500.4
|
||||
$ cmake .
|
||||
$ make
|
||||
$ sudo make install
|
||||
|
@ -14,6 +14,12 @@ This library is automatically built and installed along with GNSS-SDR if it is n
|
||||
|
||||
However, you can install and use VOLK_GNSSSDR kernels as you use VOLK's, independently from GNSS-SDR.
|
||||
|
||||
First, make sure that the required dependencies are installed in your machine:
|
||||
|
||||
~~~~~~
|
||||
$ sudo apt-get install git subversion cmake python-cheetah libboost-dev libboost-filesystem-dev
|
||||
~~~~~~
|
||||
|
||||
In order to build and install the library, go to the base folder of the source code and do:
|
||||
|
||||
~~~~~~
|
||||
|
@ -11,13 +11,13 @@
|
||||
|
||||
static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_generic(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
|
||||
{
|
||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||
float rem_carrier_phase_in_rad=0.345;
|
||||
float phase_step_rad = 0.123;
|
||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||
float rem_carrier_phase_in_rad = 0.345;
|
||||
float phase_step_rad = 0.123;
|
||||
lv_32fc_t phase[1];
|
||||
phase[0]=lv_cmake(cos(rem_carrier_phase_in_rad), -sin(rem_carrier_phase_in_rad));
|
||||
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), -sin(rem_carrier_phase_in_rad));
|
||||
lv_32fc_t phase_inc[1];
|
||||
phase_inc[0]=lv_cmake(cos(phase_step_rad), -sin(phase_step_rad));
|
||||
phase_inc[0] = lv_cmake(cos(phase_step_rad), -sin(phase_step_rad));
|
||||
volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic(outVector, inVector, phase_inc[0], phase, num_points);
|
||||
}
|
||||
|
||||
@ -28,13 +28,13 @@ static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_generic(lv_16sc_t* outVe
|
||||
|
||||
static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_a_sse2(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
|
||||
{
|
||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||
float rem_carrier_phase_in_rad=0.345;
|
||||
float phase_step_rad = 0.123;
|
||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||
float rem_carrier_phase_in_rad = 0.345;
|
||||
float phase_step_rad = 0.123;
|
||||
lv_32fc_t phase[1];
|
||||
phase[0]=lv_cmake(cos(rem_carrier_phase_in_rad), -sin(rem_carrier_phase_in_rad));
|
||||
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), -sin(rem_carrier_phase_in_rad));
|
||||
lv_32fc_t phase_inc[1];
|
||||
phase_inc[0]=lv_cmake(cos(phase_step_rad), -sin(phase_step_rad));
|
||||
phase_inc[0] = lv_cmake(cos(phase_step_rad), -sin(phase_step_rad));
|
||||
volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse2(outVector, inVector, phase_inc[0], phase, num_points);
|
||||
}
|
||||
|
||||
@ -44,13 +44,13 @@ static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_a_sse2(lv_16sc_t* outVec
|
||||
|
||||
static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_u_sse2(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
|
||||
{
|
||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||
float rem_carrier_phase_in_rad=0.345;
|
||||
float phase_step_rad = 0.123;
|
||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||
float rem_carrier_phase_in_rad = 0.345;
|
||||
float phase_step_rad = 0.123;
|
||||
lv_32fc_t phase[1];
|
||||
phase[0]=lv_cmake(cos(rem_carrier_phase_in_rad), -sin(rem_carrier_phase_in_rad));
|
||||
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), -sin(rem_carrier_phase_in_rad));
|
||||
lv_32fc_t phase_inc[1];
|
||||
phase_inc[0]=lv_cmake(cos(phase_step_rad), -sin(phase_step_rad));
|
||||
phase_inc[0] = lv_cmake(cos(phase_step_rad), -sin(phase_step_rad));
|
||||
volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse2(outVector, inVector, phase_inc[0], phase, num_points);
|
||||
}
|
||||
|
||||
@ -60,13 +60,13 @@ static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_u_sse2(lv_16sc_t* outVec
|
||||
|
||||
static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_neon(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
|
||||
{
|
||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||
float rem_carrier_phase_in_rad=0.345;
|
||||
float phase_step_rad = 0.123;
|
||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||
float rem_carrier_phase_in_rad = 0.345;
|
||||
float phase_step_rad = 0.123;
|
||||
lv_32fc_t phase[1];
|
||||
phase[0]=lv_cmake(cos(rem_carrier_phase_in_rad), -sin(rem_carrier_phase_in_rad));
|
||||
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), -sin(rem_carrier_phase_in_rad));
|
||||
lv_32fc_t phase_inc[1];
|
||||
phase_inc[0]=lv_cmake(cos(phase_step_rad), -sin(phase_step_rad));
|
||||
phase_inc[0] = lv_cmake(cos(phase_step_rad), -sin(phase_step_rad));
|
||||
volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(outVector, inVector, phase_inc[0], phase, num_points);
|
||||
}
|
||||
|
||||
|
@ -38,7 +38,7 @@
|
||||
|
||||
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#define ROTATOR_RELOAD 512
|
||||
|
||||
|
||||
@ -58,14 +58,12 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic(lv_16sc_t* ou
|
||||
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
|
||||
*outVector++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
|
||||
(*phase) *= phase_inc;
|
||||
tmp32=(*phase);
|
||||
//printf("[%i][%i] phase fc: %f,%f \n",i,j,lv_creal(tmp32),lv_cimag(tmp32));
|
||||
}
|
||||
}
|
||||
for(i = 0; i < num_points % ROTATOR_RELOAD; ++i)
|
||||
{
|
||||
tmp16 = *inVector++;
|
||||
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
|
||||
tmp16 = *inVector++;
|
||||
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
|
||||
*outVector++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
|
||||
(*phase) *= phase_inc;
|
||||
}
|
||||
@ -79,95 +77,94 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic(lv_16sc_t* ou
|
||||
|
||||
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse2(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 4;
|
||||
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
|
||||
const unsigned int sse_iters = num_points / 4;
|
||||
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
|
||||
|
||||
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
|
||||
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
|
||||
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
|
||||
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
|
||||
|
||||
const lv_16sc_t* _in_a = inVector;
|
||||
__attribute__((aligned(32))) lv_32fc_t four_phase_rotations_32fc[4];
|
||||
// debug
|
||||
//__attribute__((aligned(16))) lv_16sc_t four_phase_rotations_16sc[4];
|
||||
const lv_16sc_t* _in_a = inVector;
|
||||
__attribute__((aligned(32))) lv_32fc_t four_phase_rotations_32fc[4];
|
||||
// debug
|
||||
//__attribute__((aligned(16))) lv_16sc_t four_phase_rotations_16sc[4];
|
||||
|
||||
// specify how many bits are used in the rotation (2^(N-1)) (it WILL increase the output signal range!)
|
||||
__attribute__((aligned(32))) float rotator_amplitude_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
||||
__m128 _rotator_amplitude_reg = _mm_load_ps(rotator_amplitude_float);
|
||||
// specify how many bits are used in the rotation (2^(N-1)) (it WILL increase the output signal range!)
|
||||
__attribute__((aligned(32))) float rotator_amplitude_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
||||
__m128 _rotator_amplitude_reg = _mm_load_ps(rotator_amplitude_float);
|
||||
|
||||
//const lv_16sc_t* _in_b = in_b;
|
||||
lv_16sc_t* _out = outVector;
|
||||
//const lv_16sc_t* _in_b = in_b;
|
||||
lv_16sc_t* _out = outVector;
|
||||
|
||||
__m128 fc_reg1, fc_reg2;
|
||||
__m128i sc_reg1, sc_reg2; // is __m128i defined in xmmintrin.h?
|
||||
__m128 fc_reg1, fc_reg2;
|
||||
__m128i sc_reg1, sc_reg2; // is __m128i defined in xmmintrin.h?
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
{
|
||||
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
|
||||
//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
|
||||
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
||||
a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||
//b = _mm_loadu_si128((__m128i*)_in_b);
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
{
|
||||
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
|
||||
//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
|
||||
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
||||
a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||
//b = _mm_loadu_si128((__m128i*)_in_b);
|
||||
|
||||
// compute next four 16ic complex exponential values for phase rotation
|
||||
// compute next four 16ic complex exponential values for phase rotation
|
||||
|
||||
// compute next four float complex rotations
|
||||
four_phase_rotations_32fc[0]=*phase;
|
||||
(*phase) *= phase_inc;
|
||||
four_phase_rotations_32fc[1]=*phase;
|
||||
(*phase) *= phase_inc;
|
||||
four_phase_rotations_32fc[2]=*phase;
|
||||
(*phase) *= phase_inc;
|
||||
four_phase_rotations_32fc[3]=*phase;
|
||||
(*phase) *= phase_inc;
|
||||
//convert the rotations to integers
|
||||
fc_reg1 = _mm_load_ps((float*)&four_phase_rotations_32fc[0]);
|
||||
// compute next four float complex rotations
|
||||
four_phase_rotations_32fc[0]=*phase;
|
||||
(*phase) *= phase_inc;
|
||||
four_phase_rotations_32fc[1]=*phase;
|
||||
(*phase) *= phase_inc;
|
||||
four_phase_rotations_32fc[2]=*phase;
|
||||
(*phase) *= phase_inc;
|
||||
four_phase_rotations_32fc[3]=*phase;
|
||||
(*phase) *= phase_inc;
|
||||
//convert the rotations to integers
|
||||
fc_reg1 = _mm_load_ps((float*)&four_phase_rotations_32fc[0]);
|
||||
|
||||
// disable next line for 1 bit rotation (equivalent to a square wave NCO)
|
||||
fc_reg1 = _mm_mul_ps (fc_reg1, _rotator_amplitude_reg);
|
||||
// disable next line for 1 bit rotation (equivalent to a square wave NCO)
|
||||
fc_reg1 = _mm_mul_ps (fc_reg1, _rotator_amplitude_reg);
|
||||
|
||||
fc_reg2 = _mm_load_ps((float*)&four_phase_rotations_32fc[2]);
|
||||
sc_reg1 = _mm_cvtps_epi32(fc_reg1);
|
||||
sc_reg2 = _mm_cvtps_epi32(fc_reg2);
|
||||
b = _mm_packs_epi32(sc_reg1, sc_reg2);
|
||||
fc_reg2 = _mm_load_ps((float*)&four_phase_rotations_32fc[2]);
|
||||
sc_reg1 = _mm_cvtps_epi32(fc_reg1);
|
||||
sc_reg2 = _mm_cvtps_epi32(fc_reg2);
|
||||
b = _mm_packs_epi32(sc_reg1, sc_reg2);
|
||||
|
||||
// debug
|
||||
//_mm_store_si128((__m128i*)four_phase_rotations_16sc, b);
|
||||
//printf("phase fc: %f,%f phase sc: %i,%i \n",lv_creal(four_phase_rotations_32fc[0]),lv_cimag(four_phase_rotations_32fc[0]),lv_creal(four_phase_rotations_16sc[0]),lv_cimag(four_phase_rotations_16sc[0]));
|
||||
// debug
|
||||
//_mm_store_si128((__m128i*)four_phase_rotations_16sc, b);
|
||||
//printf("phase fc: %f,%f phase sc: %i,%i \n",lv_creal(four_phase_rotations_32fc[0]),lv_cimag(four_phase_rotations_32fc[0]),lv_creal(four_phase_rotations_16sc[0]),lv_cimag(four_phase_rotations_16sc[0]));
|
||||
|
||||
// multiply the input vector times the rotations
|
||||
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
||||
// multiply the input vector times the rotations
|
||||
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
||||
|
||||
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||
real = _mm_subs_epi16 (c, c_sr);
|
||||
real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
|
||||
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||
real = _mm_subs_epi16 (c, c_sr);
|
||||
real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
|
||||
|
||||
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
||||
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
||||
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
||||
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
||||
|
||||
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
||||
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
||||
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
||||
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
||||
|
||||
imag = _mm_adds_epi16(imag1, imag2);
|
||||
imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
|
||||
imag = _mm_adds_epi16(imag1, imag2);
|
||||
imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
|
||||
|
||||
result = _mm_or_si128 (real, imag);
|
||||
result = _mm_or_si128 (real, imag);
|
||||
|
||||
// normalize the rotations
|
||||
// TODO
|
||||
// normalize the rotations
|
||||
// TODO
|
||||
|
||||
// store results
|
||||
_mm_store_si128((__m128i*)_out, result);
|
||||
// store results
|
||||
_mm_store_si128((__m128i*)_out, result);
|
||||
|
||||
_in_a += 4;
|
||||
_out += 4;
|
||||
}
|
||||
|
||||
for (unsigned int i = sse_iters * 4; i < num_points; ++i)
|
||||
{
|
||||
*_out++ = *_in_a++ * (*phase);
|
||||
(*phase) *= phase_inc;
|
||||
}
|
||||
_in_a += 4;
|
||||
_out += 4;
|
||||
}
|
||||
|
||||
for (unsigned int i = sse_iters * 4; i < num_points; ++i)
|
||||
{
|
||||
*_out++ = *_in_a++ * (*phase);
|
||||
(*phase) *= phase_inc;
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
@ -177,128 +174,200 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse2(lv_16sc_t* out
|
||||
|
||||
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse2(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 4;
|
||||
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
|
||||
const unsigned int sse_iters = num_points / 4;
|
||||
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
|
||||
|
||||
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
|
||||
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
|
||||
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
|
||||
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
|
||||
|
||||
const lv_16sc_t* _in_a = inVector;
|
||||
__attribute__((aligned(32))) lv_32fc_t four_phase_rotations_32fc[4];
|
||||
// debug
|
||||
//__attribute__((aligned(16))) lv_16sc_t four_phase_rotations_16sc[4];
|
||||
const lv_16sc_t* _in_a = inVector;
|
||||
__attribute__((aligned(32))) lv_32fc_t four_phase_rotations_32fc[4];
|
||||
// debug
|
||||
//__attribute__((aligned(16))) lv_16sc_t four_phase_rotations_16sc[4];
|
||||
|
||||
// specify how many bits are used in the rotation (2^(N-1)) (it WILL increase the output signal range!)
|
||||
__attribute__((aligned(32))) float rotator_amplitude_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
||||
__m128 _rotator_amplitude_reg = _mm_load_ps(rotator_amplitude_float);
|
||||
// specify how many bits are used in the rotation (2^(N-1)) (it WILL increase the output signal range!)
|
||||
__attribute__((aligned(32))) float rotator_amplitude_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
||||
__m128 _rotator_amplitude_reg = _mm_load_ps(rotator_amplitude_float);
|
||||
|
||||
//const lv_16sc_t* _in_b = in_b;
|
||||
lv_16sc_t* _out = outVector;
|
||||
//const lv_16sc_t* _in_b = in_b;
|
||||
lv_16sc_t* _out = outVector;
|
||||
|
||||
__m128 fc_reg1, fc_reg2;
|
||||
__m128i sc_reg1, sc_reg2; // is __m128i defined in xmmintrin.h?
|
||||
__m128 fc_reg1, fc_reg2;
|
||||
__m128i sc_reg1, sc_reg2; // is __m128i defined in xmmintrin.h?
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
{
|
||||
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
|
||||
//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
|
||||
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
||||
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||
//b = _mm_loadu_si128((__m128i*)_in_b);
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
{
|
||||
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
|
||||
//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
|
||||
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
||||
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||
//b = _mm_loadu_si128((__m128i*)_in_b);
|
||||
|
||||
// compute next four 16ic complex exponential values for phase rotation
|
||||
// compute next four 16ic complex exponential values for phase rotation
|
||||
|
||||
// compute next four float complex rotations
|
||||
four_phase_rotations_32fc[0]=*phase;
|
||||
(*phase) *= phase_inc;
|
||||
four_phase_rotations_32fc[1]=*phase;
|
||||
(*phase) *= phase_inc;
|
||||
four_phase_rotations_32fc[2]=*phase;
|
||||
(*phase) *= phase_inc;
|
||||
four_phase_rotations_32fc[3]=*phase;
|
||||
(*phase) *= phase_inc;
|
||||
//convert the rotations to integers
|
||||
fc_reg1 = _mm_load_ps((float*)&four_phase_rotations_32fc[0]);
|
||||
// compute next four float complex rotations
|
||||
four_phase_rotations_32fc[0]=*phase;
|
||||
(*phase) *= phase_inc;
|
||||
four_phase_rotations_32fc[1]=*phase;
|
||||
(*phase) *= phase_inc;
|
||||
four_phase_rotations_32fc[2]=*phase;
|
||||
(*phase) *= phase_inc;
|
||||
four_phase_rotations_32fc[3]=*phase;
|
||||
(*phase) *= phase_inc;
|
||||
//convert the rotations to integers
|
||||
fc_reg1 = _mm_load_ps((float*)&four_phase_rotations_32fc[0]);
|
||||
|
||||
// disable next line for 1 bit rotation (equivalent to a square wave NCO)
|
||||
fc_reg1 = _mm_mul_ps (fc_reg1, _rotator_amplitude_reg);
|
||||
// disable next line for 1 bit rotation (equivalent to a square wave NCO)
|
||||
fc_reg1 = _mm_mul_ps (fc_reg1, _rotator_amplitude_reg);
|
||||
|
||||
fc_reg2 = _mm_load_ps((float*)&four_phase_rotations_32fc[2]);
|
||||
sc_reg1 = _mm_cvtps_epi32(fc_reg1);
|
||||
sc_reg2 = _mm_cvtps_epi32(fc_reg2);
|
||||
b = _mm_packs_epi32(sc_reg1, sc_reg2);
|
||||
fc_reg2 = _mm_load_ps((float*)&four_phase_rotations_32fc[2]);
|
||||
sc_reg1 = _mm_cvtps_epi32(fc_reg1);
|
||||
sc_reg2 = _mm_cvtps_epi32(fc_reg2);
|
||||
b = _mm_packs_epi32(sc_reg1, sc_reg2);
|
||||
|
||||
// debug
|
||||
//_mm_store_si128((__m128i*)four_phase_rotations_16sc, b);
|
||||
//printf("phase fc: %f,%f phase sc: %i,%i \n",lv_creal(four_phase_rotations_32fc[0]),lv_cimag(four_phase_rotations_32fc[0]),lv_creal(four_phase_rotations_16sc[0]),lv_cimag(four_phase_rotations_16sc[0]));
|
||||
// debug
|
||||
//_mm_store_si128((__m128i*)four_phase_rotations_16sc, b);
|
||||
//printf("phase fc: %f,%f phase sc: %i,%i \n",lv_creal(four_phase_rotations_32fc[0]),lv_cimag(four_phase_rotations_32fc[0]),lv_creal(four_phase_rotations_16sc[0]),lv_cimag(four_phase_rotations_16sc[0]));
|
||||
|
||||
// multiply the input vector times the rotations
|
||||
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
||||
// multiply the input vector times the rotations
|
||||
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
||||
|
||||
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||
real = _mm_subs_epi16 (c, c_sr);
|
||||
real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
|
||||
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||
real = _mm_subs_epi16 (c, c_sr);
|
||||
real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
|
||||
|
||||
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
||||
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
||||
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
||||
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
||||
|
||||
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
||||
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
||||
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
||||
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
||||
|
||||
imag = _mm_adds_epi16(imag1, imag2);
|
||||
imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
|
||||
imag = _mm_adds_epi16(imag1, imag2);
|
||||
imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
|
||||
|
||||
result = _mm_or_si128 (real, imag);
|
||||
result = _mm_or_si128 (real, imag);
|
||||
|
||||
// normalize the rotations
|
||||
// TODO
|
||||
// normalize the rotations
|
||||
// TODO
|
||||
|
||||
// store results
|
||||
_mm_storeu_si128((__m128i*)_out, result);
|
||||
// store results
|
||||
_mm_storeu_si128((__m128i*)_out, result);
|
||||
|
||||
_in_a += 4;
|
||||
_out += 4;
|
||||
}
|
||||
|
||||
for (unsigned int i = sse_iters * 4; i < num_points; ++i)
|
||||
{
|
||||
*_out++ = *_in_a++ * (*phase);
|
||||
(*phase) *= phase_inc;
|
||||
}
|
||||
_in_a += 4;
|
||||
_out += 4;
|
||||
}
|
||||
|
||||
for (unsigned int i = sse_iters * 4; i < num_points; ++i)
|
||||
{
|
||||
*_out++ = *_in_a++ * (*phase);
|
||||
(*phase) *= phase_inc;
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
#ifdef LV_HAVE_NEON
|
||||
#include <arm.neon.h>
|
||||
#include <arm_neon.h>
|
||||
|
||||
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
|
||||
{
|
||||
unsigned int i = 0;
|
||||
int j = 0;
|
||||
lv_16sc_t tmp16;
|
||||
lv_32fc_t tmp32;
|
||||
for(i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i)
|
||||
const unsigned int neon_iters = num_points / 4;
|
||||
lv_16sc_t tmp16_;
|
||||
lv_32fc_t tmp32_;
|
||||
|
||||
const lv_16sc_t* _in = inVector;
|
||||
lv_16sc_t* _out = outVector;
|
||||
|
||||
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
|
||||
float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) };
|
||||
float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) };
|
||||
|
||||
float32x4_t _phase4_real = vld1q_f32(__phase4_real);
|
||||
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
|
||||
|
||||
lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc;
|
||||
lv_32fc_t phase3 = phase2 * phase_inc;
|
||||
lv_32fc_t phase4 = phase3 * phase_inc;
|
||||
|
||||
float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) };
|
||||
float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) };
|
||||
|
||||
float32x4_t _phase_real = vld1q_f32(__phase_real);
|
||||
float32x4_t _phase_imag = vld1q_f32(__phase_imag);
|
||||
|
||||
float32x4_t half = vdupq_n_f32(0.5f);
|
||||
int16x4x2_t tmp16;
|
||||
int32x4x2_t tmp32i;
|
||||
float32x4x2_t tmp32f, tmp_real, tmp_imag;
|
||||
float32x4_t sign, PlusHalf, Round;
|
||||
|
||||
if (neon_iters > 0)
|
||||
{
|
||||
for(j = 0; j < ROTATOR_RELOAD; ++j)
|
||||
for(; i < neon_iters; ++i)
|
||||
{
|
||||
tmp16 = *inVector++;
|
||||
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
|
||||
*outVector++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
|
||||
(*phase) *= phase_inc;
|
||||
tmp32=(*phase);
|
||||
printf("[%i][%i] phase fc: %f,%f \n",i,j,lv_creal(tmp32),lv_cimag(tmp32));
|
||||
/* load 4 complex numbers (int 16 bits each component) */
|
||||
tmp16 = vld2_s16((int16_t*)_in); _in += 4;
|
||||
|
||||
/* promote them to int 32 bits */
|
||||
tmp32i.val[0] = vmovl_s16(tmp16.val[0]);
|
||||
tmp32i.val[1] = vmovl_s16(tmp16.val[1]);
|
||||
|
||||
/* promote them to float 32 bits */
|
||||
tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]);
|
||||
tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]);
|
||||
|
||||
/* complex multiplication of four complex samples (float 32 bits each component) */
|
||||
tmp_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real);
|
||||
tmp_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag);
|
||||
tmp_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag);
|
||||
tmp_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real);
|
||||
|
||||
tmp32f.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
|
||||
tmp32f.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
|
||||
|
||||
/* downcast results to int32 */
|
||||
/* in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); */
|
||||
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31)));
|
||||
PlusHalf = vaddq_f32(tmp32f.val[0], half);
|
||||
Round = vsubq_f32(PlusHalf, sign);
|
||||
tmp32i.val[0] = vcvtq_s32_f32(Round);
|
||||
|
||||
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31)));
|
||||
PlusHalf = vaddq_f32(tmp32f.val[1], half);
|
||||
Round = vsubq_f32(PlusHalf, sign);
|
||||
tmp32i.val[1] = vcvtq_s32_f32(Round);
|
||||
|
||||
/* downcast results to int16 */
|
||||
tmp16.val[0] = vqmovn_s32(tmp32i.val[0]);
|
||||
tmp16.val[1] = vqmovn_s32(tmp32i.val[1]);
|
||||
|
||||
/* compute next four phases */
|
||||
tmp_real.val[0] = vmulq_f32(_phase_real, _phase4_real);
|
||||
tmp_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag);
|
||||
tmp_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag);
|
||||
tmp_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real);
|
||||
|
||||
_phase_real = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
|
||||
_phase_imag = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
|
||||
|
||||
/* store the four complex results */
|
||||
vst2_s16((int16_t*)_out, tmp16);
|
||||
_out += 4;
|
||||
}
|
||||
vst1q_f32((float32_t*)__phase_real, _phase_real);
|
||||
vst1q_f32((float32_t*)__phase_imag, _phase_imag);
|
||||
|
||||
(*phase) = lv_cmake(__phase_real[3], __phase_imag[3]);
|
||||
}
|
||||
for(i = 0; i < num_points % ROTATOR_RELOAD; ++i)
|
||||
for(i = 0; i < neon_iters % 4; ++i)
|
||||
{
|
||||
tmp16 = *inVector++;
|
||||
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
|
||||
*outVector++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
|
||||
tmp16_ = *_in++;
|
||||
tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
|
||||
*_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_)));
|
||||
(*phase) *= phase_inc;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* LV_HAVE_NEON */
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_H */
|
||||
|
@ -190,4 +190,90 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_NEON
|
||||
#include <arm_neon.h>
|
||||
/*!
|
||||
\brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part)
|
||||
\param inputVector The floating point input data buffer
|
||||
\param outputVector The 16 bit output data buffer
|
||||
\param num_points The number of data values to be converted
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int neon_iters = num_points / 8;
|
||||
|
||||
float32_t* inputVectorPtr = (float32_t*)inputVector;
|
||||
int8_t* outputVectorPtr = (int8_t*)outputVector;
|
||||
|
||||
const float32x4_t min_val = vmovq_n_f32((float32_t)SCHAR_MIN);
|
||||
const float32x4_t max_val = vmovq_n_f32((float32_t)SCHAR_MAX);
|
||||
|
||||
const float32x4_t half = vdupq_n_f32(0.5f);
|
||||
|
||||
float32x4_t sign, PlusHalf, Round, ret1, a;
|
||||
int32x4_t toint_a;
|
||||
int16x4_t intInputVal1, intInputVal2;
|
||||
int16x8_t pack16_8_1;
|
||||
int8x8_t res8_1, res8_2;
|
||||
int8x16_t outputVal;
|
||||
|
||||
for(unsigned int i = 0; i < neon_iters; i++)
|
||||
{
|
||||
a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4;
|
||||
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
|
||||
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
|
||||
PlusHalf = vaddq_f32(ret1, half);
|
||||
Round = vsubq_f32(PlusHalf, sign);
|
||||
toint_a = vcvtq_s32_f32(Round);
|
||||
intInputVal1 = vqmovn_s32(toint_a);
|
||||
|
||||
a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4;
|
||||
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
|
||||
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
|
||||
PlusHalf = vaddq_f32(ret1, half);
|
||||
Round = vsubq_f32(PlusHalf, sign);
|
||||
toint_a = vcvtq_s32_f32(Round);
|
||||
intInputVal2 = vqmovn_s32(toint_a);
|
||||
|
||||
pack16_8_1 = vcombine_s16(intInputVal1, intInputVal2);
|
||||
res8_1 = vqmovn_s16(pack16_8_1);
|
||||
|
||||
a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4;
|
||||
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
|
||||
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
|
||||
PlusHalf = vaddq_f32(ret1, half);
|
||||
Round = vsubq_f32(PlusHalf, sign);
|
||||
toint_a = vcvtq_s32_f32(Round);
|
||||
intInputVal1 = vqmovn_s32(toint_a);
|
||||
|
||||
a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4;
|
||||
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
|
||||
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
|
||||
PlusHalf = vaddq_f32(ret1, half);
|
||||
Round = vsubq_f32(PlusHalf, sign);
|
||||
toint_a = vcvtq_s32_f32(Round);
|
||||
intInputVal2 = vqmovn_s32(toint_a);
|
||||
|
||||
pack16_8_1 = vcombine_s16(intInputVal1, intInputVal2);
|
||||
res8_2 = vqmovn_s16(pack16_8_1);
|
||||
|
||||
outputVal = vcombine_s8(res8_1, res8_2);
|
||||
|
||||
vst1q_s8((int8_t*)outputVectorPtr, outputVal);
|
||||
outputVectorPtr += 16;
|
||||
}
|
||||
|
||||
for(unsigned int i = neon_iters * 16; i < num_points * 2; i++)
|
||||
{
|
||||
if(inputVectorPtr[i] > (float32_t)SCHAR_MAX)
|
||||
inputVectorPtr[i] = (float32_t)SCHAR_MAX;
|
||||
else if(inputVectorPtr[i] < (float32_t)SCHAR_MIN)
|
||||
inputVectorPtr[i] = (float32_t)SCHAR_MIN;
|
||||
*outputVectorPtr++ = (int8_t)rintf(*inputVectorPtr++);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* LV_HAVE_NEON */
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_H */
|
||||
|
Loading…
x
Reference in New Issue
Block a user