1 /* Copyright (c) 2007-2008 CSIRO
2    Copyright (c) 2007-2009 Xiph.Org Foundation
3    Copyright (c) 2007-2016 Jean-Marc Valin */
4 /*
5    Redistribution and use in source and binary forms, with or without
6    modification, are permitted provided that the following conditions
7    are met:
8 
9    - Redistributions of source code must retain the above copyright
10    notice, this list of conditions and the following disclaimer.
11 
12    - Redistributions in binary form must reproduce the above copyright
13    notice, this list of conditions and the following disclaimer in the
14    documentation and/or other materials provided with the distribution.
15 
16    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28 
29 #ifdef HAVE_CONFIG_H
30 #include "config.h"
31 #endif
32 
33 #include <xmmintrin.h>
34 #include <emmintrin.h>
35 #include "celt_lpc.h"
36 #include "stack_alloc.h"
37 #include "mathops.h"
38 #include "vq.h"
39 #include "x86cpu.h"
40 
41 
42 #ifndef FIXED_POINT
43 
op_pvq_search_sse2(celt_norm * _X,int * iy,int K,int N,int arch)44 opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
45 {
46    int i, j;
47    int pulsesLeft;
48    float xy, yy;
49    VARDECL(celt_norm, y);
50    VARDECL(celt_norm, X);
51    VARDECL(float, signy);
52    __m128 signmask;
53    __m128 sums;
54    __m128i fours;
55    SAVE_STACK;
56 
57    (void)arch;
58    /* All bits set to zero, except for the sign bit. */
59    signmask = _mm_set_ps1(-0.f);
60    fours = _mm_set_epi32(4, 4, 4, 4);
61    ALLOC(y, N+3, celt_norm);
62    ALLOC(X, N+3, celt_norm);
63    ALLOC(signy, N+3, float);
64 
65    OPUS_COPY(X, _X, N);
66    X[N] = X[N+1] = X[N+2] = 0;
67    sums = _mm_setzero_ps();
68    for (j=0;j<N;j+=4)
69    {
70       __m128 x4, s4;
71       x4 = _mm_loadu_ps(&X[j]);
72       s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
73       /* Get rid of the sign */
74       x4 = _mm_andnot_ps(signmask, x4);
75       sums = _mm_add_ps(sums, x4);
76       /* Clear y and iy in case we don't do the projection. */
77       _mm_storeu_ps(&y[j], _mm_setzero_ps());
78       _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128());
79       _mm_storeu_ps(&X[j], x4);
80       _mm_storeu_ps(&signy[j], s4);
81    }
82    sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)));
83    sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 3, 0, 1)));
84 
85    xy = yy = 0;
86 
87    pulsesLeft = K;
88 
89    /* Do a pre-search by projecting on the pyramid */
90    if (K > (N>>1))
91    {
92       __m128i pulses_sum;
93       __m128 yy4, xy4;
94       __m128 rcp4;
95       opus_val32 sum = _mm_cvtss_f32(sums);
96       /* If X is too small, just replace it with a pulse at 0 */
97       /* Prevents infinities and NaNs from causing too many pulses
98          to be allocated. 64 is an approximation of infinity here. */
99       if (!(sum > EPSILON && sum < 64))
100       {
101          X[0] = QCONST16(1.f,14);
102          j=1; do
103             X[j]=0;
104          while (++j<N);
105          sums = _mm_set_ps1(1.f);
106       }
107       /* Using K+e with e < 1 guarantees we cannot get more than K pulses. */
108       rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K+.8)), _mm_rcp_ps(sums));
109       xy4 = yy4 = _mm_setzero_ps();
110       pulses_sum = _mm_setzero_si128();
111       for (j=0;j<N;j+=4)
112       {
113          __m128 rx4, x4, y4;
114          __m128i iy4;
115          x4 = _mm_loadu_ps(&X[j]);
116          rx4 = _mm_mul_ps(x4, rcp4);
117          iy4 = _mm_cvttps_epi32(rx4);
118          pulses_sum = _mm_add_epi32(pulses_sum, iy4);
119          _mm_storeu_si128((__m128i*)&iy[j], iy4);
120          y4 = _mm_cvtepi32_ps(iy4);
121          xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
122          yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
123          /* double the y[] vector so we don't have to do it in the search loop. */
124          _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
125       }
126       pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(1, 0, 3, 2)));
127       pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(2, 3, 0, 1)));
128       pulsesLeft -= _mm_cvtsi128_si32(pulses_sum);
129       xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(1, 0, 3, 2)));
130       xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(2, 3, 0, 1)));
131       xy = _mm_cvtss_f32(xy4);
132       yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(1, 0, 3, 2)));
133       yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(2, 3, 0, 1)));
134       yy = _mm_cvtss_f32(yy4);
135    }
136    X[N] = X[N+1] = X[N+2] = -100;
137    y[N] = y[N+1] = y[N+2] = 100;
138    celt_sig_assert(pulsesLeft>=0);
139 
140    /* This should never happen, but just in case it does (e.g. on silence)
141       we fill the first bin with pulses. */
142    if (pulsesLeft > N+3)
143    {
144       opus_val16 tmp = (opus_val16)pulsesLeft;
145       yy = MAC16_16(yy, tmp, tmp);
146       yy = MAC16_16(yy, tmp, y[0]);
147       iy[0] += pulsesLeft;
148       pulsesLeft=0;
149    }
150 
151    for (i=0;i<pulsesLeft;i++)
152    {
153       int best_id;
154       __m128 xy4, yy4;
155       __m128 max, max2;
156       __m128i count;
157       __m128i pos;
158       /* The squared magnitude term gets added anyway, so we might as well
159          add it outside the loop */
160       yy = ADD16(yy, 1);
161       xy4 = _mm_load1_ps(&xy);
162       yy4 = _mm_load1_ps(&yy);
163       max = _mm_setzero_ps();
164       pos = _mm_setzero_si128();
165       count = _mm_set_epi32(3, 2, 1, 0);
166       for (j=0;j<N;j+=4)
167       {
168          __m128 x4, y4, r4;
169          x4 = _mm_loadu_ps(&X[j]);
170          y4 = _mm_loadu_ps(&y[j]);
171          x4 = _mm_add_ps(x4, xy4);
172          y4 = _mm_add_ps(y4, yy4);
173          y4 = _mm_rsqrt_ps(y4);
174          r4 = _mm_mul_ps(x4, y4);
175          /* Update the index of the max. */
176          pos = _mm_max_epi16(pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max))));
177          /* Update the max. */
178          max = _mm_max_ps(max, r4);
179          /* Update the indices (+4) */
180          count = _mm_add_epi32(count, fours);
181       }
182       /* Horizontal max */
183       max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));
184       max2 = _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));
185       /* Now that max2 contains the max at all positions, look at which value(s) of the
186          partial max is equal to the global max. */
187       pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));
188       pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos));
189       pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2)));
190       best_id = _mm_cvtsi128_si32(pos);
191 
192       /* Updating the sums of the new pulse(s) */
193       xy = ADD32(xy, EXTEND32(X[best_id]));
194       /* We're multiplying y[j] by two so we don't have to do it here */
195       yy = ADD16(yy, y[best_id]);
196 
197       /* Only now that we've made the final choice, update y/iy */
198       /* Multiplying y[j] by 2 so we don't have to do it everywhere else */
199       y[best_id] += 2;
200       iy[best_id]++;
201    }
202 
203    /* Put the original sign back */
204    for (j=0;j<N;j+=4)
205    {
206       __m128i y4;
207       __m128i s4;
208       y4 = _mm_loadu_si128((__m128i*)&iy[j]);
209       s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));
210       y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
211       _mm_storeu_si128((__m128i*)&iy[j], y4);
212    }
213    RESTORE_STACK;
214    return yy;
215 }
216 
217 #endif
218