1 /* libFLAC - Free Lossless Audio Codec library
2  * Copyright (C) 2000-2009  Josh Coalson
3  * Copyright (C) 2011-2016  Xiph.Org Foundation
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * - Redistributions of source code must retain the above copyright
10  * notice, this list of conditions and the following disclaimer.
11  *
12  * - Redistributions in binary form must reproduce the above copyright
13  * notice, this list of conditions and the following disclaimer in the
14  * documentation and/or other materials provided with the distribution.
15  *
16  * - Neither the name of the Xiph.org Foundation nor the names of its
17  * contributors may be used to endorse or promote products derived from
18  * this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
24  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #ifdef HAVE_CONFIG_H
34 #  include <config.h>
35 #endif
36 
37 #include "private/cpu.h"
38 
39 #ifndef FLAC__INTEGER_ONLY_LIBRARY
40 #ifndef FLAC__NO_ASM
41 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
42 #include "private/lpc.h"
43 #ifdef FLAC__SSE2_SUPPORTED
44 
45 #include "FLAC/assert.h"
46 #include "FLAC/format.h"
47 
48 #include <emmintrin.h> /* SSE2 */
49 
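/* Each *_RESULT macro extracts the 32-bit prediction from the low dword of
 * the given XMM register, applies the quantization shift, and emits one
 * residual (RESIDUAL*) or restores one sample (DATA*).  The 16-bit variants
 * advance the data/residual pointers; the 32-bit variants index with i. */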
50 #define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
51 #define     DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;
52 
53 #define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
54 #define     DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
55 
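/*
 * 16-bit residual computation: each quantized coefficient is broadcast as a
 * {coeff, 0} 16-bit pair so that one _mm_madd_epi16 against four consecutive
 * 32-bit history samples yields coeff*sample in every dword lane.  Lane k of
 * the accumulator therefore holds sum_j qlp_coeff[j]*data[i+k-j-1], i.e. the
 * predictions for samples i..i+3, giving four residuals per pass:
 *
 *   residual[n] = data[n] - ((sum_j qlp_coeff[j]*data[n-j-1]) >> lp_quantization)
 *
 * This assumes both samples and coefficients fit in 16 bits, which is how
 * this variant is expected to be selected by the caller.
 */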
56 FLAC__SSE_TARGET("sse2")
57 void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
58 {
59 	int i;
60 	FLAC__int32 sum;
61 	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
62 
63 	FLAC__ASSERT(order > 0);
64 	FLAC__ASSERT(order <= 32);
65 
66 	if(order <= 12) {
67 		if(order > 8) {
68 			if(order > 10) {
69 				if(order == 12) {
70 					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
71 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
72 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
73 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
74 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
75 					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
76 					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
77 					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
78 					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
79 					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
80 					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
81 					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
82 					q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));
83 
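					/* four residuals per iteration; the same pattern repeats below for each lower order with one tap fewer */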
84 					for(i = 0; i < (int)data_len-3; i+=4) {
85 						__m128i summ, mull;
86 						summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
87 						mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
88 						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
89 						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
90 						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
91 						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
92 						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
93 						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
94 						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
95 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
96 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
97 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
98 						summ = _mm_sra_epi32(summ, cnt);
99 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
100 					}
101 				}
102 				else { /* order == 11 */
103 					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
104 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
105 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
106 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
107 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
108 					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
109 					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
110 					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
111 					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
112 					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
113 					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
114 					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
115 
116 					for(i = 0; i < (int)data_len-3; i+=4) {
117 						__m128i summ, mull;
118 						summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
119 						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
120 						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
121 						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
122 						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
123 						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
124 						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
125 						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
126 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
127 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
128 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
129 						summ = _mm_sra_epi32(summ, cnt);
130 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
131 					}
132 				}
133 			}
134 			else {
135 				if(order == 10) {
136 					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
137 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
138 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
139 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
140 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
141 					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
142 					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
143 					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
144 					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
145 					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
146 					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
147 
148 					for(i = 0; i < (int)data_len-3; i+=4) {
149 						__m128i summ, mull;
150 						summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
151 						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
152 						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
153 						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
154 						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
155 						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
156 						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
157 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
158 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
159 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
160 						summ = _mm_sra_epi32(summ, cnt);
161 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
162 					}
163 				}
164 				else { /* order == 9 */
165 					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
166 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
167 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
168 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
169 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
170 					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
171 					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
172 					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
173 					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
174 					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
175 
176 					for(i = 0; i < (int)data_len-3; i+=4) {
177 						__m128i summ, mull;
178 						summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
179 						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
180 						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
181 						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
182 						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
183 						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
184 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
185 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
186 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
187 						summ = _mm_sra_epi32(summ, cnt);
188 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
189 					}
190 				}
191 			}
192 		}
193 		else if(order > 4) {
194 			if(order > 6) {
195 				if(order == 8) {
196 					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
197 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
198 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
199 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
200 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
201 					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
202 					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
203 					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
204 					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
205 
206 					for(i = 0; i < (int)data_len-3; i+=4) {
207 						__m128i summ, mull;
208 						summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
209 						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
210 						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
211 						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
212 						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
213 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
214 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
215 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
216 						summ = _mm_sra_epi32(summ, cnt);
217 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
218 					}
219 				}
220 				else { /* order == 7 */
221 					__m128i q0, q1, q2, q3, q4, q5, q6;
222 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
223 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
224 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
225 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
226 					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
227 					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
228 					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
229 
230 					for(i = 0; i < (int)data_len-3; i+=4) {
231 						__m128i summ, mull;
232 						summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
233 						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
234 						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
235 						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
236 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
237 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
238 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
239 						summ = _mm_sra_epi32(summ, cnt);
240 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
241 					}
242 				}
243 			}
244 			else {
245 				if(order == 6) {
246 					__m128i q0, q1, q2, q3, q4, q5;
247 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
248 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
249 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
250 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
251 					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
252 					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
253 
254 					for(i = 0; i < (int)data_len-3; i+=4) {
255 						__m128i summ, mull;
256 						summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
257 						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
258 						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
259 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
260 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
261 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
262 						summ = _mm_sra_epi32(summ, cnt);
263 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
264 					}
265 				}
266 				else { /* order == 5 */
267 					__m128i q0, q1, q2, q3, q4;
268 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
269 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
270 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
271 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
272 					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
273 
274 					for(i = 0; i < (int)data_len-3; i+=4) {
275 						__m128i summ, mull;
276 						summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
277 						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
278 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
279 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
280 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
281 						summ = _mm_sra_epi32(summ, cnt);
282 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
283 					}
284 				}
285 			}
286 		}
287 		else {
288 			if(order > 2) {
289 				if(order == 4) {
290 					__m128i q0, q1, q2, q3;
291 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
292 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
293 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
294 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
295 
296 					for(i = 0; i < (int)data_len-3; i+=4) {
297 						__m128i summ, mull;
298 						summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
299 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
300 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
301 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
302 						summ = _mm_sra_epi32(summ, cnt);
303 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
304 					}
305 				}
306 				else { /* order == 3 */
307 					__m128i q0, q1, q2;
308 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
309 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
310 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
311 
312 					for(i = 0; i < (int)data_len-3; i+=4) {
313 						__m128i summ, mull;
314 						summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
315 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
316 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
317 						summ = _mm_sra_epi32(summ, cnt);
318 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
319 					}
320 				}
321 			}
322 			else {
323 				if(order == 2) {
324 					__m128i q0, q1;
325 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
326 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
327 
328 					for(i = 0; i < (int)data_len-3; i+=4) {
329 						__m128i summ, mull;
330 						summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
331 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
332 						summ = _mm_sra_epi32(summ, cnt);
333 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
334 					}
335 				}
336 				else { /* order == 1 */
337 					__m128i q0;
338 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
339 
340 					for(i = 0; i < (int)data_len-3; i+=4) {
341 						__m128i summ;
342 						summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
343 						summ = _mm_sra_epi32(summ, cnt);
344 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
345 					}
346 				}
347 			}
348 		}
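		/* handle the 0..3 trailing samples that the 4-wide SIMD loop above did not cover */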
349 		for(; i < (int)data_len; i++) {
350 			sum = 0;
351 			switch(order) {
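				/* deliberate fall-through: each case adds one more tap to the prediction */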
352 				case 12: sum += qlp_coeff[11] * data[i-12];
353 				case 11: sum += qlp_coeff[10] * data[i-11];
354 				case 10: sum += qlp_coeff[ 9] * data[i-10];
355 				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
356 				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
357 				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
358 				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
359 				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
360 				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
361 				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
362 				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
363 				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
364 			}
365 			residual[i] = data[i] - (sum >> lp_quantization);
366 		}
367 	}
368 	else { /* order > 12 */
369 		for(i = 0; i < (int)data_len; i++) {
370 			sum = 0;
371 			switch(order) {
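				/* deliberate fall-through: each case adds one more tap to the prediction */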
372 				case 32: sum += qlp_coeff[31] * data[i-32];
373 				case 31: sum += qlp_coeff[30] * data[i-31];
374 				case 30: sum += qlp_coeff[29] * data[i-30];
375 				case 29: sum += qlp_coeff[28] * data[i-29];
376 				case 28: sum += qlp_coeff[27] * data[i-28];
377 				case 27: sum += qlp_coeff[26] * data[i-27];
378 				case 26: sum += qlp_coeff[25] * data[i-26];
379 				case 25: sum += qlp_coeff[24] * data[i-25];
380 				case 24: sum += qlp_coeff[23] * data[i-24];
381 				case 23: sum += qlp_coeff[22] * data[i-23];
382 				case 22: sum += qlp_coeff[21] * data[i-22];
383 				case 21: sum += qlp_coeff[20] * data[i-21];
384 				case 20: sum += qlp_coeff[19] * data[i-20];
385 				case 19: sum += qlp_coeff[18] * data[i-19];
386 				case 18: sum += qlp_coeff[17] * data[i-18];
387 				case 17: sum += qlp_coeff[16] * data[i-17];
388 				case 16: sum += qlp_coeff[15] * data[i-16];
389 				case 15: sum += qlp_coeff[14] * data[i-15];
390 				case 14: sum += qlp_coeff[13] * data[i-14];
391 				case 13: sum += qlp_coeff[12] * data[i-13];
392 				         sum += qlp_coeff[11] * data[i-12];
393 				         sum += qlp_coeff[10] * data[i-11];
394 				         sum += qlp_coeff[ 9] * data[i-10];
395 				         sum += qlp_coeff[ 8] * data[i- 9];
396 				         sum += qlp_coeff[ 7] * data[i- 8];
397 				         sum += qlp_coeff[ 6] * data[i- 7];
398 				         sum += qlp_coeff[ 5] * data[i- 6];
399 				         sum += qlp_coeff[ 4] * data[i- 5];
400 				         sum += qlp_coeff[ 3] * data[i- 4];
401 				         sum += qlp_coeff[ 2] * data[i- 3];
402 				         sum += qlp_coeff[ 1] * data[i- 2];
403 				         sum += qlp_coeff[ 0] * data[i- 1];
404 			}
405 			residual[i] = data[i] - (sum >> lp_quantization);
406 		}
407 	}
408 }
409 
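/*
 * General (full 32-bit coefficient) residual computation: one sample per
 * iteration.  Coefficients are pre-shuffled two per register into dwords 0
 * and 2; each pair of history samples is shuffled the same way so a single
 * _mm_mul_epu32 forms both products at once.  Only the low 32 bits of each
 * product are kept (they equal the truncated signed products), and the two
 * partial sums are folded together before RESIDUAL32_RESULT.
 */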
410 FLAC__SSE_TARGET("sse2")
411 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
412 {
413 	int i;
414 
415 	FLAC__ASSERT(order > 0);
416 	FLAC__ASSERT(order <= 32);
417 
418 	if(order <= 12) {
419 		if(order > 8) { /* order == 9, 10, 11, 12 */
420 			if(order > 10) { /* order == 11, 12 */
421 				if(order == 12) {
422 					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
423 					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
424 					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
425 					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
426 					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
427 					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
428 					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]
429 
430 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
431 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
432 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
433 					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
434 					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
435 					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
436 
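					/* per sample: six coefficient-pair multiplies; dword 0 accumulates the
					   even-index taps, dword 2 the odd-index taps, and the two partial sums
					   are folded together just before RESIDUAL32_RESULT */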
437 					for(i = 0; i < (int)data_len; i++) {
438 						//sum = 0;
439 						//sum += qlp_coeff[11] * data[i-12];
440 						//sum += qlp_coeff[10] * data[i-11];
441 						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
442 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
443 						xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */
444 
445 						//sum += qlp_coeff[9] * data[i-10];
446 						//sum += qlp_coeff[8] * data[i-9];
447 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
448 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
449 						xmm6 = _mm_mul_epu32(xmm6, xmm4);
450 						xmm7 = _mm_add_epi32(xmm7, xmm6);
451 
452 						//sum += qlp_coeff[7] * data[i-8];
453 						//sum += qlp_coeff[6] * data[i-7];
454 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
455 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
456 						xmm6 = _mm_mul_epu32(xmm6, xmm3);
457 						xmm7 = _mm_add_epi32(xmm7, xmm6);
458 
459 						//sum += qlp_coeff[5] * data[i-6];
460 						//sum += qlp_coeff[4] * data[i-5];
461 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
462 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
463 						xmm6 = _mm_mul_epu32(xmm6, xmm2);
464 						xmm7 = _mm_add_epi32(xmm7, xmm6);
465 
466 						//sum += qlp_coeff[3] * data[i-4];
467 						//sum += qlp_coeff[2] * data[i-3];
468 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
469 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
470 						xmm6 = _mm_mul_epu32(xmm6, xmm1);
471 						xmm7 = _mm_add_epi32(xmm7, xmm6);
472 
473 						//sum += qlp_coeff[1] * data[i-2];
474 						//sum += qlp_coeff[0] * data[i-1];
475 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
476 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
477 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
478 						xmm7 = _mm_add_epi32(xmm7, xmm6);
479 
480 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
481 						RESIDUAL32_RESULT(xmm7);
482 					}
483 				}
484 				else { /* order == 11 */
485 					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
486 					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
487 					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
488 					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
489 					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
490 					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
491 					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
492 
493 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
494 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
495 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
496 					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
497 					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
498 
499 					for(i = 0; i < (int)data_len; i++) {
500 						//sum = 0;
501 						//sum  = qlp_coeff[10] * data[i-11];
502 						xmm7 = _mm_cvtsi32_si128(data[i-11]);
503 						xmm7 = _mm_mul_epu32(xmm7, xmm5);
504 
505 						//sum += qlp_coeff[9] * data[i-10];
506 						//sum += qlp_coeff[8] * data[i-9];
507 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
508 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
509 						xmm6 = _mm_mul_epu32(xmm6, xmm4);
510 						xmm7 = _mm_add_epi32(xmm7, xmm6);
511 
512 						//sum += qlp_coeff[7] * data[i-8];
513 						//sum += qlp_coeff[6] * data[i-7];
514 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
515 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
516 						xmm6 = _mm_mul_epu32(xmm6, xmm3);
517 						xmm7 = _mm_add_epi32(xmm7, xmm6);
518 
519 						//sum += qlp_coeff[5] * data[i-6];
520 						//sum += qlp_coeff[4] * data[i-5];
521 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
522 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
523 						xmm6 = _mm_mul_epu32(xmm6, xmm2);
524 						xmm7 = _mm_add_epi32(xmm7, xmm6);
525 
526 						//sum += qlp_coeff[3] * data[i-4];
527 						//sum += qlp_coeff[2] * data[i-3];
528 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
529 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
530 						xmm6 = _mm_mul_epu32(xmm6, xmm1);
531 						xmm7 = _mm_add_epi32(xmm7, xmm6);
532 
533 						//sum += qlp_coeff[1] * data[i-2];
534 						//sum += qlp_coeff[0] * data[i-1];
535 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
536 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
537 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
538 						xmm7 = _mm_add_epi32(xmm7, xmm6);
539 
540 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
541 						RESIDUAL32_RESULT(xmm7);
542 					}
543 				}
544 			}
545 			else { /* order == 9, 10 */
546 				if(order == 10) {
547 					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
548 					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
549 					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
550 					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
551 					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
552 					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
553 
554 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
555 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
556 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
557 					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
558 					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
559 
560 					for(i = 0; i < (int)data_len; i++) {
561 						//sum = 0;
562 						//sum += qlp_coeff[9] * data[i-10];
563 						//sum += qlp_coeff[8] * data[i-9];
564 						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
565 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
566 						xmm7 = _mm_mul_epu32(xmm7, xmm4);
567 
568 						//sum += qlp_coeff[7] * data[i-8];
569 						//sum += qlp_coeff[6] * data[i-7];
570 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
571 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
572 						xmm6 = _mm_mul_epu32(xmm6, xmm3);
573 						xmm7 = _mm_add_epi32(xmm7, xmm6);
574 
575 						//sum += qlp_coeff[5] * data[i-6];
576 						//sum += qlp_coeff[4] * data[i-5];
577 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
578 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
579 						xmm6 = _mm_mul_epu32(xmm6, xmm2);
580 						xmm7 = _mm_add_epi32(xmm7, xmm6);
581 
582 						//sum += qlp_coeff[3] * data[i-4];
583 						//sum += qlp_coeff[2] * data[i-3];
584 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
585 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
586 						xmm6 = _mm_mul_epu32(xmm6, xmm1);
587 						xmm7 = _mm_add_epi32(xmm7, xmm6);
588 
589 						//sum += qlp_coeff[1] * data[i-2];
590 						//sum += qlp_coeff[0] * data[i-1];
591 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
592 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
593 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
594 						xmm7 = _mm_add_epi32(xmm7, xmm6);
595 
596 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
597 						RESIDUAL32_RESULT(xmm7);
598 					}
599 				}
600 				else { /* order == 9 */
601 					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
602 					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
603 					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
604 					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
605 					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
606 					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
607 
608 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
609 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
610 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
611 					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
612 
613 					for(i = 0; i < (int)data_len; i++) {
614 						//sum = 0;
615 						//sum  = qlp_coeff[8] * data[i-9];
616 						xmm7 = _mm_cvtsi32_si128(data[i-9]);
617 						xmm7 = _mm_mul_epu32(xmm7, xmm4);
618 
619 						//sum += qlp_coeff[7] * data[i-8];
620 						//sum += qlp_coeff[6] * data[i-7];
621 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
622 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
623 						xmm6 = _mm_mul_epu32(xmm6, xmm3);
624 						xmm7 = _mm_add_epi32(xmm7, xmm6);
625 
626 						//sum += qlp_coeff[5] * data[i-6];
627 						//sum += qlp_coeff[4] * data[i-5];
628 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
629 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
630 						xmm6 = _mm_mul_epu32(xmm6, xmm2);
631 						xmm7 = _mm_add_epi32(xmm7, xmm6);
632 
633 						//sum += qlp_coeff[3] * data[i-4];
634 						//sum += qlp_coeff[2] * data[i-3];
635 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
636 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
637 						xmm6 = _mm_mul_epu32(xmm6, xmm1);
638 						xmm7 = _mm_add_epi32(xmm7, xmm6);
639 
640 						//sum += qlp_coeff[1] * data[i-2];
641 						//sum += qlp_coeff[0] * data[i-1];
642 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
643 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
644 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
645 						xmm7 = _mm_add_epi32(xmm7, xmm6);
646 
647 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
648 						RESIDUAL32_RESULT(xmm7);
649 					}
650 				}
651 			}
652 		}
653 		else if(order > 4) { /* order == 5, 6, 7, 8 */
654 			if(order > 6) { /* order == 7, 8 */
655 				if(order == 8) {
656 					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
657 					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
658 					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
659 					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
660 					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
661 
662 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
663 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
664 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
665 					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
666 
667 					for(i = 0; i < (int)data_len; i++) {
668 						//sum = 0;
669 						//sum += qlp_coeff[7] * data[i-8];
670 						//sum += qlp_coeff[6] * data[i-7];
671 						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
672 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
673 						xmm7 = _mm_mul_epu32(xmm7, xmm3);
674 
675 						//sum += qlp_coeff[5] * data[i-6];
676 						//sum += qlp_coeff[4] * data[i-5];
677 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
678 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
679 						xmm6 = _mm_mul_epu32(xmm6, xmm2);
680 						xmm7 = _mm_add_epi32(xmm7, xmm6);
681 
682 						//sum += qlp_coeff[3] * data[i-4];
683 						//sum += qlp_coeff[2] * data[i-3];
684 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
685 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
686 						xmm6 = _mm_mul_epu32(xmm6, xmm1);
687 						xmm7 = _mm_add_epi32(xmm7, xmm6);
688 
689 						//sum += qlp_coeff[1] * data[i-2];
690 						//sum += qlp_coeff[0] * data[i-1];
691 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
692 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
693 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
694 						xmm7 = _mm_add_epi32(xmm7, xmm6);
695 
696 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
697 						RESIDUAL32_RESULT(xmm7);
698 					}
699 				}
700 				else { /* order == 7 */
701 					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
702 					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
703 					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
704 					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
705 					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
706 
707 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
708 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
709 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
710 
711 					for(i = 0; i < (int)data_len; i++) {
712 						//sum = 0;
713 						//sum  = qlp_coeff[6] * data[i-7];
714 						xmm7 = _mm_cvtsi32_si128(data[i-7]);
715 						xmm7 = _mm_mul_epu32(xmm7, xmm3);
716 
717 						//sum += qlp_coeff[5] * data[i-6];
718 						//sum += qlp_coeff[4] * data[i-5];
719 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
720 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
721 						xmm6 = _mm_mul_epu32(xmm6, xmm2);
722 						xmm7 = _mm_add_epi32(xmm7, xmm6);
723 
724 						//sum += qlp_coeff[3] * data[i-4];
725 						//sum += qlp_coeff[2] * data[i-3];
726 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
727 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
728 						xmm6 = _mm_mul_epu32(xmm6, xmm1);
729 						xmm7 = _mm_add_epi32(xmm7, xmm6);
730 
731 						//sum += qlp_coeff[1] * data[i-2];
732 						//sum += qlp_coeff[0] * data[i-1];
733 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
734 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
735 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
736 						xmm7 = _mm_add_epi32(xmm7, xmm6);
737 
738 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
739 						RESIDUAL32_RESULT(xmm7);
740 					}
741 				}
742 			}
743 			else { /* order == 5, 6 */
744 				if(order == 6) {
745 					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
746 					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
747 					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
748 					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
749 
750 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
751 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
752 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
753 
754 					for(i = 0; i < (int)data_len; i++) {
755 						//sum = 0;
756 						//sum += qlp_coeff[5] * data[i-6];
757 						//sum += qlp_coeff[4] * data[i-5];
758 						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
759 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
760 						xmm7 = _mm_mul_epu32(xmm7, xmm2);
761 
762 						//sum += qlp_coeff[3] * data[i-4];
763 						//sum += qlp_coeff[2] * data[i-3];
764 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
765 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
766 						xmm6 = _mm_mul_epu32(xmm6, xmm1);
767 						xmm7 = _mm_add_epi32(xmm7, xmm6);
768 
769 						//sum += qlp_coeff[1] * data[i-2];
770 						//sum += qlp_coeff[0] * data[i-1];
771 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
772 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
773 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
774 						xmm7 = _mm_add_epi32(xmm7, xmm6);
775 
776 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
777 						RESIDUAL32_RESULT(xmm7);
778 					}
779 				}
780 				else { /* order == 5 */
781 					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
782 					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
783 					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
784 					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
785 
786 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
787 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
788 
789 					for(i = 0; i < (int)data_len; i++) {
790 						//sum = 0;
791 						//sum  = qlp_coeff[4] * data[i-5];
792 						xmm7 = _mm_cvtsi32_si128(data[i-5]);
793 						xmm7 = _mm_mul_epu32(xmm7, xmm2);
794 
795 						//sum += qlp_coeff[3] * data[i-4];
796 						//sum += qlp_coeff[2] * data[i-3];
797 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
798 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
799 						xmm6 = _mm_mul_epu32(xmm6, xmm1);
800 						xmm7 = _mm_add_epi32(xmm7, xmm6);
801 
802 						//sum += qlp_coeff[1] * data[i-2];
803 						//sum += qlp_coeff[0] * data[i-1];
804 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
805 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
806 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
807 						xmm7 = _mm_add_epi32(xmm7, xmm6);
808 
809 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
810 						RESIDUAL32_RESULT(xmm7);
811 					}
812 				}
813 			}
814 		}
815 		else { /* order == 1, 2, 3, 4 */
816 			if(order > 2) { /* order == 3, 4 */
817 				if(order == 4) {
818 					__m128i xmm0, xmm1, xmm6, xmm7;
819 					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
820 					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
821 
822 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
823 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
824 
825 					for(i = 0; i < (int)data_len; i++) {
826 						//sum = 0;
827 						//sum += qlp_coeff[3] * data[i-4];
828 						//sum += qlp_coeff[2] * data[i-3];
829 						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
830 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
831 						xmm7 = _mm_mul_epu32(xmm7, xmm1);
832 
833 						//sum += qlp_coeff[1] * data[i-2];
834 						//sum += qlp_coeff[0] * data[i-1];
835 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
836 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
837 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
838 						xmm7 = _mm_add_epi32(xmm7, xmm6);
839 
840 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
841 						RESIDUAL32_RESULT(xmm7);
842 					}
843 				}
844 				else { /* order == 3 */
845 					__m128i xmm0, xmm1, xmm6, xmm7;
846 					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
847 					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
848 
849 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
850 
851 					for(i = 0; i < (int)data_len; i++) {
852 						//sum = 0;
853 						//sum  = qlp_coeff[2] * data[i-3];
854 						xmm7 = _mm_cvtsi32_si128(data[i-3]);
855 						xmm7 = _mm_mul_epu32(xmm7, xmm1);
856 
857 						//sum += qlp_coeff[1] * data[i-2];
858 						//sum += qlp_coeff[0] * data[i-1];
859 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
860 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
861 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
862 						xmm7 = _mm_add_epi32(xmm7, xmm6);
863 
864 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
865 						RESIDUAL32_RESULT(xmm7);
866 					}
867 				}
868 			}
869 			else { /* order == 1, 2 */
870 				if(order == 2) {
871 					__m128i xmm0, xmm7;
872 					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
873 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
874 
875 					for(i = 0; i < (int)data_len; i++) {
876 						//sum = 0;
877 						//sum += qlp_coeff[1] * data[i-2];
878 						//sum += qlp_coeff[0] * data[i-1];
879 						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
880 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
881 						xmm7 = _mm_mul_epu32(xmm7, xmm0);
882 
883 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
884 						RESIDUAL32_RESULT(xmm7);
885 					}
886 				}
887 				else { /* order == 1 */
888 					for(i = 0; i < (int)data_len; i++)
889 						residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
890 				}
891 			}
892 		}
893 	}
894 	else { /* order > 12 */
895 		FLAC__int32 sum;
896 		for(i = 0; i < (int)data_len; i++) {
897 			sum = 0;
898 			switch(order) {
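				/* deliberate fall-through: each case adds one more tap to the prediction */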
899 				case 32: sum += qlp_coeff[31] * data[i-32];
900 				case 31: sum += qlp_coeff[30] * data[i-31];
901 				case 30: sum += qlp_coeff[29] * data[i-30];
902 				case 29: sum += qlp_coeff[28] * data[i-29];
903 				case 28: sum += qlp_coeff[27] * data[i-28];
904 				case 27: sum += qlp_coeff[26] * data[i-27];
905 				case 26: sum += qlp_coeff[25] * data[i-26];
906 				case 25: sum += qlp_coeff[24] * data[i-25];
907 				case 24: sum += qlp_coeff[23] * data[i-24];
908 				case 23: sum += qlp_coeff[22] * data[i-23];
909 				case 22: sum += qlp_coeff[21] * data[i-22];
910 				case 21: sum += qlp_coeff[20] * data[i-21];
911 				case 20: sum += qlp_coeff[19] * data[i-20];
912 				case 19: sum += qlp_coeff[18] * data[i-19];
913 				case 18: sum += qlp_coeff[17] * data[i-18];
914 				case 17: sum += qlp_coeff[16] * data[i-17];
915 				case 16: sum += qlp_coeff[15] * data[i-16];
916 				case 15: sum += qlp_coeff[14] * data[i-15];
917 				case 14: sum += qlp_coeff[13] * data[i-14];
918 				case 13: sum += qlp_coeff[12] * data[i-13];
919 				         sum += qlp_coeff[11] * data[i-12];
920 				         sum += qlp_coeff[10] * data[i-11];
921 				         sum += qlp_coeff[ 9] * data[i-10];
922 				         sum += qlp_coeff[ 8] * data[i- 9];
923 				         sum += qlp_coeff[ 7] * data[i- 8];
924 				         sum += qlp_coeff[ 6] * data[i- 7];
925 				         sum += qlp_coeff[ 5] * data[i- 6];
926 				         sum += qlp_coeff[ 4] * data[i- 5];
927 				         sum += qlp_coeff[ 3] * data[i- 4];
928 				         sum += qlp_coeff[ 2] * data[i- 3];
929 				         sum += qlp_coeff[ 1] * data[i- 2];
930 				         sum += qlp_coeff[ 0] * data[i- 1];
931 			}
932 			residual[i] = data[i] - (sum >> lp_quantization);
933 		}
934 	}
935 }
936 
937 #if defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM /* unused for x64; not better than MMX asm */
938 
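/*
 * Signal restoration is inherently serial (each restored sample feeds the
 * next prediction), so this works one sample at a time for orders 8..12
 * (anything else falls back to FLAC__lpc_restore_signal): coefficients and
 * the history window are packed to 16-bit lanes with _mm_packs_epi32 and the
 * prediction is reduced with _mm_madd_epi16 plus two horizontal adds.
 */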
939 FLAC__SSE_TARGET("sse2")
940 void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
941 {
942 	if (order < 8 || order > 12) {
943 		FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
944 		return;
945 	}
946 	if (data_len == 0)
947 		return;
948 
949 	FLAC__ASSERT(order >= 8);
950 	FLAC__ASSERT(order <= 12);
951 
952 	if(order > 8) { /* order == 9, 10, 11, 12 */
953 		FLAC__int32 curr;
954 		__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
955 		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
956 		xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
957 		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
958 		switch(order)                                          /* ...and zero them out */
959 		{
960 		case 9:
961 			xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
962 		case 10:
963 			xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
964 		case 11:
965 			xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
966 		}
967 		xmm2 = _mm_setzero_si128();
968 		xmm0 = _mm_packs_epi32(xmm0, xmm6);
969 		xmm1 = _mm_packs_epi32(xmm1, xmm2);
970 
971 		xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
972 		xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
973 		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
974 		xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
975 		xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
976 		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
977 		xmm4 = _mm_packs_epi32(xmm4, xmm2);
978 		xmm3 = _mm_packs_epi32(xmm3, xmm5);
979 
980 		xmm7 = _mm_slli_si128(xmm1, 2);
981 		xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
982 		xmm2 = _mm_slli_si128(xmm0, 2);
983 
984 		/* xmm0, xmm1: qlp_coeff
985 			xmm2, xmm7: qlp_coeff << 16 bit
986 			xmm3, xmm4: data */
987 
988 		xmm5 = _mm_madd_epi16(xmm4, xmm1);
989 		xmm6 = _mm_madd_epi16(xmm3, xmm0);
990 		xmm6 = _mm_add_epi32(xmm6, xmm5);
991 		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
992 		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
993 
994 		DATA16_RESULT(xmm6);
995 
996 		data_len--;
997 
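		/* the main loop below shifts the 16-bit history by two samples per pass
		   (using the pre-shifted coefficient copies in xmm2:xmm7), so peel off one
		   sample first if an odd count remains */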
998 		if(data_len % 2) {
999 			xmm6 = _mm_srli_si128(xmm3, 14);
1000 			xmm4 = _mm_slli_si128(xmm4, 2);
1001 			xmm3 = _mm_slli_si128(xmm3, 2);
1002 			xmm4 = _mm_or_si128(xmm4, xmm6);
1003 			xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1004 
1005 			xmm5 = _mm_madd_epi16(xmm4, xmm1);
1006 			xmm6 = _mm_madd_epi16(xmm3, xmm0);
1007 			xmm6 = _mm_add_epi32(xmm6, xmm5);
1008 			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1009 			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1010 
1011 			DATA16_RESULT(xmm6);
1012 
1013 			data_len--;
1014 		}
1015 
1016 		while(data_len) { /* data_len is a multiple of 2 */
1017 			/* 1 _mm_slli_si128 per data element less but we need shifted qlp_coeff in xmm2:xmm7 */
1018 			xmm6 = _mm_srli_si128(xmm3, 12);
1019 			xmm4 = _mm_slli_si128(xmm4, 4);
1020 			xmm3 = _mm_slli_si128(xmm3, 4);
1021 			xmm4 = _mm_or_si128(xmm4, xmm6);
1022 			xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1023 
1024 			xmm5 = _mm_madd_epi16(xmm4, xmm7);
1025 			xmm6 = _mm_madd_epi16(xmm3, xmm2);
1026 			xmm6 = _mm_add_epi32(xmm6, xmm5);
1027 			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1028 			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1029 
1030 			DATA16_RESULT(xmm6);
1031 
1032 			xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1033 
1034 			xmm5 = _mm_madd_epi16(xmm4, xmm1);
1035 			xmm6 = _mm_madd_epi16(xmm3, xmm0);
1036 			xmm6 = _mm_add_epi32(xmm6, xmm5);
1037 			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1038 			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1039 
1040 			DATA16_RESULT(xmm6);
1041 
1042 			data_len-=2;
1043 		}
1044 	} /* endif(order > 8) */
1045 	else
1046 	{
1047 		FLAC__int32 curr;
1048 		__m128i xmm0, xmm1, xmm3, xmm6;
1049 		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1050 		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1051 		xmm0 = _mm_packs_epi32(xmm0, xmm1);
1052 
1053 		xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1054 		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1055 		xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1056 		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1057 		xmm3 = _mm_packs_epi32(xmm3, xmm1);
1058 
1059 		/* xmm0: qlp_coeff
1060 			xmm3: data */
1061 
1062 		xmm6 = _mm_madd_epi16(xmm3, xmm0);
1063 		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1064 		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1065 
1066 		DATA16_RESULT(xmm6);
1067 
1068 		data_len--;
1069 
1070 		while(data_len) {
1071 			xmm3 = _mm_slli_si128(xmm3, 2);
1072 			xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1073 
1074 			xmm6 = _mm_madd_epi16(xmm3, xmm0);
1075 			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1076 			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1077 
1078 			DATA16_RESULT(xmm6);
1079 
1080 			data_len--;
1081 		}
1082 	}
1083 }
1084 
1085 #endif /* defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM */
1086 
1087 #endif /* FLAC__SSE2_SUPPORTED */
1088 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
1089 #endif /* FLAC__NO_ASM */
1090 #endif /* FLAC__INTEGER_ONLY_LIBRARY */
1091