1;  vim:filetype=nasm ts=8
2
3;  libFLAC - Free Lossless Audio Codec library
4;  Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
5;
6;  Redistribution and use in source and binary forms, with or without
7;  modification, are permitted provided that the following conditions
8;  are met:
9;
10;  - Redistributions of source code must retain the above copyright
11;  notice, this list of conditions and the following disclaimer.
12;
13;  - Redistributions in binary form must reproduce the above copyright
14;  notice, this list of conditions and the following disclaimer in the
15;  documentation and/or other materials provided with the distribution.
16;
17;  - Neither the name of the Xiph.org Foundation nor the names of its
18;  contributors may be used to endorse or promote products derived from
19;  this software without specific prior written permission.
20;
21;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
25;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33%include "nasm.h"
34
35	data_section
36
37cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
38cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
39cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
40cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
41cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
42cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
43cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
44cglobal FLAC__lpc_restore_signal_asm_ia32
45cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
46
47	code_section
48
49; **********************************************************************
50;
51; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
52; {
53;	FLAC__real d;
54;	unsigned sample, coeff;
55;	const unsigned limit = data_len - lag;
56;
57;	FLAC__ASSERT(lag > 0);
58;	FLAC__ASSERT(lag <= data_len);
59;
60;	for(coeff = 0; coeff < lag; coeff++)
61;		autoc[coeff] = 0.0;
62;	for(sample = 0; sample <= limit; sample++) {
63;		d = data[sample];
64;		for(coeff = 0; coeff < lag; coeff++)
65;			autoc[coeff] += d * data[sample+coeff];
66;	}
67;	for(; sample < data_len; sample++) {
68;		d = data[sample];
69;		for(coeff = 0; coeff < data_len - sample; coeff++)
70;			autoc[coeff] += d * data[sample+coeff];
71;	}
72; }
73;
74	ALIGN 16
75cident FLAC__lpc_compute_autocorrelation_asm_ia32
76	;[esp + 28] == autoc[]
77	;[esp + 24] == lag
78	;[esp + 20] == data_len
79	;[esp + 16] == data[]
80
81	;ASSERT(lag > 0)
82	;ASSERT(lag <= 33)
83	;ASSERT(lag <= data_len)
84
85.begin:
86	push	esi
87	push	edi
88	push	ebx
89
90	;	for(coeff = 0; coeff < lag; coeff++)
91	;		autoc[coeff] = 0.0;
92	mov	edi, [esp + 28]			; edi == autoc
93	mov	ecx, [esp + 24]			; ecx = # of dwords (=lag) of 0 to write
94	xor	eax, eax
95	rep	stosd
96
97	;	const unsigned limit = data_len - lag;
98	mov	eax, [esp + 24]			; eax == lag
99	mov	ecx, [esp + 20]
100	sub	ecx, eax			; ecx == limit
101
102	mov	edi, [esp + 28]			; edi == autoc
103	mov	esi, [esp + 16]			; esi == data
104	inc	ecx				; we are looping <= limit so we add one to the counter
105
106	;	for(sample = 0; sample <= limit; sample++) {
107	;		d = data[sample];
108	;		for(coeff = 0; coeff < lag; coeff++)
109	;			autoc[coeff] += d * data[sample+coeff];
110	;	}
111	fld	dword [esi]			; ST = d <- data[sample]
112	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
113	lea	edx, [eax + eax*2]
114	neg	edx
115	lea	edx, [eax + edx*4 + .jumper1_0 - .get_eip1]
116	call	.get_eip1
117.get_eip1:
118	pop	ebx
119	add	edx, ebx
120	inc	edx				; compensate for the shorter opcode on the last iteration
121	inc	edx				; compensate for the shorter opcode on the last iteration
122	inc	edx				; compensate for the shorter opcode on the last iteration
123	cmp	eax, 33
124	jne	.loop1_start
125	sub	edx, byte 9			; compensate for the longer opcodes on the first iteration
126.loop1_start:
127	jmp	edx
128
129	fld	st0				; ST = d d
130	fmul	dword [esi + (32*4)]		; ST = d*data[sample+32] d		WATCHOUT: not a byte displacement here!
131	fadd	dword [edi + (32*4)]		; ST = autoc[32]+d*data[sample+32] d	WATCHOUT: not a byte displacement here!
132	fstp	dword [edi + (32*4)]		; autoc[32]+=d*data[sample+32]  ST = d	WATCHOUT: not a byte displacement here!
133	fld	st0				; ST = d d
134	fmul	dword [esi + (31*4)]		; ST = d*data[sample+31] d
135	fadd	dword [edi + (31*4)]		; ST = autoc[31]+d*data[sample+31] d
136	fstp	dword [edi + (31*4)]		; autoc[31]+=d*data[sample+31]  ST = d
137	fld	st0				; ST = d d
138	fmul	dword [esi + (30*4)]		; ST = d*data[sample+30] d
139	fadd	dword [edi + (30*4)]		; ST = autoc[30]+d*data[sample+30] d
140	fstp	dword [edi + (30*4)]		; autoc[30]+=d*data[sample+30]  ST = d
141	fld	st0				; ST = d d
142	fmul	dword [esi + (29*4)]		; ST = d*data[sample+29] d
143	fadd	dword [edi + (29*4)]		; ST = autoc[29]+d*data[sample+29] d
144	fstp	dword [edi + (29*4)]		; autoc[29]+=d*data[sample+29]  ST = d
145	fld	st0				; ST = d d
146	fmul	dword [esi + (28*4)]		; ST = d*data[sample+28] d
147	fadd	dword [edi + (28*4)]		; ST = autoc[28]+d*data[sample+28] d
148	fstp	dword [edi + (28*4)]		; autoc[28]+=d*data[sample+28]  ST = d
149	fld	st0				; ST = d d
150	fmul	dword [esi + (27*4)]		; ST = d*data[sample+27] d
151	fadd	dword [edi + (27*4)]		; ST = autoc[27]+d*data[sample+27] d
152	fstp	dword [edi + (27*4)]		; autoc[27]+=d*data[sample+27]  ST = d
153	fld	st0				; ST = d d
154	fmul	dword [esi + (26*4)]		; ST = d*data[sample+26] d
155	fadd	dword [edi + (26*4)]		; ST = autoc[26]+d*data[sample+26] d
156	fstp	dword [edi + (26*4)]		; autoc[26]+=d*data[sample+26]  ST = d
157	fld	st0				; ST = d d
158	fmul	dword [esi + (25*4)]		; ST = d*data[sample+25] d
159	fadd	dword [edi + (25*4)]		; ST = autoc[25]+d*data[sample+25] d
160	fstp	dword [edi + (25*4)]		; autoc[25]+=d*data[sample+25]  ST = d
161	fld	st0				; ST = d d
162	fmul	dword [esi + (24*4)]		; ST = d*data[sample+24] d
163	fadd	dword [edi + (24*4)]		; ST = autoc[24]+d*data[sample+24] d
164	fstp	dword [edi + (24*4)]		; autoc[24]+=d*data[sample+24]  ST = d
165	fld	st0				; ST = d d
166	fmul	dword [esi + (23*4)]		; ST = d*data[sample+23] d
167	fadd	dword [edi + (23*4)]		; ST = autoc[23]+d*data[sample+23] d
168	fstp	dword [edi + (23*4)]		; autoc[23]+=d*data[sample+23]  ST = d
169	fld	st0				; ST = d d
170	fmul	dword [esi + (22*4)]		; ST = d*data[sample+22] d
171	fadd	dword [edi + (22*4)]		; ST = autoc[22]+d*data[sample+22] d
172	fstp	dword [edi + (22*4)]		; autoc[22]+=d*data[sample+22]  ST = d
173	fld	st0				; ST = d d
174	fmul	dword [esi + (21*4)]		; ST = d*data[sample+21] d
175	fadd	dword [edi + (21*4)]		; ST = autoc[21]+d*data[sample+21] d
176	fstp	dword [edi + (21*4)]		; autoc[21]+=d*data[sample+21]  ST = d
177	fld	st0				; ST = d d
178	fmul	dword [esi + (20*4)]		; ST = d*data[sample+20] d
179	fadd	dword [edi + (20*4)]		; ST = autoc[20]+d*data[sample+20] d
180	fstp	dword [edi + (20*4)]		; autoc[20]+=d*data[sample+20]  ST = d
181	fld	st0				; ST = d d
182	fmul	dword [esi + (19*4)]		; ST = d*data[sample+19] d
183	fadd	dword [edi + (19*4)]		; ST = autoc[19]+d*data[sample+19] d
184	fstp	dword [edi + (19*4)]		; autoc[19]+=d*data[sample+19]  ST = d
185	fld	st0				; ST = d d
186	fmul	dword [esi + (18*4)]		; ST = d*data[sample+18] d
187	fadd	dword [edi + (18*4)]		; ST = autoc[18]+d*data[sample+18] d
188	fstp	dword [edi + (18*4)]		; autoc[18]+=d*data[sample+18]  ST = d
189	fld	st0				; ST = d d
190	fmul	dword [esi + (17*4)]		; ST = d*data[sample+17] d
191	fadd	dword [edi + (17*4)]		; ST = autoc[17]+d*data[sample+17] d
192	fstp	dword [edi + (17*4)]		; autoc[17]+=d*data[sample+17]  ST = d
193	fld	st0				; ST = d d
194	fmul	dword [esi + (16*4)]		; ST = d*data[sample+16] d
195	fadd	dword [edi + (16*4)]		; ST = autoc[16]+d*data[sample+16] d
196	fstp	dword [edi + (16*4)]		; autoc[16]+=d*data[sample+16]  ST = d
197	fld	st0				; ST = d d
198	fmul	dword [esi + (15*4)]		; ST = d*data[sample+15] d
199	fadd	dword [edi + (15*4)]		; ST = autoc[15]+d*data[sample+15] d
200	fstp	dword [edi + (15*4)]		; autoc[15]+=d*data[sample+15]  ST = d
201	fld	st0				; ST = d d
202	fmul	dword [esi + (14*4)]		; ST = d*data[sample+14] d
203	fadd	dword [edi + (14*4)]		; ST = autoc[14]+d*data[sample+14] d
204	fstp	dword [edi + (14*4)]		; autoc[14]+=d*data[sample+14]  ST = d
205	fld	st0				; ST = d d
206	fmul	dword [esi + (13*4)]		; ST = d*data[sample+13] d
207	fadd	dword [edi + (13*4)]		; ST = autoc[13]+d*data[sample+13] d
208	fstp	dword [edi + (13*4)]		; autoc[13]+=d*data[sample+13]  ST = d
209	fld	st0				; ST = d d
210	fmul	dword [esi + (12*4)]		; ST = d*data[sample+12] d
211	fadd	dword [edi + (12*4)]		; ST = autoc[12]+d*data[sample+12] d
212	fstp	dword [edi + (12*4)]		; autoc[12]+=d*data[sample+12]  ST = d
213	fld	st0				; ST = d d
214	fmul	dword [esi + (11*4)]		; ST = d*data[sample+11] d
215	fadd	dword [edi + (11*4)]		; ST = autoc[11]+d*data[sample+11] d
216	fstp	dword [edi + (11*4)]		; autoc[11]+=d*data[sample+11]  ST = d
217	fld	st0				; ST = d d
218	fmul	dword [esi + (10*4)]		; ST = d*data[sample+10] d
219	fadd	dword [edi + (10*4)]		; ST = autoc[10]+d*data[sample+10] d
220	fstp	dword [edi + (10*4)]		; autoc[10]+=d*data[sample+10]  ST = d
221	fld	st0				; ST = d d
222	fmul	dword [esi + ( 9*4)]		; ST = d*data[sample+9] d
223	fadd	dword [edi + ( 9*4)]		; ST = autoc[9]+d*data[sample+9] d
224	fstp	dword [edi + ( 9*4)]		; autoc[9]+=d*data[sample+9]  ST = d
225	fld	st0				; ST = d d
226	fmul	dword [esi + ( 8*4)]		; ST = d*data[sample+8] d
227	fadd	dword [edi + ( 8*4)]		; ST = autoc[8]+d*data[sample+8] d
228	fstp	dword [edi + ( 8*4)]		; autoc[8]+=d*data[sample+8]  ST = d
229	fld	st0				; ST = d d
230	fmul	dword [esi + ( 7*4)]		; ST = d*data[sample+7] d
231	fadd	dword [edi + ( 7*4)]		; ST = autoc[7]+d*data[sample+7] d
232	fstp	dword [edi + ( 7*4)]		; autoc[7]+=d*data[sample+7]  ST = d
233	fld	st0				; ST = d d
234	fmul	dword [esi + ( 6*4)]		; ST = d*data[sample+6] d
235	fadd	dword [edi + ( 6*4)]		; ST = autoc[6]+d*data[sample+6] d
236	fstp	dword [edi + ( 6*4)]		; autoc[6]+=d*data[sample+6]  ST = d
237	fld	st0				; ST = d d
238	fmul	dword [esi + ( 5*4)]		; ST = d*data[sample+4] d
239	fadd	dword [edi + ( 5*4)]		; ST = autoc[4]+d*data[sample+4] d
240	fstp	dword [edi + ( 5*4)]		; autoc[4]+=d*data[sample+4]  ST = d
241	fld	st0				; ST = d d
242	fmul	dword [esi + ( 4*4)]		; ST = d*data[sample+4] d
243	fadd	dword [edi + ( 4*4)]		; ST = autoc[4]+d*data[sample+4] d
244	fstp	dword [edi + ( 4*4)]		; autoc[4]+=d*data[sample+4]  ST = d
245	fld	st0				; ST = d d
246	fmul	dword [esi + ( 3*4)]		; ST = d*data[sample+3] d
247	fadd	dword [edi + ( 3*4)]		; ST = autoc[3]+d*data[sample+3] d
248	fstp	dword [edi + ( 3*4)]		; autoc[3]+=d*data[sample+3]  ST = d
249	fld	st0				; ST = d d
250	fmul	dword [esi + ( 2*4)]		; ST = d*data[sample+2] d
251	fadd	dword [edi + ( 2*4)]		; ST = autoc[2]+d*data[sample+2] d
252	fstp	dword [edi + ( 2*4)]		; autoc[2]+=d*data[sample+2]  ST = d
253	fld	st0				; ST = d d
254	fmul	dword [esi + ( 1*4)]		; ST = d*data[sample+1] d
255	fadd	dword [edi + ( 1*4)]		; ST = autoc[1]+d*data[sample+1] d
256	fstp	dword [edi + ( 1*4)]		; autoc[1]+=d*data[sample+1]  ST = d
257	fld	st0				; ST = d d
258	fmul	dword [esi]			; ST = d*data[sample] d			WATCHOUT: no displacement byte here!
259	fadd	dword [edi]			; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
260	fstp	dword [edi]			; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
261.jumper1_0:
262
263	fstp	st0				; pop d, ST = empty
264	add	esi, byte 4			; sample++
265	dec	ecx
266	jz	.loop1_end
267	fld	dword [esi]			; ST = d <- data[sample]
268	jmp	edx
269.loop1_end:
270
271	;	for(; sample < data_len; sample++) {
272	;		d = data[sample];
273	;		for(coeff = 0; coeff < data_len - sample; coeff++)
274	;			autoc[coeff] += d * data[sample+coeff];
275	;	}
276	mov	ecx, [esp + 24]			; ecx <- lag
277	dec	ecx				; ecx <- lag - 1
278	jz	near .end			; skip loop if 0 (i.e. lag == 1)
279
280	fld	dword [esi]			; ST = d <- data[sample]
281	mov	eax, ecx			; eax <- lag - 1 == data_len - sample the first time through
282	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
283	lea	edx, [eax + eax*2]
284	neg	edx
285	lea	edx, [eax + edx*4 + .jumper2_0 - .get_eip2]
286	call	.get_eip2
287.get_eip2:
288	pop	ebx
289	add	edx, ebx
290	inc	edx				; compensate for the shorter opcode on the last iteration
291	inc	edx				; compensate for the shorter opcode on the last iteration
292	inc	edx				; compensate for the shorter opcode on the last iteration
293	jmp	edx
294
295	fld	st0				; ST = d d
296	fmul	dword [esi + (31*4)]		; ST = d*data[sample+31] d
297	fadd	dword [edi + (31*4)]		; ST = autoc[31]+d*data[sample+31] d
298	fstp	dword [edi + (31*4)]		; autoc[31]+=d*data[sample+31]  ST = d
299	fld	st0				; ST = d d
300	fmul	dword [esi + (30*4)]		; ST = d*data[sample+30] d
301	fadd	dword [edi + (30*4)]		; ST = autoc[30]+d*data[sample+30] d
302	fstp	dword [edi + (30*4)]		; autoc[30]+=d*data[sample+30]  ST = d
303	fld	st0				; ST = d d
304	fmul	dword [esi + (29*4)]		; ST = d*data[sample+29] d
305	fadd	dword [edi + (29*4)]		; ST = autoc[29]+d*data[sample+29] d
306	fstp	dword [edi + (29*4)]		; autoc[29]+=d*data[sample+29]  ST = d
307	fld	st0				; ST = d d
308	fmul	dword [esi + (28*4)]		; ST = d*data[sample+28] d
309	fadd	dword [edi + (28*4)]		; ST = autoc[28]+d*data[sample+28] d
310	fstp	dword [edi + (28*4)]		; autoc[28]+=d*data[sample+28]  ST = d
311	fld	st0				; ST = d d
312	fmul	dword [esi + (27*4)]		; ST = d*data[sample+27] d
313	fadd	dword [edi + (27*4)]		; ST = autoc[27]+d*data[sample+27] d
314	fstp	dword [edi + (27*4)]		; autoc[27]+=d*data[sample+27]  ST = d
315	fld	st0				; ST = d d
316	fmul	dword [esi + (26*4)]		; ST = d*data[sample+26] d
317	fadd	dword [edi + (26*4)]		; ST = autoc[26]+d*data[sample+26] d
318	fstp	dword [edi + (26*4)]		; autoc[26]+=d*data[sample+26]  ST = d
319	fld	st0				; ST = d d
320	fmul	dword [esi + (25*4)]		; ST = d*data[sample+25] d
321	fadd	dword [edi + (25*4)]		; ST = autoc[25]+d*data[sample+25] d
322	fstp	dword [edi + (25*4)]		; autoc[25]+=d*data[sample+25]  ST = d
323	fld	st0				; ST = d d
324	fmul	dword [esi + (24*4)]		; ST = d*data[sample+24] d
325	fadd	dword [edi + (24*4)]		; ST = autoc[24]+d*data[sample+24] d
326	fstp	dword [edi + (24*4)]		; autoc[24]+=d*data[sample+24]  ST = d
327	fld	st0				; ST = d d
328	fmul	dword [esi + (23*4)]		; ST = d*data[sample+23] d
329	fadd	dword [edi + (23*4)]		; ST = autoc[23]+d*data[sample+23] d
330	fstp	dword [edi + (23*4)]		; autoc[23]+=d*data[sample+23]  ST = d
331	fld	st0				; ST = d d
332	fmul	dword [esi + (22*4)]		; ST = d*data[sample+22] d
333	fadd	dword [edi + (22*4)]		; ST = autoc[22]+d*data[sample+22] d
334	fstp	dword [edi + (22*4)]		; autoc[22]+=d*data[sample+22]  ST = d
335	fld	st0				; ST = d d
336	fmul	dword [esi + (21*4)]		; ST = d*data[sample+21] d
337	fadd	dword [edi + (21*4)]		; ST = autoc[21]+d*data[sample+21] d
338	fstp	dword [edi + (21*4)]		; autoc[21]+=d*data[sample+21]  ST = d
339	fld	st0				; ST = d d
340	fmul	dword [esi + (20*4)]		; ST = d*data[sample+20] d
341	fadd	dword [edi + (20*4)]		; ST = autoc[20]+d*data[sample+20] d
342	fstp	dword [edi + (20*4)]		; autoc[20]+=d*data[sample+20]  ST = d
343	fld	st0				; ST = d d
344	fmul	dword [esi + (19*4)]		; ST = d*data[sample+19] d
345	fadd	dword [edi + (19*4)]		; ST = autoc[19]+d*data[sample+19] d
346	fstp	dword [edi + (19*4)]		; autoc[19]+=d*data[sample+19]  ST = d
347	fld	st0				; ST = d d
348	fmul	dword [esi + (18*4)]		; ST = d*data[sample+18] d
349	fadd	dword [edi + (18*4)]		; ST = autoc[18]+d*data[sample+18] d
350	fstp	dword [edi + (18*4)]		; autoc[18]+=d*data[sample+18]  ST = d
351	fld	st0				; ST = d d
352	fmul	dword [esi + (17*4)]		; ST = d*data[sample+17] d
353	fadd	dword [edi + (17*4)]		; ST = autoc[17]+d*data[sample+17] d
354	fstp	dword [edi + (17*4)]		; autoc[17]+=d*data[sample+17]  ST = d
355	fld	st0				; ST = d d
356	fmul	dword [esi + (16*4)]		; ST = d*data[sample+16] d
357	fadd	dword [edi + (16*4)]		; ST = autoc[16]+d*data[sample+16] d
358	fstp	dword [edi + (16*4)]		; autoc[16]+=d*data[sample+16]  ST = d
359	fld	st0				; ST = d d
360	fmul	dword [esi + (15*4)]		; ST = d*data[sample+15] d
361	fadd	dword [edi + (15*4)]		; ST = autoc[15]+d*data[sample+15] d
362	fstp	dword [edi + (15*4)]		; autoc[15]+=d*data[sample+15]  ST = d
363	fld	st0				; ST = d d
364	fmul	dword [esi + (14*4)]		; ST = d*data[sample+14] d
365	fadd	dword [edi + (14*4)]		; ST = autoc[14]+d*data[sample+14] d
366	fstp	dword [edi + (14*4)]		; autoc[14]+=d*data[sample+14]  ST = d
367	fld	st0				; ST = d d
368	fmul	dword [esi + (13*4)]		; ST = d*data[sample+13] d
369	fadd	dword [edi + (13*4)]		; ST = autoc[13]+d*data[sample+13] d
370	fstp	dword [edi + (13*4)]		; autoc[13]+=d*data[sample+13]  ST = d
371	fld	st0				; ST = d d
372	fmul	dword [esi + (12*4)]		; ST = d*data[sample+12] d
373	fadd	dword [edi + (12*4)]		; ST = autoc[12]+d*data[sample+12] d
374	fstp	dword [edi + (12*4)]		; autoc[12]+=d*data[sample+12]  ST = d
375	fld	st0				; ST = d d
376	fmul	dword [esi + (11*4)]		; ST = d*data[sample+11] d
377	fadd	dword [edi + (11*4)]		; ST = autoc[11]+d*data[sample+11] d
378	fstp	dword [edi + (11*4)]		; autoc[11]+=d*data[sample+11]  ST = d
379	fld	st0				; ST = d d
380	fmul	dword [esi + (10*4)]		; ST = d*data[sample+10] d
381	fadd	dword [edi + (10*4)]		; ST = autoc[10]+d*data[sample+10] d
382	fstp	dword [edi + (10*4)]		; autoc[10]+=d*data[sample+10]  ST = d
383	fld	st0				; ST = d d
384	fmul	dword [esi + ( 9*4)]		; ST = d*data[sample+9] d
385	fadd	dword [edi + ( 9*4)]		; ST = autoc[9]+d*data[sample+9] d
386	fstp	dword [edi + ( 9*4)]		; autoc[9]+=d*data[sample+9]  ST = d
387	fld	st0				; ST = d d
388	fmul	dword [esi + ( 8*4)]		; ST = d*data[sample+8] d
389	fadd	dword [edi + ( 8*4)]		; ST = autoc[8]+d*data[sample+8] d
390	fstp	dword [edi + ( 8*4)]		; autoc[8]+=d*data[sample+8]  ST = d
391	fld	st0				; ST = d d
392	fmul	dword [esi + ( 7*4)]		; ST = d*data[sample+7] d
393	fadd	dword [edi + ( 7*4)]		; ST = autoc[7]+d*data[sample+7] d
394	fstp	dword [edi + ( 7*4)]		; autoc[7]+=d*data[sample+7]  ST = d
395	fld	st0				; ST = d d
396	fmul	dword [esi + ( 6*4)]		; ST = d*data[sample+6] d
397	fadd	dword [edi + ( 6*4)]		; ST = autoc[6]+d*data[sample+6] d
398	fstp	dword [edi + ( 6*4)]		; autoc[6]+=d*data[sample+6]  ST = d
399	fld	st0				; ST = d d
400	fmul	dword [esi + ( 5*4)]		; ST = d*data[sample+4] d
401	fadd	dword [edi + ( 5*4)]		; ST = autoc[4]+d*data[sample+4] d
402	fstp	dword [edi + ( 5*4)]		; autoc[4]+=d*data[sample+4]  ST = d
403	fld	st0				; ST = d d
404	fmul	dword [esi + ( 4*4)]		; ST = d*data[sample+4] d
405	fadd	dword [edi + ( 4*4)]		; ST = autoc[4]+d*data[sample+4] d
406	fstp	dword [edi + ( 4*4)]		; autoc[4]+=d*data[sample+4]  ST = d
407	fld	st0				; ST = d d
408	fmul	dword [esi + ( 3*4)]		; ST = d*data[sample+3] d
409	fadd	dword [edi + ( 3*4)]		; ST = autoc[3]+d*data[sample+3] d
410	fstp	dword [edi + ( 3*4)]		; autoc[3]+=d*data[sample+3]  ST = d
411	fld	st0				; ST = d d
412	fmul	dword [esi + ( 2*4)]		; ST = d*data[sample+2] d
413	fadd	dword [edi + ( 2*4)]		; ST = autoc[2]+d*data[sample+2] d
414	fstp	dword [edi + ( 2*4)]		; autoc[2]+=d*data[sample+2]  ST = d
415	fld	st0				; ST = d d
416	fmul	dword [esi + ( 1*4)]		; ST = d*data[sample+1] d
417	fadd	dword [edi + ( 1*4)]		; ST = autoc[1]+d*data[sample+1] d
418	fstp	dword [edi + ( 1*4)]		; autoc[1]+=d*data[sample+1]  ST = d
419	fld	st0				; ST = d d
420	fmul	dword [esi]			; ST = d*data[sample] d			WATCHOUT: no displacement byte here!
421	fadd	dword [edi]			; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
422	fstp	dword [edi]			; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
423.jumper2_0:
424
425	fstp	st0				; pop d, ST = empty
426	add	esi, byte 4			; sample++
427	dec	ecx
428	jz	.loop2_end
429	add	edx, byte 11			; adjust our inner loop counter by adjusting the jump target
430	fld	dword [esi]			; ST = d <- data[sample]
431	jmp	edx
432.loop2_end:
433
434.end:
435	pop	ebx
436	pop	edi
437	pop	esi
438	ret
439
440	ALIGN 16
441cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
442	;[esp + 16] == autoc[]
443	;[esp + 12] == lag
444	;[esp + 8] == data_len
445	;[esp + 4] == data[]
446
447	;ASSERT(lag > 0)
448	;ASSERT(lag <= 4)
449	;ASSERT(lag <= data_len)
450
451	;	for(coeff = 0; coeff < lag; coeff++)
452	;		autoc[coeff] = 0.0;
453	xorps	xmm5, xmm5
454
455	mov	edx, [esp + 8]			; edx == data_len
456	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]
457
458	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
459	add	eax, 4
460	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
461	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
462.warmup:					; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
463	mulps	xmm0, xmm2			; xmm0 = xmm0 * xmm2
464	addps	xmm5, xmm0			; xmm5 += xmm0 * xmm2
465	dec	edx
466	jz	.loop_end
467	ALIGN 16
468.loop_start:
469	; start by reading the next sample
470	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
471	add	eax, 4
472	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]
473	shufps	xmm2, xmm2, 93h			; 93h=2-1-0-3 => xmm2 gets rotated left by one float
474	movss	xmm2, xmm0
475	mulps	xmm0, xmm2			; xmm0 = xmm0 * xmm2
476	addps	xmm5, xmm0			; xmm5 += xmm0 * xmm2
477	dec	edx
478	jnz	.loop_start
479.loop_end:
480	; store autoc
481	mov	edx, [esp + 16]			; edx == autoc
482	movups	[edx], xmm5
483
484.end:
485	ret
486
487	ALIGN 16
488cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
489	;[esp + 16] == autoc[]
490	;[esp + 12] == lag
491	;[esp + 8] == data_len
492	;[esp + 4] == data[]
493
494	;ASSERT(lag > 0)
495	;ASSERT(lag <= 8)
496	;ASSERT(lag <= data_len)
497
498	;	for(coeff = 0; coeff < lag; coeff++)
499	;		autoc[coeff] = 0.0;
500	xorps	xmm5, xmm5
501	xorps	xmm6, xmm6
502
503	mov	edx, [esp + 8]			; edx == data_len
504	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]
505
506	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
507	add	eax, 4
508	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
509	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
510	movaps	xmm1, xmm0			; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
511	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
512.warmup:					; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
513	mulps	xmm0, xmm2
514	mulps	xmm1, xmm3			; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
515	addps	xmm5, xmm0
516	addps	xmm6, xmm1			; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
517	dec	edx
518	jz	.loop_end
519	ALIGN 16
520.loop_start:
521	; start by reading the next sample
522	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
523	; here we reorder the instructions; see the (#) indexes for a logical order
524	shufps	xmm2, xmm2, 93h			; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
525	add	eax, 4				; (0)
526	shufps	xmm3, xmm3, 93h			; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
527	shufps	xmm0, xmm0, 0			; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
528	movss	xmm3, xmm2			; (5)
529	movaps	xmm1, xmm0			; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
530	movss	xmm2, xmm0			; (6)
531	mulps	xmm1, xmm3			; (8)
532	mulps	xmm0, xmm2			; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
533	addps	xmm6, xmm1			; (10)
534	addps	xmm5, xmm0			; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
535	dec	edx
536	jnz	.loop_start
537.loop_end:
538	; store autoc
539	mov	edx, [esp + 16]			; edx == autoc
540	movups	[edx], xmm5
541	movups	[edx + 16], xmm6
542
543.end:
544	ret
545
546	ALIGN 16
547cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
548	;[esp + 16] == autoc[]
549	;[esp + 12] == lag
550	;[esp + 8] == data_len
551	;[esp + 4] == data[]
552
553	;ASSERT(lag > 0)
554	;ASSERT(lag <= 12)
555	;ASSERT(lag <= data_len)
556
557	;	for(coeff = 0; coeff < lag; coeff++)
558	;		autoc[coeff] = 0.0;
559	xorps	xmm5, xmm5
560	xorps	xmm6, xmm6
561	xorps	xmm7, xmm7
562
563	mov	edx, [esp + 8]			; edx == data_len
564	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]
565
566	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
567	add	eax, 4
568	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
569	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
570	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
571	xorps	xmm4, xmm4			; xmm4 = 0,0,0,0
572.warmup:					; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
573	movaps	xmm1, xmm0
574	mulps	xmm1, xmm2
575	addps	xmm5, xmm1
576	movaps	xmm1, xmm0
577	mulps	xmm1, xmm3
578	addps	xmm6, xmm1
579	mulps	xmm0, xmm4
580	addps	xmm7, xmm0			; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
581	dec	edx
582	jz	.loop_end
583	ALIGN 16
584.loop_start:
585	; start by reading the next sample
586	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
587	add	eax, 4
588	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]
589
590	; shift xmm4:xmm3:xmm2 left by one float
591	shufps	xmm2, xmm2, 93h			; 93h=2-1-0-3 => xmm2 gets rotated left by one float
592	shufps	xmm3, xmm3, 93h			; 93h=2-1-0-3 => xmm3 gets rotated left by one float
593	shufps	xmm4, xmm4, 93h			; 93h=2-1-0-3 => xmm4 gets rotated left by one float
594	movss	xmm4, xmm3
595	movss	xmm3, xmm2
596	movss	xmm2, xmm0
597
598	; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm3:xmm3:xmm2
599	movaps	xmm1, xmm0
600	mulps	xmm1, xmm2
601	addps	xmm5, xmm1
602	movaps	xmm1, xmm0
603	mulps	xmm1, xmm3
604	addps	xmm6, xmm1
605	mulps	xmm0, xmm4
606	addps	xmm7, xmm0
607
608	dec	edx
609	jnz	.loop_start
610.loop_end:
611	; store autoc
612	mov	edx, [esp + 16]			; edx == autoc
613	movups	[edx], xmm5
614	movups	[edx + 16], xmm6
615	movups	[edx + 32], xmm7
616
617.end:
618	ret
619
620	ALIGN 16
621cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
622	;[ebp + 32] autoc
623	;[ebp + 28] lag
624	;[ebp + 24] data_len
625	;[ebp + 20] data
626
627	push	ebp
628	push	ebx
629	push	esi
630	push	edi
631	mov	ebp, esp
632
633	mov	esi, [ebp + 20]
634	mov	edi, [ebp + 24]
635	mov	edx, [ebp + 28]
636	inc	edx
637	and	edx, byte -2
638	mov	eax, edx
639	neg	eax
640	and	esp, byte -8
641	lea	esp, [esp + 4 * eax]
642	mov	ecx, edx
643	xor	eax, eax
644.loop0:
645	dec	ecx
646	mov	[esp + 4 * ecx], eax
647	jnz	short .loop0
648
649	mov	eax, edi
650	sub	eax, edx
651	mov	ebx, edx
652	and	ebx, byte 1
653	sub	eax, ebx
654	lea	ecx, [esi + 4 * eax - 12]
655	cmp	esi, ecx
656	mov	eax, esi
657	ja	short .loop2_pre
658	ALIGN	16		;4 nops
659.loop1_i:
660	movd	mm0, [eax]
661	movd	mm2, [eax + 4]
662	movd	mm4, [eax + 8]
663	movd	mm6, [eax + 12]
664	mov	ebx, edx
665	punpckldq	mm0, mm0
666	punpckldq	mm2, mm2
667	punpckldq	mm4, mm4
668	punpckldq	mm6, mm6
669	ALIGN	16		;3 nops
670.loop1_j:
671	sub	ebx, byte 2
672	movd	mm1, [eax + 4 * ebx]
673	movd	mm3, [eax + 4 * ebx + 4]
674	movd	mm5, [eax + 4 * ebx + 8]
675	movd	mm7, [eax + 4 * ebx + 12]
676	punpckldq	mm1, mm3
677	punpckldq	mm3, mm5
678	pfmul	mm1, mm0
679	punpckldq	mm5, mm7
680	pfmul	mm3, mm2
681	punpckldq	mm7, [eax + 4 * ebx + 16]
682	pfmul	mm5, mm4
683	pfmul	mm7, mm6
684	pfadd	mm1, mm3
685	movq	mm3, [esp + 4 * ebx]
686	pfadd	mm5, mm7
687	pfadd	mm1, mm5
688	pfadd	mm3, mm1
689	movq	[esp + 4 * ebx], mm3
690	jg	short .loop1_j
691
692	add	eax, byte 16
693	cmp	eax, ecx
694	jb	short .loop1_i
695
696.loop2_pre:
697	mov	ebx, eax
698	sub	eax, esi
699	shr	eax, 2
700	lea	ecx, [esi + 4 * edi]
701	mov	esi, ebx
702.loop2_i:
703	movd	mm0, [esi]
704	mov	ebx, edi
705	sub	ebx, eax
706	cmp	ebx, edx
707	jbe	short .loop2_j
708	mov	ebx, edx
709.loop2_j:
710	dec	ebx
711	movd	mm1, [esi + 4 * ebx]
712	pfmul	mm1, mm0
713	movd	mm2, [esp + 4 * ebx]
714	pfadd	mm1, mm2
715	movd	[esp + 4 * ebx], mm1
716
717	jnz	short .loop2_j
718
719	add	esi, byte 4
720	inc	eax
721	cmp	esi, ecx
722	jnz	short .loop2_i
723
724	mov	edi, [ebp + 32]
725	mov	edx, [ebp + 28]
726.loop3:
727	dec	edx
728	mov	eax, [esp + 4 * edx]
729	mov	[edi + 4 * edx], eax
730	jnz	short .loop3
731
732	femms
733
734	mov	esp, ebp
735	pop	edi
736	pop	esi
737	pop	ebx
738	pop	ebp
739	ret
740
741;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
742;
743;	for(i = 0; i < data_len; i++) {
744;		sum = 0;
745;		for(j = 0; j < order; j++)
746;			sum += qlp_coeff[j] * data[i-j-1];
747;		residual[i] = data[i] - (sum >> lp_quantization);
748;	}
749;
750	ALIGN	16
751cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
752	;[esp + 40]	residual[]
753	;[esp + 36]	lp_quantization
754	;[esp + 32]	order
755	;[esp + 28]	qlp_coeff[]
756	;[esp + 24]	data_len
757	;[esp + 20]	data[]
758
759	;ASSERT(order > 0)
760
761	push	ebp
762	push	ebx
763	push	esi
764	push	edi
765
766	mov	esi, [esp + 20]			; esi = data[]
767	mov	edi, [esp + 40]			; edi = residual[]
768	mov	eax, [esp + 32]			; eax = order
769	mov	ebx, [esp + 24]			; ebx = data_len
770
771	test	ebx, ebx
772	jz	near .end			; do nothing if data_len == 0
773.begin:
774	cmp	eax, byte 1
775	jg	short .i_1more
776
777	mov	ecx, [esp + 28]
778	mov	edx, [ecx]			; edx = qlp_coeff[0]
779	mov	eax, [esi - 4]			; eax = data[-1]
780	mov	cl, [esp + 36]			; cl = lp_quantization
781	ALIGN	16
782.i_1_loop_i:
783	imul	eax, edx
784	sar	eax, cl
785	neg	eax
786	add	eax, [esi]
787	mov	[edi], eax
788	mov	eax, [esi]
789	add	edi, byte 4
790	add	esi, byte 4
791	dec	ebx
792	jnz	.i_1_loop_i
793
794	jmp	.end
795
796.i_1more:
797	cmp	eax, byte 32			; for order <= 32 there is a faster routine
798	jbe	short .i_32
799
800	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
801	ALIGN 16
802.i_32more_loop_i:
803	xor	ebp, ebp
804	mov	ecx, [esp + 32]
805	mov	edx, ecx
806	shl	edx, 2
807	add	edx, [esp + 28]
808	neg	ecx
809	ALIGN	16
810.i_32more_loop_j:
811	sub	edx, byte 4
812	mov	eax, [edx]
813	imul	eax, [esi + 4 * ecx]
814	add	ebp, eax
815	inc	ecx
816	jnz	short .i_32more_loop_j
817
818	mov	cl, [esp + 36]
819	sar	ebp, cl
820	neg	ebp
821	add	ebp, [esi]
822	mov	[edi], ebp
823	add	esi, byte 4
824	add	edi, byte 4
825
826	dec	ebx
827	jnz	.i_32more_loop_i
828
829	jmp	.end
830
831.i_32:
832	sub	edi, esi
833	neg	eax
834	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
835	call	.get_eip0
836.get_eip0:
837	pop	eax
838	add	edx, eax
839	inc	edx
840	mov	eax, [esp + 28]			; eax = qlp_coeff[]
841	xor	ebp, ebp
842	jmp	edx
843
844	mov	ecx, [eax + 124]
845	imul	ecx, [esi - 128]
846	add	ebp, ecx
847	mov	ecx, [eax + 120]
848	imul	ecx, [esi - 124]
849	add	ebp, ecx
850	mov	ecx, [eax + 116]
851	imul	ecx, [esi - 120]
852	add	ebp, ecx
853	mov	ecx, [eax + 112]
854	imul	ecx, [esi - 116]
855	add	ebp, ecx
856	mov	ecx, [eax + 108]
857	imul	ecx, [esi - 112]
858	add	ebp, ecx
859	mov	ecx, [eax + 104]
860	imul	ecx, [esi - 108]
861	add	ebp, ecx
862	mov	ecx, [eax + 100]
863	imul	ecx, [esi - 104]
864	add	ebp, ecx
865	mov	ecx, [eax + 96]
866	imul	ecx, [esi - 100]
867	add	ebp, ecx
868	mov	ecx, [eax + 92]
869	imul	ecx, [esi - 96]
870	add	ebp, ecx
871	mov	ecx, [eax + 88]
872	imul	ecx, [esi - 92]
873	add	ebp, ecx
874	mov	ecx, [eax + 84]
875	imul	ecx, [esi - 88]
876	add	ebp, ecx
877	mov	ecx, [eax + 80]
878	imul	ecx, [esi - 84]
879	add	ebp, ecx
880	mov	ecx, [eax + 76]
881	imul	ecx, [esi - 80]
882	add	ebp, ecx
883	mov	ecx, [eax + 72]
884	imul	ecx, [esi - 76]
885	add	ebp, ecx
886	mov	ecx, [eax + 68]
887	imul	ecx, [esi - 72]
888	add	ebp, ecx
889	mov	ecx, [eax + 64]
890	imul	ecx, [esi - 68]
891	add	ebp, ecx
892	mov	ecx, [eax + 60]
893	imul	ecx, [esi - 64]
894	add	ebp, ecx
895	mov	ecx, [eax + 56]
896	imul	ecx, [esi - 60]
897	add	ebp, ecx
898	mov	ecx, [eax + 52]
899	imul	ecx, [esi - 56]
900	add	ebp, ecx
901	mov	ecx, [eax + 48]
902	imul	ecx, [esi - 52]
903	add	ebp, ecx
904	mov	ecx, [eax + 44]
905	imul	ecx, [esi - 48]
906	add	ebp, ecx
907	mov	ecx, [eax + 40]
908	imul	ecx, [esi - 44]
909	add	ebp, ecx
910	mov	ecx, [eax + 36]
911	imul	ecx, [esi - 40]
912	add	ebp, ecx
913	mov	ecx, [eax + 32]
914	imul	ecx, [esi - 36]
915	add	ebp, ecx
916	mov	ecx, [eax + 28]
917	imul	ecx, [esi - 32]
918	add	ebp, ecx
919	mov	ecx, [eax + 24]
920	imul	ecx, [esi - 28]
921	add	ebp, ecx
922	mov	ecx, [eax + 20]
923	imul	ecx, [esi - 24]
924	add	ebp, ecx
925	mov	ecx, [eax + 16]
926	imul	ecx, [esi - 20]
927	add	ebp, ecx
928	mov	ecx, [eax + 12]
929	imul	ecx, [esi - 16]
930	add	ebp, ecx
931	mov	ecx, [eax + 8]
932	imul	ecx, [esi - 12]
933	add	ebp, ecx
934	mov	ecx, [eax + 4]
935	imul	ecx, [esi - 8]
936	add	ebp, ecx
937	mov	ecx, [eax]			; there is one byte missing
938	imul	ecx, [esi - 4]
939	add	ebp, ecx
940.jumper_0:
941
942	mov	cl, [esp + 36]
943	sar	ebp, cl
944	neg	ebp
945	add	ebp, [esi]
946	mov	[edi + esi], ebp
947	add	esi, byte 4
948
949	dec	ebx
950	jz	short .end
951	xor	ebp, ebp
952	jmp	edx
953
954.end:
955	pop	edi
956	pop	esi
957	pop	ebx
958	pop	ebp
959	ret
960
961; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
962; the channel and qlp_coeffs must be <= 16.  Especially note that this routine
963; cannot be used for side-channel coded 16bps channels since the effective bps
964; is 17.
965	ALIGN	16
966cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
967	;[esp + 40]	residual[]
968	;[esp + 36]	lp_quantization
969	;[esp + 32]	order
970	;[esp + 28]	qlp_coeff[]
971	;[esp + 24]	data_len
972	;[esp + 20]	data[]
973
974	;ASSERT(order > 0)
975
976	push	ebp
977	push	ebx
978	push	esi
979	push	edi
980
981	mov	esi, [esp + 20]			; esi = data[]
982	mov	edi, [esp + 40]			; edi = residual[]
983	mov	eax, [esp + 32]			; eax = order
984	mov	ebx, [esp + 24]			; ebx = data_len
985
986	test	ebx, ebx
987	jz	near .end			; do nothing if data_len == 0
988	dec	ebx
989	test	ebx, ebx
990	jz	near .last_one
991
992	mov	edx, [esp + 28]			; edx = qlp_coeff[]
993	movd	mm6, [esp + 36]			; mm6 = 0:lp_quantization
994	mov	ebp, esp
995
996	and	esp, 0xfffffff8
997
998	xor	ecx, ecx
999.copy_qlp_loop:
1000	push	word [edx + 4 * ecx]
1001	inc	ecx
1002	cmp	ecx, eax
1003	jnz	short .copy_qlp_loop
1004
1005	and	ecx, 0x3
1006	test	ecx, ecx
1007	je	short .za_end
1008	sub	ecx, byte 4
1009.za_loop:
1010	push	word 0
1011	inc	eax
1012	inc	ecx
1013	jnz	short .za_loop
1014.za_end:
1015
1016	movq	mm5, [esp + 2 * eax - 8]
1017	movd	mm4, [esi - 16]
1018	punpckldq	mm4, [esi - 12]
1019	movd	mm0, [esi - 8]
1020	punpckldq	mm0, [esi - 4]
1021	packssdw	mm4, mm0
1022
1023	cmp	eax, byte 4
1024	jnbe	short .mmx_4more
1025
1026	ALIGN	16
1027.mmx_4_loop_i:
1028	movd	mm1, [esi]
1029	movq	mm3, mm4
1030	punpckldq	mm1, [esi + 4]
1031	psrlq	mm4, 16
1032	movq	mm0, mm1
1033	psllq	mm0, 48
1034	por	mm4, mm0
1035	movq	mm2, mm4
1036	psrlq	mm4, 16
1037	pxor	mm0, mm0
1038	punpckhdq	mm0, mm1
1039	pmaddwd	mm3, mm5
1040	pmaddwd	mm2, mm5
1041	psllq	mm0, 16
1042	por	mm4, mm0
1043	movq	mm0, mm3
1044	punpckldq	mm3, mm2
1045	punpckhdq	mm0, mm2
1046	paddd	mm3, mm0
1047	psrad	mm3, mm6
1048	psubd	mm1, mm3
1049	movd	[edi], mm1
1050	punpckhdq	mm1, mm1
1051	movd	[edi + 4], mm1
1052
1053	add	edi, byte 8
1054	add	esi, byte 8
1055
1056	sub	ebx, 2
1057	jg	.mmx_4_loop_i
1058	jmp	.mmx_end
1059
1060.mmx_4more:
1061	shl	eax, 2
1062	neg	eax
1063	add	eax, byte 16
1064
1065	ALIGN	16
1066.mmx_4more_loop_i:
1067	movd	mm1, [esi]
1068	punpckldq	mm1, [esi + 4]
1069	movq	mm3, mm4
1070	psrlq	mm4, 16
1071	movq	mm0, mm1
1072	psllq	mm0, 48
1073	por	mm4, mm0
1074	movq	mm2, mm4
1075	psrlq	mm4, 16
1076	pxor	mm0, mm0
1077	punpckhdq	mm0, mm1
1078	pmaddwd	mm3, mm5
1079	pmaddwd	mm2, mm5
1080	psllq	mm0, 16
1081	por	mm4, mm0
1082
1083	mov	ecx, esi
1084	add	ecx, eax
1085	mov	edx, esp
1086
1087	ALIGN	16
1088.mmx_4more_loop_j:
1089	movd	mm0, [ecx - 16]
1090	movd	mm7, [ecx - 8]
1091	punpckldq	mm0, [ecx - 12]
1092	punpckldq	mm7, [ecx - 4]
1093	packssdw	mm0, mm7
1094	pmaddwd	mm0, [edx]
1095	punpckhdq	mm7, mm7
1096	paddd	mm3, mm0
1097	movd	mm0, [ecx - 12]
1098	punpckldq	mm0, [ecx - 8]
1099	punpckldq	mm7, [ecx]
1100	packssdw	mm0, mm7
1101	pmaddwd	mm0, [edx]
1102	paddd	mm2, mm0
1103
1104	add	edx, byte 8
1105	add	ecx, byte 16
1106	cmp	ecx, esi
1107	jnz	.mmx_4more_loop_j
1108
1109	movq	mm0, mm3
1110	punpckldq	mm3, mm2
1111	punpckhdq	mm0, mm2
1112	paddd	mm3, mm0
1113	psrad	mm3, mm6
1114	psubd	mm1, mm3
1115	movd	[edi], mm1
1116	punpckhdq	mm1, mm1
1117	movd	[edi + 4], mm1
1118
1119	add	edi, byte 8
1120	add	esi, byte 8
1121
1122	sub	ebx, 2
1123	jg	near .mmx_4more_loop_i
1124
1125.mmx_end:
1126	emms
1127	mov	esp, ebp
1128.last_one:
1129	mov	eax, [esp + 32]
1130	inc	ebx
1131	jnz	near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin
1132
1133.end:
1134	pop	edi
1135	pop	esi
1136	pop	ebx
1137	pop	ebp
1138	ret
1139
1140; **********************************************************************
1141;
1142; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1143; {
1144; 	unsigned i, j;
1145; 	FLAC__int32 sum;
1146;
1147; 	FLAC__ASSERT(order > 0);
1148;
1149; 	for(i = 0; i < data_len; i++) {
1150; 		sum = 0;
1151; 		for(j = 0; j < order; j++)
1152; 			sum += qlp_coeff[j] * data[i-j-1];
1153; 		data[i] = residual[i] + (sum >> lp_quantization);
1154; 	}
1155; }
1156	ALIGN	16
1157cident FLAC__lpc_restore_signal_asm_ia32
1158	;[esp + 40]	data[]
1159	;[esp + 36]	lp_quantization
1160	;[esp + 32]	order
1161	;[esp + 28]	qlp_coeff[]
1162	;[esp + 24]	data_len
1163	;[esp + 20]	residual[]
1164
1165	;ASSERT(order > 0)
1166
1167	push	ebp
1168	push	ebx
1169	push	esi
1170	push	edi
1171
1172	mov	esi, [esp + 20]			; esi = residual[]
1173	mov	edi, [esp + 40]			; edi = data[]
1174	mov	eax, [esp + 32]			; eax = order
1175	mov	ebx, [esp + 24]			; ebx = data_len
1176
1177	test	ebx, ebx
1178	jz	near .end			; do nothing if data_len == 0
1179
1180.begin:
1181	cmp	eax, byte 1
1182	jg	short .x87_1more
1183
1184	mov	ecx, [esp + 28]
1185	mov	edx, [ecx]
1186	mov	eax, [edi - 4]
1187	mov	cl, [esp + 36]
1188	ALIGN	16
1189.x87_1_loop_i:
1190	imul	eax, edx
1191	sar	eax, cl
1192	add	eax, [esi]
1193	mov	[edi], eax
1194	add	esi, byte 4
1195	add	edi, byte 4
1196	dec	ebx
1197	jnz	.x87_1_loop_i
1198
1199	jmp	.end
1200
1201.x87_1more:
1202	cmp	eax, byte 32			; for order <= 32 there is a faster routine
1203	jbe	short .x87_32
1204
1205	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
1206	ALIGN 16
1207.x87_32more_loop_i:
1208	xor	ebp, ebp
1209	mov	ecx, [esp + 32]
1210	mov	edx, ecx
1211	shl	edx, 2
1212	add	edx, [esp + 28]
1213	neg	ecx
1214	ALIGN	16
1215.x87_32more_loop_j:
1216	sub	edx, byte 4
1217	mov	eax, [edx]
1218	imul	eax, [edi + 4 * ecx]
1219	add	ebp, eax
1220	inc	ecx
1221	jnz	short .x87_32more_loop_j
1222
1223	mov	cl, [esp + 36]
1224	sar	ebp, cl
1225	add	ebp, [esi]
1226	mov	[edi], ebp
1227	add	edi, byte 4
1228	add	esi, byte 4
1229
1230	dec	ebx
1231	jnz	.x87_32more_loop_i
1232
1233	jmp	.end
1234
1235.x87_32:
1236	sub	esi, edi
1237	neg	eax
1238	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
1239	call	.get_eip0
1240.get_eip0:
1241	pop	eax
1242	add	edx, eax
1243	inc	edx				; compensate for the shorter opcode on the last iteration
1244	mov	eax, [esp + 28]			; eax = qlp_coeff[]
1245	xor	ebp, ebp
1246	jmp	edx
1247
1248	mov	ecx, [eax + 124]		; ecx =  qlp_coeff[31]
1249	imul	ecx, [edi - 128]		; ecx =  qlp_coeff[31] * data[i-32]
1250	add	ebp, ecx			; sum += qlp_coeff[31] * data[i-32]
1251	mov	ecx, [eax + 120]		; ecx =  qlp_coeff[30]
1252	imul	ecx, [edi - 124]		; ecx =  qlp_coeff[30] * data[i-31]
1253	add	ebp, ecx			; sum += qlp_coeff[30] * data[i-31]
1254	mov	ecx, [eax + 116]		; ecx =  qlp_coeff[29]
1255	imul	ecx, [edi - 120]		; ecx =  qlp_coeff[29] * data[i-30]
1256	add	ebp, ecx			; sum += qlp_coeff[29] * data[i-30]
1257	mov	ecx, [eax + 112]		; ecx =  qlp_coeff[28]
1258	imul	ecx, [edi - 116]		; ecx =  qlp_coeff[28] * data[i-29]
1259	add	ebp, ecx			; sum += qlp_coeff[28] * data[i-29]
1260	mov	ecx, [eax + 108]		; ecx =  qlp_coeff[27]
1261	imul	ecx, [edi - 112]		; ecx =  qlp_coeff[27] * data[i-28]
1262	add	ebp, ecx			; sum += qlp_coeff[27] * data[i-28]
1263	mov	ecx, [eax + 104]		; ecx =  qlp_coeff[26]
1264	imul	ecx, [edi - 108]		; ecx =  qlp_coeff[26] * data[i-27]
1265	add	ebp, ecx			; sum += qlp_coeff[26] * data[i-27]
1266	mov	ecx, [eax + 100]		; ecx =  qlp_coeff[25]
1267	imul	ecx, [edi - 104]		; ecx =  qlp_coeff[25] * data[i-26]
1268	add	ebp, ecx			; sum += qlp_coeff[25] * data[i-26]
1269	mov	ecx, [eax + 96]			; ecx =  qlp_coeff[24]
1270	imul	ecx, [edi - 100]		; ecx =  qlp_coeff[24] * data[i-25]
1271	add	ebp, ecx			; sum += qlp_coeff[24] * data[i-25]
1272	mov	ecx, [eax + 92]			; ecx =  qlp_coeff[23]
1273	imul	ecx, [edi - 96]			; ecx =  qlp_coeff[23] * data[i-24]
1274	add	ebp, ecx			; sum += qlp_coeff[23] * data[i-24]
1275	mov	ecx, [eax + 88]			; ecx =  qlp_coeff[22]
1276	imul	ecx, [edi - 92]			; ecx =  qlp_coeff[22] * data[i-23]
1277	add	ebp, ecx			; sum += qlp_coeff[22] * data[i-23]
1278	mov	ecx, [eax + 84]			; ecx =  qlp_coeff[21]
1279	imul	ecx, [edi - 88]			; ecx =  qlp_coeff[21] * data[i-22]
1280	add	ebp, ecx			; sum += qlp_coeff[21] * data[i-22]
1281	mov	ecx, [eax + 80]			; ecx =  qlp_coeff[20]
1282	imul	ecx, [edi - 84]			; ecx =  qlp_coeff[20] * data[i-21]
1283	add	ebp, ecx			; sum += qlp_coeff[20] * data[i-21]
1284	mov	ecx, [eax + 76]			; ecx =  qlp_coeff[19]
1285	imul	ecx, [edi - 80]			; ecx =  qlp_coeff[19] * data[i-20]
1286	add	ebp, ecx			; sum += qlp_coeff[19] * data[i-20]
1287	mov	ecx, [eax + 72]			; ecx =  qlp_coeff[18]
1288	imul	ecx, [edi - 76]			; ecx =  qlp_coeff[18] * data[i-19]
1289	add	ebp, ecx			; sum += qlp_coeff[18] * data[i-19]
1290	mov	ecx, [eax + 68]			; ecx =  qlp_coeff[17]
1291	imul	ecx, [edi - 72]			; ecx =  qlp_coeff[17] * data[i-18]
1292	add	ebp, ecx			; sum += qlp_coeff[17] * data[i-18]
1293	mov	ecx, [eax + 64]			; ecx =  qlp_coeff[16]
1294	imul	ecx, [edi - 68]			; ecx =  qlp_coeff[16] * data[i-17]
1295	add	ebp, ecx			; sum += qlp_coeff[16] * data[i-17]
1296	mov	ecx, [eax + 60]			; ecx =  qlp_coeff[15]
1297	imul	ecx, [edi - 64]			; ecx =  qlp_coeff[15] * data[i-16]
1298	add	ebp, ecx			; sum += qlp_coeff[15] * data[i-16]
1299	mov	ecx, [eax + 56]			; ecx =  qlp_coeff[14]
1300	imul	ecx, [edi - 60]			; ecx =  qlp_coeff[14] * data[i-15]
1301	add	ebp, ecx			; sum += qlp_coeff[14] * data[i-15]
1302	mov	ecx, [eax + 52]			; ecx =  qlp_coeff[13]
1303	imul	ecx, [edi - 56]			; ecx =  qlp_coeff[13] * data[i-14]
1304	add	ebp, ecx			; sum += qlp_coeff[13] * data[i-14]
1305	mov	ecx, [eax + 48]			; ecx =  qlp_coeff[12]
1306	imul	ecx, [edi - 52]			; ecx =  qlp_coeff[12] * data[i-13]
1307	add	ebp, ecx			; sum += qlp_coeff[12] * data[i-13]
1308	mov	ecx, [eax + 44]			; ecx =  qlp_coeff[11]
1309	imul	ecx, [edi - 48]			; ecx =  qlp_coeff[11] * data[i-12]
1310	add	ebp, ecx			; sum += qlp_coeff[11] * data[i-12]
1311	mov	ecx, [eax + 40]			; ecx =  qlp_coeff[10]
1312	imul	ecx, [edi - 44]			; ecx =  qlp_coeff[10] * data[i-11]
1313	add	ebp, ecx			; sum += qlp_coeff[10] * data[i-11]
1314	mov	ecx, [eax + 36]			; ecx =  qlp_coeff[ 9]
1315	imul	ecx, [edi - 40]			; ecx =  qlp_coeff[ 9] * data[i-10]
1316	add	ebp, ecx			; sum += qlp_coeff[ 9] * data[i-10]
1317	mov	ecx, [eax + 32]			; ecx =  qlp_coeff[ 8]
1318	imul	ecx, [edi - 36]			; ecx =  qlp_coeff[ 8] * data[i- 9]
1319	add	ebp, ecx			; sum += qlp_coeff[ 8] * data[i- 9]
1320	mov	ecx, [eax + 28]			; ecx =  qlp_coeff[ 7]
1321	imul	ecx, [edi - 32]			; ecx =  qlp_coeff[ 7] * data[i- 8]
1322	add	ebp, ecx			; sum += qlp_coeff[ 7] * data[i- 8]
1323	mov	ecx, [eax + 24]			; ecx =  qlp_coeff[ 6]
1324	imul	ecx, [edi - 28]			; ecx =  qlp_coeff[ 6] * data[i- 7]
1325	add	ebp, ecx			; sum += qlp_coeff[ 6] * data[i- 7]
1326	mov	ecx, [eax + 20]			; ecx =  qlp_coeff[ 5]
1327	imul	ecx, [edi - 24]			; ecx =  qlp_coeff[ 5] * data[i- 6]
1328	add	ebp, ecx			; sum += qlp_coeff[ 5] * data[i- 6]
1329	mov	ecx, [eax + 16]			; ecx =  qlp_coeff[ 4]
1330	imul	ecx, [edi - 20]			; ecx =  qlp_coeff[ 4] * data[i- 5]
1331	add	ebp, ecx			; sum += qlp_coeff[ 4] * data[i- 5]
1332	mov	ecx, [eax + 12]			; ecx =  qlp_coeff[ 3]
1333	imul	ecx, [edi - 16]			; ecx =  qlp_coeff[ 3] * data[i- 4]
1334	add	ebp, ecx			; sum += qlp_coeff[ 3] * data[i- 4]
1335	mov	ecx, [eax + 8]			; ecx =  qlp_coeff[ 2]
1336	imul	ecx, [edi - 12]			; ecx =  qlp_coeff[ 2] * data[i- 3]
1337	add	ebp, ecx			; sum += qlp_coeff[ 2] * data[i- 3]
1338	mov	ecx, [eax + 4]			; ecx =  qlp_coeff[ 1]
1339	imul	ecx, [edi - 8]			; ecx =  qlp_coeff[ 1] * data[i- 2]
1340	add	ebp, ecx			; sum += qlp_coeff[ 1] * data[i- 2]
1341	mov	ecx, [eax]			; ecx =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
1342	imul	ecx, [edi - 4]			; ecx =  qlp_coeff[ 0] * data[i- 1]
1343	add	ebp, ecx			; sum += qlp_coeff[ 0] * data[i- 1]
1344.jumper_0:
1345
1346	mov	cl, [esp + 36]
1347	sar	ebp, cl				; ebp = (sum >> lp_quantization)
1348	add	ebp, [esi + edi]		; ebp = residual[i] + (sum >> lp_quantization)
1349	mov	[edi], ebp			; data[i] = residual[i] + (sum >> lp_quantization)
1350	add	edi, byte 4
1351
1352	dec	ebx
1353	jz	short .end
1354	xor	ebp, ebp
1355	jmp	edx
1356
1357.end:
1358	pop	edi
1359	pop	esi
1360	pop	ebx
1361	pop	ebp
1362	ret
1363
1364; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
1365; the channel and qlp_coeffs must be <= 16.  Especially note that this routine
1366; cannot be used for side-channel coded 16bps channels since the effective bps
1367; is 17.
1368; WATCHOUT: this routine requires that each data array have a buffer of up to
1369; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
1370; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
1371	ALIGN	16
1372cident FLAC__lpc_restore_signal_asm_ia32_mmx
1373	;[esp + 40]	data[]
1374	;[esp + 36]	lp_quantization
1375	;[esp + 32]	order
1376	;[esp + 28]	qlp_coeff[]
1377	;[esp + 24]	data_len
1378	;[esp + 20]	residual[]
1379
1380	;ASSERT(order > 0)
1381
1382	push	ebp
1383	push	ebx
1384	push	esi
1385	push	edi
1386
1387	mov	esi, [esp + 20]
1388	mov	edi, [esp + 40]
1389	mov	eax, [esp + 32]
1390	mov	ebx, [esp + 24]
1391
1392	test	ebx, ebx
1393	jz	near .end			; do nothing if data_len == 0
1394	cmp	eax, byte 4
1395	jb	near FLAC__lpc_restore_signal_asm_ia32.begin
1396
1397	mov	edx, [esp + 28]
1398	movd	mm6, [esp + 36]
1399	mov	ebp, esp
1400
1401	and	esp, 0xfffffff8
1402
1403	xor	ecx, ecx
1404.copy_qlp_loop:
1405	push	word [edx + 4 * ecx]
1406	inc	ecx
1407	cmp	ecx, eax
1408	jnz	short .copy_qlp_loop
1409
1410	and	ecx, 0x3
1411	test	ecx, ecx
1412	je	short .za_end
1413	sub	ecx, byte 4
1414.za_loop:
1415	push	word 0
1416	inc	eax
1417	inc	ecx
1418	jnz	short .za_loop
1419.za_end:
1420
1421	movq	mm5, [esp + 2 * eax - 8]
1422	movd	mm4, [edi - 16]
1423	punpckldq	mm4, [edi - 12]
1424	movd	mm0, [edi - 8]
1425	punpckldq	mm0, [edi - 4]
1426	packssdw	mm4, mm0
1427
1428	cmp	eax, byte 4
1429	jnbe	short .mmx_4more
1430
1431	ALIGN	16
1432.mmx_4_loop_i:
1433	movq	mm7, mm4
1434	pmaddwd	mm7, mm5
1435	movq	mm0, mm7
1436	punpckhdq	mm7, mm7
1437	paddd	mm7, mm0
1438	psrad	mm7, mm6
1439	movd	mm1, [esi]
1440	paddd	mm7, mm1
1441	movd	[edi], mm7
1442	psllq	mm7, 48
1443	psrlq	mm4, 16
1444	por	mm4, mm7
1445
1446	add	esi, byte 4
1447	add	edi, byte 4
1448
1449	dec	ebx
1450	jnz	.mmx_4_loop_i
1451	jmp	.mmx_end
1452.mmx_4more:
1453	shl	eax, 2
1454	neg	eax
1455	add	eax, byte 16
1456	ALIGN	16
1457.mmx_4more_loop_i:
1458	mov	ecx, edi
1459	add	ecx, eax
1460	mov	edx, esp
1461
1462	movq	mm7, mm4
1463	pmaddwd	mm7, mm5
1464
1465	ALIGN	16
1466.mmx_4more_loop_j:
1467	movd	mm0, [ecx - 16]
1468	punpckldq	mm0, [ecx - 12]
1469	movd	mm1, [ecx - 8]
1470	punpckldq	mm1, [ecx - 4]
1471	packssdw	mm0, mm1
1472	pmaddwd	mm0, [edx]
1473	paddd	mm7, mm0
1474
1475	add	edx, byte 8
1476	add	ecx, byte 16
1477	cmp	ecx, edi
1478	jnz	.mmx_4more_loop_j
1479
1480	movq	mm0, mm7
1481	punpckhdq	mm7, mm7
1482	paddd	mm7, mm0
1483	psrad	mm7, mm6
1484	movd	mm1, [esi]
1485	paddd	mm7, mm1
1486	movd	[edi], mm7
1487	psllq	mm7, 48
1488	psrlq	mm4, 16
1489	por	mm4, mm7
1490
1491	add	esi, byte 4
1492	add	edi, byte 4
1493
1494	dec	ebx
1495	jnz	short .mmx_4more_loop_i
1496.mmx_end:
1497	emms
1498	mov	esp, ebp
1499
1500.end:
1501	pop	edi
1502	pop	esi
1503	pop	ebx
1504	pop	ebp
1505	ret
1506
1507end
1508
1509%ifdef OBJ_FORMAT_elf
1510       section .note.GNU-stack noalloc
1511%endif
1512