1;  vim:filetype=nasm ts=8
2
3;  libFLAC - Free Lossless Audio Codec library
4;  Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
5;
6;  Redistribution and use in source and binary forms, with or without
7;  modification, are permitted provided that the following conditions
8;  are met:
9;
10;  - Redistributions of source code must retain the above copyright
11;  notice, this list of conditions and the following disclaimer.
12;
13;  - Redistributions in binary form must reproduce the above copyright
14;  notice, this list of conditions and the following disclaimer in the
15;  documentation and/or other materials provided with the distribution.
16;
17;  - Neither the name of the Xiph.org Foundation nor the names of its
18;  contributors may be used to endorse or promote products derived from
19;  this software without specific prior written permission.
20;
21;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
25;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33%include "nasm.h"
34
; NOTE(review): data_section, cglobal and code_section are not defined in this
; file; presumably they are macros supplied by the nasm.h include - confirm there.
; Nothing is emitted between data_section and code_section in this file.
35	data_section
36
; Declares the routine's symbol for use from outside this module (presumably
; with any platform-specific name decoration handled by the macro - TODO confirm).
37cglobal FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov
38
39	code_section
40
41; **********************************************************************
42;
43; unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 *data, unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
44; {
45; 	FLAC__int32 last_error_0 = data[-1];
46; 	FLAC__int32 last_error_1 = data[-1] - data[-2];
47; 	FLAC__int32 last_error_2 = last_error_1 - (data[-2] - data[-3]);
48; 	FLAC__int32 last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]);
49; 	FLAC__int32 error, save;
50; 	FLAC__uint32 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0;
51; 	unsigned i, order;
52;
53; 	for(i = 0; i < data_len; i++) {
54; 		error  = data[i]     ; total_error_0 += local_abs(error);                      save = error;
55; 		error -= last_error_0; total_error_1 += local_abs(error); last_error_0 = save; save = error;
56; 		error -= last_error_1; total_error_2 += local_abs(error); last_error_1 = save; save = error;
57; 		error -= last_error_2; total_error_3 += local_abs(error); last_error_2 = save; save = error;
58; 		error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save;
59; 	}
60;
61; 	if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
62; 		order = 0;
63; 	else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
64; 		order = 1;
65; 	else if(total_error_2 < min(total_error_3, total_error_4))
66; 		order = 2;
67; 	else if(total_error_3 < total_error_4)
68; 		order = 3;
69; 	else
70; 		order = 4;
71;
72; 	residual_bits_per_sample[0] = (FLAC__float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
73; 	residual_bits_per_sample[1] = (FLAC__float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
74; 	residual_bits_per_sample[2] = (FLAC__float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
75; 	residual_bits_per_sample[3] = (FLAC__float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
76; 	residual_bits_per_sample[4] = (FLAC__float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
77;
78; 	return order;
79; }
80	ALIGN 16
81cident FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov
82
	; Purpose: for each fixed predictor order 0..4, accumulate the sum of
	; absolute residuals over data[0 .. data_len-1], store one
	; bits-per-sample estimate per order into residual_bits_per_sample[0..4],
	; and return in eax the order whose total is smallest (when totals are
	; equal the higher order is chosen).
	;
	; What the code requires of its caller:
	;   - when data_len != 0, data[-4] .. data[-1] are read as warm-up history
	;   - the CPU supports MMX and CMOVcc
	;   - data_len < 2^31 (it is loaded with a signed 32-bit fild)
	;   - the five totals are 32-bit lanes/registers; nothing here detects
	;     wrap-around of those sums
	;
	; Register usage: ebp, ebx, esi and edi are saved on entry and restored
	; on exit.  eax (return value), ecx, edx, the flags and, on the non-empty
	; path, mm0-mm7 are overwritten.  emms is executed before any x87
	; instruction, and every x87 value pushed here is popped again before
	; returning.
	;
	; The argument offsets listed next are relative to esp *after* the
	; prologue below: 4 pushes * 4 bytes + 16 bytes of locals + the 4-byte
	; return address = 36.
83	; esp + 36 == data[]
84	; esp + 40 == data_len
85	; esp + 44 == residual_bits_per_sample[]
86
87	push	ebp
88	push	ebx
89	push	esi
90	push	edi
91	sub	esp, byte 16
92	; qword [esp] == temp space for loading FLAC__uint64s to FPU regs
	; (only [esp] .. [esp + 7] of the 16 reserved bytes are written below)
93
	; Notation used in the comments: "hi:lo" names the two 32-bit lanes of an
	; MMX register; a lane left blank holds a value that is never consumed.
94	; ebx == &data[i]
95	; ecx == loop counter (i)
96	; ebp == order
97	; mm0 == total_error_1:total_error_0
98	; mm1 == total_error_2:total_error_3
99	; mm2 == :total_error_4
100	; mm3 == last_error_1:last_error_0
101	; mm4 == last_error_2:last_error_3
102
103	mov	ecx, [esp + 40]			; ecx = data_len
104	test	ecx, ecx
	; Empty input: skip the history reads and the loop entirely.  No MMX
	; instruction has executed yet on that path, so it needs no emms.
105	jz	near .data_len_is_0
106
	; Seed last_error_0..3 from the four samples that precede data[0],
	; exactly as the initialisers of the C reference do.
107	mov	ebx, [esp + 36]			; ebx = data[]
108	movd	mm3, [ebx - 4]			; mm3 = 0:last_error_0
109	movd	mm2, [ebx - 8]			; mm2 = 0:data[-2]
110	movd	mm1, [ebx - 12]			; mm1 = 0:data[-3]
111	movd	mm0, [ebx - 16]			; mm0 = 0:data[-4]
112	movq	mm5, mm3			; mm5 = 0:last_error_0
113	psubd	mm5, mm2			; mm5 = 0:last_error_1
114	punpckldq	mm3, mm5		; mm3 = last_error_1:last_error_0
115	psubd	mm2, mm1			; mm2 = 0:data[-2] - data[-3]
116	psubd	mm5, mm2			; mm5 = 0:last_error_2
117	movq	mm4, mm5			; mm4 = 0:last_error_2
118	psubd	mm4, mm2			; mm4 = 0:last_error_2 - (data[-2] - data[-3])
119	paddd	mm4, mm1			; mm4 = 0:last_error_2 - (data[-2] - 2 * data[-3])
120	psubd	mm4, mm0			; mm4 = 0:last_error_3
121	punpckldq	mm4, mm5		; mm4 = last_error_2:last_error_3
	; mm0-mm2 were only scratch for the history samples; from here on they
	; are the running totals and start at zero.
122	pxor	mm0, mm0			; mm0 = total_error_1:total_error_0
123	pxor	mm1, mm1			; mm1 = total_error_2:total_error_3
124	pxor	mm2, mm2			; mm2 = 0:total_error_4
125
	; Per sample: error_0 = data[i] and error_k = error_(k-1) - last_error_(k-1),
	; after which last_error_(k-1) is replaced by error_(k-1).  Two orders are
	; processed per MMX register; psubd on a full register also produces a
	; value in the other lane, which is simply ignored (the blank lanes).
	; abs() is the sign-mask idiom: m = x >> 31 (arithmetic), |x| = (x ^ m) - m.
126	ALIGN 16
127.loop:
128	movd	mm7, [ebx]			; mm7 = 0:error_0
129	add	ebx, byte 4
130	movq	mm6, mm7			; mm6 = 0:error_0
131	psubd	mm7, mm3			; mm7 = :error_1
132	punpckldq	mm6, mm7		; mm6 = error_1:error_0
133	movq	mm5, mm6			; mm5 = error_1:error_0
134	movq	mm7, mm6			; mm7 = error_1:error_0
135	psubd	mm5, mm3			; mm5 = error_2:
	; mm3 is updated only after its old value has been consumed above
136	movq	mm3, mm6			; mm3 = error_1:error_0
137	psrad	mm6, 31				; mm6 = per-lane sign mask (0 or -1)
138	pxor	mm7, mm6
139	psubd	mm7, mm6			; mm7 = abs(error_1):abs(error_0)
140	paddd	mm0, mm7			; mm0 = total_error_1:total_error_0
141	movq	mm6, mm5			; mm6 = error_2:
142	psubd	mm5, mm4			; mm5 = error_3:
143	punpckhdq	mm5, mm6		; mm5 = error_2:error_3
144	movq	mm7, mm5			; mm7 = error_2:error_3
145	movq	mm6, mm5			; mm6 = error_2:error_3
146	psubd	mm5, mm4			; mm5 = :error_4
	; mm4 is updated only after its old value has been consumed above
147	movq	mm4, mm6			; mm4 = error_2:error_3
148	psrad	mm6, 31				; mm6 = per-lane sign mask (0 or -1)
149	pxor	mm7, mm6
150	psubd	mm7, mm6			; mm7 = abs(error_2):abs(error_3)
151	paddd	mm1, mm7			; mm1 = total_error_2:total_error_3
152	movq	mm6, mm5			; mm6 = :error_4
153	psrad	mm5, 31				; mm5 = per-lane sign mask (0 or -1)
154	pxor	mm6, mm5
155	psubd	mm6, mm5			; mm6 = :abs(error_4)
	; the high lane of mm2 also accumulates a value; only the low lane is read
156	paddd	mm2, mm6			; mm2 = :total_error_4
157
	; ecx counts down from data_len to 0; ebx is the advancing sample pointer
158	dec	ecx
159	jnz	short .loop
160
161; 	if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
162; 		order = 0;
163; 	else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
164; 		order = 1;
165; 	else if(total_error_2 < min(total_error_3, total_error_4))
166; 		order = 2;
167; 	else if(total_error_3 < total_error_4)
168; 		order = 3;
169; 	else
170; 		order = 4;
	; Move the five totals into integer registers.  movd reads the low lane,
	; so the totals held in high lanes are first duplicated with punpckhdq.
171	movq	mm3, mm0			; mm3 = total_error_1:total_error_0
172	movd	edi, mm2			; edi = total_error_4
173	movd	esi, mm1			; esi = total_error_3
174	movd	eax, mm0			; eax = total_error_0
175	punpckhdq	mm1, mm1		; mm1 = total_error_2:total_error_2
176	punpckhdq	mm3, mm3		; mm3 = total_error_1:total_error_1
177	movd	edx, mm1			; edx = total_error_2
178	movd	ecx, mm3			; ecx = total_error_1
179
	; Branch-free selection: eax = running minimum (starts as total_error_0),
	; ebp = best order so far (starts at 0), ebx = candidate order 1..4.
	; Each cmp feeds both following cmovs (cmov leaves the flags untouched).
	; The order is taken on "below or equal", so a tie moves to the higher
	; order, which is what the strict '<' tests of the C code above produce.
	; The comparisons are unsigned (cmovb/cmovbe).
180	xor	ebx, ebx
181	xor	ebp, ebp
182	inc	ebx
183	cmp	ecx, eax
184	cmovb	eax, ecx			; eax = min(total_error_0, total_error_1)
185	cmovbe	ebp, ebx
186	inc	ebx
187	cmp	edx, eax
188	cmovb	eax, edx			; eax = min(total_error_0, total_error_1, total_error_2)
189	cmovbe	ebp, ebx
190	inc	ebx
191	cmp	esi, eax
192	cmovb	eax, esi			; eax = min(total_error_0, total_error_1, total_error_2, total_error_3)
193	cmovbe	ebp, ebx
194	inc	ebx
195	cmp	edi, eax
196	cmovb	eax, edi			; eax = min(total_error_0, total_error_1, total_error_2, total_error_3, total_error_4)
197	cmovbe	ebp, ebx
	; eax no longer holds total_error_0 (it became the minimum), so the value
	; is re-read from mm0 into ebx, which is free again.
198	movd	ebx, mm0			; ebx = total_error_0
	; leave MMX state before the x87 instructions below (the MMX registers
	; alias the x87 register stack)
199	emms
200
201	; 	residual_bits_per_sample[0] = (FLAC__float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
202	; 	residual_bits_per_sample[1] = (FLAC__float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
203	; 	residual_bits_per_sample[2] = (FLAC__float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
204	; 	residual_bits_per_sample[3] = (FLAC__float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
205	; 	residual_bits_per_sample[4] = (FLAC__float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
	; From here to .rbps_end:
	;   eax = 0 throughout.  It is stored as the upper dword of each total
	;         (zero-extending it to 64 bits so the signed fild qword sees a
	;         non-negative value) and as the bit pattern of 0.0f.
	;   ebx = total_error_0 on entry to .rbps_0, then the
	;         residual_bits_per_sample pointer (loaded on both .rbps_0 paths)
	;         that the later stanzas index with [ebx + 4*k].
	;   data_len stays on the x87 stack until .rbps_end.
	; In the "ST = ..." comments the x87 stack is listed top (st0) first.
	; The five stanzas are identical apart from the source register and the
	; store offset; fyl2x computes st1 * log2(st0) and pops, with st1 = 1.0.
206	xor	eax, eax
207	fild	dword [esp + 40]		; ST = data_len (NOTE: assumes data_len is <2gigs)
208.rbps_0:
209	test	ebx, ebx
210	jz	.total_error_0_is_0
211	fld1					; ST = 1.0 data_len
212	mov	[esp], ebx
213	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_0
	; total_error_0 is now in memory, so ebx can take over as the output pointer
214	mov	ebx, [esp + 44]
215	fild	qword [esp]			; ST = total_error_0 1.0 data_len
216	fdiv	st2				; ST = total_error_0/data_len 1.0 data_len
217	fldln2					; ST = ln2 total_error_0/data_len 1.0 data_len
218	fmulp	st1				; ST = ln2*total_error_0/data_len 1.0 data_len
219	fyl2x					; ST = log2(ln2*total_error_0/data_len) data_len
220	fstp	dword [ebx]			; residual_bits_per_sample[0] = log2(ln2*total_error_0/data_len)   ST = data_len
221	jmp	short .rbps_1
222.total_error_0_is_0:
	; the zero path must load the output pointer too: .rbps_1 onward use ebx
223	mov	ebx, [esp + 44]
224	mov	[ebx], eax			; residual_bits_per_sample[0] = 0.0
225.rbps_1:
226	test	ecx, ecx
227	jz	.total_error_1_is_0
228	fld1					; ST = 1.0 data_len
229	mov	[esp], ecx
230	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_1
231	fild	qword [esp]			; ST = total_error_1 1.0 data_len
232	fdiv	st2				; ST = total_error_1/data_len 1.0 data_len
233	fldln2					; ST = ln2 total_error_1/data_len 1.0 data_len
234	fmulp	st1				; ST = ln2*total_error_1/data_len 1.0 data_len
235	fyl2x					; ST = log2(ln2*total_error_1/data_len) data_len
236	fstp	dword [ebx + 4]			; residual_bits_per_sample[1] = log2(ln2*total_error_1/data_len)   ST = data_len
237	jmp	short .rbps_2
238.total_error_1_is_0:
239	mov	[ebx + 4], eax			; residual_bits_per_sample[1] = 0.0
240.rbps_2:
241	test	edx, edx
242	jz	.total_error_2_is_0
243	fld1					; ST = 1.0 data_len
244	mov	[esp], edx
245	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_2
246	fild	qword [esp]			; ST = total_error_2 1.0 data_len
247	fdiv	st2				; ST = total_error_2/data_len 1.0 data_len
248	fldln2					; ST = ln2 total_error_2/data_len 1.0 data_len
249	fmulp	st1				; ST = ln2*total_error_2/data_len 1.0 data_len
250	fyl2x					; ST = log2(ln2*total_error_2/data_len) data_len
251	fstp	dword [ebx + 8]			; residual_bits_per_sample[2] = log2(ln2*total_error_2/data_len)   ST = data_len
252	jmp	short .rbps_3
253.total_error_2_is_0:
254	mov	[ebx + 8], eax			; residual_bits_per_sample[2] = 0.0
255.rbps_3:
256	test	esi, esi
257	jz	.total_error_3_is_0
258	fld1					; ST = 1.0 data_len
259	mov	[esp], esi
260	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_3
261	fild	qword [esp]			; ST = total_error_3 1.0 data_len
262	fdiv	st2				; ST = total_error_3/data_len 1.0 data_len
263	fldln2					; ST = ln2 total_error_3/data_len 1.0 data_len
264	fmulp	st1				; ST = ln2*total_error_3/data_len 1.0 data_len
265	fyl2x					; ST = log2(ln2*total_error_3/data_len) data_len
266	fstp	dword [ebx + 12]		; residual_bits_per_sample[3] = log2(ln2*total_error_3/data_len)   ST = data_len
267	jmp	short .rbps_4
268.total_error_3_is_0:
269	mov	[ebx + 12], eax			; residual_bits_per_sample[3] = 0.0
270.rbps_4:
271	test	edi, edi
272	jz	.total_error_4_is_0
273	fld1					; ST = 1.0 data_len
274	mov	[esp], edi
275	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_4
276	fild	qword [esp]			; ST = total_error_4 1.0 data_len
277	fdiv	st2				; ST = total_error_4/data_len 1.0 data_len
278	fldln2					; ST = ln2 total_error_4/data_len 1.0 data_len
279	fmulp	st1				; ST = ln2*total_error_4/data_len 1.0 data_len
280	fyl2x					; ST = log2(ln2*total_error_4/data_len) data_len
281	fstp	dword [ebx + 16]		; residual_bits_per_sample[4] = log2(ln2*total_error_4/data_len)   ST = data_len
282	jmp	short .rbps_end
283.total_error_4_is_0:
284	mov	[ebx + 16], eax			; residual_bits_per_sample[4] = 0.0
285.rbps_end:
	; discard data_len so the x87 stack is balanced on return
286	fstp	st0				; ST = [empty]
287	jmp	short .end
288.data_len_is_0:
289	; data_len == 0, so residual_bits_per_sample[*] = 0.0
	; ebp = 0 serves as the all-zero bit pattern of 0.0f for the five stores;
	; no MMX or x87 instruction has been executed on this path.
290	xor	ebp, ebp
291	mov	edi, [esp + 44]
292	mov	[edi], ebp
293	mov	[edi + 4], ebp
294	mov	[edi + 8], ebp
295	mov	[edi + 12], ebp
296	mov	[edi + 16], ebp
	; with every total equal to 0 none of the strict '<' tests in the C
	; reference succeeds, so its final else (order 4) applies
297	add	ebp, byte 4			; order = 4
298
	; Common exit: ebp holds the chosen order on both paths.  Release the
	; 16 bytes of locals and restore the saved registers in reverse push order.
299.end:
300	mov	eax, ebp			; return order
301	add	esp, byte 16
302	pop	edi
303	pop	esi
304	pop	ebx
305	pop	ebp
306	ret
307
; NOTE(review): `end` is not defined in this file; presumably it is provided by
; nasm.h or acts as a bare end-of-module label - confirm against nasm.h.
308end
309
; Only when OBJ_FORMAT_elf is defined: emit an empty, non-allocated
; .note.GNU-stack section.  By GNU toolchain convention this marks the object
; as not requiring an executable stack.
310%ifdef OBJ_FORMAT_elf
311       section .note.GNU-stack noalloc
312%endif
313