1# SIMD MMX dot product
2# Equivalent to the following C code:
3# long dotprod(signed short *a,signed short *b,int cnt)
4# {
5#	long sum = 0;
6#	cnt *= 4;
7#	while(cnt--)
8#		sum += *a++ + *b++;
9#	return sum;
10# }
11# a and b should also be 64-bit aligned, or speed will suffer greatly
12# Copyright 1999, Phil Karn KA9Q
13# May be used under the terms of the GNU Lesser General Public License (LGPL)
14
15	.text
16	.global dotprod_mmx_assist
17	.type dotprod_mmx_assist,@function
18dotprod_mmx_assist:
19	pushl %ebp
20	movl %esp,%ebp
21	pushl %esi
22	pushl %edi
23	pushl %ecx
24	pushl %ebx
25	movl 8(%ebp),%esi	# a
26	movl 12(%ebp),%edi	# b
27	movl 16(%ebp),%ecx	# cnt
28	pxor %mm0,%mm0		# clear running sum (in two 32-bit halves)
29
30# MMX dot product loop unrolled 4 times, crunching 16 terms per loop
31	.align 16
32.Loop1:	subl $4,%ecx
33	jl   .Loop1Done
34
35	movq (%esi),%mm1	# mm1 = a[3],a[2],a[1],a[0]
36 	pmaddwd (%edi),%mm1	# mm1 = b[3]*a[3]+b[2]*a[2],b[1]*a[1]+b[0]*a[0]
37	paddd %mm1,%mm0
38
39	movq 8(%esi),%mm1
40	pmaddwd 8(%edi),%mm1
41	paddd %mm1,%mm0
42
43	movq 16(%esi),%mm1
44	pmaddwd 16(%edi),%mm1
45	paddd %mm1,%mm0
46
47	movq 24(%esi),%mm1
48	addl $32,%esi
49	pmaddwd 24(%edi),%mm1
50	addl $32,%edi
51	paddd %mm1,%mm0
52
53	jmp .Loop1
54.Loop1Done:
55
56	addl $4,%ecx
57
58# MMX dot product loop, not unrolled, crunching 4 terms per loop
59# This could be redone as Duff's Device on the unrolled loop above
60.Loop2:	subl $1,%ecx
61	jl   .Loop2Done
62
63	movq (%esi),%mm1
64	addl $8,%esi
65	pmaddwd (%edi),%mm1
66	addl $8,%edi
67	paddd %mm1,%mm0
68	jmp .Loop2
69.Loop2Done:
70
71	movd %mm0,%ebx		# right-hand word to ebx
72	punpckhdq %mm0,%mm0	# left-hand word to right side of %mm0
73	movd %mm0,%eax
74	addl %ebx,%eax		# running sum now in %eax
75	emms			# done with MMX
76
77	popl %ebx
78	popl %ecx
79	popl %edi
80	popl %esi
81	movl %ebp,%esp
82	popl %ebp
83	ret
84