1# SIMD SSE2 dot product
2# Equivalent to the following C code:
3# long dotprod(signed short *a,signed short *b,int cnt)
4# {
5#	long sum = 0;
6#	cnt *= 8;
7#	while(cnt--)
#		sum += *a++ * *b++;
9#	return sum;
10# }
11# a and b must be 128-bit aligned
12# Copyright 2001, Phil Karn KA9Q
13# May be used under the terms of the GNU Lesser General Public License (LGPL)
14
15	.text
16	.global dotprod_sse2_assist
17	.type dotprod_sse2_assist,@function
18dotprod_sse2_assist:
19	pushl %ebp
20	movl %esp,%ebp
21	pushl %esi
22	pushl %edi
23	pushl %ecx
24	pushl %ebx
25	movl 8(%ebp),%esi	# a
26	movl 12(%ebp),%edi	# b
27	movl 16(%ebp),%ecx	# cnt
28	pxor %xmm0,%xmm0		# clear running sum (in two 32-bit halves)
29
30# SSE2 dot product loop unrolled 4 times, crunching 32 terms per loop
31	.align 16
32.Loop1:	subl $4,%ecx
33	jl   .Loop1Done
34
35	movdqa (%esi),%xmm1
36 	pmaddwd (%edi),%xmm1
37	paddd %xmm1,%xmm0
38
39	movdqa 16(%esi),%xmm1
40	pmaddwd 16(%edi),%xmm1
41	paddd %xmm1,%xmm0
42
43	movdqa 32(%esi),%xmm1
44	pmaddwd 32(%edi),%xmm1
45	paddd %xmm1,%xmm0
46
47	movdqa 48(%esi),%xmm1
48	addl $64,%esi
49	pmaddwd 48(%edi),%xmm1
50	addl $64,%edi
51	paddd %xmm1,%xmm0
52
53	jmp .Loop1
54.Loop1Done:
55
56	addl $4,%ecx
57
58# SSE2 dot product loop, not unrolled, crunching 4 terms per loop
59# This could be redone as Duff's Device on the unrolled loop above
60.Loop2:	subl $1,%ecx
61	jl   .Loop2Done
62
63	movdqa (%esi),%xmm1
64	addl $16,%esi
65	pmaddwd (%edi),%xmm1
66	addl $16,%edi
67	paddd %xmm1,%xmm0
68	jmp .Loop2
69.Loop2Done:
70
71	movdqa %xmm0,%xmm1
72	psrldq $8,%xmm0
73	paddd %xmm1,%xmm0
74	movd %xmm0,%eax		# right-hand word to eax
75	psrldq $4,%xmm0
76	movd %xmm0,%ebx
77	addl %ebx,%eax
78
79	popl %ebx
80	popl %ecx
81	popl %edi
82	popl %esi
83	movl %ebp,%esp
84	popl %ebp
85	ret
86