1# SIMD MMX dot product 2# Equivalent to the following C code: 3# long dotprod(signed short *a,signed short *b,int cnt) 4# { 5# long sum = 0; 6# cnt *= 4; 7# while(cnt--) 8# sum += *a++ + *b++; 9# return sum; 10# } 11# a and b should also be 64-bit aligned, or speed will suffer greatly 12# Copyright 1999, Phil Karn KA9Q 13# May be used under the terms of the GNU Lesser General Public License (LGPL) 14 15 .text 16 .global dotprod_mmx_assist 17 .type dotprod_mmx_assist,@function 18dotprod_mmx_assist: 19 pushl %ebp 20 movl %esp,%ebp 21 pushl %esi 22 pushl %edi 23 pushl %ecx 24 pushl %ebx 25 movl 8(%ebp),%esi # a 26 movl 12(%ebp),%edi # b 27 movl 16(%ebp),%ecx # cnt 28 pxor %mm0,%mm0 # clear running sum (in two 32-bit halves) 29 30# MMX dot product loop unrolled 4 times, crunching 16 terms per loop 31 .align 16 32.Loop1: subl $4,%ecx 33 jl .Loop1Done 34 35 movq (%esi),%mm1 # mm1 = a[3],a[2],a[1],a[0] 36 pmaddwd (%edi),%mm1 # mm1 = b[3]*a[3]+b[2]*a[2],b[1]*a[1]+b[0]*a[0] 37 paddd %mm1,%mm0 38 39 movq 8(%esi),%mm1 40 pmaddwd 8(%edi),%mm1 41 paddd %mm1,%mm0 42 43 movq 16(%esi),%mm1 44 pmaddwd 16(%edi),%mm1 45 paddd %mm1,%mm0 46 47 movq 24(%esi),%mm1 48 addl $32,%esi 49 pmaddwd 24(%edi),%mm1 50 addl $32,%edi 51 paddd %mm1,%mm0 52 53 jmp .Loop1 54.Loop1Done: 55 56 addl $4,%ecx 57 58# MMX dot product loop, not unrolled, crunching 4 terms per loop 59# This could be redone as Duff's Device on the unrolled loop above 60.Loop2: subl $1,%ecx 61 jl .Loop2Done 62 63 movq (%esi),%mm1 64 addl $8,%esi 65 pmaddwd (%edi),%mm1 66 addl $8,%edi 67 paddd %mm1,%mm0 68 jmp .Loop2 69.Loop2Done: 70 71 movd %mm0,%ebx # right-hand word to ebx 72 punpckhdq %mm0,%mm0 # left-hand word to right side of %mm0 73 movd %mm0,%eax 74 addl %ebx,%eax # running sum now in %eax 75 emms # done with MMX 76 77 popl %ebx 78 popl %ecx 79 popl %edi 80 popl %esi 81 movl %ebp,%esp 82 popl %ebp 83 ret 84