/* 16-bit signed integer dot product
 * Altivec-assisted version
 * Copyright 2004 Phil Karn
 * May be used under the terms of the GNU Lesser General Public License (LGPL)
 */
#include <stdint.h>
#include <stdlib.h>
#include "fec.h"
8 
/* Descriptor for a precomputed set of dot product coefficients */
struct dotprod {
  int len; /* Number of coefficients */

  /* On an Altivec machine, these hold 8 copies of the coefficients.
   * Copy i stores the coefficients starting at element i (i = 0..7), with
   * zeros in front, so that one copy lines up with every possible position
   * of the input data inside a 16-byte vector.
   */
  signed short *coeffs[8];
};
17 
18 /* Create and return a descriptor for use with the dot product function */
initdp_av(signed short coeffs[],int len)19 void *initdp_av(signed short coeffs[],int len){
20   struct dotprod *dp;
21   int i,j;
22 
23   if(len == 0)
24     return NULL;
25 
26   dp = (struct dotprod *)calloc(1,sizeof(struct dotprod));
27   dp->len = len;
28 
29   /* Make 8 copies of coefficients, one for each data alignment,
30    * each aligned to 16-byte boundary
31    */
32   for(i=0;i<8;i++){
33     dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short));
34     for(j=0;j<len;j++)
35       dp->coeffs[i][j+i] = coeffs[j];
36   }
37   return (void *)dp;
38 }
39 
40 
41 /* Free a dot product descriptor created earlier */
freedp_av(void * p)42 void freedp_av(void *p){
43   struct dotprod *dp = (struct dotprod *)p;
44   int i;
45 
46   for(i=0;i<8;i++)
47     if(dp->coeffs[i] != NULL)
48       free(dp->coeffs[i]);
49   free(dp);
50 }
51 
52 /* Compute a dot product given a descriptor and an input array
53  * The length is taken from the descriptor
54  */
dotprod_av(void * p,signed short a[])55 long dotprod_av(void *p,signed short a[]){
56   struct dotprod *dp = (struct dotprod *)p;
57   int al;
58   vector signed short *ar,*d;
59   vector signed int sums0,sums1,sums2,sums3;
60   union { vector signed int v; signed int w[4];} s;
61   int nblocks;
62 
63   /* round ar down to beginning of 16-byte block containing 0th element of
64    * input buffer. Then set d to one of 8 sets of shifted coefficients
65    */
66   ar = (vector signed short *)((int)a & ~15);
67   al = ((int)a & 15)/sizeof(signed short);
68   d = (vector signed short *)dp->coeffs[al];
69 
70   nblocks = (dp->len+al-1)/8+1;
71 
72   /* Sum into four vectors each holding four 32-bit partial sums */
73   sums3 = sums2 = sums1 = sums0 = (vector signed int)(0);
74   while(nblocks >= 4){
75     sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0);
76     sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1);
77     sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2);
78     sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3);
79     nblocks -= 4;
80   }
81   sums0 = vec_adds(sums0,sums1);
82   sums2 = vec_adds(sums2,sums3);
83   sums0 = vec_adds(sums0,sums2);
84   while(nblocks-- > 0){
85     sums0 = vec_msums(ar[nblocks],d[nblocks],sums0);
86   }
87   /* Sum 4 partial sums into final result */
88   s.v = vec_sums(sums0,(vector signed int)(0));
89 
90   return s.w[3];
91 }
92 
93 
94