1 /* K=15 r=1/6 Viterbi decoder for x86 SSE2
2  * Copyright Mar 2004, Phil Karn, KA9Q
3  * May be used under the terms of the GNU Lesser General Public License (LGPL)
4  */
5 #include <emmintrin.h>
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <memory.h>
9 #include <limits.h>
10 #include "fec.h"
11 
12 typedef union { unsigned long w[8]; unsigned short s[16];} decision_t;
13 typedef union { signed short s[256]; __m128i v[32];} metric_t;
14 
15 static union branchtab39 { unsigned short s[128]; __m128i v[16];} Branchtab39[3];
16 static int Init = 0;
17 
18 /* State info for instance of Viterbi decoder */
19 struct v39 {
20   metric_t metrics1; /* path metric buffer 1 */
21   metric_t metrics2; /* path metric buffer 2 */
22   void *dp;          /* Pointer to current decision */
23   metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
24   void *decisions;   /* Beginning of decisions for block */
25 };
26 
27 /* Initialize Viterbi decoder for start of new frame */
init_viterbi39_sse2(void * p,int starting_state)28 int init_viterbi39_sse2(void *p,int starting_state){
29   struct v39 *vp = p;
30   int i;
31 
32   for(i=0;i<256;i++)
33     vp->metrics1.s[i] = (SHRT_MIN+1000);
34 
35   vp->old_metrics = &vp->metrics1;
36   vp->new_metrics = &vp->metrics2;
37   vp->dp = vp->decisions;
38   vp->old_metrics->s[starting_state & 255] = SHRT_MIN; /* Bias known start state */
39   return 0;
40 }
41 
42 /* Create a new instance of a Viterbi decoder */
create_viterbi39_sse2(int len)43 void *create_viterbi39_sse2(int len){
44   void *p;
45   struct v39 *vp;
46 
47   if(!Init){
48     int polys[3] = { V39POLYA, V39POLYB, V39POLYC };
49 
50     set_viterbi39_polynomial_sse2(polys);
51   }
52   /* Ordinary malloc() only returns 8-byte alignment, we need 16 */
53   if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v39)))
54     return NULL;
55 
56   vp = (struct v39 *)p;
57   if((p = malloc((len+8)*sizeof(decision_t))) == NULL){
58     free(vp);
59     return NULL;
60   }
61   vp->decisions = (decision_t *)p;
62   init_viterbi39_sse2(vp,0);
63   return vp;
64 }
65 
set_viterbi39_polynomial_sse2(int polys[3])66 void set_viterbi39_polynomial_sse2(int polys[3]){
67   int state;
68 
69   for(state=0;state < 128;state++){
70     Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & polys[0]) ? 255:0;
71     Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & polys[1]) ? 255:0;
72     Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & polys[2]) ? 255:0;
73   }
74   Init++;
75 }
76 
77 /* Viterbi chainback */
chainback_viterbi39_sse2(void * p,unsigned char * data,unsigned int nbits,unsigned int endstate)78 int chainback_viterbi39_sse2(
79       void *p,
80       unsigned char *data, /* Decoded output data */
81       unsigned int nbits, /* Number of data bits */
82       unsigned int endstate){ /* Terminal encoder state */
83   struct v39 *vp = p;
84   decision_t *d = (decision_t *)vp->decisions;
85   int path_metric;
86 
87   endstate %= 256;
88 
89   path_metric = vp->old_metrics->s[endstate];
90 
91   /* The store into data[] only needs to be done every 8 bits.
92    * But this avoids a conditional branch, and the writes will
93    * combine in the cache anyway
94    */
95   d += 8; /* Look past tail */
96   while(nbits-- != 0){
97     int k;
98 
99     k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1;
100     endstate = (k << 7) | (endstate >> 1);
101     data[nbits>>3] = endstate;
102   }
103   return path_metric;
104 }
105 
106 /* Delete instance of a Viterbi decoder */
delete_viterbi39_sse2(void * p)107 void delete_viterbi39_sse2(void *p){
108   struct v39 *vp = p;
109 
110   if(vp != NULL){
111     free(vp->decisions);
112     free(vp);
113   }
114 }
115 
116 
update_viterbi39_blk_sse2(void * p,unsigned char * syms,int nbits)117 int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits){
118   struct v39 *vp = p;
119   decision_t *d = (decision_t *)vp->dp;
120   int path_metric = 0;
121 
122   while(nbits--){
123     __m128i sym0v,sym1v,sym2v;
124     void *tmp;
125     int i;
126 
127     /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
128     sym0v = _mm_set1_epi16(syms[0]);
129     sym1v = _mm_set1_epi16(syms[1]);
130     sym2v = _mm_set1_epi16(syms[2]);
131     syms += 3;
132 
133     /* SSE2 doesn't support saturated adds on unsigned shorts, so we have to use signed shorts */
134     for(i=0;i<16;i++){
135       __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
136 
137       /* Form branch metrics
138        * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
139        * the XOR operations constitute conditional negation.
140        * metric and m_metric (-metric) are in the range 0-765
141        */
142       m0 = _mm_add_epi16(_mm_xor_si128(Branchtab39[0].v[i],sym0v),_mm_xor_si128(Branchtab39[1].v[i],sym1v));
143       metric = _mm_add_epi16(_mm_xor_si128(Branchtab39[2].v[i],sym2v),m0);
144       m_metric = _mm_sub_epi16(_mm_set1_epi16(765),metric);
145 
146       /* Add branch metrics to path metrics */
147       m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric);
148       m3 = _mm_adds_epi16(vp->old_metrics->v[16+i],metric);
149       m1 = _mm_adds_epi16(vp->old_metrics->v[16+i],m_metric);
150       m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric);
151 
152       /* Compare and select */
153       survivor0 = _mm_min_epi16(m0,m1);
154       survivor1 = _mm_min_epi16(m2,m3);
155       decision0 = _mm_cmpeq_epi16(survivor0,m1);
156       decision1 = _mm_cmpeq_epi16(survivor1,m3);
157 
158       /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */
159       d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128())));
160 
161       /* Store surviving metrics */
162       vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1);
163       vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi16(survivor0,survivor1);
164     }
165     /* See if we need to renormalize */
166     if(vp->new_metrics->s[0] >= SHRT_MAX-5000){
167       int i,adjust;
168       __m128i adjustv;
169       union { __m128i v; signed short w[8]; } t;
170 
171       /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
172       adjustv = vp->new_metrics->v[0];
173       for(i=1;i<32;i++)
174 	adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]);
175 
176       adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8));
177       adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4));
178       adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2));
179       t.v = adjustv;
180       adjust = t.w[0] - SHRT_MIN;
181       path_metric += adjust;
182       adjustv = _mm_set1_epi16(adjust);
183 
184       /* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX
185        * This is okay since it can't overflow anyway
186        */
187       for(i=0;i<32;i++)
188 	vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv);
189     }
190     d++;
191     /* Swap pointers to old and new metrics */
192     tmp = vp->old_metrics;
193     vp->old_metrics = vp->new_metrics;
194     vp->new_metrics = tmp;
195   }
196   vp->dp = d;
197   return path_metric;
198 }
199 
200 
201