/***********************************************************************
Copyright (c) 2006-2011, Skype Limited. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/
27 
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31 #include "API.h"
32 #include "main.h"
33 #include "stack_alloc.h"
34 #include "os_support.h"
35 
36 /************************/
37 /* Decoder Super Struct */
38 /************************/
39 typedef struct {
40     silk_decoder_state          channel_state[ DECODER_NUM_CHANNELS ];
41     stereo_dec_state                sStereo;
42     opus_int                         nChannelsAPI;
43     opus_int                         nChannelsInternal;
44     opus_int                         prev_decode_only_middle;
45 } silk_decoder;
46 
/*********************/
/* Decoder functions */
/*********************/
50 
silk_Get_Decoder_Size(opus_int * decSizeBytes)51 opus_int silk_Get_Decoder_Size(                         /* O    Returns error code                              */
52     opus_int                        *decSizeBytes       /* O    Number of bytes in SILK decoder state           */
53 )
54 {
55     opus_int ret = SILK_NO_ERROR;
56 
57     *decSizeBytes = sizeof( silk_decoder );
58 
59     return ret;
60 }
61 
62 /* Reset decoder state */
silk_InitDecoder(void * decState)63 opus_int silk_InitDecoder(                              /* O    Returns error code                              */
64     void                            *decState           /* I/O  State                                           */
65 )
66 {
67     opus_int n, ret = SILK_NO_ERROR;
68     silk_decoder_state *channel_state = ((silk_decoder *)decState)->channel_state;
69 
70     for( n = 0; n < DECODER_NUM_CHANNELS; n++ ) {
71         ret  = silk_init_decoder( &channel_state[ n ] );
72     }
73     silk_memset(&((silk_decoder *)decState)->sStereo, 0, sizeof(((silk_decoder *)decState)->sStereo));
74     /* Not strictly needed, but it's cleaner that way */
75     ((silk_decoder *)decState)->prev_decode_only_middle = 0;
76 
77     return ret;
78 }
79 
80 /* Decode a frame */
silk_Decode(void * decState,silk_DecControlStruct * decControl,opus_int lostFlag,opus_int newPacketFlag,ec_dec * psRangeDec,opus_int16 * samplesOut,opus_int32 * nSamplesOut,int arch)81 opus_int silk_Decode(                                   /* O    Returns error code                              */
82     void*                           decState,           /* I/O  State                                           */
83     silk_DecControlStruct*          decControl,         /* I/O  Control Structure                               */
84     opus_int                        lostFlag,           /* I    0: no loss, 1 loss, 2 decode fec                */
85     opus_int                        newPacketFlag,      /* I    Indicates first decoder call for this packet    */
86     ec_dec                          *psRangeDec,        /* I/O  Compressor data structure                       */
87     opus_int16                      *samplesOut,        /* O    Decoded output speech vector                    */
88     opus_int32                      *nSamplesOut,       /* O    Number of samples decoded                       */
89     int                             arch                /* I    Run-time architecture                           */
90 )
91 {
92     opus_int   i, n, decode_only_middle = 0, ret = SILK_NO_ERROR;
93     opus_int32 nSamplesOutDec, LBRR_symbol;
94     opus_int16 *samplesOut1_tmp[ 2 ];
95     VARDECL( opus_int16, samplesOut1_tmp_storage1 );
96     VARDECL( opus_int16, samplesOut1_tmp_storage2 );
97     VARDECL( opus_int16, samplesOut2_tmp );
98     opus_int32 MS_pred_Q13[ 2 ] = { 0 };
99     opus_int16 *resample_out_ptr;
100     silk_decoder *psDec = ( silk_decoder * )decState;
101     silk_decoder_state *channel_state = psDec->channel_state;
102     opus_int has_side;
103     opus_int stereo_to_mono;
104     int delay_stack_alloc;
105     SAVE_STACK;
106 
107     celt_assert( decControl->nChannelsInternal == 1 || decControl->nChannelsInternal == 2 );
108 
109     /**********************************/
110     /* Test if first frame in payload */
111     /**********************************/
112     if( newPacketFlag ) {
113         for( n = 0; n < decControl->nChannelsInternal; n++ ) {
114             channel_state[ n ].nFramesDecoded = 0;  /* Used to count frames in packet */
115         }
116     }
117 
118     /* If Mono -> Stereo transition in bitstream: init state of second channel */
119     if( decControl->nChannelsInternal > psDec->nChannelsInternal ) {
120         ret += silk_init_decoder( &channel_state[ 1 ] );
121     }
122 
123     stereo_to_mono = decControl->nChannelsInternal == 1 && psDec->nChannelsInternal == 2 &&
124                      ( decControl->internalSampleRate == 1000*channel_state[ 0 ].fs_kHz );
125 
126     if( channel_state[ 0 ].nFramesDecoded == 0 ) {
127         for( n = 0; n < decControl->nChannelsInternal; n++ ) {
128             opus_int fs_kHz_dec;
129             if( decControl->payloadSize_ms == 0 ) {
130                 /* Assuming packet loss, use 10 ms */
131                 channel_state[ n ].nFramesPerPacket = 1;
132                 channel_state[ n ].nb_subfr = 2;
133             } else if( decControl->payloadSize_ms == 10 ) {
134                 channel_state[ n ].nFramesPerPacket = 1;
135                 channel_state[ n ].nb_subfr = 2;
136             } else if( decControl->payloadSize_ms == 20 ) {
137                 channel_state[ n ].nFramesPerPacket = 1;
138                 channel_state[ n ].nb_subfr = 4;
139             } else if( decControl->payloadSize_ms == 40 ) {
140                 channel_state[ n ].nFramesPerPacket = 2;
141                 channel_state[ n ].nb_subfr = 4;
142             } else if( decControl->payloadSize_ms == 60 ) {
143                 channel_state[ n ].nFramesPerPacket = 3;
144                 channel_state[ n ].nb_subfr = 4;
145             } else {
146                 celt_assert( 0 );
147                 RESTORE_STACK;
148                 return SILK_DEC_INVALID_FRAME_SIZE;
149             }
150             fs_kHz_dec = ( decControl->internalSampleRate >> 10 ) + 1;
151             if( fs_kHz_dec != 8 && fs_kHz_dec != 12 && fs_kHz_dec != 16 ) {
152                 celt_assert( 0 );
153                 RESTORE_STACK;
154                 return SILK_DEC_INVALID_SAMPLING_FREQUENCY;
155             }
156             ret += silk_decoder_set_fs( &channel_state[ n ], fs_kHz_dec, decControl->API_sampleRate );
157         }
158     }
159 
160     if( decControl->nChannelsAPI == 2 && decControl->nChannelsInternal == 2 && ( psDec->nChannelsAPI == 1 || psDec->nChannelsInternal == 1 ) ) {
161         silk_memset( psDec->sStereo.pred_prev_Q13, 0, sizeof( psDec->sStereo.pred_prev_Q13 ) );
162         silk_memset( psDec->sStereo.sSide, 0, sizeof( psDec->sStereo.sSide ) );
163         silk_memcpy( &channel_state[ 1 ].resampler_state, &channel_state[ 0 ].resampler_state, sizeof( silk_resampler_state_struct ) );
164     }
165     psDec->nChannelsAPI      = decControl->nChannelsAPI;
166     psDec->nChannelsInternal = decControl->nChannelsInternal;
167 
168     if( decControl->API_sampleRate > (opus_int32)MAX_API_FS_KHZ * 1000 || decControl->API_sampleRate < 8000 ) {
169         ret = SILK_DEC_INVALID_SAMPLING_FREQUENCY;
170         RESTORE_STACK;
171         return( ret );
172     }
173 
174     if( lostFlag != FLAG_PACKET_LOST && channel_state[ 0 ].nFramesDecoded == 0 ) {
175         /* First decoder call for this payload */
176         /* Decode VAD flags and LBRR flag */
177         for( n = 0; n < decControl->nChannelsInternal; n++ ) {
178             for( i = 0; i < channel_state[ n ].nFramesPerPacket; i++ ) {
179                 channel_state[ n ].VAD_flags[ i ] = ec_dec_bit_logp(psRangeDec, 1);
180             }
181             channel_state[ n ].LBRR_flag = ec_dec_bit_logp(psRangeDec, 1);
182         }
183         /* Decode LBRR flags */
184         for( n = 0; n < decControl->nChannelsInternal; n++ ) {
185             silk_memset( channel_state[ n ].LBRR_flags, 0, sizeof( channel_state[ n ].LBRR_flags ) );
186             if( channel_state[ n ].LBRR_flag ) {
187                 if( channel_state[ n ].nFramesPerPacket == 1 ) {
188                     channel_state[ n ].LBRR_flags[ 0 ] = 1;
189                 } else {
190                     LBRR_symbol = ec_dec_icdf( psRangeDec, silk_LBRR_flags_iCDF_ptr[ channel_state[ n ].nFramesPerPacket - 2 ], 8 ) + 1;
191                     for( i = 0; i < channel_state[ n ].nFramesPerPacket; i++ ) {
192                         channel_state[ n ].LBRR_flags[ i ] = silk_RSHIFT( LBRR_symbol, i ) & 1;
193                     }
194                 }
195             }
196         }
197 
198         if( lostFlag == FLAG_DECODE_NORMAL ) {
199             /* Regular decoding: skip all LBRR data */
200             for( i = 0; i < channel_state[ 0 ].nFramesPerPacket; i++ ) {
201                 for( n = 0; n < decControl->nChannelsInternal; n++ ) {
202                     if( channel_state[ n ].LBRR_flags[ i ] ) {
203                         opus_int16 pulses[ MAX_FRAME_LENGTH ];
204                         opus_int condCoding;
205 
206                         if( decControl->nChannelsInternal == 2 && n == 0 ) {
207                             silk_stereo_decode_pred( psRangeDec, MS_pred_Q13 );
208                             if( channel_state[ 1 ].LBRR_flags[ i ] == 0 ) {
209                                 silk_stereo_decode_mid_only( psRangeDec, &decode_only_middle );
210                             }
211                         }
212                         /* Use conditional coding if previous frame available */
213                         if( i > 0 && channel_state[ n ].LBRR_flags[ i - 1 ] ) {
214                             condCoding = CODE_CONDITIONALLY;
215                         } else {
216                             condCoding = CODE_INDEPENDENTLY;
217                         }
218                         silk_decode_indices( &channel_state[ n ], psRangeDec, i, 1, condCoding );
219                         silk_decode_pulses( psRangeDec, pulses, channel_state[ n ].indices.signalType,
220                             channel_state[ n ].indices.quantOffsetType, channel_state[ n ].frame_length );
221                     }
222                 }
223             }
224         }
225     }
226 
227     /* Get MS predictor index */
228     if( decControl->nChannelsInternal == 2 ) {
229         if(   lostFlag == FLAG_DECODE_NORMAL ||
230             ( lostFlag == FLAG_DECODE_LBRR && channel_state[ 0 ].LBRR_flags[ channel_state[ 0 ].nFramesDecoded ] == 1 ) )
231         {
232             silk_stereo_decode_pred( psRangeDec, MS_pred_Q13 );
233             /* For LBRR data, decode mid-only flag only if side-channel's LBRR flag is false */
234             if( ( lostFlag == FLAG_DECODE_NORMAL && channel_state[ 1 ].VAD_flags[ channel_state[ 0 ].nFramesDecoded ] == 0 ) ||
235                 ( lostFlag == FLAG_DECODE_LBRR && channel_state[ 1 ].LBRR_flags[ channel_state[ 0 ].nFramesDecoded ] == 0 ) )
236             {
237                 silk_stereo_decode_mid_only( psRangeDec, &decode_only_middle );
238             } else {
239                 decode_only_middle = 0;
240             }
241         } else {
242             for( n = 0; n < 2; n++ ) {
243                 MS_pred_Q13[ n ] = psDec->sStereo.pred_prev_Q13[ n ];
244             }
245         }
246     }
247 
248     /* Reset side channel decoder prediction memory for first frame with side coding */
249     if( decControl->nChannelsInternal == 2 && decode_only_middle == 0 && psDec->prev_decode_only_middle == 1 ) {
250         silk_memset( psDec->channel_state[ 1 ].outBuf, 0, sizeof(psDec->channel_state[ 1 ].outBuf) );
251         silk_memset( psDec->channel_state[ 1 ].sLPC_Q14_buf, 0, sizeof(psDec->channel_state[ 1 ].sLPC_Q14_buf) );
252         psDec->channel_state[ 1 ].lagPrev        = 100;
253         psDec->channel_state[ 1 ].LastGainIndex  = 10;
254         psDec->channel_state[ 1 ].prevSignalType = TYPE_NO_VOICE_ACTIVITY;
255         psDec->channel_state[ 1 ].first_frame_after_reset = 1;
256     }
257 
258     /* Check if the temp buffer fits into the output PCM buffer. If it fits,
259        we can delay allocating the temp buffer until after the SILK peak stack
260        usage. We need to use a < and not a <= because of the two extra samples. */
261     delay_stack_alloc = decControl->internalSampleRate*decControl->nChannelsInternal
262           < decControl->API_sampleRate*decControl->nChannelsAPI;
263     ALLOC( samplesOut1_tmp_storage1, delay_stack_alloc ? ALLOC_NONE
264            : decControl->nChannelsInternal*(channel_state[ 0 ].frame_length + 2 ),
265            opus_int16 );
266     if ( delay_stack_alloc )
267     {
268        samplesOut1_tmp[ 0 ] = samplesOut;
269        samplesOut1_tmp[ 1 ] = samplesOut + channel_state[ 0 ].frame_length + 2;
270     } else {
271        samplesOut1_tmp[ 0 ] = samplesOut1_tmp_storage1;
272        samplesOut1_tmp[ 1 ] = samplesOut1_tmp_storage1 + channel_state[ 0 ].frame_length + 2;
273     }
274 
275     if( lostFlag == FLAG_DECODE_NORMAL ) {
276         has_side = !decode_only_middle;
277     } else {
278         has_side = !psDec->prev_decode_only_middle
279               || (decControl->nChannelsInternal == 2 && lostFlag == FLAG_DECODE_LBRR && channel_state[1].LBRR_flags[ channel_state[1].nFramesDecoded ] == 1 );
280     }
281     /* Call decoder for one frame */
282     for( n = 0; n < decControl->nChannelsInternal; n++ ) {
283         if( n == 0 || has_side ) {
284             opus_int FrameIndex;
285             opus_int condCoding;
286 
287             FrameIndex = channel_state[ 0 ].nFramesDecoded - n;
288             /* Use independent coding if no previous frame available */
289             if( FrameIndex <= 0 ) {
290                 condCoding = CODE_INDEPENDENTLY;
291             } else if( lostFlag == FLAG_DECODE_LBRR ) {
292                 condCoding = channel_state[ n ].LBRR_flags[ FrameIndex - 1 ] ? CODE_CONDITIONALLY : CODE_INDEPENDENTLY;
293             } else if( n > 0 && psDec->prev_decode_only_middle ) {
294                 /* If we skipped a side frame in this packet, we don't
295                    need LTP scaling; the LTP state is well-defined. */
296                 condCoding = CODE_INDEPENDENTLY_NO_LTP_SCALING;
297             } else {
298                 condCoding = CODE_CONDITIONALLY;
299             }
300             ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 ], &nSamplesOutDec, lostFlag, condCoding, arch);
301         } else {
302             silk_memset( &samplesOut1_tmp[ n ][ 2 ], 0, nSamplesOutDec * sizeof( opus_int16 ) );
303         }
304         channel_state[ n ].nFramesDecoded++;
305     }
306 
307     if( decControl->nChannelsAPI == 2 && decControl->nChannelsInternal == 2 ) {
308         /* Convert Mid/Side to Left/Right */
309         silk_stereo_MS_to_LR( &psDec->sStereo, samplesOut1_tmp[ 0 ], samplesOut1_tmp[ 1 ], MS_pred_Q13, channel_state[ 0 ].fs_kHz, nSamplesOutDec );
310     } else {
311         /* Buffering */
312         silk_memcpy( samplesOut1_tmp[ 0 ], psDec->sStereo.sMid, 2 * sizeof( opus_int16 ) );
313         silk_memcpy( psDec->sStereo.sMid, &samplesOut1_tmp[ 0 ][ nSamplesOutDec ], 2 * sizeof( opus_int16 ) );
314     }
315 
316     /* Number of output samples */
317     *nSamplesOut = silk_DIV32( nSamplesOutDec * decControl->API_sampleRate, silk_SMULBB( channel_state[ 0 ].fs_kHz, 1000 ) );
318 
319     /* Set up pointers to temp buffers */
320     ALLOC( samplesOut2_tmp,
321            decControl->nChannelsAPI == 2 ? *nSamplesOut : ALLOC_NONE, opus_int16 );
322     if( decControl->nChannelsAPI == 2 ) {
323         resample_out_ptr = samplesOut2_tmp;
324     } else {
325         resample_out_ptr = samplesOut;
326     }
327 
328     ALLOC( samplesOut1_tmp_storage2, delay_stack_alloc
329            ? decControl->nChannelsInternal*(channel_state[ 0 ].frame_length + 2 )
330            : ALLOC_NONE,
331            opus_int16 );
332     if ( delay_stack_alloc ) {
333        OPUS_COPY(samplesOut1_tmp_storage2, samplesOut, decControl->nChannelsInternal*(channel_state[ 0 ].frame_length + 2));
334        samplesOut1_tmp[ 0 ] = samplesOut1_tmp_storage2;
335        samplesOut1_tmp[ 1 ] = samplesOut1_tmp_storage2 + channel_state[ 0 ].frame_length + 2;
336     }
337     for( n = 0; n < silk_min( decControl->nChannelsAPI, decControl->nChannelsInternal ); n++ ) {
338 
339         /* Resample decoded signal to API_sampleRate */
340         ret += silk_resampler( &channel_state[ n ].resampler_state, resample_out_ptr, &samplesOut1_tmp[ n ][ 1 ], nSamplesOutDec );
341 
342         /* Interleave if stereo output and stereo stream */
343         if( decControl->nChannelsAPI == 2 ) {
344             for( i = 0; i < *nSamplesOut; i++ ) {
345                 samplesOut[ n + 2 * i ] = resample_out_ptr[ i ];
346             }
347         }
348     }
349 
350     /* Create two channel output from mono stream */
351     if( decControl->nChannelsAPI == 2 && decControl->nChannelsInternal == 1 ) {
352         if ( stereo_to_mono ){
353             /* Resample right channel for newly collapsed stereo just in case
354                we weren't doing collapsing when switching to mono */
355             ret += silk_resampler( &channel_state[ 1 ].resampler_state, resample_out_ptr, &samplesOut1_tmp[ 0 ][ 1 ], nSamplesOutDec );
356 
357             for( i = 0; i < *nSamplesOut; i++ ) {
358                 samplesOut[ 1 + 2 * i ] = resample_out_ptr[ i ];
359             }
360         } else {
361             for( i = 0; i < *nSamplesOut; i++ ) {
362                 samplesOut[ 1 + 2 * i ] = samplesOut[ 0 + 2 * i ];
363             }
364         }
365     }
366 
367     /* Export pitch lag, measured at 48 kHz sampling rate */
368     if( channel_state[ 0 ].prevSignalType == TYPE_VOICED ) {
369         int mult_tab[ 3 ] = { 6, 4, 3 };
370         decControl->prevPitchLag = channel_state[ 0 ].lagPrev * mult_tab[ ( channel_state[ 0 ].fs_kHz - 8 ) >> 2 ];
371     } else {
372         decControl->prevPitchLag = 0;
373     }
374 
375     if( lostFlag == FLAG_PACKET_LOST ) {
376        /* On packet loss, remove the gain clamping to prevent having the energy "bounce back"
377           if we lose packets when the energy is going down */
378        for ( i = 0; i < psDec->nChannelsInternal; i++ )
379           psDec->channel_state[ i ].LastGainIndex = 10;
380     } else {
381        psDec->prev_decode_only_middle = decode_only_middle;
382     }
383     RESTORE_STACK;
384     return ret;
385 }
386 
#if 0
/* Getting table of contents for a packet */
/* Disabled code, kept for reference.                                      */
/* Reads the VAD flags and the in-band FEC flag from the top               */
/* ( nFramesPerPayload + 1 ) bits of the first payload byte.               */
/* Returns -1 if there is no input byte or nFramesPerPayload is outside    */
/* 0..3, otherwise SILK_NO_ERROR.                                          */
opus_int silk_get_TOC(
    const opus_uint8                *payload,           /* I    Payload data                                */
    const opus_int                  nBytesIn,           /* I    Number of input bytes                       */
    const opus_int                  nFramesPerPayload,  /* I    Number of SILK frames per payload           */
    silk_TOC_struct                 *Silk_TOC           /* O    Type of content                             */
)
{
    opus_int i, flags, ret = SILK_NO_ERROR;

    if( nBytesIn < 1 ) {
        return -1;
    }
    if( nFramesPerPayload < 0 || nFramesPerPayload > 3 ) {
        return -1;
    }

    silk_memset( Silk_TOC, 0, sizeof( *Silk_TOC ) );

    /* For stereo, extract the flags for the mid channel */
    flags = silk_RSHIFT( payload[ 0 ], 7 - nFramesPerPayload ) & ( silk_LSHIFT( 1, nFramesPerPayload + 1 ) - 1 );

    /* Lowest extracted bit is the FEC flag; the bits above it are the   */
    /* per-frame VAD flags, with the last frame in the lowest position   */
    Silk_TOC->inbandFECFlag = flags & 1;
    for( i = nFramesPerPayload - 1; i >= 0 ; i-- ) {
        flags = silk_RSHIFT( flags, 1 );
        Silk_TOC->VADFlags[ i ] = flags & 1;
        /* Aggregate flag: set if any frame has voice activity */
        Silk_TOC->VADFlag |= flags & 1;
    }

    return ret;
}
#endif
420