/* Copyright (c) 2014, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include <xmmintrin.h>
33#include <emmintrin.h>
34#include <smmintrin.h>
35#include "main.h"
36#include "celt/x86/x86cpu.h"
37
38#include "stack_alloc.h"
39
40typedef struct {
41    opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ];
42    opus_int32 RandState[ DECISION_DELAY ];
43    opus_int32 Q_Q10[     DECISION_DELAY ];
44    opus_int32 Xq_Q14[    DECISION_DELAY ];
45    opus_int32 Pred_Q15[  DECISION_DELAY ];
46    opus_int32 Shape_Q14[ DECISION_DELAY ];
47    opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ];
48    opus_int32 LF_AR_Q14;
49    opus_int32 Seed;
50    opus_int32 SeedInit;
51    opus_int32 RD_Q10;
52} NSQ_del_dec_struct;
53
54typedef struct {
55    opus_int32 Q_Q10;
56    opus_int32 RD_Q10;
57    opus_int32 xq_Q14;
58    opus_int32 LF_AR_Q14;
59    opus_int32 sLTP_shp_Q14;
60    opus_int32 LPC_exc_Q14;
61} NSQ_sample_struct;
62
63typedef NSQ_sample_struct  NSQ_sample_pair[ 2 ];
64
65static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
66    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
67    silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
68    NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
69    const opus_int32    x_Q3[],                     /* I    Input in Q3                         */
70    opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
71    const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
72    opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
73    opus_int            subfr,                      /* I    Subframe number                     */
74    opus_int            nStatesDelayedDecision,     /* I    Number of del dec states            */
75    const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
76    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I                                        */
77    const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
78    const opus_int      signal_type,                /* I    Signal type                         */
79    const opus_int      decisionDelay               /* I    Decision delay                      */
80);
81
82/******************************************/
83/* Noise shape quantizer for one subframe */
84/******************************************/
85static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
86    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
87    NSQ_del_dec_struct  psDelDec[],             /* I/O  Delayed decision states             */
88    opus_int            signalType,             /* I    Signal type                         */
89    const opus_int32    x_Q10[],                /* I                                        */
90    opus_int8           pulses[],               /* O                                        */
91    opus_int16          xq[],                   /* O                                        */
92    opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
93    opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
94    const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
95    const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
96    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
97    opus_int            lag,                    /* I    Pitch lag                           */
98    opus_int32          HarmShapeFIRPacked_Q14, /* I                                        */
99    opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
100    opus_int32          LF_shp_Q14,             /* I                                        */
101    opus_int32          Gain_Q16,               /* I                                        */
102    opus_int            Lambda_Q10,             /* I                                        */
103    opus_int            offset_Q10,             /* I                                        */
104    opus_int            length,                 /* I    Input length                        */
105    opus_int            subfr,                  /* I    Subframe number                     */
106    opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
107    opus_int            predictLPCOrder,        /* I    Prediction filter order             */
108    opus_int            warping_Q16,            /* I                                        */
109    opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
110    opus_int            *smpl_buf_idx,          /* I    Index to newest samples in buffers  */
111    opus_int            decisionDelay           /* I                                        */
112);
113
114void silk_NSQ_del_dec_sse4_1(
115    const silk_encoder_state    *psEncC,                                    /* I/O  Encoder State                   */
116    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
117    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
118    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
119    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
120    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
121    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
122    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
123    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
124    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
125    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
126    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
127    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
128    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
129    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
130)
131{
132    opus_int            i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
133    opus_int            last_smple_idx, smpl_buf_idx, decisionDelay;
134    const opus_int16    *A_Q12, *B_Q14, *AR_shp_Q13;
135    opus_int16          *pxq;
136    VARDECL( opus_int32, sLTP_Q15 );
137    VARDECL( opus_int16, sLTP );
138    opus_int32          HarmShapeFIRPacked_Q14;
139    opus_int            offset_Q10;
140    opus_int32          RDmin_Q10, Gain_Q10;
141    VARDECL( opus_int32, x_sc_Q10 );
142    VARDECL( opus_int32, delayedGain_Q10 );
143    VARDECL( NSQ_del_dec_struct, psDelDec );
144    NSQ_del_dec_struct  *psDD;
145    SAVE_STACK;
146
147    /* Set unvoiced lag to the previous one, overwrite later for voiced */
148    lag = NSQ->lagPrev;
149
150    silk_assert( NSQ->prev_gain_Q16 != 0 );
151
152    /* Initialize delayed decision states */
153    ALLOC( psDelDec, psEncC->nStatesDelayedDecision, NSQ_del_dec_struct );
154    silk_memset( psDelDec, 0, psEncC->nStatesDelayedDecision * sizeof( NSQ_del_dec_struct ) );
155    for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
156        psDD                 = &psDelDec[ k ];
157        psDD->Seed           = ( k + psIndices->Seed ) & 3;
158        psDD->SeedInit       = psDD->Seed;
159        psDD->RD_Q10         = 0;
160        psDD->LF_AR_Q14      = NSQ->sLF_AR_shp_Q14;
161        psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];
162        silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
163        silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );
164    }
165
166    offset_Q10   = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
167    smpl_buf_idx = 0; /* index of oldest samples */
168
169    decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );
170
171    /* For voiced frames limit the decision delay to lower than the pitch lag */
172    if( psIndices->signalType == TYPE_VOICED ) {
173        for( k = 0; k < psEncC->nb_subfr; k++ ) {
174            decisionDelay = silk_min_int( decisionDelay, pitchL[ k ] - LTP_ORDER / 2 - 1 );
175        }
176    } else {
177        if( lag > 0 ) {
178            decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
179        }
180    }
181
182    if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
183        LSF_interpolation_flag = 0;
184    } else {
185        LSF_interpolation_flag = 1;
186    }
187
188    ALLOC( sLTP_Q15,
189           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
190    ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
191    ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
192    ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
193    /* Set up pointers to start of sub frame */
194    pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];
195    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
196    NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
197    subfr = 0;
198    for( k = 0; k < psEncC->nb_subfr; k++ ) {
199        A_Q12      = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
200        B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER           ];
201        AR_shp_Q13 = &AR2_Q13[     k * MAX_SHAPE_LPC_ORDER ];
202
203        /* Noise shape parameters */
204        silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
205        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
206        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
207
208        NSQ->rewhite_flag = 0;
209        if( psIndices->signalType == TYPE_VOICED ) {
210            /* Voiced */
211            lag = pitchL[ k ];
212
213            /* Re-whitening */
214            if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
215                if( k == 2 ) {
216                    /* RESET DELAYED DECISIONS */
217                    /* Find winner */
218                    RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
219                    Winner_ind = 0;
220                    for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
221                        if( psDelDec[ i ].RD_Q10 < RDmin_Q10 ) {
222                            RDmin_Q10 = psDelDec[ i ].RD_Q10;
223                            Winner_ind = i;
224                        }
225                    }
226                    for( i = 0; i < psEncC->nStatesDelayedDecision; i++ ) {
227                        if( i != Winner_ind ) {
228                            psDelDec[ i ].RD_Q10 += ( silk_int32_MAX >> 4 );
229                            silk_assert( psDelDec[ i ].RD_Q10 >= 0 );
230                        }
231                    }
232
233                    /* Copy final part of signals from winner state to output and long-term filter states */
234                    psDD = &psDelDec[ Winner_ind ];
235                    last_smple_idx = smpl_buf_idx + decisionDelay;
236                    for( i = 0; i < decisionDelay; i++ ) {
237                        last_smple_idx = ( last_smple_idx - 1 ) & DECISION_DELAY_MASK;
238                        pulses[   i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
239                        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
240                            silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gains_Q16[ 1 ] ), 14 ) );
241                        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
242                    }
243
244                    subfr = 0;
245                }
246
247                /* Rewhiten with new A coefs */
248                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
249                silk_assert( start_idx > 0 );
250
251                silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
252                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
253
254                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
255                NSQ->rewhite_flag = 1;
256            }
257        }
258
259        silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k,
260            psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
261
262        silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
263            delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
264            Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
265            psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
266
267        x_Q3   += psEncC->subfr_length;
268        pulses += psEncC->subfr_length;
269        pxq    += psEncC->subfr_length;
270    }
271
272    /* Find winner */
273    RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
274    Winner_ind = 0;
275    for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {
276        if( psDelDec[ k ].RD_Q10 < RDmin_Q10 ) {
277            RDmin_Q10 = psDelDec[ k ].RD_Q10;
278            Winner_ind = k;
279        }
280    }
281
282    /* Copy final part of signals from winner state to output and long-term filter states */
283    psDD = &psDelDec[ Winner_ind ];
284    psIndices->Seed = psDD->SeedInit;
285    last_smple_idx = smpl_buf_idx + decisionDelay;
286    Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );
287    for( i = 0; i < decisionDelay; i++ ) {
288        last_smple_idx = ( last_smple_idx - 1 ) & DECISION_DELAY_MASK;
289        pulses[   i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
290        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
291            silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );
292        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
293    }
294    silk_memcpy( NSQ->sLPC_Q14, &psDD->sLPC_Q14[ psEncC->subfr_length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
295    silk_memcpy( NSQ->sAR2_Q14, psDD->sAR2_Q14, sizeof( psDD->sAR2_Q14 ) );
296
297    /* Update states */
298    NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;
299    NSQ->lagPrev        = pitchL[ psEncC->nb_subfr - 1 ];
300
301    /* Save quantized speech signal */
302    /* DEBUG_STORE_DATA( enc.pcm, &NSQ->xq[psEncC->ltp_mem_length], psEncC->frame_length * sizeof( opus_int16 ) ) */
303    silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
304    silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
305    RESTORE_STACK;
306}
307
308/******************************************/
309/* Noise shape quantizer for one subframe */
310/******************************************/
311static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
312    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
313    NSQ_del_dec_struct  psDelDec[],             /* I/O  Delayed decision states             */
314    opus_int            signalType,             /* I    Signal type                         */
315    const opus_int32    x_Q10[],                /* I                                        */
316    opus_int8           pulses[],               /* O                                        */
317    opus_int16          xq[],                   /* O                                        */
318    opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
319    opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
320    const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
321    const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
322    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
323    opus_int            lag,                    /* I    Pitch lag                           */
324    opus_int32          HarmShapeFIRPacked_Q14, /* I                                        */
325    opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
326    opus_int32          LF_shp_Q14,             /* I                                        */
327    opus_int32          Gain_Q16,               /* I                                        */
328    opus_int            Lambda_Q10,             /* I                                        */
329    opus_int            offset_Q10,             /* I                                        */
330    opus_int            length,                 /* I    Input length                        */
331    opus_int            subfr,                  /* I    Subframe number                     */
332    opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
333    opus_int            predictLPCOrder,        /* I    Prediction filter order             */
334    opus_int            warping_Q16,            /* I                                        */
335    opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
336    opus_int            *smpl_buf_idx,          /* I    Index to newest samples in buffers  */
337    opus_int            decisionDelay           /* I                                        */
338)
339{
340    opus_int     i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
341    opus_int32   Winner_rand_state;
342    opus_int32   LTP_pred_Q14, LPC_pred_Q14, n_AR_Q14, n_LTP_Q14;
343    opus_int32   n_LF_Q14, r_Q10, rr_Q10, rd1_Q10, rd2_Q10, RDmin_Q10, RDmax_Q10;
344    opus_int32   q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
345    opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
346    opus_int32   *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
347    VARDECL( NSQ_sample_pair, psSampleState );
348    NSQ_del_dec_struct *psDD;
349    NSQ_sample_struct  *psSS;
350
351    __m128i a_Q12_0123, a_Q12_4567, a_Q12_89AB, a_Q12_CDEF;
352    __m128i b_Q12_0123, b_sr_Q12_0123;
353    SAVE_STACK;
354
355    silk_assert( nStatesDelayedDecision > 0 );
356    ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
357
358    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
359    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
360    Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
361
362    a_Q12_0123 = OP_CVTEPI16_EPI32_M64( a_Q12 );
363    a_Q12_4567 = OP_CVTEPI16_EPI32_M64( a_Q12 + 4 );
364
365    if( opus_likely( predictLPCOrder == 16 ) ) {
366        a_Q12_89AB = OP_CVTEPI16_EPI32_M64( a_Q12 + 8 );
367        a_Q12_CDEF = OP_CVTEPI16_EPI32_M64( a_Q12 + 12 );
368    }
369
370    if( signalType == TYPE_VOICED ){
371        b_Q12_0123 = OP_CVTEPI16_EPI32_M64( b_Q14 );
372        b_sr_Q12_0123 = _mm_shuffle_epi32( b_Q12_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
373    }
374    for( i = 0; i < length; i++ ) {
375        /* Perform common calculations used in all states */
376
377        /* Long-term prediction */
378        if( signalType == TYPE_VOICED ) {
379            /* Unrolled loop */
380            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
381            LTP_pred_Q14 = 2;
382            {
383                __m128i tmpa, tmpb, pred_lag_ptr_tmp;
384                pred_lag_ptr_tmp    = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
385                pred_lag_ptr_tmp    = _mm_shuffle_epi32( pred_lag_ptr_tmp, 0x1B );
386                tmpa                = _mm_mul_epi32( pred_lag_ptr_tmp, b_Q12_0123 );
387                tmpa                = _mm_srli_si128( tmpa, 2 );
388
389                pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) );/* equal shift right 4 bytes */
390                pred_lag_ptr_tmp    = _mm_mul_epi32( pred_lag_ptr_tmp, b_sr_Q12_0123 );
391                pred_lag_ptr_tmp    = _mm_srli_si128( pred_lag_ptr_tmp, 2 );
392                pred_lag_ptr_tmp    = _mm_add_epi32( pred_lag_ptr_tmp, tmpa );
393
394                tmpb = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 0, 3, 2 ) );/* equal shift right 8 bytes */
395                pred_lag_ptr_tmp    = _mm_add_epi32( pred_lag_ptr_tmp, tmpb );
396                LTP_pred_Q14        += _mm_cvtsi128_si32( pred_lag_ptr_tmp );
397
398                LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
399                LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 );                          /* Q13 -> Q14 */
400                pred_lag_ptr++;
401            }
402        } else {
403            LTP_pred_Q14 = 0;
404        }
405
406        /* Long-term shaping */
407        if( lag > 0 ) {
408            /* Symmetric, packed FIR coefficients */
409            n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
410            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
411            n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 );            /* Q12 -> Q14 */
412            shp_lag_ptr++;
413        } else {
414            n_LTP_Q14 = 0;
415        }
416        {
417            __m128i tmpa, tmpb, psLPC_Q14_tmp, a_Q12_tmp;
418
419            for( k = 0; k < nStatesDelayedDecision; k++ ) {
420                /* Delayed decision state */
421                psDD = &psDelDec[ k ];
422
423                /* Sample state */
424                psSS = psSampleState[ k ];
425
426                /* Generate dither */
427                psDD->Seed = silk_RAND( psDD->Seed );
428
429                /* Pointer used in short term prediction and shaping */
430                psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ];
431                /* Short-term prediction */
432                silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 );
433                /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
434                LPC_pred_Q14 = silk_RSHIFT( predictLPCOrder, 1 );
435
436                tmpb = _mm_setzero_si128();
437
438                /* step 1 */
439                psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */
440                psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );      /* 0, -1, -2, -3 */
441                tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_0123 );    /* 0, -1, -2, -3 * 0123 -> 0*0, 2*-2 */
442
443                tmpa            = _mm_srli_epi64( tmpa, 16 );
444                tmpb            = _mm_add_epi32( tmpb, tmpa );
445
446                psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
447                a_Q12_tmp = _mm_shuffle_epi32( a_Q12_0123, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
448                psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); /* 1*-1, 3*-3 */
449                psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
450                tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
451
452                /* step 2 */
453                psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -7 ] ) );
454                psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
455                tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_4567 );
456                tmpa            = _mm_srli_epi64( tmpa, 16 );
457                tmpb            = _mm_add_epi32( tmpb, tmpa );
458
459                psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
460                a_Q12_tmp = _mm_shuffle_epi32( a_Q12_4567, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
461                psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
462                psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
463                tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
464
465                if ( opus_likely( predictLPCOrder == 16 ) )
466                {
467                    /* step 3 */
468                    psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -11 ] ) );
469                    psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
470                    tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_89AB );
471                    tmpa            = _mm_srli_epi64( tmpa, 16 );
472                    tmpb            = _mm_add_epi32( tmpb, tmpa );
473
474                    psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
475                    a_Q12_tmp = _mm_shuffle_epi32( a_Q12_89AB, _MM_SHUFFLE(0, 3, 2, 1 ) );/* equal shift right 4 bytes */
476                    psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
477                    psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
478                    tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
479
480                    /* setp 4 */
481                    psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );
482                    psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
483                    tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
484                    tmpa            = _mm_srli_epi64( tmpa, 16 );
485                    tmpb            = _mm_add_epi32( tmpb, tmpa );
486
487                    psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
488                    a_Q12_tmp = _mm_shuffle_epi32( a_Q12_CDEF, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
489                    psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
490                    psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
491                    tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
492
493                    /* add at last */
494                    /* equal shift right 8 bytes*/
495                    tmpa            = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) );
496                    tmpb            = _mm_add_epi32( tmpb, tmpa );
497                    LPC_pred_Q14    += _mm_cvtsi128_si32( tmpb );
498                }
499                else
500                {
501                    /* add at last */
502                    tmpa            = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) ); /* equal shift right 8 bytes*/
503                    tmpb            = _mm_add_epi32( tmpb, tmpa );
504                    LPC_pred_Q14    += _mm_cvtsi128_si32( tmpb );
505
506                    LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -8 ], a_Q12[ 8 ] );
507                    LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -9 ], a_Q12[ 9 ] );
508                }
509
510                LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */
511
512                /* Noise shape feedback */
513                silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
514                /* Output of lowpass section */
515                tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );
516                /* Output of allpass section */
517                tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
518                psDD->sAR2_Q14[ 0 ] = tmp2;
519                n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 );
520                n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] );
521                /* Loop over allpass sections */
522                for( j = 2; j < shapingLPCOrder; j += 2 ) {
523                    /* Output of allpass section */
524                    tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], psDD->sAR2_Q14[ j + 0 ] - tmp1, warping_Q16 );
525                    psDD->sAR2_Q14[ j - 1 ] = tmp1;
526                    n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] );
527                    /* Output of allpass section */
528                    tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], psDD->sAR2_Q14[ j + 1 ] - tmp2, warping_Q16 );
529                    psDD->sAR2_Q14[ j + 0 ] = tmp2;
530                    n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] );
531                }
532                psDD->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1;
533                n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ shapingLPCOrder - 1 ] );
534
535                n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 1 );                                      /* Q11 -> Q12 */
536                n_AR_Q14 = silk_SMLAWB( n_AR_Q14, psDD->LF_AR_Q14, Tilt_Q14 );              /* Q12 */
537                n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 2 );                                      /* Q12 -> Q14 */
538
539                n_LF_Q14 = silk_SMULWB( psDD->Shape_Q14[ *smpl_buf_idx ], LF_shp_Q14 );     /* Q12 */
540                n_LF_Q14 = silk_SMLAWT( n_LF_Q14, psDD->LF_AR_Q14, LF_shp_Q14 );            /* Q12 */
541                n_LF_Q14 = silk_LSHIFT( n_LF_Q14, 2 );                                      /* Q12 -> Q14 */
542
543                /* Input minus prediction plus noise feedback                       */
544                /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
545                tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 );                                    /* Q14 */
546                tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 );                               /* Q13 */
547                tmp1 = silk_SUB32( tmp2, tmp1 );                                            /* Q13 */
548                tmp1 = silk_RSHIFT_ROUND( tmp1, 4 );                                        /* Q10 */
549
550                r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 );                                     /* residual error Q10 */
551
552                /* Flip sign depending on dither */
553                if ( psDD->Seed < 0 ) {
554                    r_Q10 = -r_Q10;
555                }
556                r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );
557
558                /* Find two quantization level candidates and measure their rate-distortion */
559                q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
560                q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
561                if( q1_Q0 > 0 ) {
562                    q1_Q10  = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
563                    q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );
564                    q2_Q10  = silk_ADD32( q1_Q10, 1024 );
565                    rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
566                    rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
567                } else if( q1_Q0 == 0 ) {
568                    q1_Q10  = offset_Q10;
569                    q2_Q10  = silk_ADD32( q1_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
570                    rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
571                    rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
572                } else if( q1_Q0 == -1 ) {
573                    q2_Q10  = offset_Q10;
574                    q1_Q10  = silk_SUB32( q2_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
575                    rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
576                    rd2_Q10 = silk_SMULBB(  q2_Q10, Lambda_Q10 );
577                } else {            /* q1_Q0 < -1 */
578                    q1_Q10  = silk_ADD32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
579                    q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );
580                    q2_Q10  = silk_ADD32( q1_Q10, 1024 );
581                    rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
582                    rd2_Q10 = silk_SMULBB( -q2_Q10, Lambda_Q10 );
583                }
584                rr_Q10  = silk_SUB32( r_Q10, q1_Q10 );
585                rd1_Q10 = silk_RSHIFT( silk_SMLABB( rd1_Q10, rr_Q10, rr_Q10 ), 10 );
586                rr_Q10  = silk_SUB32( r_Q10, q2_Q10 );
587                rd2_Q10 = silk_RSHIFT( silk_SMLABB( rd2_Q10, rr_Q10, rr_Q10 ), 10 );
588
589                if( rd1_Q10 < rd2_Q10 ) {
590                    psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
591                    psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
592                    psSS[ 0 ].Q_Q10  = q1_Q10;
593                    psSS[ 1 ].Q_Q10  = q2_Q10;
594                } else {
595                    psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
596                    psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
597                    psSS[ 0 ].Q_Q10  = q2_Q10;
598                    psSS[ 1 ].Q_Q10  = q1_Q10;
599                }
600
601                /* Update states for best quantization */
602
603                /* Quantized excitation */
604                exc_Q14 = silk_LSHIFT32( psSS[ 0 ].Q_Q10, 4 );
605                if ( psDD->Seed < 0 ) {
606                    exc_Q14 = -exc_Q14;
607                }
608
609                /* Add predictions */
610                LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
611                xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
612
613                /* Update states */
614                sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
615                psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
616                psSS[ 0 ].LF_AR_Q14    = sLF_AR_shp_Q14;
617                psSS[ 0 ].LPC_exc_Q14  = LPC_exc_Q14;
618                psSS[ 0 ].xq_Q14       = xq_Q14;
619
620                /* Update states for second best quantization */
621
622                /* Quantized excitation */
623                exc_Q14 = silk_LSHIFT32( psSS[ 1 ].Q_Q10, 4 );
624                if ( psDD->Seed < 0 ) {
625                    exc_Q14 = -exc_Q14;
626                }
627
628
629                /* Add predictions */
630                LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
631                xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
632
633                /* Update states */
634                sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
635                psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
636                psSS[ 1 ].LF_AR_Q14    = sLF_AR_shp_Q14;
637                psSS[ 1 ].LPC_exc_Q14  = LPC_exc_Q14;
638                psSS[ 1 ].xq_Q14       = xq_Q14;
639            }
640        }
641        *smpl_buf_idx  = ( *smpl_buf_idx - 1 ) & DECISION_DELAY_MASK;                   /* Index to newest samples              */
642        last_smple_idx = ( *smpl_buf_idx + decisionDelay ) & DECISION_DELAY_MASK;       /* Index to decisionDelay old samples   */
643
644        /* Find winner */
645        RDmin_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10;
646        Winner_ind = 0;
647        for( k = 1; k < nStatesDelayedDecision; k++ ) {
648            if( psSampleState[ k ][ 0 ].RD_Q10 < RDmin_Q10 ) {
649                RDmin_Q10  = psSampleState[ k ][ 0 ].RD_Q10;
650                Winner_ind = k;
651            }
652        }
653
654        /* Increase RD values of expired states */
655        Winner_rand_state = psDelDec[ Winner_ind ].RandState[ last_smple_idx ];
656        for( k = 0; k < nStatesDelayedDecision; k++ ) {
657            if( psDelDec[ k ].RandState[ last_smple_idx ] != Winner_rand_state ) {
658                psSampleState[ k ][ 0 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 0 ].RD_Q10, silk_int32_MAX >> 4 );
659                psSampleState[ k ][ 1 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 1 ].RD_Q10, silk_int32_MAX >> 4 );
660                silk_assert( psSampleState[ k ][ 0 ].RD_Q10 >= 0 );
661            }
662        }
663
664        /* Find worst in first set and best in second set */
665        RDmax_Q10  = psSampleState[ 0 ][ 0 ].RD_Q10;
666        RDmin_Q10  = psSampleState[ 0 ][ 1 ].RD_Q10;
667        RDmax_ind = 0;
668        RDmin_ind = 0;
669        for( k = 1; k < nStatesDelayedDecision; k++ ) {
670            /* find worst in first set */
671            if( psSampleState[ k ][ 0 ].RD_Q10 > RDmax_Q10 ) {
672                RDmax_Q10  = psSampleState[ k ][ 0 ].RD_Q10;
673                RDmax_ind = k;
674            }
675            /* find best in second set */
676            if( psSampleState[ k ][ 1 ].RD_Q10 < RDmin_Q10 ) {
677                RDmin_Q10  = psSampleState[ k ][ 1 ].RD_Q10;
678                RDmin_ind = k;
679            }
680        }
681
682        /* Replace a state if best from second set outperforms worst in first set */
683        if( RDmin_Q10 < RDmax_Q10 ) {
684            silk_memcpy( ( (opus_int32 *)&psDelDec[ RDmax_ind ] ) + i,
685                         ( (opus_int32 *)&psDelDec[ RDmin_ind ] ) + i, sizeof( NSQ_del_dec_struct ) - i * sizeof( opus_int32) );
686            silk_memcpy( &psSampleState[ RDmax_ind ][ 0 ], &psSampleState[ RDmin_ind ][ 1 ], sizeof( NSQ_sample_struct ) );
687        }
688
689        /* Write samples from winner to output and long-term filter states */
690        psDD = &psDelDec[ Winner_ind ];
691        if( subfr > 0 || i >= decisionDelay ) {
692            pulses[  i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
693            xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
694                silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
695            NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDD->Shape_Q14[ last_smple_idx ];
696            sLTP_Q15[          NSQ->sLTP_buf_idx     - decisionDelay ] = psDD->Pred_Q15[  last_smple_idx ];
697        }
698        NSQ->sLTP_shp_buf_idx++;
699        NSQ->sLTP_buf_idx++;
700
701        /* Update states */
702        for( k = 0; k < nStatesDelayedDecision; k++ ) {
703            psDD                                     = &psDelDec[ k ];
704            psSS                                     = &psSampleState[ k ][ 0 ];
705            psDD->LF_AR_Q14                          = psSS->LF_AR_Q14;
706            psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
707            psDD->Xq_Q14[    *smpl_buf_idx ]         = psSS->xq_Q14;
708            psDD->Q_Q10[     *smpl_buf_idx ]         = psSS->Q_Q10;
709            psDD->Pred_Q15[  *smpl_buf_idx ]         = silk_LSHIFT32( psSS->LPC_exc_Q14, 1 );
710            psDD->Shape_Q14[ *smpl_buf_idx ]         = psSS->sLTP_shp_Q14;
711            psDD->Seed                               = silk_ADD32_ovflw( psDD->Seed, silk_RSHIFT_ROUND( psSS->Q_Q10, 10 ) );
712            psDD->RandState[ *smpl_buf_idx ]         = psDD->Seed;
713            psDD->RD_Q10                             = psSS->RD_Q10;
714        }
715        delayedGain_Q10[     *smpl_buf_idx ]         = Gain_Q10;
716    }
717    /* Update LPC states */
718    for( k = 0; k < nStatesDelayedDecision; k++ ) {
719        psDD = &psDelDec[ k ];
720        silk_memcpy( psDD->sLPC_Q14, &psDD->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
721    }
722    RESTORE_STACK;
723}
724
725static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
726    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
727    silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
728    NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
729    const opus_int32    x_Q3[],                     /* I    Input in Q3                         */
730    opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
731    const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
732    opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
733    opus_int            subfr,                      /* I    Subframe number                     */
734    opus_int            nStatesDelayedDecision,     /* I    Number of del dec states            */
735    const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
736    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I                                        */
737    const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
738    const opus_int      signal_type,                /* I    Signal type                         */
739    const opus_int      decisionDelay               /* I    Decision delay                      */
740)
741{
742    opus_int            i, k, lag;
743    opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
744    NSQ_del_dec_struct  *psDD;
745    __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
746
747    lag          = pitchL[ subfr ];
748    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
749
750    silk_assert( inv_gain_Q31 != 0 );
751
752    /* Calculate gain adjustment factor */
753    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
754        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
755    } else {
756        gain_adj_Q16 = (opus_int32)1 << 16;
757    }
758
759    /* Scale input */
760    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
761
762    /* prepare inv_gain_Q23 in packed 4 32-bits */
763    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
764
765    for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
766        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
767        /* equal shift right 4 bytes*/
768        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
769
770        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
771        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
772
773        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
774        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
775
776        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
777
778        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 );
779    }
780
781    for( ; i < psEncC->subfr_length; i++ ) {
782        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
783    }
784
785    /* Save inverse gain */
786    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
787
788    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
789    if( NSQ->rewhite_flag ) {
790        if( subfr == 0 ) {
791            /* Do LTP downscaling */
792            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
793        }
794        for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
795            silk_assert( i < MAX_FRAME_LENGTH );
796            sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
797        }
798    }
799
800    /* Adjust for changing gain */
801    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
802        /* Scale long-term shaping state */
803        {
804            __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
805
806            /* prepare gain_adj_Q16 in packed 4 32-bits */
807            xmm_gain_adj_Q16 = _mm_set1_epi32( gain_adj_Q16 );
808
809            for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
810            {
811                xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
812                /* equal shift right 4 bytes*/
813                xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
814
815                xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
816                xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );
817
818                xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 );
819                xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );
820
821                xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );
822
823                _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
824            }
825
826            for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
827                NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
828            }
829
830            /* Scale long-term prediction state */
831            if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
832                for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
833                    sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
834                }
835            }
836
837            for( k = 0; k < nStatesDelayedDecision; k++ ) {
838                psDD = &psDelDec[ k ];
839
840                /* Scale scalar states */
841                psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );
842
843                /* Scale short-term prediction and shaping states */
844                for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
845                    psDD->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sLPC_Q14[ i ] );
846                }
847                for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
848                    psDD->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sAR2_Q14[ i ] );
849                }
850                for( i = 0; i < DECISION_DELAY; i++ ) {
851                    psDD->Pred_Q15[  i ] = silk_SMULWW( gain_adj_Q16, psDD->Pred_Q15[  i ] );
852                    psDD->Shape_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Shape_Q14[ i ] );
853                }
854            }
855        }
856    }
857}
858