1/* Copyright (c) 2014, Cisco Systems, INC
2   Written by XiangMingZhu WeiZhou MinPeng YanWang
3
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions
6   are met:
7
8   - Redistributions of source code must retain the above copyright
9   notice, this list of conditions and the following disclaimer.
10
11   - Redistributions in binary form must reproduce the above copyright
12   notice, this list of conditions and the following disclaimer in the
13   documentation and/or other materials provided with the distribution.
14
15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include <xmmintrin.h>
33#include <emmintrin.h>
34#include <smmintrin.h>
35#include "main.h"
36#include "celt/x86/x86cpu.h"
37#include "stack_alloc.h"
38
/* Forward declaration: scale the input signal and the LTP/shaping states with
   1/gain for one subframe (SSE4.1 vectorized). Definition is at the bottom of
   this file. */
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
    const silk_encoder_state *psEncC,           /* I    Encoder State                   */
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
    const opus_int32    x_Q3[],                 /* I    input in Q3                     */
    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain        */
    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0     */
    opus_int32          sLTP_Q15[],             /* O    LTP state matching scaled input */
    opus_int            subfr,                  /* I    subframe number                 */
    const opus_int      LTP_scale_Q14,          /* I                                    */
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                 */
    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag                       */
    const opus_int      signal_type             /* I    Signal type                     */
);
52
/* Forward declaration: SSE4.1 noise-shaping quantizer kernel specialized for
   shaping LPC order 10 and prediction LPC order 16. `table` points at the
   row of quantization-level candidates for pulse level 0 (see table build in
   silk_NSQ_sse4_1 below). Definition follows silk_NSQ_sse4_1. */
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
    opus_int            signalType,             /* I    Signal type                     */
    const opus_int32    x_sc_Q10[],             /* I                                    */
    opus_int8           pulses[],               /* O                                    */
    opus_int16          xq[],                   /* O                                    */
    opus_int32          sLTP_Q15[],             /* I/O  LTP state                       */
    const opus_int16    a_Q12[],                /* I    Short term prediction coefs     */
    const opus_int16    b_Q14[],                /* I    Long term prediction coefs      */
    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping AR coefs          */
    opus_int            lag,                    /* I    Pitch lag                       */
    opus_int32          HarmShapeFIRPacked_Q14, /* I                                    */
    opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
    opus_int32          LF_shp_Q14,             /* I                                    */
    opus_int32          Gain_Q16,               /* I                                    */
    opus_int            offset_Q10,             /* I                                    */
    opus_int            length,                 /* I    Input length                    */
    opus_int32          table[][4]              /* I                                    */
);
72
/* SSE4.1 entry point for the SILK noise-shaping quantizer (NSQ).
   Precomputes a 64-row lookup table of quantization-level candidate pairs and
   their rate/distortion decision terms, then loops over the subframes:
   selects the LPC/LTP/shaping coefficient sets, optionally re-whitens the
   LTP state, rescales the states with 1/gain, and quantizes each subframe
   with either the specialized order-10/16 SSE4.1 kernel or the generic C
   quantizer. Finally shifts the xq and shaping buffers for the next frame. */
void silk_NSQ_sse4_1(
    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
)
{
    opus_int            k, lag, start_idx, LSF_interpolation_flag;
    const opus_int16    *A_Q12, *B_Q14, *AR_shp_Q13;
    opus_int16          *pxq;
    VARDECL( opus_int32, sLTP_Q15 );
    VARDECL( opus_int16, sLTP );
    opus_int32          HarmShapeFIRPacked_Q14;
    opus_int            offset_Q10;
    VARDECL( opus_int32, x_sc_Q10 );

    /* Candidate table: row 32+k (k in [-32,31]) holds, for pulse level k:
       [0] q1_Q10  -- lower candidate level
       [1] q2_Q10  -- upper candidate level (q1 + 1024, i.e. next pulse)
       [2] 2*(q1_Q10 - q2_Q10)
       [3] (rd1_Q20 - rd2_Q20) + (q1_Q10^2 - q2_Q10^2)
       The kernel picks between q1 and q2 with the single compare
       r_Q10*table[.][2] - table[.][3] < 0 (see the quantizer below). */
    opus_int32   table[ 64 ][ 4 ];
    opus_int32   tmp1;
    opus_int32   q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;

    SAVE_STACK;

    NSQ->rand_seed = psIndices->Seed;

    /* Set unvoiced lag to the previous one, overwrite later for voiced */
    lag = NSQ->lagPrev;

    silk_assert( NSQ->prev_gain_Q16 != 0 );

    offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];

    /* 0 */
    q1_Q10  = offset_Q10;
    q2_Q10  = offset_Q10 + ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
    rd1_Q20 = q1_Q10 * Lambda_Q10;
    rd2_Q20 = q2_Q10 * Lambda_Q10;

    table[ 32 ][ 0 ] = q1_Q10;
    table[ 32 ][ 1 ] = q2_Q10;
    table[ 32 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
    table[ 32 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);

    /* -1 */
    q1_Q10  = offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
    q2_Q10  = offset_Q10;
    rd1_Q20 = - q1_Q10 * Lambda_Q10;
    rd2_Q20 = q2_Q10 * Lambda_Q10;

    table[ 31 ][ 0 ] = q1_Q10;
    table[ 31 ][ 1 ] = q2_Q10;
    table[ 31 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
    table[ 31 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);

    /* > 0 */
    for (k = 1; k <= 31; k++)
    {
        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );

        q1_Q10  = tmp1 - QUANT_LEVEL_ADJUST_Q10;
        q2_Q10  = tmp1 - QUANT_LEVEL_ADJUST_Q10 + 1024;
        rd1_Q20 = q1_Q10 * Lambda_Q10;
        rd2_Q20 = q2_Q10 * Lambda_Q10;

        table[ 32 + k ][ 0 ] = q1_Q10;
        table[ 32 + k ][ 1 ] = q2_Q10;
        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
    }

    /* < -1 */
    /* Negative levels: the rate term uses -q*Lambda (rate cost grows with |q|) */
    for (k = -32; k <= -2; k++)
    {
        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );

        q1_Q10  = tmp1 + QUANT_LEVEL_ADJUST_Q10;
        q2_Q10  = tmp1 + QUANT_LEVEL_ADJUST_Q10 + 1024;
        rd1_Q20 = - q1_Q10 * Lambda_Q10;
        rd2_Q20 = - q2_Q10 * Lambda_Q10;

        table[ 32 + k ][ 0 ] = q1_Q10;
        table[ 32 + k ][ 1 ] = q2_Q10;
        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
    }

    /* NLSFInterpCoef_Q2 == 4 means no LSF interpolation for this frame */
    if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
        LSF_interpolation_flag = 0;
    } else {
        LSF_interpolation_flag = 1;
    }

    ALLOC( sLTP_Q15,
           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
    ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
    ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
    /* Set up pointers to start of sub frame */
    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
    NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
    pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];
    for( k = 0; k < psEncC->nb_subfr; k++ ) {
        /* First half of the frame uses the interpolated coefficient set (index 0)
           when interpolating, otherwise always the end-of-frame set (index 1) */
        A_Q12      = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
        B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER ];
        AR_shp_Q13 = &AR2_Q13[     k * MAX_SHAPE_LPC_ORDER ];

        /* Noise shape parameters */
        silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
        /* Pack half the harmonic gain in the low 16 bits (for the outer FIR taps)
           and half in the high 16 bits (for the center tap) */
        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );

        NSQ->rewhite_flag = 0;
        if( psIndices->signalType == TYPE_VOICED ) {
            /* Voiced */
            lag = pitchL[ k ];

            /* Re-whitening */
            /* With interpolation: re-whiten at the start of each half-frame;
               without: only at the start of the frame */
            if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
                /* Rewhiten with new A coefs */
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
                silk_assert( start_idx > 0 );

                silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );

                NSQ->rewhite_flag = 1;
                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
            }
        }

        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );

        if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )
        {
            /* Pass a pointer to the level-0 row so the kernel can index the
               table directly with signed pulse levels */
            silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ],
                offset_Q10, psEncC->subfr_length, &(table[32]) );
        }
        else
        {
            /* Generic C fallback for other LPC orders */
            silk_noise_shape_quantizer( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
                offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
        }

        x_Q3   += psEncC->subfr_length;
        pulses += psEncC->subfr_length;
        pxq    += psEncC->subfr_length;
    }

    /* Update lagPrev for next frame */
    NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];

    /* Save quantized speech and noise shaping signals */
    silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
    silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
    RESTORE_STACK;
}
240
241/***********************************/
242/* silk_noise_shape_quantizer_10_16  */
243/***********************************/
244static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
245    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
246    opus_int            signalType,             /* I    Signal type                     */
247    const opus_int32    x_sc_Q10[],             /* I                                    */
248    opus_int8           pulses[],               /* O                                    */
249    opus_int16          xq[],                   /* O                                    */
250    opus_int32          sLTP_Q15[],             /* I/O  LTP state                       */
251    const opus_int16    a_Q12[],                /* I    Short term prediction coefs     */
252    const opus_int16    b_Q14[],                /* I    Long term prediction coefs      */
253    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping AR coefs          */
254    opus_int            lag,                    /* I    Pitch lag                       */
255    opus_int32          HarmShapeFIRPacked_Q14, /* I                                    */
256    opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
257    opus_int32          LF_shp_Q14,             /* I                                    */
258    opus_int32          Gain_Q16,               /* I                                    */
259    opus_int            offset_Q10,             /* I                                    */
260    opus_int            length,                 /* I    Input length                    */
261    opus_int32          table[][4]              /* I                                    */
262)
263{
264    opus_int     i;
265    opus_int32   LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
266    opus_int32   n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
267    opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
268    opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
269    opus_int32   *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;
270
271    __m128i xmm_tempa, xmm_tempb;
272
273    __m128i xmm_one;
274
275    __m128i psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF;
276    __m128i psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF;
277    __m128i a_Q12_01234567,        a_Q12_89ABCDEF;
278
279    __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
280    __m128i AR_shp_Q13_76543210;
281
282    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
283    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
284    Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
285
286    /* Set up short term AR state */
287    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 ];
288
289    sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
290    xq_Q14         = psLPC_Q14[ 0 ];
291    LTP_pred_Q13   = 0;
292
293    /* load a_Q12 */
294    xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 );
295
296    /* load a_Q12[0] - a_Q12[7] */
297    a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(&a_Q12[ 0 ] ) );
298    /* load a_Q12[ 8 ] - a_Q12[ 15 ] */
299    a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(&a_Q12[ 8 ] ) );
300
301    a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one );
302    a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one );
303
304    /* load AR_shp_Q13 */
305    AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(&AR_shp_Q13[0] ) );
306
307    /* load psLPC_Q14 */
308    xmm_one = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 );
309
310    xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-16]) );
311    xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-12]) );
312
313    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
314    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
315
316    psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
317    psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
318
319    xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -8 ]) );
320    xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -4 ]) );
321
322    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
323    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
324
325    psLPC_Q14_hi_01234567 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
326    psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
327
328    /* load sAR2_Q14 */
329    xmm_tempa = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 0 ]) ) );
330    xmm_tempb = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 4 ]) ) );
331
332    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
333    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
334
335    sAR2_Q14_hi_76543210 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
336    sAR2_Q14_lo_76543210 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
337
338    /* prepare 1 in 8 * 16bit */
339    xmm_one = _mm_set1_epi16(1);
340
341    for( i = 0; i < length; i++ )
342    {
343        /* Short-term prediction */
344        __m128i xmm_hi_07, xmm_hi_8F, xmm_lo_07, xmm_lo_8F;
345
346        /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
347        LPC_pred_Q10 = 8; /* silk_RSHIFT( predictLPCOrder, 1 ); */
348
349        /* shift psLPC_Q14 */
350        psLPC_Q14_hi_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF, 2 );
351        psLPC_Q14_lo_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF, 2 );
352
353        psLPC_Q14_hi_01234567 = _mm_srli_si128( psLPC_Q14_hi_01234567, 2 );
354        psLPC_Q14_lo_01234567 = _mm_srli_si128( psLPC_Q14_lo_01234567, 2 );
355
356        psLPC_Q14_hi_01234567 = _mm_insert_epi16( psLPC_Q14_hi_01234567, (xq_Q14 >> 16), 7 );
357        psLPC_Q14_lo_01234567 = _mm_insert_epi16( psLPC_Q14_lo_01234567, (xq_Q14),       7 );
358
359        /* high part, use pmaddwd, results in 4 32-bit */
360        xmm_hi_07 = _mm_madd_epi16( psLPC_Q14_hi_01234567, a_Q12_01234567 );
361        xmm_hi_8F = _mm_madd_epi16( psLPC_Q14_hi_89ABCDEF, a_Q12_89ABCDEF );
362
363        /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed, _mm_srai_epi16(psLPC_Q14_lo_01234567, 15) */
364        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_01234567 );
365        xmm_tempb = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_89ABCDEF );
366
367        xmm_tempa = _mm_and_si128( xmm_tempa, a_Q12_01234567 );
368        xmm_tempb = _mm_and_si128( xmm_tempb, a_Q12_89ABCDEF );
369
370        xmm_lo_07 = _mm_mulhi_epi16( psLPC_Q14_lo_01234567, a_Q12_01234567 );
371        xmm_lo_8F = _mm_mulhi_epi16( psLPC_Q14_lo_89ABCDEF, a_Q12_89ABCDEF );
372
373        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
374        xmm_lo_8F = _mm_add_epi16( xmm_lo_8F, xmm_tempb );
375
376        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
377        xmm_lo_8F = _mm_madd_epi16( xmm_lo_8F, xmm_one );
378
379        /* accumulate */
380        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_hi_8F );
381        xmm_lo_07 = _mm_add_epi32( xmm_lo_07, xmm_lo_8F );
382
383        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );
384
385        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
386        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );
387
388        LPC_pred_Q10 += _mm_cvtsi128_si32( xmm_hi_07 );
389
390        /* Long-term prediction */
391        if ( opus_likely( signalType == TYPE_VOICED ) ) {
392            /* Unrolled loop */
393            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
394            LTP_pred_Q13 = 2;
395            {
396                __m128i b_Q14_3210, b_Q14_0123, pred_lag_ptr_0123;
397
398                b_Q14_3210 = OP_CVTEPI16_EPI32_M64( b_Q14 );
399                b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B );
400
401                /* loaded: [0] [-1] [-2] [-3] */
402                pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
403                /* shuffle to [-3] [-2] [-1] [0] and to new xmm */
404                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B );
405                /*64-bit multiply, a[2] * b[-2], a[0] * b[0] */
406                xmm_tempa = _mm_mul_epi32( xmm_tempa, b_Q14_3210 );
407                /* right shift 2 bytes (16 bits), zero extended */
408                xmm_tempa = _mm_srli_si128( xmm_tempa, 2 );
409
410                /* a[1] * b[-1], a[3] * b[-3] */
411                pred_lag_ptr_0123 = _mm_mul_epi32( pred_lag_ptr_0123, b_Q14_0123 );
412                pred_lag_ptr_0123 = _mm_srli_si128( pred_lag_ptr_0123, 2 );
413
414                pred_lag_ptr_0123 = _mm_add_epi32( pred_lag_ptr_0123, xmm_tempa );
415                /* equal shift right 8 bytes*/
416                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, _MM_SHUFFLE( 0, 0, 3, 2 ) );
417                xmm_tempa = _mm_add_epi32( xmm_tempa, pred_lag_ptr_0123 );
418
419                LTP_pred_Q13 += _mm_cvtsi128_si32( xmm_tempa );
420
421                LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
422                pred_lag_ptr++;
423            }
424        }
425
426        /* Noise shape feedback */
427        NSQ->sAR2_Q14[ 9 ] = NSQ->sAR2_Q14[ 8 ];
428        NSQ->sAR2_Q14[ 8 ] = _mm_cvtsi128_si32( _mm_srli_si128(_mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ), 12 ) );
429
430        sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
431        sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );
432
433        sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 );
434        sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14),       0 );
435
436        /* high part, use pmaddwd, results in 4 32-bit */
437        xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );
438
439        /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed,_mm_srai_epi16(sAR2_Q14_lo_76543210, 15) */
440        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), sAR2_Q14_lo_76543210 );
441        xmm_tempa = _mm_and_si128( xmm_tempa, AR_shp_Q13_76543210 );
442
443        xmm_lo_07 = _mm_mulhi_epi16( sAR2_Q14_lo_76543210, AR_shp_Q13_76543210 );
444        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
445
446        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
447
448        /* accumulate */
449        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );
450
451        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
452        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );
453
454        n_AR_Q12 = 5 + _mm_cvtsi128_si32( xmm_hi_07 );
455
456        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 8 ], AR_shp_Q13[ 8 ] );
457        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 9 ], AR_shp_Q13[ 9 ] );
458
459        n_AR_Q12 = silk_LSHIFT32( n_AR_Q12, 1 );                                /* Q11 -> Q12 */
460        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, sLF_AR_shp_Q14, Tilt_Q14 );
461
462        n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
463        n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );
464
465        silk_assert( lag > 0 || signalType != TYPE_VOICED );
466
467        /* Combine prediction and noise shaping signals */
468        tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 );        /* Q12 */
469        tmp1 = silk_SUB32( tmp1, n_LF_Q12 );                                    /* Q12 */
470        if( lag > 0 ) {
471            /* Symmetric, packed FIR coefficients */
472            n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
473            n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
474            n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
475            shp_lag_ptr++;
476
477            tmp2 = silk_SUB32( LTP_pred_Q13, n_LTP_Q13 );                       /* Q13 */
478            tmp1 = silk_ADD_LSHIFT32( tmp2, tmp1, 1 );                          /* Q13 */
479            tmp1 = silk_RSHIFT_ROUND( tmp1, 3 );                                /* Q10 */
480        } else {
481            tmp1 = silk_RSHIFT_ROUND( tmp1, 2 );                                /* Q10 */
482        }
483
484        r_Q10 = silk_SUB32( x_sc_Q10[ i ], tmp1 );                              /* residual error Q10 */
485
486        /* Generate dither */
487        NSQ->rand_seed = silk_RAND( NSQ->rand_seed );
488
489        /* Flip sign depending on dither */
490        tmp2 = -r_Q10;
491        if ( NSQ->rand_seed < 0 ) r_Q10 = tmp2;
492
493        r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );
494
495        /* Find two quantization level candidates and measure their rate-distortion */
496        q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
497        q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
498
499        q1_Q10 = table[q1_Q0][0];
500        q2_Q10 = table[q1_Q0][1];
501
502        if (r_Q10 * table[q1_Q0][2] - table[q1_Q0][3] < 0)
503        {
504            q1_Q10 = q2_Q10;
505        }
506
507        pulses[ i ] = (opus_int8)silk_RSHIFT_ROUND( q1_Q10, 10 );
508
509        /* Excitation */
510        exc_Q14 = silk_LSHIFT( q1_Q10, 4 );
511
512        tmp2 = -exc_Q14;
513        if ( NSQ->rand_seed < 0 ) exc_Q14 = tmp2;
514
515        /* Add predictions */
516        LPC_exc_Q14 = silk_ADD_LSHIFT32( exc_Q14, LTP_pred_Q13, 1 );
517        xq_Q14      = silk_ADD_LSHIFT32( LPC_exc_Q14, LPC_pred_Q10, 4 );
518
519        /* Update states */
520        psLPC_Q14++;
521        *psLPC_Q14 = xq_Q14;
522        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 );
523
524        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
525        sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
526        NSQ->sLTP_shp_buf_idx++;
527        NSQ->sLTP_buf_idx++;
528
529        /* Make dither dependent on quantized signal */
530        NSQ->rand_seed = silk_ADD32_ovflw( NSQ->rand_seed, pulses[ i ] );
531    }
532
533    NSQ->sLF_AR_shp_Q14 = sLF_AR_shp_Q14;
534
535    /* Scale XQ back to normal level before saving */
536    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH ];
537
538    /* write back sAR2_Q14 */
539    xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
540    xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
541    _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa );
542    _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb );
543
544    /* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */
545    {
546        __m128i xmm_Gain_Q10;
547        __m128i xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, xmm_xq_Q14_7654, xmm_xq_Q14_x7x5;
548
549        /* prepare (1 << 7) in packed 4 32-bits */
550        xmm_tempa = _mm_set1_epi32( (1 << 7) );
551
552        /* prepare Gain_Q10 in packed 4 32-bits */
553        xmm_Gain_Q10 = _mm_set1_epi32( Gain_Q10 );
554
555        /* process xq */
556        for (i = 0; i < length - 7; i += 8)
557        {
558            xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 0 ] ) ) );
559            xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 4 ] ) ) );
560
561            /* equal shift right 4 bytes*/
562            xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
563            /* equal shift right 4 bytes*/
564            xmm_xq_Q14_x7x5 = _mm_shuffle_epi32( xmm_xq_Q14_7654, _MM_SHUFFLE( 0, 3, 2, 1 ) );
565
566            xmm_xq_Q14_3210 = _mm_mul_epi32( xmm_xq_Q14_3210, xmm_Gain_Q10 );
567            xmm_xq_Q14_x3x1 = _mm_mul_epi32( xmm_xq_Q14_x3x1, xmm_Gain_Q10 );
568            xmm_xq_Q14_7654 = _mm_mul_epi32( xmm_xq_Q14_7654, xmm_Gain_Q10 );
569            xmm_xq_Q14_x7x5 = _mm_mul_epi32( xmm_xq_Q14_x7x5, xmm_Gain_Q10 );
570
571            xmm_xq_Q14_3210 = _mm_srli_epi64( xmm_xq_Q14_3210, 16 );
572            xmm_xq_Q14_x3x1 = _mm_slli_epi64( xmm_xq_Q14_x3x1, 16 );
573            xmm_xq_Q14_7654 = _mm_srli_epi64( xmm_xq_Q14_7654, 16 );
574            xmm_xq_Q14_x7x5 = _mm_slli_epi64( xmm_xq_Q14_x7x5, 16 );
575
576            xmm_xq_Q14_3210 = _mm_blend_epi16( xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, 0xCC );
577            xmm_xq_Q14_7654 = _mm_blend_epi16( xmm_xq_Q14_7654, xmm_xq_Q14_x7x5, 0xCC );
578
579            /* silk_RSHIFT_ROUND(xq, 8) */
580            xmm_xq_Q14_3210 = _mm_add_epi32( xmm_xq_Q14_3210, xmm_tempa );
581            xmm_xq_Q14_7654 = _mm_add_epi32( xmm_xq_Q14_7654, xmm_tempa );
582
583            xmm_xq_Q14_3210 = _mm_srai_epi32( xmm_xq_Q14_3210, 8 );
584            xmm_xq_Q14_7654 = _mm_srai_epi32( xmm_xq_Q14_7654, 8 );
585
586            /* silk_SAT16 */
587            xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 );
588
589            /* save to xq */
590            _mm_storeu_si128( (__m128i *)(&xq[ i ] ), xmm_xq_Q14_3210 );
591        }
592    }
593    for ( ; i < length; i++)
594    {
595        xq[i] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) );
596    }
597
598    /* Update LPC synth buffer */
599    silk_memcpy( NSQ->sLPC_Q14, &NSQ->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
600}
601
/* Scale the input and the NSQ states for one subframe (SSE4.1).
   - Scales x_Q3 by 1/gain into x_sc_Q10.
   - If the LTP state was just re-whitened, scales sLTP (Q0) into sLTP_Q15.
   - If the gain changed relative to the previous subframe, rescales all
     shaping/prediction state buffers by prev_gain/gain.
   The vector loops emulate silk_SMULWW with two _mm_mul_epi32 passes over
   even and odd 32-bit lanes, followed by a 16-bit shift and blend. */
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
    const silk_encoder_state *psEncC,           /* I    Encoder State                   */
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
    const opus_int32    x_Q3[],                 /* I    input in Q3                     */
    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain        */
    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0     */
    opus_int32          sLTP_Q15[],             /* O    LTP state matching scaled input */
    opus_int            subfr,                  /* I    subframe number                 */
    const opus_int      LTP_scale_Q14,          /* I                                    */
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                 */
    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag                       */
    const opus_int      signal_type             /* I    Signal type                     */
)
{
    opus_int   i, lag;
    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
    __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;

    lag          = pitchL[ subfr ];
    /* 1/gain in Q47-16 = Q31; gain clamped to >= 1 to avoid division by zero */
    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
    silk_assert( inv_gain_Q31 != 0 );

    /* Calculate gain adjustment factor */
    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
    } else {
        gain_adj_Q16 = (opus_int32)1 << 16;
    }

    /* Scale input */
    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );

    /* prepare inv_gain_Q23 in packed 4 32-bits */
    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);

    /* 4 samples per iteration; scalar tail below handles the remainder */
    for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );

        /* equal shift right 4 bytes*/
        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );

        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );

        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );

        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 );
    }

    for( ; i < psEncC->subfr_length; i++ ) {
        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
    }

    /* Save inverse gain */
    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];

    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
    if( NSQ->rewhite_flag ) {
        if( subfr == 0 ) {
            /* Do LTP downscaling */
            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
        }
        for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
            silk_assert( i < MAX_FRAME_LENGTH );
            sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
        }
    }

    /* Adjust for changing gain */
    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
        /* Scale long-term shaping state */
        __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;

        /* prepare gain_adj_Q16 in packed 4 32-bits */
        xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16);

        for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
        {
            xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
            /* equal shift right 4 bytes*/
            xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

            xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
            xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );

            xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 );
            xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );

            xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );

            _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
        }

        for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
            NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
        }

        /* Scale long-term prediction state */
        /* Skip if just re-whitened: sLTP_Q15 was already built at the new gain */
        if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
            for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
                sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
            }
        }

        NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );

        /* Scale short-term prediction and shaping states */
        for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
            NSQ->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLPC_Q14[ i ] );
        }
        for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
            NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
        }
    }
}
720