1/* Copyright (c) 2014, Cisco Systems, INC
2   Written by XiangMingZhu WeiZhou MinPeng YanWang
3
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions
6   are met:
7
8   - Redistributions of source code must retain the above copyright
9   notice, this list of conditions and the following disclaimer.
10
11   - Redistributions in binary form must reproduce the above copyright
12   notice, this list of conditions and the following disclaimer in the
13   documentation and/or other materials provided with the distribution.
14
15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include <xmmintrin.h>
33#include <emmintrin.h>
34#include <smmintrin.h>
35#include "main.h"
36#include "celt/x86/x86cpu.h"
37#include "stack_alloc.h"
38
/* Forward declaration; the definition appears further down in this file.               */
/* Prepares one subframe for quantization: writes the gain-normalized input to          */
/* x_sc_Q10 and, after re-whitening, fills sLTP_Q15 from sLTP scaled by the inverse     */
/* gain. It also records Gains_Q16[ subfr ] in NSQ->prev_gain_Q16.                      */
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
    const silk_encoder_state *psEncC,           /* I    Encoder State                   */
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
    const opus_int32    x_Q3[],                 /* I    input in Q3                     */
    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain        */
    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0     */
    opus_int32          sLTP_Q15[],             /* O    LTP state matching scaled input */
    opus_int            subfr,                  /* I    subframe number                 */
    const opus_int      LTP_scale_Q14,          /* I    LTP state scaling               */
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I  Quantization step sizes        */
    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag                       */
    const opus_int      signal_type             /* I    Signal type                     */
);
52
/* Forward declaration; the definition appears further down in this file.               */
/* Quantizes one subframe with a kernel specialized for 16 short-term prediction        */
/* coefficients and 10 noise shaping coefficients. The two quantization candidates and  */
/* their decision terms are read from 'table', which is indexed with a signed level     */
/* (the caller in this file passes the address of row 32 of a 64-row table).            */
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
    opus_int            signalType,             /* I    Signal type                     */
    const opus_int32    x_sc_Q10[],             /* I    Input scaled with 1/Gain        */
    opus_int8           pulses[],               /* O    Quantized pulse signal          */
    opus_int16          xq[],                   /* O    Quantized signal, gain restored */
    opus_int32          sLTP_Q15[],             /* I/O  LTP state                       */
    const opus_int16    a_Q12[],                /* I    Short term prediction coefs     */
    const opus_int16    b_Q14[],                /* I    Long term prediction coefs      */
    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping AR coefs          */
    opus_int            lag,                    /* I    Pitch lag                       */
    opus_int32          HarmShapeFIRPacked_Q14, /* I    Packed harmonic shaping FIR coefs */
    opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
    opus_int32          LF_shp_Q14,             /* I    Low frequency shaping coefs     */
    opus_int32          Gain_Q16,               /* I    Quantization step size          */
    opus_int            offset_Q10,             /* I    Quantization offset             */
    opus_int            length,                 /* I    Input length                    */
    opus_int32          table[][4]              /* I    Per-level candidate/decision table */
);
72
73void silk_NSQ_sse4_1(
74    const silk_encoder_state    *psEncC,                                    /* I/O  Encoder State                   */
75    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
76    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
77    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
78    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
79    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
80    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
81    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
82    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
83    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
84    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
85    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
86    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
87    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
88    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
89)
90{
91    opus_int            k, lag, start_idx, LSF_interpolation_flag;
92    const opus_int16    *A_Q12, *B_Q14, *AR_shp_Q13;
93    opus_int16          *pxq;
94    VARDECL( opus_int32, sLTP_Q15 );
95    VARDECL( opus_int16, sLTP );
96    opus_int32          HarmShapeFIRPacked_Q14;
97    opus_int            offset_Q10;
98    VARDECL( opus_int32, x_sc_Q10 );
99
100    opus_int32   table[ 64 ][ 4 ];
101    opus_int32   tmp1;
102    opus_int32   q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;
103
104    SAVE_STACK;
105
106    NSQ->rand_seed = psIndices->Seed;
107
108    /* Set unvoiced lag to the previous one, overwrite later for voiced */
109    lag = NSQ->lagPrev;
110
111    silk_assert( NSQ->prev_gain_Q16 != 0 );
112
113    offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
114
115    /* 0 */
116    q1_Q10  = offset_Q10;
117    q2_Q10  = offset_Q10 + ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
118    rd1_Q20 = q1_Q10 * Lambda_Q10;
119    rd2_Q20 = q2_Q10 * Lambda_Q10;
120
121    table[ 32 ][ 0 ] = q1_Q10;
122    table[ 32 ][ 1 ] = q2_Q10;
123    table[ 32 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
124    table[ 32 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
125
126    /* -1 */
127    q1_Q10  = offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
128    q2_Q10  = offset_Q10;
129    rd1_Q20 = - q1_Q10 * Lambda_Q10;
130    rd2_Q20 = q2_Q10 * Lambda_Q10;
131
132    table[ 31 ][ 0 ] = q1_Q10;
133    table[ 31 ][ 1 ] = q2_Q10;
134    table[ 31 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
135    table[ 31 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
136
137    /* > 0 */
138    for (k = 1; k <= 31; k++)
139    {
140        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );
141
142        q1_Q10  = tmp1 - QUANT_LEVEL_ADJUST_Q10;
143        q2_Q10  = tmp1 - QUANT_LEVEL_ADJUST_Q10 + 1024;
144        rd1_Q20 = q1_Q10 * Lambda_Q10;
145        rd2_Q20 = q2_Q10 * Lambda_Q10;
146
147        table[ 32 + k ][ 0 ] = q1_Q10;
148        table[ 32 + k ][ 1 ] = q2_Q10;
149        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
150        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
151    }
152
153    /* < -1 */
154    for (k = -32; k <= -2; k++)
155    {
156        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );
157
158        q1_Q10  = tmp1 + QUANT_LEVEL_ADJUST_Q10;
159        q2_Q10  = tmp1 + QUANT_LEVEL_ADJUST_Q10 + 1024;
160        rd1_Q20 = - q1_Q10 * Lambda_Q10;
161        rd2_Q20 = - q2_Q10 * Lambda_Q10;
162
163        table[ 32 + k ][ 0 ] = q1_Q10;
164        table[ 32 + k ][ 1 ] = q2_Q10;
165        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
166        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
167    }
168
169    if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
170        LSF_interpolation_flag = 0;
171    } else {
172        LSF_interpolation_flag = 1;
173    }
174
175    ALLOC( sLTP_Q15,
176           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
177    ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
178    ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
179    /* Set up pointers to start of sub frame */
180    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
181    NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
182    pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];
183    for( k = 0; k < psEncC->nb_subfr; k++ ) {
184        A_Q12      = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
185        B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER ];
186        AR_shp_Q13 = &AR2_Q13[     k * MAX_SHAPE_LPC_ORDER ];
187
188        /* Noise shape parameters */
189        silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
190        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
191        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
192
193        NSQ->rewhite_flag = 0;
194        if( psIndices->signalType == TYPE_VOICED ) {
195            /* Voiced */
196            lag = pitchL[ k ];
197
198            /* Re-whitening */
199            if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
200                /* Rewhiten with new A coefs */
201                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
202                silk_assert( start_idx > 0 );
203
204                silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
205                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
206
207                NSQ->rewhite_flag = 1;
208                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
209            }
210        }
211
212        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
213
214        if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )
215        {
216            silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
217                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ],
218                offset_Q10, psEncC->subfr_length, &(table[32]) );
219        }
220        else
221        {
222            silk_noise_shape_quantizer( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
223                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
224                offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
225        }
226
227        x_Q3   += psEncC->subfr_length;
228        pulses += psEncC->subfr_length;
229        pxq    += psEncC->subfr_length;
230    }
231
232    /* Update lagPrev for next frame */
233    NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];
234
235    /* Save quantized speech and noise shaping signals */
236    /* DEBUG_STORE_DATA( enc.pcm, &NSQ->xq[ psEncC->ltp_mem_length ], psEncC->frame_length * sizeof( opus_int16 ) ) */
237    silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
238    silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
239    RESTORE_STACK;
240}
241
/*******************************************/
/* silk_noise_shape_quantizer_10_16_sse4_1 */
/*******************************************/
/*
 * Noise shaping quantizer for one subframe, hard-wired to 16 short-term
 * prediction coefficients (a_Q12[ 0..15 ]) and 10 noise shaping coefficients
 * (AR_shp_Q13[ 0..9 ]).
 *
 * The 16 most recent entries of the short-term state and the first 8 entries
 * of the shaping state are kept in SSE registers for the whole subframe, with
 * every 32-bit value split into a vector of high 16-bit halves and a vector of
 * low 16-bit halves so that the 32x16-bit products can be formed with 16-bit
 * multiplies. Shaping state entries 8 and 9 stay in NSQ->sAR2_Q14[].
 *
 * For every sample the function writes pulses[ i ], appends to
 * NSQ->sLPC_Q14, NSQ->sLTP_shp_Q14 and sLTP_Q15, and advances
 * NSQ->sLTP_shp_buf_idx, NSQ->sLTP_buf_idx and NSQ->rand_seed. After the loop
 * it stores NSQ->sLF_AR_shp_Q14 and NSQ->sAR2_Q14[ 0..7 ], writes xq[] and
 * moves the tail of NSQ->sLPC_Q14 to its start.
 *
 * 'table' is indexed with a signed quantization level, so it must point into
 * the middle of the caller's array (the caller in this file passes row 32 of
 * a 64-row table).
 */
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
    opus_int            signalType,             /* I    Signal type                     */
    const opus_int32    x_sc_Q10[],             /* I    Input scaled with 1/Gain        */
    opus_int8           pulses[],               /* O    Quantized pulse signal          */
    opus_int16          xq[],                   /* O    Quantized signal, gain restored */
    opus_int32          sLTP_Q15[],             /* I/O  LTP state                       */
    const opus_int16    a_Q12[],                /* I    Short term prediction coefs     */
    const opus_int16    b_Q14[],                /* I    Long term prediction coefs      */
    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping AR coefs          */
    opus_int            lag,                    /* I    Pitch lag                       */
    opus_int32          HarmShapeFIRPacked_Q14, /* I    Packed harmonic shaping FIR coefs */
    opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
    opus_int32          LF_shp_Q14,             /* I    Low frequency shaping coefs     */
    opus_int32          Gain_Q16,               /* I    Quantization step size          */
    opus_int            offset_Q10,             /* I    Quantization offset             */
    opus_int            length,                 /* I    Input length                    */
    opus_int32          table[][4]              /* I    Per-level candidate/decision table */
)
{
    opus_int     i;
    opus_int32   LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
    opus_int32   n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
    opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
    opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
    opus_int32   *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;

    /* Scratch registers */
    __m128i xmm_tempa, xmm_tempb;

    /* Used as a byte-shuffle mask during setup and as eight 16-bit ones inside the loop */
    __m128i xmm_one;

    /* Short-term state history, split into high and low 16-bit halves */
    __m128i psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF;
    __m128i psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF;
    /* Short-term coefficients, stored in reversed order to line up with the history lanes */
    __m128i a_Q12_01234567,        a_Q12_89ABCDEF;

    /* Shaping state entries 0..7 (split into halves) and shaping coefficients 0..7 */
    __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
    __m128i AR_shp_Q13_76543210;

    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
    Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );

    /* Set up short term AR state */
    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 ];

    sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
    xq_Q14         = psLPC_Q14[ 0 ];
    LTP_pred_Q13   = 0;

    /* Byte-shuffle mask that reverses the order of the eight 16-bit words of a register */
    xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 );

    /* load a_Q12[0] - a_Q12[7] */
    a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(&a_Q12[ 0 ] ) );
    /* load a_Q12[ 8 ] - a_Q12[ 15 ] */
    a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(&a_Q12[ 8 ] ) );

    /* After the reversal, word 7 holds a_Q12[ 0 ] (resp. a_Q12[ 8 ]) and word 0 holds a_Q12[ 7 ] (resp. a_Q12[ 15 ]) */
    a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one );
    a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one );

    /* load AR_shp_Q13[ 0 ] - AR_shp_Q13[ 7 ], kept in memory order */
    AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(&AR_shp_Q13[0] ) );

    /* Byte-shuffle mask for four 32-bit values: gathers their low 16-bit halves in the low 64 bits */
    /* of the register and their high 16-bit halves in the high 64 bits                             */
    xmm_one = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 );

    /* psLPC_Q14[ -16 .. -9 ]: split into a vector of high halves and a vector of low halves */
    xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-16]) );
    xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-12]) );

    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );

    psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
    psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );

    /* psLPC_Q14[ -8 .. -1 ]: same split */
    xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -8 ]) );
    xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -4 ]) );

    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );

    psLPC_Q14_hi_01234567 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
    psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );

    /* load sAR2_Q14[ 0 .. 7 ]: same split, word j of each vector belongs to sAR2_Q14[ j ] */
    xmm_tempa = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 0 ]) ) );
    xmm_tempb = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 4 ]) ) );

    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );

    sAR2_Q14_hi_76543210 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
    sAR2_Q14_lo_76543210 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );

    /* prepare 1 in 8 * 16bit; multiplying by it with pmaddwd sums adjacent 16-bit words into 32 bits */
    xmm_one = _mm_set1_epi16(1);

    for( i = 0; i < length; i++ )
    {
        /* Short-term prediction */
        __m128i xmm_hi_07, xmm_hi_8F, xmm_lo_07, xmm_lo_8F;

        /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
        LPC_pred_Q10 = 8; /* silk_RSHIFT( predictLPCOrder, 1 ); */

        /* shift psLPC_Q14: drop the oldest value from the 16-entry history ... */
        psLPC_Q14_hi_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF, 2 );
        psLPC_Q14_lo_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF, 2 );

        psLPC_Q14_hi_01234567 = _mm_srli_si128( psLPC_Q14_hi_01234567, 2 );
        psLPC_Q14_lo_01234567 = _mm_srli_si128( psLPC_Q14_lo_01234567, 2 );

        /* ... and append the newest one (xq_Q14) in word 7, opposite a_Q12[ 0 ] */
        psLPC_Q14_hi_01234567 = _mm_insert_epi16( psLPC_Q14_hi_01234567, (xq_Q14 >> 16), 7 );
        psLPC_Q14_lo_01234567 = _mm_insert_epi16( psLPC_Q14_lo_01234567, (xq_Q14),       7 );

        /* high part, use pmaddwd, results in 4 32-bit */
        xmm_hi_07 = _mm_madd_epi16( psLPC_Q14_hi_01234567, a_Q12_01234567 );
        xmm_hi_8F = _mm_madd_epi16( psLPC_Q14_hi_89ABCDEF, a_Q12_89ABCDEF );

        /* low part, use pmulhw, results in 8 16-bit. The low halves are unsigned but pmulhw is a    */
        /* signed multiply, so wherever a low half has its top bit set the coefficient is added back */
        /* (mask built with a compare against zero, equivalent to _mm_srai_epi16( lo, 15 ))          */
        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_01234567 );
        xmm_tempb = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_89ABCDEF );

        xmm_tempa = _mm_and_si128( xmm_tempa, a_Q12_01234567 );
        xmm_tempb = _mm_and_si128( xmm_tempb, a_Q12_89ABCDEF );

        xmm_lo_07 = _mm_mulhi_epi16( psLPC_Q14_lo_01234567, a_Q12_01234567 );
        xmm_lo_8F = _mm_mulhi_epi16( psLPC_Q14_lo_89ABCDEF, a_Q12_89ABCDEF );

        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
        xmm_lo_8F = _mm_add_epi16( xmm_lo_8F, xmm_tempb );

        /* widen the 16-bit low-part results to 32 bits by summing adjacent pairs */
        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
        xmm_lo_8F = _mm_madd_epi16( xmm_lo_8F, xmm_one );

        /* accumulate */
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_hi_8F );
        xmm_lo_07 = _mm_add_epi32( xmm_lo_07, xmm_lo_8F );

        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );

        /* horizontal sum of the four 32-bit lanes into lane 0 (0x0E brings lane 1 down to lane 0) */
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );

        LPC_pred_Q10 += _mm_cvtsi128_si32( xmm_hi_07 );

        /* Long-term prediction */
        if ( opus_likely( signalType == TYPE_VOICED ) ) {
            /* Unrolled loop */
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
            LTP_pred_Q13 = 2;
            {
                __m128i b_Q14_3210, b_Q14_0123, pred_lag_ptr_0123;

                /* presumably b_Q14[ 0..3 ] sign-extended to 32 bits, lane j = b_Q14[ j ] - macro defined elsewhere */
                b_Q14_3210 = OP_CVTEPI16_EPI32_M64( b_Q14 );
                /* reversed copy: lane j = b_Q14[ 3 - j ] */
                b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B );

                /* loaded in memory order: lanes 0..3 = pred_lag_ptr[ -3 ], [ -2 ], [ -1 ], [ 0 ] */
                pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
                /* reversed into a new xmm: lanes 0..3 = pred_lag_ptr[ 0 ], [ -1 ], [ -2 ], [ -3 ] */
                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B );
                /* 64-bit products of lanes 0 and 2: pred_lag_ptr[ 0 ] * b_Q14[ 0 ], pred_lag_ptr[ -2 ] * b_Q14[ 2 ] */
                xmm_tempa = _mm_mul_epi32( xmm_tempa, b_Q14_3210 );
                /* right shift 2 bytes (16 bits): lanes 0 and 2 now hold ( product >> 16 ); lanes 1 and 3 are not used */
                xmm_tempa = _mm_srli_si128( xmm_tempa, 2 );

                /* 64-bit products of lanes 0 and 2: pred_lag_ptr[ -3 ] * b_Q14[ 3 ], pred_lag_ptr[ -1 ] * b_Q14[ 1 ] */
                pred_lag_ptr_0123 = _mm_mul_epi32( pred_lag_ptr_0123, b_Q14_0123 );
                pred_lag_ptr_0123 = _mm_srli_si128( pred_lag_ptr_0123, 2 );

                pred_lag_ptr_0123 = _mm_add_epi32( pred_lag_ptr_0123, xmm_tempa );
                /* bring lane 2 down to lane 0 (equal to a shift right by 8 bytes) and add, so lane 0 sums taps 0..3 */
                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, _MM_SHUFFLE( 0, 0, 3, 2 ) );
                xmm_tempa = _mm_add_epi32( xmm_tempa, pred_lag_ptr_0123 );

                LTP_pred_Q13 += _mm_cvtsi128_si32( xmm_tempa );

                /* fifth tap is done in scalar code */
                LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
                pred_lag_ptr++;
            }
        }

        /* Noise shape feedback */
        /* Entries 8 and 9 of the shaping state live in memory: 9 takes the old 8, */
        /* and 8 takes the old entry 7, which is about to be shifted out of the registers */
        NSQ->sAR2_Q14[ 9 ] = NSQ->sAR2_Q14[ 8 ];
        NSQ->sAR2_Q14[ 8 ] = _mm_cvtsi128_si32( _mm_srli_si128(_mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ), 12 ) );

        /* move entries 0..6 up by one word and insert xq_Q14 as the new entry 0 */
        sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
        sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );

        sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 );
        sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14),       0 );

        /* high part, use pmaddwd, results in 4 32-bit */
        xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );

        /* low part, use pmulhw, results in 8 16-bit, with the same unsigned-low-half correction as */
        /* in the short-term prediction (equivalent to _mm_srai_epi16( sAR2_Q14_lo_76543210, 15 ))   */
        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), sAR2_Q14_lo_76543210 );
        xmm_tempa = _mm_and_si128( xmm_tempa, AR_shp_Q13_76543210 );

        xmm_lo_07 = _mm_mulhi_epi16( sAR2_Q14_lo_76543210, AR_shp_Q13_76543210 );
        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );

        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );

        /* accumulate */
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );

        /* horizontal sum of the four 32-bit lanes into lane 0 */
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );

        /* 5 is presumably the rounding bias shapingLPCOrder / 2, by analogy with LPC_pred_Q10 above */
        n_AR_Q12 = 5 + _mm_cvtsi128_si32( xmm_hi_07 );

        /* shaping taps 8 and 9 in scalar code */
        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 8 ], AR_shp_Q13[ 8 ] );
        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 9 ], AR_shp_Q13[ 9 ] );

        n_AR_Q12 = silk_LSHIFT32( n_AR_Q12, 1 );                                /* Q11 -> Q12 */
        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, sLF_AR_shp_Q14, Tilt_Q14 );

        n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
        n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );

        silk_assert( lag > 0 || signalType != TYPE_VOICED );

        /* Combine prediction and noise shaping signals */
        tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 );        /* Q12 */
        tmp1 = silk_SUB32( tmp1, n_LF_Q12 );                                    /* Q12 */
        if( lag > 0 ) {
            /* Symmetric, packed FIR coefficients */
            n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
            n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
            n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
            shp_lag_ptr++;

            tmp2 = silk_SUB32( LTP_pred_Q13, n_LTP_Q13 );                       /* Q13 */
            tmp1 = silk_ADD_LSHIFT32( tmp2, tmp1, 1 );                          /* Q13 */
            tmp1 = silk_RSHIFT_ROUND( tmp1, 3 );                                /* Q10 */
        } else {
            tmp1 = silk_RSHIFT_ROUND( tmp1, 2 );                                /* Q10 */
        }

        r_Q10 = silk_SUB32( x_sc_Q10[ i ], tmp1 );                              /* residual error Q10 */

        /* Generate dither */
        NSQ->rand_seed = silk_RAND( NSQ->rand_seed );

        /* Flip sign depending on dither */
        tmp2 = -r_Q10;
        if ( NSQ->rand_seed < 0 ) r_Q10 = tmp2;

        r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );

        /* Find two quantization level candidates and measure their rate-distortion */
        q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
        q1_Q0 = silk_RSHIFT( q1_Q10, 10 );

        /* q1_Q0 may be negative: 'table' has to point into the middle of the caller's array */
        q1_Q10 = table[q1_Q0][0];
        q2_Q10 = table[q1_Q0][1];

        /* Decision from the precomputed terms [2] and [3]; a negative value selects the second candidate */
        if (r_Q10 * table[q1_Q0][2] - table[q1_Q0][3] < 0)
        {
            q1_Q10 = q2_Q10;
        }

        pulses[ i ] = (opus_int8)silk_RSHIFT_ROUND( q1_Q10, 10 );

        /* Excitation */
        exc_Q14 = silk_LSHIFT( q1_Q10, 4 );

        /* Undo the dither-dependent sign flip applied to the residual above */
        tmp2 = -exc_Q14;
        if ( NSQ->rand_seed < 0 ) exc_Q14 = tmp2;

        /* Add predictions */
        LPC_exc_Q14 = silk_ADD_LSHIFT32( exc_Q14, LTP_pred_Q13, 1 );
        xq_Q14      = silk_ADD_LSHIFT32( LPC_exc_Q14, LPC_pred_Q10, 4 );

        /* Update states */
        psLPC_Q14++;
        *psLPC_Q14 = xq_Q14;
        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 );

        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
        sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
        NSQ->sLTP_shp_buf_idx++;
        NSQ->sLTP_buf_idx++;

        /* Make dither dependent on quantized signal */
        NSQ->rand_seed = silk_ADD32_ovflw( NSQ->rand_seed, pulses[ i ] );
    }

    NSQ->sLF_AR_shp_Q14 = sLF_AR_shp_Q14;

    /* Scale XQ back to normal level before saving; psLPC_Q14[ i ] is the value written for sample i above */
    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH ];

    /* write back sAR2_Q14[ 0 .. 7 ]: interleave the low and high halves into 32-bit values again */
    xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
    xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
    _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa );
    _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb );

    /* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */
    {
        __m128i xmm_Gain_Q10;
        __m128i xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, xmm_xq_Q14_7654, xmm_xq_Q14_x7x5;

        /* prepare (1 << 7) in packed 4 32-bits */
        xmm_tempa = _mm_set1_epi32( (1 << 7) );

        /* prepare Gain_Q10 in packed 4 32-bits */
        xmm_Gain_Q10 = _mm_set1_epi32( Gain_Q10 );

        /* process xq, eight samples per iteration */
        for (i = 0; i < length - 7; i += 8)
        {
            xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 0 ] ) ) );
            xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 4 ] ) ) );

            /* equal shift right 4 bytes: brings the odd elements into the even lanes used by _mm_mul_epi32 */
            xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
            /* equal shift right 4 bytes*/
            xmm_xq_Q14_x7x5 = _mm_shuffle_epi32( xmm_xq_Q14_7654, _MM_SHUFFLE( 0, 3, 2, 1 ) );

            /* 64-bit products of the even lanes */
            xmm_xq_Q14_3210 = _mm_mul_epi32( xmm_xq_Q14_3210, xmm_Gain_Q10 );
            xmm_xq_Q14_x3x1 = _mm_mul_epi32( xmm_xq_Q14_x3x1, xmm_Gain_Q10 );
            xmm_xq_Q14_7654 = _mm_mul_epi32( xmm_xq_Q14_7654, xmm_Gain_Q10 );
            xmm_xq_Q14_x7x5 = _mm_mul_epi32( xmm_xq_Q14_x7x5, xmm_Gain_Q10 );

            /* place bits 16..47 of each product in its destination 32-bit lane: */
            /* even elements in the low half, odd elements in the high half      */
            xmm_xq_Q14_3210 = _mm_srli_epi64( xmm_xq_Q14_3210, 16 );
            xmm_xq_Q14_x3x1 = _mm_slli_epi64( xmm_xq_Q14_x3x1, 16 );
            xmm_xq_Q14_7654 = _mm_srli_epi64( xmm_xq_Q14_7654, 16 );
            xmm_xq_Q14_x7x5 = _mm_slli_epi64( xmm_xq_Q14_x7x5, 16 );

            /* 0xCC takes 32-bit lanes 1 and 3 from the second operand */
            xmm_xq_Q14_3210 = _mm_blend_epi16( xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, 0xCC );
            xmm_xq_Q14_7654 = _mm_blend_epi16( xmm_xq_Q14_7654, xmm_xq_Q14_x7x5, 0xCC );

            /* silk_RSHIFT_ROUND(xq, 8) */
            xmm_xq_Q14_3210 = _mm_add_epi32( xmm_xq_Q14_3210, xmm_tempa );
            xmm_xq_Q14_7654 = _mm_add_epi32( xmm_xq_Q14_7654, xmm_tempa );

            xmm_xq_Q14_3210 = _mm_srai_epi32( xmm_xq_Q14_3210, 8 );
            xmm_xq_Q14_7654 = _mm_srai_epi32( xmm_xq_Q14_7654, 8 );

            /* silk_SAT16: pack to 16 bits with signed saturation */
            xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 );

            /* save to xq */
            _mm_storeu_si128( (__m128i *)(&xq[ i ] ), xmm_xq_Q14_3210 );
        }
    }
    /* scalar tail for the samples left over when length is not a multiple of 8 */
    for ( ; i < length; i++)
    {
        xq[i] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) );
    }

    /* Update LPC synth buffer: keep the last NSQ_LPC_BUF_LENGTH values as history for the next call */
    silk_memcpy( NSQ->sLPC_Q14, &NSQ->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
}
602
603static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
604    const silk_encoder_state *psEncC,           /* I    Encoder State                   */
605    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
606    const opus_int32    x_Q3[],                 /* I    input in Q3                     */
607    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain        */
608    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0     */
609    opus_int32          sLTP_Q15[],             /* O    LTP state matching scaled input */
610    opus_int            subfr,                  /* I    subframe number                 */
611    const opus_int      LTP_scale_Q14,          /* I                                    */
612    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                 */
613    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag                       */
614    const opus_int      signal_type             /* I    Signal type                     */
615)
616{
617    opus_int   i, lag;
618    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
619    __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
620
621    lag          = pitchL[ subfr ];
622    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
623    silk_assert( inv_gain_Q31 != 0 );
624
625    /* Calculate gain adjustment factor */
626    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
627        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
628    } else {
629        gain_adj_Q16 = (opus_int32)1 << 16;
630    }
631
632    /* Scale input */
633    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
634
635    /* prepare inv_gain_Q23 in packed 4 32-bits */
636    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
637
638    for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
639        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
640
641        /* equal shift right 4 bytes*/
642        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
643
644        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
645        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
646
647        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
648        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
649
650        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
651
652        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 );
653    }
654
655    for( ; i < psEncC->subfr_length; i++ ) {
656        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
657    }
658
659    /* Save inverse gain */
660    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
661
662    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
663    if( NSQ->rewhite_flag ) {
664        if( subfr == 0 ) {
665            /* Do LTP downscaling */
666            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
667        }
668        for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
669            silk_assert( i < MAX_FRAME_LENGTH );
670            sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
671        }
672    }
673
674    /* Adjust for changing gain */
675    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
676        /* Scale long-term shaping state */
677        __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
678
679        /* prepare gain_adj_Q16 in packed 4 32-bits */
680        xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16);
681
682        for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
683        {
684            xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
685            /* equal shift right 4 bytes*/
686            xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
687
688            xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
689            xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );
690
691            xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 );
692            xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );
693
694            xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );
695
696            _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
697        }
698
699        for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
700            NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
701        }
702
703        /* Scale long-term prediction state */
704        if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
705            for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
706                sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
707            }
708        }
709
710        NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );
711
712        /* Scale short-term prediction and shaping states */
713        for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
714            NSQ->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLPC_Q14[ i ] );
715        }
716        for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
717            NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
718        }
719    }
720}
721