1/* Copyright (c) 2014, Cisco Systems, INC 2 Written by XiangMingZhu WeiZhou MinPeng YanWang 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions 6 are met: 7 8 - Redistributions of source code must retain the above copyright 9 notice, this list of conditions and the following disclaimer. 10 11 - Redistributions in binary form must reproduce the above copyright 12 notice, this list of conditions and the following disclaimer in the 13 documentation and/or other materials provided with the distribution. 14 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26*/ 27 28#ifdef HAVE_CONFIG_H 29#include "config.h" 30#endif 31 32#include <xmmintrin.h> 33#include <emmintrin.h> 34#include <smmintrin.h> 35#include "main.h" 36#include "celt/x86/x86cpu.h" 37 38#include "stack_alloc.h" 39 40typedef struct { 41 opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ]; 42 opus_int32 RandState[ DECISION_DELAY ]; 43 opus_int32 Q_Q10[ DECISION_DELAY ]; 44 opus_int32 Xq_Q14[ DECISION_DELAY ]; 45 opus_int32 Pred_Q15[ DECISION_DELAY ]; 46 opus_int32 Shape_Q14[ DECISION_DELAY ]; 47 opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ]; 48 opus_int32 LF_AR_Q14; 49 opus_int32 Seed; 50 opus_int32 SeedInit; 51 opus_int32 RD_Q10; 52} NSQ_del_dec_struct; 53 54typedef struct { 55 opus_int32 Q_Q10; 56 opus_int32 RD_Q10; 57 opus_int32 xq_Q14; 58 opus_int32 LF_AR_Q14; 59 opus_int32 sLTP_shp_Q14; 60 opus_int32 LPC_exc_Q14; 61} NSQ_sample_struct; 62 63typedef NSQ_sample_struct NSQ_sample_pair[ 2 ]; 64 65static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( 66 const silk_encoder_state *psEncC, /* I Encoder State */ 67 silk_nsq_state *NSQ, /* I/O NSQ state */ 68 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ 69 const opus_int32 x_Q3[], /* I Input in Q3 */ 70 opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */ 71 const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ 72 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ 73 opus_int subfr, /* I Subframe number */ 74 opus_int nStatesDelayedDecision, /* I Number of del dec states */ 75 const opus_int LTP_scale_Q14, /* I LTP state scaling */ 76 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ 77 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ 78 const opus_int signal_type, /* I Signal type */ 79 const opus_int decisionDelay /* I Decision delay */ 80); 81 82/******************************************/ 83/* Noise shape quantizer for one subframe */ 84/******************************************/ 85static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( 86 silk_nsq_state *NSQ, /* I/O NSQ state */ 87 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ 88 opus_int signalType, /* I Signal type */ 89 const opus_int32 x_Q10[], /* I */ 90 opus_int8 pulses[], /* O */ 91 opus_int16 xq[], /* O */ 92 opus_int32 sLTP_Q15[], /* I/O LTP filter state */ 93 opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */ 94 const opus_int16 a_Q12[], /* I Short term prediction coefs */ 95 const opus_int16 b_Q14[], /* I Long term prediction coefs */ 96 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */ 97 opus_int lag, /* I Pitch lag */ 98 opus_int32 HarmShapeFIRPacked_Q14, /* I */ 99 opus_int Tilt_Q14, /* I Spectral tilt */ 100 opus_int32 LF_shp_Q14, /* I */ 101 opus_int32 Gain_Q16, /* I */ 102 opus_int Lambda_Q10, /* I */ 103 opus_int offset_Q10, /* I */ 104 opus_int length, /* I Input length */ 105 opus_int subfr, /* I Subframe number */ 106 opus_int shapingLPCOrder, /* I Shaping LPC filter order */ 107 opus_int predictLPCOrder, /* I Prediction filter order */ 108 opus_int warping_Q16, /* I */ 109 opus_int nStatesDelayedDecision, /* I Number of states in decision tree */ 110 opus_int *smpl_buf_idx, /* I Index to newest samples in buffers */ 111 opus_int decisionDelay /* I */ 112); 113 114void silk_NSQ_del_dec_sse4_1( 115 const silk_encoder_state *psEncC, /* I/O Encoder State */ 116 silk_nsq_state *NSQ, /* I/O NSQ state */ 117 SideInfoIndices *psIndices, /* I/O Quantization Indices */ 118 const opus_int32 x_Q3[], /* I Prefiltered input signal */ 119 opus_int8 pulses[], /* O Quantized pulse signal */ 120 const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ 121 const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ 122 const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ 123 const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ 124 const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ 125 const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ 126 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ 127 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ 128 const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ 129 const opus_int LTP_scale_Q14 /* I LTP state scaling */ 130) 131{ 132 opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr; 133 opus_int last_smple_idx, smpl_buf_idx, decisionDelay; 134 const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13; 135 opus_int16 *pxq; 136 VARDECL( opus_int32, sLTP_Q15 ); 137 VARDECL( opus_int16, sLTP ); 138 opus_int32 HarmShapeFIRPacked_Q14; 139 opus_int offset_Q10; 140 opus_int32 RDmin_Q10, Gain_Q10; 141 VARDECL( opus_int32, x_sc_Q10 ); 142 VARDECL( opus_int32, delayedGain_Q10 ); 143 VARDECL( NSQ_del_dec_struct, psDelDec ); 144 NSQ_del_dec_struct *psDD; 145 SAVE_STACK; 146 147 /* Set unvoiced lag to the previous one, overwrite later for voiced */ 148 lag = NSQ->lagPrev; 149 150 silk_assert( NSQ->prev_gain_Q16 != 0 ); 151 152 /* Initialize delayed decision states */ 153 ALLOC( psDelDec, psEncC->nStatesDelayedDecision, NSQ_del_dec_struct ); 154 silk_memset( psDelDec, 0, psEncC->nStatesDelayedDecision * sizeof( NSQ_del_dec_struct ) ); 155 for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) { 156 psDD = &psDelDec[ k ]; 157 psDD->Seed = ( k + psIndices->Seed ) & 3; 158 psDD->SeedInit = psDD->Seed; 159 psDD->RD_Q10 = 0; 160 psDD->LF_AR_Q14 = NSQ->sLF_AR_shp_Q14; 161 psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ]; 162 silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) ); 163 silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) ); 164 } 165 166 offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ]; 167 smpl_buf_idx = 0; /* index of oldest samples */ 168 169 decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length ); 170 171 /* For voiced frames limit the decision delay to lower than the pitch lag */ 172 if( psIndices->signalType == TYPE_VOICED ) { 173 for( k = 0; k < psEncC->nb_subfr; k++ ) { 174 decisionDelay = silk_min_int( decisionDelay, pitchL[ k ] - LTP_ORDER / 2 - 1 ); 175 } 176 } else { 177 if( lag > 0 ) { 178 decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 ); 179 } 180 } 181 182 if( psIndices->NLSFInterpCoef_Q2 == 4 ) { 183 LSF_interpolation_flag = 0; 184 } else { 185 LSF_interpolation_flag = 1; 186 } 187 188 ALLOC( sLTP_Q15, 189 psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 ); 190 ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 ); 191 ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 ); 192 ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 ); 193 /* Set up pointers to start of sub frame */ 194 pxq = &NSQ->xq[ psEncC->ltp_mem_length ]; 195 NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length; 196 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; 197 subfr = 0; 198 for( k = 0; k < psEncC->nb_subfr; k++ ) { 199 A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ]; 200 B_Q14 = <PCoef_Q14[ k * LTP_ORDER ]; 201 AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ]; 202 203 /* Noise shape parameters */ 204 silk_assert( HarmShapeGain_Q14[ k ] >= 0 ); 205 HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 ); 206 HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 ); 207 208 NSQ->rewhite_flag = 0; 209 if( psIndices->signalType == TYPE_VOICED ) { 210 /* Voiced */ 211 lag = pitchL[ k ]; 212 213 /* Re-whitening */ 214 if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) { 215 if( k == 2 ) { 216 /* RESET DELAYED DECISIONS */ 217 /* Find winner */ 218 RDmin_Q10 = psDelDec[ 0 ].RD_Q10; 219 Winner_ind = 0; 220 for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) { 221 if( psDelDec[ i ].RD_Q10 < RDmin_Q10 ) { 222 RDmin_Q10 = psDelDec[ i ].RD_Q10; 223 Winner_ind = i; 224 } 225 } 226 for( i = 0; i < psEncC->nStatesDelayedDecision; i++ ) { 227 if( i != Winner_ind ) { 228 psDelDec[ i ].RD_Q10 += ( silk_int32_MAX >> 4 ); 229 silk_assert( psDelDec[ i ].RD_Q10 >= 0 ); 230 } 231 } 232 233 /* Copy final part of signals from winner state to output and long-term filter states */ 234 psDD = &psDelDec[ Winner_ind ]; 235 last_smple_idx = smpl_buf_idx + decisionDelay; 236 for( i = 0; i < decisionDelay; i++ ) { 237 last_smple_idx = ( last_smple_idx - 1 ) & DECISION_DELAY_MASK; 238 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 ); 239 pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( 240 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gains_Q16[ 1 ] ), 14 ) ); 241 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ]; 242 } 243 244 subfr = 0; 245 } 246 247 /* Rewhiten with new A coefs */ 248 start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2; 249 silk_assert( start_idx > 0 ); 250 251 silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ], 252 A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch ); 253 254 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; 255 NSQ->rewhite_flag = 1; 256 } 257 } 258 259 silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, 260 psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay ); 261 262 silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, 263 delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], 264 Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder, 265 psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay ); 266 267 x_Q3 += psEncC->subfr_length; 268 pulses += psEncC->subfr_length; 269 pxq += psEncC->subfr_length; 270 } 271 272 /* Find winner */ 273 RDmin_Q10 = psDelDec[ 0 ].RD_Q10; 274 Winner_ind = 0; 275 for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) { 276 if( psDelDec[ k ].RD_Q10 < RDmin_Q10 ) { 277 RDmin_Q10 = psDelDec[ k ].RD_Q10; 278 Winner_ind = k; 279 } 280 } 281 282 /* Copy final part of signals from winner state to output and long-term filter states */ 283 psDD = &psDelDec[ Winner_ind ]; 284 psIndices->Seed = psDD->SeedInit; 285 last_smple_idx = smpl_buf_idx + decisionDelay; 286 Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 ); 287 for( i = 0; i < decisionDelay; i++ ) { 288 last_smple_idx = ( last_smple_idx - 1 ) & DECISION_DELAY_MASK; 289 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 ); 290 pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( 291 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) ); 292 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ]; 293 } 294 silk_memcpy( NSQ->sLPC_Q14, &psDD->sLPC_Q14[ psEncC->subfr_length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) ); 295 silk_memcpy( NSQ->sAR2_Q14, psDD->sAR2_Q14, sizeof( psDD->sAR2_Q14 ) ); 296 297 /* Update states */ 298 NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14; 299 NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ]; 300 301 /* Save quantized speech signal */ 302 /* DEBUG_STORE_DATA( enc.pcm, &NSQ->xq[psEncC->ltp_mem_length], psEncC->frame_length * sizeof( opus_int16 ) ) */ 303 silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) ); 304 silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) ); 305 RESTORE_STACK; 306} 307 308/******************************************/ 309/* Noise shape quantizer for one subframe */ 310/******************************************/ 311static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( 312 silk_nsq_state *NSQ, /* I/O NSQ state */ 313 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ 314 opus_int signalType, /* I Signal type */ 315 const opus_int32 x_Q10[], /* I */ 316 opus_int8 pulses[], /* O */ 317 opus_int16 xq[], /* O */ 318 opus_int32 sLTP_Q15[], /* I/O LTP filter state */ 319 opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */ 320 const opus_int16 a_Q12[], /* I Short term prediction coefs */ 321 const opus_int16 b_Q14[], /* I Long term prediction coefs */ 322 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */ 323 opus_int lag, /* I Pitch lag */ 324 opus_int32 HarmShapeFIRPacked_Q14, /* I */ 325 opus_int Tilt_Q14, /* I Spectral tilt */ 326 opus_int32 LF_shp_Q14, /* I */ 327 opus_int32 Gain_Q16, /* I */ 328 opus_int Lambda_Q10, /* I */ 329 opus_int offset_Q10, /* I */ 330 opus_int length, /* I Input length */ 331 opus_int subfr, /* I Subframe number */ 332 opus_int shapingLPCOrder, /* I Shaping LPC filter order */ 333 opus_int predictLPCOrder, /* I Prediction filter order */ 334 opus_int warping_Q16, /* I */ 335 opus_int nStatesDelayedDecision, /* I Number of states in decision tree */ 336 opus_int *smpl_buf_idx, /* I Index to newest samples in buffers */ 337 opus_int decisionDelay /* I */ 338) 339{ 340 opus_int i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx; 341 opus_int32 Winner_rand_state; 342 opus_int32 LTP_pred_Q14, LPC_pred_Q14, n_AR_Q14, n_LTP_Q14; 343 opus_int32 n_LF_Q14, r_Q10, rr_Q10, rd1_Q10, rd2_Q10, RDmin_Q10, RDmax_Q10; 344 opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10; 345 opus_int32 tmp1, tmp2, sLF_AR_shp_Q14; 346 opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14; 347 VARDECL( NSQ_sample_pair, psSampleState ); 348 NSQ_del_dec_struct *psDD; 349 NSQ_sample_struct *psSS; 350 351 __m128i a_Q12_0123, a_Q12_4567, a_Q12_89AB, a_Q12_CDEF; 352 __m128i b_Q12_0123, b_sr_Q12_0123; 353 SAVE_STACK; 354 355 silk_assert( nStatesDelayedDecision > 0 ); 356 ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair ); 357 358 shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ]; 359 pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ]; 360 Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 ); 361 362 a_Q12_0123 = OP_CVTEPI16_EPI32_M64( a_Q12 ); 363 a_Q12_4567 = OP_CVTEPI16_EPI32_M64( a_Q12 + 4 ); 364 365 if( opus_likely( predictLPCOrder == 16 ) ) { 366 a_Q12_89AB = OP_CVTEPI16_EPI32_M64( a_Q12 + 8 ); 367 a_Q12_CDEF = OP_CVTEPI16_EPI32_M64( a_Q12 + 12 ); 368 } 369 370 if( signalType == TYPE_VOICED ){ 371 b_Q12_0123 = OP_CVTEPI16_EPI32_M64( b_Q14 ); 372 b_sr_Q12_0123 = _mm_shuffle_epi32( b_Q12_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */ 373 } 374 for( i = 0; i < length; i++ ) { 375 /* Perform common calculations used in all states */ 376 377 /* Long-term prediction */ 378 if( signalType == TYPE_VOICED ) { 379 /* Unrolled loop */ 380 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ 381 LTP_pred_Q14 = 2; 382 { 383 __m128i tmpa, tmpb, pred_lag_ptr_tmp; 384 pred_lag_ptr_tmp = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) ); 385 pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, 0x1B ); 386 tmpa = _mm_mul_epi32( pred_lag_ptr_tmp, b_Q12_0123 ); 387 tmpa = _mm_srli_si128( tmpa, 2 ); 388 389 pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) );/* equal shift right 4 bytes */ 390 pred_lag_ptr_tmp = _mm_mul_epi32( pred_lag_ptr_tmp, b_sr_Q12_0123 ); 391 pred_lag_ptr_tmp = _mm_srli_si128( pred_lag_ptr_tmp, 2 ); 392 pred_lag_ptr_tmp = _mm_add_epi32( pred_lag_ptr_tmp, tmpa ); 393 394 tmpb = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 0, 3, 2 ) );/* equal shift right 8 bytes */ 395 pred_lag_ptr_tmp = _mm_add_epi32( pred_lag_ptr_tmp, tmpb ); 396 LTP_pred_Q14 += _mm_cvtsi128_si32( pred_lag_ptr_tmp ); 397 398 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] ); 399 LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 ); /* Q13 -> Q14 */ 400 pred_lag_ptr++; 401 } 402 } else { 403 LTP_pred_Q14 = 0; 404 } 405 406 /* Long-term shaping */ 407 if( lag > 0 ) { 408 /* Symmetric, packed FIR coefficients */ 409 n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); 410 n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); 411 n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */ 412 shp_lag_ptr++; 413 } else { 414 n_LTP_Q14 = 0; 415 } 416 { 417 __m128i tmpa, tmpb, psLPC_Q14_tmp, a_Q12_tmp; 418 419 for( k = 0; k < nStatesDelayedDecision; k++ ) { 420 /* Delayed decision state */ 421 psDD = &psDelDec[ k ]; 422 423 /* Sample state */ 424 psSS = psSampleState[ k ]; 425 426 /* Generate dither */ 427 psDD->Seed = silk_RAND( psDD->Seed ); 428 429 /* Pointer used in short term prediction and shaping */ 430 psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ]; 431 /* Short-term prediction */ 432 silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 ); 433 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ 434 LPC_pred_Q14 = silk_RSHIFT( predictLPCOrder, 1 ); 435 436 tmpb = _mm_setzero_si128(); 437 438 /* step 1 */ 439 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */ 440 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); /* 0, -1, -2, -3 */ 441 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_0123 ); /* 0, -1, -2, -3 * 0123 -> 0*0, 2*-2 */ 442 443 tmpa = _mm_srli_epi64( tmpa, 16 ); 444 tmpb = _mm_add_epi32( tmpb, tmpa ); 445 446 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */ 447 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_0123, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */ 448 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); /* 1*-1, 3*-3 */ 449 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 ); 450 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp ); 451 452 /* step 2 */ 453 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -7 ] ) ); 454 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); 455 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_4567 ); 456 tmpa = _mm_srli_epi64( tmpa, 16 ); 457 tmpb = _mm_add_epi32( tmpb, tmpa ); 458 459 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */ 460 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_4567, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */ 461 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); 462 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 ); 463 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp ); 464 465 if ( opus_likely( predictLPCOrder == 16 ) ) 466 { 467 /* step 3 */ 468 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -11 ] ) ); 469 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); 470 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_89AB ); 471 tmpa = _mm_srli_epi64( tmpa, 16 ); 472 tmpb = _mm_add_epi32( tmpb, tmpa ); 473 474 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */ 475 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_89AB, _MM_SHUFFLE(0, 3, 2, 1 ) );/* equal shift right 4 bytes */ 476 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); 477 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 ); 478 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp ); 479 480 /* setp 4 */ 481 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) ); 482 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); 483 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF ); 484 tmpa = _mm_srli_epi64( tmpa, 16 ); 485 tmpb = _mm_add_epi32( tmpb, tmpa ); 486 487 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */ 488 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_CDEF, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */ 489 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); 490 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 ); 491 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp ); 492 493 /* add at last */ 494 /* equal shift right 8 bytes*/ 495 tmpa = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) ); 496 tmpb = _mm_add_epi32( tmpb, tmpa ); 497 LPC_pred_Q14 += _mm_cvtsi128_si32( tmpb ); 498 } 499 else 500 { 501 /* add at last */ 502 tmpa = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) ); /* equal shift right 8 bytes*/ 503 tmpb = _mm_add_epi32( tmpb, tmpa ); 504 LPC_pred_Q14 += _mm_cvtsi128_si32( tmpb ); 505 506 LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -8 ], a_Q12[ 8 ] ); 507 LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -9 ], a_Q12[ 9 ] ); 508 } 509 510 LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */ 511 512 /* Noise shape feedback */ 513 silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */ 514 /* Output of lowpass section */ 515 tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 ); 516 /* Output of allpass section */ 517 tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 ); 518 psDD->sAR2_Q14[ 0 ] = tmp2; 519 n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 ); 520 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] ); 521 /* Loop over allpass sections */ 522 for( j = 2; j < shapingLPCOrder; j += 2 ) { 523 /* Output of allpass section */ 524 tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], psDD->sAR2_Q14[ j + 0 ] - tmp1, warping_Q16 ); 525 psDD->sAR2_Q14[ j - 1 ] = tmp1; 526 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] ); 527 /* Output of allpass section */ 528 tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], psDD->sAR2_Q14[ j + 1 ] - tmp2, warping_Q16 ); 529 psDD->sAR2_Q14[ j + 0 ] = tmp2; 530 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] ); 531 } 532 psDD->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1; 533 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ shapingLPCOrder - 1 ] ); 534 535 n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 1 ); /* Q11 -> Q12 */ 536 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, psDD->LF_AR_Q14, Tilt_Q14 ); /* Q12 */ 537 n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 2 ); /* Q12 -> Q14 */ 538 539 n_LF_Q14 = silk_SMULWB( psDD->Shape_Q14[ *smpl_buf_idx ], LF_shp_Q14 ); /* Q12 */ 540 n_LF_Q14 = silk_SMLAWT( n_LF_Q14, psDD->LF_AR_Q14, LF_shp_Q14 ); /* Q12 */ 541 n_LF_Q14 = silk_LSHIFT( n_LF_Q14, 2 ); /* Q12 -> Q14 */ 542 543 /* Input minus prediction plus noise feedback */ 544 /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */ 545 tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 ); /* Q14 */ 546 tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */ 547 tmp1 = silk_SUB32( tmp2, tmp1 ); /* Q13 */ 548 tmp1 = silk_RSHIFT_ROUND( tmp1, 4 ); /* Q10 */ 549 550 r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 ); /* residual error Q10 */ 551 552 /* Flip sign depending on dither */ 553 if ( psDD->Seed < 0 ) { 554 r_Q10 = -r_Q10; 555 } 556 r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 ); 557 558 /* Find two quantization level candidates and measure their rate-distortion */ 559 q1_Q10 = silk_SUB32( r_Q10, offset_Q10 ); 560 q1_Q0 = silk_RSHIFT( q1_Q10, 10 ); 561 if( q1_Q0 > 0 ) { 562 q1_Q10 = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 ); 563 q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 ); 564 q2_Q10 = silk_ADD32( q1_Q10, 1024 ); 565 rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 ); 566 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 ); 567 } else if( q1_Q0 == 0 ) { 568 q1_Q10 = offset_Q10; 569 q2_Q10 = silk_ADD32( q1_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 ); 570 rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 ); 571 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 ); 572 } else if( q1_Q0 == -1 ) { 573 q2_Q10 = offset_Q10; 574 q1_Q10 = silk_SUB32( q2_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 ); 575 rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 ); 576 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 ); 577 } else { /* q1_Q0 < -1 */ 578 q1_Q10 = silk_ADD32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 ); 579 q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 ); 580 q2_Q10 = silk_ADD32( q1_Q10, 1024 ); 581 rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 ); 582 rd2_Q10 = silk_SMULBB( -q2_Q10, Lambda_Q10 ); 583 } 584 rr_Q10 = silk_SUB32( r_Q10, q1_Q10 ); 585 rd1_Q10 = silk_RSHIFT( silk_SMLABB( rd1_Q10, rr_Q10, rr_Q10 ), 10 ); 586 rr_Q10 = silk_SUB32( r_Q10, q2_Q10 ); 587 rd2_Q10 = silk_RSHIFT( silk_SMLABB( rd2_Q10, rr_Q10, rr_Q10 ), 10 ); 588 589 if( rd1_Q10 < rd2_Q10 ) { 590 psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 ); 591 psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 ); 592 psSS[ 0 ].Q_Q10 = q1_Q10; 593 psSS[ 1 ].Q_Q10 = q2_Q10; 594 } else { 595 psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 ); 596 psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 ); 597 psSS[ 0 ].Q_Q10 = q2_Q10; 598 psSS[ 1 ].Q_Q10 = q1_Q10; 599 } 600 601 /* Update states for best quantization */ 602 603 /* Quantized excitation */ 604 exc_Q14 = silk_LSHIFT32( psSS[ 0 ].Q_Q10, 4 ); 605 if ( psDD->Seed < 0 ) { 606 exc_Q14 = -exc_Q14; 607 } 608 609 /* Add predictions */ 610 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 ); 611 xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 ); 612 613 /* Update states */ 614 sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 ); 615 psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 ); 616 psSS[ 0 ].LF_AR_Q14 = sLF_AR_shp_Q14; 617 psSS[ 0 ].LPC_exc_Q14 = LPC_exc_Q14; 618 psSS[ 0 ].xq_Q14 = xq_Q14; 619 620 /* Update states for second best quantization */ 621 622 /* Quantized excitation */ 623 exc_Q14 = silk_LSHIFT32( psSS[ 1 ].Q_Q10, 4 ); 624 if ( psDD->Seed < 0 ) { 625 exc_Q14 = -exc_Q14; 626 } 627 628 629 /* Add predictions */ 630 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 ); 631 xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 ); 632 633 /* Update states */ 634 sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 ); 635 psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 ); 636 psSS[ 1 ].LF_AR_Q14 = sLF_AR_shp_Q14; 637 psSS[ 1 ].LPC_exc_Q14 = LPC_exc_Q14; 638 psSS[ 1 ].xq_Q14 = xq_Q14; 639 } 640 } 641 *smpl_buf_idx = ( *smpl_buf_idx - 1 ) & DECISION_DELAY_MASK; /* Index to newest samples */ 642 last_smple_idx = ( *smpl_buf_idx + decisionDelay ) & DECISION_DELAY_MASK; /* Index to decisionDelay old samples */ 643 644 /* Find winner */ 645 RDmin_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10; 646 Winner_ind = 0; 647 for( k = 1; k < nStatesDelayedDecision; k++ ) { 648 if( psSampleState[ k ][ 0 ].RD_Q10 < RDmin_Q10 ) { 649 RDmin_Q10 = psSampleState[ k ][ 0 ].RD_Q10; 650 Winner_ind = k; 651 } 652 } 653 654 /* Increase RD values of expired states */ 655 Winner_rand_state = psDelDec[ Winner_ind ].RandState[ last_smple_idx ]; 656 for( k = 0; k < nStatesDelayedDecision; k++ ) { 657 if( psDelDec[ k ].RandState[ last_smple_idx ] != Winner_rand_state ) { 658 psSampleState[ k ][ 0 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 0 ].RD_Q10, silk_int32_MAX >> 4 ); 659 psSampleState[ k ][ 1 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 1 ].RD_Q10, silk_int32_MAX >> 4 ); 660 silk_assert( psSampleState[ k ][ 0 ].RD_Q10 >= 0 ); 661 } 662 } 663 664 /* Find worst in first set and best in second set */ 665 RDmax_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10; 666 RDmin_Q10 = psSampleState[ 0 ][ 1 ].RD_Q10; 667 RDmax_ind = 0; 668 RDmin_ind = 0; 669 for( k = 1; k < nStatesDelayedDecision; k++ ) { 670 /* find worst in first set */ 671 if( psSampleState[ k ][ 0 ].RD_Q10 > RDmax_Q10 ) { 672 RDmax_Q10 = psSampleState[ k ][ 0 ].RD_Q10; 673 RDmax_ind = k; 674 } 675 /* find best in second set */ 676 if( psSampleState[ k ][ 1 ].RD_Q10 < RDmin_Q10 ) { 677 RDmin_Q10 = psSampleState[ k ][ 1 ].RD_Q10; 678 RDmin_ind = k; 679 } 680 } 681 682 /* Replace a state if best from second set outperforms worst in first set */ 683 if( RDmin_Q10 < RDmax_Q10 ) { 684 silk_memcpy( ( (opus_int32 *)&psDelDec[ RDmax_ind ] ) + i, 685 ( (opus_int32 *)&psDelDec[ RDmin_ind ] ) + i, sizeof( NSQ_del_dec_struct ) - i * sizeof( opus_int32) ); 686 silk_memcpy( &psSampleState[ RDmax_ind ][ 0 ], &psSampleState[ RDmin_ind ][ 1 ], sizeof( NSQ_sample_struct ) ); 687 } 688 689 /* Write samples from winner to output and long-term filter states */ 690 psDD = &psDelDec[ Winner_ind ]; 691 if( subfr > 0 || i >= decisionDelay ) { 692 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 ); 693 xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( 694 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], delayedGain_Q10[ last_smple_idx ] ), 8 ) ); 695 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDD->Shape_Q14[ last_smple_idx ]; 696 sLTP_Q15[ NSQ->sLTP_buf_idx - decisionDelay ] = psDD->Pred_Q15[ last_smple_idx ]; 697 } 698 NSQ->sLTP_shp_buf_idx++; 699 NSQ->sLTP_buf_idx++; 700 701 /* Update states */ 702 for( k = 0; k < nStatesDelayedDecision; k++ ) { 703 psDD = &psDelDec[ k ]; 704 psSS = &psSampleState[ k ][ 0 ]; 705 psDD->LF_AR_Q14 = psSS->LF_AR_Q14; 706 psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14; 707 psDD->Xq_Q14[ *smpl_buf_idx ] = psSS->xq_Q14; 708 psDD->Q_Q10[ *smpl_buf_idx ] = psSS->Q_Q10; 709 psDD->Pred_Q15[ *smpl_buf_idx ] = silk_LSHIFT32( psSS->LPC_exc_Q14, 1 ); 710 psDD->Shape_Q14[ *smpl_buf_idx ] = psSS->sLTP_shp_Q14; 711 psDD->Seed = silk_ADD32_ovflw( psDD->Seed, silk_RSHIFT_ROUND( psSS->Q_Q10, 10 ) ); 712 psDD->RandState[ *smpl_buf_idx ] = psDD->Seed; 713 psDD->RD_Q10 = psSS->RD_Q10; 714 } 715 delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10; 716 } 717 /* Update LPC states */ 718 for( k = 0; k < nStatesDelayedDecision; k++ ) { 719 psDD = &psDelDec[ k ]; 720 silk_memcpy( psDD->sLPC_Q14, &psDD->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) ); 721 } 722 RESTORE_STACK; 723} 724 725static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( 726 const silk_encoder_state *psEncC, /* I Encoder State */ 727 silk_nsq_state *NSQ, /* I/O NSQ state */ 728 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ 729 const opus_int32 x_Q3[], /* I Input in Q3 */ 730 opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */ 731 const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ 732 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ 733 opus_int subfr, /* I Subframe number */ 734 opus_int nStatesDelayedDecision, /* I Number of del dec states */ 735 const opus_int LTP_scale_Q14, /* I LTP state scaling */ 736 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ 737 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ 738 const opus_int signal_type, /* I Signal type */ 739 const opus_int decisionDelay /* I Decision delay */ 740) 741{ 742 opus_int i, k, lag; 743 opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23; 744 NSQ_del_dec_struct *psDD; 745 __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1; 746 747 lag = pitchL[ subfr ]; 748 inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 ); 749 750 silk_assert( inv_gain_Q31 != 0 ); 751 752 /* Calculate gain adjustment factor */ 753 if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) { 754 gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 ); 755 } else { 756 gain_adj_Q16 = (opus_int32)1 << 16; 757 } 758 759 /* Scale input */ 760 inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 ); 761 762 /* prepare inv_gain_Q23 in packed 4 32-bits */ 763 xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23); 764 765 for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) { 766 xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) ); 767 /* equal shift right 4 bytes*/ 768 xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 769 770 xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 ); 771 xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 ); 772 773 xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 ); 774 xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 ); 775 776 xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC ); 777 778 _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 ); 779 } 780 781 for( ; i < psEncC->subfr_length; i++ ) { 782 x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 ); 783 } 784 785 /* Save inverse gain */ 786 NSQ->prev_gain_Q16 = Gains_Q16[ subfr ]; 787 788 /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */ 789 if( NSQ->rewhite_flag ) { 790 if( subfr == 0 ) { 791 /* Do LTP downscaling */ 792 inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 ); 793 } 794 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) { 795 silk_assert( i < MAX_FRAME_LENGTH ); 796 sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] ); 797 } 798 } 799 800 /* Adjust for changing gain */ 801 if( gain_adj_Q16 != (opus_int32)1 << 16 ) { 802 /* Scale long-term shaping state */ 803 { 804 __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1; 805 806 /* prepare gain_adj_Q16 in packed 4 32-bits */ 807 xmm_gain_adj_Q16 = _mm_set1_epi32( gain_adj_Q16 ); 808 809 for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 ) 810 { 811 xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) ); 812 /* equal shift right 4 bytes*/ 813 xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 814 815 xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 ); 816 xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 ); 817 818 xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 ); 819 xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 ); 820 821 xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC ); 822 823 _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 ); 824 } 825 826 for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) { 827 NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] ); 828 } 829 830 /* Scale long-term prediction state */ 831 if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) { 832 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) { 833 sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] ); 834 } 835 } 836 837 for( k = 0; k < nStatesDelayedDecision; k++ ) { 838 psDD = &psDelDec[ k ]; 839 840 /* Scale scalar states */ 841 psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 ); 842 843 /* Scale short-term prediction and shaping states */ 844 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) { 845 psDD->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sLPC_Q14[ i ] ); 846 } 847 for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) { 848 psDD->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sAR2_Q14[ i ] ); 849 } 850 for( i = 0; i < DECISION_DELAY; i++ ) { 851 psDD->Pred_Q15[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Pred_Q15[ i ] ); 852 psDD->Shape_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Shape_Q14[ i ] ); 853 } 854 } 855 } 856 } 857} 858