1/* Copyright (c) 2014, Cisco Systems, INC 2 Written by XiangMingZhu WeiZhou MinPeng YanWang 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions 6 are met: 7 8 - Redistributions of source code must retain the above copyright 9 notice, this list of conditions and the following disclaimer. 10 11 - Redistributions in binary form must reproduce the above copyright 12 notice, this list of conditions and the following disclaimer in the 13 documentation and/or other materials provided with the distribution. 14 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26*/ 27 28#ifdef HAVE_CONFIG_H 29#include "config.h" 30#endif 31 32#include <xmmintrin.h> 33#include <emmintrin.h> 34#include <smmintrin.h> 35#include "main.h" 36#include "celt/x86/x86cpu.h" 37#include "stack_alloc.h" 38 39static OPUS_INLINE void silk_nsq_scale_states_sse4_1( 40 const silk_encoder_state *psEncC, /* I Encoder State */ 41 silk_nsq_state *NSQ, /* I/O NSQ state */ 42 const opus_int32 x_Q3[], /* I input in Q3 */ 43 opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */ 44 const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */ 45 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ 46 opus_int subfr, /* I subframe number */ 47 const opus_int LTP_scale_Q14, /* I */ 48 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ 49 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ 50 const opus_int signal_type /* I Signal type */ 51); 52 53static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( 54 silk_nsq_state *NSQ, /* I/O NSQ state */ 55 opus_int signalType, /* I Signal type */ 56 const opus_int32 x_sc_Q10[], /* I */ 57 opus_int8 pulses[], /* O */ 58 opus_int16 xq[], /* O */ 59 opus_int32 sLTP_Q15[], /* I/O LTP state */ 60 const opus_int16 a_Q12[], /* I Short term prediction coefs */ 61 const opus_int16 b_Q14[], /* I Long term prediction coefs */ 62 const opus_int16 AR_shp_Q13[], /* I Noise shaping AR coefs */ 63 opus_int lag, /* I Pitch lag */ 64 opus_int32 HarmShapeFIRPacked_Q14, /* I */ 65 opus_int Tilt_Q14, /* I Spectral tilt */ 66 opus_int32 LF_shp_Q14, /* I */ 67 opus_int32 Gain_Q16, /* I */ 68 opus_int offset_Q10, /* I */ 69 opus_int length, /* I Input length */ 70 opus_int32 table[][4] /* I */ 71); 72 73void silk_NSQ_sse4_1( 74 const silk_encoder_state *psEncC, /* I Encoder State */ 75 silk_nsq_state *NSQ, /* I/O NSQ state */ 76 SideInfoIndices *psIndices, /* I/O Quantization Indices */ 77 const opus_int32 x_Q3[], /* I Prefiltered input signal */ 78 opus_int8 pulses[], /* O Quantized pulse signal */ 79 const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ 80 const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ 81 const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ 82 const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ 83 const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ 84 const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ 85 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ 86 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ 87 const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ 88 const opus_int LTP_scale_Q14 /* I LTP state scaling */ 89) 90{ 91 opus_int k, lag, start_idx, LSF_interpolation_flag; 92 const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13; 93 opus_int16 *pxq; 94 VARDECL( opus_int32, sLTP_Q15 ); 95 VARDECL( opus_int16, sLTP ); 96 opus_int32 HarmShapeFIRPacked_Q14; 97 opus_int offset_Q10; 98 VARDECL( opus_int32, x_sc_Q10 ); 99 100 opus_int32 table[ 64 ][ 4 ]; 101 opus_int32 tmp1; 102 opus_int32 q1_Q10, q2_Q10, rd1_Q20, rd2_Q20; 103 104 SAVE_STACK; 105 106 NSQ->rand_seed = psIndices->Seed; 107 108 /* Set unvoiced lag to the previous one, overwrite later for voiced */ 109 lag = NSQ->lagPrev; 110 111 silk_assert( NSQ->prev_gain_Q16 != 0 ); 112 113 offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ]; 114 115 /* 0 */ 116 q1_Q10 = offset_Q10; 117 q2_Q10 = offset_Q10 + ( 1024 - QUANT_LEVEL_ADJUST_Q10 ); 118 rd1_Q20 = q1_Q10 * Lambda_Q10; 119 rd2_Q20 = q2_Q10 * Lambda_Q10; 120 121 table[ 32 ][ 0 ] = q1_Q10; 122 table[ 32 ][ 1 ] = q2_Q10; 123 table[ 32 ][ 2 ] = 2 * (q1_Q10 - q2_Q10); 124 table[ 32 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10); 125 126 /* -1 */ 127 q1_Q10 = offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ); 128 q2_Q10 = offset_Q10; 129 rd1_Q20 = - q1_Q10 * Lambda_Q10; 130 rd2_Q20 = q2_Q10 * Lambda_Q10; 131 132 table[ 31 ][ 0 ] = q1_Q10; 133 table[ 31 ][ 1 ] = q2_Q10; 134 table[ 31 ][ 2 ] = 2 * (q1_Q10 - q2_Q10); 135 table[ 31 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10); 136 137 /* > 0 */ 138 for (k = 1; k <= 31; k++) 139 { 140 tmp1 = offset_Q10 + silk_LSHIFT( k, 10 ); 141 142 q1_Q10 = tmp1 - QUANT_LEVEL_ADJUST_Q10; 143 q2_Q10 = tmp1 - QUANT_LEVEL_ADJUST_Q10 + 1024; 144 rd1_Q20 = q1_Q10 * Lambda_Q10; 145 rd2_Q20 = q2_Q10 * Lambda_Q10; 146 147 table[ 32 + k ][ 0 ] = q1_Q10; 148 table[ 32 + k ][ 1 ] = q2_Q10; 149 table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10); 150 table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10); 151 } 152 153 /* < -1 */ 154 for (k = -32; k <= -2; k++) 155 { 156 tmp1 = offset_Q10 + silk_LSHIFT( k, 10 ); 157 158 q1_Q10 = tmp1 + QUANT_LEVEL_ADJUST_Q10; 159 q2_Q10 = tmp1 + QUANT_LEVEL_ADJUST_Q10 + 1024; 160 rd1_Q20 = - q1_Q10 * Lambda_Q10; 161 rd2_Q20 = - q2_Q10 * Lambda_Q10; 162 163 table[ 32 + k ][ 0 ] = q1_Q10; 164 table[ 32 + k ][ 1 ] = q2_Q10; 165 table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10); 166 table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10); 167 } 168 169 if( psIndices->NLSFInterpCoef_Q2 == 4 ) { 170 LSF_interpolation_flag = 0; 171 } else { 172 LSF_interpolation_flag = 1; 173 } 174 175 ALLOC( sLTP_Q15, 176 psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 ); 177 ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 ); 178 ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 ); 179 /* Set up pointers to start of sub frame */ 180 NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length; 181 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; 182 pxq = &NSQ->xq[ psEncC->ltp_mem_length ]; 183 for( k = 0; k < psEncC->nb_subfr; k++ ) { 184 A_Q12 = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ]; 185 B_Q14 = <PCoef_Q14[ k * LTP_ORDER ]; 186 AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ]; 187 188 /* Noise shape parameters */ 189 silk_assert( HarmShapeGain_Q14[ k ] >= 0 ); 190 HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 ); 191 HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 ); 192 193 NSQ->rewhite_flag = 0; 194 if( psIndices->signalType == TYPE_VOICED ) { 195 /* Voiced */ 196 lag = pitchL[ k ]; 197 198 /* Re-whitening */ 199 if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) { 200 /* Rewhiten with new A coefs */ 201 start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2; 202 silk_assert( start_idx > 0 ); 203 204 silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ], 205 A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch ); 206 207 NSQ->rewhite_flag = 1; 208 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; 209 } 210 } 211 212 silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType ); 213 214 if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) ) 215 { 216 silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14, 217 AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], 218 offset_Q10, psEncC->subfr_length, &(table[32]) ); 219 } 220 else 221 { 222 silk_noise_shape_quantizer( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14, 223 AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10, 224 offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch ); 225 } 226 227 x_Q3 += psEncC->subfr_length; 228 pulses += psEncC->subfr_length; 229 pxq += psEncC->subfr_length; 230 } 231 232 /* Update lagPrev for next frame */ 233 NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ]; 234 235 /* Save quantized speech and noise shaping signals */ 236 silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) ); 237 silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) ); 238 RESTORE_STACK; 239} 240 241/***********************************/ 242/* silk_noise_shape_quantizer_10_16 */ 243/***********************************/ 244static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( 245 silk_nsq_state *NSQ, /* I/O NSQ state */ 246 opus_int signalType, /* I Signal type */ 247 const opus_int32 x_sc_Q10[], /* I */ 248 opus_int8 pulses[], /* O */ 249 opus_int16 xq[], /* O */ 250 opus_int32 sLTP_Q15[], /* I/O LTP state */ 251 const opus_int16 a_Q12[], /* I Short term prediction coefs */ 252 const opus_int16 b_Q14[], /* I Long term prediction coefs */ 253 const opus_int16 AR_shp_Q13[], /* I Noise shaping AR coefs */ 254 opus_int lag, /* I Pitch lag */ 255 opus_int32 HarmShapeFIRPacked_Q14, /* I */ 256 opus_int Tilt_Q14, /* I Spectral tilt */ 257 opus_int32 LF_shp_Q14, /* I */ 258 opus_int32 Gain_Q16, /* I */ 259 opus_int offset_Q10, /* I */ 260 opus_int length, /* I Input length */ 261 opus_int32 table[][4] /* I */ 262) 263{ 264 opus_int i; 265 opus_int32 LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13; 266 opus_int32 n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10; 267 opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10; 268 opus_int32 tmp1, tmp2, sLF_AR_shp_Q14; 269 opus_int32 *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr; 270 271 __m128i xmm_tempa, xmm_tempb; 272 273 __m128i xmm_one; 274 275 __m128i psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF; 276 __m128i psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF; 277 __m128i a_Q12_01234567, a_Q12_89ABCDEF; 278 279 __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210; 280 __m128i AR_shp_Q13_76543210; 281 282 shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ]; 283 pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ]; 284 Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 ); 285 286 /* Set up short term AR state */ 287 psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 ]; 288 289 sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14; 290 xq_Q14 = psLPC_Q14[ 0 ]; 291 LTP_pred_Q13 = 0; 292 293 /* load a_Q12 */ 294 xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 ); 295 296 /* load a_Q12[0] - a_Q12[7] */ 297 a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(&a_Q12[ 0 ] ) ); 298 /* load a_Q12[ 8 ] - a_Q12[ 15 ] */ 299 a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(&a_Q12[ 8 ] ) ); 300 301 a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one ); 302 a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one ); 303 304 /* load AR_shp_Q13 */ 305 AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(&AR_shp_Q13[0] ) ); 306 307 /* load psLPC_Q14 */ 308 xmm_one = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 ); 309 310 xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-16]) ); 311 xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-12]) ); 312 313 xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one ); 314 xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one ); 315 316 psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb ); 317 psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb ); 318 319 xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -8 ]) ); 320 xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -4 ]) ); 321 322 xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one ); 323 xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one ); 324 325 psLPC_Q14_hi_01234567 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb ); 326 psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb ); 327 328 /* load sAR2_Q14 */ 329 xmm_tempa = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 0 ]) ) ); 330 xmm_tempb = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 4 ]) ) ); 331 332 xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one ); 333 xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one ); 334 335 sAR2_Q14_hi_76543210 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb ); 336 sAR2_Q14_lo_76543210 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb ); 337 338 /* prepare 1 in 8 * 16bit */ 339 xmm_one = _mm_set1_epi16(1); 340 341 for( i = 0; i < length; i++ ) 342 { 343 /* Short-term prediction */ 344 __m128i xmm_hi_07, xmm_hi_8F, xmm_lo_07, xmm_lo_8F; 345 346 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ 347 LPC_pred_Q10 = 8; /* silk_RSHIFT( predictLPCOrder, 1 ); */ 348 349 /* shift psLPC_Q14 */ 350 psLPC_Q14_hi_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF, 2 ); 351 psLPC_Q14_lo_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF, 2 ); 352 353 psLPC_Q14_hi_01234567 = _mm_srli_si128( psLPC_Q14_hi_01234567, 2 ); 354 psLPC_Q14_lo_01234567 = _mm_srli_si128( psLPC_Q14_lo_01234567, 2 ); 355 356 psLPC_Q14_hi_01234567 = _mm_insert_epi16( psLPC_Q14_hi_01234567, (xq_Q14 >> 16), 7 ); 357 psLPC_Q14_lo_01234567 = _mm_insert_epi16( psLPC_Q14_lo_01234567, (xq_Q14), 7 ); 358 359 /* high part, use pmaddwd, results in 4 32-bit */ 360 xmm_hi_07 = _mm_madd_epi16( psLPC_Q14_hi_01234567, a_Q12_01234567 ); 361 xmm_hi_8F = _mm_madd_epi16( psLPC_Q14_hi_89ABCDEF, a_Q12_89ABCDEF ); 362 363 /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed, _mm_srai_epi16(psLPC_Q14_lo_01234567, 15) */ 364 xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_01234567 ); 365 xmm_tempb = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_89ABCDEF ); 366 367 xmm_tempa = _mm_and_si128( xmm_tempa, a_Q12_01234567 ); 368 xmm_tempb = _mm_and_si128( xmm_tempb, a_Q12_89ABCDEF ); 369 370 xmm_lo_07 = _mm_mulhi_epi16( psLPC_Q14_lo_01234567, a_Q12_01234567 ); 371 xmm_lo_8F = _mm_mulhi_epi16( psLPC_Q14_lo_89ABCDEF, a_Q12_89ABCDEF ); 372 373 xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa ); 374 xmm_lo_8F = _mm_add_epi16( xmm_lo_8F, xmm_tempb ); 375 376 xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one ); 377 xmm_lo_8F = _mm_madd_epi16( xmm_lo_8F, xmm_one ); 378 379 /* accumulate */ 380 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_hi_8F ); 381 xmm_lo_07 = _mm_add_epi32( xmm_lo_07, xmm_lo_8F ); 382 383 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 ); 384 385 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) ); 386 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) ); 387 388 LPC_pred_Q10 += _mm_cvtsi128_si32( xmm_hi_07 ); 389 390 /* Long-term prediction */ 391 if ( opus_likely( signalType == TYPE_VOICED ) ) { 392 /* Unrolled loop */ 393 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ 394 LTP_pred_Q13 = 2; 395 { 396 __m128i b_Q14_3210, b_Q14_0123, pred_lag_ptr_0123; 397 398 b_Q14_3210 = OP_CVTEPI16_EPI32_M64( b_Q14 ); 399 b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B ); 400 401 /* loaded: [0] [-1] [-2] [-3] */ 402 pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) ); 403 /* shuffle to [-3] [-2] [-1] [0] and to new xmm */ 404 xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B ); 405 /*64-bit multiply, a[2] * b[-2], a[0] * b[0] */ 406 xmm_tempa = _mm_mul_epi32( xmm_tempa, b_Q14_3210 ); 407 /* right shift 2 bytes (16 bits), zero extended */ 408 xmm_tempa = _mm_srli_si128( xmm_tempa, 2 ); 409 410 /* a[1] * b[-1], a[3] * b[-3] */ 411 pred_lag_ptr_0123 = _mm_mul_epi32( pred_lag_ptr_0123, b_Q14_0123 ); 412 pred_lag_ptr_0123 = _mm_srli_si128( pred_lag_ptr_0123, 2 ); 413 414 pred_lag_ptr_0123 = _mm_add_epi32( pred_lag_ptr_0123, xmm_tempa ); 415 /* equal shift right 8 bytes*/ 416 xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, _MM_SHUFFLE( 0, 0, 3, 2 ) ); 417 xmm_tempa = _mm_add_epi32( xmm_tempa, pred_lag_ptr_0123 ); 418 419 LTP_pred_Q13 += _mm_cvtsi128_si32( xmm_tempa ); 420 421 LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -4 ], b_Q14[ 4 ] ); 422 pred_lag_ptr++; 423 } 424 } 425 426 /* Noise shape feedback */ 427 NSQ->sAR2_Q14[ 9 ] = NSQ->sAR2_Q14[ 8 ]; 428 NSQ->sAR2_Q14[ 8 ] = _mm_cvtsi128_si32( _mm_srli_si128(_mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ), 12 ) ); 429 430 sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 ); 431 sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 ); 432 433 sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 ); 434 sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14), 0 ); 435 436 /* high part, use pmaddwd, results in 4 32-bit */ 437 xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 ); 438 439 /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed,_mm_srai_epi16(sAR2_Q14_lo_76543210, 15) */ 440 xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), sAR2_Q14_lo_76543210 ); 441 xmm_tempa = _mm_and_si128( xmm_tempa, AR_shp_Q13_76543210 ); 442 443 xmm_lo_07 = _mm_mulhi_epi16( sAR2_Q14_lo_76543210, AR_shp_Q13_76543210 ); 444 xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa ); 445 446 xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one ); 447 448 /* accumulate */ 449 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 ); 450 451 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) ); 452 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) ); 453 454 n_AR_Q12 = 5 + _mm_cvtsi128_si32( xmm_hi_07 ); 455 456 n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 8 ], AR_shp_Q13[ 8 ] ); 457 n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 9 ], AR_shp_Q13[ 9 ] ); 458 459 n_AR_Q12 = silk_LSHIFT32( n_AR_Q12, 1 ); /* Q11 -> Q12 */ 460 n_AR_Q12 = silk_SMLAWB( n_AR_Q12, sLF_AR_shp_Q14, Tilt_Q14 ); 461 462 n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 ); 463 n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 ); 464 465 silk_assert( lag > 0 || signalType != TYPE_VOICED ); 466 467 /* Combine prediction and noise shaping signals */ 468 tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 ); /* Q12 */ 469 tmp1 = silk_SUB32( tmp1, n_LF_Q12 ); /* Q12 */ 470 if( lag > 0 ) { 471 /* Symmetric, packed FIR coefficients */ 472 n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); 473 n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); 474 n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 ); 475 shp_lag_ptr++; 476 477 tmp2 = silk_SUB32( LTP_pred_Q13, n_LTP_Q13 ); /* Q13 */ 478 tmp1 = silk_ADD_LSHIFT32( tmp2, tmp1, 1 ); /* Q13 */ 479 tmp1 = silk_RSHIFT_ROUND( tmp1, 3 ); /* Q10 */ 480 } else { 481 tmp1 = silk_RSHIFT_ROUND( tmp1, 2 ); /* Q10 */ 482 } 483 484 r_Q10 = silk_SUB32( x_sc_Q10[ i ], tmp1 ); /* residual error Q10 */ 485 486 /* Generate dither */ 487 NSQ->rand_seed = silk_RAND( NSQ->rand_seed ); 488 489 /* Flip sign depending on dither */ 490 tmp2 = -r_Q10; 491 if ( NSQ->rand_seed < 0 ) r_Q10 = tmp2; 492 493 r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 ); 494 495 /* Find two quantization level candidates and measure their rate-distortion */ 496 q1_Q10 = silk_SUB32( r_Q10, offset_Q10 ); 497 q1_Q0 = silk_RSHIFT( q1_Q10, 10 ); 498 499 q1_Q10 = table[q1_Q0][0]; 500 q2_Q10 = table[q1_Q0][1]; 501 502 if (r_Q10 * table[q1_Q0][2] - table[q1_Q0][3] < 0) 503 { 504 q1_Q10 = q2_Q10; 505 } 506 507 pulses[ i ] = (opus_int8)silk_RSHIFT_ROUND( q1_Q10, 10 ); 508 509 /* Excitation */ 510 exc_Q14 = silk_LSHIFT( q1_Q10, 4 ); 511 512 tmp2 = -exc_Q14; 513 if ( NSQ->rand_seed < 0 ) exc_Q14 = tmp2; 514 515 /* Add predictions */ 516 LPC_exc_Q14 = silk_ADD_LSHIFT32( exc_Q14, LTP_pred_Q13, 1 ); 517 xq_Q14 = silk_ADD_LSHIFT32( LPC_exc_Q14, LPC_pred_Q10, 4 ); 518 519 /* Update states */ 520 psLPC_Q14++; 521 *psLPC_Q14 = xq_Q14; 522 sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 ); 523 524 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 ); 525 sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 ); 526 NSQ->sLTP_shp_buf_idx++; 527 NSQ->sLTP_buf_idx++; 528 529 /* Make dither dependent on quantized signal */ 530 NSQ->rand_seed = silk_ADD32_ovflw( NSQ->rand_seed, pulses[ i ] ); 531 } 532 533 NSQ->sLF_AR_shp_Q14 = sLF_AR_shp_Q14; 534 535 /* Scale XQ back to normal level before saving */ 536 psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH ]; 537 538 /* write back sAR2_Q14 */ 539 xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ); 540 xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ); 541 _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa ); 542 _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb ); 543 544 /* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */ 545 { 546 __m128i xmm_Gain_Q10; 547 __m128i xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, xmm_xq_Q14_7654, xmm_xq_Q14_x7x5; 548 549 /* prepare (1 << 7) in packed 4 32-bits */ 550 xmm_tempa = _mm_set1_epi32( (1 << 7) ); 551 552 /* prepare Gain_Q10 in packed 4 32-bits */ 553 xmm_Gain_Q10 = _mm_set1_epi32( Gain_Q10 ); 554 555 /* process xq */ 556 for (i = 0; i < length - 7; i += 8) 557 { 558 xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 0 ] ) ) ); 559 xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 4 ] ) ) ); 560 561 /* equal shift right 4 bytes*/ 562 xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 563 /* equal shift right 4 bytes*/ 564 xmm_xq_Q14_x7x5 = _mm_shuffle_epi32( xmm_xq_Q14_7654, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 565 566 xmm_xq_Q14_3210 = _mm_mul_epi32( xmm_xq_Q14_3210, xmm_Gain_Q10 ); 567 xmm_xq_Q14_x3x1 = _mm_mul_epi32( xmm_xq_Q14_x3x1, xmm_Gain_Q10 ); 568 xmm_xq_Q14_7654 = _mm_mul_epi32( xmm_xq_Q14_7654, xmm_Gain_Q10 ); 569 xmm_xq_Q14_x7x5 = _mm_mul_epi32( xmm_xq_Q14_x7x5, xmm_Gain_Q10 ); 570 571 xmm_xq_Q14_3210 = _mm_srli_epi64( xmm_xq_Q14_3210, 16 ); 572 xmm_xq_Q14_x3x1 = _mm_slli_epi64( xmm_xq_Q14_x3x1, 16 ); 573 xmm_xq_Q14_7654 = _mm_srli_epi64( xmm_xq_Q14_7654, 16 ); 574 xmm_xq_Q14_x7x5 = _mm_slli_epi64( xmm_xq_Q14_x7x5, 16 ); 575 576 xmm_xq_Q14_3210 = _mm_blend_epi16( xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, 0xCC ); 577 xmm_xq_Q14_7654 = _mm_blend_epi16( xmm_xq_Q14_7654, xmm_xq_Q14_x7x5, 0xCC ); 578 579 /* silk_RSHIFT_ROUND(xq, 8) */ 580 xmm_xq_Q14_3210 = _mm_add_epi32( xmm_xq_Q14_3210, xmm_tempa ); 581 xmm_xq_Q14_7654 = _mm_add_epi32( xmm_xq_Q14_7654, xmm_tempa ); 582 583 xmm_xq_Q14_3210 = _mm_srai_epi32( xmm_xq_Q14_3210, 8 ); 584 xmm_xq_Q14_7654 = _mm_srai_epi32( xmm_xq_Q14_7654, 8 ); 585 586 /* silk_SAT16 */ 587 xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 ); 588 589 /* save to xq */ 590 _mm_storeu_si128( (__m128i *)(&xq[ i ] ), xmm_xq_Q14_3210 ); 591 } 592 } 593 for ( ; i < length; i++) 594 { 595 xq[i] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); 596 } 597 598 /* Update LPC synth buffer */ 599 silk_memcpy( NSQ->sLPC_Q14, &NSQ->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) ); 600} 601 602static OPUS_INLINE void silk_nsq_scale_states_sse4_1( 603 const silk_encoder_state *psEncC, /* I Encoder State */ 604 silk_nsq_state *NSQ, /* I/O NSQ state */ 605 const opus_int32 x_Q3[], /* I input in Q3 */ 606 opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */ 607 const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */ 608 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ 609 opus_int subfr, /* I subframe number */ 610 const opus_int LTP_scale_Q14, /* I */ 611 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ 612 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ 613 const opus_int signal_type /* I Signal type */ 614) 615{ 616 opus_int i, lag; 617 opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23; 618 __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1; 619 620 lag = pitchL[ subfr ]; 621 inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 ); 622 silk_assert( inv_gain_Q31 != 0 ); 623 624 /* Calculate gain adjustment factor */ 625 if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) { 626 gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 ); 627 } else { 628 gain_adj_Q16 = (opus_int32)1 << 16; 629 } 630 631 /* Scale input */ 632 inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 ); 633 634 /* prepare inv_gain_Q23 in packed 4 32-bits */ 635 xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23); 636 637 for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) { 638 xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) ); 639 640 /* equal shift right 4 bytes*/ 641 xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 642 643 xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 ); 644 xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 ); 645 646 xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 ); 647 xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 ); 648 649 xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC ); 650 651 _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 ); 652 } 653 654 for( ; i < psEncC->subfr_length; i++ ) { 655 x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 ); 656 } 657 658 /* Save inverse gain */ 659 NSQ->prev_gain_Q16 = Gains_Q16[ subfr ]; 660 661 /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */ 662 if( NSQ->rewhite_flag ) { 663 if( subfr == 0 ) { 664 /* Do LTP downscaling */ 665 inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 ); 666 } 667 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) { 668 silk_assert( i < MAX_FRAME_LENGTH ); 669 sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] ); 670 } 671 } 672 673 /* Adjust for changing gain */ 674 if( gain_adj_Q16 != (opus_int32)1 << 16 ) { 675 /* Scale long-term shaping state */ 676 __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1; 677 678 /* prepare gain_adj_Q16 in packed 4 32-bits */ 679 xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16); 680 681 for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 ) 682 { 683 xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) ); 684 /* equal shift right 4 bytes*/ 685 xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 686 687 xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 ); 688 xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 ); 689 690 xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 ); 691 xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 ); 692 693 xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC ); 694 695 _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 ); 696 } 697 698 for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) { 699 NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] ); 700 } 701 702 /* Scale long-term prediction state */ 703 if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) { 704 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) { 705 sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] ); 706 } 707 } 708 709 NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 ); 710 711 /* Scale short-term prediction and shaping states */ 712 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) { 713 NSQ->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLPC_Q14[ i ] ); 714 } 715 for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) { 716 NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] ); 717 } 718 } 719} 720