/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <tmmintrin.h>  // SSSE3

#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

void vp9_fdct8x8_quant_ssse3(
    const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs,
    int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr,
    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
    uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) {
  __m128i zero;
  int pass;

  // Constants
  //    In one case all eight lanes hold the same value; in every other case
  //    it is a pair of values that we need to repeat four times, which is
  //    done by constructing the 32-bit constant corresponding to that pair.
  const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Load input
  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
  __m128i *in[8];
  int index = 0;

  (void)scan_ptr;
  (void)coeff_ptr;

  // Pre-condition input (shift by two)
  in0 = _mm_slli_epi16(in0, 2);
  in1 = _mm_slli_epi16(in1, 2);
  in2 = _mm_slli_epi16(in2, 2);
  in3 = _mm_slli_epi16(in3, 2);
  in4 = _mm_slli_epi16(in4, 2);
  in5 = _mm_slli_epi16(in5, 2);
  in6 = _mm_slli_epi16(in6, 2);
  in7 = _mm_slli_epi16(in7, 2);

  in[0] = &in0;
  in[1] = &in1;
  in[2] = &in2;
  in[3] = &in3;
  in[4] = &in4;
  in[5] = &in5;
  in[6] = &in6;
  in[7] = &in7;

  // We do two passes: first the columns, then the rows. The results of the
  // first pass are transposed so that the same column code can be reused. The
  // results of the second pass are also transposed so that the rows (processed
  // as columns) are put back in row positions.
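  // Each pass starts with the standard DCT butterfly: q0..q3 are the
  // mirrored sums that feed the even outputs (res0/res2/res4/res6), and
  // q4..q7 are the mirrored differences that feed the odd outputs
  // (res1/res3/res5/res7).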
  for (pass = 0; pass < 2; pass++) {
    // To store results of each pass before the transpose.
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    // Add/subtract
    const __m128i q0 = _mm_add_epi16(in0, in7);
    const __m128i q1 = _mm_add_epi16(in1, in6);
    const __m128i q2 = _mm_add_epi16(in2, in5);
    const __m128i q3 = _mm_add_epi16(in3, in4);
    const __m128i q4 = _mm_sub_epi16(in3, in4);
    const __m128i q5 = _mm_sub_epi16(in2, in5);
    const __m128i q6 = _mm_sub_epi16(in1, in6);
    const __m128i q7 = _mm_sub_epi16(in0, in7);
    // Work on first four results
    {
      // Add/subtract
      const __m128i r0 = _mm_add_epi16(q0, q3);
      const __m128i r1 = _mm_add_epi16(q1, q2);
      const __m128i r2 = _mm_sub_epi16(q1, q2);
      const __m128i r3 = _mm_sub_epi16(q0, q3);
      // Interleave to do the multiply by constants which gets us into 32 bits
      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);

      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);

      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);

      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);

      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res0 = _mm_packs_epi32(w0, w1);
      res4 = _mm_packs_epi32(w2, w3);
      res2 = _mm_packs_epi32(w4, w5);
      res6 = _mm_packs_epi32(w6, w7);
    }
    // Work on next four results
    {
      // Rotate q5/q6 by cospi_16_64 with a single mulhrs per vector,
      // staying in 16 bits instead of widening to 32.
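      // This works because 23170 == 2 * cospi_16_64 and DCT_CONST_BITS == 14:
      // per lane, _mm_mulhrs_epi16(x, 23170) computes
      //   (x * 23170 + 0x4000) >> 15 == (x * cospi_16_64 + 0x2000) >> 14,
      // i.e. the same dct_const_round_shift(x * cospi_16_64) that the first
      // half obtains from the longer unpack/madd/add/srai/pack sequence.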
      const __m128i d0 = _mm_sub_epi16(q6, q5);
      const __m128i d1 = _mm_add_epi16(q6, q5);
      const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16);
      const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16);

      // Add/subtract
      const __m128i x0 = _mm_add_epi16(q4, r0);
      const __m128i x1 = _mm_sub_epi16(q4, r0);
      const __m128i x2 = _mm_sub_epi16(q7, r1);
      const __m128i x3 = _mm_add_epi16(q7, r1);
      // Interleave to do the multiply by constants which gets us into 32 bits
      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res1 = _mm_packs_epi32(w0, w1);
      res7 = _mm_packs_epi32(w2, w3);
      res5 = _mm_packs_epi32(w4, w5);
      res3 = _mm_packs_epi32(w6, w7);
    }
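    // The cascade of 16-, 32- and 64-bit unpacks below is the standard 8x8
    // transpose for 16-bit lanes; the digit pairs in the comments track each
    // (row, column) element through the three stages.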
    // Transpose the 8x8.
    {
      // 00 01 02 03 04 05 06 07
      // 10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27
      // 30 31 32 33 34 35 36 37
      // 40 41 42 43 44 45 46 47
      // 50 51 52 53 54 55 56 57
      // 60 61 62 63 64 65 66 67
      // 70 71 72 73 74 75 76 77
      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
      // 00 10 01 11 02 12 03 13
      // 20 30 21 31 22 32 23 33
      // 04 14 05 15 06 16 07 17
      // 24 34 25 35 26 36 27 37
      // 40 50 41 51 42 52 43 53
      // 60 70 61 71 62 72 63 73
      // 44 54 45 55 46 56 47 57
      // 64 74 65 75 66 76 67 77
      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
      // 00 10 20 30 01 11 21 31
      // 40 50 60 70 41 51 61 71
      // 02 12 22 32 03 13 23 33
      // 42 52 62 72 43 53 63 73
      // 04 14 24 34 05 15 25 35
      // 44 54 64 74 45 55 65 75
      // 06 16 26 36 07 17 27 37
      // 46 56 66 76 47 57 67 77
      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      // 06 16 26 36 46 56 66 76
      // 07 17 27 37 47 57 67 77
    }
  }
  // Post-condition output and store it
  {
    // Post-condition (division by two)
    //    division by two of a 16-bit signed number using shifts:
    //    n / 2 = (n - (n >> 15)) >> 1
    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
    in0 = _mm_sub_epi16(in0, sign_in0);
    in1 = _mm_sub_epi16(in1, sign_in1);
    in2 = _mm_sub_epi16(in2, sign_in2);
    in3 = _mm_sub_epi16(in3, sign_in3);
    in4 = _mm_sub_epi16(in4, sign_in4);
    in5 = _mm_sub_epi16(in5, sign_in5);
    in6 = _mm_sub_epi16(in6, sign_in6);
    in7 = _mm_sub_epi16(in7, sign_in7);
    in0 = _mm_srai_epi16(in0, 1);
    in1 = _mm_srai_epi16(in1, 1);
    in2 = _mm_srai_epi16(in2, 1);
    in3 = _mm_srai_epi16(in3, 1);
    in4 = _mm_srai_epi16(in4, 1);
    in5 = _mm_srai_epi16(in5, 1);
    in6 = _mm_srai_epi16(in6, 1);
    in7 = _mm_srai_epi16(in7, 1);
  }

  iscan_ptr += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;
  zero = _mm_setzero_si128();

  if (!skip_block) {
    __m128i eob;
    __m128i round, quant, dequant, thr;
    int16_t nzflag;
    {
      __m128i coeff0, coeff1;

      // Setup global values
      {
        round = _mm_load_si128((const __m128i *)round_ptr);
        quant = _mm_load_si128((const __m128i *)quant_ptr);
        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
      }

      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        // Do DC and first 15 AC
        coeff0 = *in[0];
        coeff1 = *in[1];

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        round = _mm_unpackhi_epi64(round, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        quant = _mm_unpackhi_epi64(quant, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        dequant = _mm_unpackhi_epi64(dequant, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }
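      // The eob search below turns iscan values (scan-order indices) into
      // counts: subtracting the all-ones nonzero mask adds 1 in nonzero
      // lanes, the AND keeps those counts only where a coefficient is
      // nonzero, and a lane-wise max accumulates the largest position seen.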
      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob = _mm_max_epi16(eob, eob1);
      }
      n_coeffs += 8 * 2;
    }

    // AC only loop
    index = 2;
    thr = _mm_srai_epi16(dequant, 1);
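    // Groups of 16 coefficients in which every |coeff| is at most dequant/2
    // are treated as quantizing to zero, so the quantize/dequantize math is
    // skipped and zeros are stored instead; this mirrors the thr shortcut
    // used by the other vp9 quantize_fp kernels.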
    while (n_coeffs < 0) {
      __m128i coeff0, coeff1;
      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;

        assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
        coeff0 = *in[index];
        coeff1 = *in[index + 1];

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
                 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));

        if (nzflag) {
          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

          // Reinsert signs
          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

          store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
          store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

          store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
          store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
        } else {
          // Maybe a more efficient way to store 0?
          store_zero_tran_low(qcoeff_ptr + n_coeffs);
          store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);

          store_zero_tran_low(dqcoeff_ptr + n_coeffs);
          store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
        }
      }

      if (nzflag) {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob0, eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob0 = _mm_max_epi16(eob0, eob1);
        eob = _mm_max_epi16(eob, eob0);
      }
      n_coeffs += 8 * 2;
      index += 2;
    }

    // Accumulate EOB
    {
      __m128i eob_shuffled;
      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
      eob = _mm_max_epi16(eob, eob_shuffled);
      *eob_ptr = _mm_extract_epi16(eob, 1);
    }
  } else {
    do {
      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
      store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
      store_zero_tran_low(qcoeff_ptr + n_coeffs);
      store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
  }
}