/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
11da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include <emmintrin.h>
12da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include <xmmintrin.h>
13da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "./vp9_rtcd.h"
15da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx/vpx_integer.h"
168b92989c89bec8632aa47dc58dc162f199d62edcJames Zern#include "vpx_dsp/vpx_dsp_common.h"
178b92989c89bec8632aa47dc58dc162f199d62edcJames Zern#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
18da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
198b92989c89bec8632aa47dc58dc162f199d62edcJames Zernvoid vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
208b92989c89bec8632aa47dc58dc162f199d62edcJames Zern                          int skip_block, const int16_t *round_ptr,
218b92989c89bec8632aa47dc58dc162f199d62edcJames Zern                          const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
228b92989c89bec8632aa47dc58dc162f199d62edcJames Zern                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
237bc9febe8749e98a3812a0dc4380ceae75c29450Johann                          uint16_t *eob_ptr, const int16_t *scan_ptr,
247bc9febe8749e98a3812a0dc4380ceae75c29450Johann                          const int16_t *iscan_ptr) {
25da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i zero;
26da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  __m128i thr;
27da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t nzflag;
288b92989c89bec8632aa47dc58dc162f199d62edcJames Zern
29da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  (void)scan_ptr;
30da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
31da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  coeff_ptr += n_coeffs;
32da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  iscan_ptr += n_coeffs;
33da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  qcoeff_ptr += n_coeffs;
34da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  dqcoeff_ptr += n_coeffs;
35da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  n_coeffs = -n_coeffs;
36da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  zero = _mm_setzero_si128();
37da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
38da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (!skip_block) {
39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    __m128i eob;
40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    __m128i round, quant, dequant;
41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
42da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      __m128i coeff0, coeff1;
43da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
44da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      // Setup global values
45da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      {
467bc9febe8749e98a3812a0dc4380ceae75c29450Johann        round = _mm_load_si128((const __m128i *)round_ptr);
477bc9febe8749e98a3812a0dc4380ceae75c29450Johann        quant = _mm_load_si128((const __m128i *)quant_ptr);
487bc9febe8749e98a3812a0dc4380ceae75c29450Johann        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
49da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
50da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
51da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      {
52da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        __m128i coeff0_sign, coeff1_sign;
53da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        __m128i qcoeff0, qcoeff1;
54da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        __m128i qtmp0, qtmp1;
55da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        // Do DC and first 15 AC
568b92989c89bec8632aa47dc58dc162f199d62edcJames Zern        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
578b92989c89bec8632aa47dc58dc162f199d62edcJames Zern        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
58da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
59da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        // Poor man's sign extract
60da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        coeff0_sign = _mm_srai_epi16(coeff0, 15);
61da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        coeff1_sign = _mm_srai_epi16(coeff1, 15);
62da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
63da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
64da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
65da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
66da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
67da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
68da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        round = _mm_unpackhi_epi64(round, round);
69da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
70da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
71da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        quant = _mm_unpackhi_epi64(quant, quant);
72da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
73da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
74da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        // Reinsert signs
75da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
76da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
77da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
78da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
79da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
808b92989c89bec8632aa47dc58dc162f199d62edcJames Zern        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
818b92989c89bec8632aa47dc58dc162f199d62edcJames Zern        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
82da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
83da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
84da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dequant = _mm_unpackhi_epi64(dequant, dequant);
85da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
86da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
878b92989c89bec8632aa47dc58dc162f199d62edcJames Zern        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
888b92989c89bec8632aa47dc58dc162f199d62edcJames Zern        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
89da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
90da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
91da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      {
92da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        // Scan for eob
93da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        __m128i zero_coeff0, zero_coeff1;
94da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        __m128i nzero_coeff0, nzero_coeff1;
95da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        __m128i iscan0, iscan1;
96da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        __m128i eob1;
97da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
98da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
99da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
1017bc9febe8749e98a3812a0dc4380ceae75c29450Johann        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
1027bc9febe8749e98a3812a0dc4380ceae75c29450Johann        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        // Add one to convert from indices to counts
104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        eob = _mm_and_si128(iscan0, nzero_coeff0);
107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        eob = _mm_max_epi16(eob, eob1);
109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      n_coeffs += 8 * 2;
111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    thr = _mm_srai_epi16(dequant, 1);
114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // AC only loop
116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    while (n_coeffs < 0) {
117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      __m128i coeff0, coeff1;
118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      {
119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        __m128i coeff0_sign, coeff1_sign;
120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        __m128i qcoeff0, qcoeff1;
121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        __m128i qtmp0, qtmp1;
122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1238b92989c89bec8632aa47dc58dc162f199d62edcJames Zern        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
1248b92989c89bec8632aa47dc58dc162f199d62edcJames Zern        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        // Poor man's sign extract
127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        coeff0_sign = _mm_srai_epi16(coeff0, 15);
128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        coeff1_sign = _mm_srai_epi16(coeff1, 15);
129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
1357bc9febe8749e98a3812a0dc4380ceae75c29450Johann                 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        if (nzflag) {
138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          // Reinsert signs
144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1498b92989c89bec8632aa47dc58dc162f199d62edcJames Zern          store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
1508b92989c89bec8632aa47dc58dc162f199d62edcJames Zern          store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1558b92989c89bec8632aa47dc58dc162f199d62edcJames Zern          store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
1568b92989c89bec8632aa47dc58dc162f199d62edcJames Zern          store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        } else {
1588b92989c89bec8632aa47dc58dc162f199d62edcJames Zern          store_zero_tran_low(qcoeff_ptr + n_coeffs);
1598b92989c89bec8632aa47dc58dc162f199d62edcJames Zern          store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1618b92989c89bec8632aa47dc58dc162f199d62edcJames Zern          store_zero_tran_low(dqcoeff_ptr + n_coeffs);
1628b92989c89bec8632aa47dc58dc162f199d62edcJames Zern          store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        }
164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      if (nzflag) {
167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        // Scan for eob
168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        __m128i zero_coeff0, zero_coeff1;
169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        __m128i nzero_coeff0, nzero_coeff1;
170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        __m128i iscan0, iscan1;
171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        __m128i eob0, eob1;
172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
1767bc9febe8749e98a3812a0dc4380ceae75c29450Johann        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
1777bc9febe8749e98a3812a0dc4380ceae75c29450Johann        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        // Add one to convert from indices to counts
179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        eob0 = _mm_max_epi16(eob0, eob1);
184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        eob = _mm_max_epi16(eob, eob0);
185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      }
186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      n_coeffs += 8 * 2;
187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    // Accumulate EOB
190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      __m128i eob_shuffled;
192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      eob = _mm_max_epi16(eob, eob_shuffled);
194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      eob = _mm_max_epi16(eob, eob_shuffled);
196da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      eob = _mm_max_epi16(eob, eob_shuffled);
198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      *eob_ptr = _mm_extract_epi16(eob, 1);
199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else {
201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    do {
2028b92989c89bec8632aa47dc58dc162f199d62edcJames Zern      store_zero_tran_low(qcoeff_ptr + n_coeffs);
2038b92989c89bec8632aa47dc58dc162f199d62edcJames Zern      store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
2048b92989c89bec8632aa47dc58dc162f199d62edcJames Zern
2058b92989c89bec8632aa47dc58dc162f199d62edcJames Zern      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
2068b92989c89bec8632aa47dc58dc162f199d62edcJames Zern      store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      n_coeffs += 8 * 2;
208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    } while (n_coeffs < 0);
209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    *eob_ptr = 0;
210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
212