/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <tmmintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_dsp/x86/quantize_x86.h"

19df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
20df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                          int skip_block, const int16_t *zbin_ptr,
21df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                          const int16_t *round_ptr, const int16_t *quant_ptr,
22df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                          const int16_t *quant_shift_ptr,
23df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
24df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
25df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                          const int16_t *scan_ptr, const int16_t *iscan_ptr) {
26df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  const __m128i zero = _mm_setzero_si128();
27df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int index = 16;
28df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
29df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  __m128i zbin, round, quant, dequant, shift;
30df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  __m128i coeff0, coeff1;
31df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  __m128i qcoeff0, qcoeff1;
32df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  __m128i cmp_mask0, cmp_mask1;
33df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  __m128i eob, eob0;
34df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
35df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  (void)scan_ptr;
36df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  (void)skip_block;
37df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  assert(!skip_block);
38df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
39df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
40df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                dequant_ptr, &dequant, quant_shift_ptr, &shift);
41df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
42df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // Do DC and first 15 AC.
43df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  coeff0 = load_tran_low(coeff_ptr);
44df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  coeff1 = load_tran_low(coeff_ptr + 8);
45df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
46df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  qcoeff0 = _mm_abs_epi16(coeff0);
47df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  qcoeff1 = _mm_abs_epi16(coeff1);
48df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
49df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
50df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
51df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
52df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
53df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  calculate_qcoeff(&qcoeff0, round, quant, shift);
54df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  round = _mm_unpackhi_epi64(round, round);
55df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  quant = _mm_unpackhi_epi64(quant, quant);
56df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  shift = _mm_unpackhi_epi64(shift, shift);
57df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  calculate_qcoeff(&qcoeff1, round, quant, shift);
58df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
59df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // Reinsert signs
60df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
61df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
62df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
63df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // Mask out zbin threshold coeffs
64df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
65df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
66df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
67df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  store_tran_low(qcoeff0, qcoeff_ptr);
68df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  store_tran_low(qcoeff1, qcoeff_ptr + 8);
69df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
70df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  coeff0 = calculate_dqcoeff(qcoeff0, dequant);
71df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dequant = _mm_unpackhi_epi64(dequant, dequant);
72df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  coeff1 = calculate_dqcoeff(qcoeff1, dequant);
73df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
74df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  store_tran_low(coeff0, dqcoeff_ptr);
75df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  store_tran_low(coeff1, dqcoeff_ptr + 8);
76df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
77df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  eob =
78df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
79df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
80df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // AC only loop.
81df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  while (index < n_coeffs) {
82df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff0 = load_tran_low(coeff_ptr + index);
83df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff1 = load_tran_low(coeff_ptr + index + 8);
84df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
85df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    qcoeff0 = _mm_abs_epi16(coeff0);
86df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    qcoeff1 = _mm_abs_epi16(coeff1);
87df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
88df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
89df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
90df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
91df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    calculate_qcoeff(&qcoeff0, round, quant, shift);
92df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    calculate_qcoeff(&qcoeff1, round, quant, shift);
93df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
94df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
95df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
96df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
97df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
98df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
99df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
100df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    store_tran_low(qcoeff0, qcoeff_ptr + index);
101df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
102df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
103df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff0 = calculate_dqcoeff(qcoeff0, dequant);
104df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff1 = calculate_dqcoeff(qcoeff1, dequant);
105df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
106df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    store_tran_low(coeff0, dqcoeff_ptr + index);
107df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    store_tran_low(coeff1, dqcoeff_ptr + index + 8);
108df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
109df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
110df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                        index, zero);
111df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    eob = _mm_max_epi16(eob, eob0);
112df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
113df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    index += 16;
114df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  }
115df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
116df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  *eob_ptr = accumulate_eob(eob);
117df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
118df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
119df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_quantize_b_32x32_ssse3(
120df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
121df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
122df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
123df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
124df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
125df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  const __m128i zero = _mm_setzero_si128();
126df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  const __m128i one = _mm_set1_epi16(1);
127df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int index;
128df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
129df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  __m128i zbin, round, quant, dequant, shift;
130df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  __m128i coeff0, coeff1;
131df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  __m128i qcoeff0, qcoeff1;
132df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  __m128i cmp_mask0, cmp_mask1;
133df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  __m128i all_zero;
134df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  __m128i eob = zero, eob0;
135df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
136df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  (void)scan_ptr;
137df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  (void)n_coeffs;
138df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  (void)skip_block;
139df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  assert(!skip_block);
140df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
141df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // Setup global values.
142df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // The 32x32 halves zbin and round.
143df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
144df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // Shift with rounding.
145df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  zbin = _mm_add_epi16(zbin, one);
146df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  zbin = _mm_srli_epi16(zbin, 1);
147df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
148df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // it is a strict "greater" comparison.
149df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  zbin = _mm_sub_epi16(zbin, one);
150df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
151df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  round = _mm_load_si128((const __m128i *)round_ptr);
152df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  round = _mm_add_epi16(round, one);
153df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  round = _mm_srli_epi16(round, 1);
154df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
155df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  quant = _mm_load_si128((const __m128i *)quant_ptr);
156df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
157df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
158df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // I suspect this is not technically OK because quant_shift can be up
159df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // to 1 << 16 and shifting up again will outrange that, but the test is not
160df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // comprehensive enough to catch that and "it's been that way forever"
161df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  shift = _mm_slli_epi16(shift, 1);
162df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
163df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // Do DC and first 15 AC.
164df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  coeff0 = load_tran_low(coeff_ptr);
165df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  coeff1 = load_tran_low(coeff_ptr + 8);
166df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
167df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  qcoeff0 = _mm_abs_epi16(coeff0);
168df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  qcoeff1 = _mm_abs_epi16(coeff1);
169df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
170df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
171df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC.
172df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
173df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
174df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
175df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  if (_mm_movemask_epi8(all_zero) == 0) {
176df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
177df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
178df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
179df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
180df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#if CONFIG_VP9_HIGHBITDEPTH
181df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
182df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
183df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
184df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
185df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#endif  // CONFIG_HIGHBITDEPTH
186df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
187df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    round = _mm_unpackhi_epi64(round, round);
188df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    quant = _mm_unpackhi_epi64(quant, quant);
189df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    shift = _mm_unpackhi_epi64(shift, shift);
190df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    dequant = _mm_unpackhi_epi64(dequant, dequant);
191df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  } else {
192df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    calculate_qcoeff(&qcoeff0, round, quant, shift);
193df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    round = _mm_unpackhi_epi64(round, round);
194df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    quant = _mm_unpackhi_epi64(quant, quant);
195df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    shift = _mm_unpackhi_epi64(shift, shift);
196df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    calculate_qcoeff(&qcoeff1, round, quant, shift);
197df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
198df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // Reinsert signs.
199df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
200df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
201df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
202df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // Mask out zbin threshold coeffs.
203df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
204df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
205df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
206df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    store_tran_low(qcoeff0, qcoeff_ptr);
207df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    store_tran_low(qcoeff1, qcoeff_ptr + 8);
208df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
209df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // Un-sign to bias rounding like C.
210df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // dequant is almost always negative, so this is probably the backwards way
211df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // to handle the sign. However, it matches the previous assembly.
212df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff0 = _mm_abs_epi16(qcoeff0);
213df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff1 = _mm_abs_epi16(qcoeff1);
214df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
215df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff0 = calculate_dqcoeff(coeff0, dequant);
216df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    dequant = _mm_unpackhi_epi64(dequant, dequant);
217df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff1 = calculate_dqcoeff(coeff1, dequant);
218df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
219df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // "Divide" by 2.
220df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff0 = _mm_srli_epi16(coeff0, 1);
221df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff1 = _mm_srli_epi16(coeff1, 1);
222df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
223df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff0 = _mm_sign_epi16(coeff0, qcoeff0);
224df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff1 = _mm_sign_epi16(coeff1, qcoeff1);
225df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
226df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    store_tran_low(coeff0, dqcoeff_ptr);
227df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    store_tran_low(coeff1, dqcoeff_ptr + 8);
228df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
229df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0,
230df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                       zero);
231df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  }
232df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
233df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // AC only loop.
234df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  for (index = 16; index < 32 * 32; index += 16) {
235df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff0 = load_tran_low(coeff_ptr + index);
236df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff1 = load_tran_low(coeff_ptr + index + 8);
237df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
238df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    qcoeff0 = _mm_abs_epi16(coeff0);
239df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    qcoeff1 = _mm_abs_epi16(coeff1);
240df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
241df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
242df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
243df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
244df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
245df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    if (_mm_movemask_epi8(all_zero) == 0) {
246df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
247df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
248df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
249df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
250df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#if CONFIG_VP9_HIGHBITDEPTH
251df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
252df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
253df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
254df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
255df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#endif  // CONFIG_VP9_HIGHBITDEPTH
256df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      continue;
257df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    }
258df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
259df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    calculate_qcoeff(&qcoeff0, round, quant, shift);
260df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    calculate_qcoeff(&qcoeff1, round, quant, shift);
261df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
262df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
263df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
264df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
265df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
266df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
267df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
268df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    store_tran_low(qcoeff0, qcoeff_ptr + index);
269df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
270df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
271df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff0 = _mm_abs_epi16(qcoeff0);
272df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff1 = _mm_abs_epi16(qcoeff1);
273df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
274df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff0 = calculate_dqcoeff(coeff0, dequant);
275df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff1 = calculate_dqcoeff(coeff1, dequant);
276df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
277df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff0 = _mm_srli_epi16(coeff0, 1);
278df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff1 = _mm_srli_epi16(coeff1, 1);
279df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
280df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff0 = _mm_sign_epi16(coeff0, qcoeff0);
281df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    coeff1 = _mm_sign_epi16(coeff1, qcoeff1);
282df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
283df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    store_tran_low(coeff0, dqcoeff_ptr + index);
284df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    store_tran_low(coeff1, dqcoeff_ptr + index + 8);
285df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
286df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
287df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                        index, zero);
288df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    eob = _mm_max_epi16(eob, eob0);
289df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  }
290df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
291df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  *eob_ptr = accumulate_eob(eob);
292df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
293