/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vpx_ports/x86.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8/encoder/block.h"
#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */

#include <string.h>    /* memset */
#include <mmintrin.h>  /* MMX */
#include <xmmintrin.h> /* SSE */
#include <emmintrin.h> /* SSE2 */

/* One step of the scalar zero-bin scan used by vp8_regular_quantize_b_sse2.
 *
 *   i - value stored in eob when the coefficient is kept.
 *   z - index into x[], y[] and qcoeff_ptr[] of the coefficient to examine.
 *
 * The macro deliberately captures these names from the expanding scope:
 *   x[]            - compared against the current boost value
 *   y[]            - candidate quantized values
 *   qcoeff_ptr     - output array, receives y[z] when the coefficient is kept
 *   eob            - updated to i when the coefficient is kept
 *   zbin_boost_ptr - cursor into the boost table; advanced on every step and
 *                    rewound to b->zrun_zbin_boost after a kept coefficient
 *   b              - pointer whose zrun_zbin_boost member is the table start
 *
 * A coefficient is dropped (qcoeff_ptr[z] left untouched) when
 * x[z] < *zbin_boost_ptr or y[z] == 0. */
#define SELECT_EOB(i, z)                        \
  do {                                          \
    short boost = *zbin_boost_ptr;              \
    int cmp = (x[(z)] < boost) | (y[(z)] == 0); \
    zbin_boost_ptr++;                           \
    if (cmp) break;                             \
    qcoeff_ptr[(z)] = y[(z)];                   \
    eob = (i);                                  \
    zbin_boost_ptr = b->zrun_zbin_boost;        \
  } while (0)

337bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d) {
347bc9febe8749e98a3812a0dc4380ceae75c29450Johann  char eob = 0;
357bc9febe8749e98a3812a0dc4380ceae75c29450Johann  short *zbin_boost_ptr;
367bc9febe8749e98a3812a0dc4380ceae75c29450Johann  short *qcoeff_ptr = d->qcoeff;
377bc9febe8749e98a3812a0dc4380ceae75c29450Johann  DECLARE_ALIGNED(16, short, x[16]);
387bc9febe8749e98a3812a0dc4380ceae75c29450Johann  DECLARE_ALIGNED(16, short, y[16]);
397bc9febe8749e98a3812a0dc4380ceae75c29450Johann
407bc9febe8749e98a3812a0dc4380ceae75c29450Johann  __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
417bc9febe8749e98a3812a0dc4380ceae75c29450Johann  __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
427bc9febe8749e98a3812a0dc4380ceae75c29450Johann  __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
437bc9febe8749e98a3812a0dc4380ceae75c29450Johann  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
447bc9febe8749e98a3812a0dc4380ceae75c29450Johann  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
457bc9febe8749e98a3812a0dc4380ceae75c29450Johann  __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
467bc9febe8749e98a3812a0dc4380ceae75c29450Johann  __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
477bc9febe8749e98a3812a0dc4380ceae75c29450Johann  __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
487bc9febe8749e98a3812a0dc4380ceae75c29450Johann  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
497bc9febe8749e98a3812a0dc4380ceae75c29450Johann  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
507bc9febe8749e98a3812a0dc4380ceae75c29450Johann  __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
517bc9febe8749e98a3812a0dc4380ceae75c29450Johann  __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
527bc9febe8749e98a3812a0dc4380ceae75c29450Johann  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
537bc9febe8749e98a3812a0dc4380ceae75c29450Johann  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
547bc9febe8749e98a3812a0dc4380ceae75c29450Johann
557bc9febe8749e98a3812a0dc4380ceae75c29450Johann  memset(qcoeff_ptr, 0, 32);
567bc9febe8749e98a3812a0dc4380ceae75c29450Johann
577bc9febe8749e98a3812a0dc4380ceae75c29450Johann  /* Duplicate to all lanes. */
587bc9febe8749e98a3812a0dc4380ceae75c29450Johann  zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
597bc9febe8749e98a3812a0dc4380ceae75c29450Johann  zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
607bc9febe8749e98a3812a0dc4380ceae75c29450Johann
617bc9febe8749e98a3812a0dc4380ceae75c29450Johann  /* Sign of z: z >> 15 */
627bc9febe8749e98a3812a0dc4380ceae75c29450Johann  sz0 = _mm_srai_epi16(z0, 15);
637bc9febe8749e98a3812a0dc4380ceae75c29450Johann  sz1 = _mm_srai_epi16(z1, 15);
647bc9febe8749e98a3812a0dc4380ceae75c29450Johann
657bc9febe8749e98a3812a0dc4380ceae75c29450Johann  /* x = abs(z): (z ^ sz) - sz */
667bc9febe8749e98a3812a0dc4380ceae75c29450Johann  x0 = _mm_xor_si128(z0, sz0);
677bc9febe8749e98a3812a0dc4380ceae75c29450Johann  x1 = _mm_xor_si128(z1, sz1);
687bc9febe8749e98a3812a0dc4380ceae75c29450Johann  x0 = _mm_sub_epi16(x0, sz0);
697bc9febe8749e98a3812a0dc4380ceae75c29450Johann  x1 = _mm_sub_epi16(x1, sz1);
707bc9febe8749e98a3812a0dc4380ceae75c29450Johann
717bc9febe8749e98a3812a0dc4380ceae75c29450Johann  /* zbin[] + zbin_extra */
727bc9febe8749e98a3812a0dc4380ceae75c29450Johann  zbin0 = _mm_add_epi16(zbin0, zbin_extra);
737bc9febe8749e98a3812a0dc4380ceae75c29450Johann  zbin1 = _mm_add_epi16(zbin1, zbin_extra);
747bc9febe8749e98a3812a0dc4380ceae75c29450Johann
757bc9febe8749e98a3812a0dc4380ceae75c29450Johann  /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
767bc9febe8749e98a3812a0dc4380ceae75c29450Johann   * the equation because boost is the only value which can change:
777bc9febe8749e98a3812a0dc4380ceae75c29450Johann   * x - (zbin[] + extra) >= boost */
787bc9febe8749e98a3812a0dc4380ceae75c29450Johann  x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
797bc9febe8749e98a3812a0dc4380ceae75c29450Johann  x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);
807bc9febe8749e98a3812a0dc4380ceae75c29450Johann
817bc9febe8749e98a3812a0dc4380ceae75c29450Johann  _mm_store_si128((__m128i *)(x), x_minus_zbin0);
827bc9febe8749e98a3812a0dc4380ceae75c29450Johann  _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);
837bc9febe8749e98a3812a0dc4380ceae75c29450Johann
847bc9febe8749e98a3812a0dc4380ceae75c29450Johann  /* All the remaining calculations are valid whether they are done now with
857bc9febe8749e98a3812a0dc4380ceae75c29450Johann   * simd or later inside the loop one at a time. */
867bc9febe8749e98a3812a0dc4380ceae75c29450Johann  x0 = _mm_add_epi16(x0, round0);
877bc9febe8749e98a3812a0dc4380ceae75c29450Johann  x1 = _mm_add_epi16(x1, round1);
887bc9febe8749e98a3812a0dc4380ceae75c29450Johann
897bc9febe8749e98a3812a0dc4380ceae75c29450Johann  y0 = _mm_mulhi_epi16(x0, quant0);
907bc9febe8749e98a3812a0dc4380ceae75c29450Johann  y1 = _mm_mulhi_epi16(x1, quant1);
917bc9febe8749e98a3812a0dc4380ceae75c29450Johann
927bc9febe8749e98a3812a0dc4380ceae75c29450Johann  y0 = _mm_add_epi16(y0, x0);
937bc9febe8749e98a3812a0dc4380ceae75c29450Johann  y1 = _mm_add_epi16(y1, x1);
947bc9febe8749e98a3812a0dc4380ceae75c29450Johann
957bc9febe8749e98a3812a0dc4380ceae75c29450Johann  /* Instead of shifting each value independently we convert the scaling
967bc9febe8749e98a3812a0dc4380ceae75c29450Johann   * factor with 1 << (16 - shift) so we can use multiply/return high half. */
977bc9febe8749e98a3812a0dc4380ceae75c29450Johann  y0 = _mm_mulhi_epi16(y0, quant_shift0);
987bc9febe8749e98a3812a0dc4380ceae75c29450Johann  y1 = _mm_mulhi_epi16(y1, quant_shift1);
997bc9febe8749e98a3812a0dc4380ceae75c29450Johann
1007bc9febe8749e98a3812a0dc4380ceae75c29450Johann  /* Return the sign: (y ^ sz) - sz */
1017bc9febe8749e98a3812a0dc4380ceae75c29450Johann  y0 = _mm_xor_si128(y0, sz0);
1027bc9febe8749e98a3812a0dc4380ceae75c29450Johann  y1 = _mm_xor_si128(y1, sz1);
1037bc9febe8749e98a3812a0dc4380ceae75c29450Johann  y0 = _mm_sub_epi16(y0, sz0);
1047bc9febe8749e98a3812a0dc4380ceae75c29450Johann  y1 = _mm_sub_epi16(y1, sz1);
1057bc9febe8749e98a3812a0dc4380ceae75c29450Johann
1067bc9febe8749e98a3812a0dc4380ceae75c29450Johann  _mm_store_si128((__m128i *)(y), y0);
1077bc9febe8749e98a3812a0dc4380ceae75c29450Johann  _mm_store_si128((__m128i *)(y + 8), y1);
1087bc9febe8749e98a3812a0dc4380ceae75c29450Johann
1097bc9febe8749e98a3812a0dc4380ceae75c29450Johann  zbin_boost_ptr = b->zrun_zbin_boost;
1107bc9febe8749e98a3812a0dc4380ceae75c29450Johann
1117bc9febe8749e98a3812a0dc4380ceae75c29450Johann  /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
1127bc9febe8749e98a3812a0dc4380ceae75c29450Johann  SELECT_EOB(1, 0);
1137bc9febe8749e98a3812a0dc4380ceae75c29450Johann  SELECT_EOB(2, 1);
1147bc9febe8749e98a3812a0dc4380ceae75c29450Johann  SELECT_EOB(3, 4);
1157bc9febe8749e98a3812a0dc4380ceae75c29450Johann  SELECT_EOB(4, 8);
1167bc9febe8749e98a3812a0dc4380ceae75c29450Johann  SELECT_EOB(5, 5);
1177bc9febe8749e98a3812a0dc4380ceae75c29450Johann  SELECT_EOB(6, 2);
1187bc9febe8749e98a3812a0dc4380ceae75c29450Johann  SELECT_EOB(7, 3);
1197bc9febe8749e98a3812a0dc4380ceae75c29450Johann  SELECT_EOB(8, 6);
1207bc9febe8749e98a3812a0dc4380ceae75c29450Johann  SELECT_EOB(9, 9);
1217bc9febe8749e98a3812a0dc4380ceae75c29450Johann  SELECT_EOB(10, 12);
1227bc9febe8749e98a3812a0dc4380ceae75c29450Johann  SELECT_EOB(11, 13);
1237bc9febe8749e98a3812a0dc4380ceae75c29450Johann  SELECT_EOB(12, 10);
1247bc9febe8749e98a3812a0dc4380ceae75c29450Johann  SELECT_EOB(13, 7);
1257bc9febe8749e98a3812a0dc4380ceae75c29450Johann  SELECT_EOB(14, 11);
1267bc9febe8749e98a3812a0dc4380ceae75c29450Johann  SELECT_EOB(15, 14);
1277bc9febe8749e98a3812a0dc4380ceae75c29450Johann  SELECT_EOB(16, 15);
1287bc9febe8749e98a3812a0dc4380ceae75c29450Johann
1297bc9febe8749e98a3812a0dc4380ceae75c29450Johann  y0 = _mm_load_si128((__m128i *)(d->qcoeff));
1307bc9febe8749e98a3812a0dc4380ceae75c29450Johann  y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));
1317bc9febe8749e98a3812a0dc4380ceae75c29450Johann
1327bc9febe8749e98a3812a0dc4380ceae75c29450Johann  /* dqcoeff = qcoeff * dequant */
1337bc9febe8749e98a3812a0dc4380ceae75c29450Johann  y0 = _mm_mullo_epi16(y0, dequant0);
1347bc9febe8749e98a3812a0dc4380ceae75c29450Johann  y1 = _mm_mullo_epi16(y1, dequant1);
1357bc9febe8749e98a3812a0dc4380ceae75c29450Johann
1367bc9febe8749e98a3812a0dc4380ceae75c29450Johann  _mm_store_si128((__m128i *)(d->dqcoeff), y0);
1377bc9febe8749e98a3812a0dc4380ceae75c29450Johann  _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);
1387bc9febe8749e98a3812a0dc4380ceae75c29450Johann
1397bc9febe8749e98a3812a0dc4380ceae75c29450Johann  *d->eob = eob;
140ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
141ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
1427bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) {
143ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
144ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
145ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
146ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
147ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
148ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
149ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
150ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
1517bc9febe8749e98a3812a0dc4380ceae75c29450Johann  __m128i inv_zig_zag0 =
1527bc9febe8749e98a3812a0dc4380ceae75c29450Johann      _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
1537bc9febe8749e98a3812a0dc4380ceae75c29450Johann  __m128i inv_zig_zag1 =
1547bc9febe8749e98a3812a0dc4380ceae75c29450Johann      _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));
155ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
156ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;
157ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
158ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /* sign of z: z >> 15 */
159ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  sz0 = _mm_srai_epi16(z0, 15);
160ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  sz1 = _mm_srai_epi16(z1, 15);
161ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
162ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /* x = abs(z): (z ^ sz) - sz */
163ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = _mm_xor_si128(z0, sz0);
164ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = _mm_xor_si128(z1, sz1);
165ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = _mm_sub_epi16(x0, sz0);
166ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = _mm_sub_epi16(x1, sz1);
167ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
168ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /* x += round */
169ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = _mm_add_epi16(x0, round0);
170ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = _mm_add_epi16(x1, round1);
171ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
172ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /* y = (x * quant) >> 16 */
173ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  y0 = _mm_mulhi_epi16(x0, quant_fast0);
174ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  y1 = _mm_mulhi_epi16(x1, quant_fast1);
175ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
176ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /* x = abs(y) = (y ^ sz) - sz */
177ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  y0 = _mm_xor_si128(y0, sz0);
178ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  y1 = _mm_xor_si128(y1, sz1);
179ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = _mm_sub_epi16(y0, sz0);
180ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = _mm_sub_epi16(y1, sz1);
181ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
182ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /* qcoeff = x */
183ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  _mm_store_si128((__m128i *)(d->qcoeff), x0);
184ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);
185ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
186ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /* x * dequant */
187ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  xdq0 = _mm_mullo_epi16(x0, dequant0);
188ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  xdq1 = _mm_mullo_epi16(x1, dequant1);
189ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
190ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /* dqcoeff = x * dequant */
191ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
192ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);
193ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
194ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /* build a mask for the zig zag */
195ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  zeros = _mm_setzero_si128();
196ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
197ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = _mm_cmpeq_epi16(x0, zeros);
198ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = _mm_cmpeq_epi16(x1, zeros);
199ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
200ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  ones = _mm_cmpeq_epi16(zeros, zeros);
201ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
202ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = _mm_xor_si128(x0, ones);
203ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = _mm_xor_si128(x1, ones);
204ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
205ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = _mm_and_si128(x0, inv_zig_zag0);
206ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x1 = _mm_and_si128(x1, inv_zig_zag1);
207ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
208ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = _mm_max_epi16(x0, x1);
209ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
210ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /* now down to 8 */
2117bc9febe8749e98a3812a0dc4380ceae75c29450Johann  x1 = _mm_shuffle_epi32(x0, 0xE);  // 0b00001110
212ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
213ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = _mm_max_epi16(x0, x1);
214ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
215ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /* only 4 left */
2167bc9febe8749e98a3812a0dc4380ceae75c29450Johann  x1 = _mm_shufflelo_epi16(x0, 0xE);  // 0b00001110
217ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
218ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = _mm_max_epi16(x0, x1);
219ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
220ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  /* okay, just 2! */
2217bc9febe8749e98a3812a0dc4380ceae75c29450Johann  x1 = _mm_shufflelo_epi16(x0, 0x1);  // 0b00000001
222ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
223ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  x0 = _mm_max_epi16(x0, x1);
224ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
225ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang  *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
226ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang}
227