1ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang/* 2ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 3ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * 4ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * Use of this source code is governed by a BSD-style license 5ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * that can be found in the LICENSE file in the root of the source 6ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * tree. An additional intellectual property rights grant can be found 7ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * in the file PATENTS. All contributing project authors may 8ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * be found in the AUTHORS file in the root of the source tree. 9ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang */ 10ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 11ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vpx_config.h" 12ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp8_rtcd.h" 13ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vpx_ports/x86.h" 14ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vpx_mem/vpx_mem.h" 15ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp8/encoder/block.h" 16ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */ 17ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 187bc9febe8749e98a3812a0dc4380ceae75c29450Johann#include <mmintrin.h> /* MMX */ 19ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include <xmmintrin.h> /* SSE */ 20ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include <emmintrin.h> /* SSE2 */ 21ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 227bc9febe8749e98a3812a0dc4380ceae75c29450Johann#define SELECT_EOB(i, z) \ 237bc9febe8749e98a3812a0dc4380ceae75c29450Johann do { \ 247bc9febe8749e98a3812a0dc4380ceae75c29450Johann short boost = *zbin_boost_ptr; \ 257bc9febe8749e98a3812a0dc4380ceae75c29450Johann int cmp = (x[z] < boost) | (y[z] == 0); \ 267bc9febe8749e98a3812a0dc4380ceae75c29450Johann zbin_boost_ptr++; \ 277bc9febe8749e98a3812a0dc4380ceae75c29450Johann if (cmp) break; \ 287bc9febe8749e98a3812a0dc4380ceae75c29450Johann qcoeff_ptr[z] = y[z]; \ 297bc9febe8749e98a3812a0dc4380ceae75c29450Johann eob = i; \ 307bc9febe8749e98a3812a0dc4380ceae75c29450Johann zbin_boost_ptr = b->zrun_zbin_boost; \ 317bc9febe8749e98a3812a0dc4380ceae75c29450Johann } while (0) 327bc9febe8749e98a3812a0dc4380ceae75c29450Johann 337bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d) { 347bc9febe8749e98a3812a0dc4380ceae75c29450Johann char eob = 0; 357bc9febe8749e98a3812a0dc4380ceae75c29450Johann short *zbin_boost_ptr; 367bc9febe8749e98a3812a0dc4380ceae75c29450Johann short *qcoeff_ptr = d->qcoeff; 377bc9febe8749e98a3812a0dc4380ceae75c29450Johann DECLARE_ALIGNED(16, short, x[16]); 387bc9febe8749e98a3812a0dc4380ceae75c29450Johann DECLARE_ALIGNED(16, short, y[16]); 397bc9febe8749e98a3812a0dc4380ceae75c29450Johann 407bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1; 417bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); 427bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); 437bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); 447bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); 457bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra); 467bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin)); 477bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8)); 487bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i round0 = _mm_load_si128((__m128i *)(b->round)); 497bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); 507bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i quant0 = _mm_load_si128((__m128i *)(b->quant)); 517bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8)); 527bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); 537bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); 547bc9febe8749e98a3812a0dc4380ceae75c29450Johann 557bc9febe8749e98a3812a0dc4380ceae75c29450Johann memset(qcoeff_ptr, 0, 32); 567bc9febe8749e98a3812a0dc4380ceae75c29450Johann 577bc9febe8749e98a3812a0dc4380ceae75c29450Johann /* Duplicate to all lanes. */ 587bc9febe8749e98a3812a0dc4380ceae75c29450Johann zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); 597bc9febe8749e98a3812a0dc4380ceae75c29450Johann zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra); 607bc9febe8749e98a3812a0dc4380ceae75c29450Johann 617bc9febe8749e98a3812a0dc4380ceae75c29450Johann /* Sign of z: z >> 15 */ 627bc9febe8749e98a3812a0dc4380ceae75c29450Johann sz0 = _mm_srai_epi16(z0, 15); 637bc9febe8749e98a3812a0dc4380ceae75c29450Johann sz1 = _mm_srai_epi16(z1, 15); 647bc9febe8749e98a3812a0dc4380ceae75c29450Johann 657bc9febe8749e98a3812a0dc4380ceae75c29450Johann /* x = abs(z): (z ^ sz) - sz */ 667bc9febe8749e98a3812a0dc4380ceae75c29450Johann x0 = _mm_xor_si128(z0, sz0); 677bc9febe8749e98a3812a0dc4380ceae75c29450Johann x1 = _mm_xor_si128(z1, sz1); 687bc9febe8749e98a3812a0dc4380ceae75c29450Johann x0 = _mm_sub_epi16(x0, sz0); 697bc9febe8749e98a3812a0dc4380ceae75c29450Johann x1 = _mm_sub_epi16(x1, sz1); 707bc9febe8749e98a3812a0dc4380ceae75c29450Johann 717bc9febe8749e98a3812a0dc4380ceae75c29450Johann /* zbin[] + zbin_extra */ 727bc9febe8749e98a3812a0dc4380ceae75c29450Johann zbin0 = _mm_add_epi16(zbin0, zbin_extra); 737bc9febe8749e98a3812a0dc4380ceae75c29450Johann zbin1 = _mm_add_epi16(zbin1, zbin_extra); 747bc9febe8749e98a3812a0dc4380ceae75c29450Johann 757bc9febe8749e98a3812a0dc4380ceae75c29450Johann /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance 767bc9febe8749e98a3812a0dc4380ceae75c29450Johann * the equation because boost is the only value which can change: 777bc9febe8749e98a3812a0dc4380ceae75c29450Johann * x - (zbin[] + extra) >= boost */ 787bc9febe8749e98a3812a0dc4380ceae75c29450Johann x_minus_zbin0 = _mm_sub_epi16(x0, zbin0); 797bc9febe8749e98a3812a0dc4380ceae75c29450Johann x_minus_zbin1 = _mm_sub_epi16(x1, zbin1); 807bc9febe8749e98a3812a0dc4380ceae75c29450Johann 817bc9febe8749e98a3812a0dc4380ceae75c29450Johann _mm_store_si128((__m128i *)(x), x_minus_zbin0); 827bc9febe8749e98a3812a0dc4380ceae75c29450Johann _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1); 837bc9febe8749e98a3812a0dc4380ceae75c29450Johann 847bc9febe8749e98a3812a0dc4380ceae75c29450Johann /* All the remaining calculations are valid whether they are done now with 857bc9febe8749e98a3812a0dc4380ceae75c29450Johann * simd or later inside the loop one at a time. */ 867bc9febe8749e98a3812a0dc4380ceae75c29450Johann x0 = _mm_add_epi16(x0, round0); 877bc9febe8749e98a3812a0dc4380ceae75c29450Johann x1 = _mm_add_epi16(x1, round1); 887bc9febe8749e98a3812a0dc4380ceae75c29450Johann 897bc9febe8749e98a3812a0dc4380ceae75c29450Johann y0 = _mm_mulhi_epi16(x0, quant0); 907bc9febe8749e98a3812a0dc4380ceae75c29450Johann y1 = _mm_mulhi_epi16(x1, quant1); 917bc9febe8749e98a3812a0dc4380ceae75c29450Johann 927bc9febe8749e98a3812a0dc4380ceae75c29450Johann y0 = _mm_add_epi16(y0, x0); 937bc9febe8749e98a3812a0dc4380ceae75c29450Johann y1 = _mm_add_epi16(y1, x1); 947bc9febe8749e98a3812a0dc4380ceae75c29450Johann 957bc9febe8749e98a3812a0dc4380ceae75c29450Johann /* Instead of shifting each value independently we convert the scaling 967bc9febe8749e98a3812a0dc4380ceae75c29450Johann * factor with 1 << (16 - shift) so we can use multiply/return high half. */ 977bc9febe8749e98a3812a0dc4380ceae75c29450Johann y0 = _mm_mulhi_epi16(y0, quant_shift0); 987bc9febe8749e98a3812a0dc4380ceae75c29450Johann y1 = _mm_mulhi_epi16(y1, quant_shift1); 997bc9febe8749e98a3812a0dc4380ceae75c29450Johann 1007bc9febe8749e98a3812a0dc4380ceae75c29450Johann /* Return the sign: (y ^ sz) - sz */ 1017bc9febe8749e98a3812a0dc4380ceae75c29450Johann y0 = _mm_xor_si128(y0, sz0); 1027bc9febe8749e98a3812a0dc4380ceae75c29450Johann y1 = _mm_xor_si128(y1, sz1); 1037bc9febe8749e98a3812a0dc4380ceae75c29450Johann y0 = _mm_sub_epi16(y0, sz0); 1047bc9febe8749e98a3812a0dc4380ceae75c29450Johann y1 = _mm_sub_epi16(y1, sz1); 1057bc9febe8749e98a3812a0dc4380ceae75c29450Johann 1067bc9febe8749e98a3812a0dc4380ceae75c29450Johann _mm_store_si128((__m128i *)(y), y0); 1077bc9febe8749e98a3812a0dc4380ceae75c29450Johann _mm_store_si128((__m128i *)(y + 8), y1); 1087bc9febe8749e98a3812a0dc4380ceae75c29450Johann 1097bc9febe8749e98a3812a0dc4380ceae75c29450Johann zbin_boost_ptr = b->zrun_zbin_boost; 1107bc9febe8749e98a3812a0dc4380ceae75c29450Johann 1117bc9febe8749e98a3812a0dc4380ceae75c29450Johann /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */ 1127bc9febe8749e98a3812a0dc4380ceae75c29450Johann SELECT_EOB(1, 0); 1137bc9febe8749e98a3812a0dc4380ceae75c29450Johann SELECT_EOB(2, 1); 1147bc9febe8749e98a3812a0dc4380ceae75c29450Johann SELECT_EOB(3, 4); 1157bc9febe8749e98a3812a0dc4380ceae75c29450Johann SELECT_EOB(4, 8); 1167bc9febe8749e98a3812a0dc4380ceae75c29450Johann SELECT_EOB(5, 5); 1177bc9febe8749e98a3812a0dc4380ceae75c29450Johann SELECT_EOB(6, 2); 1187bc9febe8749e98a3812a0dc4380ceae75c29450Johann SELECT_EOB(7, 3); 1197bc9febe8749e98a3812a0dc4380ceae75c29450Johann SELECT_EOB(8, 6); 1207bc9febe8749e98a3812a0dc4380ceae75c29450Johann SELECT_EOB(9, 9); 1217bc9febe8749e98a3812a0dc4380ceae75c29450Johann SELECT_EOB(10, 12); 1227bc9febe8749e98a3812a0dc4380ceae75c29450Johann SELECT_EOB(11, 13); 1237bc9febe8749e98a3812a0dc4380ceae75c29450Johann SELECT_EOB(12, 10); 1247bc9febe8749e98a3812a0dc4380ceae75c29450Johann SELECT_EOB(13, 7); 1257bc9febe8749e98a3812a0dc4380ceae75c29450Johann SELECT_EOB(14, 11); 1267bc9febe8749e98a3812a0dc4380ceae75c29450Johann SELECT_EOB(15, 14); 1277bc9febe8749e98a3812a0dc4380ceae75c29450Johann SELECT_EOB(16, 15); 1287bc9febe8749e98a3812a0dc4380ceae75c29450Johann 1297bc9febe8749e98a3812a0dc4380ceae75c29450Johann y0 = _mm_load_si128((__m128i *)(d->qcoeff)); 1307bc9febe8749e98a3812a0dc4380ceae75c29450Johann y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8)); 1317bc9febe8749e98a3812a0dc4380ceae75c29450Johann 1327bc9febe8749e98a3812a0dc4380ceae75c29450Johann /* dqcoeff = qcoeff * dequant */ 1337bc9febe8749e98a3812a0dc4380ceae75c29450Johann y0 = _mm_mullo_epi16(y0, dequant0); 1347bc9febe8749e98a3812a0dc4380ceae75c29450Johann y1 = _mm_mullo_epi16(y1, dequant1); 1357bc9febe8749e98a3812a0dc4380ceae75c29450Johann 1367bc9febe8749e98a3812a0dc4380ceae75c29450Johann _mm_store_si128((__m128i *)(d->dqcoeff), y0); 1377bc9febe8749e98a3812a0dc4380ceae75c29450Johann _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1); 1387bc9febe8749e98a3812a0dc4380ceae75c29450Johann 1397bc9febe8749e98a3812a0dc4380ceae75c29450Johann *d->eob = eob; 140ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 141ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1427bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) { 143ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); 144ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); 145ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i round0 = _mm_load_si128((__m128i *)(b->round)); 146ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); 147ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast)); 148ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8)); 149ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); 150ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); 1517bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i inv_zig_zag0 = 1527bc9febe8749e98a3812a0dc4380ceae75c29450Johann _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag)); 1537bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i inv_zig_zag1 = 1547bc9febe8749e98a3812a0dc4380ceae75c29450Johann _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8)); 155ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 156ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones; 157ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 158ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* sign of z: z >> 15 */ 159ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang sz0 = _mm_srai_epi16(z0, 15); 160ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang sz1 = _mm_srai_epi16(z1, 15); 161ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 162ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* x = abs(z): (z ^ sz) - sz */ 163ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_xor_si128(z0, sz0); 164ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_xor_si128(z1, sz1); 165ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_sub_epi16(x0, sz0); 166ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_sub_epi16(x1, sz1); 167ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 168ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* x += round */ 169ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_add_epi16(x0, round0); 170ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_add_epi16(x1, round1); 171ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 172ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* y = (x * quant) >> 16 */ 173ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y0 = _mm_mulhi_epi16(x0, quant_fast0); 174ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y1 = _mm_mulhi_epi16(x1, quant_fast1); 175ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 176ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* x = abs(y) = (y ^ sz) - sz */ 177ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y0 = _mm_xor_si128(y0, sz0); 178ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y1 = _mm_xor_si128(y1, sz1); 179ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_sub_epi16(y0, sz0); 180ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_sub_epi16(y1, sz1); 181ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 182ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* qcoeff = x */ 183ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_store_si128((__m128i *)(d->qcoeff), x0); 184ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_store_si128((__m128i *)(d->qcoeff + 8), x1); 185ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 186ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* x * dequant */ 187ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang xdq0 = _mm_mullo_epi16(x0, dequant0); 188ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang xdq1 = _mm_mullo_epi16(x1, dequant1); 189ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 190ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* dqcoeff = x * dequant */ 191ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_store_si128((__m128i *)(d->dqcoeff), xdq0); 192ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1); 193ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 194ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* build a mask for the zig zag */ 195ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang zeros = _mm_setzero_si128(); 196ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 197ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_cmpeq_epi16(x0, zeros); 198ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_cmpeq_epi16(x1, zeros); 199ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 200ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang ones = _mm_cmpeq_epi16(zeros, zeros); 201ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 202ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_xor_si128(x0, ones); 203ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_xor_si128(x1, ones); 204ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 205ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_and_si128(x0, inv_zig_zag0); 206ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_and_si128(x1, inv_zig_zag1); 207ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 208ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_max_epi16(x0, x1); 209ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 210ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* now down to 8 */ 2117bc9febe8749e98a3812a0dc4380ceae75c29450Johann x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110 212ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 213ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_max_epi16(x0, x1); 214ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 215ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* only 4 left */ 2167bc9febe8749e98a3812a0dc4380ceae75c29450Johann x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110 217ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 218ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_max_epi16(x0, x1); 219ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 220ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* okay, just 2! */ 2217bc9febe8749e98a3812a0dc4380ceae75c29450Johann x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001 222ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 223ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_max_epi16(x0, x1); 224ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 225ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang *d->eob = 0xFF & _mm_cvtsi128_si32(x0); 226ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 227