1ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang/* 2ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 3ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * 4ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * Use of this source code is governed by a BSD-style license 5ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * that can be found in the LICENSE file in the root of the source 6ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * tree. An additional intellectual property rights grant can be found 7ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * in the file PATENTS. All contributing project authors may 8ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * be found in the AUTHORS file in the root of the source tree. 9ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang */ 10ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 11ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 12ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vpx_config.h" 13ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp8_rtcd.h" 14ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vpx_ports/x86.h" 15ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vpx_mem/vpx_mem.h" 16ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp8/encoder/block.h" 17ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */ 18ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 19ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include <mmintrin.h> /* MMX */ 20ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include <xmmintrin.h> /* SSE */ 21ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include <emmintrin.h> /* SSE2 */ 22ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 23ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#define SELECT_EOB(i, z) \ 24ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang do { \ 25ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang short boost = *zbin_boost_ptr; \ 26ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang int cmp = (x[z] < boost) | (y[z] == 0); \ 27ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang zbin_boost_ptr++; \ 28ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang if (cmp) \ 29ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang goto select_eob_end_##i; \ 30ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang qcoeff_ptr[z] = y[z]; \ 31ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang eob = i; \ 32ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang zbin_boost_ptr = b->zrun_zbin_boost; \ 33ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang select_eob_end_##i:; \ 34ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang } while (0) 35ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 36ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d) 37ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang{ 38ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang char eob = 0; 39ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang short *zbin_boost_ptr = b->zrun_zbin_boost; 40ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang short *qcoeff_ptr = d->qcoeff; 41ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang DECLARE_ALIGNED_ARRAY(16, short, x, 16); 42ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang DECLARE_ALIGNED_ARRAY(16, short, y, 16); 43ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 44ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1; 45ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); 46ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); 47ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); 48ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8)); 49ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra); 50ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin)); 51ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8)); 52ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i round0 = _mm_load_si128((__m128i *)(b->round)); 53ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); 54ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i quant0 = _mm_load_si128((__m128i *)(b->quant)); 55ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8)); 56ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); 57ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); 58ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 59ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang vpx_memset(qcoeff_ptr, 0, 32); 60ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 61ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Duplicate to all lanes. */ 62ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); 63ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra); 64ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 65ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Sign of z: z >> 15 */ 66ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang sz0 = _mm_srai_epi16(z0, 15); 67ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang sz1 = _mm_srai_epi16(z1, 15); 68ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 69ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* x = abs(z): (z ^ sz) - sz */ 70ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_xor_si128(z0, sz0); 71ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_xor_si128(z1, sz1); 72ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_sub_epi16(x0, sz0); 73ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_sub_epi16(x1, sz1); 74ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 75ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* zbin[] + zbin_extra */ 76ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang zbin0 = _mm_add_epi16(zbin0, zbin_extra); 77ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang zbin1 = _mm_add_epi16(zbin1, zbin_extra); 78ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 79ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance 80ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * the equation because boost is the only value which can change: 81ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * x - (zbin[] + extra) >= boost */ 82ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x_minus_zbin0 = _mm_sub_epi16(x0, zbin0); 83ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x_minus_zbin1 = _mm_sub_epi16(x1, zbin1); 84ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 85ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_store_si128((__m128i *)(x), x_minus_zbin0); 86ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1); 87ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 88ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* All the remaining calculations are valid whether they are done now with 89ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * simd or later inside the loop one at a time. */ 90ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_add_epi16(x0, round0); 91ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_add_epi16(x1, round1); 92ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 93ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y0 = _mm_mulhi_epi16(x0, quant0); 94ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y1 = _mm_mulhi_epi16(x1, quant1); 95ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 96ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y0 = _mm_add_epi16(y0, x0); 97ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y1 = _mm_add_epi16(y1, x1); 98ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 99ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Instead of shifting each value independently we convert the scaling 100ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang * factor with 1 << (16 - shift) so we can use multiply/return high half. */ 101ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y0 = _mm_mulhi_epi16(y0, quant_shift0); 102ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y1 = _mm_mulhi_epi16(y1, quant_shift1); 103ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 104ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* Return the sign: (y ^ sz) - sz */ 105ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y0 = _mm_xor_si128(y0, sz0); 106ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y1 = _mm_xor_si128(y1, sz1); 107ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y0 = _mm_sub_epi16(y0, sz0); 108ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y1 = _mm_sub_epi16(y1, sz1); 109ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 110ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_store_si128((__m128i *)(y), y0); 111ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_store_si128((__m128i *)(y + 8), y1); 112ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 113ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang zbin_boost_ptr = b->zrun_zbin_boost; 114ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 115ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */ 116ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SELECT_EOB(1, 0); 117ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SELECT_EOB(2, 1); 118ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SELECT_EOB(3, 4); 119ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SELECT_EOB(4, 8); 120ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SELECT_EOB(5, 5); 121ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SELECT_EOB(6, 2); 122ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SELECT_EOB(7, 3); 123ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SELECT_EOB(8, 6); 124ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SELECT_EOB(9, 9); 125ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SELECT_EOB(10, 12); 126ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SELECT_EOB(11, 13); 127ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SELECT_EOB(12, 10); 128ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SELECT_EOB(13, 7); 129ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SELECT_EOB(14, 11); 130ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SELECT_EOB(15, 14); 131ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SELECT_EOB(16, 15); 132ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 133ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y0 = _mm_load_si128((__m128i *)(d->qcoeff)); 134ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8)); 135ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 136ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* dqcoeff = qcoeff * dequant */ 137ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y0 = _mm_mullo_epi16(y0, dequant0); 138ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y1 = _mm_mullo_epi16(y1, dequant1); 139ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 140ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_store_si128((__m128i *)(d->dqcoeff), y0); 141ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1); 142ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 143ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang *d->eob = eob; 144ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 145ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 146ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangvoid vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) 147ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang{ 148ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); 149ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); 150ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i round0 = _mm_load_si128((__m128i *)(b->round)); 151ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); 152ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast)); 153ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8)); 154ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); 155ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); 156ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag)); 157ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8)); 158ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 159ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones; 160ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 161ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* sign of z: z >> 15 */ 162ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang sz0 = _mm_srai_epi16(z0, 15); 163ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang sz1 = _mm_srai_epi16(z1, 15); 164ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 165ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* x = abs(z): (z ^ sz) - sz */ 166ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_xor_si128(z0, sz0); 167ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_xor_si128(z1, sz1); 168ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_sub_epi16(x0, sz0); 169ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_sub_epi16(x1, sz1); 170ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 171ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* x += round */ 172ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_add_epi16(x0, round0); 173ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_add_epi16(x1, round1); 174ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 175ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* y = (x * quant) >> 16 */ 176ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y0 = _mm_mulhi_epi16(x0, quant_fast0); 177ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y1 = _mm_mulhi_epi16(x1, quant_fast1); 178ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 179ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* x = abs(y) = (y ^ sz) - sz */ 180ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y0 = _mm_xor_si128(y0, sz0); 181ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang y1 = _mm_xor_si128(y1, sz1); 182ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_sub_epi16(y0, sz0); 183ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_sub_epi16(y1, sz1); 184ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 185ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* qcoeff = x */ 186ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_store_si128((__m128i *)(d->qcoeff), x0); 187ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_store_si128((__m128i *)(d->qcoeff + 8), x1); 188ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 189ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* x * dequant */ 190ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang xdq0 = _mm_mullo_epi16(x0, dequant0); 191ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang xdq1 = _mm_mullo_epi16(x1, dequant1); 192ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 193ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* dqcoeff = x * dequant */ 194ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_store_si128((__m128i *)(d->dqcoeff), xdq0); 195ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1); 196ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 197ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* build a mask for the zig zag */ 198ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang zeros = _mm_setzero_si128(); 199ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 200ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_cmpeq_epi16(x0, zeros); 201ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_cmpeq_epi16(x1, zeros); 202ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 203ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang ones = _mm_cmpeq_epi16(zeros, zeros); 204ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 205ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_xor_si128(x0, ones); 206ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_xor_si128(x1, ones); 207ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 208ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_and_si128(x0, inv_zig_zag0); 209ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_and_si128(x1, inv_zig_zag1); 210ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 211ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_max_epi16(x0, x1); 212ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 213ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* now down to 8 */ 214ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110 215ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 216ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_max_epi16(x0, x1); 217ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 218ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* only 4 left */ 219ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110 220ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 221ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_max_epi16(x0, x1); 222ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 223ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang /* okay, just 2! */ 224ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001 225ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 226ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang x0 = _mm_max_epi16(x0, x1); 227ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 228ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang *d->eob = 0xFF & _mm_cvtsi128_si32(x0); 229ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang} 230