1c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org/* 2c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 3c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org * 4c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org * Use of this source code is governed by a BSD-style license 5c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org * that can be found in the LICENSE file in the root of the source 6c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org * tree. An additional intellectual property rights grant can be found 7c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org * in the file PATENTS. All contributing project authors may 8c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org * be found in the AUTHORS file in the root of the source tree. 9c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org */ 10c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 11c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 12c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org#include "vpx_config.h" 13c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org#include "vp8_rtcd.h" 14c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org#include "vpx_ports/x86.h" 15c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org#include "vpx_mem/vpx_mem.h" 16c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org#include "vp8/encoder/block.h" 17c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */ 18c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 19c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org#include <mmintrin.h> /* MMX */ 20c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org#include <xmmintrin.h> /* SSE */ 21c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org#include <emmintrin.h> /* SSE2 */ 22c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 23c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org#define SELECT_EOB(i, z) \ 24c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org do { \ 25c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org short boost = *zbin_boost_ptr; \ 26c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org int cmp = (x[z] < boost) | (y[z] == 0); \ 27c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org zbin_boost_ptr++; \ 28c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org if (cmp) \ 2988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org break; \ 30c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org qcoeff_ptr[z] = y[z]; \ 31c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org eob = i; \ 32c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org zbin_boost_ptr = b->zrun_zbin_boost; \ 33c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org } while (0) 34c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 35c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.orgvoid vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d) 36c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org{ 37c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org char eob = 0; 38c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org short *zbin_boost_ptr = b->zrun_zbin_boost; 39c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org short *qcoeff_ptr = d->qcoeff; 40c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org DECLARE_ALIGNED_ARRAY(16, short, x, 16); 41c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org DECLARE_ALIGNED_ARRAY(16, short, y, 16); 42c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 43c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1; 44c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); 45c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); 46c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); 47c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8)); 48c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra); 49c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin)); 50c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8)); 51c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i round0 = _mm_load_si128((__m128i *)(b->round)); 52c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); 53c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i quant0 = _mm_load_si128((__m128i *)(b->quant)); 54c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8)); 55c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); 56c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); 57c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 58c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org vpx_memset(qcoeff_ptr, 0, 32); 59c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 60c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* Duplicate to all lanes. */ 61c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); 62c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra); 63c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 64c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* Sign of z: z >> 15 */ 65c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org sz0 = _mm_srai_epi16(z0, 15); 66c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org sz1 = _mm_srai_epi16(z1, 15); 67c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 68c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* x = abs(z): (z ^ sz) - sz */ 69c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x0 = _mm_xor_si128(z0, sz0); 70c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x1 = _mm_xor_si128(z1, sz1); 71c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x0 = _mm_sub_epi16(x0, sz0); 72c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x1 = _mm_sub_epi16(x1, sz1); 73c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 74c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* zbin[] + zbin_extra */ 75c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org zbin0 = _mm_add_epi16(zbin0, zbin_extra); 76c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org zbin1 = _mm_add_epi16(zbin1, zbin_extra); 77c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 78c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance 79c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org * the equation because boost is the only value which can change: 80c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org * x - (zbin[] + extra) >= boost */ 81c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x_minus_zbin0 = _mm_sub_epi16(x0, zbin0); 82c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x_minus_zbin1 = _mm_sub_epi16(x1, zbin1); 83c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 84c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org _mm_store_si128((__m128i *)(x), x_minus_zbin0); 85c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1); 86c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 87c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* All the remaining calculations are valid whether they are done now with 88c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org * simd or later inside the loop one at a time. */ 89c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x0 = _mm_add_epi16(x0, round0); 90c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x1 = _mm_add_epi16(x1, round1); 91c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 92c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y0 = _mm_mulhi_epi16(x0, quant0); 93c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y1 = _mm_mulhi_epi16(x1, quant1); 94c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 95c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y0 = _mm_add_epi16(y0, x0); 96c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y1 = _mm_add_epi16(y1, x1); 97c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 98c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* Instead of shifting each value independently we convert the scaling 99c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org * factor with 1 << (16 - shift) so we can use multiply/return high half. */ 100c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y0 = _mm_mulhi_epi16(y0, quant_shift0); 101c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y1 = _mm_mulhi_epi16(y1, quant_shift1); 102c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 103c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* Return the sign: (y ^ sz) - sz */ 104c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y0 = _mm_xor_si128(y0, sz0); 105c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y1 = _mm_xor_si128(y1, sz1); 106c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y0 = _mm_sub_epi16(y0, sz0); 107c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y1 = _mm_sub_epi16(y1, sz1); 108c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 109c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org _mm_store_si128((__m128i *)(y), y0); 110c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org _mm_store_si128((__m128i *)(y + 8), y1); 111c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 112c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org zbin_boost_ptr = b->zrun_zbin_boost; 113c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 114c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */ 115c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org SELECT_EOB(1, 0); 116c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org SELECT_EOB(2, 1); 117c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org SELECT_EOB(3, 4); 118c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org SELECT_EOB(4, 8); 119c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org SELECT_EOB(5, 5); 120c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org SELECT_EOB(6, 2); 121c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org SELECT_EOB(7, 3); 122c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org SELECT_EOB(8, 6); 123c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org SELECT_EOB(9, 9); 124c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org SELECT_EOB(10, 12); 125c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org SELECT_EOB(11, 13); 126c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org SELECT_EOB(12, 10); 127c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org SELECT_EOB(13, 7); 128c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org SELECT_EOB(14, 11); 129c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org SELECT_EOB(15, 14); 130c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org SELECT_EOB(16, 15); 131c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 132c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y0 = _mm_load_si128((__m128i *)(d->qcoeff)); 133c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8)); 134c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 135c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* dqcoeff = qcoeff * dequant */ 136c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y0 = _mm_mullo_epi16(y0, dequant0); 137c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y1 = _mm_mullo_epi16(y1, dequant1); 138c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 139c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org _mm_store_si128((__m128i *)(d->dqcoeff), y0); 140c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1); 141c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 142c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org *d->eob = eob; 143c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org} 144c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 145c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.orgvoid vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) 146c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org{ 147c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); 148c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); 149c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i round0 = _mm_load_si128((__m128i *)(b->round)); 150c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); 151c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast)); 152c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8)); 153c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); 154c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); 155c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag)); 156c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8)); 157c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 158c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones; 159c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 160c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* sign of z: z >> 15 */ 161c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org sz0 = _mm_srai_epi16(z0, 15); 162c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org sz1 = _mm_srai_epi16(z1, 15); 163c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 164c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* x = abs(z): (z ^ sz) - sz */ 165c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x0 = _mm_xor_si128(z0, sz0); 166c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x1 = _mm_xor_si128(z1, sz1); 167c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x0 = _mm_sub_epi16(x0, sz0); 168c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x1 = _mm_sub_epi16(x1, sz1); 169c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 170c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* x += round */ 171c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x0 = _mm_add_epi16(x0, round0); 172c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x1 = _mm_add_epi16(x1, round1); 173c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 174c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* y = (x * quant) >> 16 */ 175c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y0 = _mm_mulhi_epi16(x0, quant_fast0); 176c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y1 = _mm_mulhi_epi16(x1, quant_fast1); 177c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 178c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* x = abs(y) = (y ^ sz) - sz */ 179c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y0 = _mm_xor_si128(y0, sz0); 180c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org y1 = _mm_xor_si128(y1, sz1); 181c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x0 = _mm_sub_epi16(y0, sz0); 182c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x1 = _mm_sub_epi16(y1, sz1); 183c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 184c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* qcoeff = x */ 185c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org _mm_store_si128((__m128i *)(d->qcoeff), x0); 186c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org _mm_store_si128((__m128i *)(d->qcoeff + 8), x1); 187c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 188c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* x * dequant */ 189c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org xdq0 = _mm_mullo_epi16(x0, dequant0); 190c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org xdq1 = _mm_mullo_epi16(x1, dequant1); 191c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 192c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* dqcoeff = x * dequant */ 193c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org _mm_store_si128((__m128i *)(d->dqcoeff), xdq0); 194c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1); 195c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 196c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* build a mask for the zig zag */ 197c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org zeros = _mm_setzero_si128(); 198c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 199c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x0 = _mm_cmpeq_epi16(x0, zeros); 200c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x1 = _mm_cmpeq_epi16(x1, zeros); 201c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 202c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org ones = _mm_cmpeq_epi16(zeros, zeros); 203c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 204c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x0 = _mm_xor_si128(x0, ones); 205c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x1 = _mm_xor_si128(x1, ones); 206c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 207c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x0 = _mm_and_si128(x0, inv_zig_zag0); 208c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x1 = _mm_and_si128(x1, inv_zig_zag1); 209c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 210c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x0 = _mm_max_epi16(x0, x1); 211c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 212c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* now down to 8 */ 213c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110 214c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 215c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x0 = _mm_max_epi16(x0, x1); 216c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 217c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* only 4 left */ 218c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110 219c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 220c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x0 = _mm_max_epi16(x0, x1); 221c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 222c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org /* okay, just 2! */ 223c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001 224c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 225c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org x0 = _mm_max_epi16(x0, x1); 226c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org 227c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org *d->eob = 0xFF & _mm_cvtsi128_si32(x0); 228c5c68e6d76b1fafad38fcecbb4ffaed6f142a9fcjohannkoenig@chromium.org} 229