1da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* 2da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * 4da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Use of this source code is governed by a BSD-style license 5da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * that can be found in the LICENSE file in the root of the source 6da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * tree. An additional intellectual property rights grant can be found 7da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * in the file PATENTS. All contributing project authors may 8da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * be found in the AUTHORS file in the root of the source tree. 9da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian */ 10da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 11da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include <emmintrin.h> 12da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include <xmmintrin.h> 13da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "./vp9_rtcd.h" 15da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx/vpx_integer.h" 168b92989c89bec8632aa47dc58dc162f199d62edcJames Zern#include "vpx_dsp/vpx_dsp_common.h" 178b92989c89bec8632aa47dc58dc162f199d62edcJames Zern#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" 18da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 198b92989c89bec8632aa47dc58dc162f199d62edcJames Zernvoid vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, 208b92989c89bec8632aa47dc58dc162f199d62edcJames Zern int skip_block, const int16_t *round_ptr, 218b92989c89bec8632aa47dc58dc162f199d62edcJames Zern const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, 228b92989c89bec8632aa47dc58dc162f199d62edcJames Zern tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, 237bc9febe8749e98a3812a0dc4380ceae75c29450Johann uint16_t *eob_ptr, const int16_t *scan_ptr, 247bc9febe8749e98a3812a0dc4380ceae75c29450Johann const int16_t *iscan_ptr) { 25da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i zero; 26da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i thr; 27da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t nzflag; 288b92989c89bec8632aa47dc58dc162f199d62edcJames Zern 29da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian (void)scan_ptr; 30da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 31da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff_ptr += n_coeffs; 32da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian iscan_ptr += n_coeffs; 33da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff_ptr += n_coeffs; 34da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dqcoeff_ptr += n_coeffs; 35da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian n_coeffs = -n_coeffs; 36da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian zero = _mm_setzero_si128(); 37da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 38da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (!skip_block) { 39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i eob; 40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i round, quant, dequant; 41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 42da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i coeff0, coeff1; 43da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 44da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Setup global values 45da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 467bc9febe8749e98a3812a0dc4380ceae75c29450Johann round = _mm_load_si128((const __m128i *)round_ptr); 477bc9febe8749e98a3812a0dc4380ceae75c29450Johann quant = _mm_load_si128((const __m128i *)quant_ptr); 487bc9febe8749e98a3812a0dc4380ceae75c29450Johann dequant = _mm_load_si128((const __m128i *)dequant_ptr); 49da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 50da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 51da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 52da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i coeff0_sign, coeff1_sign; 53da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i qcoeff0, qcoeff1; 54da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i qtmp0, qtmp1; 55da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Do DC and first 15 AC 568b92989c89bec8632aa47dc58dc162f199d62edcJames Zern coeff0 = load_tran_low(coeff_ptr + n_coeffs); 578b92989c89bec8632aa47dc58dc162f199d62edcJames Zern coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); 58da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 59da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Poor man's sign extract 60da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff0_sign = _mm_srai_epi16(coeff0, 15); 61da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff1_sign = _mm_srai_epi16(coeff1, 15); 62da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); 63da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); 64da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 65da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 66da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 67da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_adds_epi16(qcoeff0, round); 68da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian round = _mm_unpackhi_epi64(round, round); 69da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_adds_epi16(qcoeff1, round); 70da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); 71da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian quant = _mm_unpackhi_epi64(quant, quant); 72da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); 73da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 74da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Reinsert signs 75da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); 76da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); 77da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 78da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 79da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 808b92989c89bec8632aa47dc58dc162f199d62edcJames Zern store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); 818b92989c89bec8632aa47dc58dc162f199d62edcJames Zern store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); 82da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 83da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff0 = _mm_mullo_epi16(qcoeff0, dequant); 84da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dequant = _mm_unpackhi_epi64(dequant, dequant); 85da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff1 = _mm_mullo_epi16(qcoeff1, dequant); 86da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 878b92989c89bec8632aa47dc58dc162f199d62edcJames Zern store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); 888b92989c89bec8632aa47dc58dc162f199d62edcJames Zern store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); 89da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 90da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 91da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 92da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Scan for eob 93da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i zero_coeff0, zero_coeff1; 94da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i nzero_coeff0, nzero_coeff1; 95da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i iscan0, iscan1; 96da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i eob1; 97da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); 98da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); 99da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); 100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); 1017bc9febe8749e98a3812a0dc4380ceae75c29450Johann iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); 1027bc9febe8749e98a3812a0dc4380ceae75c29450Johann iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); 103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Add one to convert from indices to counts 104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); 105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); 106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob = _mm_and_si128(iscan0, nzero_coeff0); 107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob1 = _mm_and_si128(iscan1, nzero_coeff1); 108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob = _mm_max_epi16(eob, eob1); 109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian n_coeffs += 8 * 2; 111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian thr = _mm_srai_epi16(dequant, 1); 114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // AC only loop 116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian while (n_coeffs < 0) { 117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i coeff0, coeff1; 118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i coeff0_sign, coeff1_sign; 120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i qcoeff0, qcoeff1; 121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i qtmp0, qtmp1; 122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1238b92989c89bec8632aa47dc58dc162f199d62edcJames Zern coeff0 = load_tran_low(coeff_ptr + n_coeffs); 1248b92989c89bec8632aa47dc58dc162f199d62edcJames Zern coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); 125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Poor man's sign extract 127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff0_sign = _mm_srai_epi16(coeff0, 15); 128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff1_sign = _mm_srai_epi16(coeff1, 15); 129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); 130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); 131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | 1357bc9febe8749e98a3812a0dc4380ceae75c29450Johann _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); 136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (nzflag) { 138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_adds_epi16(qcoeff0, round); 139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_adds_epi16(qcoeff1, round); 140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); 141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); 142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Reinsert signs 144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); 145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); 146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1498b92989c89bec8632aa47dc58dc162f199d62edcJames Zern store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); 1508b92989c89bec8632aa47dc58dc162f199d62edcJames Zern store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); 151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff0 = _mm_mullo_epi16(qcoeff0, dequant); 153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff1 = _mm_mullo_epi16(qcoeff1, dequant); 154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1558b92989c89bec8632aa47dc58dc162f199d62edcJames Zern store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); 1568b92989c89bec8632aa47dc58dc162f199d62edcJames Zern store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); 157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 1588b92989c89bec8632aa47dc58dc162f199d62edcJames Zern store_zero_tran_low(qcoeff_ptr + n_coeffs); 1598b92989c89bec8632aa47dc58dc162f199d62edcJames Zern store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); 160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1618b92989c89bec8632aa47dc58dc162f199d62edcJames Zern store_zero_tran_low(dqcoeff_ptr + n_coeffs); 1628b92989c89bec8632aa47dc58dc162f199d62edcJames Zern store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); 163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (nzflag) { 167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Scan for eob 168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i zero_coeff0, zero_coeff1; 169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i nzero_coeff0, nzero_coeff1; 170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i iscan0, iscan1; 171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i eob0, eob1; 172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); 173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); 174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); 175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); 1767bc9febe8749e98a3812a0dc4380ceae75c29450Johann iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); 1777bc9febe8749e98a3812a0dc4380ceae75c29450Johann iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); 178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Add one to convert from indices to counts 179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); 180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); 181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob0 = _mm_and_si128(iscan0, nzero_coeff0); 182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob1 = _mm_and_si128(iscan1, nzero_coeff1); 183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob0 = _mm_max_epi16(eob0, eob1); 184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob = _mm_max_epi16(eob, eob0); 185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian n_coeffs += 8 * 2; 187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Accumulate EOB 190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i eob_shuffled; 192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob_shuffled = _mm_shuffle_epi32(eob, 0xe); 193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob = _mm_max_epi16(eob, eob_shuffled); 194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); 195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob = _mm_max_epi16(eob, eob_shuffled); 196da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); 197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob = _mm_max_epi16(eob, eob_shuffled); 198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *eob_ptr = _mm_extract_epi16(eob, 1); 199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian do { 2028b92989c89bec8632aa47dc58dc162f199d62edcJames Zern store_zero_tran_low(qcoeff_ptr + n_coeffs); 2038b92989c89bec8632aa47dc58dc162f199d62edcJames Zern store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); 2048b92989c89bec8632aa47dc58dc162f199d62edcJames Zern 2058b92989c89bec8632aa47dc58dc162f199d62edcJames Zern store_zero_tran_low(dqcoeff_ptr + n_coeffs); 2068b92989c89bec8632aa47dc58dc162f199d62edcJames Zern store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); 207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian n_coeffs += 8 * 2; 208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } while (n_coeffs < 0); 209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *eob_ptr = 0; 210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 212