1da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* 2da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * 4da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Use of this source code is governed by a BSD-style license 5da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * that can be found in the LICENSE file in the root of the source 6da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * tree. An additional intellectual property rights grant can be found 7da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * in the file PATENTS. All contributing project authors may 8da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * be found in the AUTHORS file in the root of the source tree. 9da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian */ 10da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 11da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include <assert.h> 12da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include <tmmintrin.h> // SSSE3 13da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "./vp9_rtcd.h" 157bc9febe8749e98a3812a0dc4380ceae75c29450Johann#include "./vpx_config.h" 167bc9febe8749e98a3812a0dc4380ceae75c29450Johann#include "vpx_dsp/vpx_dsp_common.h" 170a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" 18da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/x86/inv_txfm_sse2.h" 19da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/x86/txfm_common_sse2.h" 20da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 217bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid vp9_fdct8x8_quant_ssse3( 227bc9febe8749e98a3812a0dc4380ceae75c29450Johann const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, 230a39d0a697ff3603e8c100300fda363658e10b23James Zern int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, 247bc9febe8749e98a3812a0dc4380ceae75c29450Johann tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, 257bc9febe8749e98a3812a0dc4380ceae75c29450Johann uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { 26da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i zero; 27da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int pass; 280a39d0a697ff3603e8c100300fda363658e10b23James Zern 29da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Constants 30da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // When we use them, in one case, they are all the same. In all others 31da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // it's a pair of them that we need to repeat four times. This is done 32da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // by constructing the 32 bit constant corresponding to that pair. 33da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170); 34df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 35da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 36da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 37da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 38da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 42da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 43da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Load input 447bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); 457bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); 467bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); 477bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); 487bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); 497bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); 507bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); 517bc9febe8749e98a3812a0dc4380ceae75c29450Johann __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); 52da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i *in[8]; 53da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int index = 0; 54da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 55da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian (void)scan_ptr; 56da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian (void)coeff_ptr; 57da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 58da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Pre-condition input (shift by two) 59da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in0 = _mm_slli_epi16(in0, 2); 60da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in1 = _mm_slli_epi16(in1, 2); 61da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in2 = _mm_slli_epi16(in2, 2); 62da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in3 = _mm_slli_epi16(in3, 2); 63da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in4 = _mm_slli_epi16(in4, 2); 64da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in5 = _mm_slli_epi16(in5, 2); 65da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in6 = _mm_slli_epi16(in6, 2); 66da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in7 = _mm_slli_epi16(in7, 2); 67da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 68da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[0] = &in0; 69da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[1] = &in1; 70da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[2] = &in2; 71da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[3] = &in3; 72da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[4] = &in4; 73da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[5] = &in5; 74da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[6] = &in6; 75da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in[7] = &in7; 76da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 77da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // We do two passes, first the columns, then the rows. The results of the 78da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // first pass are transposed so that the same column code can be reused. The 79da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // results of the second pass are also transposed so that the rows (processed 80da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // as columns) are put back in row positions. 81da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (pass = 0; pass < 2; pass++) { 82da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // To store results of each pass before the transpose. 83da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i res0, res1, res2, res3, res4, res5, res6, res7; 84da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Add/subtract 85da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i q0 = _mm_add_epi16(in0, in7); 86da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i q1 = _mm_add_epi16(in1, in6); 87da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i q2 = _mm_add_epi16(in2, in5); 88da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i q3 = _mm_add_epi16(in3, in4); 89da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i q4 = _mm_sub_epi16(in3, in4); 90da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i q5 = _mm_sub_epi16(in2, in5); 91da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i q6 = _mm_sub_epi16(in1, in6); 92da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i q7 = _mm_sub_epi16(in0, in7); 93da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Work on first four results 94da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 95da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Add/subtract 96da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i r0 = _mm_add_epi16(q0, q3); 97da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i r1 = _mm_add_epi16(q1, q2); 98da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i r2 = _mm_sub_epi16(q1, q2); 99da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i r3 = _mm_sub_epi16(q0, q3); 100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Interleave to do the multiply by constants which gets us into 32bits 101da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i t0 = _mm_unpacklo_epi16(r0, r1); 102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i t1 = _mm_unpackhi_epi16(r0, r1); 103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i t2 = _mm_unpacklo_epi16(r2, r3); 104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i t3 = _mm_unpackhi_epi16(r2, r3); 105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); 108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); 109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); 110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); 112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); 113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); 114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); 115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // dct_const_round_shift 116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Combine 137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian res0 = _mm_packs_epi32(w0, w1); 139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian res4 = _mm_packs_epi32(w2, w3); 140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian res2 = _mm_packs_epi32(w4, w5); 141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian res6 = _mm_packs_epi32(w6, w7); 142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Work on next four results 144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Interleave to do the multiply by constants which gets us into 32bits 146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i d0 = _mm_sub_epi16(q6, q5); 147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i d1 = _mm_add_epi16(q6, q5); 148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16); 149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16); 150da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Add/subtract 152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i x0 = _mm_add_epi16(q4, r0); 153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i x1 = _mm_sub_epi16(q4, r0); 154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i x2 = _mm_sub_epi16(q7, r1); 155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i x3 = _mm_add_epi16(q7, r1); 156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Interleave to do the multiply by constants which gets us into 32bits 157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i t0 = _mm_unpacklo_epi16(x0, x3); 158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i t1 = _mm_unpackhi_epi16(x0, x3); 159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i t2 = _mm_unpacklo_epi16(x1, x2); 160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i t3 = _mm_unpackhi_epi16(x1, x2); 161da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); 162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); 163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); 164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); 165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); 166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); 167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); 168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); 169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // dct_const_round_shift 170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Combine 187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian res1 = _mm_packs_epi32(w0, w1); 188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian res7 = _mm_packs_epi32(w2, w3); 189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian res5 = _mm_packs_epi32(w4, w5); 190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian res3 = _mm_packs_epi32(w6, w7); 191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Transpose the 8x8. 193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 00 01 02 03 04 05 06 07 195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 10 11 12 13 14 15 16 17 196da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 20 21 22 23 24 25 26 27 197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 30 31 32 33 34 35 36 37 198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 40 41 42 43 44 45 46 47 199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 50 51 52 53 54 55 56 57 200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 60 61 62 63 64 65 66 67 201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 70 71 72 73 74 75 76 77 202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); 203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); 204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); 205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); 206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); 207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); 208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); 209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); 210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 00 10 01 11 02 12 03 13 211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 20 30 21 31 22 32 23 33 212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 04 14 05 15 06 16 07 17 213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 24 34 25 35 26 36 27 37 214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 40 50 41 51 42 52 43 53 215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 60 70 61 71 62 72 63 73 216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 54 54 55 55 56 56 57 57 217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 64 74 65 75 66 76 67 77 218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); 220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 223da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 224da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); 225da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 226da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 00 10 20 30 01 11 21 31 227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 40 50 60 70 41 51 61 71 228da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 02 12 22 32 03 13 23 33 229da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 42 52 62 72 43 53 63 73 230da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 04 14 24 34 05 15 21 36 231da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 44 54 64 74 45 55 61 76 232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 06 16 26 36 07 17 27 37 233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 46 56 66 76 47 57 67 77 234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); 235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); 236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); 237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); 238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); 239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); 240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); 241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); 242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 00 10 20 30 40 50 60 70 243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 01 11 21 31 41 51 61 71 244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 02 12 22 32 42 52 62 72 245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 03 13 23 33 43 53 63 73 246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 04 14 24 34 44 54 64 74 247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 05 15 25 35 45 55 65 75 248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 06 16 26 36 46 56 66 76 249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // 07 17 27 37 47 57 67 77 250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Post-condition output and store it 253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Post-condition (division by two) 255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // division of two 16 bits signed numbers using shifts 256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // n / 2 = (n - (n >> 15)) >> 1 257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i sign_in0 = _mm_srai_epi16(in0, 15); 258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i sign_in1 = _mm_srai_epi16(in1, 15); 259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i sign_in2 = _mm_srai_epi16(in2, 15); 260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i sign_in3 = _mm_srai_epi16(in3, 15); 261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i sign_in4 = _mm_srai_epi16(in4, 15); 262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i sign_in5 = _mm_srai_epi16(in5, 15); 263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i sign_in6 = _mm_srai_epi16(in6, 15); 264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const __m128i sign_in7 = _mm_srai_epi16(in7, 15); 265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in0 = _mm_sub_epi16(in0, sign_in0); 266da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in1 = _mm_sub_epi16(in1, sign_in1); 267da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in2 = _mm_sub_epi16(in2, sign_in2); 268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in3 = _mm_sub_epi16(in3, sign_in3); 269da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in4 = _mm_sub_epi16(in4, sign_in4); 270da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in5 = _mm_sub_epi16(in5, sign_in5); 271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in6 = _mm_sub_epi16(in6, sign_in6); 272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in7 = _mm_sub_epi16(in7, sign_in7); 273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in0 = _mm_srai_epi16(in0, 1); 274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in1 = _mm_srai_epi16(in1, 1); 275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in2 = _mm_srai_epi16(in2, 1); 276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in3 = _mm_srai_epi16(in3, 1); 277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in4 = _mm_srai_epi16(in4, 1); 278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in5 = _mm_srai_epi16(in5, 1); 279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in6 = _mm_srai_epi16(in6, 1); 280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian in7 = _mm_srai_epi16(in7, 1); 281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian iscan_ptr += n_coeffs; 284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff_ptr += n_coeffs; 285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dqcoeff_ptr += n_coeffs; 286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian n_coeffs = -n_coeffs; 287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian zero = _mm_setzero_si128(); 288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (!skip_block) { 290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i eob; 291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i round, quant, dequant, thr; 292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t nzflag; 293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 294da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i coeff0, coeff1; 295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Setup global values 297da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 2987bc9febe8749e98a3812a0dc4380ceae75c29450Johann round = _mm_load_si128((const __m128i *)round_ptr); 2997bc9febe8749e98a3812a0dc4380ceae75c29450Johann quant = _mm_load_si128((const __m128i *)quant_ptr); 3007bc9febe8749e98a3812a0dc4380ceae75c29450Johann dequant = _mm_load_si128((const __m128i *)dequant_ptr); 301da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 302da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 303da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 304da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i coeff0_sign, coeff1_sign; 305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i qcoeff0, qcoeff1; 306da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i qtmp0, qtmp1; 307da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Do DC and first 15 AC 308da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff0 = *in[0]; 309da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff1 = *in[1]; 310da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 311da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Poor man's sign extract 312da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff0_sign = _mm_srai_epi16(coeff0, 15); 313da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff1_sign = _mm_srai_epi16(coeff1, 15); 314da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); 315da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); 316da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 317da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 318da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 319da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_adds_epi16(qcoeff0, round); 320da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian round = _mm_unpackhi_epi64(round, round); 321da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_adds_epi16(qcoeff1, round); 322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); 323da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian quant = _mm_unpackhi_epi64(quant, quant); 324da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); 325da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 326da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Reinsert signs 327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); 328da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); 329da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3327bc9febe8749e98a3812a0dc4380ceae75c29450Johann store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); 3337bc9febe8749e98a3812a0dc4380ceae75c29450Johann store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); 334da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff0 = _mm_mullo_epi16(qcoeff0, dequant); 336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dequant = _mm_unpackhi_epi64(dequant, dequant); 337da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff1 = _mm_mullo_epi16(qcoeff1, dequant); 338da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3397bc9febe8749e98a3812a0dc4380ceae75c29450Johann store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); 3407bc9febe8749e98a3812a0dc4380ceae75c29450Johann store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); 341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 344da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Scan for eob 345da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i zero_coeff0, zero_coeff1; 346da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i nzero_coeff0, nzero_coeff1; 347da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i iscan0, iscan1; 348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i eob1; 349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); 350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); 351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); 352da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); 3537bc9febe8749e98a3812a0dc4380ceae75c29450Johann iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); 3547bc9febe8749e98a3812a0dc4380ceae75c29450Johann iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); 355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Add one to convert from indices to counts 356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); 357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); 358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob = _mm_and_si128(iscan0, nzero_coeff0); 359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob1 = _mm_and_si128(iscan1, nzero_coeff1); 360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob = _mm_max_epi16(eob, eob1); 361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian n_coeffs += 8 * 2; 363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 365da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // AC only loop 366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian index = 2; 367da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian thr = _mm_srai_epi16(dequant, 1); 368da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian while (n_coeffs < 0) { 369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i coeff0, coeff1; 370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i coeff0_sign, coeff1_sign; 372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i qcoeff0, qcoeff1; 373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i qtmp0, qtmp1; 374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1); 376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff0 = *in[index]; 377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff1 = *in[index + 1]; 378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Poor man's sign extract 380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff0_sign = _mm_srai_epi16(coeff0, 15); 381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff1_sign = _mm_srai_epi16(coeff1, 15); 382da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); 383da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); 384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | 3887bc9febe8749e98a3812a0dc4380ceae75c29450Johann _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); 389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (nzflag) { 391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_adds_epi16(qcoeff0, round); 392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_adds_epi16(qcoeff1, round); 393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); 394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); 395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Reinsert signs 397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); 398da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); 399da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 400da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 4027bc9febe8749e98a3812a0dc4380ceae75c29450Johann store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); 4037bc9febe8749e98a3812a0dc4380ceae75c29450Johann store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); 404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff0 = _mm_mullo_epi16(qcoeff0, dequant); 406da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian coeff1 = _mm_mullo_epi16(qcoeff1, dequant); 407da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 4087bc9febe8749e98a3812a0dc4380ceae75c29450Johann store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); 4097bc9febe8749e98a3812a0dc4380ceae75c29450Johann store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); 410da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 4117bc9febe8749e98a3812a0dc4380ceae75c29450Johann // Maybe a more efficient way to store 0? 4127bc9febe8749e98a3812a0dc4380ceae75c29450Johann store_zero_tran_low(qcoeff_ptr + n_coeffs); 4137bc9febe8749e98a3812a0dc4380ceae75c29450Johann store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); 414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 4157bc9febe8749e98a3812a0dc4380ceae75c29450Johann store_zero_tran_low(dqcoeff_ptr + n_coeffs); 4167bc9febe8749e98a3812a0dc4380ceae75c29450Johann store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); 417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (nzflag) { 421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Scan for eob 422da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i zero_coeff0, zero_coeff1; 423da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i nzero_coeff0, nzero_coeff1; 424da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i iscan0, iscan1; 425da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i eob0, eob1; 426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); 427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); 428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); 429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); 4307bc9febe8749e98a3812a0dc4380ceae75c29450Johann iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); 4317bc9febe8749e98a3812a0dc4380ceae75c29450Johann iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); 432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Add one to convert from indices to counts 433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); 434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); 435da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob0 = _mm_and_si128(iscan0, nzero_coeff0); 436da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob1 = _mm_and_si128(iscan1, nzero_coeff1); 437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob0 = _mm_max_epi16(eob0, eob1); 438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob = _mm_max_epi16(eob, eob0); 439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian n_coeffs += 8 * 2; 441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian index += 2; 442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian // Accumulate EOB 445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian { 446da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian __m128i eob_shuffled; 447da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob_shuffled = _mm_shuffle_epi32(eob, 0xe); 448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob = _mm_max_epi16(eob, eob_shuffled); 449da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); 450da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob = _mm_max_epi16(eob, eob_shuffled); 451da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); 452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian eob = _mm_max_epi16(eob, eob_shuffled); 453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *eob_ptr = _mm_extract_epi16(eob, 1); 454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian do { 4577bc9febe8749e98a3812a0dc4380ceae75c29450Johann store_zero_tran_low(dqcoeff_ptr + n_coeffs); 4587bc9febe8749e98a3812a0dc4380ceae75c29450Johann store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); 4597bc9febe8749e98a3812a0dc4380ceae75c29450Johann store_zero_tran_low(qcoeff_ptr + n_coeffs); 4607bc9febe8749e98a3812a0dc4380ceae75c29450Johann store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); 461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian n_coeffs += 8 * 2; 462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } while (n_coeffs < 0); 463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *eob_ptr = 0; 464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 466