/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include <arm_neon.h>
127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "vp8/encoder/block.h"
137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/*
 * Per-coefficient position table with 1-based values (1..16).
 * Judging by its name, entry i is presumably the zig-zag scan position of
 * coefficient i, plus one. The quantizer below ANDs this table against a
 * non-zero mask, so 0 stays free to mean "coefficient is zero".
 * It is laid out here as two runs of eight because the quantizer loads it
 * as two 8-lane vectors.
 */
static const uint16_t inv_zig_zag[16] = {
    /* entries 0..7 */
    1, 2, 6, 7, 3, 5, 8, 13,
    /* entries 8..15 */
    4, 9, 12, 14, 10, 11, 15, 16
};
207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/*
 * Quantize the 16 coefficients of one block with NEON, 8 lanes at a time.
 *
 * Reads  b->coeff, b->round, b->quant_fast and d->dequant (16 int16 each).
 * Writes d->qcoeff and d->dqcoeff (16 int16 each) and one byte at d->eob.
 *
 * Per lane:
 *   q  = sign(z) * ((((|z| + round) * quant * 2) >> 16) >> 1)
 *   dq = q * dequant
 * The eob byte is the largest inv_zig_zag[] entry among lanes with q != 0,
 * or 0 when every q is zero.
 */
void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
    /* All bits set: vtstq against this yields 0xffff for every non-zero lane. */
    const int16x8_t all_bits = vdupq_n_s16(-1);

    /* Inputs, split into a low (0..7) and a high (8..15) half. */
    const int16x8_t coeff_lo = vld1q_s16(b->coeff);
    const int16x8_t coeff_hi = vld1q_s16(b->coeff + 8);
    const int16x8_t rnd_lo = vld1q_s16(b->round);
    const int16x8_t rnd_hi = vld1q_s16(b->round + 8);
    const int16x8_t qmul_lo = vld1q_s16(b->quant_fast);
    const int16x8_t qmul_hi = vld1q_s16(b->quant_fast + 8);
    const int16x8_t deq_lo = vld1q_s16(d->dequant);
    const int16x8_t deq_hi = vld1q_s16(d->dequant + 8);
    const uint16x8_t pos_lo = vld1q_u16(inv_zig_zag);
    const uint16x8_t pos_hi = vld1q_u16(inv_zig_zag + 8);

    /* Arithmetic shift by 15 smears the sign bit: 0 or -1 per lane. */
    const int16x8_t sign_lo = vshrq_n_s16(coeff_lo, 15);
    const int16x8_t sign_hi = vshrq_n_s16(coeff_hi, 15);

    /* |z| + round */
    const int16x8_t biased_lo = vaddq_s16(vabsq_s16(coeff_lo), rnd_lo);
    const int16x8_t biased_hi = vaddq_s16(vabsq_s16(coeff_hi), rnd_hi);

    /*
     * vqdmulhq returns the high half of the doubled product, so the extra
     * shift right by one undoes the doubling.
     */
    const int16x8_t mag_lo = vshrq_n_s16(vqdmulhq_s16(biased_lo, qmul_lo), 1);
    const int16x8_t mag_hi = vshrq_n_s16(vqdmulhq_s16(biased_hi, qmul_hi), 1);

    /* Re-apply the sign: (m ^ s) - s negates m when s is -1, else keeps m. */
    const int16x8_t q_lo = vsubq_s16(veorq_s16(mag_lo, sign_lo), sign_lo);
    const int16x8_t q_hi = vsubq_s16(veorq_s16(mag_hi, sign_hi), sign_hi);

    /* Keep the table entry for non-zero lanes, 0 for zero lanes. */
    const uint16x8_t hit_lo = vandq_u16(vtstq_s16(q_lo, all_bits), pos_lo);
    const uint16x8_t hit_hi = vandq_u16(vtstq_s16(q_hi, all_bits), pos_hi);

    /* Horizontal maximum over all 16 lanes, narrowing 8 -> 4 -> 2 -> 1. */
    const uint16x8_t max_q16 = vmaxq_u16(hit_lo, hit_hi);
    const uint16x4_t max_d16 =
        vmax_u16(vget_low_u16(max_q16), vget_high_u16(max_q16));
    const uint32x4_t max_q32 = vmovl_u16(max_d16);
    const uint32x2_t max_pair =
        vmax_u32(vget_low_u32(max_q32), vget_high_u32(max_q32));
    const uint32x2_t last_pos = vpmax_u32(max_pair, max_pair);

    /* Quantized coefficients. */
    vst1q_s16(d->qcoeff, q_lo);
    vst1q_s16(d->qcoeff + 8, q_hi);

    /* Dequantized coefficients: q * dequant. */
    vst1q_s16(d->dqcoeff, vmulq_s16(deq_lo, q_lo));
    vst1q_s16(d->dqcoeff + 8, vmulq_s16(deq_hi, q_hi));

    /* Only byte lane 0 of the reduced maximum is written to *d->eob. */
    vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(last_pos), 0);
}
90