17ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian/* 27ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 37ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian * 47ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian * Use of this source code is governed by a BSD-style license 57ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian * that can be found in the LICENSE file in the root of the source 67ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian * tree. An additional intellectual property rights grant can be found 77ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian * in the file PATENTS. All contributing project authors may 87ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian * be found in the AUTHORS file in the root of the source tree. 97ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian */ 107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include <arm_neon.h> 127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "./vpx_dsp_rtcd.h" 147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "./vpx_config.h" 157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "vpx/vpx_integer.h" 167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstatic INLINE void loop_filter_neon_16( 187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t qblimit, // blimit 197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t qlimit, // limit 207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t qthresh, // thresh 217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t q3, // p3 227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t q4, // p2 237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t q5, // p1 247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t q6, // p0 257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t q7, // q0 267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t q8, // q1 277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t q9, // q2 287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t q10, // q3 297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t *q5r, // p1 307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t *q6r, // p0 317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t *q7r, // q0 327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t *q8r) { // q1 337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; 347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int16x8_t q2s16, q11s16; 357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint16x8_t q4u16; 367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8; 377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int8x8_t d2s8, d3s8; 387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q11u8 = vabdq_u8(q3, q4); 407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q12u8 = vabdq_u8(q4, q5); 417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q13u8 = vabdq_u8(q5, q6); 427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q14u8 = vabdq_u8(q8, q7); 437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q3 = vabdq_u8(q9, q8); 447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q4 = vabdq_u8(q10, q9); 457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q11u8 = vmaxq_u8(q11u8, q12u8); 477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q12u8 = vmaxq_u8(q13u8, q14u8); 487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q3 = vmaxq_u8(q3, q4); 497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q15u8 = vmaxq_u8(q11u8, q12u8); 507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q9 = vabdq_u8(q6, q7); 527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian // vp8_hevmask 547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q13u8 = vcgtq_u8(q13u8, qthresh); 557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q14u8 = vcgtq_u8(q14u8, qthresh); 567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q15u8 = vmaxq_u8(q15u8, q3); 577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q2u8 = vabdq_u8(q5, q8); 597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q9 = vqaddq_u8(q9, q9); 607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q15u8 = vcgeq_u8(qlimit, q15u8); 627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian // vp8_filter() function 647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian // convert to signed 657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q10 = vdupq_n_u8(0x80); 667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q8 = veorq_u8(q8, q10); 677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q7 = veorq_u8(q7, q10); 687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q6 = veorq_u8(q6, q10); 697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q5 = veorq_u8(q5, q10); 707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q2u8 = vshrq_n_u8(q2u8, 1); 727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q9 = vqaddq_u8(q9, q2u8); 737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), 757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vget_low_s8(vreinterpretq_s8_u8(q6))); 767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), 777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vget_high_s8(vreinterpretq_s8_u8(q6))); 787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q9 = vcgeq_u8(qblimit, q9); 807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), 827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vreinterpretq_s8_u8(q8)); 837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q14u8 = vorrq_u8(q13u8, q14u8); 857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q4u16 = vdupq_n_u16(3); 877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); 887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); 897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); 917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q15u8 = vandq_u8(q15u8, q9); 927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q1s8 = vreinterpretq_s8_u8(q1u8); 947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); 957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); 967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q4 = vdupq_n_u8(3); 987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q9 = vdupq_n_u8(4); 997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) 1007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian d2s8 = vqmovn_s16(q2s16); 1017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian d3s8 = vqmovn_s16(q11s16); 1027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q1s8 = vcombine_s8(d2s8, d3s8); 1037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); 1047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q1s8 = vreinterpretq_s8_u8(q1u8); 1057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4)); 1077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); 1087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q2s8 = vshrq_n_s8(q2s8, 3); 1097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q1s8 = vshrq_n_s8(q1s8, 3); 1107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); 1127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); 1137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q1s8 = vrshrq_n_s8(q1s8, 1); 1157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); 1167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); 1187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); 1197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10); 1217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10); 1227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10); 1237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10); 1247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian return; 1257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 1267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */, 1287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8_t *blimit0, 1297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8_t *limit0, 1307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8_t *thresh0, 1317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8_t *blimit1, 1327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8_t *limit1, 1337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8_t *thresh1) { 1347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1; 1357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t qblimit, qlimit, qthresh; 1367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; 1377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian dblimit0 = vld1_u8(blimit0); 1397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian dlimit0 = vld1_u8(limit0); 1407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian dthresh0 = vld1_u8(thresh0); 1417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian dblimit1 = vld1_u8(blimit1); 1427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian dlimit1 = vld1_u8(limit1); 1437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian dthresh1 = vld1_u8(thresh1); 1447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian qblimit = vcombine_u8(dblimit0, dblimit1); 1457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian qlimit = vcombine_u8(dlimit0, dlimit1); 1467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian qthresh = vcombine_u8(dthresh0, dthresh1); 1477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian s -= (p << 2); 1497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q3u8 = vld1q_u8(s); 1517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian s += p; 1527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q4u8 = vld1q_u8(s); 1537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian s += p; 1547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q5u8 = vld1q_u8(s); 1557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian s += p; 1567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q6u8 = vld1q_u8(s); 1577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian s += p; 1587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q7u8 = vld1q_u8(s); 1597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian s += p; 1607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q8u8 = vld1q_u8(s); 1617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian s += p; 1627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q9u8 = vld1q_u8(s); 1637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian s += p; 1647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q10u8 = vld1q_u8(s); 1657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian loop_filter_neon_16(qblimit, qlimit, qthresh, 1677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, 1687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian &q5u8, &q6u8, &q7u8, &q8u8); 1697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian s -= (p * 5); 1717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vst1q_u8(s, q5u8); 1727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian s += p; 1737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vst1q_u8(s, q6u8); 1747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian s += p; 1757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vst1q_u8(s, q7u8); 1767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian s += p; 1777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vst1q_u8(s, q8u8); 1787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian return; 1797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 180