1da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* 2da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * 4da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Use of this source code is governed by a BSD-style license 5da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * that can be found in the LICENSE file in the root of the source 6da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * tree. An additional intellectual property rights grant can be found 7da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * in the file PATENTS. All contributing project authors may 8da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * be found in the AUTHORS file in the root of the source tree. 9da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian */ 10da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 11da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include <arm_neon.h> 12da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 13da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "./vpx_config.h" 14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "./vpx_dsp_rtcd.h" 15da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx/vpx_integer.h" 16df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include "vpx_dsp/arm/mem_neon.h" 17df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include "vpx_dsp/arm/sum_neon.h" 18da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 19df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride, 20df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *const ref[4], int ref_stride, 
21df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t *res) { 22df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int i; 23df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8x16_t src_u8 = load_unaligned_u8q(src, src_stride); 24df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (i = 0; i < 4; ++i) { 25df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8x16_t ref_u8 = load_unaligned_u8q(ref[i], ref_stride); 26df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); 27df37111358d02836cb29bbcb9c6e4c95dff90a16Johann abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); 28df37111358d02836cb29bbcb9c6e4c95dff90a16Johann res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); 29df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 30da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 31da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 32df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride, 33df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *const ref[4], int ref_stride, 34df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t *res) { 35df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int i; 36df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8x16_t src_0 = load_unaligned_u8q(src, src_stride); 37df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8x16_t src_1 = load_unaligned_u8q(src + 4 * src_stride, src_stride); 38df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (i = 0; i < 4; ++i) { 39df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8x16_t ref_0 = load_unaligned_u8q(ref[i], ref_stride); 40df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8x16_t ref_1 = 41df37111358d02836cb29bbcb9c6e4c95dff90a16Johann load_unaligned_u8q(ref[i] + 4 * ref_stride, ref_stride); 42df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint16x8_t abs = vabdl_u8(vget_low_u8(src_0), vget_low_u8(ref_0)); 
43df37111358d02836cb29bbcb9c6e4c95dff90a16Johann abs = vabal_u8(abs, vget_high_u8(src_0), vget_high_u8(ref_0)); 44df37111358d02836cb29bbcb9c6e4c95dff90a16Johann abs = vabal_u8(abs, vget_low_u8(src_1), vget_low_u8(ref_1)); 45df37111358d02836cb29bbcb9c6e4c95dff90a16Johann abs = vabal_u8(abs, vget_high_u8(src_1), vget_high_u8(ref_1)); 46df37111358d02836cb29bbcb9c6e4c95dff90a16Johann res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); 47df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 48df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 49df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 50df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic INLINE void sad8x_4d(const uint8_t *a, int a_stride, 51df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *const b[4], int b_stride, 52df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t *result, const int height) { 53df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int i, j; 54df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), 55df37111358d02836cb29bbcb9c6e4c95dff90a16Johann vdupq_n_u16(0) }; 56df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; 57df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 58df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (i = 0; i < height; ++i) { 59df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8x8_t a_u8 = vld1_u8(a); 60df37111358d02836cb29bbcb9c6e4c95dff90a16Johann a += a_stride; 61df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (j = 0; j < 4; ++j) { 62df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8x8_t b_u8 = vld1_u8(b_loop[j]); 63df37111358d02836cb29bbcb9c6e4c95dff90a16Johann b_loop[j] += b_stride; 64df37111358d02836cb29bbcb9c6e4c95dff90a16Johann sum[j] = vabal_u8(sum[j], a_u8, b_u8); 65df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 66df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 67df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 
68df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (j = 0; j < 4; ++j) { 69df37111358d02836cb29bbcb9c6e4c95dff90a16Johann result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); 70df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 71df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 72df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 73df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride, 74df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *const ref[4], int ref_stride, 75df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t *res) { 76df37111358d02836cb29bbcb9c6e4c95dff90a16Johann sad8x_4d(src, src_stride, ref, ref_stride, res, 4); 77df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 78df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 79df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_sad8x8x4d_neon(const uint8_t *src, int src_stride, 80df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *const ref[4], int ref_stride, 81df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t *res) { 82df37111358d02836cb29bbcb9c6e4c95dff90a16Johann sad8x_4d(src, src_stride, ref, ref_stride, res, 8); 83df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 84df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 85df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride, 86df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *const ref[4], int ref_stride, 87df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t *res) { 88df37111358d02836cb29bbcb9c6e4c95dff90a16Johann sad8x_4d(src, src_stride, ref, ref_stride, res, 16); 89df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 90df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 91df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic INLINE void sad16x_4d(const uint8_t *a, int a_stride, 92df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *const b[4], int b_stride, 93df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t 
*result, const int height) { 94df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int i, j; 95df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), 96df37111358d02836cb29bbcb9c6e4c95dff90a16Johann vdupq_n_u16(0) }; 97df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; 98df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 99df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (i = 0; i < height; ++i) { 100df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8x16_t a_u8 = vld1q_u8(a); 101df37111358d02836cb29bbcb9c6e4c95dff90a16Johann a += a_stride; 102df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (j = 0; j < 4; ++j) { 103df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8x16_t b_u8 = vld1q_u8(b_loop[j]); 104df37111358d02836cb29bbcb9c6e4c95dff90a16Johann b_loop[j] += b_stride; 105df37111358d02836cb29bbcb9c6e4c95dff90a16Johann sum[j] = vabal_u8(sum[j], vget_low_u8(a_u8), vget_low_u8(b_u8)); 106df37111358d02836cb29bbcb9c6e4c95dff90a16Johann sum[j] = vabal_u8(sum[j], vget_high_u8(a_u8), vget_high_u8(b_u8)); 107df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 108df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 109df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 110df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (j = 0; j < 4; ++j) { 111df37111358d02836cb29bbcb9c6e4c95dff90a16Johann result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); 112df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 113df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 114df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 115df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride, 116df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *const ref[4], int ref_stride, 117df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t *res) { 118df37111358d02836cb29bbcb9c6e4c95dff90a16Johann sad16x_4d(src, src_stride, ref, ref_stride, 
res, 8); 119df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 120df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 121df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, 1227bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *const ref[4], int ref_stride, 123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t *res) { 124df37111358d02836cb29bbcb9c6e4c95dff90a16Johann sad16x_4d(src, src_stride, ref, ref_stride, res, 16); 125df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 126df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 127df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride, 128df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *const ref[4], int ref_stride, 129df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t *res) { 130df37111358d02836cb29bbcb9c6e4c95dff90a16Johann sad16x_4d(src, src_stride, ref, ref_stride, res, 32); 131df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 132df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 133df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic INLINE void sad32x_4d(const uint8_t *a, int a_stride, 134df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *const b[4], int b_stride, 135df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t *result, const int height) { 136df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int i, j; 137df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), 138df37111358d02836cb29bbcb9c6e4c95dff90a16Johann vdupq_n_u16(0) }; 139df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; 140df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 141df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (i = 0; i < height; ++i) { 142df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8x16_t a_0 = vld1q_u8(a); 143df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 
const uint8x16_t a_1 = vld1q_u8(a + 16); 144df37111358d02836cb29bbcb9c6e4c95dff90a16Johann a += a_stride; 145df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (j = 0; j < 4; ++j) { 146df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8x16_t b_0 = vld1q_u8(b_loop[j]); 147df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8x16_t b_1 = vld1q_u8(b_loop[j] + 16); 148df37111358d02836cb29bbcb9c6e4c95dff90a16Johann b_loop[j] += b_stride; 149df37111358d02836cb29bbcb9c6e4c95dff90a16Johann sum[j] = vabal_u8(sum[j], vget_low_u8(a_0), vget_low_u8(b_0)); 150df37111358d02836cb29bbcb9c6e4c95dff90a16Johann sum[j] = vabal_u8(sum[j], vget_high_u8(a_0), vget_high_u8(b_0)); 151df37111358d02836cb29bbcb9c6e4c95dff90a16Johann sum[j] = vabal_u8(sum[j], vget_low_u8(a_1), vget_low_u8(b_1)); 152df37111358d02836cb29bbcb9c6e4c95dff90a16Johann sum[j] = vabal_u8(sum[j], vget_high_u8(a_1), vget_high_u8(b_1)); 153df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 156df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (j = 0; j < 4; ++j) { 157df37111358d02836cb29bbcb9c6e4c95dff90a16Johann result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); 158df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 159df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 160df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 161df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_sad32x16x4d_neon(const uint8_t *src, int src_stride, 162df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *const ref[4], int ref_stride, 163df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t *res) { 164df37111358d02836cb29bbcb9c6e4c95dff90a16Johann sad32x_4d(src, src_stride, ref, ref_stride, res, 16); 165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 
// 32x32 variant: compares the block at src against the four blocks ref[0..3]
// and stores the four sums of absolute differences in res[0..3]. Delegates to
// sad32x_4d(), defined earlier in this file.
void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  sad32x_4d(src, src_stride, ref, ref_stride, res, 32);
}

// 32x64 variant; same contract as vpx_sad32x32x4d_neon() with 64 rows.
void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  sad32x_4d(src, src_stride, ref, ref_stride, res, 64);
}

// Adds the absolute differences of 32 byte pairs (a_0 vs b_0 and a_1 vs b_1)
// into the eight 16-bit lanes of *sum. Each call adds at most 4 * 255 to any
// one lane.
static INLINE void sum64x(const uint8x16_t a_0, const uint8x16_t a_1,
                          const uint8x16_t b_0, const uint8x16_t b_1,
                          uint16x8_t *sum) {
  *sum = vabal_u8(*sum, vget_low_u8(a_0), vget_low_u8(b_0));
  *sum = vabal_u8(*sum, vget_high_u8(a_0), vget_high_u8(b_0));
  *sum = vabal_u8(*sum, vget_low_u8(a_1), vget_low_u8(b_1));
  *sum = vabal_u8(*sum, vget_high_u8(a_1), vget_high_u8(b_1));
}

// Shared kernel for 64-pixel-wide blocks.
//   a, a_stride: source block and its row pitch.
//   b, b_stride: the four reference blocks and their common row pitch.
//   result:      receives the four SAD totals.
//   height:      number of rows to process.
// Two 16-bit accumulators are kept per reference, one for bytes 0..31 of each
// row and one for bytes 32..63. Each lane then gains at most 4 * 255 per row,
// i.e. 65280 over the 64 rows of the tallest caller, which still fits in 16
// bits; a single accumulator for the whole row would not.
// NOTE(review): horizontal_add_long_uint16x8() is not defined in this file; it
// presumably widens while adding the two accumulators so the combined total
// cannot wrap - confirm against its definition.
static INLINE void sad64x_4d(const uint8_t *a, int a_stride,
                             const uint8_t *const b[4], int b_stride,
                             uint32_t *result, const int height) {
  int i;
  uint16x8_t sum_0 = vdupq_n_u16(0);
  uint16x8_t sum_1 = vdupq_n_u16(0);
  uint16x8_t sum_2 = vdupq_n_u16(0);
  uint16x8_t sum_3 = vdupq_n_u16(0);
  uint16x8_t sum_4 = vdupq_n_u16(0);
  uint16x8_t sum_5 = vdupq_n_u16(0);
  uint16x8_t sum_6 = vdupq_n_u16(0);
  uint16x8_t sum_7 = vdupq_n_u16(0);
  // Private, advanceable copies of the reference pointers; b[] itself is const.
  const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] };

  for (i = 0; i < height; ++i) {
    // The source row is loaded once and reused for all four references.
    const uint8x16_t a_0 = vld1q_u8(a);
    const uint8x16_t a_1 = vld1q_u8(a + 16);
    const uint8x16_t a_2 = vld1q_u8(a + 32);
    const uint8x16_t a_3 = vld1q_u8(a + 48);
    a += a_stride;
    sum64x(a_0, a_1, vld1q_u8(b_loop[0]), vld1q_u8(b_loop[0] + 16), &sum_0);
    sum64x(a_2, a_3, vld1q_u8(b_loop[0] + 32), vld1q_u8(b_loop[0] + 48),
           &sum_1);
    b_loop[0] += b_stride;
    sum64x(a_0, a_1, vld1q_u8(b_loop[1]), vld1q_u8(b_loop[1] + 16), &sum_2);
    sum64x(a_2, a_3, vld1q_u8(b_loop[1] + 32), vld1q_u8(b_loop[1] + 48),
           &sum_3);
    b_loop[1] += b_stride;
    sum64x(a_0, a_1, vld1q_u8(b_loop[2]), vld1q_u8(b_loop[2] + 16), &sum_4);
    sum64x(a_2, a_3, vld1q_u8(b_loop[2] + 32), vld1q_u8(b_loop[2] + 48),
           &sum_5);
    b_loop[2] += b_stride;
    sum64x(a_0, a_1, vld1q_u8(b_loop[3]), vld1q_u8(b_loop[3] + 16), &sum_6);
    sum64x(a_2, a_3, vld1q_u8(b_loop[3] + 32), vld1q_u8(b_loop[3] + 48),
           &sum_7);
    b_loop[3] += b_stride;
  }

  result[0] = vget_lane_u32(horizontal_add_long_uint16x8(sum_0, sum_1), 0);
  result[1] = vget_lane_u32(horizontal_add_long_uint16x8(sum_2, sum_3), 0);
  result[2] = vget_lane_u32(horizontal_add_long_uint16x8(sum_4, sum_5), 0);
  result[3] = vget_lane_u32(horizontal_add_long_uint16x8(sum_6, sum_7), 0);
}

// 64x32 variant; same contract as vpx_sad32x32x4d_neon() for a 64x32 block.
void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  sad64x_4d(src, src_stride, ref, ref_stride, res, 32);
}

// 64x64 variant; same contract as vpx_sad32x32x4d_neon() for a 64x64 block.
void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  sad64x_4d(src, src_stride, ref, ref_stride, res, 64);
}