sad_neon.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04
/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"

#include "vpx/vpx_integer.h"

unsigned int vpx_sad8x16_neon(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride) {
  uint8x8_t d0, d8;
  uint16x8_t q12;
  uint32x4_t q1;
  uint64x2_t q3;
  uint32x2_t d5;
  int i;

  /* The first row seeds the accumulator; the remaining 15 rows are added
   * with a widening absolute-difference-accumulate. */
  d0 = vld1_u8(src_ptr);
  src_ptr += src_stride;
  d8 = vld1_u8(ref_ptr);
  ref_ptr += ref_stride;
  q12 = vabdl_u8(d0, d8);

  for (i = 0; i < 15; i++) {
    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabal_u8(q12, d0, d8);
  }

  q1 = vpaddlq_u16(q12);
  q3 = vpaddlq_u32(q1);
  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                vreinterpret_u32_u64(vget_high_u64(q3)));

  return vget_lane_u32(d5, 0);
}

unsigned int vpx_sad4x4_neon(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride) {
  uint8x8_t d0, d8;
  uint16x8_t q12;
  uint32x2_t d1;
  uint64x1_t d3;
  int i;

  d0 = vld1_u8(src_ptr);
  src_ptr += src_stride;
  d8 = vld1_u8(ref_ptr);
  ref_ptr += ref_stride;
  q12 = vabdl_u8(d0, d8);

  for (i = 0; i < 3; i++) {
    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabal_u8(q12, d0, d8);
  }

  /* Each vld1_u8 loads eight bytes, but only the first four pixels of each
   * row belong to the 4x4 block, so only the low half of the accumulator is
   * reduced. */
  d1 = vpaddl_u16(vget_low_u16(q12));
  d3 = vpaddl_u32(d1);

  return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}

unsigned int vpx_sad16x8_neon(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride) {
  uint8x16_t q0, q4;
  uint16x8_t q12, q13;
  uint32x4_t q1;
  uint64x2_t q3;
  uint32x2_t d5;
  int i;

  q0 = vld1q_u8(src_ptr);
  src_ptr += src_stride;
  q4 = vld1q_u8(ref_ptr);
  ref_ptr += ref_stride;
  q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
  q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));

  for (i = 0; i < 7; i++) {
    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
    q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
  }

  q12 = vaddq_u16(q12, q13);
  q1 = vpaddlq_u16(q12);
  q3 = vpaddlq_u32(q1);
  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                vreinterpret_u32_u64(vget_high_u64(q3)));

  return vget_lane_u32(d5, 0);
}

/* Reduce two 16-bit accumulators to a scalar, widening to 32 bits before the
 * two halves are combined. */
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
                                        vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
                                        vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}

/* Reduce a single 16-bit accumulator to a scalar. */
static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
  const uint32x4_t a = vpaddlq_u16(vec_16x8);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}

unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
  /* Each row is four 16-byte loads. Splitting the work across two
   * accumulators keeps every 16-bit lane at no more than 256 absolute
   * differences (64 rows x 4 chunks), so the lanes cannot overflow before
   * the long horizontal add widens them to 32 bits. */
  for (i = 0; i < 64; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
                            vget_low_u8(vec_ref_32));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
                            vget_high_u8(vec_ref_32));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
                            vget_low_u8(vec_ref_48));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
                            vget_high_u8(vec_ref_48));
  }
  return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
}

unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 32; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}

unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 16; ++i) {
    const uint8x16_t vec_src = vld1q_u8(src);
    const uint8x16_t vec_ref = vld1q_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src),
                            vget_low_u8(vec_ref));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src),
                            vget_high_u8(vec_ref));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}

unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum = vdupq_n_u16(0);

  for (i = 0; i < 8; ++i) {
    const uint8x8_t vec_src = vld1_u8(src);
    const uint8x8_t vec_ref = vld1_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
  }
  return horizontal_add_16x8(vec_accum);
}
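
For reference, a minimal sketch of calling one of the kernels above directly. The buffer names, sizes, and fill values are illustrative assumptions, not part of the original file; within libvpx these functions are normally selected through the run-time CPU dispatch layer rather than called by name.

/* Hypothetical usage sketch: 16x16 blocks stored contiguously (stride 16). */
#include <stdint.h>
#include <stdio.h>

unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride);

int main(void) {
  uint8_t src_block[16 * 16];
  uint8_t ref_block[16 * 16];
  int i;

  /* Illustrative data: a constant difference of 1 per pixel gives SAD 256. */
  for (i = 0; i < 16 * 16; ++i) {
    src_block[i] = 100;
    ref_block[i] = 101;
  }

  printf("SAD: %u\n", vpx_sad16x16_neon(src_block, 16, ref_block, 16));
  return 0;
}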