/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_ports/mem.h"

/*
 * NEON implementations of the high-bitdepth 8-tap sub-pixel convolution
 * (horizontal, vertical, and their averaging variants).  Samples are
 * 16-bit; filter taps come from the InterpKernel table.  The optimized
 * paths only handle the integer-step case (step_q4 == 16); everything
 * else falls back to the C reference implementation.
 */

/* Load four rows of four 16-bit samples; p is the stride in elements. */
static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p,
                            int16x4_t *const s0, int16x4_t *const s1,
                            int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

/* Load four rows of eight 16-bit samples; p is the stride in elements. */
static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p,
                            uint16x8_t *const s0, uint16x8_t *const s1,
                            uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
}

/* Load eight rows of eight 16-bit samples; p is the stride in elements. */
static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p,
                            int16x8_t *const s0, int16x8_t *const s1,
                            int16x8_t *const s2, int16x8_t *const s3,
                            int16x8_t *const s4, int16x8_t *const s5,
                            int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

/* Store eight rows of eight 16-bit samples; p is the stride in elements. */
static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p,
                             const uint16x8_t s0, const uint16x8_t s1,
                             const uint16x8_t s2, const uint16x8_t s3,
                             const uint16x8_t s4, const uint16x8_t s5,
                             const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += p;
  vst1q_u16(s, s1);
  s += p;
  vst1q_u16(s, s2);
  s += p;
  vst1q_u16(s, s3);
  s += p;
  vst1q_u16(s, s4);
  s += p;
  vst1q_u16(s, s5);
  s += p;
  vst1q_u16(s, s6);
  s += p;
  vst1q_u16(s, s7);
}

/*
 * 8-tap filter applied to four pixels (s0..s7 hold eight consecutive
 * samples per output position).  Returns the raw 32-bit accumulator;
 * rounding/narrowing is left to the caller.
 */
static INLINE int32x4_t highbd_convolve8_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int32x4_t sum;

  sum = vmull_lane_s16(s0, filters_lo, 0);
  sum = vmlal_lane_s16(sum, s1, filters_lo, 1);
  sum = vmlal_lane_s16(sum, s2, filters_lo, 2);
  sum = vmlal_lane_s16(sum, s3, filters_lo, 3);
  sum = vmlal_lane_s16(sum, s4, filters_hi, 0);
  sum = vmlal_lane_s16(sum, s5, filters_hi, 1);
  sum = vmlal_lane_s16(sum, s6, filters_hi, 2);
  sum = vmlal_lane_s16(sum, s7, filters_hi, 3);
  return sum;
}

/*
 * 8-tap filter applied to eight pixels.  Accumulates in two 32-bit
 * halves, then rounds and narrows with a saturating shift by 7 and
 * clamps the result to max ((1 << bd) - 1, supplied by the caller).
 */
static INLINE uint16x8_t
highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
                   const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
                   const int16x8_t s6, const int16x8_t s7,
                   const int16x8_t filters, const uint16x8_t max) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int32x4_t sum0, sum1;
  uint16x8_t d;

  sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filters_hi, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3);
  sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filters_hi, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3);
  d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7));
  d = vminq_u16(d, max);
  return d;
}

/*
 * Horizontal 8-tap convolution.  Falls back to the C version unless the
 * horizontal step is exactly one full pixel (x_step_q4 == 16).  The NEON
 * path loads rows, transposes so that columns become vector lanes,
 * filters, then transposes back before storing.
 */
void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
                                     uint16_t *dst, ptrdiff_t dst_stride,
                                     const InterpKernel *filter, int x0_q4,
                                     int x_step_q4, int y0_q4, int y_step_q4,
                                     int w, int h, int bd) {
  if (x_step_q4 != 16) {
    vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
  } else {
    const int16x8_t filters = vld1q_s16(filter[x0_q4]);
    /* Clamp ceiling for the given bit depth. */
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    uint16x8_t t0, t1, t2, t3;

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    /* Back up 3 samples so the 8-tap window is centered on the pixel. */
    src -= 3;

    if (h == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23;

      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u16_8x4(&t0, &t1, &t2, &t3);
      /* After the transpose, s0..s6 hold source columns 0..6. */
      s0 = vreinterpret_s16_u16(vget_low_u16(t0));
      s1 = vreinterpret_s16_u16(vget_low_u16(t1));
      s2 = vreinterpret_s16_u16(vget_low_u16(t2));
      s3 = vreinterpret_s16_u16(vget_low_u16(t3));
      s4 = vreinterpret_s16_u16(vget_high_u16(t0));
      s5 = vreinterpret_s16_u16(vget_high_u16(t1));
      s6 = vreinterpret_s16_u16(vget_high_u16(t2));
      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      src += 7;

      /* Process 4 output columns per iteration across the full width. */
      do {
        load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
        transpose_s16_4x4d(&s7, &s8, &s9, &s10);

        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        /* Round (>> 7), saturate, and clamp to the bit-depth maximum. */
        d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        d01 = vminq_u16(d01, max);
        d23 = vminq_u16(d23, max);
        transpose_u16_4x4q(&d01, &d23);

        vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
        vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
        vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
        vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));

        /* Rotate the column registers by 4 for the next iteration. */
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        src += 4;
        dst += 4;
        w -= 4;
      } while (w > 0);
    } else {
      int16x8_t t4, t5, t6, t7;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3;

      if (w == 4) {
        /* 4-wide, 8 rows per iteration. */
        do {
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          /* s7 is reloaded here; the transpose below regenerates it. */
          load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
                   &t4, &t5, &t6, &t7);
          src += 8 * src_stride;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);
          transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);

          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          d3 =
              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

          transpose_u16_8x4(&d0, &d1, &d2, &d3);
          /* Each vector now holds two output rows (low/high halves). */
          vst1_u16(dst, vget_low_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d3));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d3));
          dst += dst_stride;
          h -= 8;
        } while (h > 0);
      } else {
        /* General case: 8x8 tiles, outer loop over rows, inner over cols. */
        int width;
        const uint16_t *s;
        uint16_t *d;
        int16x8_t s11, s12, s13, s14;
        uint16x8_t d4, d5, d6, d7;

        do {
          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          width = w;
          s = src + 7;
          d = dst;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);

          do {
            load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
                     &s12, &s13, &s14);
            transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);

            d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
                                    max);
            d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
                                    max);
            d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
                                    max);
            d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
                                    max);
            d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
                                    max);
            d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
                                    max);
            d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
                                    max);
            d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
                                    filters, max);

            transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
            store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

            /* Rotate the column registers by 8 for the next tile. */
            s0 = s8;
            s1 = s9;
            s2 = s10;
            s3 = s11;
            s4 = s12;
            s5 = s13;
            s6 = s14;
            s += 8;
            d += 8;
            width -= 8;
          } while (width > 0);
          src += 8 * src_stride;
          dst += 8 * dst_stride;
          h -= 8;
        } while (h > 0);
      }
    }
  }
}

/*
 * Horizontal 8-tap convolution with rounding average into dst
 * (dst = ROUND(dst + filtered) via vrhaddq_u16).  Structure mirrors
 * vpx_highbd_convolve8_horiz_neon; only the extra dst load/average
 * differs.  Falls back to C when x_step_q4 != 16.
 */
void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
                                         ptrdiff_t src_stride, uint16_t *dst,
                                         ptrdiff_t dst_stride,
                                         const InterpKernel *filter, int x0_q4,
                                         int x_step_q4, int y0_q4,
                                         int y_step_q4, int w, int h, int bd) {
  if (x_step_q4 != 16) {
    vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
                                     x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
                                     bd);
  } else {
    const int16x8_t filters = vld1q_s16(filter[x0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    uint16x8_t t0, t1, t2, t3;

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    /* Back up 3 samples so the 8-tap window is centered on the pixel. */
    src -= 3;

    if (h == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23, t01, t23;

      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u16_8x4(&t0, &t1, &t2, &t3);
      s0 = vreinterpret_s16_u16(vget_low_u16(t0));
      s1 = vreinterpret_s16_u16(vget_low_u16(t1));
      s2 = vreinterpret_s16_u16(vget_low_u16(t2));
      s3 = vreinterpret_s16_u16(vget_low_u16(t3));
      s4 = vreinterpret_s16_u16(vget_high_u16(t0));
      s5 = vreinterpret_s16_u16(vget_high_u16(t1));
      s6 = vreinterpret_s16_u16(vget_high_u16(t2));
      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      src += 7;

      do {
        load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
        transpose_s16_4x4d(&s7, &s8, &s9, &s10);

        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        t01 = vminq_u16(t01, max);
        t23 = vminq_u16(t23, max);
        transpose_u16_4x4q(&t01, &t23);

        /* Load dst rows in the same (transposed) order as t01/t23. */
        d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
                           vld1_u16(dst + 2 * dst_stride));
        d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
                           vld1_u16(dst + 3 * dst_stride));
        /* Rounding halving add: the averaging step. */
        d01 = vrhaddq_u16(d01, t01);
        d23 = vrhaddq_u16(d23, t23);

        vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
        vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
        vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
        vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));

        /* Rotate the column registers by 4 for the next iteration. */
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        src += 4;
        dst += 4;
        w -= 4;
      } while (w > 0);
    } else {
      int16x8_t t4, t5, t6, t7;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;

      if (w == 4) {
        do {
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
                   &t4, &t5, &t6, &t7);
          src += 8 * src_stride;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);
          transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);

          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          t3 =
              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
          transpose_u16_8x4(&t0, &t1, &t2, &t3);

          /* Load dst rows paired to match the transposed output layout. */
          d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
                            vld1_u16(dst + 4 * dst_stride));
          d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
                            vld1_u16(dst + 5 * dst_stride));
          d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
                            vld1_u16(dst + 6 * dst_stride));
          d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride),
                            vld1_u16(dst + 7 * dst_stride));
          d0 = vrhaddq_u16(d0, t0);
          d1 = vrhaddq_u16(d1, t1);
          d2 = vrhaddq_u16(d2, t2);
          d3 = vrhaddq_u16(d3, t3);

          vst1_u16(dst, vget_low_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d3));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d3));
          dst += dst_stride;
          h -= 8;
        } while (h > 0);
      } else {
        int width;
        const uint16_t *s;
        uint16_t *d;
        int16x8_t s11, s12, s13, s14;
        uint16x8_t d4, d5, d6, d7;

        do {
          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          width = w;
          s = src + 7;
          d = dst;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);

          do {
            load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
                     &s12, &s13, &s14);
            transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);

            d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
                                    max);
            d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
                                    max);
            d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
                                    max);
            d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
                                    max);
            d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
                                    max);
            d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
                                    max);
            d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
                                    max);
            d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
                                    filters, max);

            transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

            /* Average with the existing destination rows. */
            d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
            d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
            d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
            d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
            d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride));
            d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride));
            d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride));
            d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride));

            store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

            /* Rotate the column registers by 8 for the next tile. */
            s0 = s8;
            s1 = s9;
            s2 = s10;
            s3 = s11;
            s4 = s12;
            s5 = s13;
            s6 = s14;
            s += 8;
            d += 8;
            width -= 8;
          } while (width > 0);
          src += 8 * src_stride;
          dst += 8 * dst_stride;
          h -= 8;
        } while (h > 0);
      }
    }
  }
}

/*
 * Vertical 8-tap convolution.  No transposes are needed: rows are
 * already contiguous in the filter direction.  Falls back to C when
 * y_step_q4 != 16.
 */
void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
                                    uint16_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *filter, int x0_q4,
                                    int x_step_q4, int y0_q4, int y_step_q4,
                                    int w, int h, int bd) {
  if (y_step_q4 != 16) {
    vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                                x_step_q4, y0_q4, y_step_q4, w, h, bd);
  } else {
    const int16x8_t filters = vld1q_s16(filter[y0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    /* Back up 3 rows so the 8-tap window is centered on the row. */
    src -= 3 * src_stride;

    if (w == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23;

      /* Prime the pipeline with the first 7 rows. */
      s0 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s1 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s2 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s3 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s4 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s5 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s6 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;

      /* Produce 4 output rows per iteration. */
      do {
        s7 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s8 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s9 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s10 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;

        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        /* Round (>> 7), saturate, and clamp to the bit-depth maximum. */
        d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        d01 = vminq_u16(d01, max);
        d23 = vminq_u16(d23, max);
        vst1_u16(dst, vget_low_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_low_u16(d23));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d23));
        dst += dst_stride;

        /* Rotate the row registers by 4 for the next iteration. */
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        h -= 4;
      } while (h > 0);
    } else {
      /* 8-wide columns: outer loop over columns, inner over rows. */
      int height;
      const uint16_t *s;
      uint16_t *d;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        s = src;
        s0 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s1 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s2 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s3 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s4 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s5 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s6 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        d = dst;
        height = h;

        do {
          s7 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s8 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s9 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s10 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;

          __builtin_prefetch(d + 0 * dst_stride);
          __builtin_prefetch(d + 1 * dst_stride);
          __builtin_prefetch(d + 2 * dst_stride);
          __builtin_prefetch(d + 3 * dst_stride);
          __builtin_prefetch(s + 0 * src_stride);
          __builtin_prefetch(s + 1 * src_stride);
          __builtin_prefetch(s + 2 * src_stride);
          __builtin_prefetch(s + 3 * src_stride);
          d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          d3 =
              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

          vst1q_u16(d, d0);
          d += dst_stride;
          vst1q_u16(d, d1);
          d += dst_stride;
          vst1q_u16(d, d2);
          d += dst_stride;
          vst1q_u16(d, d3);
          d += dst_stride;

          /* Rotate the row registers by 4 for the next iteration. */
          s0 = s4;
          s1 = s5;
          s2 = s6;
          s3 = s7;
          s4 = s8;
          s5 = s9;
          s6 = s10;
          height -= 4;
        } while (height > 0);
        src += 8;
        dst += 8;
        w -= 8;
      } while (w > 0);
    }
  }
}

/*
 * Vertical 8-tap convolution with rounding average into dst
 * (dst = ROUND(dst + filtered) via vrhaddq_u16).  Structure mirrors
 * vpx_highbd_convolve8_vert_neon.  Falls back to C when y_step_q4 != 16.
 */
void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
                                        ptrdiff_t src_stride, uint16_t *dst,
                                        ptrdiff_t dst_stride,
                                        const InterpKernel *filter, int x0_q4,
                                        int x_step_q4, int y0_q4, int y_step_q4,
                                        int w, int h, int bd) {
  if (y_step_q4 != 16) {
    vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
                                    bd);
  } else {
    const int16x8_t filters = vld1q_s16(filter[y0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    /* Back up 3 rows so the 8-tap window is centered on the row. */
    src -= 3 * src_stride;

    if (w == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23, t01, t23;

      /* Prime the pipeline with the first 7 rows. */
      s0 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s1 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s2 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s3 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s4 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s5 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s6 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;

      do {
        s7 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s8 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s9 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s10 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;

        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        t01 = vminq_u16(t01, max);
        t23 = vminq_u16(t23, max);

        /* Average with the existing destination rows. */
        d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
                           vld1_u16(dst + 1 * dst_stride));
        d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
                           vld1_u16(dst + 3 * dst_stride));
        d01 = vrhaddq_u16(d01, t01);
        d23 = vrhaddq_u16(d23, t23);

        vst1_u16(dst, vget_low_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_low_u16(d23));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d23));
        dst += dst_stride;

        /* Rotate the row registers by 4 for the next iteration. */
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        h -= 4;
      } while (h > 0);
    } else {
      /* 8-wide columns: outer loop over columns, inner over rows. */
      int height;
      const uint16_t *s;
      uint16_t *d;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        s = src;
        s0 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s1 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s2 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s3 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s4 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s5 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s6 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        d = dst;
        height = h;

        do {
          s7 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s8 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s9 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s10 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;

          __builtin_prefetch(d + 0 * dst_stride);
          __builtin_prefetch(d + 1 * dst_stride);
          __builtin_prefetch(d + 2 * dst_stride);
          __builtin_prefetch(d + 3 * dst_stride);
          __builtin_prefetch(s + 0 * src_stride);
          __builtin_prefetch(s + 1 * src_stride);
          __builtin_prefetch(s + 2 * src_stride);
          __builtin_prefetch(s + 3 * src_stride);
          t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          t3 =
              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

          /* Average with the existing destination rows. */
          d0 = vld1q_u16(d + 0 * dst_stride);
          d1 = vld1q_u16(d + 1 * dst_stride);
          d2 = vld1q_u16(d + 2 * dst_stride);
          d3 = vld1q_u16(d + 3 * dst_stride);
          d0 = vrhaddq_u16(d0, t0);
          d1 = vrhaddq_u16(d1, t1);
          d2 = vrhaddq_u16(d2, t2);
          d3 = vrhaddq_u16(d3, t3);

          vst1q_u16(d, d0);
          d += dst_stride;
          vst1q_u16(d, d1);
          d += dst_stride;
          vst1q_u16(d, d2);
          d += dst_stride;
          vst1q_u16(d, d3);
          d += dst_stride;

          /* Rotate the row registers by 4 for the next iteration. */
          s0 = s4;
          s1 = s5;
          s2 = s6;
          s3 = s7;
          s4 = s8;
          s5 = s9;
          s6 = s10;
          height -= 4;
        } while (height > 0);
        src += 8;
        dst += 8;
        w -= 8;
      } while (w > 0);
    }
  }
}