/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"

// Bilinear taps: {128 - 16 * offset, 16 * offset}; each pair sums to 128
// (1 << 7), matching the rounding shift by 7 below.
static const uint16_t bilinear_taps_coeff[8][2] = {
    {128,   0},
    {112,  16},
    { 96,  32},
    { 80,  48},
    { 64,  64},
    { 48,  80},
    { 32,  96},
    { 16, 112}
};

unsigned int vp8_sub_pixel_variance16x16_neon_func(
        const unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        const unsigned char *dst_ptr,
        int dst_pixels_per_line,
        unsigned int *sse) {
    int i;
    DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528);
    unsigned char *tmpp;
    unsigned char *tmpp2;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
    uint8x8_t d19u8, d20u8, d21u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64, d2s64, d3s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
    uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
    uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
    uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    tmpp2 = tmp + 272;
    tmpp = tmp;
    if (xoffset == 0) {  // secondpass_bfilter16x16_only
        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);

        q11u8 = vld1q_u8(src_ptr);
        src_ptr += src_pixels_per_line;
        for (i = 4; i > 0; i--) {
            q12u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q13u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q14u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q15u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;

            __builtin_prefetch(src_ptr);
            __builtin_prefetch(src_ptr + src_pixels_per_line);
            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);

            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);

            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);

            d2u8 = vqrshrn_n_u16(q1u16, 7);
            d3u8 = vqrshrn_n_u16(q2u16, 7);
            d4u8 = vqrshrn_n_u16(q3u16, 7);
            d5u8 = vqrshrn_n_u16(q4u16, 7);
            d6u8 = vqrshrn_n_u16(q5u16, 7);
            d7u8 = vqrshrn_n_u16(q6u16, 7);
            d8u8 = vqrshrn_n_u16(q7u16, 7);
            d9u8 = vqrshrn_n_u16(q8u16, 7);

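            // Repack the eight narrowed halves into four 16-byte rows before
            // storing them to the temporary buffer.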
            q1u8 = vcombine_u8(d2u8, d3u8);
            q2u8 = vcombine_u8(d4u8, d5u8);
            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);

            q11u8 = q15u8;

            vst1q_u8((uint8_t *)tmpp2, q1u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q2u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q3u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q4u8);
            tmpp2 += 16;
        }
    } else if (yoffset == 0) {  // firstpass_bfilter16x16_only
        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);

        for (i = 4; i > 0; i--) {
            d2u8 = vld1_u8(src_ptr);
            d3u8 = vld1_u8(src_ptr + 8);
            d4u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d5u8 = vld1_u8(src_ptr);
            d6u8 = vld1_u8(src_ptr + 8);
            d7u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d8u8 = vld1_u8(src_ptr);
            d9u8 = vld1_u8(src_ptr + 8);
            d10u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d11u8 = vld1_u8(src_ptr);
            d12u8 = vld1_u8(src_ptr + 8);
            d13u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;

            __builtin_prefetch(src_ptr);
            __builtin_prefetch(src_ptr + src_pixels_per_line);
            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);

            q7u16 = vmull_u8(d2u8, d0u8);
            q8u16 = vmull_u8(d3u8, d0u8);
            q9u16 = vmull_u8(d5u8, d0u8);
            q10u16 = vmull_u8(d6u8, d0u8);
            q11u16 = vmull_u8(d8u8, d0u8);
            q12u16 = vmull_u8(d9u8, d0u8);
            q13u16 = vmull_u8(d11u8, d0u8);
            q14u16 = vmull_u8(d12u8, d0u8);

            d2u8 = vext_u8(d2u8, d3u8, 1);
            d5u8 = vext_u8(d5u8, d6u8, 1);
            d8u8 = vext_u8(d8u8, d9u8, 1);
            d11u8 = vext_u8(d11u8, d12u8, 1);

            q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
            q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);

            d3u8 = vext_u8(d3u8, d4u8, 1);
            d6u8 = vext_u8(d6u8, d7u8, 1);
            d9u8 = vext_u8(d9u8, d10u8, 1);
            d12u8 = vext_u8(d12u8, d13u8, 1);

            q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);

            d14u8 = vqrshrn_n_u16(q7u16, 7);
            d15u8 = vqrshrn_n_u16(q8u16, 7);
            d16u8 = vqrshrn_n_u16(q9u16, 7);
            d17u8 = vqrshrn_n_u16(q10u16, 7);
            d18u8 = vqrshrn_n_u16(q11u16, 7);
            d19u8 = vqrshrn_n_u16(q12u16, 7);
            d20u8 = vqrshrn_n_u16(q13u16, 7);
            d21u8 = vqrshrn_n_u16(q14u16, 7);

            q7u8 = vcombine_u8(d14u8, d15u8);
            q8u8 = vcombine_u8(d16u8, d17u8);
            q9u8 = vcombine_u8(d18u8, d19u8);
            q10u8 = vcombine_u8(d20u8, d21u8);

            vst1q_u8((uint8_t *)tmpp2, q7u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q8u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q9u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q10u8);
            tmpp2 += 16;
        }
    } else {
        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);

        d2u8 = vld1_u8(src_ptr);
        d3u8 = vld1_u8(src_ptr + 8);
        d4u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d5u8 = vld1_u8(src_ptr);
        d6u8 = vld1_u8(src_ptr + 8);
        d7u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d8u8 = vld1_u8(src_ptr);
        d9u8 = vld1_u8(src_ptr + 8);
        d10u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d11u8 = vld1_u8(src_ptr);
        d12u8 = vld1_u8(src_ptr + 8);
        d13u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;

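        // Rows 0-3 are pre-loaded; each iteration of the loop below filters
        // four rows horizontally and reloads the next four.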
        // First Pass: output_height lines x output_width columns (17x16)
        for (i = 3; i > 0; i--) {
            q7u16 = vmull_u8(d2u8, d0u8);
            q8u16 = vmull_u8(d3u8, d0u8);
            q9u16 = vmull_u8(d5u8, d0u8);
            q10u16 = vmull_u8(d6u8, d0u8);
            q11u16 = vmull_u8(d8u8, d0u8);
            q12u16 = vmull_u8(d9u8, d0u8);
            q13u16 = vmull_u8(d11u8, d0u8);
            q14u16 = vmull_u8(d12u8, d0u8);

            d2u8 = vext_u8(d2u8, d3u8, 1);
            d5u8 = vext_u8(d5u8, d6u8, 1);
            d8u8 = vext_u8(d8u8, d9u8, 1);
            d11u8 = vext_u8(d11u8, d12u8, 1);

            q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
            q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);

            d3u8 = vext_u8(d3u8, d4u8, 1);
            d6u8 = vext_u8(d6u8, d7u8, 1);
            d9u8 = vext_u8(d9u8, d10u8, 1);
            d12u8 = vext_u8(d12u8, d13u8, 1);

            q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);

            d14u8 = vqrshrn_n_u16(q7u16, 7);
            d15u8 = vqrshrn_n_u16(q8u16, 7);
            d16u8 = vqrshrn_n_u16(q9u16, 7);
            d17u8 = vqrshrn_n_u16(q10u16, 7);
            d18u8 = vqrshrn_n_u16(q11u16, 7);
            d19u8 = vqrshrn_n_u16(q12u16, 7);
            d20u8 = vqrshrn_n_u16(q13u16, 7);
            d21u8 = vqrshrn_n_u16(q14u16, 7);

            d2u8 = vld1_u8(src_ptr);
            d3u8 = vld1_u8(src_ptr + 8);
            d4u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d5u8 = vld1_u8(src_ptr);
            d6u8 = vld1_u8(src_ptr + 8);
            d7u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d8u8 = vld1_u8(src_ptr);
            d9u8 = vld1_u8(src_ptr + 8);
            d10u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d11u8 = vld1_u8(src_ptr);
            d12u8 = vld1_u8(src_ptr + 8);
            d13u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;

            q7u8 = vcombine_u8(d14u8, d15u8);
            q8u8 = vcombine_u8(d16u8, d17u8);
            q9u8 = vcombine_u8(d18u8, d19u8);
            q10u8 = vcombine_u8(d20u8, d21u8);

            vst1q_u8((uint8_t *)tmpp, q7u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q8u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q9u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q10u8);
            tmpp += 16;
        }

        // First-pass filtering for the remaining five lines
        d14u8 = vld1_u8(src_ptr);
        d15u8 = vld1_u8(src_ptr + 8);
        d16u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;

        q9u16 = vmull_u8(d2u8, d0u8);
        q10u16 = vmull_u8(d3u8, d0u8);
        q11u16 = vmull_u8(d5u8, d0u8);
        q12u16 = vmull_u8(d6u8, d0u8);
        q13u16 = vmull_u8(d8u8, d0u8);
        q14u16 = vmull_u8(d9u8, d0u8);

        d2u8 = vext_u8(d2u8, d3u8, 1);
        d5u8 = vext_u8(d5u8, d6u8, 1);
        d8u8 = vext_u8(d8u8, d9u8, 1);

        q9u16 = vmlal_u8(q9u16, d2u8, d1u8);
        q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
        q13u16 = vmlal_u8(q13u16, d8u8, d1u8);

        d3u8 = vext_u8(d3u8, d4u8, 1);
        d6u8 = vext_u8(d6u8, d7u8, 1);
        d9u8 = vext_u8(d9u8, d10u8, 1);

        q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
        q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
        q14u16 = vmlal_u8(q14u16, d9u8, d1u8);

        q1u16 = vmull_u8(d11u8, d0u8);
        q2u16 = vmull_u8(d12u8, d0u8);
        q3u16 = vmull_u8(d14u8, d0u8);
        q4u16 = vmull_u8(d15u8, d0u8);

        d11u8 = vext_u8(d11u8, d12u8, 1);
        d14u8 = vext_u8(d14u8, d15u8, 1);

        q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
        q3u16 = vmlal_u8(q3u16, d14u8, d1u8);

        d12u8 = vext_u8(d12u8, d13u8, 1);
        d15u8 = vext_u8(d15u8, d16u8, 1);

        q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
        q4u16 = vmlal_u8(q4u16, d15u8, d1u8);

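        // Round, narrow, and store the last five filtered rows (rows 12-16).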
        d10u8 = vqrshrn_n_u16(q9u16, 7);
        d11u8 = vqrshrn_n_u16(q10u16, 7);
        d12u8 = vqrshrn_n_u16(q11u16, 7);
        d13u8 = vqrshrn_n_u16(q12u16, 7);
        d14u8 = vqrshrn_n_u16(q13u16, 7);
        d15u8 = vqrshrn_n_u16(q14u16, 7);
        d16u8 = vqrshrn_n_u16(q1u16, 7);
        d17u8 = vqrshrn_n_u16(q2u16, 7);
        d18u8 = vqrshrn_n_u16(q3u16, 7);
        d19u8 = vqrshrn_n_u16(q4u16, 7);

        q5u8 = vcombine_u8(d10u8, d11u8);
        q6u8 = vcombine_u8(d12u8, d13u8);
        q7u8 = vcombine_u8(d14u8, d15u8);
        q8u8 = vcombine_u8(d16u8, d17u8);
        q9u8 = vcombine_u8(d18u8, d19u8);

        vst1q_u8((uint8_t *)tmpp, q5u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q6u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q7u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q8u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q9u8);

        // secondpass_filter
        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);

        tmpp = tmp;
        tmpp2 = tmpp + 272;
        q11u8 = vld1q_u8(tmpp);
        tmpp += 16;
        for (i = 4; i > 0; i--) {
            q12u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q13u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q14u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q15u8 = vld1q_u8(tmpp);
            tmpp += 16;

            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);

            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);

            d2u8 = vqrshrn_n_u16(q1u16, 7);
            d3u8 = vqrshrn_n_u16(q2u16, 7);
            d4u8 = vqrshrn_n_u16(q3u16, 7);
            d5u8 = vqrshrn_n_u16(q4u16, 7);
            d6u8 = vqrshrn_n_u16(q5u16, 7);
            d7u8 = vqrshrn_n_u16(q6u16, 7);
            d8u8 = vqrshrn_n_u16(q7u16, 7);
            d9u8 = vqrshrn_n_u16(q8u16, 7);

            q1u8 = vcombine_u8(d2u8, d3u8);
            q2u8 = vcombine_u8(d4u8, d5u8);
            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);

            q11u8 = q15u8;

            vst1q_u8((uint8_t *)tmpp2, q1u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q2u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q3u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q4u8);
            tmpp2 += 16;
        }
    }

    // sub_pixel_variance16x16_neon
    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    tmpp = tmp + 272;
    for (i = 0; i < 8; i++) {  // sub_pixel_variance16x16_neon_loop
        q0u8 = vld1q_u8(tmpp);
        tmpp += 16;
        q1u8 = vld1q_u8(tmpp);
        tmpp += 16;
        q2u8 = vld1q_u8(dst_ptr);
        dst_ptr += dst_pixels_per_line;
        q3u8 = vld1q_u8(dst_ptr);
        dst_ptr += dst_pixels_per_line;

        d0u8 = vget_low_u8(q0u8);
        d1u8 = vget_high_u8(q0u8);
        d2u8 = vget_low_u8(q1u8);
        d3u8 = vget_high_u8(q1u8);

        q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8));
        q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8));
        q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8));
        q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8));

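        // The u16 differences wrap modulo 2^16; reinterpreting them as s16
        // recovers the signed differences for the sum (q8s32) and the
        // sum-of-squares (q9s32/q10s32) accumulators.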
        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vget_low_s64(q0s64);
    d1s64 = vget_high_s64(q0s64);
    d2s64 = vget_low_s64(q1s64);
    d3s64 = vget_high_s64(q1s64);
    d0s64 = vadd_s64(d0s64, d1s64);
    d1s64 = vadd_s64(d2s64, d3s64);

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    // variance = sse - sum * sum / (16 * 16)
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

unsigned int vp8_variance_halfpixvar16x16_h_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64, d2s64, d3s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8;
    uint8x16_t q7u8, q11u8, q12u8, q13u8, q14u8;
    uint16x8_t q0u16, q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 4; i++) {  // vp8_filt_fpo16x16s_4_0_loop_neon
        q0u8 = vld1q_u8(src_ptr);
        q1u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q2u8 = vld1q_u8(src_ptr);
        q3u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q4u8 = vld1q_u8(src_ptr);
        q5u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q6u8 = vld1q_u8(src_ptr);
        q7u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;

        q11u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q12u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q13u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q14u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;

        q1u8 = vextq_u8(q0u8, q1u8, 1);
        q3u8 = vextq_u8(q2u8, q3u8, 1);
        q5u8 = vextq_u8(q4u8, q5u8, 1);
        q7u8 = vextq_u8(q6u8, q7u8, 1);

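        // vrhaddq_u8 takes the rounded average of each pixel and its right
        // neighbour: the half-pel horizontal prediction.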
        q0u8 = vrhaddq_u8(q0u8, q1u8);
        q1u8 = vrhaddq_u8(q2u8, q3u8);
        q2u8 = vrhaddq_u8(q4u8, q5u8);
        q3u8 = vrhaddq_u8(q6u8, q7u8);

        d0u8 = vget_low_u8(q0u8);
        d1u8 = vget_high_u8(q0u8);
        d2u8 = vget_low_u8(q1u8);
        d3u8 = vget_high_u8(q1u8);
        d4u8 = vget_low_u8(q2u8);
        d5u8 = vget_high_u8(q2u8);
        d6u8 = vget_low_u8(q3u8);
        d7u8 = vget_high_u8(q3u8);

        q4u16 = vsubl_u8(d0u8, vget_low_u8(q11u8));
        q5u16 = vsubl_u8(d1u8, vget_high_u8(q11u8));
        q6u16 = vsubl_u8(d2u8, vget_low_u8(q12u8));
        q7u16 = vsubl_u8(d3u8, vget_high_u8(q12u8));
        q0u16 = vsubl_u8(d4u8, vget_low_u8(q13u8));
        q1u16 = vsubl_u8(d5u8, vget_high_u8(q13u8));
        q2u16 = vsubl_u8(d6u8, vget_low_u8(q14u8));
        q3u16 = vsubl_u8(d7u8, vget_high_u8(q14u8));

        d8s16 = vreinterpret_s16_u16(vget_low_u16(q4u16));
        d9s16 = vreinterpret_s16_u16(vget_high_u16(q4u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q4u16));
        q9s32 = vmlal_s16(q9s32, d8s16, d8s16);
        q10s32 = vmlal_s16(q10s32, d9s16, d9s16);

        d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
        d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q5u16));
        q9s32 = vmlal_s16(q9s32, d10s16, d10s16);
        q10s32 = vmlal_s16(q10s32, d11s16, d11s16);

        d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
        d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q6u16));
        q9s32 = vmlal_s16(q9s32, d12s16, d12s16);
        q10s32 = vmlal_s16(q10s32, d13s16, d13s16);

        d14s16 = vreinterpret_s16_u16(vget_low_u16(q7u16));
        d15s16 = vreinterpret_s16_u16(vget_high_u16(q7u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q7u16));
        q9s32 = vmlal_s16(q9s32, d14s16, d14s16);
        q10s32 = vmlal_s16(q10s32, d15s16, d15s16);

        d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
        d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
        q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
        q10s32 = vmlal_s16(q10s32, d1s16, d1s16);

        d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
        d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
        q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
        q10s32 = vmlal_s16(q10s32, d3s16, d3s16);

        d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
        d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
        q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
        q10s32 = vmlal_s16(q10s32, d5s16, d5s16);

        d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
        d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
        q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
        q10s32 = vmlal_s16(q10s32, d7s16, d7s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vget_low_s64(q0s64);
    d1s64 = vget_high_s64(q0s64);
    d2s64 = vget_low_s64(q1s64);
    d3s64 = vget_high_s64(q1s64);
    d0s64 = vadd_s64(d0s64, d1s64);
    d1s64 = vadd_s64(d2s64, d3s64);

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

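/* Half-pel vertical variance: each prediction row is the rounded average of
 * a source row and the row directly below it. */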
unsigned int vp8_variance_halfpixvar16x16_v_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d1u8, d4u8, d5u8, d8u8, d9u8, d12u8, d13u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64, d2s64, d3s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q15u8;
    uint16x8_t q0u16, q1u16, q2u16, q3u16, q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    q0u8 = vld1q_u8(src_ptr);
    src_ptr += source_stride;
    for (i = 0; i < 4; i++) {  // vp8_filt_fpo16x16s_4_0_loop_neon
        q2u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q4u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q6u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q15u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;

        q1u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q5u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q7u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;

        q0u8 = vrhaddq_u8(q0u8, q2u8);
        q2u8 = vrhaddq_u8(q2u8, q4u8);
        q4u8 = vrhaddq_u8(q4u8, q6u8);
        q6u8 = vrhaddq_u8(q6u8, q15u8);

        d0u8 = vget_low_u8(q0u8);
        d1u8 = vget_high_u8(q0u8);
        d4u8 = vget_low_u8(q2u8);
        d5u8 = vget_high_u8(q2u8);
        d8u8 = vget_low_u8(q4u8);
        d9u8 = vget_high_u8(q4u8);
        d12u8 = vget_low_u8(q6u8);
        d13u8 = vget_high_u8(q6u8);

        q11u16 = vsubl_u8(d0u8, vget_low_u8(q1u8));
        q12u16 = vsubl_u8(d1u8, vget_high_u8(q1u8));
        q13u16 = vsubl_u8(d4u8, vget_low_u8(q3u8));
        q14u16 = vsubl_u8(d5u8, vget_high_u8(q3u8));
        q0u16 = vsubl_u8(d8u8, vget_low_u8(q5u8));
        q1u16 = vsubl_u8(d9u8, vget_high_u8(q5u8));
        q2u16 = vsubl_u8(d12u8, vget_low_u8(q7u8));
        q3u16 = vsubl_u8(d13u8, vget_high_u8(q7u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);

        d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
        d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
        q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
        q10s32 = vmlal_s16(q10s32, d1s16, d1s16);

        d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
        d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
        q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
        q10s32 = vmlal_s16(q10s32, d3s16, d3s16);

        d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
        d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
        q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
        q10s32 = vmlal_s16(q10s32, d5s16, d5s16);

        d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
        d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
        q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
        q10s32 = vmlal_s16(q10s32, d7s16, d7s16);

        q0u8 = q15u8;
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vget_low_s64(q0s64);
    d1s64 = vget_high_s64(q0s64);
    d2s64 = vget_low_s64(q1s64);
    d3s64 = vget_high_s64(q1s64);
    d0s64 = vadd_s64(d0s64, d1s64);
    d1s64 = vadd_s64(d2s64, d3s64);

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

/* Half-pel in both directions: horizontal rounded averages first, then
 * vertical rounded averages of consecutive filtered rows. */
unsigned int vp8_variance_halfpixvar16x16_hv_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16;
    int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64, d2s64, d3s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
    uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16;
    int32x4_t q13s32, q14s32, q15s32;
    int64x2_t q0s64, q1s64, q5s64;

    q13s32 = vdupq_n_s32(0);
    q14s32 = vdupq_n_s32(0);
    q15s32 = vdupq_n_s32(0);

    q0u8 = vld1q_u8(src_ptr);
    q1u8 = vld1q_u8(src_ptr + 16);
    src_ptr += source_stride;
    q1u8 = vextq_u8(q0u8, q1u8, 1);
    q0u8 = vrhaddq_u8(q0u8, q1u8);
    for (i = 0; i < 4; i++) {  // vp8_filt_fpo16x16s_4_0_loop_neon
        q2u8 = vld1q_u8(src_ptr);
        q3u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q4u8 = vld1q_u8(src_ptr);
        q5u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q6u8 = vld1q_u8(src_ptr);
        q7u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q8u8 = vld1q_u8(src_ptr);
        q9u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;

        q3u8 = vextq_u8(q2u8, q3u8, 1);
        q5u8 = vextq_u8(q4u8, q5u8, 1);
        q7u8 = vextq_u8(q6u8, q7u8, 1);
        q9u8 = vextq_u8(q8u8, q9u8, 1);

        q1u8 = vrhaddq_u8(q2u8, q3u8);
        q2u8 = vrhaddq_u8(q4u8, q5u8);
        q3u8 = vrhaddq_u8(q6u8, q7u8);
        q4u8 = vrhaddq_u8(q8u8, q9u8);
        // Second averaging stage: blend each horizontally averaged row with
        // the next one to complete the diagonal half-pel filter.
        q0u8 = vrhaddq_u8(q0u8, q1u8);
        q1u8 = vrhaddq_u8(q1u8, q2u8);
        q2u8 = vrhaddq_u8(q2u8, q3u8);
        q3u8 = vrhaddq_u8(q3u8, q4u8);

        q5u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q6u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q7u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q8u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;

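        // Split each filtered row into 8-byte halves; vsubl_u8 widens the
        // prediction/reference differences to 16 bits.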
        d0u8 = vget_low_u8(q0u8);
        d1u8 = vget_high_u8(q0u8);
        d2u8 = vget_low_u8(q1u8);
        d3u8 = vget_high_u8(q1u8);
        d4u8 = vget_low_u8(q2u8);
        d5u8 = vget_high_u8(q2u8);
        d6u8 = vget_low_u8(q3u8);
        d7u8 = vget_high_u8(q3u8);

        q9u16 = vsubl_u8(d0u8, vget_low_u8(q5u8));
        q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8));
        q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8));
        q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8));
        q0u16 = vsubl_u8(d4u8, vget_low_u8(q7u8));
        q1u16 = vsubl_u8(d5u8, vget_high_u8(q7u8));
        q5u16 = vsubl_u8(d6u8, vget_low_u8(q8u8));
        q6u16 = vsubl_u8(d7u8, vget_high_u8(q8u8));

        d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
        d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16));
        q14s32 = vmlal_s16(q14s32, d18s16, d18s16);
        q15s32 = vmlal_s16(q15s32, d19s16, d19s16);

        d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
        d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16));
        q14s32 = vmlal_s16(q14s32, d20s16, d20s16);
        q15s32 = vmlal_s16(q15s32, d21s16, d21s16);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16));
        q14s32 = vmlal_s16(q14s32, d22s16, d22s16);
        q15s32 = vmlal_s16(q15s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16));
        q14s32 = vmlal_s16(q14s32, d24s16, d24s16);
        q15s32 = vmlal_s16(q15s32, d25s16, d25s16);

        d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
        d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16));
        q14s32 = vmlal_s16(q14s32, d0s16, d0s16);
        q15s32 = vmlal_s16(q15s32, d1s16, d1s16);

        d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
        d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16));
        q14s32 = vmlal_s16(q14s32, d2s16, d2s16);
        q15s32 = vmlal_s16(q15s32, d3s16, d3s16);

        d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
        d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16));
        q14s32 = vmlal_s16(q14s32, d10s16, d10s16);
        q15s32 = vmlal_s16(q15s32, d11s16, d11s16);

        d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
        d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16));
        q14s32 = vmlal_s16(q14s32, d12s16, d12s16);
        q15s32 = vmlal_s16(q15s32, d13s16, d13s16);

        q0u8 = q4u8;
    }

    q15s32 = vaddq_s32(q14s32, q15s32);
    q0s64 = vpaddlq_s32(q13s32);
    q1s64 = vpaddlq_s32(q15s32);

    d0s64 = vget_low_s64(q0s64);
    d1s64 = vget_high_s64(q0s64);
    d2s64 = vget_low_s64(q1s64);
    d3s64 = vget_high_s64(q1s64);
    d0s64 = vadd_s64(d0s64, d1s64);
    d1s64 = vadd_s64(d2s64, d3s64);

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

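// Constants and width-8 helpers for the 8x8 sub-pixel variance path.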
enum { kWidth8 = 8 };
enum { kHeight8 = 8 };
enum { kHeight8PlusOne = 9 };
enum { kPixelStepOne = 1 };
enum { kAlign16 = 16 };

#define FILTER_BITS 7

static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
    const int32x4_t a = vpaddlq_s16(v_16x8);
    const int64x2_t b = vpaddlq_s32(a);
    const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
                                 vreinterpret_s32_s64(vget_high_s64(b)));
    return vget_lane_s32(c, 0);
}

static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
    const int64x2_t b = vpaddlq_s32(v_32x4);
    const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
                                 vreinterpret_s32_s64(vget_high_s64(b)));
    return vget_lane_s32(c, 0);
}

static void variance_neon_w8(const uint8_t *a, int a_stride,
                             const uint8_t *b, int b_stride,
                             int w, int h, unsigned int *sse, int *sum) {
    int i, j;
    int16x8_t v_sum = vdupq_n_s16(0);
    int32x4_t v_sse_lo = vdupq_n_s32(0);
    int32x4_t v_sse_hi = vdupq_n_s32(0);

    for (i = 0; i < h; ++i) {
        for (j = 0; j < w; j += 8) {
            const uint8x8_t v_a = vld1_u8(&a[j]);
            const uint8x8_t v_b = vld1_u8(&b[j]);
            const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
            const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
            v_sum = vaddq_s16(v_sum, sv_diff);
            v_sse_lo = vmlal_s16(v_sse_lo,
                                 vget_low_s16(sv_diff),
                                 vget_low_s16(sv_diff));
            v_sse_hi = vmlal_s16(v_sse_hi,
                                 vget_high_s16(sv_diff),
                                 vget_high_s16(sv_diff));
        }
        a += a_stride;
        b += b_stride;
    }

    *sum = horizontal_add_s16x8(v_sum);
    *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
}

static unsigned int variance8x8_neon(const uint8_t *a, int a_stride,
                                     const uint8_t *b, int b_stride,
                                     unsigned int *sse) {
    int sum;
    variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);
    return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8));
}

// Applies a 2-tap bilinear filter of width 8: each output pixel is
// (src_0 * f0 + src_1 * f1 + 64) >> 7, with f0 + f1 = 128.  pixel_step
// selects horizontal (1) or vertical (row stride) filtering.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const uint16_t *vpx_filter) {
    const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]);
    const uint8x8_t f1 = vmov_n_u8((uint8_t)vpx_filter[1]);
    unsigned int i;
    for (i = 0; i < output_height; ++i) {
        const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
        const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
        const uint16x8_t a = vmull_u8(src_0, f0);
        const uint16x8_t b = vmlal_u8(a, src_1, f1);
        const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
        vst1_u8(&output_ptr[0], out);
        // Next row...
        src_ptr += src_pixels_per_line;
        output_ptr += output_width;
    }
}

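// 8x8 sub-pixel variance: at most two bilinear passes (horizontal, then
// vertical) into a temporary block, followed by the 8x8 variance kernel.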
unsigned int vp8_sub_pixel_variance8x8_neon(
        const unsigned char *src,
        int src_stride,
        int xoffset,
        int yoffset,
        const unsigned char *dst,
        int dst_stride,
        unsigned int *sse) {
    DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8);
    DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8);
    if (xoffset == 0) {
        // Vertical-only filtering: the step between the two taps is the row
        // stride, and only 8 output rows are needed.
        var_filter_block2d_bil_w8(src, temp2, src_stride, src_stride,
                                  kHeight8, kWidth8,
                                  bilinear_taps_coeff[yoffset]);
    } else if (yoffset == 0) {
        // Horizontal-only filtering: 8 output rows fit exactly in temp2.
        var_filter_block2d_bil_w8(src, temp2, src_stride, kPixelStepOne,
                                  kHeight8, kWidth8,
                                  bilinear_taps_coeff[xoffset]);
    } else {
        // Horizontal pass produces 9 rows so the vertical pass has one row
        // of context below the block.
        var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne,
                                  kHeight8PlusOne, kWidth8,
                                  bilinear_taps_coeff[xoffset]);
        var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8,
                                  kWidth8, bilinear_taps_coeff[yoffset]);
    }
    return variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse);
}
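
/* Usage sketch (hypothetical caller, not part of this file): each variance
 * function here writes the raw SSE through *sse and returns the variance,
 * SSE - sum^2 / N.  For example, at the half-pel offset (4, 4), where
 * bilinear_taps_coeff[4] = {64, 64}:
 *
 *     unsigned int sse;
 *     unsigned int var = vp8_sub_pixel_variance8x8_neon(
 *         src, src_stride, 4, 4, dst, dst_stride, &sse);
 */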