/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_config.h"

/*
 * VP8 macroblock loop-filter kernel operating on 16 pixels at a time.
 *
 * Inputs q3..q10 hold the eight pixel rows/columns p3 p2 p1 p0 | q0 q1 q2 q3
 * straddling the edge (16 lanes each).  qblimit/qlimit/qthresh are the
 * mblimit/limit/thresh values splatted across all lanes.  The six filtered
 * rows p2 p1 p0 q0 q1 q2 are written through q4r..q9r; p3/q3 are never
 * modified by the macroblock filter.
 */
static INLINE void vp8_mbloop_filter_neon(uint8x16_t qblimit,  // mblimit
                                          uint8x16_t qlimit,   // limit
                                          uint8x16_t qthresh,  // thresh
                                          uint8x16_t q3,       // p3
                                          uint8x16_t q4,       // p2
                                          uint8x16_t q5,       // p1
                                          uint8x16_t q6,       // p0
                                          uint8x16_t q7,       // q0
                                          uint8x16_t q8,       // q1
                                          uint8x16_t q9,       // q2
                                          uint8x16_t q10,      // q3
                                          uint8x16_t *q4r,     // p2 (out)
                                          uint8x16_t *q5r,     // p1 (out)
                                          uint8x16_t *q6r,     // p0 (out)
                                          uint8x16_t *q7r,     // q0 (out)
                                          uint8x16_t *q8r,     // q1 (out)
                                          uint8x16_t *q9r) {   // q2 (out)
  uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
  int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
  uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
  int8x16_t q0s8, q12s8, q14s8, q15s8;
  int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;

  // Filter mask: max of |p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2|
  // must not exceed 'limit'.
  q11u8 = vabdq_u8(q3, q4);
  q12u8 = vabdq_u8(q4, q5);
  q13u8 = vabdq_u8(q5, q6);
  q14u8 = vabdq_u8(q8, q7);
  q1u8 = vabdq_u8(q9, q8);
  q0u8 = vabdq_u8(q10, q9);

  q11u8 = vmaxq_u8(q11u8, q12u8);
  q12u8 = vmaxq_u8(q13u8, q14u8);
  q1u8 = vmaxq_u8(q1u8, q0u8);
  q15u8 = vmaxq_u8(q11u8, q12u8);

  q12u8 = vabdq_u8(q6, q7);

  // vp8_hevmask: high-edge-variance where |p1-p0| or |q1-q0| > thresh.
  q13u8 = vcgtq_u8(q13u8, qthresh);
  q14u8 = vcgtq_u8(q14u8, qthresh);
  q15u8 = vmaxq_u8(q15u8, q1u8);

  q15u8 = vcgeq_u8(qlimit, q15u8);

  q1u8 = vabdq_u8(q5, q8);
  q12u8 = vqaddq_u8(q12u8, q12u8);

  // vp8_filter() function
  // Convert pixels to signed representation by flipping the sign bit.
  q0u8 = vdupq_n_u8(0x80);
  q9 = veorq_u8(q9, q0u8);
  q8 = veorq_u8(q8, q0u8);
  q7 = veorq_u8(q7, q0u8);
  q6 = veorq_u8(q6, q0u8);
  q5 = veorq_u8(q5, q0u8);
  q4 = veorq_u8(q4, q0u8);

  // Edge-strength test: 2*|p0-q0| + |p1-q1|/2 <= mblimit.
  q1u8 = vshrq_n_u8(q1u8, 1);
  q12u8 = vqaddq_u8(q12u8, q1u8);

  q14u8 = vorrq_u8(q13u8, q14u8);
  q12u8 = vcgeq_u8(qblimit, q12u8);

  // 3 * (qs0 - ps0), widened to 16 bits to avoid overflow.
  q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                   vget_low_s8(vreinterpretq_s8_u8(q6)));
  q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                    vget_high_s8(vreinterpretq_s8_u8(q6)));

  q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));

  q11s16 = vdupq_n_s16(3);
  q2s16 = vmulq_s16(q2s16, q11s16);
  q13s16 = vmulq_s16(q13s16, q11s16);

  // Combined mask: within 'limit' AND edge passes the mblimit test.
  q15u8 = vandq_u8(q15u8, q12u8);

  q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
  q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));

  q12u8 = vdupq_n_u8(3);
  q11u8 = vdupq_n_u8(4);
  // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
  d2 = vqmovn_s16(q2s16);
  d3 = vqmovn_s16(q13s16);
  q1s8 = vcombine_s8(d2, d3);
  q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));
  q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

  // HEV path: Filter1 = (f + 4) >> 3 applied to q0, Filter2 = (f + 3) >> 3
  // applied to p0.
  q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));
  q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));
  q2s8 = vshrq_n_s8(q2s8, 3);
  q13s8 = vshrq_n_s8(q13s8, 3);

  q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);
  q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);

  // Non-HEV path keeps the filter value only where hev is clear.
  q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

  // Taper the adjustment: (f*9 + 63) >> 7, (f*18 + 63) >> 7, (f*27 + 63) >> 7
  // applied to (p2,q2), (p1,q1), (p0,q0); 63 is the rounding bias for >> 7.
  q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);
  d5 = vdup_n_s8(9);
  d4 = vdup_n_s8(18);

  q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5);
  q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
  d5 = vdup_n_s8(27);
  q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4);
  q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
  q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5);
  q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);

  d0 = vqshrn_n_s16(q0s16, 7);
  d1 = vqshrn_n_s16(q11s16, 7);
  d24 = vqshrn_n_s16(q12s16, 7);
  d25 = vqshrn_n_s16(q13s16, 7);
  d28 = vqshrn_n_s16(q14s16, 7);
  d29 = vqshrn_n_s16(q15s16, 7);

  q0s8 = vcombine_s8(d0, d1);
  q12s8 = vcombine_s8(d24, d25);
  q14s8 = vcombine_s8(d28, d29);

  // Apply tapered adjustments symmetrically: subtract on the q side, add on
  // the p side.
  q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);
  q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);
  q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);
  q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);
  q15s8 = vqsubq_s8((q7s8), q14s8);
  q14s8 = vqaddq_s8((q6s8), q14s8);

  // Flip the sign bit back to return to unsigned pixel range.
  q1u8 = vdupq_n_u8(0x80);
  *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);
  *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);
  *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);
  *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);
  *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);
  *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);
}

/*
 * Filter a horizontal macroblock edge of the luma plane (16 pixels wide).
 * src points at the first row below the edge; the eight rows
 * src-4*pitch .. src+3*pitch are read and the middle six written back.
 */
void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
                                              unsigned char blimit,
                                              unsigned char limit,
                                              unsigned char thresh) {
  uint8x16_t qblimit, qlimit, qthresh, q3, q4;
  uint8x16_t q5, q6, q7, q8, q9, q10;

  qblimit = vdupq_n_u8(blimit);
  qlimit = vdupq_n_u8(limit);
  qthresh = vdupq_n_u8(thresh);

  src -= (pitch << 2);  // back up to p3

  q3 = vld1q_u8(src);
  src += pitch;
  q4 = vld1q_u8(src);
  src += pitch;
  q5 = vld1q_u8(src);
  src += pitch;
  q6 = vld1q_u8(src);
  src += pitch;
  q7 = vld1q_u8(src);
  src += pitch;
  q8 = vld1q_u8(src);
  src += pitch;
  q9 = vld1q_u8(src);
  src += pitch;
  q10 = vld1q_u8(src);

  vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
                         q10, &q4, &q5, &q6, &q7, &q8, &q9);

  src -= (pitch * 6);  // back up to p2 for the six store rows
  vst1q_u8(src, q4);
  src += pitch;
  vst1q_u8(src, q5);
  src += pitch;
  vst1q_u8(src, q6);
  src += pitch;
  vst1q_u8(src, q7);
  src += pitch;
  vst1q_u8(src, q8);
  src += pitch;
  vst1q_u8(src, q9);
}

/*
 * Filter a horizontal macroblock edge of both chroma planes (8 pixels each).
 * The u rows go in the low halves and the v rows in the high halves of the
 * q registers so one 16-lane kernel call covers both planes.
 */
void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
                                               unsigned char blimit,
                                               unsigned char limit,
                                               unsigned char thresh,
                                               unsigned char *v) {
  uint8x16_t qblimit, qlimit, qthresh, q3, q4;
  uint8x16_t q5, q6, q7, q8, q9, q10;
  uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
  uint8x8_t d15, d16, d17, d18, d19, d20, d21;

  qblimit = vdupq_n_u8(blimit);
  qlimit = vdupq_n_u8(limit);
  qthresh = vdupq_n_u8(thresh);

  u -= (pitch << 2);  // back up to p3 in each plane
  v -= (pitch << 2);

  d6 = vld1_u8(u);
  u += pitch;
  d7 = vld1_u8(v);
  v += pitch;
  d8 = vld1_u8(u);
  u += pitch;
  d9 = vld1_u8(v);
  v += pitch;
  d10 = vld1_u8(u);
  u += pitch;
  d11 = vld1_u8(v);
  v += pitch;
  d12 = vld1_u8(u);
  u += pitch;
  d13 = vld1_u8(v);
  v += pitch;
  d14 = vld1_u8(u);
  u += pitch;
  d15 = vld1_u8(v);
  v += pitch;
  d16 = vld1_u8(u);
  u += pitch;
  d17 = vld1_u8(v);
  v += pitch;
  d18 = vld1_u8(u);
  u += pitch;
  d19 = vld1_u8(v);
  v += pitch;
  d20 = vld1_u8(u);
  d21 = vld1_u8(v);

  // Pair each u row (low half) with the matching v row (high half).
  q3 = vcombine_u8(d6, d7);
  q4 = vcombine_u8(d8, d9);
  q5 = vcombine_u8(d10, d11);
  q6 = vcombine_u8(d12, d13);
  q7 = vcombine_u8(d14, d15);
  q8 = vcombine_u8(d16, d17);
  q9 = vcombine_u8(d18, d19);
  q10 = vcombine_u8(d20, d21);

  vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
                         q10, &q4, &q5, &q6, &q7, &q8, &q9);

  u -= (pitch * 6);  // back up to p2 for the six store rows
  v -= (pitch * 6);
  vst1_u8(u, vget_low_u8(q4));
  u += pitch;
  vst1_u8(v, vget_high_u8(q4));
  v += pitch;
  vst1_u8(u, vget_low_u8(q5));
  u += pitch;
  vst1_u8(v, vget_high_u8(q5));
  v += pitch;
  vst1_u8(u, vget_low_u8(q6));
  u += pitch;
  vst1_u8(v, vget_high_u8(q6));
  v += pitch;
  vst1_u8(u, vget_low_u8(q7));
  u += pitch;
  vst1_u8(v, vget_high_u8(q7));
  v += pitch;
  vst1_u8(u, vget_low_u8(q8));
  u += pitch;
  vst1_u8(v, vget_high_u8(q8));
  v += pitch;
  vst1_u8(u, vget_low_u8(q9));
  vst1_u8(v, vget_high_u8(q9));
}

/*
 * Filter a vertical macroblock edge of the luma plane.  Sixteen 8-pixel
 * rows straddling the edge are loaded, transposed with vtrn so columns
 * become kernel rows, filtered, transposed back, and stored.
 */
void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
                                            unsigned char blimit,
                                            unsigned char limit,
                                            unsigned char thresh) {
  unsigned char *s1, *s2;
  uint8x16_t qblimit, qlimit, qthresh, q3, q4;
  uint8x16_t q5, q6, q7, q8, q9, q10;
  uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
  uint8x8_t d15, d16, d17, d18, d19, d20, d21;
  uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
  uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
  uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;

  qblimit = vdupq_n_u8(blimit);
  qlimit = vdupq_n_u8(limit);
  qthresh = vdupq_n_u8(thresh);

  s1 = src - 4;           // first 8 rows, starting 4 pixels left of the edge
  s2 = s1 + 8 * pitch;    // second 8 rows
  d6 = vld1_u8(s1);
  s1 += pitch;
  d7 = vld1_u8(s2);
  s2 += pitch;
  d8 = vld1_u8(s1);
  s1 += pitch;
  d9 = vld1_u8(s2);
  s2 += pitch;
  d10 = vld1_u8(s1);
  s1 += pitch;
  d11 = vld1_u8(s2);
  s2 += pitch;
  d12 = vld1_u8(s1);
  s1 += pitch;
  d13 = vld1_u8(s2);
  s2 += pitch;
  d14 = vld1_u8(s1);
  s1 += pitch;
  d15 = vld1_u8(s2);
  s2 += pitch;
  d16 = vld1_u8(s1);
  s1 += pitch;
  d17 = vld1_u8(s2);
  s2 += pitch;
  d18 = vld1_u8(s1);
  s1 += pitch;
  d19 = vld1_u8(s2);
  s2 += pitch;
  d20 = vld1_u8(s1);
  d21 = vld1_u8(s2);

  q3 = vcombine_u8(d6, d7);
  q4 = vcombine_u8(d8, d9);
  q5 = vcombine_u8(d10, d11);
  q6 = vcombine_u8(d12, d13);
  q7 = vcombine_u8(d14, d15);
  q8 = vcombine_u8(d16, d17);
  q9 = vcombine_u8(d18, d19);
  q10 = vcombine_u8(d20, d21);

  // 16x8 transpose via successive 32-, 16- and 8-bit trn steps.
  q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
  q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
  q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
  q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

  q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                     vreinterpretq_u16_u32(q2tmp2.val[0]));
  q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                     vreinterpretq_u16_u32(q2tmp3.val[0]));
  q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                     vreinterpretq_u16_u32(q2tmp2.val[1]));
  q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                     vreinterpretq_u16_u32(q2tmp3.val[1]));

  q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                    vreinterpretq_u8_u16(q2tmp5.val[0]));
  q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                    vreinterpretq_u8_u16(q2tmp5.val[1]));
  q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                     vreinterpretq_u8_u16(q2tmp7.val[0]));
  q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                     vreinterpretq_u8_u16(q2tmp7.val[1]));

  q3 = q2tmp8.val[0];
  q4 = q2tmp8.val[1];
  q5 = q2tmp9.val[0];
  q6 = q2tmp9.val[1];
  q7 = q2tmp10.val[0];
  q8 = q2tmp10.val[1];
  q9 = q2tmp11.val[0];
  q10 = q2tmp11.val[1];

  vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
                         q10, &q4, &q5, &q6, &q7, &q8, &q9);

  // Transpose back to row order before storing.
  q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
  q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
  q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
  q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

  q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                     vreinterpretq_u16_u32(q2tmp2.val[0]));
  q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                     vreinterpretq_u16_u32(q2tmp3.val[0]));
  q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                     vreinterpretq_u16_u32(q2tmp2.val[1]));
  q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                     vreinterpretq_u16_u32(q2tmp3.val[1]));

  q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                    vreinterpretq_u8_u16(q2tmp5.val[0]));
  q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                    vreinterpretq_u8_u16(q2tmp5.val[1]));
  q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                     vreinterpretq_u8_u16(q2tmp7.val[0]));
  q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                     vreinterpretq_u8_u16(q2tmp7.val[1]));

  q3 = q2tmp8.val[0];
  q4 = q2tmp8.val[1];
  q5 = q2tmp9.val[0];
  q6 = q2tmp9.val[1];
  q7 = q2tmp10.val[0];
  q8 = q2tmp10.val[1];
  q9 = q2tmp11.val[0];
  q10 = q2tmp11.val[1];

  s1 -= 7 * pitch;  // rewind both halves to their first row
  s2 -= 7 * pitch;

  vst1_u8(s1, vget_low_u8(q3));
  s1 += pitch;
  vst1_u8(s2, vget_high_u8(q3));
  s2 += pitch;
  vst1_u8(s1, vget_low_u8(q4));
  s1 += pitch;
  vst1_u8(s2, vget_high_u8(q4));
  s2 += pitch;
  vst1_u8(s1, vget_low_u8(q5));
  s1 += pitch;
  vst1_u8(s2, vget_high_u8(q5));
  s2 += pitch;
  vst1_u8(s1, vget_low_u8(q6));
  s1 += pitch;
  vst1_u8(s2, vget_high_u8(q6));
  s2 += pitch;
  vst1_u8(s1, vget_low_u8(q7));
  s1 += pitch;
  vst1_u8(s2, vget_high_u8(q7));
  s2 += pitch;
  vst1_u8(s1, vget_low_u8(q8));
  s1 += pitch;
  vst1_u8(s2, vget_high_u8(q8));
  s2 += pitch;
  vst1_u8(s1, vget_low_u8(q9));
  s1 += pitch;
  vst1_u8(s2, vget_high_u8(q9));
  s2 += pitch;
  vst1_u8(s1, vget_low_u8(q10));
  vst1_u8(s2, vget_high_u8(q10));
}

/*
 * Filter a vertical macroblock edge of both chroma planes.  Eight rows of u
 * and eight rows of v are interleaved into the q registers, transposed,
 * filtered, transposed back, and written to each plane.
 */
void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
                                             unsigned char blimit,
                                             unsigned char limit,
                                             unsigned char thresh,
                                             unsigned char *v) {
  unsigned char *us, *ud;
  unsigned char *vs, *vd;
  uint8x16_t qblimit, qlimit, qthresh, q3, q4;
  uint8x16_t q5, q6, q7, q8, q9, q10;
  uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
  uint8x8_t d15, d16, d17, d18, d19, d20, d21;
  uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
  uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
  uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;

  qblimit = vdupq_n_u8(blimit);
  qlimit = vdupq_n_u8(limit);
  qthresh = vdupq_n_u8(thresh);

  us = u - 4;  // start 4 pixels left of the edge in each plane
  vs = v - 4;
  d6 = vld1_u8(us);
  us += pitch;
  d7 = vld1_u8(vs);
  vs += pitch;
  d8 = vld1_u8(us);
  us += pitch;
  d9 = vld1_u8(vs);
  vs += pitch;
  d10 = vld1_u8(us);
  us += pitch;
  d11 = vld1_u8(vs);
  vs += pitch;
  d12 = vld1_u8(us);
  us += pitch;
  d13 = vld1_u8(vs);
  vs += pitch;
  d14 = vld1_u8(us);
  us += pitch;
  d15 = vld1_u8(vs);
  vs += pitch;
  d16 = vld1_u8(us);
  us += pitch;
  d17 = vld1_u8(vs);
  vs += pitch;
  d18 = vld1_u8(us);
  us += pitch;
  d19 = vld1_u8(vs);
  vs += pitch;
  d20 = vld1_u8(us);
  d21 = vld1_u8(vs);

  // Pair each u row (low half) with the matching v row (high half).
  q3 = vcombine_u8(d6, d7);
  q4 = vcombine_u8(d8, d9);
  q5 = vcombine_u8(d10, d11);
  q6 = vcombine_u8(d12, d13);
  q7 = vcombine_u8(d14, d15);
  q8 = vcombine_u8(d16, d17);
  q9 = vcombine_u8(d18, d19);
  q10 = vcombine_u8(d20, d21);

  // 16x8 transpose via successive 32-, 16- and 8-bit trn steps.
  q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
  q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
  q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
  q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

  q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                     vreinterpretq_u16_u32(q2tmp2.val[0]));
  q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                     vreinterpretq_u16_u32(q2tmp3.val[0]));
  q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                     vreinterpretq_u16_u32(q2tmp2.val[1]));
  q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                     vreinterpretq_u16_u32(q2tmp3.val[1]));

  q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                    vreinterpretq_u8_u16(q2tmp5.val[0]));
  q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                    vreinterpretq_u8_u16(q2tmp5.val[1]));
  q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                     vreinterpretq_u8_u16(q2tmp7.val[0]));
  q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                     vreinterpretq_u8_u16(q2tmp7.val[1]));

  q3 = q2tmp8.val[0];
  q4 = q2tmp8.val[1];
  q5 = q2tmp9.val[0];
  q6 = q2tmp9.val[1];
  q7 = q2tmp10.val[0];
  q8 = q2tmp10.val[1];
  q9 = q2tmp11.val[0];
  q10 = q2tmp11.val[1];

  vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
                         q10, &q4, &q5, &q6, &q7, &q8, &q9);

  // Transpose back to row order before storing.
  q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
  q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
  q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
  q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

  q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                     vreinterpretq_u16_u32(q2tmp2.val[0]));
  q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                     vreinterpretq_u16_u32(q2tmp3.val[0]));
  q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                     vreinterpretq_u16_u32(q2tmp2.val[1]));
  q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                     vreinterpretq_u16_u32(q2tmp3.val[1]));

  q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                    vreinterpretq_u8_u16(q2tmp5.val[0]));
  q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                    vreinterpretq_u8_u16(q2tmp5.val[1]));
  q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                     vreinterpretq_u8_u16(q2tmp7.val[0]));
  q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                     vreinterpretq_u8_u16(q2tmp7.val[1]));

  q3 = q2tmp8.val[0];
  q4 = q2tmp8.val[1];
  q5 = q2tmp9.val[0];
  q6 = q2tmp9.val[1];
  q7 = q2tmp10.val[0];
  q8 = q2tmp10.val[1];
  q9 = q2tmp11.val[0];
  q10 = q2tmp11.val[1];

  ud = u - 4;  // low halves hold the u plane rows
  vst1_u8(ud, vget_low_u8(q3));
  ud += pitch;
  vst1_u8(ud, vget_low_u8(q4));
  ud += pitch;
  vst1_u8(ud, vget_low_u8(q5));
  ud += pitch;
  vst1_u8(ud, vget_low_u8(q6));
  ud += pitch;
  vst1_u8(ud, vget_low_u8(q7));
  ud += pitch;
  vst1_u8(ud, vget_low_u8(q8));
  ud += pitch;
  vst1_u8(ud, vget_low_u8(q9));
  ud += pitch;
  vst1_u8(ud, vget_low_u8(q10));

  vd = v - 4;  // high halves hold the v plane rows
  vst1_u8(vd, vget_high_u8(q3));
  vd += pitch;
  vst1_u8(vd, vget_high_u8(q4));
  vd += pitch;
  vst1_u8(vd, vget_high_u8(q5));
  vd += pitch;
  vst1_u8(vd, vget_high_u8(q6));
  vd += pitch;
  vst1_u8(vd, vget_high_u8(q7));
  vd += pitch;
  vst1_u8(vd, vget_high_u8(q8));
  vd += pitch;
  vst1_u8(vd, vget_high_u8(q9));
  vd += pitch;
  vst1_u8(vd, vget_high_u8(q10));
}