1/* 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <arm_neon.h> 12 13#ifdef _MSC_VER 14#define __builtin_prefetch(x) 15#endif 16 17static const int8_t vp8_sub_pel_filters[8][8] = { 18 {0, 0, 128, 0, 0, 0, 0, 0}, /* note that 1/8 pel positionyys are */ 19 {0, -6, 123, 12, -1, 0, 0, 0}, /* just as per alpha -0.5 bicubic */ 20 {2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */ 21 {0, -9, 93, 50, -6, 0, 0, 0}, 22 {3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */ 23 {0, -6, 50, 93, -9, 0, 0, 0}, 24 {1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */ 25 {0, -1, 12, 123, -6, 0, 0, 0}, 26}; 27 28void vp8_sixtap_predict4x4_neon( 29 unsigned char *src_ptr, 30 int src_pixels_per_line, 31 int xoffset, 32 int yoffset, 33 unsigned char *dst_ptr, 34 int dst_pitch) { 35 unsigned char *src; 36 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d18u8, d19u8, d20u8, d21u8; 37 uint8x8_t d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; 38 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; 39 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; 40 uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; 41 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; 42 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; 43 uint8x16_t q3u8, q4u8, q5u8, q6u8, q11u8; 44 uint64x2_t q3u64, q4u64, q5u64, q6u64, q9u64, q10u64; 45 uint32x2x2_t d0u32x2, d1u32x2; 46 47 if (xoffset == 0) { // secondpass_filter4x4_only 48 uint32x2_t d27u32 = vdup_n_u32(0); 49 uint32x2_t d28u32 = vdup_n_u32(0); 50 uint32x2_t d29u32 = vdup_n_u32(0); 51 uint32x2_t d30u32 = vdup_n_u32(0); 52 uint32x2_t d31u32 = vdup_n_u32(0); 53 54 // 
load second_pass filter 55 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); 56 d0s8 = vdup_lane_s8(dtmps8, 0); 57 d1s8 = vdup_lane_s8(dtmps8, 1); 58 d2s8 = vdup_lane_s8(dtmps8, 2); 59 d3s8 = vdup_lane_s8(dtmps8, 3); 60 d4s8 = vdup_lane_s8(dtmps8, 4); 61 d5s8 = vdup_lane_s8(dtmps8, 5); 62 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); 63 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); 64 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); 65 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); 66 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); 67 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); 68 69 // load src data 70 src = src_ptr - src_pixels_per_line * 2; 71 d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 0); 72 src += src_pixels_per_line; 73 d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 1); 74 src += src_pixels_per_line; 75 d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 0); 76 src += src_pixels_per_line; 77 d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 1); 78 src += src_pixels_per_line; 79 d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 0); 80 src += src_pixels_per_line; 81 d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 1); 82 src += src_pixels_per_line; 83 d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 0); 84 src += src_pixels_per_line; 85 d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 1); 86 src += src_pixels_per_line; 87 d31u32 = vld1_lane_u32((const uint32_t *)src, d31u32, 0); 88 89 d27u8 = vreinterpret_u8_u32(d27u32); 90 d28u8 = vreinterpret_u8_u32(d28u32); 91 d29u8 = vreinterpret_u8_u32(d29u32); 92 d30u8 = vreinterpret_u8_u32(d30u32); 93 d31u8 = vreinterpret_u8_u32(d31u32); 94 95 d23u8 = vext_u8(d27u8, d28u8, 4); 96 d24u8 = vext_u8(d28u8, d29u8, 4); 97 d25u8 = vext_u8(d29u8, d30u8, 4); 98 d26u8 = vext_u8(d30u8, d31u8, 4); 99 100 q3u16 = vmull_u8(d27u8, d0u8); 101 q4u16 = vmull_u8(d28u8, d0u8); 102 q5u16 = vmull_u8(d25u8, d5u8); 103 q6u16 = vmull_u8(d26u8, d5u8); 104 105 q3u16 = vmlsl_u8(q3u16, d29u8, d4u8); 106 q4u16 = vmlsl_u8(q4u16, d30u8, 
d4u8); 107 q5u16 = vmlsl_u8(q5u16, d23u8, d1u8); 108 q6u16 = vmlsl_u8(q6u16, d24u8, d1u8); 109 110 q3u16 = vmlal_u8(q3u16, d28u8, d2u8); 111 q4u16 = vmlal_u8(q4u16, d29u8, d2u8); 112 q5u16 = vmlal_u8(q5u16, d24u8, d3u8); 113 q6u16 = vmlal_u8(q6u16, d25u8, d3u8); 114 115 q3s16 = vreinterpretq_s16_u16(q3u16); 116 q4s16 = vreinterpretq_s16_u16(q4u16); 117 q5s16 = vreinterpretq_s16_u16(q5u16); 118 q6s16 = vreinterpretq_s16_u16(q6u16); 119 120 q5s16 = vqaddq_s16(q5s16, q3s16); 121 q6s16 = vqaddq_s16(q6s16, q4s16); 122 123 d3u8 = vqrshrun_n_s16(q5s16, 7); 124 d4u8 = vqrshrun_n_s16(q6s16, 7); 125 126 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0); 127 dst_ptr += dst_pitch; 128 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1); 129 dst_ptr += dst_pitch; 130 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0); 131 dst_ptr += dst_pitch; 132 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1); 133 return; 134 } 135 136 // load first_pass filter 137 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); 138 d0s8 = vdup_lane_s8(dtmps8, 0); 139 d1s8 = vdup_lane_s8(dtmps8, 1); 140 d2s8 = vdup_lane_s8(dtmps8, 2); 141 d3s8 = vdup_lane_s8(dtmps8, 3); 142 d4s8 = vdup_lane_s8(dtmps8, 4); 143 d5s8 = vdup_lane_s8(dtmps8, 5); 144 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); 145 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); 146 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); 147 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); 148 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); 149 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); 150 151 // First pass: output_height lines x output_width columns (9x4) 152 153 if (yoffset == 0) // firstpass_filter4x4_only 154 src = src_ptr - 2; 155 else 156 src = src_ptr - 2 - (src_pixels_per_line * 2); 157 158 q3u8 = vld1q_u8(src); 159 src += src_pixels_per_line; 160 q4u8 = vld1q_u8(src); 161 src += src_pixels_per_line; 162 q5u8 = vld1q_u8(src); 163 src += src_pixels_per_line; 164 q6u8 = vld1q_u8(src); 165 src += src_pixels_per_line; 166 
167 d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); 168 d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); 169 d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); 170 d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); 171 172 // vswp here 173 q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8)); 174 q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8)); 175 176 d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19 177 vreinterpret_u32_u8(d19u8)); 178 d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21 179 vreinterpret_u32_u8(d21u8)); 180 q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8); 181 q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8); 182 183 // keep original src data in q4 q6 184 q4u64 = vreinterpretq_u64_u8(q3u8); 185 q6u64 = vreinterpretq_u64_u8(q5u8); 186 187 d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7 188 vreinterpret_u32_u8(vget_high_u8(q3u8))); 189 d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11 190 vreinterpret_u32_u8(vget_high_u8(q5u8))); 191 q9u64 = vshrq_n_u64(q4u64, 8); 192 q10u64 = vshrq_n_u64(q6u64, 8); 193 q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8); 194 q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8); 195 196 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 197 vreinterpret_u32_u64(vget_high_u64(q9u64))); 198 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 199 vreinterpret_u32_u64(vget_high_u64(q10u64))); 200 q3u64 = vshrq_n_u64(q4u64, 32); 201 q5u64 = vshrq_n_u64(q6u64, 32); 202 q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8); 203 q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8); 204 205 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 206 vreinterpret_u32_u64(vget_high_u64(q3u64))); 207 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 208 
vreinterpret_u32_u64(vget_high_u64(q5u64))); 209 q9u64 = vshrq_n_u64(q4u64, 16); 210 q10u64 = vshrq_n_u64(q6u64, 16); 211 q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8); 212 q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8); 213 214 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 215 vreinterpret_u32_u64(vget_high_u64(q9u64))); 216 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 217 vreinterpret_u32_u64(vget_high_u64(q10u64))); 218 q3u64 = vshrq_n_u64(q4u64, 24); 219 q5u64 = vshrq_n_u64(q6u64, 24); 220 q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8); 221 q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8); 222 223 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 224 vreinterpret_u32_u64(vget_high_u64(q3u64))); 225 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 226 vreinterpret_u32_u64(vget_high_u64(q5u64))); 227 q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8); 228 q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8); 229 230 q7s16 = vreinterpretq_s16_u16(q7u16); 231 q8s16 = vreinterpretq_s16_u16(q8u16); 232 q9s16 = vreinterpretq_s16_u16(q9u16); 233 q10s16 = vreinterpretq_s16_u16(q10u16); 234 q7s16 = vqaddq_s16(q7s16, q9s16); 235 q8s16 = vqaddq_s16(q8s16, q10s16); 236 237 d27u8 = vqrshrun_n_s16(q7s16, 7); 238 d28u8 = vqrshrun_n_s16(q8s16, 7); 239 240 if (yoffset == 0) { // firstpass_filter4x4_only 241 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 0); 242 dst_ptr += dst_pitch; 243 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 1); 244 dst_ptr += dst_pitch; 245 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0); 246 dst_ptr += dst_pitch; 247 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1); 248 return; 249 } 250 251 // First Pass on rest 5-line data 252 q3u8 = vld1q_u8(src); 253 src += src_pixels_per_line; 254 q4u8 = 
vld1q_u8(src); 255 src += src_pixels_per_line; 256 q5u8 = vld1q_u8(src); 257 src += src_pixels_per_line; 258 q6u8 = vld1q_u8(src); 259 src += src_pixels_per_line; 260 q11u8 = vld1q_u8(src); 261 262 d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); 263 d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); 264 d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); 265 d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); 266 267 // vswp here 268 q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8)); 269 q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8)); 270 271 d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19 272 vreinterpret_u32_u8(d19u8)); 273 d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21 274 vreinterpret_u32_u8(d21u8)); 275 d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 5); 276 q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8); 277 q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8); 278 q12u16 = vmull_u8(d31u8, d5u8); 279 280 q4u64 = vreinterpretq_u64_u8(q3u8); 281 q6u64 = vreinterpretq_u64_u8(q5u8); 282 283 d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7 284 vreinterpret_u32_u8(vget_high_u8(q3u8))); 285 d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11 286 vreinterpret_u32_u8(vget_high_u8(q5u8))); 287 q9u64 = vshrq_n_u64(q4u64, 8); 288 q10u64 = vshrq_n_u64(q6u64, 8); 289 q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8); 290 q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8); 291 q12u16 = vmlal_u8(q12u16, vget_low_u8(q11u8), d0u8); 292 293 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 294 vreinterpret_u32_u64(vget_high_u64(q9u64))); 295 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 296 vreinterpret_u32_u64(vget_high_u64(q10u64))); 297 q3u64 = vshrq_n_u64(q4u64, 32); 298 q5u64 = vshrq_n_u64(q6u64, 32); 299 d31u8 = vext_u8(vget_low_u8(q11u8), 
vget_high_u8(q11u8), 1); 300 q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8); 301 q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8); 302 q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); 303 304 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 305 vreinterpret_u32_u64(vget_high_u64(q3u64))); 306 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 307 vreinterpret_u32_u64(vget_high_u64(q5u64))); 308 q9u64 = vshrq_n_u64(q4u64, 16); 309 q10u64 = vshrq_n_u64(q6u64, 16); 310 d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 4); 311 q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8); 312 q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8); 313 q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); 314 315 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 316 vreinterpret_u32_u64(vget_high_u64(q9u64))); 317 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 318 vreinterpret_u32_u64(vget_high_u64(q10u64))); 319 q3u64 = vshrq_n_u64(q4u64, 24); 320 q5u64 = vshrq_n_u64(q6u64, 24); 321 d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 2); 322 q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8); 323 q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8); 324 q12u16 = vmlal_u8(q12u16, d31u8, d2u8); 325 326 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 327 vreinterpret_u32_u64(vget_high_u64(q3u64))); 328 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 329 vreinterpret_u32_u64(vget_high_u64(q5u64))); 330 d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 3); 331 q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8); 332 q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8); 333 q11u16 = vmull_u8(d31u8, d3u8); 334 335 q7s16 = vreinterpretq_s16_u16(q7u16); 336 q8s16 = vreinterpretq_s16_u16(q8u16); 337 q9s16 = vreinterpretq_s16_u16(q9u16); 338 
q10s16 = vreinterpretq_s16_u16(q10u16); 339 q11s16 = vreinterpretq_s16_u16(q11u16); 340 q12s16 = vreinterpretq_s16_u16(q12u16); 341 q7s16 = vqaddq_s16(q7s16, q9s16); 342 q8s16 = vqaddq_s16(q8s16, q10s16); 343 q12s16 = vqaddq_s16(q12s16, q11s16); 344 345 d29u8 = vqrshrun_n_s16(q7s16, 7); 346 d30u8 = vqrshrun_n_s16(q8s16, 7); 347 d31u8 = vqrshrun_n_s16(q12s16, 7); 348 349 // Second pass: 4x4 350 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); 351 d0s8 = vdup_lane_s8(dtmps8, 0); 352 d1s8 = vdup_lane_s8(dtmps8, 1); 353 d2s8 = vdup_lane_s8(dtmps8, 2); 354 d3s8 = vdup_lane_s8(dtmps8, 3); 355 d4s8 = vdup_lane_s8(dtmps8, 4); 356 d5s8 = vdup_lane_s8(dtmps8, 5); 357 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); 358 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); 359 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); 360 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); 361 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); 362 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); 363 364 d23u8 = vext_u8(d27u8, d28u8, 4); 365 d24u8 = vext_u8(d28u8, d29u8, 4); 366 d25u8 = vext_u8(d29u8, d30u8, 4); 367 d26u8 = vext_u8(d30u8, d31u8, 4); 368 369 q3u16 = vmull_u8(d27u8, d0u8); 370 q4u16 = vmull_u8(d28u8, d0u8); 371 q5u16 = vmull_u8(d25u8, d5u8); 372 q6u16 = vmull_u8(d26u8, d5u8); 373 374 q3u16 = vmlsl_u8(q3u16, d29u8, d4u8); 375 q4u16 = vmlsl_u8(q4u16, d30u8, d4u8); 376 q5u16 = vmlsl_u8(q5u16, d23u8, d1u8); 377 q6u16 = vmlsl_u8(q6u16, d24u8, d1u8); 378 379 q3u16 = vmlal_u8(q3u16, d28u8, d2u8); 380 q4u16 = vmlal_u8(q4u16, d29u8, d2u8); 381 q5u16 = vmlal_u8(q5u16, d24u8, d3u8); 382 q6u16 = vmlal_u8(q6u16, d25u8, d3u8); 383 384 q3s16 = vreinterpretq_s16_u16(q3u16); 385 q4s16 = vreinterpretq_s16_u16(q4u16); 386 q5s16 = vreinterpretq_s16_u16(q5u16); 387 q6s16 = vreinterpretq_s16_u16(q6u16); 388 389 q5s16 = vqaddq_s16(q5s16, q3s16); 390 q6s16 = vqaddq_s16(q6s16, q4s16); 391 392 d3u8 = vqrshrun_n_s16(q5s16, 7); 393 d4u8 = vqrshrun_n_s16(q6s16, 7); 394 395 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0); 396 dst_ptr += 
dst_pitch; 397 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1); 398 dst_ptr += dst_pitch; 399 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0); 400 dst_ptr += dst_pitch; 401 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1); 402 return; 403} 404 405void vp8_sixtap_predict8x4_neon( 406 unsigned char *src_ptr, 407 int src_pixels_per_line, 408 int xoffset, 409 int yoffset, 410 unsigned char *dst_ptr, 411 int dst_pitch) { 412 unsigned char *src; 413 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; 414 uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8; 415 uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8; 416 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; 417 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; 418 uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; 419 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; 420 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; 421 uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8; 422 423 if (xoffset == 0) { // secondpass_filter8x4_only 424 // load second_pass filter 425 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); 426 d0s8 = vdup_lane_s8(dtmps8, 0); 427 d1s8 = vdup_lane_s8(dtmps8, 1); 428 d2s8 = vdup_lane_s8(dtmps8, 2); 429 d3s8 = vdup_lane_s8(dtmps8, 3); 430 d4s8 = vdup_lane_s8(dtmps8, 4); 431 d5s8 = vdup_lane_s8(dtmps8, 5); 432 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); 433 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); 434 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); 435 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); 436 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); 437 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); 438 439 // load src data 440 src = src_ptr - src_pixels_per_line * 2; 441 d22u8 = vld1_u8(src); 442 src += src_pixels_per_line; 443 d23u8 = vld1_u8(src); 444 src += src_pixels_per_line; 445 d24u8 = vld1_u8(src); 446 src += src_pixels_per_line; 447 d25u8 = vld1_u8(src); 448 src += src_pixels_per_line; 449 d26u8 = vld1_u8(src); 450 src += src_pixels_per_line; 451 d27u8 = vld1_u8(src); 452 src 
+= src_pixels_per_line; 453 d28u8 = vld1_u8(src); 454 src += src_pixels_per_line; 455 d29u8 = vld1_u8(src); 456 src += src_pixels_per_line; 457 d30u8 = vld1_u8(src); 458 459 q3u16 = vmull_u8(d22u8, d0u8); 460 q4u16 = vmull_u8(d23u8, d0u8); 461 q5u16 = vmull_u8(d24u8, d0u8); 462 q6u16 = vmull_u8(d25u8, d0u8); 463 464 q3u16 = vmlsl_u8(q3u16, d23u8, d1u8); 465 q4u16 = vmlsl_u8(q4u16, d24u8, d1u8); 466 q5u16 = vmlsl_u8(q5u16, d25u8, d1u8); 467 q6u16 = vmlsl_u8(q6u16, d26u8, d1u8); 468 469 q3u16 = vmlsl_u8(q3u16, d26u8, d4u8); 470 q4u16 = vmlsl_u8(q4u16, d27u8, d4u8); 471 q5u16 = vmlsl_u8(q5u16, d28u8, d4u8); 472 q6u16 = vmlsl_u8(q6u16, d29u8, d4u8); 473 474 q3u16 = vmlal_u8(q3u16, d24u8, d2u8); 475 q4u16 = vmlal_u8(q4u16, d25u8, d2u8); 476 q5u16 = vmlal_u8(q5u16, d26u8, d2u8); 477 q6u16 = vmlal_u8(q6u16, d27u8, d2u8); 478 479 q3u16 = vmlal_u8(q3u16, d27u8, d5u8); 480 q4u16 = vmlal_u8(q4u16, d28u8, d5u8); 481 q5u16 = vmlal_u8(q5u16, d29u8, d5u8); 482 q6u16 = vmlal_u8(q6u16, d30u8, d5u8); 483 484 q7u16 = vmull_u8(d25u8, d3u8); 485 q8u16 = vmull_u8(d26u8, d3u8); 486 q9u16 = vmull_u8(d27u8, d3u8); 487 q10u16 = vmull_u8(d28u8, d3u8); 488 489 q3s16 = vreinterpretq_s16_u16(q3u16); 490 q4s16 = vreinterpretq_s16_u16(q4u16); 491 q5s16 = vreinterpretq_s16_u16(q5u16); 492 q6s16 = vreinterpretq_s16_u16(q6u16); 493 q7s16 = vreinterpretq_s16_u16(q7u16); 494 q8s16 = vreinterpretq_s16_u16(q8u16); 495 q9s16 = vreinterpretq_s16_u16(q9u16); 496 q10s16 = vreinterpretq_s16_u16(q10u16); 497 498 q7s16 = vqaddq_s16(q7s16, q3s16); 499 q8s16 = vqaddq_s16(q8s16, q4s16); 500 q9s16 = vqaddq_s16(q9s16, q5s16); 501 q10s16 = vqaddq_s16(q10s16, q6s16); 502 503 d6u8 = vqrshrun_n_s16(q7s16, 7); 504 d7u8 = vqrshrun_n_s16(q8s16, 7); 505 d8u8 = vqrshrun_n_s16(q9s16, 7); 506 d9u8 = vqrshrun_n_s16(q10s16, 7); 507 508 vst1_u8(dst_ptr, d6u8); 509 dst_ptr += dst_pitch; 510 vst1_u8(dst_ptr, d7u8); 511 dst_ptr += dst_pitch; 512 vst1_u8(dst_ptr, d8u8); 513 dst_ptr += dst_pitch; 514 vst1_u8(dst_ptr, d9u8); 515 
return; 516 } 517 518 // load first_pass filter 519 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); 520 d0s8 = vdup_lane_s8(dtmps8, 0); 521 d1s8 = vdup_lane_s8(dtmps8, 1); 522 d2s8 = vdup_lane_s8(dtmps8, 2); 523 d3s8 = vdup_lane_s8(dtmps8, 3); 524 d4s8 = vdup_lane_s8(dtmps8, 4); 525 d5s8 = vdup_lane_s8(dtmps8, 5); 526 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); 527 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); 528 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); 529 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); 530 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); 531 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); 532 533 // First pass: output_height lines x output_width columns (9x4) 534 if (yoffset == 0) // firstpass_filter4x4_only 535 src = src_ptr - 2; 536 else 537 src = src_ptr - 2 - (src_pixels_per_line * 2); 538 q3u8 = vld1q_u8(src); 539 src += src_pixels_per_line; 540 q4u8 = vld1q_u8(src); 541 src += src_pixels_per_line; 542 q5u8 = vld1q_u8(src); 543 src += src_pixels_per_line; 544 q6u8 = vld1q_u8(src); 545 546 q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8); 547 q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8); 548 q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8); 549 q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8); 550 551 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); 552 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); 553 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); 554 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); 555 556 q7u16 = vmlsl_u8(q7u16, d28u8, d1u8); 557 q8u16 = vmlsl_u8(q8u16, d29u8, d1u8); 558 q9u16 = vmlsl_u8(q9u16, d30u8, d1u8); 559 q10u16 = vmlsl_u8(q10u16, d31u8, d1u8); 560 561 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); 562 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); 563 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); 564 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); 565 566 q7u16 = vmlsl_u8(q7u16, d28u8, d4u8); 567 q8u16 = vmlsl_u8(q8u16, d29u8, d4u8); 568 q9u16 = vmlsl_u8(q9u16, d30u8, 
d4u8); 569 q10u16 = vmlsl_u8(q10u16, d31u8, d4u8); 570 571 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); 572 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); 573 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); 574 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); 575 576 q7u16 = vmlal_u8(q7u16, d28u8, d2u8); 577 q8u16 = vmlal_u8(q8u16, d29u8, d2u8); 578 q9u16 = vmlal_u8(q9u16, d30u8, d2u8); 579 q10u16 = vmlal_u8(q10u16, d31u8, d2u8); 580 581 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); 582 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); 583 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); 584 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); 585 586 q7u16 = vmlal_u8(q7u16, d28u8, d5u8); 587 q8u16 = vmlal_u8(q8u16, d29u8, d5u8); 588 q9u16 = vmlal_u8(q9u16, d30u8, d5u8); 589 q10u16 = vmlal_u8(q10u16, d31u8, d5u8); 590 591 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); 592 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); 593 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); 594 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); 595 596 q3u16 = vmull_u8(d28u8, d3u8); 597 q4u16 = vmull_u8(d29u8, d3u8); 598 q5u16 = vmull_u8(d30u8, d3u8); 599 q6u16 = vmull_u8(d31u8, d3u8); 600 601 q3s16 = vreinterpretq_s16_u16(q3u16); 602 q4s16 = vreinterpretq_s16_u16(q4u16); 603 q5s16 = vreinterpretq_s16_u16(q5u16); 604 q6s16 = vreinterpretq_s16_u16(q6u16); 605 q7s16 = vreinterpretq_s16_u16(q7u16); 606 q8s16 = vreinterpretq_s16_u16(q8u16); 607 q9s16 = vreinterpretq_s16_u16(q9u16); 608 q10s16 = vreinterpretq_s16_u16(q10u16); 609 610 q7s16 = vqaddq_s16(q7s16, q3s16); 611 q8s16 = vqaddq_s16(q8s16, q4s16); 612 q9s16 = vqaddq_s16(q9s16, q5s16); 613 q10s16 = vqaddq_s16(q10s16, q6s16); 614 615 d22u8 = vqrshrun_n_s16(q7s16, 7); 616 d23u8 = vqrshrun_n_s16(q8s16, 7); 617 d24u8 = vqrshrun_n_s16(q9s16, 7); 618 d25u8 = vqrshrun_n_s16(q10s16, 7); 619 620 if (yoffset == 0) { // 
firstpass_filter8x4_only 621 vst1_u8(dst_ptr, d22u8); 622 dst_ptr += dst_pitch; 623 vst1_u8(dst_ptr, d23u8); 624 dst_ptr += dst_pitch; 625 vst1_u8(dst_ptr, d24u8); 626 dst_ptr += dst_pitch; 627 vst1_u8(dst_ptr, d25u8); 628 return; 629 } 630 631 // First Pass on rest 5-line data 632 src += src_pixels_per_line; 633 q3u8 = vld1q_u8(src); 634 src += src_pixels_per_line; 635 q4u8 = vld1q_u8(src); 636 src += src_pixels_per_line; 637 q5u8 = vld1q_u8(src); 638 src += src_pixels_per_line; 639 q6u8 = vld1q_u8(src); 640 src += src_pixels_per_line; 641 q7u8 = vld1q_u8(src); 642 643 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); 644 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); 645 q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); 646 q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8); 647 q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8); 648 649 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); 650 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); 651 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); 652 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); 653 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1); 654 655 q8u16 = vmlsl_u8(q8u16, d27u8, d1u8); 656 q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); 657 q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); 658 q11u16 = vmlsl_u8(q11u16, d30u8, d1u8); 659 q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); 660 661 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); 662 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); 663 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); 664 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); 665 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4); 666 667 q8u16 = vmlsl_u8(q8u16, d27u8, d4u8); 668 q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); 669 q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); 670 q11u16 = vmlsl_u8(q11u16, d30u8, d4u8); 671 q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); 672 673 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); 674 d28u8 = vext_u8(vget_low_u8(q4u8), 
vget_high_u8(q4u8), 2); 675 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); 676 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); 677 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2); 678 679 q8u16 = vmlal_u8(q8u16, d27u8, d2u8); 680 q9u16 = vmlal_u8(q9u16, d28u8, d2u8); 681 q10u16 = vmlal_u8(q10u16, d29u8, d2u8); 682 q11u16 = vmlal_u8(q11u16, d30u8, d2u8); 683 q12u16 = vmlal_u8(q12u16, d31u8, d2u8); 684 685 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); 686 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); 687 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); 688 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); 689 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5); 690 691 q8u16 = vmlal_u8(q8u16, d27u8, d5u8); 692 q9u16 = vmlal_u8(q9u16, d28u8, d5u8); 693 q10u16 = vmlal_u8(q10u16, d29u8, d5u8); 694 q11u16 = vmlal_u8(q11u16, d30u8, d5u8); 695 q12u16 = vmlal_u8(q12u16, d31u8, d5u8); 696 697 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); 698 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); 699 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); 700 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); 701 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3); 702 703 q3u16 = vmull_u8(d27u8, d3u8); 704 q4u16 = vmull_u8(d28u8, d3u8); 705 q5u16 = vmull_u8(d29u8, d3u8); 706 q6u16 = vmull_u8(d30u8, d3u8); 707 q7u16 = vmull_u8(d31u8, d3u8); 708 709 q3s16 = vreinterpretq_s16_u16(q3u16); 710 q4s16 = vreinterpretq_s16_u16(q4u16); 711 q5s16 = vreinterpretq_s16_u16(q5u16); 712 q6s16 = vreinterpretq_s16_u16(q6u16); 713 q7s16 = vreinterpretq_s16_u16(q7u16); 714 q8s16 = vreinterpretq_s16_u16(q8u16); 715 q9s16 = vreinterpretq_s16_u16(q9u16); 716 q10s16 = vreinterpretq_s16_u16(q10u16); 717 q11s16 = vreinterpretq_s16_u16(q11u16); 718 q12s16 = vreinterpretq_s16_u16(q12u16); 719 720 q8s16 = vqaddq_s16(q8s16, q3s16); 721 q9s16 = vqaddq_s16(q9s16, q4s16); 722 q10s16 = 
vqaddq_s16(q10s16, q5s16); 723 q11s16 = vqaddq_s16(q11s16, q6s16); 724 q12s16 = vqaddq_s16(q12s16, q7s16); 725 726 d26u8 = vqrshrun_n_s16(q8s16, 7); 727 d27u8 = vqrshrun_n_s16(q9s16, 7); 728 d28u8 = vqrshrun_n_s16(q10s16, 7); 729 d29u8 = vqrshrun_n_s16(q11s16, 7); 730 d30u8 = vqrshrun_n_s16(q12s16, 7); 731 732 // Second pass: 8x4 733 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); 734 d0s8 = vdup_lane_s8(dtmps8, 0); 735 d1s8 = vdup_lane_s8(dtmps8, 1); 736 d2s8 = vdup_lane_s8(dtmps8, 2); 737 d3s8 = vdup_lane_s8(dtmps8, 3); 738 d4s8 = vdup_lane_s8(dtmps8, 4); 739 d5s8 = vdup_lane_s8(dtmps8, 5); 740 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); 741 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); 742 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); 743 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); 744 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); 745 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); 746 747 q3u16 = vmull_u8(d22u8, d0u8); 748 q4u16 = vmull_u8(d23u8, d0u8); 749 q5u16 = vmull_u8(d24u8, d0u8); 750 q6u16 = vmull_u8(d25u8, d0u8); 751 752 q3u16 = vmlsl_u8(q3u16, d23u8, d1u8); 753 q4u16 = vmlsl_u8(q4u16, d24u8, d1u8); 754 q5u16 = vmlsl_u8(q5u16, d25u8, d1u8); 755 q6u16 = vmlsl_u8(q6u16, d26u8, d1u8); 756 757 q3u16 = vmlsl_u8(q3u16, d26u8, d4u8); 758 q4u16 = vmlsl_u8(q4u16, d27u8, d4u8); 759 q5u16 = vmlsl_u8(q5u16, d28u8, d4u8); 760 q6u16 = vmlsl_u8(q6u16, d29u8, d4u8); 761 762 q3u16 = vmlal_u8(q3u16, d24u8, d2u8); 763 q4u16 = vmlal_u8(q4u16, d25u8, d2u8); 764 q5u16 = vmlal_u8(q5u16, d26u8, d2u8); 765 q6u16 = vmlal_u8(q6u16, d27u8, d2u8); 766 767 q3u16 = vmlal_u8(q3u16, d27u8, d5u8); 768 q4u16 = vmlal_u8(q4u16, d28u8, d5u8); 769 q5u16 = vmlal_u8(q5u16, d29u8, d5u8); 770 q6u16 = vmlal_u8(q6u16, d30u8, d5u8); 771 772 q7u16 = vmull_u8(d25u8, d3u8); 773 q8u16 = vmull_u8(d26u8, d3u8); 774 q9u16 = vmull_u8(d27u8, d3u8); 775 q10u16 = vmull_u8(d28u8, d3u8); 776 777 q3s16 = vreinterpretq_s16_u16(q3u16); 778 q4s16 = vreinterpretq_s16_u16(q4u16); 779 q5s16 = vreinterpretq_s16_u16(q5u16); 780 q6s16 = 
vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);

    // Saturating add of the centre-tap products, then round-shift by 7
    // (the six taps sum to 128) and narrow back to unsigned bytes.
    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);

    d6u8 = vqrshrun_n_s16(q7s16, 7);
    d7u8 = vqrshrun_n_s16(q8s16, 7);
    d8u8 = vqrshrun_n_s16(q9s16, 7);
    d9u8 = vqrshrun_n_s16(q10s16, 7);

    // Store the four filtered 8-pixel output rows.
    vst1_u8(dst_ptr, d6u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d7u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d8u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d9u8);
    return;
}

/*
 * 8x8 six-tap sub-pixel prediction.
 *
 * src_ptr / src_pixels_per_line : source block and its stride.
 * xoffset / yoffset             : 1/8-pel sub-pixel position (0..7) used to
 *                                 index vp8_sub_pel_filters for the
 *                                 horizontal and vertical filters.
 * dst_ptr / dst_pitch           : 8x8 destination block and its stride.
 *
 * xoffset == 0 -> vertical-only filtering; yoffset == 0 -> horizontal-only;
 * otherwise a horizontal pass into tmp[] (13 rows x 8 cols) followed by a
 * vertical pass.  Negative filter taps are applied as |tap| with
 * multiply-subtract (vmlsl) instead of multiply-add (vmlal).
 */
void vp8_sixtap_predict8x8_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    unsigned char *src, *tmpp;
    unsigned char tmp[64];
    int i;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
    uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;

    if (xoffset == 0) {  // secondpass_filter8x8_only
        // load second_pass filter
        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
        d0s8 = vdup_lane_s8(dtmps8, 0);
        d1s8 = vdup_lane_s8(dtmps8, 1);
        d2s8 = vdup_lane_s8(dtmps8, 2);
        d3s8 = vdup_lane_s8(dtmps8, 3);
        d4s8 = vdup_lane_s8(dtmps8, 4);
        d5s8 = vdup_lane_s8(dtmps8, 5);
        // Absolute values of the taps; negative taps use vmlsl below.
        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

        // load src data: 13 rows (8 output rows + 5 extra for the 6 taps)
        src = src_ptr - src_pixels_per_line * 2;
        d18u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d19u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d20u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d21u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d22u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d23u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d24u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d25u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d26u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d27u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d28u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d29u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d30u8 = vld1_u8(src);

        // Two iterations of four output rows each (vertical filter).
        for (i = 2; i > 0; i--) {
            q3u16 = vmull_u8(d18u8, d0u8);
            q4u16 = vmull_u8(d19u8, d0u8);
            q5u16 = vmull_u8(d20u8, d0u8);
            q6u16 = vmull_u8(d21u8, d0u8);

            q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
            q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
            q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
            q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

            q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
            q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
            q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
            q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

            q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
            q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
            q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
            q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

            q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
            q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
            q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
            q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

            // Centre tap kept in separate accumulators so the final
            // combine can saturate (vqaddq).
            q7u16 = vmull_u8(d21u8, d3u8);
            q8u16 = vmull_u8(d22u8, d3u8);
            q9u16 = vmull_u8(d23u8, d3u8);
            q10u16 = vmull_u8(d24u8, d3u8);

            q3s16 = vreinterpretq_s16_u16(q3u16);
            q4s16 = vreinterpretq_s16_u16(q4u16);
            q5s16 = vreinterpretq_s16_u16(q5u16);
            q6s16 = vreinterpretq_s16_u16(q6u16);
            q7s16 = vreinterpretq_s16_u16(q7u16);
            q8s16 = vreinterpretq_s16_u16(q8u16);
            q9s16 = vreinterpretq_s16_u16(q9u16);
            q10s16 = vreinterpretq_s16_u16(q10u16);

            q7s16 = vqaddq_s16(q7s16, q3s16);
            q8s16 = vqaddq_s16(q8s16, q4s16);
            q9s16 = vqaddq_s16(q9s16, q5s16);
            q10s16 = vqaddq_s16(q10s16, q6s16);

            d6u8 = vqrshrun_n_s16(q7s16, 7);
            d7u8 = vqrshrun_n_s16(q8s16, 7);
            d8u8 = vqrshrun_n_s16(q9s16, 7);
            d9u8 = vqrshrun_n_s16(q10s16, 7);

            // Slide the row window down by four for the next iteration.
            d18u8 = d22u8;
            d19u8 = d23u8;
            d20u8 = d24u8;
            d21u8 = d25u8;
            d22u8 = d26u8;
            d23u8 = d27u8;
            d24u8 = d28u8;
            d25u8 = d29u8;
            d26u8 = d30u8;

            vst1_u8(dst_ptr, d6u8);
            dst_ptr += dst_pitch;
            vst1_u8(dst_ptr, d7u8);
            dst_ptr += dst_pitch;
            vst1_u8(dst_ptr, d8u8);
            dst_ptr += dst_pitch;
            vst1_u8(dst_ptr, d9u8);
            dst_ptr += dst_pitch;
        }
        return;
    }

    // load first_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    // First pass (horizontal): 8 rows here plus, when a vertical pass
    // follows, 5 more rows below -- a 13x8 intermediate in tmp[].
    // (Upstream comment said "9x4" -- copy/paste from the 4x4 version.)
    if (yoffset == 0)  // horizontal-only: no extra rows above/below needed
        src = src_ptr - 2;
    else
        src = src_ptr - 2 - (src_pixels_per_line * 2);

    tmpp = tmp;
    for (i = 2; i > 0; i--) {
        q3u8 = vld1q_u8(src);
        src += src_pixels_per_line;
        q4u8 = vld1q_u8(src);
        src += src_pixels_per_line;
        q5u8 = vld1q_u8(src);
        src += src_pixels_per_line;
        q6u8 = vld1q_u8(src);
        src += src_pixels_per_line;

        __builtin_prefetch(src);
        __builtin_prefetch(src + src_pixels_per_line);
        __builtin_prefetch(src + src_pixels_per_line * 2);

        q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
        q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
        q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
        q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);

        // vext picks the pixels at horizontal offsets 1,4,2,5,3 for
        // taps 1..5 (tap 0 used the unshifted row above).
        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);

        q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
        q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
        q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
        q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);

        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);

        q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
        q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
        q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
        q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);

        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);

        q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
        q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
        q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
        q10u16 = vmlal_u8(q10u16, d31u8, d2u8);

        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);

        q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
        q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
        q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
        q10u16 = vmlal_u8(q10u16, d31u8, d5u8);

        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);

        q3u16 = vmull_u8(d28u8, d3u8);
        q4u16 = vmull_u8(d29u8, d3u8);
        q5u16 = vmull_u8(d30u8, d3u8);
        q6u16 = vmull_u8(d31u8, d3u8);

        q3s16 = vreinterpretq_s16_u16(q3u16);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);

        q7s16 = vqaddq_s16(q7s16, q3s16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q9s16 = vqaddq_s16(q9s16, q5s16);
        q10s16 = vqaddq_s16(q10s16, q6s16);

        d22u8 = vqrshrun_n_s16(q7s16, 7);
        d23u8 = vqrshrun_n_s16(q8s16, 7);
        d24u8 = vqrshrun_n_s16(q9s16, 7);
        d25u8 = vqrshrun_n_s16(q10s16, 7);

        if (yoffset == 0) {  // firstpass only: store 4 rows straight to dst
            vst1_u8(dst_ptr, d22u8);
            dst_ptr += dst_pitch;
            vst1_u8(dst_ptr, d23u8);
            dst_ptr += dst_pitch;
            vst1_u8(dst_ptr, d24u8);
            dst_ptr += dst_pitch;
            vst1_u8(dst_ptr, d25u8);
            dst_ptr += dst_pitch;
        } else {  // otherwise buffer them for the vertical pass
            vst1_u8(tmpp, d22u8);
            tmpp += 8;
            vst1_u8(tmpp, d23u8);
            tmpp += 8;
            vst1_u8(tmpp, d24u8);
            tmpp += 8;
            vst1_u8(tmpp, d25u8);
            tmpp += 8;
        }
    }
    if (yoffset == 0)  // horizontal-only: all 8 rows already written
        return;

    // First Pass on rest 5-line data
    q3u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q4u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q5u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q6u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q7u8 = vld1q_u8(src);

    q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
    q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);

    q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
    q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
    q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);

    q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
    q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
    q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);

    q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
    q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
    q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);

    q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
    q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
    q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
    q12u16 = vmlal_u8(q12u16, d31u8, d5u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);

    q3u16 = vmull_u8(d27u8, d3u8);
    q4u16 = vmull_u8(d28u8, d3u8);
    q5u16 = vmull_u8(d29u8, d3u8);
    q6u16 = vmull_u8(d30u8, d3u8);
    q7u16 = vmull_u8(d31u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);
    q11s16 = vreinterpretq_s16_u16(q11u16);
    q12s16 = vreinterpretq_s16_u16(q12u16);

    q8s16 = vqaddq_s16(q8s16, q3s16);
    q9s16 = vqaddq_s16(q9s16, q4s16);
    q10s16 = vqaddq_s16(q10s16, q5s16);
    q11s16 = vqaddq_s16(q11s16, q6s16);
    q12s16 = vqaddq_s16(q12s16, q7s16);

    // The last 5 intermediate rows stay live in d26..d30 registers;
    // they are consumed (not reloaded) by the second pass below.
    d26u8 = vqrshrun_n_s16(q8s16, 7);
    d27u8 = vqrshrun_n_s16(q9s16, 7);
    d28u8 = vqrshrun_n_s16(q10s16, 7);
    d29u8 = vqrshrun_n_s16(q11s16, 7);
    d30u8 = vqrshrun_n_s16(q12s16, 7);

    // Second pass: 8x8
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    // Reload the first 8 intermediate rows from tmp[].
    tmpp = tmp;
    q9u8 = vld1q_u8(tmpp);
    tmpp += 16;
    q10u8 = vld1q_u8(tmpp);
    tmpp += 16;
    q11u8 = vld1q_u8(tmpp);
    tmpp += 16;
    q12u8 = vld1q_u8(tmpp);

    d18u8 = vget_low_u8(q9u8);
    d19u8 = vget_high_u8(q9u8);
    d20u8 = vget_low_u8(q10u8);
    d21u8 = vget_high_u8(q10u8);
    d22u8 = vget_low_u8(q11u8);
    d23u8 = vget_high_u8(q11u8);
    d24u8 = vget_low_u8(q12u8);
    d25u8 = vget_high_u8(q12u8);

    for (i = 2; i > 0; i--) {
        q3u16 = vmull_u8(d18u8, d0u8);
        q4u16 = vmull_u8(d19u8, d0u8);
        q5u16 = vmull_u8(d20u8, d0u8);
        q6u16 = vmull_u8(d21u8, d0u8);

        q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
        q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
        q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
        q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

        q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
        q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
        q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
        q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

        q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
        q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
        q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
        q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

        q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
        q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
        q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
        q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

        q7u16 = vmull_u8(d21u8, d3u8);
        q8u16 = vmull_u8(d22u8, d3u8);
        q9u16 = vmull_u8(d23u8, d3u8);
        q10u16 = vmull_u8(d24u8, d3u8);

        q3s16 = vreinterpretq_s16_u16(q3u16);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);

        q7s16 = vqaddq_s16(q7s16, q3s16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q9s16 = vqaddq_s16(q9s16, q5s16);
        q10s16 = vqaddq_s16(q10s16, q6s16);

        d6u8 = vqrshrun_n_s16(q7s16, 7);
        d7u8 = vqrshrun_n_s16(q8s16, 7);
        d8u8 = vqrshrun_n_s16(q9s16, 7);
        d9u8 = vqrshrun_n_s16(q10s16, 7);

        // Slide the row window down by four for the next iteration.
        d18u8 = d22u8;
        d19u8 = d23u8;
        d20u8 = d24u8;
        d21u8 = d25u8;
        d22u8 = d26u8;
        d23u8 = d27u8;
        d24u8 = d28u8;
        d25u8 = d29u8;
        d26u8 = d30u8;

        vst1_u8(dst_ptr, d6u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d7u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d8u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d9u8);
        dst_ptr += dst_pitch;
    }
    return;
}

/*
 * 16x16 six-tap sub-pixel prediction.  Same structure as the 8x8 version,
 * processed as two 8-wide columns, with a 21-row x 16-column intermediate
 * in tmp[] (336 bytes) when both passes run.
 */
void vp8_sixtap_predict16x16_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    unsigned char *src, *src_tmp, *dst, *tmpp;
    unsigned char tmp[336];
    int i, j;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
    uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
    uint8x8_t d28u8, d29u8, d30u8, d31u8;
    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    uint8x16_t q3u8, q4u8;
    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
    uint16x8_t q11u16, q12u16, q13u16, q15u16;
    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
    int16x8_t q11s16, q12s16, q13s16, q15s16;

    if (xoffset == 0) {
        // secondpass_filter16x16_only: vertical-only path (xoffset == 0).
        // (Upstream comment said "8x8" -- copy/paste; this filters 16x16.)
        // load second_pass filter
        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
        d0s8 = vdup_lane_s8(dtmps8, 0);
        d1s8 = vdup_lane_s8(dtmps8, 1);
        d2s8 = vdup_lane_s8(dtmps8, 2);
        d3s8 = vdup_lane_s8(dtmps8, 3);
        d4s8 = vdup_lane_s8(dtmps8, 4);
        d5s8 = vdup_lane_s8(dtmps8, 5);
        // Absolute values of the taps; negative taps use vmlsl below.
        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

        // load src data: two 8-wide columns, 16 output rows each
        src_tmp = src_ptr - src_pixels_per_line * 2;
        for (i = 0; i < 2; i++) {
            src = src_tmp + i * 8;
            dst = dst_ptr + i * 8;
            // Prime the 5-row window above the block.
            d18u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d19u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d20u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d21u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d22u8 = vld1_u8(src);
            src += src_pixels_per_line;
            // Four iterations of four output rows = 16 rows.
            for (j = 0; j < 4; j++) {
                d23u8 = vld1_u8(src);
                src += src_pixels_per_line;
                d24u8 = vld1_u8(src);
                src += src_pixels_per_line;
                d25u8 = vld1_u8(src);
                src += src_pixels_per_line;
                d26u8 = vld1_u8(src);
                src += src_pixels_per_line;

                q3u16 = vmull_u8(d18u8, d0u8);
                q4u16 = vmull_u8(d19u8, d0u8);
                q5u16 = vmull_u8(d20u8, d0u8);
                q6u16 = vmull_u8(d21u8, d0u8);

                q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
                q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
                q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
                q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

                q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
                q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
                q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
                q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

                q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
                q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
                q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
                q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

                q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
                q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
                q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
                q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

                // Centre tap in separate accumulators for saturating add.
                q7u16 = vmull_u8(d21u8, d3u8);
                q8u16 = vmull_u8(d22u8, d3u8);
                q9u16 = vmull_u8(d23u8, d3u8);
                q10u16 = vmull_u8(d24u8, d3u8);

                q3s16 = vreinterpretq_s16_u16(q3u16);
                q4s16 = vreinterpretq_s16_u16(q4u16);
                q5s16 = vreinterpretq_s16_u16(q5u16);
                q6s16 = vreinterpretq_s16_u16(q6u16);
                q7s16 = vreinterpretq_s16_u16(q7u16);
                q8s16 = vreinterpretq_s16_u16(q8u16);
                q9s16 = vreinterpretq_s16_u16(q9u16);
                q10s16 = vreinterpretq_s16_u16(q10u16);

                q7s16 = vqaddq_s16(q7s16, q3s16);
                q8s16 = vqaddq_s16(q8s16, q4s16);
                q9s16 = vqaddq_s16(q9s16, q5s16);
                q10s16 = vqaddq_s16(q10s16, q6s16);

                d6u8 = vqrshrun_n_s16(q7s16, 7);
                d7u8 = vqrshrun_n_s16(q8s16, 7);
                d8u8 = vqrshrun_n_s16(q9s16, 7);
                d9u8 = vqrshrun_n_s16(q10s16, 7);

                // Slide the 5-row window; next 4 rows reload d23..d26.
                d18u8 = d22u8;
                d19u8 = d23u8;
                d20u8 = d24u8;
                d21u8 = d25u8;
                d22u8 = d26u8;

                vst1_u8(dst, d6u8);
                dst += dst_pitch;
                vst1_u8(dst, d7u8);
                dst += dst_pitch;
                vst1_u8(dst, d8u8);
                dst += dst_pitch;
                vst1_u8(dst, d9u8);
                dst += dst_pitch;
            }
        }
        return;
    }

    // load first_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    // First pass (horizontal).
    // (Upstream comment said "9x4" / "4x4 only" -- copy/paste from the
    // 4x4 version; this path is 16 wide.)
    if (yoffset == 0) {  // firstpass_filter16x16_only: 2 rows per iteration
        src = src_ptr - 2;
        dst = dst_ptr;
        for (i = 0; i < 8; i++) {
            // 24 source pixels per row: 16 outputs + 5-pixel tap overrun.
            d6u8 = vld1_u8(src);
            d7u8 = vld1_u8(src + 8);
            d8u8 = vld1_u8(src + 16);
            src += src_pixels_per_line;
            d9u8 = vld1_u8(src);
            d10u8 = vld1_u8(src + 8);
            d11u8 = vld1_u8(src + 16);
            src += src_pixels_per_line;

            __builtin_prefetch(src);
            __builtin_prefetch(src + src_pixels_per_line);

            q6u16 = vmull_u8(d6u8, d0u8);
            q7u16 = vmull_u8(d7u8, d0u8);
            q8u16 = vmull_u8(d9u8, d0u8);
            q9u16 = vmull_u8(d10u8, d0u8);

            // vext selects pixels at horizontal offsets 1,4,5 first ...
            d20u8 = vext_u8(d6u8, d7u8, 1);
            d21u8 = vext_u8(d9u8, d10u8, 1);
            d22u8 = vext_u8(d7u8, d8u8, 1);
            d23u8 = vext_u8(d10u8, d11u8, 1);
            d24u8 = vext_u8(d6u8, d7u8, 4);
            d25u8 = vext_u8(d9u8, d10u8, 4);
            d26u8 = vext_u8(d7u8, d8u8, 4);
            d27u8 = vext_u8(d10u8, d11u8, 4);
            d28u8 = vext_u8(d6u8, d7u8, 5);
            d29u8 = vext_u8(d9u8, d10u8, 5);

            q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
            q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
            q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
            q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
            q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
            q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
            q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
            q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
            q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
            q8u16 = vmlal_u8(q8u16, d29u8, d5u8);

            // ... then offsets 5 (high half), 2 and 3.
            d20u8 = vext_u8(d7u8, d8u8, 5);
            d21u8 = vext_u8(d10u8, d11u8, 5);
            d22u8 = vext_u8(d6u8, d7u8, 2);
            d23u8 = vext_u8(d9u8, d10u8, 2);
            d24u8 = vext_u8(d7u8, d8u8, 2);
            d25u8 = vext_u8(d10u8, d11u8, 2);
            d26u8 = vext_u8(d6u8, d7u8, 3);
            d27u8 = vext_u8(d9u8, d10u8, 3);
            d28u8 = vext_u8(d7u8, d8u8, 3);
            d29u8 = vext_u8(d10u8, d11u8, 3);

            q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
            q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
            q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
            q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
            q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
            q9u16 = vmlal_u8(q9u16, d25u8, d2u8);

            q10u16 = vmull_u8(d26u8, d3u8);
            q11u16 = vmull_u8(d27u8, d3u8);
            q12u16 = vmull_u8(d28u8, d3u8);
            q15u16 = vmull_u8(d29u8, d3u8);

            q6s16 = vreinterpretq_s16_u16(q6u16);
            q7s16 = vreinterpretq_s16_u16(q7u16);
            q8s16 = vreinterpretq_s16_u16(q8u16);
            q9s16 = vreinterpretq_s16_u16(q9u16);
            q10s16 = vreinterpretq_s16_u16(q10u16);
            q11s16 = vreinterpretq_s16_u16(q11u16);
            q12s16 = vreinterpretq_s16_u16(q12u16);
            q15s16 = vreinterpretq_s16_u16(q15u16);

            q6s16 = vqaddq_s16(q6s16, q10s16);
            q8s16 = vqaddq_s16(q8s16, q11s16);
            q7s16 = vqaddq_s16(q7s16, q12s16);
            q9s16 = vqaddq_s16(q9s16, q15s16);

            d6u8 = vqrshrun_n_s16(q6s16, 7);
            d7u8 = vqrshrun_n_s16(q7s16, 7);
            d8u8 = vqrshrun_n_s16(q8s16, 7);
            d9u8 = vqrshrun_n_s16(q9s16, 7);

            // Recombine the two 8-pixel halves and store 2 full rows.
            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);
            vst1q_u8(dst, q3u8);
            dst += dst_pitch;
            vst1q_u8(dst, q4u8);
            dst += dst_pitch;
        }
        return;
    }

    // Both passes run: horizontal filter 21 rows x 16 cols into tmp[]
    // (7 iterations x 3 rows x 16 bytes = 336 bytes).
    src = src_ptr - 2 - src_pixels_per_line * 2;
    tmpp = tmp;
    for (i = 0; i < 7; i++) {
        // 3 source rows per iteration, 24 pixels each.
        d6u8 = vld1_u8(src);
        d7u8 = vld1_u8(src + 8);
        d8u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;
        d9u8 = vld1_u8(src);
        d10u8 = vld1_u8(src + 8);
        d11u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;
        d12u8 = vld1_u8(src);
        d13u8 = vld1_u8(src + 8);
        d14u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;

        __builtin_prefetch(src);
        __builtin_prefetch(src + src_pixels_per_line);
        __builtin_prefetch(src + src_pixels_per_line * 2);

        q8u16 = vmull_u8(d6u8, d0u8);
        q9u16 = vmull_u8(d7u8, d0u8);
        q10u16 = vmull_u8(d9u8, d0u8);
        q11u16 = vmull_u8(d10u8, d0u8);
        q12u16 = vmull_u8(d12u8, d0u8);
        q13u16 = vmull_u8(d13u8, d0u8);

        d28u8 = vext_u8(d6u8, d7u8, 1);
        d29u8 = vext_u8(d9u8, d10u8, 1);
        d30u8 = vext_u8(d12u8, d13u8, 1);
        q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
        q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
        q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
        d28u8 = vext_u8(d7u8, d8u8, 1);
        d29u8 = vext_u8(d10u8, d11u8, 1);
        d30u8 = vext_u8(d13u8, d14u8, 1);
        q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
        q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
        q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);

        d28u8 = vext_u8(d6u8, d7u8, 4);
        d29u8 = vext_u8(d9u8, d10u8, 4);
        d30u8 = vext_u8(d12u8, d13u8, 4);
        q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
        q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
        q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
        d28u8 = vext_u8(d7u8, d8u8, 4);
        d29u8 = vext_u8(d10u8, d11u8, 4);
        d30u8 = vext_u8(d13u8, d14u8, 4);
        q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
        q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
        q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);

        d28u8 = vext_u8(d6u8, d7u8, 5);
        d29u8 = vext_u8(d9u8, d10u8, 5);
        d30u8 = vext_u8(d12u8, d13u8, 5);
        q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
        q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
        q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
        d28u8 = vext_u8(d7u8, d8u8, 5);
        d29u8 = vext_u8(d10u8, d11u8, 5);
        d30u8 = vext_u8(d13u8, d14u8, 5);
        q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
        q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
        q13u16 = vmlal_u8(q13u16, d30u8, d5u8);

        d28u8 = vext_u8(d6u8, d7u8, 2);
        d29u8 = vext_u8(d9u8, d10u8, 2);
        d30u8 = vext_u8(d12u8, d13u8, 2);
        q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
        q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
        q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
        d28u8 = vext_u8(d7u8, d8u8, 2);
        d29u8 = vext_u8(d10u8, d11u8, 2);
        d30u8 = vext_u8(d13u8, d14u8, 2);
        q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
        q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
        q13u16 = vmlal_u8(q13u16, d30u8, d2u8);

        // Centre tap (offset 3).  Note d6u8 is reused as a scratch
        // register here -- it is reloaded at the top of the loop.
        d28u8 = vext_u8(d6u8, d7u8, 3);
        d29u8 = vext_u8(d9u8, d10u8, 3);
        d30u8 = vext_u8(d12u8, d13u8, 3);
        d15u8 = vext_u8(d7u8, d8u8, 3);
        d31u8 = vext_u8(d10u8, d11u8, 3);
        d6u8 = vext_u8(d13u8, d14u8, 3);
        q4u16 = vmull_u8(d28u8, d3u8);
        q5u16 = vmull_u8(d29u8, d3u8);
        q6u16 = vmull_u8(d30u8, d3u8);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);
        q12s16 = vreinterpretq_s16_u16(q12u16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q10s16 = vqaddq_s16(q10s16, q5s16);
        q12s16 = vqaddq_s16(q12s16, q6s16);

        q6u16 = vmull_u8(d15u8, d3u8);
        q7u16 = vmull_u8(d31u8, d3u8);
        q3u16 = vmull_u8(d6u8, d3u8);
        q3s16 = vreinterpretq_s16_u16(q3u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q11s16 = vreinterpretq_s16_u16(q11u16);
        q13s16 = vreinterpretq_s16_u16(q13u16);
        q9s16 = vqaddq_s16(q9s16, q6s16);
        q11s16 = vqaddq_s16(q11s16, q7s16);
        q13s16 = vqaddq_s16(q13s16, q3s16);

        d6u8 = vqrshrun_n_s16(q8s16, 7);
        d7u8 = vqrshrun_n_s16(q9s16, 7);
        d8u8 = vqrshrun_n_s16(q10s16, 7);
        d9u8 = vqrshrun_n_s16(q11s16, 7);
        d10u8 = vqrshrun_n_s16(q12s16, 7);
        d11u8 = vqrshrun_n_s16(q13s16, 7);

        // Store 3 intermediate rows of 16 (as 6 half-rows of 8).
        vst1_u8(tmpp, d6u8);
        tmpp += 8;
        vst1_u8(tmpp, d7u8);
        tmpp += 8;
        vst1_u8(tmpp, d8u8);
        tmpp += 8;
        vst1_u8(tmpp, d9u8);
        tmpp += 8;
        vst1_u8(tmpp, d10u8);
        tmpp += 8;
        vst1_u8(tmpp, d11u8);
        tmpp += 8;
    }

    // Second pass: 16x16 vertical filter over tmp[], two 8-wide columns.
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    for (i = 0; i < 2; i++) {
        dst = dst_ptr + 8 * i;
        tmpp = tmp + 8 * i;  // tmp row stride is 16
        // Prime the 5-row window.
        d18u8 = vld1_u8(tmpp);
        tmpp += 16;
        d19u8 = vld1_u8(tmpp);
        tmpp += 16;
        d20u8 = vld1_u8(tmpp);
        tmpp += 16;
        d21u8 = vld1_u8(tmpp);
        tmpp += 16;
        d22u8 = vld1_u8(tmpp);
        tmpp += 16;
        // Four iterations of four output rows = 16 rows.
        for (j = 0; j < 4; j++) {
            d23u8 = vld1_u8(tmpp);
            tmpp += 16;
            d24u8 = vld1_u8(tmpp);
            tmpp += 16;
            d25u8 = vld1_u8(tmpp);
            tmpp += 16;
            d26u8 = vld1_u8(tmpp);
            tmpp += 16;

            q3u16 = vmull_u8(d18u8, d0u8);
            q4u16 = vmull_u8(d19u8, d0u8);
            q5u16 = vmull_u8(d20u8, d0u8);
            q6u16 = vmull_u8(d21u8, d0u8);

            q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
            q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
            q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
            q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

            q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
            q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
            q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
            q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

            q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
            q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
            q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
            q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

            q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
            q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
            q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
            q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

            q7u16 = vmull_u8(d21u8, d3u8);
            q8u16 = vmull_u8(d22u8, d3u8);
            q9u16 = vmull_u8(d23u8, d3u8);
            q10u16 = vmull_u8(d24u8, d3u8);

            q3s16 = vreinterpretq_s16_u16(q3u16);
            q4s16 = vreinterpretq_s16_u16(q4u16);
            q5s16 = vreinterpretq_s16_u16(q5u16);
            q6s16 = vreinterpretq_s16_u16(q6u16);
            q7s16 = vreinterpretq_s16_u16(q7u16);
            q8s16 = vreinterpretq_s16_u16(q8u16);
            q9s16 = vreinterpretq_s16_u16(q9u16);
            q10s16 = vreinterpretq_s16_u16(q10u16);

            q7s16 = vqaddq_s16(q7s16, q3s16);
            q8s16 = vqaddq_s16(q8s16, q4s16);
            q9s16 = vqaddq_s16(q9s16, q5s16);
            q10s16 = vqaddq_s16(q10s16, q6s16);

            d6u8 = vqrshrun_n_s16(q7s16, 7);
            d7u8 = vqrshrun_n_s16(q8s16, 7);
            d8u8 = vqrshrun_n_s16(q9s16, 7);
            d9u8 = vqrshrun_n_s16(q10s16, 7);

            // Slide the 5-row window down by four.
            d18u8 = d22u8;
            d19u8 = d23u8;
            d20u8 = d24u8;
            d21u8 = d25u8;
            d22u8 = d26u8;

            vst1_u8(dst, d6u8);
            dst += dst_pitch;
            vst1_u8(dst, d7u8);
            dst += dst_pitch;
            vst1_u8(dst, d8u8);
            dst += dst_pitch;
            vst1_u8(dst, d9u8);
            dst += dst_pitch;
        }
    }
    return;
}