/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/loopfilter_msa.h"
#include "vpx_ports/mem.h"

/* First stage of the 16-wide horizontal loop filter: applies the 4-tap
 * filter to all 16 columns and, where the "flat" mask is set, computes the
 * 8-tap (filter8) results as well.
 *
 * Returns 1 (early exit) when no column is flat — the 4-tap results have
 * already been stored to src and no further filtering is needed.
 * Returns 0 otherwise, after saving the six filter8 output rows
 * (p2..q2, 16 bytes each) followed by the flat mask into filter48
 * (7 * 16 bytes total) for the second stage to consume.
 */
static int32_t hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
                                    uint8_t *filter48,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements: four rows above and four rows below the edge */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* broadcast the scalar thresholds to all 16 lanes */
  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    /* no flat columns anywhere: the 4-tap result is final */
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);

    return 1;
  } else {
    /* widen the low 8 columns to 16 bits and run filter8 on them */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* widen the high 8 columns and run filter8 on them as well */
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values: per lane, pick filter8 where flat, else filter4 */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* stash p2..q2 plus the flat mask for the t16 stage */
    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

/* Second stage of the 16-wide horizontal loop filter: applies the wide
 * (15-tap) filter where flat2 is set, blending against the filter8 rows
 * saved in filter48 by hz_lpf_t4_and_t8_16w. Each output row is produced
 * from a running sum (tmp1_r/tmp1_l) updated with a sliding-window
 * delta (tmp0_r/tmp0_l), then rounded with srari by 4.
 */
static void hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  /* flat mask saved by hz_lpf_t4_and_t8_16w at offset 6 * 16 */
  flat = LD_UB(filter48 + 96);

  LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    /* no wide-flat columns: just write back the saved filter8 rows */
    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    src -= 3 * pitch;
    ST_UB4(p2, p1, p0, q0, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1, q2, src, pitch);
  } else {
    src -= 7 * pitch;

    /* widen low halves of p7..p0 to 16 bits */
    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);

    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    /* p6: seed the running sum = 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0
     * + q0 (low half) */
    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    /* widen high halves of p7..p0 to 16 bits */
    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    /* same seed for the high half */
    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += pitch;

    /* p5: slide the window — add the new sample, drop the oldest */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += pitch;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4);

    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += pitch;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += pitch;

    /* p2: from here on, blend against the saved filter8 row instead of the
     * raw pixel (filter48 rows are p2, p1, p0, q0, q1, q2 at 16-byte steps) */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q3: remaining rows blend against the raw pixels again */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += pitch;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += pitch;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += pitch;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);
  }
}

/* 16-wide horizontal edge filter: runs the two-stage pipeline above.
 * The count argument is unused — this path always processes 16 columns. */
static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
                                        const uint8_t *b_limit_ptr,
                                        const uint8_t *limit_ptr,
                                        const uint8_t *thresh_ptr,
                                        int32_t count) {
  /* scratch buffer shared between the two stages: 6 filter8 rows + flat */
  DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
  uint8_t early_exit = 0;

  (void)count;

  early_exit = hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
                                    limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    hz_lpf_t16_16w(src, pitch, filter48);
  }
}

/* Horizontal edge filter dispatcher: count == 1 takes a single-macroblock
 * (8-pixel-wide) path using 64-bit stores; any other count delegates to the
 * 16-wide dual implementation. */
static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr, int32_t count) {
  if (1 == count) {
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    uint64_t dword0, dword1;
    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 p0_filter16, p1_filter16;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };
    v8u16 tmp0, tmp1, tmp2;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3,
p2, p1, p0, q0, q1, q2, q3);

    /* broadcast the scalar thresholds to all lanes */
    thresh = (v16u8)__msa_fill_b(*thresh_ptr);
    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    limit = (v16u8)__msa_fill_b(*limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
                 mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* keep only the low 8 lanes of the flat mask for the 8-pixel path */
    flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

    if (__msa_test_bz_v(flat)) {
      /* no flat pixels: the 4-tap result is final, store 8 bytes per row */
      p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
      p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
      q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
      q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
      SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
    } else {
      /* convert 8 bit input data into 16 bit */
      ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                 q3_r);
      VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

      /* convert 16 bit output data into 8 bit */
      PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
                  q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
      PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

      /* store pixel values: per lane, pick filter8 where flat, else filter4 */
      p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
      p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
      p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
      q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
      q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
      q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

      /* load 16 vector elements */
      LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
      LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);

      VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

      if (__msa_test_bz_v(flat2)) {
        /* no wide-flat pixels: the filter8 blend is final */
        p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
        p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
        q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

        SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
        SD(q1_d, src + pitch);
        SD(q2_d, src + 2 * pitch);
      } else {
        /* LSB(right) 8 pixel operation */
        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
                   zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
                   q7_r);

        /* sliding-window seed: 7*p7 + p6 + q0 */
        tmp0 = p7_r << 3;
        tmp0 -= p7_r;
        tmp0 += p6_r;
        tmp0 += q0_r;

        src -= 7 * pitch;

        /* calculation of p6 and p5 */
        tmp1 = p6_r + p5_r + p4_r + p3_r;
        tmp1 += (p2_r + p1_r + p0_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp0 = p5_r - p6_r + q1_r - p7_r;
        tmp1 += tmp0;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p4 and p3 */
        tmp0 = p4_r - p5_r + q2_r - p7_r;
        tmp2 = p3_r - p4_r + q3_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p2 and p1 — blend against the filter8 results */
        tmp0 = p2_r - p3_r + q4_r - p7_r;
        tmp2 = p1_r - p2_r + q5_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p0 and q0 */
        tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
        tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q1 and q2 */
        tmp0 = q7_r - q0_r + q1_r - p6_r;
        tmp2 = q7_r - q1_r + q2_r - p5_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q3 and q4 */
        tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
        tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q5 and q6 */
        tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
        tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
      }
    }
  } else {
    mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
                                count);
  }
}

/* Public entry: 16-tap horizontal loop filter, one macroblock wide. */
void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
                               const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
}

/* Public entry: 16-tap horizontal loop filter, two macroblocks wide. */
void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
}

/* Transpose a 16(w) x 8(h) pixel block into an 8(w) x 16(h) block so the
 * vertical-edge filters can reuse the horizontal filtering kernels. */
static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
uint8_t *output, int32_t out_pitch) {
  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
         p1_org, p0_org);
  /* 8x8 transpose of the left half */
  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
                     p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
  /* 8x8 transpose of the right half via byte/word interleaves */
  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
             tmp0, tmp1, tmp2, tmp3);
  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
  /* odd output rows are the upper 8 bytes of the even ones */
  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}

/* Transpose an 8(w) x 16(h) pixel block back into a 16(w) x 8(h) block —
 * the inverse of transpose_16x8_to_8x16 for writing results out. */
static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
                      q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
}

/* Transpose a full 16x16 pixel block. The left 8 output columns come from
 * TRANSPOSE16x8_UB_UB; the right 8 are built in-place with interleave
 * intrinsics. */
static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
                            int32_t out_pitch) {
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
  v4i32 tmp2, tmp3;

  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  input += (8 * in_pitch);
  LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);

  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p7, p6,
                      p5, p4, p3, p2, p1, p0);

  /* transpose 16x8 matrix into 8x16 */
  /* total 8 intermediate register and 32 instructions */
  q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
  q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
  q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
  q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
  q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
  q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
  q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
  q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);

  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
  tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
  tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);

  /* q5/q7 are reused as scratch here before being overwritten below */
  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
  tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
  tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);

  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
  q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
  tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
  q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
  q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
  tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
  q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}

/* Vertical-edge counterpart of hz_lpf_t4_and_t8_16w for 8 rows, operating
 * on a pre-transposed buffer with a fixed pitch of 16. On the early-exit
 * path the 4-tap results are re-transposed and stored straight into the
 * original frame via src_org/pitch_org. Returns 1 on early exit, otherwise
 * saves the filter8 rows plus flat mask into filter48 and returns 0. */
static int32_t vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
                                   uint8_t *src_org, int32_t pitch_org,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3;

  /* load vector elements (transposed buffer, pitch is 16) */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  /* keep only the low 8 lanes of the flat mask for the 8-row path */
  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    /* early exit: re-transpose the 4 filtered columns and store directly */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* convert 16 bit output data into 8 bit */
    p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
    p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
    p0_r =
(v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r); 803 q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r); 804 q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r); 805 q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r); 806 807 /* store pixel values */ 808 p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat); 809 p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat); 810 p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat); 811 q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat); 812 q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat); 813 q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat); 814 815 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); 816 filter48 += (4 * 16); 817 ST_UB2(q1_out, q2_out, filter48, 16); 818 filter48 += (2 * 16); 819 ST_UB(flat, filter48); 820 821 return 0; 822 } 823} 824 825static int32_t vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, 826 uint8_t *filter48) { 827 v16i8 zero = { 0 }; 828 v16u8 filter8, flat, flat2; 829 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 830 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; 831 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; 832 v8u16 tmp0_r, tmp1_r; 833 v8i16 r_out; 834 835 flat = LD_UB(filter48 + 6 * 16); 836 837 LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); 838 LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); 839 840 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); 841 842 if (__msa_test_bz_v(flat2)) { 843 v8i16 vec0, vec1, vec2, vec3, vec4; 844 845 LD_UB4(filter48, 16, p2, p1, p0, q0); 846 LD_UB2(filter48 + 4 * 16, 16, q1, q2); 847 848 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); 849 ILVRL_H2_SH(vec1, vec0, vec3, vec4); 850 vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); 851 852 src_org -= 3; 853 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); 854 ST2x4_UB(vec2, 0, (src_org + 4), pitch); 855 src_org += (4 * pitch); 856 ST4x4_UB(vec4, vec4, 0, 
1, 2, 3, src_org, pitch); 857 ST2x4_UB(vec2, 4, (src_org + 4), pitch); 858 859 return 1; 860 } else { 861 src -= 7 * 16; 862 863 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, 864 p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, 865 p2_r_in, p1_r_in, p0_r_in); 866 q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); 867 868 tmp0_r = p7_r_in << 3; 869 tmp0_r -= p7_r_in; 870 tmp0_r += p6_r_in; 871 tmp0_r += q0_r_in; 872 tmp1_r = p6_r_in + p5_r_in; 873 tmp1_r += p4_r_in; 874 tmp1_r += p3_r_in; 875 tmp1_r += p2_r_in; 876 tmp1_r += p1_r_in; 877 tmp1_r += p0_r_in; 878 tmp1_r += tmp0_r; 879 880 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 881 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); 882 p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); 883 ST8x1_UB(p6, src); 884 src += 16; 885 886 /* p5 */ 887 q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); 888 tmp0_r = p5_r_in - p6_r_in; 889 tmp0_r += q1_r_in; 890 tmp0_r -= p7_r_in; 891 tmp1_r += tmp0_r; 892 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 893 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); 894 p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); 895 ST8x1_UB(p5, src); 896 src += 16; 897 898 /* p4 */ 899 q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); 900 tmp0_r = p4_r_in - p5_r_in; 901 tmp0_r += q2_r_in; 902 tmp0_r -= p7_r_in; 903 tmp1_r += tmp0_r; 904 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 905 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); 906 p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); 907 ST8x1_UB(p4, src); 908 src += 16; 909 910 /* p3 */ 911 q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); 912 tmp0_r = p3_r_in - p4_r_in; 913 tmp0_r += q3_r_in; 914 tmp0_r -= p7_r_in; 915 tmp1_r += tmp0_r; 916 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 917 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); 918 p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); 919 ST8x1_UB(p3, src); 920 src += 16; 921 922 /* p2 */ 923 q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); 924 filter8 = 
LD_UB(filter48); 925 tmp0_r = p2_r_in - p3_r_in; 926 tmp0_r += q4_r_in; 927 tmp0_r -= p7_r_in; 928 tmp1_r += tmp0_r; 929 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 930 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); 931 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); 932 ST8x1_UB(filter8, src); 933 src += 16; 934 935 /* p1 */ 936 q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); 937 filter8 = LD_UB(filter48 + 16); 938 tmp0_r = p1_r_in - p2_r_in; 939 tmp0_r += q5_r_in; 940 tmp0_r -= p7_r_in; 941 tmp1_r += tmp0_r; 942 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 943 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); 944 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); 945 ST8x1_UB(filter8, src); 946 src += 16; 947 948 /* p0 */ 949 q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); 950 filter8 = LD_UB(filter48 + 32); 951 tmp0_r = p0_r_in - p1_r_in; 952 tmp0_r += q6_r_in; 953 tmp0_r -= p7_r_in; 954 tmp1_r += tmp0_r; 955 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 956 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); 957 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); 958 ST8x1_UB(filter8, src); 959 src += 16; 960 961 /* q0 */ 962 q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); 963 filter8 = LD_UB(filter48 + 48); 964 tmp0_r = q7_r_in - p0_r_in; 965 tmp0_r += q0_r_in; 966 tmp0_r -= p7_r_in; 967 tmp1_r += tmp0_r; 968 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 969 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); 970 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); 971 ST8x1_UB(filter8, src); 972 src += 16; 973 974 /* q1 */ 975 filter8 = LD_UB(filter48 + 64); 976 tmp0_r = q7_r_in - q0_r_in; 977 tmp0_r += q1_r_in; 978 tmp0_r -= p6_r_in; 979 tmp1_r += tmp0_r; 980 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 981 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); 982 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); 983 ST8x1_UB(filter8, src); 984 src += 16; 985 986 /* q2 */ 987 filter8 = LD_UB(filter48 + 80); 988 tmp0_r = q7_r_in 
- q1_r_in; 989 tmp0_r += q2_r_in; 990 tmp0_r -= p5_r_in; 991 tmp1_r += tmp0_r; 992 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 993 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); 994 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); 995 ST8x1_UB(filter8, src); 996 src += 16; 997 998 /* q3 */ 999 tmp0_r = q7_r_in - q2_r_in; 1000 tmp0_r += q3_r_in; 1001 tmp0_r -= p4_r_in; 1002 tmp1_r += tmp0_r; 1003 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 1004 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); 1005 q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); 1006 ST8x1_UB(q3, src); 1007 src += 16; 1008 1009 /* q4 */ 1010 tmp0_r = q7_r_in - q3_r_in; 1011 tmp0_r += q4_r_in; 1012 tmp0_r -= p3_r_in; 1013 tmp1_r += tmp0_r; 1014 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 1015 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); 1016 q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); 1017 ST8x1_UB(q4, src); 1018 src += 16; 1019 1020 /* q5 */ 1021 tmp0_r = q7_r_in - q4_r_in; 1022 tmp0_r += q5_r_in; 1023 tmp0_r -= p2_r_in; 1024 tmp1_r += tmp0_r; 1025 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 1026 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); 1027 q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); 1028 ST8x1_UB(q5, src); 1029 src += 16; 1030 1031 /* q6 */ 1032 tmp0_r = q7_r_in - q5_r_in; 1033 tmp0_r += q6_r_in; 1034 tmp0_r -= p1_r_in; 1035 tmp1_r += tmp0_r; 1036 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 1037 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); 1038 q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); 1039 ST8x1_UB(q6, src); 1040 1041 return 0; 1042 } 1043} 1044 1045void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, 1046 const uint8_t *b_limit_ptr, 1047 const uint8_t *limit_ptr, 1048 const uint8_t *thresh_ptr) { 1049 uint8_t early_exit = 0; 1050 DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); 1051 uint8_t *filter48 = &transposed_input[16 * 16]; 1052 1053 transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); 1054 1055 early_exit = 1056 
vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, pitch, 1057 b_limit_ptr, limit_ptr, thresh_ptr); 1058 1059 if (0 == early_exit) { 1060 early_exit = 1061 vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, &filter48[0]); 1062 1063 if (0 == early_exit) { 1064 transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); 1065 } 1066 } 1067} 1068 1069static int32_t vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, 1070 uint8_t *src_org, int32_t pitch, 1071 const uint8_t *b_limit_ptr, 1072 const uint8_t *limit_ptr, 1073 const uint8_t *thresh_ptr) { 1074 v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 1075 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; 1076 v16u8 flat, mask, hev, thresh, b_limit, limit; 1077 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; 1078 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; 1079 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; 1080 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; 1081 v16i8 zero = { 0 }; 1082 v8i16 vec0, vec1, vec2, vec3, vec4, vec5; 1083 1084 /* load vector elements */ 1085 LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); 1086 1087 thresh = (v16u8)__msa_fill_b(*thresh_ptr); 1088 b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); 1089 limit = (v16u8)__msa_fill_b(*limit_ptr); 1090 1091 /* mask and hev */ 1092 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, 1093 mask, flat); 1094 /* flat4 */ 1095 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 1096 /* filter4 */ 1097 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); 1098 1099 if (__msa_test_bz_v(flat)) { 1100 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); 1101 ILVRL_H2_SH(vec1, vec0, vec2, vec3); 1102 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); 1103 ILVRL_H2_SH(vec1, vec0, vec4, vec5); 1104 1105 src_org -= 2; 1106 ST4x8_UB(vec2, vec3, src_org, pitch); 1107 src_org += 8 * pitch; 1108 ST4x8_UB(vec4, vec5, 
src_org, pitch); 1109 1110 return 1; 1111 } else { 1112 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, 1113 q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); 1114 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, 1115 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); 1116 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); 1117 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); 1118 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, 1119 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); 1120 1121 /* convert 16 bit output data into 8 bit */ 1122 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, 1123 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, 1124 p0_filt8_r, q0_filt8_r); 1125 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, 1126 q2_filt8_r); 1127 1128 /* store pixel values */ 1129 p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); 1130 p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); 1131 p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); 1132 q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); 1133 q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); 1134 q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); 1135 1136 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); 1137 filter48 += (4 * 16); 1138 ST_UB2(q1_out, q2_out, filter48, 16); 1139 filter48 += (2 * 16); 1140 ST_UB(flat, filter48); 1141 1142 return 0; 1143 } 1144} 1145 1146static int32_t vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, 1147 uint8_t *filter48) { 1148 v16u8 flat, flat2, filter8; 1149 v16i8 zero = { 0 }; 1150 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 1151 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; 1152 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; 
1153 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; 1154 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; 1155 v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; 1156 v8i16 l_out, r_out; 1157 1158 flat = LD_UB(filter48 + 6 * 16); 1159 1160 LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); 1161 LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); 1162 1163 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); 1164 1165 if (__msa_test_bz_v(flat2)) { 1166 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1167 1168 LD_UB4(filter48, 16, p2, p1, p0, q0); 1169 LD_UB2(filter48 + 4 * 16, 16, q1, q2); 1170 1171 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); 1172 ILVRL_H2_SH(vec1, vec0, vec3, vec4); 1173 ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); 1174 ILVRL_H2_SH(vec1, vec0, vec6, vec7); 1175 ILVRL_B2_SH(q2, q1, vec2, vec5); 1176 1177 src_org -= 3; 1178 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); 1179 ST2x4_UB(vec2, 0, (src_org + 4), pitch); 1180 src_org += (4 * pitch); 1181 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); 1182 ST2x4_UB(vec2, 4, (src_org + 4), pitch); 1183 src_org += (4 * pitch); 1184 ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch); 1185 ST2x4_UB(vec5, 0, (src_org + 4), pitch); 1186 src_org += (4 * pitch); 1187 ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch); 1188 ST2x4_UB(vec5, 4, (src_org + 4), pitch); 1189 1190 return 1; 1191 } else { 1192 src -= 7 * 16; 1193 1194 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, 1195 p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, 1196 p2_r_in, p1_r_in, p0_r_in); 1197 q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); 1198 1199 tmp0_r = p7_r_in << 3; 1200 tmp0_r -= p7_r_in; 1201 tmp0_r += p6_r_in; 1202 tmp0_r += q0_r_in; 1203 tmp1_r = p6_r_in + p5_r_in; 1204 tmp1_r += p4_r_in; 1205 tmp1_r += p3_r_in; 1206 tmp1_r += p2_r_in; 1207 tmp1_r += p1_r_in; 1208 tmp1_r += p0_r_in; 1209 tmp1_r += tmp0_r; 1210 r_out = 
__msa_srari_h((v8i16)tmp1_r, 4); 1211 1212 ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, 1213 p5_l_in, p4_l_in); 1214 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, 1215 p1_l_in, p0_l_in); 1216 q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); 1217 1218 tmp0_l = p7_l_in << 3; 1219 tmp0_l -= p7_l_in; 1220 tmp0_l += p6_l_in; 1221 tmp0_l += q0_l_in; 1222 tmp1_l = p6_l_in + p5_l_in; 1223 tmp1_l += p4_l_in; 1224 tmp1_l += p3_l_in; 1225 tmp1_l += p2_l_in; 1226 tmp1_l += p1_l_in; 1227 tmp1_l += p0_l_in; 1228 tmp1_l += tmp0_l; 1229 l_out = __msa_srari_h((v8i16)tmp1_l, 4); 1230 1231 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); 1232 p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); 1233 ST_UB(p6, src); 1234 src += 16; 1235 1236 /* p5 */ 1237 q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); 1238 tmp0_r = p5_r_in - p6_r_in; 1239 tmp0_r += q1_r_in; 1240 tmp0_r -= p7_r_in; 1241 tmp1_r += tmp0_r; 1242 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 1243 q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); 1244 tmp0_l = p5_l_in - p6_l_in; 1245 tmp0_l += q1_l_in; 1246 tmp0_l -= p7_l_in; 1247 tmp1_l += tmp0_l; 1248 l_out = __msa_srari_h((v8i16)tmp1_l, 4); 1249 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); 1250 p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); 1251 ST_UB(p5, src); 1252 src += 16; 1253 1254 /* p4 */ 1255 q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); 1256 tmp0_r = p4_r_in - p5_r_in; 1257 tmp0_r += q2_r_in; 1258 tmp0_r -= p7_r_in; 1259 tmp1_r += tmp0_r; 1260 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 1261 q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); 1262 tmp0_l = p4_l_in - p5_l_in; 1263 tmp0_l += q2_l_in; 1264 tmp0_l -= p7_l_in; 1265 tmp1_l += tmp0_l; 1266 l_out = __msa_srari_h((v8i16)tmp1_l, 4); 1267 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); 1268 p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); 1269 ST_UB(p4, src); 1270 src += 16; 1271 1272 /* p3 */ 1273 q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); 1274 tmp0_r 
= p3_r_in - p4_r_in; 1275 tmp0_r += q3_r_in; 1276 tmp0_r -= p7_r_in; 1277 tmp1_r += tmp0_r; 1278 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 1279 q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); 1280 tmp0_l = p3_l_in - p4_l_in; 1281 tmp0_l += q3_l_in; 1282 tmp0_l -= p7_l_in; 1283 tmp1_l += tmp0_l; 1284 l_out = __msa_srari_h((v8i16)tmp1_l, 4); 1285 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); 1286 p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); 1287 ST_UB(p3, src); 1288 src += 16; 1289 1290 /* p2 */ 1291 q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); 1292 filter8 = LD_UB(filter48); 1293 tmp0_r = p2_r_in - p3_r_in; 1294 tmp0_r += q4_r_in; 1295 tmp0_r -= p7_r_in; 1296 tmp1_r += tmp0_r; 1297 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 1298 q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); 1299 tmp0_l = p2_l_in - p3_l_in; 1300 tmp0_l += q4_l_in; 1301 tmp0_l -= p7_l_in; 1302 tmp1_l += tmp0_l; 1303 l_out = __msa_srari_h((v8i16)tmp1_l, 4); 1304 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); 1305 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); 1306 ST_UB(filter8, src); 1307 src += 16; 1308 1309 /* p1 */ 1310 q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); 1311 filter8 = LD_UB(filter48 + 16); 1312 tmp0_r = p1_r_in - p2_r_in; 1313 tmp0_r += q5_r_in; 1314 tmp0_r -= p7_r_in; 1315 tmp1_r += tmp0_r; 1316 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 1317 q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); 1318 tmp0_l = p1_l_in - p2_l_in; 1319 tmp0_l += q5_l_in; 1320 tmp0_l -= p7_l_in; 1321 tmp1_l += tmp0_l; 1322 l_out = __msa_srari_h((v8i16)(tmp1_l), 4); 1323 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); 1324 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); 1325 ST_UB(filter8, src); 1326 src += 16; 1327 1328 /* p0 */ 1329 q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); 1330 filter8 = LD_UB(filter48 + 32); 1331 tmp0_r = p0_r_in - p1_r_in; 1332 tmp0_r += q6_r_in; 1333 tmp0_r -= p7_r_in; 1334 tmp1_r += tmp0_r; 1335 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 
1336 q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); 1337 tmp0_l = p0_l_in - p1_l_in; 1338 tmp0_l += q6_l_in; 1339 tmp0_l -= p7_l_in; 1340 tmp1_l += tmp0_l; 1341 l_out = __msa_srari_h((v8i16)tmp1_l, 4); 1342 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); 1343 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); 1344 ST_UB(filter8, src); 1345 src += 16; 1346 1347 /* q0 */ 1348 q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); 1349 filter8 = LD_UB(filter48 + 48); 1350 tmp0_r = q7_r_in - p0_r_in; 1351 tmp0_r += q0_r_in; 1352 tmp0_r -= p7_r_in; 1353 tmp1_r += tmp0_r; 1354 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 1355 q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); 1356 tmp0_l = q7_l_in - p0_l_in; 1357 tmp0_l += q0_l_in; 1358 tmp0_l -= p7_l_in; 1359 tmp1_l += tmp0_l; 1360 l_out = __msa_srari_h((v8i16)tmp1_l, 4); 1361 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); 1362 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); 1363 ST_UB(filter8, src); 1364 src += 16; 1365 1366 /* q1 */ 1367 filter8 = LD_UB(filter48 + 64); 1368 tmp0_r = q7_r_in - q0_r_in; 1369 tmp0_r += q1_r_in; 1370 tmp0_r -= p6_r_in; 1371 tmp1_r += tmp0_r; 1372 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 1373 tmp0_l = q7_l_in - q0_l_in; 1374 tmp0_l += q1_l_in; 1375 tmp0_l -= p6_l_in; 1376 tmp1_l += tmp0_l; 1377 l_out = __msa_srari_h((v8i16)tmp1_l, 4); 1378 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); 1379 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); 1380 ST_UB(filter8, src); 1381 src += 16; 1382 1383 /* q2 */ 1384 filter8 = LD_UB(filter48 + 80); 1385 tmp0_r = q7_r_in - q1_r_in; 1386 tmp0_r += q2_r_in; 1387 tmp0_r -= p5_r_in; 1388 tmp1_r += tmp0_r; 1389 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 1390 tmp0_l = q7_l_in - q1_l_in; 1391 tmp0_l += q2_l_in; 1392 tmp0_l -= p5_l_in; 1393 tmp1_l += tmp0_l; 1394 l_out = __msa_srari_h((v8i16)tmp1_l, 4); 1395 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); 1396 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); 1397 
ST_UB(filter8, src); 1398 src += 16; 1399 1400 /* q3 */ 1401 tmp0_r = q7_r_in - q2_r_in; 1402 tmp0_r += q3_r_in; 1403 tmp0_r -= p4_r_in; 1404 tmp1_r += tmp0_r; 1405 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 1406 tmp0_l = q7_l_in - q2_l_in; 1407 tmp0_l += q3_l_in; 1408 tmp0_l -= p4_l_in; 1409 tmp1_l += tmp0_l; 1410 l_out = __msa_srari_h((v8i16)tmp1_l, 4); 1411 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); 1412 q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); 1413 ST_UB(q3, src); 1414 src += 16; 1415 1416 /* q4 */ 1417 tmp0_r = q7_r_in - q3_r_in; 1418 tmp0_r += q4_r_in; 1419 tmp0_r -= p3_r_in; 1420 tmp1_r += tmp0_r; 1421 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 1422 tmp0_l = q7_l_in - q3_l_in; 1423 tmp0_l += q4_l_in; 1424 tmp0_l -= p3_l_in; 1425 tmp1_l += tmp0_l; 1426 l_out = __msa_srari_h((v8i16)tmp1_l, 4); 1427 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); 1428 q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); 1429 ST_UB(q4, src); 1430 src += 16; 1431 1432 /* q5 */ 1433 tmp0_r = q7_r_in - q4_r_in; 1434 tmp0_r += q5_r_in; 1435 tmp0_r -= p2_r_in; 1436 tmp1_r += tmp0_r; 1437 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 1438 tmp0_l = q7_l_in - q4_l_in; 1439 tmp0_l += q5_l_in; 1440 tmp0_l -= p2_l_in; 1441 tmp1_l += tmp0_l; 1442 l_out = __msa_srari_h((v8i16)tmp1_l, 4); 1443 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); 1444 q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); 1445 ST_UB(q5, src); 1446 src += 16; 1447 1448 /* q6 */ 1449 tmp0_r = q7_r_in - q5_r_in; 1450 tmp0_r += q6_r_in; 1451 tmp0_r -= p1_r_in; 1452 tmp1_r += tmp0_r; 1453 r_out = __msa_srari_h((v8i16)tmp1_r, 4); 1454 tmp0_l = q7_l_in - q5_l_in; 1455 tmp0_l += q6_l_in; 1456 tmp0_l -= p1_l_in; 1457 tmp1_l += tmp0_l; 1458 l_out = __msa_srari_h((v8i16)tmp1_l, 4); 1459 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); 1460 q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); 1461 ST_UB(q6, src); 1462 1463 return 0; 1464 } 1465} 1466 1467void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch, 
1468 const uint8_t *b_limit_ptr, 1469 const uint8_t *limit_ptr, 1470 const uint8_t *thresh_ptr) { 1471 uint8_t early_exit = 0; 1472 DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); 1473 uint8_t *filter48 = &transposed_input[16 * 16]; 1474 1475 transpose_16x16((src - 8), pitch, &transposed_input[0], 16); 1476 1477 early_exit = 1478 vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, 1479 pitch, b_limit_ptr, limit_ptr, thresh_ptr); 1480 1481 if (0 == early_exit) { 1482 early_exit = 1483 vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]); 1484 1485 if (0 == early_exit) { 1486 transpose_16x16(transposed_input, 16, (src - 8), pitch); 1487 } 1488 } 1489} 1490