/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/loopfilter_msa.h"

/* VP9 size-8 loop filters, MIPS MSA SIMD versions.
 *
 * Common shape of all four functions below:
 *   1. Load the 8 pixels straddling the edge (p3..p0 | q0..q3) per column.
 *   2. Build the filter decision masks via the LPF_MASK_HEV / VP9_FLAT4
 *      macros (mask / hev / flat — see loopfilter_msa.h for the exact
 *      definitions; their semantics are assumed from the VP9 spec).
 *   3. Always compute the narrow 4-tap result (VP9_LPF_FILTER4_4W).
 *   4. If any column has `flat` set, also compute the wide 8-tap result
 *      (VP9_FILTER8) in 16-bit precision, pack back to 8 bits, and select
 *      per byte between the two results with __msa_bmnz_v(dst, src, mask):
 *      bytes where `flat` is set take the filter8 value, others keep the
 *      filter4 value.  p2/q2 are only modified by filter8, so they select
 *      between the *unfiltered* p2/q2 and the filter8 output.
 */

/* Filter one horizontal edge, 8 pixels wide (one 8x8 block edge).
 * src points at the first row below the edge (q0); rows are `pitch` apart.
 * b_limit/limit/thresh are single-byte thresholds splatted across a vector.
 */
void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr) {
  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
  v16u8 mask, hev, flat, thresh, b_limit, limit;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8;
  v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
  v16i8 zero = { 0 };

  /* load vector elements: 8 rows centered on the edge */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* splat the scalar thresholds across all 16 lanes */
  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* 4-tap result for all columns; used directly where flat == 0 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  /* only 8 columns are processed: zero the upper half of flat so the
     all-zero test below looks at just those 8 bytes */
  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    /* no column needs the wide filter: store the 4-tap output only
       (p1..q1, 8 bytes per row via 64-bit scalar stores) */
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
  } else {
    /* widen the 8 right-hand bytes of each row to 16 bits for filter8 */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
                q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
    PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

    /* store pixel values: per-byte select filter8 vs. filter4 (or the
       original pixel for p2/q2, which filter4 does not touch) */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

    p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

    src -= 3 * pitch;

    /* write back 6 rows: p2..q0, then q1 and q2 individually */
    SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
    src += (4 * pitch);
    SD(q1_d, src);
    src += pitch;
    SD(q2_d, src);
  }
}

/* Filter two adjacent horizontal 8-pixel edges (16 columns) in one pass.
 * The *0 thresholds apply to the left 8 columns, the *1 thresholds to the
 * right 8: each splatted threshold pair is interleaved into one vector
 * with __msa_ilvr_d so the per-lane compares use the correct limits.
 */
void vpx_lpf_horizontal_8_dual_msa(
    uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1,
    const uint8_t *thresh1) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* low 8 lanes take the *0 thresholds, high 8 lanes the *1 thresholds */
  thresh = (v16u8)__msa_fill_b(*thresh0);
  tmp = (v16u8)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  tmp = (v16u8)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  tmp = (v16u8)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    /* all 16 columns need only the 4-tap filter */
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
  } else {
    /* filter8 in 16-bit precision: right (low) 8 columns... */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* ...then left (high) 8 columns */
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values: per-byte select by flat */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    src -= 3 * pitch;

    ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1_out, q2_out, src, pitch);
    src += (2 * pitch);
  }
}

/* Filter one vertical edge, 8 rows tall.  src points at the first pixel
 * right of the edge (q0 column).  The 8x8 neighborhood is loaded row-wise
 * and transposed so each register holds one column (p3..q3), letting the
 * same horizontal-filter macros be reused; results are transposed back
 * via interleaves before the narrow column stores.
 */
void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
                            const uint8_t *b_limit_ptr,
                            const uint8_t *limit_ptr,
                            const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v16u8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4;

  /* load vector elements */
  LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* rows -> columns so p3..q3 each hold one pixel column */
  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
                     q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  /* only 8 rows processed: keep just the low 8 bytes of flat */
  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    /* Store 4 pixels p1-_q1: interleave back to row order, 4 bytes/row */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);

    src -= 2;
    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
    src += 4 * pitch;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
                p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values: per-byte select by flat */
    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* Store 6 pixels p2-_q2: 4 bytes (p2..q0) + 2 bytes (q1,q2) per row */
    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);

    src -= 3;
    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec4, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec4, 4, src + 4, pitch);
  }
}

/* Filter two stacked vertical 8-row edges (16 rows) with independent
 * thresholds: rows 0-7 use the *0 limits, rows 8-15 the *1 limits.
 * 16 rows are loaded and transposed (16x8 -> 8x16) so each of p3..q3
 * holds one 16-lane column; stores transpose back in 4-row groups.
 */
void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0, const uint8_t *limit0,
                                 const uint8_t *thresh0,
                                 const uint8_t *b_limit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *temp_src;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

  temp_src = src - 4;

  /* p0..p3/q0..q3 temporarily hold raw rows until the transpose below */
  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
  temp_src += (8 * pitch);
  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

  /* transpose 16x8 matrix into 8x16 */
  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
                      row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
                      q3);

  /* low 8 lanes = *0 thresholds, high 8 lanes = *1 thresholds
     (vec0 reused as a scratch splat register here) */
  thresh = (v16u8)__msa_fill_b(*thresh0);
  vec0 = (v8i16)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  vec0 = (v8i16)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  vec0 = (v8i16)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    /* 4-tap only: transpose p1..q1 back and store 4 bytes per row */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src -= 2;
    ST4x8_UB(vec2, vec3, src, pitch);
    src += 8 * pitch;
    ST4x8_UB(vec4, vec5, src, pitch);
  } else {
    /* filter8 in 16-bit precision: low 8 lanes... */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* ...and high 8 lanes */
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);

    /* filter8 */
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values: per-byte select by flat */
    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* transpose the 6 result columns back to row order */
    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    /* per row: 4 bytes (p2..q0) then 2 bytes (q1,q2), 4 rows at a time */
    src -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 4, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 4, src + 4, pitch);
  }
}