/*
 * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>

#include "./macros_msa.h"

/* Table of pseudo-random rounding offsets used by vpx_mbpost_proc_down_msa. */
extern const int16_t vpx_rv[];

/* Transposes an 8x16 byte block (eight 16-byte rows in0..in7) into sixteen
 * vectors whose low 8 bytes hold the transposed columns. */
#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                                out1, out2, out3, out4, out5, out6, out7,     \
                                out8, out9, out10, out11, out12, out13,       \
                                out14, out15)                                 \
  {                                                                           \
    v8i16 temp0, temp1, temp2, temp3, temp4;                                  \
    v8i16 temp5, temp6, temp7, temp8, temp9;                                  \
                                                                              \
    ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2,   \
               temp3);                                                        \
    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                     \
    ILVRL_W2_SH(temp5, temp4, temp6, temp7);                                  \
    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                     \
    ILVRL_W2_SH(temp5, temp4, temp8, temp9);                                  \
    ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2,   \
               temp3);                                                        \
    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                     \
    ILVRL_W2_UB(temp5, temp4, out8, out10);                                   \
    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                     \
    ILVRL_W2_UB(temp5, temp4, out12, out14);                                  \
    out0 = (v16u8)temp6;                                                      \
    out2 = (v16u8)temp7;                                                      \
    out4 = (v16u8)temp8;                                                      \
    out6 = (v16u8)temp9;                                                      \
    out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8);                     \
    out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10);                  \
    out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12);                  \
    out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14);                  \
    out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0);                     \
    out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2);                     \
    out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4);                     \
    out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6);                     \
  }

/* 5-tap down/across kernel: 'out' gets the rounded average of the source
 * and its four neighbors, but only where each neighbor differs from the
 * source by less than the per-pixel threshold 'ref'; elsewhere the source
 * pixel is retained unchanged. */
#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, below1_in,           \
                           below2_in, ref, out)                               \
  {                                                                           \
    v16u8 temp0, temp1;                                                       \
                                                                              \
    temp1 = __msa_aver_u_b(above2_in, above1_in);                             \
    temp0 = __msa_aver_u_b(below2_in, below1_in);                             \
    temp1 = __msa_aver_u_b(temp1, temp0);                                     \
    out = __msa_aver_u_b(src_in, temp1);                                      \
    temp0 = __msa_asub_u_b(src_in, above2_in);                                \
    temp1 = __msa_asub_u_b(src_in, above1_in);                                \
    temp0 = (temp0 < ref);                                                    \
    temp1 = (temp1 < ref);                                                    \
    temp0 = temp0 & temp1;                                                    \
    temp1 = __msa_asub_u_b(src_in, below1_in);                                \
    temp1 = (temp1 < ref);                                                    \
    temp0 = temp0 & temp1;                                                    \
    temp1 = __msa_asub_u_b(src_in, below2_in);                                \
    temp1 = (temp1 < ref);                                                    \
    temp0 = temp0 & temp1;                                                    \
    out = __msa_bmz_v(out, src_in, temp0);                                    \
  }
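
/* For reference, a scalar model of the kernel above (a sketch, not part of
 * the build; aver() mirrors the rounding of __msa_aver_u_b):
 *
 *   static uint8_t aver(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; }
 *
 *   static uint8_t filter_pixel(uint8_t a2, uint8_t a1, uint8_t s,
 *                               uint8_t b1, uint8_t b2, uint8_t ref) {
 *     const uint8_t avg = aver(s, aver(aver(a2, a1), aver(b2, b1)));
 *     if (abs(s - a2) < ref && abs(s - a1) < ref && abs(s - b1) < ref &&
 *         abs(s - b2) < ref)
 *       return avg;
 *     return s;
 *   }
 */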
/* Transposes the 12 low bytes of sixteen 16-byte rows into twelve 16-byte
 * column vectors (in12..in15 are consumed as inputs only).  A duplicated
 * ILVR_B2_SH(in9, in8, ...) statement has been removed; it recomputed the
 * same results from unchanged inputs. */
#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9,    \
                         in10, in11, in12, in13, in14, in15)                  \
  {                                                                           \
    v8i16 temp0, temp1, temp2, temp3, temp4;                                  \
    v8i16 temp5, temp6, temp7, temp8, temp9;                                  \
                                                                              \
    ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1);                             \
    ILVRL_H2_SH(temp1, temp0, temp2, temp3);                                  \
    ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1);                             \
    ILVRL_H2_SH(temp1, temp0, temp4, temp5);                                  \
    ILVRL_W2_SH(temp4, temp2, temp0, temp1);                                  \
    ILVRL_W2_SH(temp5, temp3, temp2, temp3);                                  \
    ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5);                           \
    ILVRL_H2_SH(temp5, temp4, temp6, temp7);                                  \
    ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5);                         \
    ILVRL_H2_SH(temp5, temp4, temp8, temp9);                                  \
    ILVRL_W2_SH(temp8, temp6, temp4, temp5);                                  \
    ILVRL_W2_SH(temp9, temp7, temp6, temp7);                                  \
    ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9);                             \
    ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2);                         \
    in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0);                    \
    in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1);                    \
    ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1);                             \
    ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6);                         \
    in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2);                    \
    in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3);                    \
    ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, temp2, temp3,    \
               temp4, temp5);                                                 \
    ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4,        \
               temp6, temp7, temp8, temp9);                                   \
    ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1);                     \
    in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0);                    \
    in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0);                    \
    ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3);                     \
    in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2);                   \
    in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2);                   \
  }

/* Transposes an 8-row x 12-column byte block (rows in0..in7) into twelve
 * vectors (in0..in11) whose low 8 bytes hold the transposed columns. */
#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8,  \
                                in9, in10, in11)                              \
  {                                                                           \
    v8i16 temp0, temp1, temp2, temp3;                                         \
    v8i16 temp4, temp5, temp6, temp7;                                         \
                                                                              \
    ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1);                             \
    ILVRL_H2_SH(temp1, temp0, temp2, temp3);                                  \
    ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1);                             \
    ILVRL_H2_SH(temp1, temp0, temp4, temp5);                                  \
    ILVRL_W2_SH(temp4, temp2, temp0, temp1);                                  \
    ILVRL_W2_SH(temp5, temp3, temp2, temp3);                                  \
    ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5);                             \
    temp4 = __msa_ilvr_h(temp5, temp4);                                       \
    ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7);                             \
    temp5 = __msa_ilvr_h(temp7, temp6);                                       \
    ILVRL_W2_SH(temp5, temp4, temp6, temp7);                                  \
    in0 = (v16u8)temp0;                                                       \
    in2 = (v16u8)temp1;                                                       \
    in4 = (v16u8)temp2;                                                       \
    in6 = (v16u8)temp3;                                                       \
    in8 = (v16u8)temp6;                                                       \
    in10 = (v16u8)temp7;                                                      \
    in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0);                    \
    in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1);                    \
    in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2);                    \
    in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3);                    \
    in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6);                    \
    in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7);                   \
  }
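
/* The across (horizontal) pass reuses the vertical kernel on transposed
 * data: 8 output columns are loaded together with 2 columns of context on
 * either side (hence the 12-wide transposes), filtered per column, and
 * transposed back.  Scalar model (a sketch; filter_pixel as above — note
 * the vector code, like the scalar C version, reads the pre-filtered
 * neighbors, so writes are effectively delayed):
 *
 *   for (r = 0; r < band_rows; ++r)
 *     for (c = 0; c < cols; ++c)
 *       out[c] = filter_pixel(in[c - 2], in[c - 1], in[c], in[c + 1],
 *                             in[c + 2], f[c]);
 */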
static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
                                            int32_t src_stride,
                                            int32_t dst_stride, int32_t cols,
                                            uint8_t *f) {
  uint8_t *p_src = src_ptr;
  uint8_t *p_dst = dst_ptr;
  uint8_t *f_orig = f;
  uint8_t *p_dst_st = dst_ptr;
  uint16_t col;
  uint64_t out0, out1, out2, out3;
  v16u8 above2, above1, below2, below1, src, ref, ref_temp;
  v16u8 inter0, inter1, inter2, inter3, inter4, inter5;
  v16u8 inter6, inter7, inter8, inter9, inter10, inter11;

  /* Down (vertical) pass, 16 columns at a time.  The five input rows
   * rotate through the above2/above1/src/below1/below2 registers as the
   * window slides down the 8 chroma rows of the band. */
  for (col = (cols / 16); col--;) {
    ref = LD_UB(f);
    LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
    src = LD_UB(p_src);
    LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
    above2 = LD_UB(p_src + 3 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
    above1 = LD_UB(p_src + 4 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
    src = LD_UB(p_src + 5 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
    below1 = LD_UB(p_src + 6 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
    below2 = LD_UB(p_src + 7 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
    above2 = LD_UB(p_src + 8 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
    above1 = LD_UB(p_src + 9 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
    ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
           p_dst, dst_stride);

    p_dst += 16;
    p_src += 16;
    f += 16;
  }

  /* Trailing 8 columns, stored as 8-byte halves via SD4.  The test must be
   * on cols % 16 (not cols / 16, which fires on exact multiples of 16 and
   * never fires for cols < 16). */
  if (0 != (cols % 16)) {
    ref = LD_UB(f);
    LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
    src = LD_UB(p_src);
    LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
    above2 = LD_UB(p_src + 3 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
    above1 = LD_UB(p_src + 4 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
    src = LD_UB(p_src + 5 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
    below1 = LD_UB(p_src + 6 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
    below2 = LD_UB(p_src + 7 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
    above2 = LD_UB(p_src + 8 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
    above1 = LD_UB(p_src + 9 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
    out0 = __msa_copy_u_d((v2i64)inter0, 0);
    out1 = __msa_copy_u_d((v2i64)inter1, 0);
    out2 = __msa_copy_u_d((v2i64)inter2, 0);
    out3 = __msa_copy_u_d((v2i64)inter3, 0);
    SD4(out0, out1, out2, out3, p_dst, dst_stride);

    out0 = __msa_copy_u_d((v2i64)inter4, 0);
    out1 = __msa_copy_u_d((v2i64)inter5, 0);
    out2 = __msa_copy_u_d((v2i64)inter6, 0);
    out3 = __msa_copy_u_d((v2i64)inter7, 0);
    SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
  }

  f = f_orig;
  p_dst = dst_ptr - 2;
  LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
         inter6, inter7);
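
  /* Across pass, 8 columns per iteration on transposed data.  At the left
   * frame edge the two missing context columns are replicated from column 0
   * (above2 = above1 = inter2); at the right edge they are replicated from
   * the last column (inter9).  Each output column gets its own threshold,
   * splatted from the per-column limit vector with __msa_splati_b. */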
  for (col = 0; col < (cols / 8); ++col) {
    ref = LD_UB(f);
    f += 8;
    VPX_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3, inter4, inter5,
                            inter6, inter7, inter8, inter9, inter10, inter11);
    if (0 == col) {
      above2 = inter2;
      above1 = inter2;
    } else {
      above2 = inter0;
      above1 = inter1;
    }
    src = inter2;
    below1 = inter3;
    below2 = inter4;
    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
    above2 = inter5;
    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
    above1 = inter6;
    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
    src = inter7;
    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
    below1 = inter8;
    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
    below2 = inter9;
    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
    if (col == (cols / 8 - 1)) {
      above2 = inter9;
    } else {
      above2 = inter10;
    }
    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
    if (col == (cols / 8 - 1)) {
      above1 = inter9;
    } else {
      above1 = inter11;
    }
    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
    TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8,
                       inter9, inter2, inter3, inter4, inter5, inter6, inter7,
                       inter8, inter9);
    p_dst += 8;
    LD_UB2(p_dst, dst_stride, inter0, inter1);
    ST8x1_UB(inter2, p_dst_st);
    ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
    LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
    ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
    ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
    LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
    ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
    ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
    LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
    ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
    ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
    p_dst_st += 8;
  }
}
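
/* Scalar model of the down pass implemented by the chroma function above
 * and the luma function below (a sketch; filter_pixel is the model from
 * the top of the file, band_rows is 8 for chroma and 16 for luma, and the
 * caller must supply two valid rows above and below the band):
 *
 *   for (c = 0; c < cols; ++c)
 *     for (r = 0; r < band_rows; ++r)
 *       dst[r * dst_stride + c] =
 *           filter_pixel(src[(r - 2) * src_stride + c],
 *                        src[(r - 1) * src_stride + c],
 *                        src[r * src_stride + c],
 *                        src[(r + 1) * src_stride + c],
 *                        src[(r + 2) * src_stride + c], f[c]);
 */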
static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
                                          int32_t src_stride,
                                          int32_t dst_stride, int32_t cols,
                                          uint8_t *f) {
  uint8_t *p_src = src_ptr;
  uint8_t *p_dst = dst_ptr;
  uint8_t *p_dst_st = dst_ptr;
  uint8_t *f_orig = f;
  uint16_t col;
  uint64_t out0, out1, out2, out3;
  v16u8 above2, above1, below2, below1;
  v16u8 src, ref, ref_temp;
  v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
  v16u8 inter7, inter8, inter9, inter10, inter11;
  v16u8 inter12, inter13, inter14, inter15;

  /* Down (vertical) pass over the 16-row luma band, 16 columns at a time. */
  for (col = (cols / 16); col--;) {
    ref = LD_UB(f);
    LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
    src = LD_UB(p_src);
    LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
    above2 = LD_UB(p_src + 3 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
    above1 = LD_UB(p_src + 4 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
    src = LD_UB(p_src + 5 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
    below1 = LD_UB(p_src + 6 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
    below2 = LD_UB(p_src + 7 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
    above2 = LD_UB(p_src + 8 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
    above1 = LD_UB(p_src + 9 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
    src = LD_UB(p_src + 10 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
    below1 = LD_UB(p_src + 11 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
    below2 = LD_UB(p_src + 12 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
    above2 = LD_UB(p_src + 13 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
    above1 = LD_UB(p_src + 14 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
    src = LD_UB(p_src + 15 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
    below1 = LD_UB(p_src + 16 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
    below2 = LD_UB(p_src + 17 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
    ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
           p_dst, dst_stride);
    ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13, inter14,
           inter15, p_dst + 8 * dst_stride, dst_stride);
    p_src += 16;
    p_dst += 16;
    f += 16;
  }

  /* Trailing 8 columns (see the cols % 16 note in the chroma function). */
  if (0 != (cols % 16)) {
    ref = LD_UB(f);
    LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
    src = LD_UB(p_src);
    LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
    above2 = LD_UB(p_src + 3 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
    above1 = LD_UB(p_src + 4 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
    src = LD_UB(p_src + 5 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
    below1 = LD_UB(p_src + 6 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
    below2 = LD_UB(p_src + 7 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
    above2 = LD_UB(p_src + 8 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
    above1 = LD_UB(p_src + 9 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
    src = LD_UB(p_src + 10 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
    below1 = LD_UB(p_src + 11 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
    below2 = LD_UB(p_src + 12 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
    above2 = LD_UB(p_src + 13 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
    above1 = LD_UB(p_src + 14 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
    src = LD_UB(p_src + 15 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
    below1 = LD_UB(p_src + 16 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
    below2 = LD_UB(p_src + 17 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
    out0 = __msa_copy_u_d((v2i64)inter0, 0);
    out1 = __msa_copy_u_d((v2i64)inter1, 0);
    out2 = __msa_copy_u_d((v2i64)inter2, 0);
    out3 = __msa_copy_u_d((v2i64)inter3, 0);
    SD4(out0, out1, out2, out3, p_dst, dst_stride);

    out0 = __msa_copy_u_d((v2i64)inter4, 0);
    out1 = __msa_copy_u_d((v2i64)inter5, 0);
    out2 = __msa_copy_u_d((v2i64)inter6, 0);
    out3 = __msa_copy_u_d((v2i64)inter7, 0);
    SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);

    out0 = __msa_copy_u_d((v2i64)inter8, 0);
    out1 = __msa_copy_u_d((v2i64)inter9, 0);
    out2 = __msa_copy_u_d((v2i64)inter10, 0);
    out3 = __msa_copy_u_d((v2i64)inter11, 0);
    SD4(out0, out1, out2, out3, p_dst + 8 * dst_stride, dst_stride);

    out0 = __msa_copy_u_d((v2i64)inter12, 0);
    out1 = __msa_copy_u_d((v2i64)inter13, 0);
    out2 = __msa_copy_u_d((v2i64)inter14, 0);
    out3 = __msa_copy_u_d((v2i64)inter15, 0);
    SD4(out0, out1, out2, out3, p_dst + 12 * dst_stride, dst_stride);
  }

  f = f_orig;
  p_dst = dst_ptr - 2;
  LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
         inter6, inter7);
  LD_UB8(p_dst + 8 * dst_stride, dst_stride, inter8, inter9, inter10, inter11,
         inter12, inter13, inter14, inter15);
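
  /* Across pass: sixteen 16-byte rows are loaded, TRANSPOSE12x16_B turns
   * the 12 needed columns (2 of left context, 8 outputs, 2 of right
   * context) into registers, the columns are filtered exactly like the
   * down pass, and VPX_TRANSPOSE8x16_UB_UB turns the 8 filtered columns
   * back into sixteen 8-byte row fragments for the stores. */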
  for (col = 0; col < cols / 8; ++col) {
    ref = LD_UB(f);
    f += 8;
    TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5, inter6,
                     inter7, inter8, inter9, inter10, inter11, inter12,
                     inter13, inter14, inter15);
    if (0 == col) {
      above2 = inter2;
      above1 = inter2;
    } else {
      above2 = inter0;
      above1 = inter1;
    }

    src = inter2;
    below1 = inter3;
    below2 = inter4;
    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
    above2 = inter5;
    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
    above1 = inter6;
    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
    src = inter7;
    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
    below1 = inter8;
    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
    below2 = inter9;
    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
    if (col == (cols / 8 - 1)) {
      above2 = inter9;
    } else {
      above2 = inter10;
    }
    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
    if (col == (cols / 8 - 1)) {
      above1 = inter9;
    } else {
      above1 = inter11;
    }
    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
    VPX_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
                            inter8, inter9, inter2, inter3, inter4, inter5,
                            inter6, inter7, inter8, inter9, inter10, inter11,
                            inter12, inter13, inter14, inter15, above2,
                            above1);

    p_dst += 8;
    LD_UB2(p_dst, dst_stride, inter0, inter1);
    ST8x1_UB(inter2, p_dst_st);
    ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
    LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
    ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
    ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
    LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
    ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
    ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
    LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
    ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
    ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
    LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9);
    ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride));
    ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride));
    LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11);
    ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride));
    ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride));
    LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13);
    ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride));
    ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride));
    LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15);
    ST8x1_UB(above2, (p_dst_st + 14 * dst_stride));
    ST8x1_UB(above1, (p_dst_st + 15 * dst_stride));
    p_dst_st += 8;
  }
}

void vpx_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
                                              int32_t src_stride,
                                              int32_t dst_stride, int32_t cols,
                                              uint8_t *f, int32_t size) {
  if (8 == size) {
    postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride, cols, f);
  } else if (16 == size) {
    postproc_down_across_luma_msa(src, dst, src_stride, dst_stride, cols, f);
  }
}
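
/* Illustrative caller (a sketch, not from this file): VP8 postproc invokes
 * the dispatcher once per macroblock row, with 'size' selecting the plane
 * kind.  The names below (y_src, uv_cols, limits, ...) are hypothetical.
 *
 *   vpx_post_proc_down_and_across_mb_row_msa(y_src, y_dst, y_stride,
 *                                            y_stride, y_cols, limits, 16);
 *   vpx_post_proc_down_and_across_mb_row_msa(u_src, u_dst, uv_stride,
 *                                            uv_stride, uv_cols, limits, 8);
 */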
void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
                                   int32_t rows, int32_t cols,
                                   int32_t flimit) {
  int32_t row, col, cnt;
  uint8_t *src_dup = src_ptr;
  v16u8 src0, src, tmp_orig;
  v16u8 tmp = { 0 };
  v16i8 zero = { 0 };
  v8u16 sum_h, src_r_h, src_l_h;
  v4u32 src_r_w;
  v4i32 flimit_vec;

  flimit_vec = __msa_fill_w(flimit);
  for (row = rows; row--;) {
    int32_t sum_sq;
    int32_t sum = 0;
    /* Replicate the edge pixels into the left and right borders of the
     * row, then seed the window sum and sum of squares. */
    src0 = (v16u8)__msa_fill_b(src_dup[0]);
    ST8x1_UB(src0, (src_dup - 8));

    src0 = (v16u8)__msa_fill_b(src_dup[cols - 1]);
    ST_UB(src0, src_dup + cols);
    src_dup[cols + 16] = src_dup[cols - 1];
    tmp_orig = (v16u8)__msa_ldi_b(0);
    tmp_orig[15] = tmp[15];
    src = LD_UB(src_dup - 8);
    src[15] = 0;
    ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
    src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
    src_r_w += __msa_dotp_u_w(src_l_h, src_l_h);
    sum_sq = HADD_SW_S32(src_r_w) + 16;
    sum_h = __msa_hadd_u_h(src, src);
    sum = HADD_UH_U32(sum_h);
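    /* The block below keeps a 15-tap running window per pixel.  Scalar
     * model of the inner loop (a sketch, mirroring the C version
     * vpx_mbpost_proc_across_ip_c):
     *
     *   for (c = 0; c < cols; ++c) {
     *     sum += s[c + 7] - s[c - 8];                      // slide window
     *     sum_sq += s[c + 7] * s[c + 7] - s[c - 8] * s[c - 8];
     *     d = (8 + sum + s[c]) >> 4;
     *     if (15 * sum_sq - sum * sum < flimit) s_out[c] = d;
     *   }
     */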
    {
      v16u8 src7, src8, src_r, src_l;
      v16i8 mask;
      v8u16 add_r, add_l;
      v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1;
      v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3;
      v4i32 sub0, sub1, sub2, sub3;
      v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
      v4i32 mul0, mul1, mul2, mul3;
      v4i32 total0, total1, total2, total3;
      v8i16 const8 = __msa_fill_h(8);

      src7 = LD_UB(src_dup + 7);
      src8 = LD_UB(src_dup - 8);
      for (col = 0; col < (cols >> 4); ++col) {
        ILVRL_B2_UB(src7, src8, src_r, src_l);
        HSUB_UB2_SH(src_r, src_l, sub_r, sub_l);

        /* Prefix-sum the sixteen per-pixel window deltas. */
        sum_r[0] = sum + sub_r[0];
        for (cnt = 0; cnt < 7; ++cnt) {
          sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1];
        }
        sum_l[0] = sum_r[7] + sub_l[0];
        for (cnt = 0; cnt < 7; ++cnt) {
          sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
        }
        sum = sum_l[7];
        src = LD_UB(src_dup + 16 * col);
        ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
        src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);
        src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);
        tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);

        HADD_UB2_UH(src_r, src_l, add_r, add_l);
        UNPCK_SH_SW(sub_r, sub0, sub1);
        UNPCK_SH_SW(sub_l, sub2, sub3);
        ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w);
        ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w);
        MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3, mul0,
             mul1, mul2, mul3);
        /* Prefix-sum the squared-sum deltas ((a + b) * (a - b) = a^2 - b^2). */
        sum_sq0[0] = sum_sq + mul0[0];
        for (cnt = 0; cnt < 3; ++cnt) {
          sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1];
        }
        sum_sq1[0] = sum_sq0[3] + mul1[0];
        for (cnt = 0; cnt < 3; ++cnt) {
          sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1];
        }
        sum_sq2[0] = sum_sq1[3] + mul2[0];
        for (cnt = 0; cnt < 3; ++cnt) {
          sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1];
        }
        sum_sq3[0] = sum_sq2[3] + mul3[0];
        for (cnt = 0; cnt < 3; ++cnt) {
          sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1];
        }
        sum_sq = sum_sq3[3];

        UNPCK_SH_SW(sum_r, sum0_w, sum1_w);
        UNPCK_SH_SW(sum_l, sum2_w, sum3_w);
        total0 = sum_sq0 * __msa_ldi_w(15);
        total0 -= sum0_w * sum0_w;
        total1 = sum_sq1 * __msa_ldi_w(15);
        total1 -= sum1_w * sum1_w;
        total2 = sum_sq2 * __msa_ldi_w(15);
        total2 -= sum2_w * sum2_w;
        total3 = sum_sq3 * __msa_ldi_w(15);
        total3 -= sum3_w * sum3_w;
        total0 = (total0 < flimit_vec);
        total1 = (total1 < flimit_vec);
        total2 = (total2 < flimit_vec);
        total3 = (total3 < flimit_vec);
        PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
        mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
        tmp = __msa_bmz_v(tmp, src, (v16u8)mask);

        if (col == 0) {
          uint64_t src_d;

          src_d = __msa_copy_u_d((v2i64)tmp_orig, 1);
          SD(src_d, (src_dup - 8));
        }

        src7 = LD_UB(src_dup + 16 * (col + 1) + 7);
        src8 = LD_UB(src_dup + 16 * (col + 1) - 8);
        ST_UB(tmp, (src_dup + (16 * col)));
      }

      src_dup += pitch;
    }
  }
}

void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
                              int32_t cols, int32_t flimit) {
  int32_t row, col, cnt, i;
  v4i32 flimit_vec;
  v16u8 dst7, dst8, dst_r_b, dst_l_b;
  v16i8 mask;
  v8u16 add_r, add_l;
  v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1;
  v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3;

  flimit_vec = __msa_fill_w(flimit);

  for (col = 0; col < (cols >> 4); ++col) {
    uint8_t *dst_tmp = &dst_ptr[col << 4];
    v16u8 dst;
    v16i8 zero = { 0 };
    v16u8 tmp[16];
    v8i16 mult0, mult1, rv2_0, rv2_1;
    v8i16 sum0_h = { 0 };
    v8i16 sum1_h = { 0 };
    v4i32 mul0 = { 0 };
    v4i32 mul1 = { 0 };
    v4i32 mul2 = { 0 };
    v4i32 mul3 = { 0 };
    v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
    v4i32 add0, add1, add2, add3;
    const int16_t *rv2[16];

    dst = LD_UB(dst_tmp);
    /* Column offsets into the vpx_rv table repeat every 8 lanes; the
     * original loop also stepped a dead counter, which has been dropped. */
    for (i = 0; i < 16; ++i) {
      rv2[i] = vpx_rv + (i & 7);
    }
    /* Replicate the first and last rows into the borders so the vertical
     * window always reads valid data. */
    for (cnt = -8; cnt < 0; ++cnt) {
      ST_UB(dst, dst_tmp + cnt * pitch);
    }

    dst = LD_UB((dst_tmp + (rows - 1) * pitch));
    for (cnt = rows; cnt < rows + 17; ++cnt) {
      ST_UB(dst, dst_tmp + cnt * pitch);
    }
    /* Seed the per-column sum and sum of squares over rows -8..6. */
    for (cnt = -8; cnt <= 6; ++cnt) {
      dst = LD_UB(dst_tmp + (cnt * pitch));
      UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
      MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
      mul0 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult0);
      mul1 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult0);
      mul2 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult1);
      mul3 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult1);
      ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
    }
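
    /* The loop below slides a 15-row window down the 16 columns: the sums
     * are updated with the difference between the row entering (+7) and
     * the row leaving (-8) the window, filtered rows are parked in the
     * tmp[] ring buffer, and each row is written back 8 iterations later,
     * once it can no longer be read as a neighbor.  rv2_0/rv2_1 hold the
     * vpx_rv rounding offsets for the current row. */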
    for (row = 0; row < (rows + 8); ++row) {
      for (i = 0; i < 8; ++i) {
        rv2_0[i] = *(rv2[i] + (row & 127));
        rv2_1[i] = *(rv2[i + 8] + (row & 127));
      }
      dst7 = LD_UB(dst_tmp + (7 * pitch));
      dst8 = LD_UB(dst_tmp - (8 * pitch));
      ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b);

      HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l);
      UNPCK_SH_SW(sub_r, sub0, sub1);
      UNPCK_SH_SW(sub_l, sub2, sub3);
      sum0_h += sub_r;
      sum1_h += sub_l;

      HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l);

      ILVRL_H2_SW(zero, add_r, add0, add1);
      ILVRL_H2_SW(zero, add_l, add2, add3);
      mul0 += add0 * sub0;
      mul1 += add1 * sub1;
      mul2 += add2 * sub2;
      mul3 += add3 * sub3;
      dst = LD_UB(dst_tmp);
      ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
      dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);
      dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
      tmp[row & 15] = (v16u8)__msa_pckev_b((v16i8)dst8, (v16i8)dst7);

      UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
      UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
      total0 = mul0 * __msa_ldi_w(15);
      total0 -= sum0_w * sum0_w;
      total1 = mul1 * __msa_ldi_w(15);
      total1 -= sum1_w * sum1_w;
      total2 = mul2 * __msa_ldi_w(15);
      total2 -= sum2_w * sum2_w;
      total3 = mul3 * __msa_ldi_w(15);
      total3 -= sum3_w * sum3_w;
      total0 = (total0 < flimit_vec);
      total1 = (total1 < flimit_vec);
      total2 = (total2 < flimit_vec);
      total3 = (total3 < flimit_vec);
      PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
      mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
      tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8)mask);

      if (row >= 8) {
        ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));
      }

      dst_tmp += pitch;
    }
  }
}