/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp8_rtcd.h"
#include "vp8/common/postproc.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"

/* MFQE (Multi-Frame Quality Enhancement) weighted-blend kernels for MIPS MSA.
 *
 * Each output pixel is the rounded fixed-point blend
 *   dst = (src * src_weight + dst * dst_weight) >> MFQE_PRECISION
 * where dst_weight = (1 << MFQE_PRECISION) - src_weight, so the two weights
 * always sum to the fixed-point "one".
 *
 * NOTE(review): the LD2 / INSERT_D2_SB / UNPCK_UB_SH / SRARI_H2_SH /
 * ST8x2_UB / PCKEV_ST_SB helpers come from vp8_macros_msa.h, which is not
 * visible here; comments below describe their apparent roles based on the
 * naming convention — confirm against that header.
 */

/* Blend an 8x8 block of src into dst in place (dst is read and written).
 * src_weight is the fixed-point weight applied to src; dst keeps the
 * complementary weight. Processes 4 rows per loop iteration (2 iterations).
 */
static void filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride,
                                    uint8_t *dst_ptr, int32_t dst_stride,
                                    int32_t src_weight) {
  /* Complementary weight so src_weight + dst_weight == 1 << MFQE_PRECISION. */
  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
  int32_t row;
  uint64_t src0_d, src1_d, dst0_d, dst1_d;
  /* Zero-initialized so the upper 64 bits are defined before INSERT_D2_SB
   * fills the two doubleword lanes. */
  v16i8 src0 = { 0 };
  v16i8 src1 = { 0 };
  v16i8 dst0 = { 0 };
  v16i8 dst1 = { 0 };
  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;

  /* Splat the scalar weights across all 8 halfword lanes. */
  src_wt = __msa_fill_h(src_weight);
  dst_wt = __msa_fill_h(dst_weight);

  for (row = 2; row--;) {
    /* Rows 0-1 of this 4-row group: load two 8-byte rows from src and dst
     * and pack each pair into one 128-bit vector. */
    LD2(src_ptr, src_stride, src0_d, src1_d);
    src_ptr += (2 * src_stride);
    LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
    INSERT_D2_SB(src0_d, src1_d, src0);
    INSERT_D2_SB(dst0_d, dst1_d, dst0);

    /* Rows 2-3: dst_ptr has not been advanced yet (stores below advance it),
     * hence the explicit + 2 * dst_stride offset here. */
    LD2(src_ptr, src_stride, src0_d, src1_d);
    src_ptr += (2 * src_stride);
    LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
    INSERT_D2_SB(src0_d, src1_d, src1);
    INSERT_D2_SB(dst0_d, dst1_d, dst1);

    /* Rows 0-1: widen bytes to halfwords (right/left halves), blend in
     * 16-bit arithmetic, round-shift by MFQE_PRECISION, then pack back to
     * bytes and store two 8-pixel rows. */
    UNPCK_UB_SH(src0, src_r, src_l);
    UNPCK_UB_SH(dst0, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
    ST8x2_UB(dst0, dst_ptr, dst_stride);
    dst_ptr += (2 * dst_stride);

    /* Rows 2-3: same blend for the second pair of rows. */
    UNPCK_UB_SH(src1, src_r, src_l);
    UNPCK_UB_SH(dst1, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
    ST8x2_UB(dst1, dst_ptr, dst_stride);
    dst_ptr += (2 * dst_stride);
  }
}

/* Blend a 16x16 block of src into dst in place. Same fixed-point blend as
 * the 8x8 variant, but each row is a full 16-byte vector; the loop is
 * unrolled 4 rows per iteration (4 iterations total).
 */
static void filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride,
                                      uint8_t *dst_ptr, int32_t dst_stride,
                                      int32_t src_weight) {
  /* Complementary weight so src_weight + dst_weight == 1 << MFQE_PRECISION. */
  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
  int32_t row;
  v16i8 src0, src1, src2, src3;
  v16i8 dst0, dst1, dst2, dst3;
  v8i16 src_wt, dst_wt;
  v8i16 res_h_r, res_h_l;
  v8i16 src_r, src_l, dst_r, dst_l;

  /* Splat the scalar weights across all 8 halfword lanes. */
  src_wt = __msa_fill_h(src_weight);
  dst_wt = __msa_fill_h(dst_weight);

  for (row = 4; row--;) {
    /* Load 4 full 16-byte rows from src and dst. */
    LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);

    /* Row 0: widen to halfwords, blend, round-shift, pack and store. */
    UNPCK_UB_SH(src0, src_r, src_l);
    UNPCK_UB_SH(dst0, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    /* Row 1. */
    UNPCK_UB_SH(src1, src_r, src_l);
    UNPCK_UB_SH(dst1, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    /* Row 2. */
    UNPCK_UB_SH(src2, src_r, src_l);
    UNPCK_UB_SH(dst2, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    /* Row 3. */
    UNPCK_UB_SH(src3, src_r, src_l);
    UNPCK_UB_SH(dst3, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;
  }
}

/* Public RTCD entry point: MSA implementation of the 16x16 MFQE blend.
 * Thin wrapper around the static kernel above. */
void vp8_filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride,
                                   uint8_t *dst_ptr, int32_t dst_stride,
                                   int32_t src_weight) {
  filter_by_weight16x16_msa(src_ptr, src_stride, dst_ptr, dst_stride,
                            src_weight);
}

/* Public RTCD entry point: MSA implementation of the 8x8 MFQE blend.
 * Thin wrapper around the static kernel above. */
void vp8_filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride,
                                 uint8_t *dst_ptr, int32_t dst_stride,
                                 int32_t src_weight) {
  filter_by_weight8x8_msa(src_ptr, src_stride, dst_ptr, dst_stride, src_weight);
}