/* mfqe_msa.c revision da49e34c1fb5e99681f4ad99c21d9cfd83eddb96 */
1/* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "./vp8_rtcd.h" 12#include "vp8/common/postproc.h" 13#include "vp8/common/mips/msa/vp8_macros_msa.h" 14 15static void filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride, 16 uint8_t *dst_ptr, int32_t dst_stride, 17 int32_t src_weight) 18{ 19 int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight; 20 int32_t row; 21 uint64_t src0_d, src1_d, dst0_d, dst1_d; 22 v16i8 src0 = { 0 }; 23 v16i8 src1 = { 0 }; 24 v16i8 dst0 = { 0 }; 25 v16i8 dst1 = { 0 }; 26 v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l; 27 28 src_wt = __msa_fill_h(src_weight); 29 dst_wt = __msa_fill_h(dst_weight); 30 31 for (row = 2; row--;) 32 { 33 LD2(src_ptr, src_stride, src0_d, src1_d); 34 src_ptr += (2 * src_stride); 35 LD2(dst_ptr, dst_stride, dst0_d, dst1_d); 36 INSERT_D2_SB(src0_d, src1_d, src0); 37 INSERT_D2_SB(dst0_d, dst1_d, dst0); 38 39 LD2(src_ptr, src_stride, src0_d, src1_d); 40 src_ptr += (2 * src_stride); 41 LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d); 42 INSERT_D2_SB(src0_d, src1_d, src1); 43 INSERT_D2_SB(dst0_d, dst1_d, dst1); 44 45 UNPCK_UB_SH(src0, src_r, src_l); 46 UNPCK_UB_SH(dst0, dst_r, dst_l); 47 res_h_r = (src_r * src_wt); 48 res_h_r += (dst_r * dst_wt); 49 res_h_l = (src_l * src_wt); 50 res_h_l += (dst_l * dst_wt); 51 SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); 52 dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r); 53 ST8x2_UB(dst0, dst_ptr, dst_stride); 54 dst_ptr += (2 * dst_stride); 55 56 UNPCK_UB_SH(src1, src_r, src_l); 57 UNPCK_UB_SH(dst1, dst_r, dst_l); 58 res_h_r = (src_r * src_wt); 59 res_h_r += 
(dst_r * dst_wt); 60 res_h_l = (src_l * src_wt); 61 res_h_l += (dst_l * dst_wt); 62 SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); 63 dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r); 64 ST8x2_UB(dst1, dst_ptr, dst_stride); 65 dst_ptr += (2 * dst_stride); 66 } 67} 68 69static void filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride, 70 uint8_t *dst_ptr, int32_t dst_stride, 71 int32_t src_weight) 72{ 73 int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight; 74 int32_t row; 75 v16i8 src0, src1, src2, src3; 76 v16i8 dst0, dst1, dst2, dst3; 77 v8i16 src_wt, dst_wt; 78 v8i16 res_h_r, res_h_l; 79 v8i16 src_r, src_l, dst_r, dst_l; 80 81 src_wt = __msa_fill_h(src_weight); 82 dst_wt = __msa_fill_h(dst_weight); 83 84 for (row = 4; row--;) 85 { 86 LD_SB4(src_ptr, src_stride, src0, src1, src2, src3); 87 src_ptr += (4 * src_stride); 88 LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3); 89 90 UNPCK_UB_SH(src0, src_r, src_l); 91 UNPCK_UB_SH(dst0, dst_r, dst_l); 92 res_h_r = (src_r * src_wt); 93 res_h_r += (dst_r * dst_wt); 94 res_h_l = (src_l * src_wt); 95 res_h_l += (dst_l * dst_wt); 96 SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); 97 PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); 98 dst_ptr += dst_stride; 99 100 UNPCK_UB_SH(src1, src_r, src_l); 101 UNPCK_UB_SH(dst1, dst_r, dst_l); 102 res_h_r = (src_r * src_wt); 103 res_h_r += (dst_r * dst_wt); 104 res_h_l = (src_l * src_wt); 105 res_h_l += (dst_l * dst_wt); 106 SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); 107 PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); 108 dst_ptr += dst_stride; 109 110 UNPCK_UB_SH(src2, src_r, src_l); 111 UNPCK_UB_SH(dst2, dst_r, dst_l); 112 res_h_r = (src_r * src_wt); 113 res_h_r += (dst_r * dst_wt); 114 res_h_l = (src_l * src_wt); 115 res_h_l += (dst_l * dst_wt); 116 SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); 117 PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); 118 dst_ptr += dst_stride; 119 120 UNPCK_UB_SH(src3, src_r, src_l); 121 UNPCK_UB_SH(dst3, dst_r, dst_l); 122 res_h_r = (src_r * 
src_wt); 123 res_h_r += (dst_r * dst_wt); 124 res_h_l = (src_l * src_wt); 125 res_h_l += (dst_l * dst_wt); 126 SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); 127 PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); 128 dst_ptr += dst_stride; 129 } 130} 131 132void vp8_filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride, 133 uint8_t *dst_ptr, int32_t dst_stride, 134 int32_t src_weight) 135{ 136 filter_by_weight16x16_msa(src_ptr, src_stride, dst_ptr, dst_stride, 137 src_weight); 138} 139 140void vp8_filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride, 141 uint8_t *dst_ptr, int32_t dst_stride, 142 int32_t src_weight) 143{ 144 filter_by_weight8x8_msa(src_ptr, src_stride, dst_ptr, dst_stride, 145 src_weight); 146} 147