/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp9_rtcd.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vpx_dsp/mips/macros_msa.h"

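/* Weighted blend of an 8x8 src block into dst for multi-frame quality
 * enhancement (MFQE):
 *   dst = (src * src_weight + dst * dst_weight + round) >> MFQE_PRECISION
 * with dst_weight = (1 << MFQE_PRECISION) - src_weight. Each pixel is
 * widened to 16 bits, multiplied by its weight, and the two products are
 * summed before the rounding shift. */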
static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                                    uint8_t *dst_ptr, int32_t dst_stride,
                                    int32_t src_weight) {
  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
  int32_t row;
  uint64_t src0_d, src1_d, dst0_d, dst1_d;
  v16i8 src0 = { 0 };
  v16i8 src1 = { 0 };
  v16i8 dst0 = { 0 };
  v16i8 dst1 = { 0 };
  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;

  /* Broadcast the two blend weights across all halfword lanes. */
  src_wt = __msa_fill_h(src_weight);
  dst_wt = __msa_fill_h(dst_weight);

  /* Each iteration handles four 8-pixel rows, packed two rows per vector. */
  for (row = 2; row--;) {
    LD2(src_ptr, src_stride, src0_d, src1_d);
    src_ptr += (2 * src_stride);
    LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
    INSERT_D2_SB(src0_d, src1_d, src0);
    INSERT_D2_SB(dst0_d, dst1_d, dst0);

    LD2(src_ptr, src_stride, src0_d, src1_d);
    src_ptr += (2 * src_stride);
    /* dst_ptr advances only after each store, so the second pair of dst
       rows is loaded at an explicit two-row offset. */
    LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
    INSERT_D2_SB(src0_d, src1_d, src1);
    INSERT_D2_SB(dst0_d, dst1_d, dst1);

    /* Widen to 16 bits, form src * src_wt + dst * dst_wt, round-shift by
       MFQE_PRECISION, and pack the halves back to 8-bit pixels. */
    UNPCK_UB_SH(src0, src_r, src_l);
    UNPCK_UB_SH(dst0, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
    ST8x2_UB(dst0, dst_ptr, dst_stride);
    dst_ptr += (2 * dst_stride);

    UNPCK_UB_SH(src1, src_r, src_l);
    UNPCK_UB_SH(dst1, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
    ST8x2_UB(dst1, dst_ptr, dst_stride);
    dst_ptr += (2 * dst_stride);
  }
}

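/* Same weighted MFQE blend as above for a 16x16 block. A full row fits in
 * one vector register, so four whole rows are loaded per iteration and each
 * blended row is packed and stored directly with PCKEV_ST_SB. */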
static void filter_by_weight16x16_msa(const uint8_t *src_ptr,
                                      int32_t src_stride,
                                      uint8_t *dst_ptr,
                                      int32_t dst_stride,
                                      int32_t src_weight) {
  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
  int32_t row;
  v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;

  /* Broadcast the two blend weights across all halfword lanes. */
  src_wt = __msa_fill_h(src_weight);
  dst_wt = __msa_fill_h(dst_weight);

  /* Each iteration blends four full 16-pixel rows; every row follows the
     same pattern: widen to 16 bits, blend, round-shift, pack and store. */
  for (row = 4; row--;) {
    LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);

    UNPCK_UB_SH(src0, src_r, src_l);
    UNPCK_UB_SH(dst0, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    UNPCK_UB_SH(src1, src_r, src_l);
    UNPCK_UB_SH(dst1, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    UNPCK_UB_SH(src2, src_r, src_l);
    UNPCK_UB_SH(dst2, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    UNPCK_UB_SH(src3, src_r, src_l);
    UNPCK_UB_SH(dst3, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;
  }
}

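/* Public wrappers matching the prototypes generated in vp9_rtcd.h; these
 * are the variants selected by the run-time CPU dispatch when MSA is
 * available. */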
void vp9_filter_by_weight8x8_msa(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride,
                                 int src_weight) {
  filter_by_weight8x8_msa(src, src_stride, dst, dst_stride, src_weight);
}

void vp9_filter_by_weight16x16_msa(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride,
                                   int src_weight) {
  filter_by_weight16x16_msa(src, src_stride, dst, dst_stride, src_weight);
}