/* mfqe_msa.c — VP8 MFQE filters, MIPS MSA version
 * (revision da49e34c1fb5e99681f4ad99c21d9cfd83eddb96) */
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp8_rtcd.h"
#include "vp8/common/postproc.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"

15static void filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride,
16                                    uint8_t *dst_ptr, int32_t dst_stride,
17                                    int32_t src_weight)
18{
19    int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
20    int32_t row;
21    uint64_t src0_d, src1_d, dst0_d, dst1_d;
22    v16i8 src0 = { 0 };
23    v16i8 src1 = { 0 };
24    v16i8 dst0 = { 0 };
25    v16i8 dst1 = { 0 };
26    v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
27
28    src_wt = __msa_fill_h(src_weight);
29    dst_wt = __msa_fill_h(dst_weight);
30
31    for (row = 2; row--;)
32    {
33        LD2(src_ptr, src_stride, src0_d, src1_d);
34        src_ptr += (2 * src_stride);
35        LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
36        INSERT_D2_SB(src0_d, src1_d, src0);
37        INSERT_D2_SB(dst0_d, dst1_d, dst0);
38
39        LD2(src_ptr, src_stride, src0_d, src1_d);
40        src_ptr += (2 * src_stride);
41        LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
42        INSERT_D2_SB(src0_d, src1_d, src1);
43        INSERT_D2_SB(dst0_d, dst1_d, dst1);
44
45        UNPCK_UB_SH(src0, src_r, src_l);
46        UNPCK_UB_SH(dst0, dst_r, dst_l);
47        res_h_r = (src_r * src_wt);
48        res_h_r += (dst_r * dst_wt);
49        res_h_l = (src_l * src_wt);
50        res_h_l += (dst_l * dst_wt);
51        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
52        dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
53        ST8x2_UB(dst0, dst_ptr, dst_stride);
54        dst_ptr += (2 * dst_stride);
55
56        UNPCK_UB_SH(src1, src_r, src_l);
57        UNPCK_UB_SH(dst1, dst_r, dst_l);
58        res_h_r = (src_r * src_wt);
59        res_h_r += (dst_r * dst_wt);
60        res_h_l = (src_l * src_wt);
61        res_h_l += (dst_l * dst_wt);
62        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
63        dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
64        ST8x2_UB(dst1, dst_ptr, dst_stride);
65        dst_ptr += (2 * dst_stride);
66    }
67}
69static void filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride,
70                                      uint8_t *dst_ptr, int32_t dst_stride,
71                                      int32_t src_weight)
72{
73    int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
74    int32_t row;
75    v16i8 src0, src1, src2, src3;
76    v16i8 dst0, dst1, dst2, dst3;
77    v8i16 src_wt, dst_wt;
78    v8i16 res_h_r, res_h_l;
79    v8i16 src_r, src_l, dst_r, dst_l;
80
81    src_wt = __msa_fill_h(src_weight);
82    dst_wt = __msa_fill_h(dst_weight);
83
84    for (row = 4; row--;)
85    {
86        LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
87        src_ptr += (4 * src_stride);
88        LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);
89
90        UNPCK_UB_SH(src0, src_r, src_l);
91        UNPCK_UB_SH(dst0, dst_r, dst_l);
92        res_h_r = (src_r * src_wt);
93        res_h_r += (dst_r * dst_wt);
94        res_h_l = (src_l * src_wt);
95        res_h_l += (dst_l * dst_wt);
96        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
97        PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
98        dst_ptr += dst_stride;
99
100        UNPCK_UB_SH(src1, src_r, src_l);
101        UNPCK_UB_SH(dst1, dst_r, dst_l);
102        res_h_r = (src_r * src_wt);
103        res_h_r += (dst_r * dst_wt);
104        res_h_l = (src_l * src_wt);
105        res_h_l += (dst_l * dst_wt);
106        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
107        PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
108        dst_ptr += dst_stride;
109
110        UNPCK_UB_SH(src2, src_r, src_l);
111        UNPCK_UB_SH(dst2, dst_r, dst_l);
112        res_h_r = (src_r * src_wt);
113        res_h_r += (dst_r * dst_wt);
114        res_h_l = (src_l * src_wt);
115        res_h_l += (dst_l * dst_wt);
116        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
117        PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
118        dst_ptr += dst_stride;
119
120        UNPCK_UB_SH(src3, src_r, src_l);
121        UNPCK_UB_SH(dst3, dst_r, dst_l);
122        res_h_r = (src_r * src_wt);
123        res_h_r += (dst_r * dst_wt);
124        res_h_l = (src_l * src_wt);
125        res_h_l += (dst_l * dst_wt);
126        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
127        PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
128        dst_ptr += dst_stride;
129    }
130}
/* RTCD entry point: weighted MFQE blend of a 16x16 block of src into dst.
 * src_weight is in units of 1 << MFQE_PRECISION; the remaining weight is
 * applied to the existing dst pixels. */
void vp8_filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride,
                                   uint8_t *dst_ptr, int32_t dst_stride,
                                   int32_t src_weight)
{
    filter_by_weight16x16_msa(src_ptr, src_stride, dst_ptr, dst_stride,
                              src_weight);
}
/* RTCD entry point: weighted MFQE blend of an 8x8 block of src into dst.
 * src_weight is in units of 1 << MFQE_PRECISION; the remaining weight is
 * applied to the existing dst pixels. */
void vp8_filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride,
                                 uint8_t *dst_ptr, int32_t dst_stride,
                                 int32_t src_weight)
{
    filter_by_weight8x8_msa(src_ptr, src_stride, dst_ptr, dst_stride,
                            src_weight);
}