/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp8_rtcd.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"
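
/* Temporal filter for one 16x16 block. For each pixel:
 *   diff     = frame1[i] - frame2[i]
 *   modifier = (3 * diff * diff + rounding) >> strength
 *   weight   = (16 - min(modifier, 16)) * filter_wt
 *   cnt[i]  += weight
 *   acc[i]  += weight * frame2[i]
 */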
static void temporal_filter_apply_16size_msa(
    uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr,
    int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) {
  uint32_t row;
  v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b;
  v16u8 frame_l, frame_h;
  v16i8 zero = { 0 };
  v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
  v8i16 diff0, diff1, cnt0, cnt1;
  v4i32 const3, const16, filter_wt, strength;
  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
  v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
  v4i32 acc0, acc1, acc2, acc3;

  filter_wt = __msa_fill_w(filter_wt_in);
  strength = __msa_fill_w(strength_in);
  const3 = __msa_ldi_w(3);
  const16 = __msa_ldi_w(16);
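
  /* Each iteration filters two 16-pixel rows, so a 16x16 block takes
   * 8 passes. */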
  for (row = 8; row--;) {
    frame1_0_b = LD_SB(frame1_ptr);
    frame2_0_b = LD_SB(frame2_ptr);
    frame1_ptr += stride;
    frame2_ptr += 16;
    frame1_1_b = LD_SB(frame1_ptr);
    frame2_1_b = LD_SB(frame2_ptr);
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);
    ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h);
    HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
         mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
         mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
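    /* Branch-free clamp: lanes where modifier >= 16 get an all-zero mask,
     * forcing the weight (16 - modifier) to zero, i.e. min(modifier, 16). */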
    diff0_r = (mod0_w < const16);
    diff0_l = (mod1_w < const16);
    diff1_r = (mod2_w < const16);
    diff1_l = (mod3_w < const16);
    SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);
    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;
    MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
         filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;
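    /* acc[i] += weight * frame2[i], computed in four 4x32-bit lanes. */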
    ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h);
    UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
    UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
    MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
         mod2_w, mod3_w);
    ST_SW2(mod0_w, mod1_w, acc, 4);
    ST_SW2(mod2_w, mod3_w, acc + 8, 4);
    acc += 16;
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);
    ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h);
    HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
         mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
         mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
    diff0_r = (mod0_w < const16);
    diff0_l = (mod1_w < const16);
    diff1_r = (mod2_w < const16);
    diff1_l = (mod3_w < const16);
    SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);
    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;
    MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
         filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h);
    UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
    UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
    MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
         mod2_w, mod3_w);
    ST_SW2(mod0_w, mod1_w, acc, 4);
    ST_SW2(mod2_w, mod3_w, acc + 8, 4);
    acc += 16;
    frame1_ptr += stride;
    frame2_ptr += 16;
  }
}
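
/* Same filter for one 8x8 block; two 8-pixel rows are packed into each
 * 16-byte vector, so the block takes 2 passes of four rows each. */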
static void temporal_filter_apply_8size_msa(
    uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr,
    int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) {
  uint32_t row;
  uint64_t f0, f1, f2, f3, f4, f5, f6, f7;
  v16i8 frame1 = { 0 };
  v16i8 frame2 = { 0 };
  v16i8 frame3 = { 0 };
  v16i8 frame4 = { 0 };
  v16u8 frame_l, frame_h;
  v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
  v8i16 diff0, diff1, cnt0, cnt1;
  v4i32 const3, const16;
  v4i32 filter_wt, strength;
  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
  v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
  v4i32 acc0, acc1, acc2, acc3;

  filter_wt = __msa_fill_w(filter_wt_in);
  strength = __msa_fill_w(strength_in);
  const3 = __msa_ldi_w(3);
  const16 = __msa_ldi_w(16);
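
  /* Load four rows of each frame as 64-bit words, then filter them as two
   * vector pairs using the same weight math as the 16x16 path. */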
  for (row = 2; row--;) {
    LD2(frame1_ptr, stride, f0, f1);
    frame1_ptr += (2 * stride);
    LD2(frame2_ptr, 8, f2, f3);
    frame2_ptr += 16;
    LD2(frame1_ptr, stride, f4, f5);
    frame1_ptr += (2 * stride);
    LD2(frame2_ptr, 8, f6, f7);
    frame2_ptr += 16;

    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);
    INSERT_D2_SB(f0, f1, frame1);
    INSERT_D2_SB(f2, f3, frame2);
    INSERT_D2_SB(f4, f5, frame3);
    INSERT_D2_SB(f6, f7, frame4);
    ILVRL_B2_UB(frame1, frame2, frame_l, frame_h);
    HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
         mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
         mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
    diff0_r = (mod0_w < const16);
    diff0_l = (mod1_w < const16);
    diff1_r = (mod2_w < const16);
    diff1_l = (mod3_w < const16);
    SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);
    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;
    MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
         filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h);
    UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
    UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
    MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
         mod2_w, mod3_w);
    ST_SW2(mod0_w, mod1_w, acc, 4);
    ST_SW2(mod2_w, mod3_w, acc + 8, 4);
    acc += 16;

    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);
    ILVRL_B2_UB(frame3, frame4, frame_l, frame_h);
    HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
         mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
         mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
    diff0_r = (mod0_w < const16);
    diff0_l = (mod1_w < const16);
    diff1_r = (mod2_w < const16);
    diff1_l = (mod3_w < const16);
    SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);
    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;
    MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
         filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h);
    UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
    UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
    MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
         mod2_w, mod3_w);
    ST_SW2(mod0_w, mod1_w, acc, 4);
    ST_SW2(mod2_w, mod3_w, acc + 8, 4);
    acc += 16;
  }
}
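
/* Entry point hooked up through vp8_rtcd.h: dispatch on block size; sizes
 * other than 8 and 16 fall back to the scalar reference loop below. */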
void vp8_temporal_filter_apply_msa(uint8_t *frame1, uint32_t stride,
                                   uint8_t *frame2, uint32_t block_size,
                                   int32_t strength, int32_t filter_weight,
                                   uint32_t *accumulator, uint16_t *count) {
  if (8 == block_size) {
    temporal_filter_apply_8size_msa(frame1, stride, frame2, strength,
                                    filter_weight, accumulator, count);
  } else if (16 == block_size) {
    temporal_filter_apply_16size_msa(frame1, stride, frame2, strength,
                                     filter_weight, accumulator, count);
  } else {
    uint32_t i, j, k;
    int32_t modifier;
    int32_t byte = 0;
    const int32_t rounding = strength > 0 ? 1 << (strength - 1) : 0;
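
    /* Scalar path: the same per-pixel weight math as the vector kernels. */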
    for (i = 0, k = 0; i < block_size; ++i) {
      for (j = 0; j < block_size; ++j, ++k) {
        int src_byte = frame1[byte];
        int pixel_value = *frame2++;

        modifier = src_byte - pixel_value;
        modifier *= modifier;
        modifier *= 3;
        modifier += rounding;
        modifier >>= strength;

        if (modifier > 16) modifier = 16;

        modifier = 16 - modifier;
        modifier *= filter_weight;

        count[k] += modifier;
        accumulator[k] += modifier * pixel_value;

        byte++;
      }

      byte += stride - block_size;
    }
  }
}