/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/loopfilter_msa.h"
#include "vpx_ports/mem.h"

static int32_t hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
                                    uint8_t *filter48,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
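    /* the flat mask is zero for every column: only the 4-tap filter output
     * is needed, so store it and signal the caller to skip the wide filter */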
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);

    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

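    /* Stage the filter8 results for the wide-filter pass: filter48 holds
     * p2, p1, p0, q0, q1, q2 at 16-byte offsets 0..80 and the flat mask at
     * offset 96 (read back by hz_lpf_t16_16w()). */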
    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

static void hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  flat = LD_UB(filter48 + 96);

  LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
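    /* no column needs the 16-wide filter: write back the filter8 results
     * staged in filter48 by hz_lpf_t4_and_t8_16w() */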
    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    src -= 3 * pitch;
    ST_UB4(p2, p1, p0, q0, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1, q2, src, pitch);
  } else {
    src -= 7 * pitch;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);

    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

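    /* Wide-filter output for p6: (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 +
     * q0 + 8) >> 4.  tmp1 keeps the running sum; each later output updates it
     * with a small add/subtract correction instead of recomputing the full
     * sum (see the p5..q6 steps below). */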
    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += pitch;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += pitch;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4);

    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += pitch;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += pitch;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += pitch;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += pitch;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += pitch;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);
  }
}

static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
                                        const uint8_t *b_limit_ptr,
                                        const uint8_t *limit_ptr,
                                        const uint8_t *thresh_ptr,
                                        int32_t count) {
  DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
  uint8_t early_exit = 0;

  (void)count;

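  /* Stage 1: 4-tap/8-tap filtering; intermediate results are staged in
   * filter48.  Stage 2 (the 16-wide filter) runs only when stage 1 did not
   * take its early exit. */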
  early_exit = hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
                                    limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    hz_lpf_t16_16w(src, pitch, filter48);
  }
}

static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr, int32_t count) {
  if (1 == count) {
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    uint64_t dword0, dword1;
    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 p0_filter16, p1_filter16;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };
    v8u16 tmp0, tmp1, tmp2;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8)__msa_fill_b(*thresh_ptr);
    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    limit = (v16u8)__msa_fill_b(*limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
                 mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

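    /* only the left 8 columns are filtered in this path, so clear the upper
     * half of the flat mask before it is tested and used below */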
    flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

    if (__msa_test_bz_v(flat)) {
      p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
      p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
      q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
      q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
      SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
    } else {
      /* convert 8 bit input data into 16 bit */
      ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                 q3_r);
      VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

      /* convert 16 bit output data into 8 bit */
      PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
                  q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
      PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

      /* store pixel values */
      p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
      p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
      p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
      q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
      q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
      q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

      /* load 16 vector elements */
      LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
      LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);

      VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

      if (__msa_test_bz_v(flat2)) {
        p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
        p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
        q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

        SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
        SD(q1_d, src + pitch);
        SD(q2_d, src + 2 * pitch);
      } else {
        /* LSB(right) 8 pixel operation */
        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
                   zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
                   q7_r);

        tmp0 = p7_r << 3;
        tmp0 -= p7_r;
        tmp0 += p6_r;
        tmp0 += q0_r;

        src -= 7 * pitch;

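        /* 8-column wide filter: tmp1 below is the same running sum as in
         * hz_lpf_t16_16w(), updated twice per block so that each block
         * produces two output rows (stored as 8-byte doublewords) */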
        /* calculation of p6 and p5 */
        tmp1 = p6_r + p5_r + p4_r + p3_r;
        tmp1 += (p2_r + p1_r + p0_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp0 = p5_r - p6_r + q1_r - p7_r;
        tmp1 += tmp0;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p4 and p3 */
        tmp0 = p4_r - p5_r + q2_r - p7_r;
        tmp2 = p3_r - p4_r + q3_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p2 and p1 */
        tmp0 = p2_r - p3_r + q4_r - p7_r;
        tmp2 = p1_r - p2_r + q5_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p0 and q0 */
        tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
        tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q1 and q2 */
        tmp0 = q7_r - q0_r + q1_r - p6_r;
        tmp2 = q7_r - q1_r + q2_r - p5_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q3 and q4 */
        tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
        tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q5 and q6 */
        tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
        tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
      }
    }
  } else {
    mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
                                count);
  }
}

void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
                               const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
}

void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
}

static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
         p1_org, p0_org);
  /* 8x8 transpose */
  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
                     p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
  /* 8x8 transpose */
  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
             tmp0, tmp1, tmp2, tmp3);
  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}

static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
                      q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
}

static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
                            int32_t out_pitch) {
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
  v4i32 tmp2, tmp3;

  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  input += (8 * in_pitch);
  LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);

  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p7, p6,
                      p5, p4, p3, p2, p1, p0);

  /* transpose 16x8 matrix into 8x16 */
  /* total: 8 intermediate registers and 32 instructions */
  q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
  q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
  q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
  q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
  q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
  q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
  q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
  q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);

  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
  tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
  tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);

  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
  tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
  tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);

  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
  q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
  tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
  q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
  q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
  tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
  q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}

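/* Vertical-edge counterpart of hz_lpf_t4_and_t8_16w(): src points into the
 * transposed scratch buffer (pitch 16), while early-exit results are written
 * straight back to the original frame through src_org / pitch_org. */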
static int32_t vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
                                   uint8_t *src_org, int32_t pitch_org,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3;

  /* load vector elements */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* convert 16 bit output data into 8 bit */
    p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
    p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
    p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
    q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
    q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
    q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);

    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

static int32_t vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                             uint8_t *filter48) {
  v16i8 zero = { 0 };
  v16u8 filter8, flat, flat2;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 tmp0_r, tmp1_r;
  v8i16 r_out;

  flat = LD_UB(filter48 + 6 * 16);

  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);

  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    v8i16 vec0, vec1, vec2, vec3, vec4;

    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

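    /* flat2 is zero: interleave the six staged filter8 vectors back into row
     * order and store 6 pixels per row around the vertical edge */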
    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);

    src_org -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 4, (src_org + 4), pitch);

    return 1;
  } else {
    src -= 7 * 16;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;

    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST8x1_UB(p6, src);
    src += 16;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST8x1_UB(p5, src);
    src += 16;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST8x1_UB(p4, src);
    src += 16;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST8x1_UB(p3, src);
    src += 16;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST8x1_UB(q3, src);
    src += 16;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST8x1_UB(q4, src);
    src += 16;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST8x1_UB(q5, src);
    src += 16;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST8x1_UB(q6, src);

    return 0;
  }
}

void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
                             const uint8_t *b_limit_ptr,
                             const uint8_t *limit_ptr,
                             const uint8_t *thresh_ptr) {
  uint8_t early_exit = 0;
  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
  uint8_t *filter48 = &transposed_input[16 * 16];

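  /* Work on a transposed copy so the horizontal-edge kernels can be reused
   * for this vertical edge; the copy is transposed back only when the
   * 16-wide filter actually modified it (both early exits write their
   * results straight to src). */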
  transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);

  early_exit =
      vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, pitch,
                          b_limit_ptr, limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    early_exit =
        vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, &filter48[0]);

    if (0 == early_exit) {
      transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
    }
  }
}

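/* 16-row vertical-edge variant: like vt_lpf_t4_and_t8_8w() but both halves of
 * every vector are processed, and the filter8 results are staged in filter48
 * for vt_lpf_t16_16w(). */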
static int32_t vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
                                    uint8_t *src_org, int32_t pitch,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;

  /* load vector elements */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src_org -= 2;
    ST4x8_UB(vec2, vec3, src_org, pitch);
    src_org += 8 * pitch;
    ST4x8_UB(vec4, vec5, src_org, pitch);

    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

static int32_t vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                              uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  flat = LD_UB(filter48 + 6 * 16);

  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);

  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

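    /* flat2 is zero for all 16 rows: interleave the staged filter8 vectors
     * back into row order and store 6 pixels per row around the edge */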
    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    src_org -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 4, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec5, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec5, 4, (src_org + 4), pitch);

    return 1;
  } else {
    src -= 7 * 16;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += 16;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += 16;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += 16;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += 16;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += 16;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += 16;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += 16;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);

    return 0;
  }
}

void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
                                  const uint8_t *b_limit_ptr,
                                  const uint8_t *limit_ptr,
                                  const uint8_t *thresh_ptr) {
  uint8_t early_exit = 0;
  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
  uint8_t *filter48 = &transposed_input[16 * 16];

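  /* 16x16 transpose into the scratch buffer, filter it as a horizontal edge,
   * and transpose back only if the 16-wide filter modified the copy (both
   * early exits store their results straight to src) */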
  transpose_16x16((src - 8), pitch, &transposed_input[0], 16);

  early_exit =
      vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
                           pitch, b_limit_ptr, limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    early_exit =
        vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]);

    if (0 == early_exit) {
      transpose_16x16(transposed_input, 16, (src - 8), pitch);
    }
  }
}
