1/*
2 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12#include <stdlib.h>
13#include "vp8_rtcd.h"
14#include "vp8/common/onyxc_int.h"
15
16#if HAVE_DSPR2
17typedef unsigned char uc;
18
19/* prefetch data for load */
20inline void prefetch_load_lf(unsigned char *src)
21{
22    __asm__ __volatile__ (
23        "pref   0,  0(%[src])   \n\t"
24        :
25        : [src] "r" (src)
26    );
27}
28
29
30/* prefetch data for store */
31inline void prefetch_store_lf(unsigned char *dst)
32{
33    __asm__ __volatile__ (
34        "pref   1,  0(%[dst])   \n\t"
35        :
36        : [dst] "r" (dst)
37    );
38}
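
/* MIPS "pref" hint 0 requests data that is about to be loaded and hint 1
 * data that is about to be stored; the two helpers above simply issue the
 * corresponding hint for the cache line containing the given address.
 */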
39
40/* processing 4 pixels at the same time
41 * compute hev and mask in the same function
42 */
43static __inline void vp8_filter_mask_vec_mips
44(
45    uint32_t limit,
46    uint32_t flimit,
47    uint32_t p1,
48    uint32_t p0,
49    uint32_t p3,
50    uint32_t p2,
51    uint32_t q0,
52    uint32_t q1,
53    uint32_t q2,
54    uint32_t q3,
55    uint32_t thresh,
56    uint32_t *hev,
57    uint32_t *mask
58)
59{
60    uint32_t c, r, r3, r_k;
61    uint32_t s1, s2, s3;
62    uint32_t ones = 0xFFFFFFFF;
63    uint32_t hev1;
64
65    __asm__ __volatile__ (
66        /* mask |= (abs(p3 - p2) > limit) */
67        "subu_s.qb      %[c],   %[p3],     %[p2]        \n\t"
68        "subu_s.qb      %[r_k], %[p2],     %[p3]        \n\t"
69        "or             %[r_k], %[r_k],    %[c]         \n\t"
70        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
71        "or             %[r],   $0,        %[c]         \n\t"
72
73        /* mask |= (abs(p2 - p1) > limit) */
74        "subu_s.qb      %[c],   %[p2],     %[p1]        \n\t"
75        "subu_s.qb      %[r_k], %[p1],     %[p2]        \n\t"
76        "or             %[r_k], %[r_k],    %[c]         \n\t"
77        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
78        "or             %[r],   %[r],      %[c]         \n\t"
79
80        /* mask |= (abs(p1 - p0) > limit)
81         * hev  |= (abs(p1 - p0) > thresh)
82         */
83        "subu_s.qb      %[c],   %[p1],     %[p0]        \n\t"
84        "subu_s.qb      %[r_k], %[p0],     %[p1]        \n\t"
85        "or             %[r_k], %[r_k],    %[c]         \n\t"
86        "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
87        "or             %[r3],  $0,        %[c]         \n\t"
88        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
89        "or             %[r],   %[r],      %[c]         \n\t"
90
91        /* mask |= (abs(q1 - q0) > limit)
92         * hev  |= (abs(q1 - q0) > thresh)
93         */
94        "subu_s.qb      %[c],   %[q1],     %[q0]        \n\t"
95        "subu_s.qb      %[r_k], %[q0],     %[q1]        \n\t"
96        "or             %[r_k], %[r_k],    %[c]         \n\t"
97        "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
98        "or             %[r3],  %[r3],     %[c]         \n\t"
99        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
100        "or             %[r],   %[r],      %[c]         \n\t"
101
102        /* mask |= (abs(q2 - q1) > limit) */
103        "subu_s.qb      %[c],   %[q2],     %[q1]        \n\t"
104        "subu_s.qb      %[r_k], %[q1],     %[q2]        \n\t"
105        "or             %[r_k], %[r_k],    %[c]         \n\t"
106        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
107        "or             %[r],   %[r],      %[c]         \n\t"
108        "sll            %[r3],    %[r3],    24          \n\t"
109
110        /* mask |= (abs(q3 - q2) > limit) */
111        "subu_s.qb      %[c],   %[q3],     %[q2]        \n\t"
112        "subu_s.qb      %[r_k], %[q2],     %[q3]        \n\t"
113        "or             %[r_k], %[r_k],    %[c]         \n\t"
114        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
115        "or             %[r],   %[r],      %[c]         \n\t"
116
117        : [c] "=&r" (c), [r_k] "=&r" (r_k),
118          [r] "=&r" (r), [r3] "=&r" (r3)
119        : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
120          [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
121          [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
122    );
123
124    __asm__ __volatile__ (
125        /* abs(p0 - q0) */
126        "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
127        "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
128        "wrdsp          %[r3]                           \n\t"
129        "or             %[s1],  %[r_k],    %[c]         \n\t"
130
131        /* abs(p1 - q1) */
132        "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
133        "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
134        "pick.qb        %[hev1], %[ones],  $0           \n\t"
135        "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
136        "or             %[s2],   %[r_k],   %[c]         \n\t"
137
138        /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit */
139        "shrl.qb        %[s2],   %[s2],     1           \n\t"
140        "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
141        "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
142        "or             %[r],    %[r],      %[c]        \n\t"
143        "sll            %[r],    %[r],      24          \n\t"
144
145        "wrdsp          %[r]                            \n\t"
146        "pick.qb        %[s2],  $0,         %[ones]     \n\t"
147
148        : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
149          [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
150        : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
151          [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
152    );
153
154    *hev = hev1;
155    *mask = s2;
156}
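
/* For reference only: a minimal scalar sketch (hypothetical helper, not
 * used by this file) of the per-pixel test that the vector routine above
 * evaluates for four pixels at once.  The flimit argument is assumed to
 * already hold the combined "flimit * 2 + limit" threshold, as the asm
 * comments above note.
 */
static __inline void scalar_filter_mask_sketch
(
    uc limit,
    uc flimit,
    uc thresh,
    uc p3, uc p2, uc p1, uc p0,
    uc q0, uc q1, uc q2, uc q3,
    uc *hev,
    uc *mask
)
{
    int over = 0;

    over |= (abs(p3 - p2) > limit);
    over |= (abs(p2 - p1) > limit);
    over |= (abs(p1 - p0) > limit);
    over |= (abs(q1 - q0) > limit);
    over |= (abs(q2 - q1) > limit);
    over |= (abs(q3 - q2) > limit);
    over |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit);

    /* 0xFF selects a pixel for filtering, 0 leaves it untouched */
    *mask = over ? 0 : 0xFF;
    *hev  = ((abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)) ? 0xFF : 0;
}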
157
158
159/* inputs & outputs are quad-byte vectors */
160static __inline void vp8_filter_mips
161(
162    uint32_t mask,
163    uint32_t hev,
164    uint32_t *ps1,
165    uint32_t *ps0,
166    uint32_t *qs0,
167    uint32_t *qs1
168)
169{
170    int32_t vp8_filter_l, vp8_filter_r;
171    int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
172    int32_t subr_r, subr_l;
173    uint32_t t1, t2, HWM, t3;
174    uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
175
176    int32_t vps1, vps0, vqs0, vqs1;
177    int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
178    uint32_t N128;
179
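    /* constants for the halfword-pair arithmetic below: N128 biases each
     * byte into the signed-char domain (x ^ 0x80); t1, t2 and t3 are +3,
     * +4 and +1 placed in the high byte of each halfword; HWM masks those
     * high bytes.
     */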
180    N128 = 0x80808080;
181    t1  = 0x03000300;
182    t2  = 0x04000400;
183    t3  = 0x01000100;
184    HWM = 0xFF00FF00;
185
186    vps0 = (*ps0) ^ N128;
187    vps1 = (*ps1) ^ N128;
188    vqs0 = (*qs0) ^ N128;
189    vqs1 = (*qs1) ^ N128;
190
    /* use halfword pairs instead of quad-bytes for better accuracy */
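    /* e.g. the quad-byte 0xAABBCCDD is split into
     *   _l = 0xAA00CC00  (bytes 3 and 1, kept in place by HWM)
     *   _r = 0xBB00DD00  (bytes 2 and 0, shifted up by 8 and masked)
     * so each byte gets a full 16-bit lane for the saturating .ph ops.
     */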
192    vps0_l = vps0 & HWM;
193    vps0_r = vps0 << 8;
194    vps0_r = vps0_r & HWM;
195
196    vps1_l = vps1 & HWM;
197    vps1_r = vps1 << 8;
198    vps1_r = vps1_r & HWM;
199
200    vqs0_l = vqs0 & HWM;
201    vqs0_r = vqs0 << 8;
202    vqs0_r = vqs0_r & HWM;
203
204    vqs1_l = vqs1 & HWM;
205    vqs1_r = vqs1 << 8;
206    vqs1_r = vqs1_r & HWM;
207
208    mask_l = mask & HWM;
209    mask_r = mask << 8;
210    mask_r = mask_r & HWM;
211
212    hev_l = hev & HWM;
213    hev_r = hev << 8;
214    hev_r = hev_r & HWM;
215
216    __asm__ __volatile__ (
217        /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
218        "subq_s.ph    %[vp8_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
219        "subq_s.ph    %[vp8_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
220
221        /* qs0 - ps0 */
222        "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
223        "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
224
225        /* vp8_filter &= hev; */
226        "and          %[vp8_filter_l], %[vp8_filter_l], %[hev_l]        \n\t"
227        "and          %[vp8_filter_r], %[vp8_filter_r], %[hev_r]        \n\t"
228
229        /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
230        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
231        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
232        "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
233        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
234        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
235        "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
236        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
237        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
238
239        /* vp8_filter &= mask; */
240        "and          %[vp8_filter_l], %[vp8_filter_l], %[mask_l]       \n\t"
241        "and          %[vp8_filter_r], %[vp8_filter_r], %[mask_r]       \n\t"
242
243        : [vp8_filter_l] "=&r" (vp8_filter_l), [vp8_filter_r] "=&r" (vp8_filter_r),
244          [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
245          [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
246
247        : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
248          [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
249          [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
250          [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
251          [hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
252          [HWM] "r" (HWM)
253    );
254
255    /* save bottom 3 bits so that we round one side +4 and the other +3 */
256    __asm__ __volatile__ (
        /* Filter1 = vp8_signed_char_clamp(vp8_filter + 4) >>= 3; */
258        "addq_s.ph    %[Filter1_l],    %[vp8_filter_l], %[t2]           \n\t"
259        "addq_s.ph    %[Filter1_r],    %[vp8_filter_r], %[t2]           \n\t"
260
        /* Filter2 = vp8_signed_char_clamp(vp8_filter + 3) >>= 3; */
262        "addq_s.ph    %[Filter2_l],    %[vp8_filter_l], %[t1]           \n\t"
263        "addq_s.ph    %[Filter2_r],    %[vp8_filter_r], %[t1]           \n\t"
264        "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
265        "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
266
267        "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
268        "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
269
270        "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
271        "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
272
273        /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
274        "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
275        "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
276
277        /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
278        "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
279        "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
280
281        : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
282          [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
283          [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
284          [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
285
286        : [t1] "r" (t1), [t2] "r" (t2),
287          [vp8_filter_l] "r" (vp8_filter_l), [vp8_filter_r] "r" (vp8_filter_r),
288          [HWM] "r" (HWM)
289    );
290
291    __asm__ __volatile__ (
        /* vp8_filter = (Filter1 + 1) >> 1 (stored back into Filter1) */
293        "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
294        "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
295
296        /* vp8_filter &= ~hev; */
297        "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
298        "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
299
300        /* vps1 = vp8_signed_char_clamp(ps1 + vp8_filter); */
301        "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
302        "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
303
304        /* vqs1 = vp8_signed_char_clamp(qs1 - vp8_filter); */
305        "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
306        "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
307
308        : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
309          [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
310          [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
311
312        : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
313    );
314
315    /* Create quad-bytes from halfword pairs */
316    vqs0_l = vqs0_l & HWM;
317    vqs1_l = vqs1_l & HWM;
318    vps0_l = vps0_l & HWM;
319    vps1_l = vps1_l & HWM;
320
321    __asm__ __volatile__ (
322        "shrl.ph      %[vqs0_r],       %[vqs0_r],       8               \n\t"
323        "shrl.ph      %[vps0_r],       %[vps0_r],       8               \n\t"
324        "shrl.ph      %[vqs1_r],       %[vqs1_r],       8               \n\t"
325        "shrl.ph      %[vps1_r],       %[vps1_r],       8               \n\t"
326
327        : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
328          [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
329        :
330    );
331
332    vqs0 = vqs0_l | vqs0_r;
333    vqs1 = vqs1_l | vqs1_r;
334    vps0 = vps0_l | vps0_r;
335    vps1 = vps1_l | vps1_r;
336
337    *ps0 = vps0 ^ N128;
338    *ps1 = vps1 ^ N128;
339    *qs0 = vqs0 ^ N128;
340    *qs1 = vqs1 ^ N128;
341}
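
/* For reference only: a minimal scalar sketch (hypothetical helpers, not
 * used by this file) of the per-pixel arithmetic that vp8_filter_mips()
 * performs on halfword pairs.  Pixel values are assumed to be already
 * biased into the signed-char domain (pixel ^ 0x80); mask and hev are
 * 0 or -1 per pixel.
 */
static __inline signed char signed_char_clamp_sketch(int t)
{
    return (signed char)(t < -128 ? -128 : (t > 127 ? 127 : t));
}

static __inline void scalar_filter_sketch
(
    signed char mask,
    signed char hev,
    signed char *ps1,
    signed char *ps0,
    signed char *qs0,
    signed char *qs1
)
{
    signed char vp8_filter, Filter1, Filter2;

    vp8_filter  = signed_char_clamp_sketch(*ps1 - *qs1);
    vp8_filter &= hev;
    vp8_filter  = signed_char_clamp_sketch(vp8_filter + 3 * (*qs0 - *ps0));
    vp8_filter &= mask;

    /* round one side with +4 and the other with +3 */
    Filter1 = signed_char_clamp_sketch(vp8_filter + 4) >> 3;
    Filter2 = signed_char_clamp_sketch(vp8_filter + 3) >> 3;
    *qs0 = signed_char_clamp_sketch(*qs0 - Filter1);
    *ps0 = signed_char_clamp_sketch(*ps0 + Filter2);

    /* the outer taps get half the adjustment, and only where hev == 0 */
    Filter1 = (signed char)(((Filter1 + 1) >> 1) & ~hev);
    *qs1 = signed_char_clamp_sketch(*qs1 - Filter1);
    *ps1 = signed_char_clamp_sketch(*ps1 + Filter1);
}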
342
343void vp8_loop_filter_horizontal_edge_mips
344(
345    unsigned char *s,
346    int p,
347    unsigned int flimit,
348    unsigned int limit,
349    unsigned int thresh,
350    int count
351)
352{
353    uint32_t mask;
354    uint32_t hev;
355    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
356    unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
357
358    mask = 0;
359    hev = 0;
360    p1 = 0;
361    p2 = 0;
362    p3 = 0;
363    p4 = 0;
364
365    /* prefetch data for store */
366    prefetch_store_lf(s);
367
368    /* loop filter designed to work using chars so that we can make maximum use
369     * of 8 bit simd instructions.
370     */
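    /* the 16-pixel edge is processed below as four quad-byte (4-pixel)
     * groups; the count argument is not used here.
     */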
371
372    sm1 = s - (p << 2);
373    s0 = s - p - p - p;
    s1 = s - p - p;
375    s2 = s - p;
376    s3 = s;
377    s4 = s + p;
378    s5 = s + p + p;
379    s6 = s + p + p + p;
380
381    /* load quad-byte vectors
382     * memory is 4 byte aligned
383     */
384    p1 = *((uint32_t *)(s1));
385    p2 = *((uint32_t *)(s2));
386    p3 = *((uint32_t *)(s3));
387    p4 = *((uint32_t *)(s4));
388
389    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
390     * mask will be zero and filtering is not needed
391     */
392    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
393    {
394
395        pm1 = *((uint32_t *)(sm1));
396        p0  = *((uint32_t *)(s0));
397        p5  = *((uint32_t *)(s5));
398        p6  = *((uint32_t *)(s6));
399
400        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
401                                 thresh, &hev, &mask);
402
        /* if mask == 0, filtering is not needed */
404        if (mask)
405        {
406            /* filtering */
407            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
408
            /* write back the processed 4x4 neighborhood */
410            *((uint32_t *)s1) = p1;
411            *((uint32_t *)s2) = p2;
412            *((uint32_t *)s3) = p3;
413            *((uint32_t *)s4) = p4;
414        }
415    }
416
417    sm1 += 4;
418    s0  += 4;
419    s1  += 4;
420    s2  += 4;
421    s3  += 4;
422    s4  += 4;
423    s5  += 4;
424    s6  += 4;
425
426    /* load quad-byte vectors
427     * memory is 4 byte aligned
428     */
429    p1 = *((uint32_t *)(s1));
430    p2 = *((uint32_t *)(s2));
431    p3 = *((uint32_t *)(s3));
432    p4 = *((uint32_t *)(s4));
433
434    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
435     * mask will be zero and filtering is not needed
436     */
437    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
438    {
439
440        pm1 = *((uint32_t *)(sm1));
441        p0  = *((uint32_t *)(s0));
442        p5  = *((uint32_t *)(s5));
443        p6  = *((uint32_t *)(s6));
444
445        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
446                                 thresh, &hev, &mask);
447
        /* if mask == 0, filtering is not needed */
449        if (mask)
450        {
451            /* filtering */
452            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
453
            /* write back the processed 4x4 neighborhood */
455            *((uint32_t *)s1) = p1;
456            *((uint32_t *)s2) = p2;
457            *((uint32_t *)s3) = p3;
458            *((uint32_t *)s4) = p4;
459        }
460    }
461
462    sm1 += 4;
463    s0  += 4;
464    s1  += 4;
465    s2  += 4;
466    s3  += 4;
467    s4  += 4;
468    s5  += 4;
469    s6  += 4;
470
471    /* load quad-byte vectors
472     * memory is 4 byte aligned
473     */
474    p1 = *((uint32_t *)(s1));
475    p2 = *((uint32_t *)(s2));
476    p3 = *((uint32_t *)(s3));
477    p4 = *((uint32_t *)(s4));
478
479    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
480     * mask will be zero and filtering is not needed
481     */
482    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
483    {
484
485        pm1 = *((uint32_t *)(sm1));
486        p0  = *((uint32_t *)(s0));
487        p5  = *((uint32_t *)(s5));
488        p6  = *((uint32_t *)(s6));
489
490        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
491                                 thresh, &hev, &mask);
492
        /* if mask == 0, filtering is not needed */
494        if (mask)
495        {
496            /* filtering */
497            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
498
            /* write back the processed 4x4 neighborhood */
500            *((uint32_t *)s1) = p1;
501            *((uint32_t *)s2) = p2;
502            *((uint32_t *)s3) = p3;
503            *((uint32_t *)s4) = p4;
504        }
505    }
506
507    sm1 += 4;
508    s0  += 4;
509    s1  += 4;
510    s2  += 4;
511    s3  += 4;
512    s4  += 4;
513    s5  += 4;
514    s6  += 4;
515
516    /* load quad-byte vectors
517     * memory is 4 byte aligned
518     */
519    p1 = *((uint32_t *)(s1));
520    p2 = *((uint32_t *)(s2));
521    p3 = *((uint32_t *)(s3));
522    p4 = *((uint32_t *)(s4));
523
524    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
525     * mask will be zero and filtering is not needed
526     */
527    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
528    {
529
530        pm1 = *((uint32_t *)(sm1));
531        p0  = *((uint32_t *)(s0));
532        p5  = *((uint32_t *)(s5));
533        p6  = *((uint32_t *)(s6));
534
535        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
536                                 thresh, &hev, &mask);
537
        /* if mask == 0, filtering is not needed */
539        if (mask)
540        {
541            /* filtering */
542            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
543
            /* write back the processed 4x4 neighborhood */
545            *((uint32_t *)s1) = p1;
546            *((uint32_t *)s2) = p2;
547            *((uint32_t *)s3) = p3;
548            *((uint32_t *)s4) = p4;
549        }
550    }
551}
552
553void vp8_loop_filter_uvhorizontal_edge_mips
554(
555    unsigned char *s,
556    int p,
557    unsigned int flimit,
558    unsigned int limit,
559    unsigned int thresh,
560    int count
561)
562{
563    uint32_t mask;
564    uint32_t hev;
565    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
566    unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
567
568    mask = 0;
569    hev = 0;
570    p1 = 0;
571    p2 = 0;
572    p3 = 0;
573    p4 = 0;
574
575    /* loop filter designed to work using chars so that we can make maximum use
576     * of 8 bit simd instructions.
577     */
578
579    sm1 = s - (p << 2);
580    s0  = s - p - p - p;
    s1  = s - p - p;
582    s2  = s - p;
583    s3  = s;
584    s4  = s + p;
585    s5  = s + p + p;
586    s6  = s + p + p + p;
587
588    /* load quad-byte vectors
589     * memory is 4 byte aligned
590     */
591    p1 = *((uint32_t *)(s1));
592    p2 = *((uint32_t *)(s2));
593    p3 = *((uint32_t *)(s3));
594    p4 = *((uint32_t *)(s4));
595
596    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
597     * mask will be zero and filtering is not needed
598     */
599    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
600    {
601
602        pm1 = *((uint32_t *)(sm1));
603        p0  = *((uint32_t *)(s0));
604        p5  = *((uint32_t *)(s5));
605        p6  = *((uint32_t *)(s6));
606
607        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
608                                 thresh, &hev, &mask);
609
        /* if mask == 0, filtering is not needed */
611        if (mask)
612        {
613            /* filtering */
614            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
615
            /* write back the processed 4x4 neighborhood */
617            *((uint32_t *)s1) = p1;
618            *((uint32_t *)s2) = p2;
619            *((uint32_t *)s3) = p3;
620            *((uint32_t *)s4) = p4;
621        }
622    }
623
624    sm1 += 4;
625    s0  += 4;
626    s1  += 4;
627    s2  += 4;
628    s3  += 4;
629    s4  += 4;
630    s5  += 4;
631    s6  += 4;
632
633    /* load quad-byte vectors
634     * memory is 4 byte aligned
635     */
636    p1 = *((uint32_t *)(s1));
637    p2 = *((uint32_t *)(s2));
638    p3 = *((uint32_t *)(s3));
639    p4 = *((uint32_t *)(s4));
640
641    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
642     * mask will be zero and filtering is not needed
643     */
644    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
645    {
646
647        pm1 = *((uint32_t *)(sm1));
648        p0  = *((uint32_t *)(s0));
649        p5  = *((uint32_t *)(s5));
650        p6  = *((uint32_t *)(s6));
651
652        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
653                                 thresh, &hev, &mask);
654
        /* if mask == 0, filtering is not needed */
656        if (mask)
657        {
658            /* filtering */
659            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
660
            /* write back the processed 4x4 neighborhood */
662            *((uint32_t *)s1) = p1;
663            *((uint32_t *)s2) = p2;
664            *((uint32_t *)s3) = p3;
665            *((uint32_t *)s4) = p4;
666        }
667    }
668}
669
670void vp8_loop_filter_vertical_edge_mips
671(
672    unsigned char *s,
673    int p,
674    const unsigned int flimit,
675    const unsigned int limit,
676    const unsigned int thresh,
677    int count
678)
679{
680    int i;
681    uint32_t mask, hev;
682    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
683    unsigned char *s1, *s2, *s3, *s4;
684    uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
685
686    hev = 0;
687    mask = 0;
688    i = 0;
689    pm1 = 0;
690    p0 = 0;
691    p1 = 0;
692    p2 = 0;
693    p3 = 0;
694    p4 = 0;
695    p5 = 0;
696    p6 = 0;
697
698    /* loop filter designed to work using chars so that we can make maximum use
699     * of 8 bit simd instructions.
700     */
701
    /* apply the filter to 4 pixels at a time */
703    do
704    {
705
706        /* prefetch data for store */
707        prefetch_store_lf(s + p);
708
709        s1 = s;
710        s2 = s + p;
711        s3 = s2 + p;
712        s4 = s3 + p;
713        s  = s4 + p;
714
715        /* load quad-byte vectors
716         * memory is 4 byte aligned
717         */
718        p2  = *((uint32_t *)(s1 - 4));
719        p6  = *((uint32_t *)(s1));
720        p1  = *((uint32_t *)(s2 - 4));
721        p5  = *((uint32_t *)(s2));
722        p0  = *((uint32_t *)(s3 - 4));
723        p4  = *((uint32_t *)(s3));
724        pm1 = *((uint32_t *)(s4 - 4));
725        p3  = *((uint32_t *)(s4));
726
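        /* the two transposes below rotate the loaded 4x8 block so that each
         * of pm1 .. p6 ends up holding one column (the same horizontal pixel
         * position for four consecutive rows); the vertical edge can then be
         * filtered with the same quad-byte code used for horizontal edges.
         */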
727        /* transpose pm1, p0, p1, p2 */
728        __asm__ __volatile__ (
729            "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
730            "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
731            "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
732            "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
733
734            "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
735            "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
736            "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
737            "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
738
739            "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
740            "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
741            "append         %[p1],      %[sec3],    16          \n\t"
742            "append         %[pm1],     %[sec4],    16          \n\t"
743
744            : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
745              [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
746              [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
747              [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
748            :
749        );
750
751        /* transpose p3, p4, p5, p6 */
752        __asm__ __volatile__ (
753            "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
754            "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
755            "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
756            "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
757
758            "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
759            "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
760            "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
761            "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
762
763            "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
764            "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
765            "append         %[p5],      %[sec3],    16          \n\t"
766            "append         %[p3],      %[sec4],    16          \n\t"
767
768            : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
769              [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
770              [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
771              [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
772            :
773        );
774
775        /* if (p1 - p4 == 0) and (p2 - p3 == 0)
776         * mask will be zero and filtering is not needed
777         */
778        if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
779        {
780
781            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
782                                     thresh, &hev, &mask);
783
            /* if mask == 0, filtering is not needed */
785            if (mask)
786            {
787                /* filtering */
788                vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
789
790                /* unpack processed 4x4 neighborhood
791                 * don't use transpose on output data
792                 * because memory isn't aligned
793                 */
794                __asm__ __volatile__ (
795                    "sb         %[p4],  1(%[s4])    \n\t"
796                    "sb         %[p3],  0(%[s4])    \n\t"
797                    "sb         %[p2], -1(%[s4])    \n\t"
798                    "sb         %[p1], -2(%[s4])    \n\t"
799                    :
800                    : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
801                      [p2] "r" (p2), [p1] "r" (p1)
802                );
803
804                __asm__ __volatile__ (
805                    "srl        %[p4], %[p4], 8     \n\t"
806                    "srl        %[p3], %[p3], 8     \n\t"
807                    "srl        %[p2], %[p2], 8     \n\t"
808                    "srl        %[p1], %[p1], 8     \n\t"
809                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
810                    :
811                );
812
813                __asm__ __volatile__ (
814                    "sb         %[p4],  1(%[s3])    \n\t"
815                    "sb         %[p3],  0(%[s3])    \n\t"
816                    "sb         %[p2], -1(%[s3])    \n\t"
817                    "sb         %[p1], -2(%[s3])    \n\t"
818                    : [p1] "+r" (p1)
819                    : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
820                );
821
822                __asm__ __volatile__ (
823                    "srl        %[p4], %[p4], 8     \n\t"
824                    "srl        %[p3], %[p3], 8     \n\t"
825                    "srl        %[p2], %[p2], 8     \n\t"
826                    "srl        %[p1], %[p1], 8     \n\t"
827                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
828                    :
829                );
830
831                __asm__ __volatile__ (
832                    "sb         %[p4],  1(%[s2])    \n\t"
833                    "sb         %[p3],  0(%[s2])    \n\t"
834                    "sb         %[p2], -1(%[s2])    \n\t"
835                    "sb         %[p1], -2(%[s2])    \n\t"
836                    :
837                    : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
838                      [p2] "r" (p2), [p1] "r" (p1)
839                );
840
841                __asm__ __volatile__ (
842                    "srl        %[p4], %[p4], 8     \n\t"
843                    "srl        %[p3], %[p3], 8     \n\t"
844                    "srl        %[p2], %[p2], 8     \n\t"
845                    "srl        %[p1], %[p1], 8     \n\t"
846                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
847                    :
848                );
849
850                __asm__ __volatile__ (
851                    "sb         %[p4],  1(%[s1])    \n\t"
852                    "sb         %[p3],  0(%[s1])    \n\t"
853                    "sb         %[p2], -1(%[s1])    \n\t"
854                    "sb         %[p1], -2(%[s1])    \n\t"
855                    :
856                    : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
857                      [p2] "r" (p2), [p1] "r" (p1)
858                );
859            }
860        }
861
862        s1 = s;
863        s2 = s + p;
864        s3 = s2 + p;
865        s4 = s3 + p;
866        s  = s4 + p;
867
868        /* load quad-byte vectors
869         * memory is 4 byte aligned
870         */
871        p2  = *((uint32_t *)(s1 - 4));
872        p6  = *((uint32_t *)(s1));
873        p1  = *((uint32_t *)(s2 - 4));
874        p5  = *((uint32_t *)(s2));
875        p0  = *((uint32_t *)(s3 - 4));
876        p4  = *((uint32_t *)(s3));
877        pm1 = *((uint32_t *)(s4 - 4));
878        p3  = *((uint32_t *)(s4));
879
880        /* transpose pm1, p0, p1, p2 */
881        __asm__ __volatile__ (
882            "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
883            "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
884            "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
885            "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
886
887            "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
888            "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
889            "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
890            "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
891
892            "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
893            "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
894            "append         %[p1],      %[sec3],    16          \n\t"
895            "append         %[pm1],     %[sec4],    16          \n\t"
896
897            : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
898              [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
899              [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
900              [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
901            :
902        );
903
904        /* transpose p3, p4, p5, p6 */
905        __asm__ __volatile__ (
906            "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
907            "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
908            "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
909            "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
910
911            "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
912            "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
913            "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
914            "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
915
916            "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
917            "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
918            "append         %[p5],      %[sec3],    16          \n\t"
919            "append         %[p3],      %[sec4],    16          \n\t"
920
921            : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
922              [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
923              [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
924              [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
925            :
926        );
927
928        /* if (p1 - p4 == 0) and (p2 - p3 == 0)
929         * mask will be zero and filtering is not needed
930         */
931        if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
932        {
933
934            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
935                                     thresh, &hev, &mask);
936
            /* if mask == 0, filtering is not needed */
938            if (mask)
939            {
940                /* filtering */
941                vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
942
943                /* unpack processed 4x4 neighborhood
944                 * don't use transpose on output data
945                 * because memory isn't aligned
946                 */
947                __asm__ __volatile__ (
948                    "sb         %[p4],  1(%[s4])    \n\t"
949                    "sb         %[p3],  0(%[s4])    \n\t"
950                    "sb         %[p2], -1(%[s4])    \n\t"
951                    "sb         %[p1], -2(%[s4])    \n\t"
952                    :
953                    : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
954                      [p2] "r" (p2), [p1] "r" (p1)
955                );
956
957                __asm__ __volatile__ (
958                    "srl        %[p4], %[p4], 8     \n\t"
959                    "srl        %[p3], %[p3], 8     \n\t"
960                    "srl        %[p2], %[p2], 8     \n\t"
961                    "srl        %[p1], %[p1], 8     \n\t"
962                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
963                    :
964                );
965
966                __asm__ __volatile__ (
967                    "sb         %[p4],  1(%[s3])    \n\t"
968                    "sb         %[p3],  0(%[s3])    \n\t"
969                    "sb         %[p2], -1(%[s3])    \n\t"
970                    "sb         %[p1], -2(%[s3])    \n\t"
971                    : [p1] "+r" (p1)
972                    : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
973                );
974
975                __asm__ __volatile__ (
976                    "srl        %[p4], %[p4], 8     \n\t"
977                    "srl        %[p3], %[p3], 8     \n\t"
978                    "srl        %[p2], %[p2], 8     \n\t"
979                    "srl        %[p1], %[p1], 8     \n\t"
980                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
981                    :
982                );
983
984                __asm__ __volatile__ (
985                    "sb         %[p4],  1(%[s2])    \n\t"
986                    "sb         %[p3],  0(%[s2])    \n\t"
987                    "sb         %[p2], -1(%[s2])    \n\t"
988                    "sb         %[p1], -2(%[s2])    \n\t"
989                    :
990                    : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
991                      [p2] "r" (p2), [p1] "r" (p1)
992                );
993
994                __asm__ __volatile__ (
995                    "srl        %[p4], %[p4], 8     \n\t"
996                    "srl        %[p3], %[p3], 8     \n\t"
997                    "srl        %[p2], %[p2], 8     \n\t"
998                    "srl        %[p1], %[p1], 8     \n\t"
999                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
1000                    :
1001                );
1002
1003                __asm__ __volatile__ (
1004                    "sb         %[p4],  1(%[s1])    \n\t"
1005                    "sb         %[p3],  0(%[s1])    \n\t"
1006                    "sb         %[p2], -1(%[s1])    \n\t"
1007                    "sb         %[p1], -2(%[s1])    \n\t"
1008                    :
1009                    : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
1010                      [p2] "r" (p2), [p1] "r" (p1)
1011                );
1012            }
1013        }
1014
1015        i += 8;
    } while (i < count);
1019}
1020
1021void vp8_loop_filter_uvvertical_edge_mips
1022(
1023    unsigned char *s,
1024    int p,
1025    unsigned int flimit,
1026    unsigned int limit,
1027    unsigned int thresh,
1028    int count
1029)
1030{
1031    uint32_t mask, hev;
1032    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
1033    unsigned char *s1, *s2, *s3, *s4;
1034    uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
1035
1036    /* loop filter designed to work using chars so that we can make maximum use
1037     * of 8 bit simd instructions.
1038     */
1039
    /* apply the filter to 4 pixels at a time */
1041
1042    s1 = s;
1043    s2 = s + p;
1044    s3 = s2 + p;
1045    s4 = s3 + p;
1046
    /* load quad-byte vectors
     * memory is 4 byte aligned
     */
1050    p2  = *((uint32_t *)(s1 - 4));
1051    p6  = *((uint32_t *)(s1));
1052    p1  = *((uint32_t *)(s2 - 4));
1053    p5  = *((uint32_t *)(s2));
1054    p0  = *((uint32_t *)(s3 - 4));
1055    p4  = *((uint32_t *)(s3));
1056    pm1 = *((uint32_t *)(s4 - 4));
1057    p3  = *((uint32_t *)(s4));
1058
1059    /* transpose pm1, p0, p1, p2 */
1060    __asm__ __volatile__ (
1061        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
1062        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
1063        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
1064        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
1065
1066        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
1067        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
1068        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
1069        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
1070
1071        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
1072        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
1073        "append         %[p1],      %[sec3],    16          \n\t"
1074        "append         %[pm1],     %[sec4],    16          \n\t"
1075
1076        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
1077          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
1078          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
1079          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
1080        :
1081    );
1082
1083    /* transpose p3, p4, p5, p6 */
1084    __asm__ __volatile__ (
1085        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
1086        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
1087        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
1088        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
1089
1090        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
1091        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
1092        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
1093        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
1094
1095        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
1096        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
1097        "append         %[p5],      %[sec3],    16          \n\t"
1098        "append         %[p3],      %[sec4],    16          \n\t"
1099
1100        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
1101          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
1102          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
1103          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
1104        :
1105    );
1106
    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     * mask will be zero and filtering is not needed
     */
1110    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
1111    {
1112
1113        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1114                                 thresh, &hev, &mask);
1115
        /* if mask == 0, filtering is not needed */
1117        if (mask)
1118        {
1119            /* filtering */
1120            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
1121
1122            /* unpack processed 4x4 neighborhood
1123             * don't use transpose on output data
1124             * because memory isn't aligned
1125             */
1126            __asm__ __volatile__ (
1127                "sb         %[p4],  1(%[s4])    \n\t"
1128                "sb         %[p3],  0(%[s4])    \n\t"
1129                "sb         %[p2], -1(%[s4])    \n\t"
1130                "sb         %[p1], -2(%[s4])    \n\t"
1131                :
1132                : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
1133                  [p2] "r" (p2), [p1] "r" (p1)
1134            );
1135
1136            __asm__ __volatile__ (
1137                "srl        %[p4], %[p4], 8     \n\t"
1138                "srl        %[p3], %[p3], 8     \n\t"
1139                "srl        %[p2], %[p2], 8     \n\t"
1140                "srl        %[p1], %[p1], 8     \n\t"
1141                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
1142                :
1143            );
1144
1145            __asm__ __volatile__ (
1146                "sb         %[p4],  1(%[s3])    \n\t"
1147                "sb         %[p3],  0(%[s3])    \n\t"
1148                "sb         %[p2], -1(%[s3])    \n\t"
1149                "sb         %[p1], -2(%[s3])    \n\t"
1150                : [p1] "+r" (p1)
1151                : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
1152            );
1153
1154            __asm__ __volatile__ (
1155                "srl        %[p4], %[p4], 8     \n\t"
1156                "srl        %[p3], %[p3], 8     \n\t"
1157                "srl        %[p2], %[p2], 8     \n\t"
1158                "srl        %[p1], %[p1], 8     \n\t"
1159                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
1160                :
1161            );
1162
1163            __asm__ __volatile__ (
1164                "sb         %[p4],  1(%[s2])    \n\t"
1165                "sb         %[p3],  0(%[s2])    \n\t"
1166                "sb         %[p2], -1(%[s2])    \n\t"
1167                "sb         %[p1], -2(%[s2])    \n\t"
1168                :
1169                : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
1170                  [p2] "r" (p2), [p1] "r" (p1)
1171            );
1172
1173            __asm__ __volatile__ (
1174                "srl        %[p4], %[p4], 8     \n\t"
1175                "srl        %[p3], %[p3], 8     \n\t"
1176                "srl        %[p2], %[p2], 8     \n\t"
1177                "srl        %[p1], %[p1], 8     \n\t"
1178                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
1179                :
1180            );
1181
1182            __asm__ __volatile__ (
1183                "sb         %[p4],  1(%[s1])    \n\t"
1184                "sb         %[p3],  0(%[s1])    \n\t"
1185                "sb         %[p2], -1(%[s1])    \n\t"
1186                "sb         %[p1], -2(%[s1])    \n\t"
1187                :
1188                : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1), [p2] "r" (p2), [p1] "r" (p1)
1189            );
1190        }
1191    }
1192
1193    s1 = s4 + p;
1194    s2 = s1 + p;
1195    s3 = s2 + p;
1196    s4 = s3 + p;
1197
1198    /* load quad-byte vectors
1199     * memory is 4 byte aligned
1200     */
1201    p2  = *((uint32_t *)(s1 - 4));
1202    p6  = *((uint32_t *)(s1));
1203    p1  = *((uint32_t *)(s2 - 4));
1204    p5  = *((uint32_t *)(s2));
1205    p0  = *((uint32_t *)(s3 - 4));
1206    p4  = *((uint32_t *)(s3));
1207    pm1 = *((uint32_t *)(s4 - 4));
1208    p3  = *((uint32_t *)(s4));
1209
1210    /* transpose pm1, p0, p1, p2 */
1211    __asm__ __volatile__ (
1212        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
1213        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
1214        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
1215        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
1216
1217        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
1218        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
1219        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
1220        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
1221
1222        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
1223        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
1224        "append         %[p1],      %[sec3],    16          \n\t"
1225        "append         %[pm1],     %[sec4],    16          \n\t"
1226
1227        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
1228          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
1229          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
1230          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
1231        :
1232    );
1233
1234    /* transpose p3, p4, p5, p6 */
1235    __asm__ __volatile__ (
1236        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
1237        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
1238        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
1239        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
1240
1241        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
1242        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
1243        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
1244        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
1245
1246        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
1247        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
1248        "append         %[p5],      %[sec3],    16          \n\t"
1249        "append         %[p3],      %[sec4],    16          \n\t"
1250
1251        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
1252          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
1253          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
1254          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
1255        :
1256    );
1257
1258    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1259     * mask will be zero and filtering is not needed
1260     */
1261    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
1262    {
1263
1264        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1265                                 thresh, &hev, &mask);
1266
        /* if mask == 0, filtering is not needed */
1268        if (mask)
1269        {
1270            /* filtering */
1271            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
1272
1273            /* unpack processed 4x4 neighborhood
1274             * don't use transpose on output data
1275             * because memory isn't aligned
1276             */
1277            __asm__ __volatile__ (
1278                "sb         %[p4],  1(%[s4])    \n\t"
1279                "sb         %[p3],  0(%[s4])    \n\t"
1280                "sb         %[p2], -1(%[s4])    \n\t"
1281                "sb         %[p1], -2(%[s4])    \n\t"
1282                :
1283                : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
1284                  [p2] "r" (p2), [p1] "r" (p1)
1285            );
1286
1287            __asm__ __volatile__ (
1288                "srl        %[p4], %[p4], 8     \n\t"
1289                "srl        %[p3], %[p3], 8     \n\t"
1290                "srl        %[p2], %[p2], 8     \n\t"
1291                "srl        %[p1], %[p1], 8     \n\t"
1292                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
1293                :
1294            );
1295
1296            __asm__ __volatile__ (
1297                "sb         %[p4],  1(%[s3])    \n\t"
1298                "sb         %[p3],  0(%[s3])    \n\t"
1299                "sb         %[p2], -1(%[s3])    \n\t"
1300                "sb         %[p1], -2(%[s3])    \n\t"
1301                : [p1] "+r" (p1)
1302                : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
1303            );
1304
1305            __asm__ __volatile__ (
1306                "srl        %[p4], %[p4], 8     \n\t"
1307                "srl        %[p3], %[p3], 8     \n\t"
1308                "srl        %[p2], %[p2], 8     \n\t"
1309                "srl        %[p1], %[p1], 8     \n\t"
1310                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
1311                :
1312            );
1313
1314            __asm__ __volatile__ (
1315                "sb         %[p4],  1(%[s2])    \n\t"
1316                "sb         %[p3],  0(%[s2])    \n\t"
1317                "sb         %[p2], -1(%[s2])    \n\t"
1318                "sb         %[p1], -2(%[s2])    \n\t"
1319                :
1320                : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
1321                  [p2] "r" (p2), [p1] "r" (p1)
1322            );
1323
1324            __asm__ __volatile__ (
1325                "srl        %[p4], %[p4], 8     \n\t"
1326                "srl        %[p3], %[p3], 8     \n\t"
1327                "srl        %[p2], %[p2], 8     \n\t"
1328                "srl        %[p1], %[p1], 8     \n\t"
1329                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
1330                :
1331            );
1332
1333            __asm__ __volatile__ (
1334                "sb         %[p4],  1(%[s1])    \n\t"
1335                "sb         %[p3],  0(%[s1])    \n\t"
1336                "sb         %[p2], -1(%[s1])    \n\t"
1337                "sb         %[p1], -2(%[s1])    \n\t"
1338                :
1339                : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
1340                  [p2] "r" (p2), [p1] "r" (p1)
1341            );
1342        }
1343    }
1344}
1345
1346/* inputs & outputs are quad-byte vectors */
1347static __inline void vp8_mbfilter_mips
1348(
1349    uint32_t mask,
1350    uint32_t hev,
1351    uint32_t *ps2,
1352    uint32_t *ps1,
1353    uint32_t *ps0,
1354    uint32_t *qs0,
1355    uint32_t *qs1,
1356    uint32_t *qs2
1357)
1358{
1359    int32_t vps2, vps1, vps0, vqs0, vqs1, vqs2;
1360    int32_t vps2_l, vps1_l, vps0_l, vqs0_l, vqs1_l, vqs2_l;
1361    int32_t vps2_r, vps1_r, vps0_r, vqs0_r, vqs1_r, vqs2_r;
1362    uint32_t HWM, vp8_filter_l, vp8_filter_r, mask_l, mask_r, hev_l, hev_r, subr_r, subr_l;
1363    uint32_t Filter2_l, Filter2_r, t1, t2, Filter1_l, Filter1_r, invhev_l, invhev_r;
1364    uint32_t N128, R63;
1365    uint32_t u1_l, u1_r, u2_l, u2_r, u3_l, u3_r;
1366
1367    R63  = 0x003F003F;
1368    HWM  = 0xFF00FF00;
1369    N128 = 0x80808080;
1370    t1   = 0x03000300;
1371    t2   = 0x04000400;
1372
1373    vps0 = (*ps0) ^ N128;
1374    vps1 = (*ps1) ^ N128;
1375    vps2 = (*ps2) ^ N128;
1376    vqs0 = (*qs0) ^ N128;
1377    vqs1 = (*qs1) ^ N128;
1378    vqs2 = (*qs2) ^ N128;
1379
    /* use halfword pairs instead of quad-bytes for better accuracy */
1381    vps0_l = vps0 & HWM;
1382    vps0_r = vps0 << 8;
1383    vps0_r = vps0_r & HWM;
1384
1385    vqs0_l = vqs0 & HWM;
1386    vqs0_r = vqs0 << 8;
1387    vqs0_r = vqs0_r & HWM;
1388
1389    vps1_l = vps1 & HWM;
1390    vps1_r = vps1 << 8;
1391    vps1_r = vps1_r & HWM;
1392
1393    vqs1_l = vqs1 & HWM;
1394    vqs1_r = vqs1 << 8;
1395    vqs1_r = vqs1_r & HWM;
1396
1397    vqs2_l = vqs2 & HWM;
1398    vqs2_r = vqs2 << 8;
1399    vqs2_r = vqs2_r & HWM;
1400
1401    __asm__ __volatile__ (
1402        /* qs0 - ps0 */
1403        "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
1404        "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
1405
1406        /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
1407        "subq_s.ph    %[vp8_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
1408        "subq_s.ph    %[vp8_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
1409
1410        : [vp8_filter_l] "=&r" (vp8_filter_l), [vp8_filter_r] "=r" (vp8_filter_r),
1411          [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r)
1412        : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
1413          [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
1414          [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r)
1415    );
1416
1417    vps2_l = vps2 & HWM;
1418    vps2_r = vps2 << 8;
1419    vps2_r = vps2_r & HWM;
1420
1421    /* add outer taps if we have high edge variance */
1422    __asm__ __volatile__ (
1423        /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
1424        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
1425        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
1426        "and          %[mask_l],       %[HWM],          %[mask]         \n\t"
1427        "sll          %[mask_r],       %[mask],         8               \n\t"
1428        "and          %[mask_r],       %[HWM],          %[mask_r]       \n\t"
1429        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
1430        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
1431        "and          %[hev_l],        %[HWM],          %[hev]          \n\t"
1432        "sll          %[hev_r],        %[hev],          8               \n\t"
1433        "and          %[hev_r],        %[HWM],          %[hev_r]        \n\t"
1434        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
1435        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
1436
1437        /* vp8_filter &= mask; */
1438        "and          %[vp8_filter_l], %[vp8_filter_l], %[mask_l]       \n\t"
1439        "and          %[vp8_filter_r], %[vp8_filter_r], %[mask_r]       \n\t"
1440
1441        /* Filter2 = vp8_filter & hev; */
1442        "and          %[Filter2_l],    %[vp8_filter_l], %[hev_l]        \n\t"
1443        "and          %[Filter2_r],    %[vp8_filter_r], %[hev_r]        \n\t"
1444
1445        : [vp8_filter_l] "+r" (vp8_filter_l), [vp8_filter_r] "+r" (vp8_filter_r),
1446          [hev_l] "=&r" (hev_l), [hev_r] "=&r" (hev_r),
1447          [mask_l] "=&r" (mask_l), [mask_r] "=&r" (mask_r),
1448          [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r)
1449        : [subr_l] "r" (subr_l), [subr_r] "r" (subr_r),
1450          [HWM] "r" (HWM), [hev]  "r" (hev), [mask] "r" (mask)
1451    );
1452
1453    /* save bottom 3 bits so that we round one side +4 and the other +3 */
1454    __asm__ __volatile__ (
1455        /* Filter1 = vp8_signed_char_clamp(Filter2 + 4) >>= 3; */
1456        "addq_s.ph    %[Filter1_l],    %[Filter2_l],    %[t2]           \n\t"
1457        "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
1458        "addq_s.ph    %[Filter1_r],    %[Filter2_r],    %[t2]           \n\t"
1459
1460        /* Filter2 = vp8_signed_char_clamp(Filter2 + 3) >>= 3; */
1461        "addq_s.ph    %[Filter2_l],    %[Filter2_l],    %[t1]           \n\t"
1462        "addq_s.ph    %[Filter2_r],    %[Filter2_r],    %[t1]           \n\t"
1463
1464        "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
1465        "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
1466
1467        "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
1468        "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
1469        "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
1470        "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
1471        "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
1472
1473        /* qs0 = vp8_signed_char_clamp(qs0 - Filter1); */
1474        "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
1475        "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
1476
1477        /* ps0 = vp8_signed_char_clamp(ps0 + Filter2); */
1478        "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
1479        "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
1480
1481        : [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r),
1482          [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
1483          [Filter2_l] "+r" (Filter2_l), [Filter2_r] "+r" (Filter2_r),
1484          [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
1485          [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
1486        : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
1487          [hev_l] "r" (hev_l), [hev_r] "r" (hev_r)
1488    );
1489
1490    /* only apply wider filter if not high edge variance */
1491    __asm__ __volatile__ (
1492        /* vp8_filter &= ~hev; */
1493        "and          %[Filter2_l],    %[vp8_filter_l], %[invhev_l]     \n\t"
1494        "and          %[Filter2_r],    %[vp8_filter_r], %[invhev_r]     \n\t"
1495
1496        "shra.ph      %[Filter2_l],    %[Filter2_l],    8               \n\t"
1497        "shra.ph      %[Filter2_r],    %[Filter2_r],    8               \n\t"
1498
1499        : [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r)
1500        : [vp8_filter_l] "r" (vp8_filter_l), [vp8_filter_r] "r" (vp8_filter_r),
1501          [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
1502    );
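    /* The shra.ph by 8 above moves each masked filter value down from the
     * high byte to a plain signed halfword, so the 9/18/27 tap products
     * computed next stay within 16 bits before being shifted back up.
     */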
1503
1504    /* roughly 3/7th difference across boundary */
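    /* Scalar sketch of the three taps built below, per lane (R63 supplies the
     * +63 rounding term in each halfword):
     *   u1 = (27 * Filter2 + 63) >> 7;   applied to ps0/qs0
     *   u2 = (18 * Filter2 + 63) >> 7;   applied to ps1/qs1
     *   u3 = ( 9 * Filter2 + 63) >> 7;   applied to ps2/qs2
     * 9*Filter2 is formed as (Filter2 << 3) + Filter2, doubled to get 18 and
     * summed with it for 27; the trailing << 8 puts each result back into the
     * high-byte lane position (u3 is shifted in a later block).
     */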
1505    __asm__ __volatile__ (
1506        "shll.ph      %[u3_l],         %[Filter2_l],    3               \n\t"
1507        "shll.ph      %[u3_r],         %[Filter2_r],    3               \n\t"
1508
1509        "addq.ph      %[u3_l],         %[u3_l],         %[Filter2_l]    \n\t"
1510        "addq.ph      %[u3_r],         %[u3_r],         %[Filter2_r]    \n\t"
1511
1512        "shll.ph      %[u2_l],         %[u3_l],         1               \n\t"
1513        "shll.ph      %[u2_r],         %[u3_r],         1               \n\t"
1514
1515        "addq.ph      %[u1_l],         %[u3_l],         %[u2_l]         \n\t"
1516        "addq.ph      %[u1_r],         %[u3_r],         %[u2_r]         \n\t"
1517
1518        "addq.ph      %[u2_l],         %[u2_l],         %[R63]          \n\t"
1519        "addq.ph      %[u2_r],         %[u2_r],         %[R63]          \n\t"
1520
1521        "addq.ph      %[u3_l],         %[u3_l],         %[R63]          \n\t"
1522        "addq.ph      %[u3_r],         %[u3_r],         %[R63]          \n\t"
1523
1524        /* vp8_signed_char_clamp((63 + Filter2 * 27) >> 7)
1525         * vp8_signed_char_clamp((63 + Filter2 * 18) >> 7)
1526         */
1527        "addq.ph      %[u1_l],         %[u1_l],         %[R63]          \n\t"
1528        "addq.ph      %[u1_r],         %[u1_r],         %[R63]          \n\t"
1529        "shra.ph      %[u1_l],         %[u1_l],         7               \n\t"
1530        "shra.ph      %[u1_r],         %[u1_r],         7               \n\t"
1531        "shra.ph      %[u2_l],         %[u2_l],         7               \n\t"
1532        "shra.ph      %[u2_r],         %[u2_r],         7               \n\t"
1533        "shll.ph      %[u1_l],         %[u1_l],         8               \n\t"
1534        "shll.ph      %[u1_r],         %[u1_r],         8               \n\t"
1535        "shll.ph      %[u2_l],         %[u2_l],         8               \n\t"
1536        "shll.ph      %[u2_r],         %[u2_r],         8               \n\t"
1537
1538        /* vqs0 = vp8_signed_char_clamp(qs0 - u); */
1539        "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[u1_l]         \n\t"
1540        "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[u1_r]         \n\t"
1541
1542        /* vps0 = vp8_signed_char_clamp(ps0 + u); */
1543        "addq_s.ph    %[vps0_l],       %[vps0_l],       %[u1_l]         \n\t"
1544        "addq_s.ph    %[vps0_r],       %[vps0_r],       %[u1_r]         \n\t"
1545
1546        : [u1_l] "=&r" (u1_l), [u1_r] "=&r" (u1_r), [u2_l] "=&r" (u2_l),
1547          [u2_r] "=&r" (u2_r), [u3_l] "=&r" (u3_l), [u3_r] "=&r" (u3_r),
1548          [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
1549          [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
1550        : [R63]  "r" (R63),
1551          [Filter2_l] "r" (Filter2_l), [Filter2_r] "r" (Filter2_r)
1552    );
1553
1554    __asm__ __volatile__ (
1555        /* vqs1 = vp8_signed_char_clamp(qs1 - u); */
1556        "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[u2_l]         \n\t"
1557        "addq_s.ph    %[vps1_l],       %[vps1_l],       %[u2_l]         \n\t"
1558
1559        /* vps1 = vp8_signed_char_clamp(ps1 + u); */
1560        "addq_s.ph    %[vps1_r],       %[vps1_r],       %[u2_r]         \n\t"
1561        "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[u2_r]         \n\t"
1562
1563        : [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
1564          [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
1565        : [u2_l] "r" (u2_l), [u2_r] "r" (u2_r)
1566    );
1567
1568    /* roughly 1/7th difference across boundary */
1569    __asm__ __volatile__ (
1570        /* u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); */
1571        "shra.ph      %[u3_l],         %[u3_l],         7               \n\t"
1572        "shra.ph      %[u3_r],         %[u3_r],         7               \n\t"
1573        "shll.ph      %[u3_l],         %[u3_l],         8               \n\t"
1574        "shll.ph      %[u3_r],         %[u3_r],         8               \n\t"
1575
1576        /* vqs2 = vp8_signed_char_clamp(qs2 - u); */
1577        "subq_s.ph    %[vqs2_l],       %[vqs2_l],       %[u3_l]         \n\t"
1578        "subq_s.ph    %[vqs2_r],       %[vqs2_r],       %[u3_r]         \n\t"
1579
1580        /* vps2 = vp8_signed_char_clamp(ps2 + u); */
1581        "addq_s.ph    %[vps2_l],       %[vps2_l],       %[u3_l]         \n\t"
1582        "addq_s.ph    %[vps2_r],       %[vps2_r],       %[u3_r]         \n\t"
1583
1584        : [u3_l] "+r" (u3_l), [u3_r] "+r" (u3_r), [vps2_l] "+r" (vps2_l),
1585          [vps2_r] "+r" (vps2_r), [vqs2_l] "+r" (vqs2_l), [vqs2_r] "+r" (vqs2_r)
1586        :
1587    );
1588
1589    /* Create quad-bytes from halfword pairs */
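    /* The left vector keeps its high bytes (and with HWM) while the right
     * vector's bytes are shifted down by 8, so the final or re-interleaves
     * them into four pixel bytes per word.  The ^ N128 stores below undo the
     * +/-128 bias that was applied when the pixels were converted to the
     * signed domain.
     */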
1590    __asm__ __volatile__ (
1591        "and          %[vqs0_l],       %[vqs0_l],       %[HWM]          \n\t"
1592        "shrl.ph      %[vqs0_r],       %[vqs0_r],       8               \n\t"
1593
1594        "and          %[vps0_l],       %[vps0_l],       %[HWM]          \n\t"
1595        "shrl.ph      %[vps0_r],       %[vps0_r],       8               \n\t"
1596
1597        "and          %[vqs1_l],       %[vqs1_l],       %[HWM]          \n\t"
1598        "shrl.ph      %[vqs1_r],       %[vqs1_r],       8               \n\t"
1599
1600        "and          %[vps1_l],       %[vps1_l],       %[HWM]          \n\t"
1601        "shrl.ph      %[vps1_r],       %[vps1_r],       8               \n\t"
1602
1603        "and          %[vqs2_l],       %[vqs2_l],       %[HWM]          \n\t"
1604        "shrl.ph      %[vqs2_r],       %[vqs2_r],       8               \n\t"
1605
1606        "and          %[vps2_l],       %[vps2_l],       %[HWM]          \n\t"
1607        "shrl.ph      %[vps2_r],       %[vps2_r],       8               \n\t"
1608
1609        "or           %[vqs0_r],       %[vqs0_l],       %[vqs0_r]       \n\t"
1610        "or           %[vps0_r],       %[vps0_l],       %[vps0_r]       \n\t"
1611        "or           %[vqs1_r],       %[vqs1_l],       %[vqs1_r]       \n\t"
1612        "or           %[vps1_r],       %[vps1_l],       %[vps1_r]       \n\t"
1613        "or           %[vqs2_r],       %[vqs2_l],       %[vqs2_r]       \n\t"
1614        "or           %[vps2_r],       %[vps2_l],       %[vps2_r]       \n\t"
1615
1616        : [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), [vqs1_l] "+r" (vqs1_l),
1617          [vqs1_r] "+r" (vqs1_r), [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
1618          [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r), [vqs2_l] "+r" (vqs2_l),
1619          [vqs2_r] "+r" (vqs2_r), [vps2_r] "+r" (vps2_r), [vps2_l] "+r" (vps2_l)
1620        : [HWM] "r" (HWM)
1621    );
1622
1623    *ps0 = vps0_r ^ N128;
1624    *ps1 = vps1_r ^ N128;
1625    *ps2 = vps2_r ^ N128;
1626    *qs0 = vqs0_r ^ N128;
1627    *qs1 = vqs1_r ^ N128;
1628    *qs2 = vqs2_r ^ N128;
1629}
1630
1631void vp8_mbloop_filter_horizontal_edge_mips
1632(
1633    unsigned char *s,
1634    int p,
1635    unsigned int flimit,
1636    unsigned int limit,
1637    unsigned int thresh,
1638    int count
1639)
1640{
1641    int i;
1642    uint32_t mask, hev;
1643    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
1644    unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
1645
1646    mask = 0;
1647    hev = 0;
1648    i = 0;
1649    p1 = 0;
1650    p2 = 0;
1651    p3 = 0;
1652    p4 = 0;
1653
1654    /* loop filter designed to work using chars so that we can make maximum use
1655     * of 8 bit simd instructions.
1656     */
1657
1658    sm1 = s - (p << 2);
1659    s0  = s - p - p - p;
1660    s1  = s - p - p;
1661    s2  = s - p;
1662    s3  = s;
1663    s4  = s + p;
1664    s5  = s + p + p;
1665    s6  = s + p + p + p;
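    /* sm1..s2 walk the four rows above the edge (p3..p0) and s3..s6 the four
     * rows at and below it (q0..q3); pm1, p0..p6 below are simply the raw
     * quad-byte loads taken from those rows.
     */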
1666
1667    /* prefetch data for load */
1668    prefetch_load_lf(s + p);
1669
1670    /* apply filter on 4 pixels at the same time */
1671    do
1672    {
1673        /* load quad-byte vectors
1674         * memory is 4 byte aligned
1675         */
1676        p1 = *((uint32_t *)(s1));
1677        p2 = *((uint32_t *)(s2));
1678        p3 = *((uint32_t *)(s3));
1679        p4 = *((uint32_t *)(s4));
1680
1681        /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1682         * mask will be zero and filtering is not needed
1683         */
1684        if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
1685        {
1686
1687            pm1 = *((uint32_t *)(sm1));
1688            p0  = *((uint32_t *)(s0));
1689            p5  = *((uint32_t *)(s5));
1690            p6  = *((uint32_t *)(s6));
1691
1692            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1693                                     thresh, &hev, &mask);
1694
1695            /* if mask == 0, filtering is not needed */
1696            if (mask)
1697            {
1698                /* filtering */
1699                vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1700
1701                /* unpack processed 4x4 neighborhood
1702                 * memory is 4 byte aligned
1703                 */
1704                *((uint32_t *)s0) = p0;
1705                *((uint32_t *)s1) = p1;
1706                *((uint32_t *)s2) = p2;
1707                *((uint32_t *)s3) = p3;
1708                *((uint32_t *)s4) = p4;
1709                *((uint32_t *)s5) = p5;
1710            }
1711        }
1712
1713        sm1 += 4;
1714        s0  += 4;
1715        s1  += 4;
1716        s2  += 4;
1717        s3  += 4;
1718        s4  += 4;
1719        s5  += 4;
1720        s6  += 4;
1721
1722        /* load quad-byte vectors
1723         * memory is 4 byte aligned
1724         */
1725        p1 = *((uint32_t *)(s1));
1726        p2 = *((uint32_t *)(s2));
1727        p3 = *((uint32_t *)(s3));
1728        p4 = *((uint32_t *)(s4));
1729
1730        /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1731         * mask will be zero and filtering is not needed
1732         */
1733        if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
1734        {
1735
1736            pm1 = *((uint32_t *)(sm1));
1737            p0  = *((uint32_t *)(s0));
1738            p5  = *((uint32_t *)(s5));
1739            p6  = *((uint32_t *)(s6));
1740
1741            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1742                                     thresh, &hev, &mask);
1743
1744            /* if mask == 0, filtering is not needed */
1745            if (mask)
1746            {
1747                /* filtering */
1748                vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1749
1750                /* unpack processed 4x4 neighborhood
1751                 * memory is 4 byte aligned
1752                 */
1753                *((uint32_t *)s0) = p0;
1754                *((uint32_t *)s1) = p1;
1755                *((uint32_t *)s2) = p2;
1756                *((uint32_t *)s3) = p3;
1757                *((uint32_t *)s4) = p4;
1758                *((uint32_t *)s5) = p5;
1759            }
1760        }
1761
1762        sm1 += 4;
1763        s0  += 4;
1764        s1  += 4;
1765        s2  += 4;
1766        s3  += 4;
1767        s4  += 4;
1768        s5  += 4;
1769        s6  += 4;
1770
1771        i += 8;
1772    }
1773
1774    while (i < count);
1775}
1776
1777void vp8_mbloop_filter_uvhorizontal_edge_mips
1778(
1779    unsigned char *s,
1780    int p,
1781    unsigned int flimit,
1782    unsigned int limit,
1783    unsigned int thresh,
1784    int count
1785)
1786{
1787    uint32_t mask, hev;
1788    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
1789    unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
1790
1791    mask = 0;
1792    hev = 0;
1793    p1 = 0;
1794    p2 = 0;
1795    p3 = 0;
1796    p4 = 0;
1797
1798    /* loop filter designed to work using chars so that we can make maximum use
1799     * of 8 bit simd instructions.
1800     */
1801
1802    sm1 = s - (p << 2);
1803    s0  = s - p - p - p;
1804    s1  = s - p - p;
1805    s2  = s - p;
1806    s3  = s;
1807    s4  = s + p;
1808    s5  = s + p + p;
1809    s6  = s + p + p + p;
1810
1811    /* load quad-byte vectors
1812     * memory is 4 byte aligned
1813     */
1814    p1 = *((uint32_t *)(s1));
1815    p2 = *((uint32_t *)(s2));
1816    p3 = *((uint32_t *)(s3));
1817    p4 = *((uint32_t *)(s4));
1818
1819    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1820     * mask will be zero and filtering is not needed
1821     */
1822    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
1823    {
1824
1825        pm1 = *((uint32_t *)(sm1));
1826        p0  = *((uint32_t *)(s0));
1827        p5  = *((uint32_t *)(s5));
1828        p6  = *((uint32_t *)(s6));
1829
1830        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1831                                 thresh, &hev, &mask);
1832
1833        /* if mask == 0, filtering is not needed */
1834        if (mask)
1835        {
1836            /* filtering */
1837            vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1838
1839            /* unpack processed 4x4 neighborhood
1840             * memory is 4 byte aligned
1841             */
1842            *((uint32_t *)s0) = p0;
1843            *((uint32_t *)s1) = p1;
1844            *((uint32_t *)s2) = p2;
1845            *((uint32_t *)s3) = p3;
1846            *((uint32_t *)s4) = p4;
1847            *((uint32_t *)s5) = p5;
1848        }
1849    }
1850
1851    sm1 += 4;
1852    s0  += 4;
1853    s1  += 4;
1854    s2  += 4;
1855    s3  += 4;
1856    s4  += 4;
1857    s5  += 4;
1858    s6  += 4;
1859
1860    /* load quad-byte vectors
1861     * memory is 4 byte aligned
1862     */
1863    p1 = *((uint32_t *)(s1));
1864    p2 = *((uint32_t *)(s2));
1865    p3 = *((uint32_t *)(s3));
1866    p4 = *((uint32_t *)(s4));
1867
1868    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1869     * mask will be zero and filtering is not needed
1870     */
1871    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
1872    {
1873
1874        pm1 = *((uint32_t *)(sm1));
1875        p0  = *((uint32_t *)(s0));
1876        p5  = *((uint32_t *)(s5));
1877        p6  = *((uint32_t *)(s6));
1878
1879        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1880                                 thresh, &hev, &mask);
1881
1882        /* if mask == 0, filtering is not needed */
1883        if (mask)
1884        {
1885            /* filtering */
1886            vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1887
1888            /* unpack processed 4x4 neighborhood
1889             * memory is 4 byte aligned
1890             */
1891            *((uint32_t *)s0) = p0;
1892            *((uint32_t *)s1) = p1;
1893            *((uint32_t *)s2) = p2;
1894            *((uint32_t *)s3) = p3;
1895            *((uint32_t *)s4) = p4;
1896            *((uint32_t *)s5) = p5;
1897        }
1898    }
1899}
1900
1901
1902void vp8_mbloop_filter_vertical_edge_mips
1903(
1904    unsigned char *s,
1905    int p,
1906    unsigned int flimit,
1907    unsigned int limit,
1908    unsigned int thresh,
1909    int count
1910)
1911{
1912
1913    int i;
1914    uint32_t mask, hev;
1915    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
1916    unsigned char *s1, *s2, *s3, *s4;
1917    uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
1918
1919    mask = 0;
1920    hev = 0;
1921    i = 0;
1922    pm1 = 0;
1923    p0 = 0;
1924    p1 = 0;
1925    p2 = 0;
1926    p3 = 0;
1927    p4 = 0;
1928    p5 = 0;
1929    p6 = 0;
1930
1931    /* loop filter designed to work using chars so that we can make maximum use
1932     * of 8 bit simd instructions.
1933     */
1934
1935    /* apply filter on 4 pixels at the same time */
1936    do
1937    {
1938        s1 = s;
1939        s2 = s + p;
1940        s3 = s2 + p;
1941        s4 = s3 + p;
1942        s  = s4 + p;
1943
1944        /* load quad-byte vectors
1945         * memory is 4 byte aligned
1946         */
1947        p2  = *((uint32_t *)(s1 - 4));
1948        p6  = *((uint32_t *)(s1));
1949        p1  = *((uint32_t *)(s2 - 4));
1950        p5  = *((uint32_t *)(s2));
1951        p0  = *((uint32_t *)(s3 - 4));
1952        p4  = *((uint32_t *)(s3));
1953        pm1 = *((uint32_t *)(s4 - 4));
1954        p3  = *((uint32_t *)(s4));
1955
1956        /* transpose pm1, p0, p1, p2 */
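        /* precrq.qb.ph/precr.qb.ph pick the high/low byte of each halfword
         * from a register pair, and precrq.ph.w/append merge the halves, so
         * after the two passes each register holds one column of the loaded
         * 4x4 block, i.e. the same layout the horizontal-edge filter expects.
         */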
1957        __asm__ __volatile__ (
1958            "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
1959            "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
1960            "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
1961            "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
1962
1963            "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
1964            "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
1965            "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
1966            "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
1967
1968            "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
1969            "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
1970            "append         %[p1],      %[sec3],    16          \n\t"
1971            "append         %[pm1],     %[sec4],    16          \n\t"
1972
1973            : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
1974              [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
1975              [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
1976              [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
1977            :
1978        );
1979
1980        /* transpose p3, p4, p5, p6 */
1981        __asm__ __volatile__ (
1982            "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
1983            "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
1984            "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
1985            "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
1986
1987            "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
1988            "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
1989            "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
1990            "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
1991
1992            "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
1993            "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
1994            "append         %[p5],      %[sec3],    16          \n\t"
1995            "append         %[p3],      %[sec4],    16          \n\t"
1996
1997            : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
1998              [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
1999              [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2000              [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
2001            :
2002        );
2003
2004        /* if (p1 - p4 == 0) and (p2 - p3 == 0)
2005         * mask will be zero and filtering is not needed
2006         */
2007        if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
2008        {
2009
2010            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
2011                                     thresh, &hev, &mask);
2012
2013            /* if mask == 0, filtering is not needed */
2014            if (mask)
2015            {
2016                /* filtering */
2017                vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
2018
2019                /* don't use transpose on output data
2020                 * because memory isn't aligned
2021                 */
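                /* Each result register is written out a byte at a time: the
                 * low byte goes to row s4, then an 8-bit shift exposes the
                 * byte for s3, and so on up to s1.
                 */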
2022                __asm__ __volatile__ (
2023                    "sb         %[p5],  2(%[s4])        \n\t"
2024                    "sb         %[p4],  1(%[s4])        \n\t"
2025                    "sb         %[p3],  0(%[s4])        \n\t"
2026                    "sb         %[p2], -1(%[s4])        \n\t"
2027                    "sb         %[p1], -2(%[s4])        \n\t"
2028                    "sb         %[p0], -3(%[s4])        \n\t"
2029                    :
2030                    : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
2031                      [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2032                );
2033
2034                __asm__ __volatile__ (
2035                    "srl        %[p5], %[p5], 8         \n\t"
2036                    "srl        %[p4], %[p4], 8         \n\t"
2037                    "srl        %[p3], %[p3], 8         \n\t"
2038                    "srl        %[p2], %[p2], 8         \n\t"
2039                    "srl        %[p1], %[p1], 8         \n\t"
2040                    "srl        %[p0], %[p0], 8         \n\t"
2041                    : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2042                      [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2043                    :
2044                );
2045
2046                __asm__ __volatile__ (
2047                    "sb         %[p5],  2(%[s3])        \n\t"
2048                    "sb         %[p4],  1(%[s3])        \n\t"
2049                    "sb         %[p3],  0(%[s3])        \n\t"
2050                    "sb         %[p2], -1(%[s3])        \n\t"
2051                    "sb         %[p1], -2(%[s3])        \n\t"
2052                    "sb         %[p0], -3(%[s3])        \n\t"
2053                    :
2054                    : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
2055                      [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2056                );
2057
2058                __asm__ __volatile__ (
2059                    "srl        %[p5], %[p5], 8         \n\t"
2060                    "srl        %[p4], %[p4], 8         \n\t"
2061                    "srl        %[p3], %[p3], 8         \n\t"
2062                    "srl        %[p2], %[p2], 8         \n\t"
2063                    "srl        %[p1], %[p1], 8         \n\t"
2064                    "srl        %[p0], %[p0], 8         \n\t"
2065                    : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2066                      [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2067                    :
2068                );
2069
2070                __asm__ __volatile__ (
2071                    "sb         %[p5],  2(%[s2])        \n\t"
2072                    "sb         %[p4],  1(%[s2])        \n\t"
2073                    "sb         %[p3],  0(%[s2])        \n\t"
2074                    "sb         %[p2], -1(%[s2])        \n\t"
2075                    "sb         %[p1], -2(%[s2])        \n\t"
2076                    "sb         %[p0], -3(%[s2])        \n\t"
2077                    :
2078                    : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
2079                      [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2080                );
2081
2082                __asm__ __volatile__ (
2083                    "srl        %[p5], %[p5], 8         \n\t"
2084                    "srl        %[p4], %[p4], 8         \n\t"
2085                    "srl        %[p3], %[p3], 8         \n\t"
2086                    "srl        %[p2], %[p2], 8         \n\t"
2087                    "srl        %[p1], %[p1], 8         \n\t"
2088                    "srl        %[p0], %[p0], 8         \n\t"
2089                    : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2090                      [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2091                    :
2092                );
2093
2094                __asm__ __volatile__ (
2095                    "sb         %[p5],  2(%[s1])        \n\t"
2096                    "sb         %[p4],  1(%[s1])        \n\t"
2097                    "sb         %[p3],  0(%[s1])        \n\t"
2098                    "sb         %[p2], -1(%[s1])        \n\t"
2099                    "sb         %[p1], -2(%[s1])        \n\t"
2100                    "sb         %[p0], -3(%[s1])        \n\t"
2101                    :
2102                    : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
2103                      [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2104                );
2105            }
2106        }
2107
2108        i += 4;
2109    }
2110
2111    while (i < count);
2112}
2113
2114void vp8_mbloop_filter_uvvertical_edge_mips
2115(
2116    unsigned char *s,
2117    int p,
2118    unsigned int flimit,
2119    unsigned int limit,
2120    unsigned int thresh,
2121    int count
2122)
2123{
2124    uint32_t mask, hev;
2125    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
2126    unsigned char *s1, *s2, *s3, *s4;
2127    uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
2128
2129    mask = 0;
2130    hev = 0;
2131    pm1 = 0;
2132    p0 = 0;
2133    p1 = 0;
2134    p2 = 0;
2135    p3 = 0;
2136    p4 = 0;
2137    p5 = 0;
2138    p6 = 0;
2139
2140    /* loop filter designed to work using chars so that we can make maximum use
2141     * of 8 bit simd instructions.
2142     */
2143
2144    /* apply filter on 4 pixels at the same time */
2145
2146    s1 = s;
2147    s2 = s + p;
2148    s3 = s2 + p;
2149    s4 = s3 + p;
2150
2151    /* prefetch data for load */
2152    prefetch_load_lf(s + 2 * p);
2153
2154    /* load quad-byte vectors
2155     * memory is 4 byte aligned
2156     */
2157    p2  = *((uint32_t *)(s1 - 4));
2158    p6  = *((uint32_t *)(s1));
2159    p1  = *((uint32_t *)(s2 - 4));
2160    p5  = *((uint32_t *)(s2));
2161    p0  = *((uint32_t *)(s3 - 4));
2162    p4  = *((uint32_t *)(s3));
2163    pm1 = *((uint32_t *)(s4 - 4));
2164    p3  = *((uint32_t *)(s4));
2165
2166    /* transpose pm1, p0, p1, p2 */
2167    __asm__ __volatile__ (
2168        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
2169        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
2170        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
2171        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
2172
2173        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
2174        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
2175        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
2176        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
2177
2178        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
2179        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
2180        "append         %[p1],      %[sec3],    16          \n\t"
2181        "append         %[pm1],     %[sec4],    16          \n\t"
2182
2183        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
2184          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
2185          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
2186          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
2187        :
2188    );
2189
2190    /* transpose p3, p4, p5, p6 */
2191    __asm__ __volatile__ (
2192        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
2193        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
2194        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
2195        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
2196
2197        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
2198        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
2199        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
2200        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
2201
2202        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
2203        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
2204        "append         %[p5],      %[sec3],    16          \n\t"
2205        "append         %[p3],      %[sec4],    16          \n\t"
2206
2207        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
2208          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
2209          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2210          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
2211        :
2212    );
2213
2214    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
2215     * mask will be zero and filtering is not needed
2216     */
2217    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
2218    {
2219
2220        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
2221                                 thresh, &hev, &mask);
2222
2223        /* if mask == 0, filtering is not needed */
2224        if (mask)
2225        {
2226            /* filtering */
2227            vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
2228
2229            /* don't use transpose on output data
2230             * because memory isn't aligned
2231             */
2232            __asm__ __volatile__ (
2233                "sb         %[p5],  2(%[s4])        \n\t"
2234                "sb         %[p4],  1(%[s4])        \n\t"
2235                "sb         %[p3],  0(%[s4])        \n\t"
2236                "sb         %[p2], -1(%[s4])        \n\t"
2237                "sb         %[p1], -2(%[s4])        \n\t"
2238                "sb         %[p0], -3(%[s4])        \n\t"
2239                :
2240                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
2241                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2242            );
2243
2244            __asm__ __volatile__ (
2245                "srl        %[p5], %[p5], 8         \n\t"
2246                "srl        %[p4], %[p4], 8         \n\t"
2247                "srl        %[p3], %[p3], 8         \n\t"
2248                "srl        %[p2], %[p2], 8         \n\t"
2249                "srl        %[p1], %[p1], 8         \n\t"
2250                "srl        %[p0], %[p0], 8         \n\t"
2251                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2252                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2253                :
2254            );
2255
2256            __asm__ __volatile__ (
2257                "sb         %[p5],  2(%[s3])        \n\t"
2258                "sb         %[p4],  1(%[s3])        \n\t"
2259                "sb         %[p3],  0(%[s3])        \n\t"
2260                "sb         %[p2], -1(%[s3])        \n\t"
2261                "sb         %[p1], -2(%[s3])        \n\t"
2262                "sb         %[p0], -3(%[s3])        \n\t"
2263                :
2264                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
2265                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2266            );
2267
2268            __asm__ __volatile__ (
2269                "srl        %[p5], %[p5], 8         \n\t"
2270                "srl        %[p4], %[p4], 8         \n\t"
2271                "srl        %[p3], %[p3], 8         \n\t"
2272                "srl        %[p2], %[p2], 8         \n\t"
2273                "srl        %[p1], %[p1], 8         \n\t"
2274                "srl        %[p0], %[p0], 8         \n\t"
2275                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2276                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2277                :
2278            );
2279
2280            __asm__ __volatile__ (
2281                "sb         %[p5],  2(%[s2])        \n\t"
2282                "sb         %[p4],  1(%[s2])        \n\t"
2283                "sb         %[p3],  0(%[s2])        \n\t"
2284                "sb         %[p2], -1(%[s2])        \n\t"
2285                "sb         %[p1], -2(%[s2])        \n\t"
2286                "sb         %[p0], -3(%[s2])        \n\t"
2287                :
2288                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
2289                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2290            );
2291
2292            __asm__ __volatile__ (
2293                "srl        %[p5], %[p5], 8         \n\t"
2294                "srl        %[p4], %[p4], 8         \n\t"
2295                "srl        %[p3], %[p3], 8         \n\t"
2296                "srl        %[p2], %[p2], 8         \n\t"
2297                "srl        %[p1], %[p1], 8         \n\t"
2298                "srl        %[p0], %[p0], 8         \n\t"
2299                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2300                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2301                :
2302            );
2303
2304            __asm__ __volatile__ (
2305                "sb         %[p5],  2(%[s1])        \n\t"
2306                "sb         %[p4],  1(%[s1])        \n\t"
2307                "sb         %[p3],  0(%[s1])        \n\t"
2308                "sb         %[p2], -1(%[s1])        \n\t"
2309                "sb         %[p1], -2(%[s1])        \n\t"
2310                "sb         %[p0], -3(%[s1])        \n\t"
2311                :
2312                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
2313                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2314            );
2315        }
2316    }
2317
2318    s1 = s4 + p;
2319    s2 = s1 + p;
2320    s3 = s2 + p;
2321    s4 = s3 + p;
2322
2323    /* load quad-byte vectors
2324     * memory is 4 byte aligned
2325     */
2326    p2  = *((uint32_t *)(s1 - 4));
2327    p6  = *((uint32_t *)(s1));
2328    p1  = *((uint32_t *)(s2 - 4));
2329    p5  = *((uint32_t *)(s2));
2330    p0  = *((uint32_t *)(s3 - 4));
2331    p4  = *((uint32_t *)(s3));
2332    pm1 = *((uint32_t *)(s4 - 4));
2333    p3  = *((uint32_t *)(s4));
2334
2335    /* transpose pm1, p0, p1, p2 */
2336    __asm__ __volatile__ (
2337        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
2338        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
2339        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
2340        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
2341
2342        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
2343        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
2344        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
2345        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
2346
2347        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
2348        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
2349        "append         %[p1],      %[sec3],    16          \n\t"
2350        "append         %[pm1],     %[sec4],    16          \n\t"
2351
2352        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
2353          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
2354          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
2355          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
2356        :
2357    );
2358
2359    /* transpose p3, p4, p5, p6 */
2360    __asm__ __volatile__ (
2361        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
2362        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
2363        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
2364        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
2365
2366        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
2367        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
2368        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
2369        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
2370
2371        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
2372        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
2373        "append         %[p5],      %[sec3],    16          \n\t"
2374        "append         %[p3],      %[sec4],    16          \n\t"
2375
2376        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
2377          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
2378          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2379          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
2380        :
2381    );
2382
2383    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
2384     * mask will be zero and filtering is not needed
2385     */
2386    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
2387    {
2388
2389        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, thresh, &hev, &mask);
2390
2391        /* if mask == 0, filtering is not needed */
2392        if (mask)
2393        {
2394            /* filtering */
2395            vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
2396
2397            /* don't use transpose on output data
2398             * because memory isn't aligned
2399             */
2400            __asm__ __volatile__ (
2401                "sb         %[p5],  2(%[s4])        \n\t"
2402                "sb         %[p4],  1(%[s4])        \n\t"
2403                "sb         %[p3],  0(%[s4])        \n\t"
2404                "sb         %[p2], -1(%[s4])        \n\t"
2405                "sb         %[p1], -2(%[s4])        \n\t"
2406                "sb         %[p0], -3(%[s4])        \n\t"
2407                :
2408                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
2409                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2410            );
2411
2412            __asm__ __volatile__ (
2413                "srl        %[p5], %[p5], 8         \n\t"
2414                "srl        %[p4], %[p4], 8         \n\t"
2415                "srl        %[p3], %[p3], 8         \n\t"
2416                "srl        %[p2], %[p2], 8         \n\t"
2417                "srl        %[p1], %[p1], 8         \n\t"
2418                "srl        %[p0], %[p0], 8         \n\t"
2419                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2420                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2421                :
2422            );
2423
2424            __asm__ __volatile__ (
2425                "sb         %[p5],  2(%[s3])        \n\t"
2426                "sb         %[p4],  1(%[s3])        \n\t"
2427                "sb         %[p3],  0(%[s3])        \n\t"
2428                "sb         %[p2], -1(%[s3])        \n\t"
2429                "sb         %[p1], -2(%[s3])        \n\t"
2430                "sb         %[p0], -3(%[s3])        \n\t"
2431                :
2432                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
2433                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2434            );
2435
2436            __asm__ __volatile__ (
2437                "srl        %[p5], %[p5], 8         \n\t"
2438                "srl        %[p4], %[p4], 8         \n\t"
2439                "srl        %[p3], %[p3], 8         \n\t"
2440                "srl        %[p2], %[p2], 8         \n\t"
2441                "srl        %[p1], %[p1], 8         \n\t"
2442                "srl        %[p0], %[p0], 8         \n\t"
2443                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2444                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2445                :
2446            );
2447
2448            __asm__ __volatile__ (
2449                "sb         %[p5],  2(%[s2])        \n\t"
2450                "sb         %[p4],  1(%[s2])        \n\t"
2451                "sb         %[p3],  0(%[s2])        \n\t"
2452                "sb         %[p2], -1(%[s2])        \n\t"
2453                "sb         %[p1], -2(%[s2])        \n\t"
2454                "sb         %[p0], -3(%[s2])        \n\t"
2455                :
2456                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
2457                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2458            );
2459
2460            __asm__ __volatile__ (
2461                "srl        %[p5], %[p5], 8         \n\t"
2462                "srl        %[p4], %[p4], 8         \n\t"
2463                "srl        %[p3], %[p3], 8         \n\t"
2464                "srl        %[p2], %[p2], 8         \n\t"
2465                "srl        %[p1], %[p1], 8         \n\t"
2466                "srl        %[p0], %[p0], 8         \n\t"
2467                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2468                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2469                :
2470            );
2471
2472            __asm__ __volatile__ (
2473                "sb         %[p5],  2(%[s1])        \n\t"
2474                "sb         %[p4],  1(%[s1])        \n\t"
2475                "sb         %[p3],  0(%[s1])        \n\t"
2476                "sb         %[p2], -1(%[s1])        \n\t"
2477                "sb         %[p1], -2(%[s1])        \n\t"
2478                "sb         %[p0], -3(%[s1])        \n\t"
2479                :
2480                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
2481                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2482            );
2483        }
2484    }
2485}
2486
2487/* Horizontal MB filtering */
2488void vp8_loop_filter_mbh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
2489                               int y_stride, int uv_stride, loop_filter_info *lfi)
2490{
2491    unsigned int thresh_vec, flimit_vec, limit_vec;
2492    unsigned char thresh, flimit, limit, flimit_temp;
2493
2494    /* use direct values instead of pointers */
2495    limit = *(lfi->lim);
2496    flimit_temp = *(lfi->mblim);
2497    thresh = *(lfi->hev_thr);
2498    flimit = flimit_temp;
2499
2500    /* create quad-byte */
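    /* replv.qb broadcasts the low byte of each scalar into all four byte
     * lanes, i.e. roughly thresh_vec = thresh * 0x01010101, so the edge
     * helpers can compare four pixels per instruction.
     */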
2501    __asm__ __volatile__ (
2502        "replv.qb       %[thresh_vec], %[thresh]    \n\t"
2503        "replv.qb       %[flimit_vec], %[flimit]    \n\t"
2504        "replv.qb       %[limit_vec],  %[limit]     \n\t"
2505        : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
2506        : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
2507    );
2508
2509    vp8_mbloop_filter_horizontal_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
2510
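    /* The chroma edge helpers ignore their count argument and always process
     * the full 8-pixel edge, hence the 0 passed below.
     */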
2511    if (u_ptr)
2512    {
2513        vp8_mbloop_filter_uvhorizontal_edge_mips(u_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2514    }
2515
2516    if (v_ptr)
2517    {
2518        vp8_mbloop_filter_uvhorizontal_edge_mips(v_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2519    }
2520}
2521
2522
2523/* Vertical MB Filtering */
2524void vp8_loop_filter_mbv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
2525                               int y_stride, int uv_stride, loop_filter_info *lfi)
2526{
2527    unsigned int thresh_vec, flimit_vec, limit_vec;
2528    unsigned char thresh, flimit, limit, flimit_temp;
2529
2530    /* use direct values instead of pointers */
2531    limit = *(lfi->lim);
2532    flimit_temp = *(lfi->mblim);
2533    thresh = *(lfi->hev_thr);
2534    flimit = flimit_temp;
2535
2536    /* create quad-byte */
2537    __asm__ __volatile__ (
2538        "replv.qb       %[thresh_vec], %[thresh]    \n\t"
2539        "replv.qb       %[flimit_vec], %[flimit]    \n\t"
2540        "replv.qb       %[limit_vec],  %[limit]     \n\t"
2541        : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
2542        : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
2543    );
2544
2545    vp8_mbloop_filter_vertical_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
2546
2547    if (u_ptr)
2548        vp8_mbloop_filter_uvvertical_edge_mips(u_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2549
2550    if (v_ptr)
2551        vp8_mbloop_filter_uvvertical_edge_mips(v_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2552}
2553
2554
2555/* Horizontal B Filtering */
2556void vp8_loop_filter_bh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
2557                              int y_stride, int uv_stride, loop_filter_info *lfi)
2558{
2559    unsigned int thresh_vec, flimit_vec, limit_vec;
2560    unsigned char thresh, flimit, limit, flimit_temp;
2561
2562    /* use direct values instead of pointers */
2563    limit = *(lfi->lim);
2564    flimit_temp = *(lfi->blim);
2565    thresh = *(lfi->hev_thr);
2566    flimit = flimit_temp;
2567
2568    /* create quad-byte */
2569    __asm__ __volatile__ (
2570        "replv.qb       %[thresh_vec], %[thresh]    \n\t"
2571        "replv.qb       %[flimit_vec], %[flimit]    \n\t"
2572        "replv.qb       %[limit_vec],  %[limit]     \n\t"
2573        : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
2574        : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
2575    );
2576
2577    vp8_loop_filter_horizontal_edge_mips(y_ptr + 4 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
2578    vp8_loop_filter_horizontal_edge_mips(y_ptr + 8 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
2579    vp8_loop_filter_horizontal_edge_mips(y_ptr + 12 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
2580
2581    if (u_ptr)
2582        vp8_loop_filter_uvhorizontal_edge_mips(u_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2583
2584    if (v_ptr)
2585        vp8_loop_filter_uvhorizontal_edge_mips(v_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2586}
2587
2588
2589/* Vertical B Filtering */
2590void vp8_loop_filter_bv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
2591                              int y_stride, int uv_stride, loop_filter_info *lfi)
2592{
2593    unsigned int thresh_vec, flimit_vec, limit_vec;
2594    unsigned char thresh, flimit, limit, flimit_temp;
2595
2596    /* use direct values instead of pointers */
2597    limit = *(lfi->lim);
2598    flimit_temp = *(lfi->blim);
2599    thresh = *(lfi->hev_thr);
2600    flimit = flimit_temp;
2601
2602    /* create quad-byte */
2603    __asm__ __volatile__ (
2604        "replv.qb       %[thresh_vec], %[thresh]    \n\t"
2605        "replv.qb       %[flimit_vec], %[flimit]    \n\t"
2606        "replv.qb       %[limit_vec],  %[limit]     \n\t"
2607        : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
2608        : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
2609    );
2610
2611    vp8_loop_filter_vertical_edge_mips(y_ptr + 4, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
2612    vp8_loop_filter_vertical_edge_mips(y_ptr + 8, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
2613    vp8_loop_filter_vertical_edge_mips(y_ptr + 12, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
2614
2615    if (u_ptr)
2616        vp8_loop_filter_uvvertical_edge_mips(u_ptr + 4, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2617
2618    if (v_ptr)
2619        vp8_loop_filter_uvvertical_edge_mips(v_ptr + 4, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2620}
2621
2622#endif
2623