1/*
2 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <stdlib.h>
12#include "vp8_rtcd.h"
13#include "vp8/common/onyxc_int.h"
14
15#if HAVE_DSPR2
16typedef unsigned char uc;
17
18/* prefetch data for load */
static inline void prefetch_load_lf(unsigned char *src) {
20  __asm__ __volatile__("pref   0,  0(%[src])   \n\t" : : [src] "r"(src));
21}
22
23/* prefetch data for store */
static inline void prefetch_store_lf(unsigned char *dst) {
25  __asm__ __volatile__("pref   1,  0(%[dst])   \n\t" : : [dst] "r"(dst));
26}
27
/* process 4 pixels at a time;
 * compute hev and mask in the same function
 */
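/* For reference, the per-pixel logic vectorized here is roughly the scalar
 * VP8 filter-mask computation (illustrative sketch, not the exact scalar
 * helper):
 *
 *   mask = (abs(p3 - p2) > limit) | (abs(p2 - p1) > limit) |
 *          (abs(p1 - p0) > limit) | (abs(q1 - q0) > limit) |
 *          (abs(q2 - q1) > limit) | (abs(q3 - q2) > limit) |
 *          (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit);
 *   hev  = (abs(p1 - p0) > thresh) | (abs(q1 - q0) > thresh);
 *
 * Each comparison is done for 4 pixels at once with cmpgu.lt.qb, and pick.qb
 * expands the result so that *mask is 0xFF in every byte lane that should be
 * filtered (no limit exceeded) and *hev is 0xFF in every lane with high edge
 * variance.
 */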
31static __inline void vp8_filter_mask_vec_mips(
32    uint32_t limit, uint32_t flimit, uint32_t p1, uint32_t p0, uint32_t p3,
33    uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, uint32_t q3,
34    uint32_t thresh, uint32_t *hev, uint32_t *mask) {
35  uint32_t c, r, r3, r_k;
36  uint32_t s1, s2, s3;
37  uint32_t ones = 0xFFFFFFFF;
38  uint32_t hev1;
39
40  __asm__ __volatile__(
41      /* mask |= (abs(p3 - p2) > limit) */
42      "subu_s.qb      %[c],   %[p3],     %[p2]        \n\t"
43      "subu_s.qb      %[r_k], %[p2],     %[p3]        \n\t"
44      "or             %[r_k], %[r_k],    %[c]         \n\t"
45      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
46      "or             %[r],   $0,        %[c]         \n\t"
47
48      /* mask |= (abs(p2 - p1) > limit) */
49      "subu_s.qb      %[c],   %[p2],     %[p1]        \n\t"
50      "subu_s.qb      %[r_k], %[p1],     %[p2]        \n\t"
51      "or             %[r_k], %[r_k],    %[c]         \n\t"
52      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
53      "or             %[r],   %[r],      %[c]         \n\t"
54
55      /* mask |= (abs(p1 - p0) > limit)
56       * hev  |= (abs(p1 - p0) > thresh)
57       */
58      "subu_s.qb      %[c],   %[p1],     %[p0]        \n\t"
59      "subu_s.qb      %[r_k], %[p0],     %[p1]        \n\t"
60      "or             %[r_k], %[r_k],    %[c]         \n\t"
61      "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
62      "or             %[r3],  $0,        %[c]         \n\t"
63      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
64      "or             %[r],   %[r],      %[c]         \n\t"
65
66      /* mask |= (abs(q1 - q0) > limit)
67       * hev  |= (abs(q1 - q0) > thresh)
68       */
69      "subu_s.qb      %[c],   %[q1],     %[q0]        \n\t"
70      "subu_s.qb      %[r_k], %[q0],     %[q1]        \n\t"
71      "or             %[r_k], %[r_k],    %[c]         \n\t"
72      "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
73      "or             %[r3],  %[r3],     %[c]         \n\t"
74      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
75      "or             %[r],   %[r],      %[c]         \n\t"
76
77      /* mask |= (abs(q2 - q1) > limit) */
78      "subu_s.qb      %[c],   %[q2],     %[q1]        \n\t"
79      "subu_s.qb      %[r_k], %[q1],     %[q2]        \n\t"
80      "or             %[r_k], %[r_k],    %[c]         \n\t"
81      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
82      "or             %[r],   %[r],      %[c]         \n\t"
83      "sll            %[r3],    %[r3],    24          \n\t"
84
85      /* mask |= (abs(q3 - q2) > limit) */
86      "subu_s.qb      %[c],   %[q3],     %[q2]        \n\t"
87      "subu_s.qb      %[r_k], %[q2],     %[q3]        \n\t"
88      "or             %[r_k], %[r_k],    %[c]         \n\t"
89      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
90      "or             %[r],   %[r],      %[c]         \n\t"
91
92      : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3)
93      : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
94        [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
95        [thresh] "r"(thresh));
96
97  __asm__ __volatile__(
98      /* abs(p0 - q0) */
99      "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
100      "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
101      "wrdsp          %[r3]                           \n\t"
102      "or             %[s1],  %[r_k],    %[c]         \n\t"
103
104      /* abs(p1 - q1) */
105      "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
106      "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
107      "pick.qb        %[hev1], %[ones],  $0           \n\t"
108      "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
109      "or             %[s2],   %[r_k],   %[c]         \n\t"
110
111      /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit */
112      "shrl.qb        %[s2],   %[s2],     1           \n\t"
113      "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
114      "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
115      "or             %[r],    %[r],      %[c]        \n\t"
116      "sll            %[r],    %[r],      24          \n\t"
117
118      "wrdsp          %[r]                            \n\t"
119      "pick.qb        %[s2],  $0,         %[ones]     \n\t"
120
121      : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
122        [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
123      : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
124        [ones] "r"(ones), [flimit] "r"(flimit));
125
126  *hev = hev1;
127  *mask = s2;
128}
129
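/* For orientation, the halfword-pair arithmetic below implements
 * (approximately, as an illustrative sketch) the standard scalar VP8 filter,
 * where vp8_signed_char_clamp() saturates to [-128, 127]:
 *
 *   vp8_filter = vp8_signed_char_clamp(ps1 - qs1) & hev;
 *   vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)) & mask;
 *   Filter1    = vp8_signed_char_clamp(vp8_filter + 4) >> 3;
 *   Filter2    = vp8_signed_char_clamp(vp8_filter + 3) >> 3;
 *   qs0        = vp8_signed_char_clamp(qs0 - Filter1);
 *   ps0        = vp8_signed_char_clamp(ps0 + Filter2);
 *   vp8_filter = ((Filter1 + 1) >> 1) & ~hev;
 *   qs1        = vp8_signed_char_clamp(qs1 - vp8_filter);
 *   ps1        = vp8_signed_char_clamp(ps1 + vp8_filter);
 *
 * with samples converted to signed form (^ 0x80) on entry and converted back
 * on exit, as done with N128 below.
 */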
130/* inputs & outputs are quad-byte vectors */
131static __inline void vp8_filter_mips(uint32_t mask, uint32_t hev, uint32_t *ps1,
132                                     uint32_t *ps0, uint32_t *qs0,
133                                     uint32_t *qs1) {
134  int32_t vp8_filter_l, vp8_filter_r;
135  int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
136  int32_t subr_r, subr_l;
137  uint32_t t1, t2, HWM, t3;
138  uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
139
140  int32_t vps1, vps0, vqs0, vqs1;
141  int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
142  uint32_t N128;
143
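  /* constant layout: every 8-bit sample is kept in the high byte of a 16-bit
   * lane, so t1/t2/t3 encode per-sample +3/+4/+1, HWM keeps only the high
   * byte of each halfword, and N128 toggles the sign bit of every byte
   * (unsigned <-> signed sample representation). */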
144  N128 = 0x80808080;
145  t1 = 0x03000300;
146  t2 = 0x04000400;
147  t3 = 0x01000100;
148  HWM = 0xFF00FF00;
149
150  vps0 = (*ps0) ^ N128;
151  vps1 = (*ps1) ^ N128;
152  vqs0 = (*qs0) ^ N128;
153  vqs1 = (*qs1) ^ N128;
154
  /* use halfword pairs instead of quad-bytes for better accuracy */
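  /* e.g. for a packed word 0xAABBCCDD:
   *   left  half:  0xAABBCCDD & HWM        -> 0xAA00CC00  (samples AA, CC)
   *   right half: (0xAABBCCDD << 8) & HWM  -> 0xBB00DD00  (samples BB, DD)
   * so each sample sits in the top 8 bits of a signed 16-bit lane, giving the
   * saturating .ph arithmetic below enough headroom; the two halves are
   * re-merged at the end of the function. */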
156  vps0_l = vps0 & HWM;
157  vps0_r = vps0 << 8;
158  vps0_r = vps0_r & HWM;
159
160  vps1_l = vps1 & HWM;
161  vps1_r = vps1 << 8;
162  vps1_r = vps1_r & HWM;
163
164  vqs0_l = vqs0 & HWM;
165  vqs0_r = vqs0 << 8;
166  vqs0_r = vqs0_r & HWM;
167
168  vqs1_l = vqs1 & HWM;
169  vqs1_r = vqs1 << 8;
170  vqs1_r = vqs1_r & HWM;
171
172  mask_l = mask & HWM;
173  mask_r = mask << 8;
174  mask_r = mask_r & HWM;
175
176  hev_l = hev & HWM;
177  hev_r = hev << 8;
178  hev_r = hev_r & HWM;
179
180  __asm__ __volatile__(
181      /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
182      "subq_s.ph    %[vp8_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
183      "subq_s.ph    %[vp8_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
184
185      /* qs0 - ps0 */
186      "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
187      "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
188
189      /* vp8_filter &= hev; */
190      "and          %[vp8_filter_l], %[vp8_filter_l], %[hev_l]        \n\t"
191      "and          %[vp8_filter_r], %[vp8_filter_r], %[hev_r]        \n\t"
192
193      /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
194      "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
195      "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
196      "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
197      "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
198      "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
199      "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
200      "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
201      "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
202
203      /* vp8_filter &= mask; */
204      "and          %[vp8_filter_l], %[vp8_filter_l], %[mask_l]       \n\t"
205      "and          %[vp8_filter_r], %[vp8_filter_r], %[mask_r]       \n\t"
206
207      : [vp8_filter_l] "=&r"(vp8_filter_l), [vp8_filter_r] "=&r"(vp8_filter_r),
208        [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
209        [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
210
211      : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
212        [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
213        [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
214        [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
215        [HWM] "r"(HWM));
216
217  /* save bottom 3 bits so that we round one side +4 and the other +3 */
218  __asm__ __volatile__(
      /* Filter1 = vp8_signed_char_clamp(vp8_filter + 4) >>= 3; */
220      "addq_s.ph    %[Filter1_l],    %[vp8_filter_l], %[t2]           \n\t"
221      "addq_s.ph    %[Filter1_r],    %[vp8_filter_r], %[t2]           \n\t"
222
      /* Filter2 = vp8_signed_char_clamp(vp8_filter + 3) >>= 3; */
224      "addq_s.ph    %[Filter2_l],    %[vp8_filter_l], %[t1]           \n\t"
225      "addq_s.ph    %[Filter2_r],    %[vp8_filter_r], %[t1]           \n\t"
226      "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
227      "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
228
229      "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
230      "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
231
232      "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
233      "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
234
235      /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
236      "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
237      "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
238
239      /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
240      "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
241      "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
242
243      : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
244        [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
245        [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
246        [vqs0_r] "+r"(vqs0_r)
247
248      : [t1] "r"(t1), [t2] "r"(t2), [vp8_filter_l] "r"(vp8_filter_l),
249        [vp8_filter_r] "r"(vp8_filter_r), [HWM] "r"(HWM));
250
251  __asm__ __volatile__(
      /* vp8_filter = (Filter1 + 1) >> 1 (addqh.ph adds and halves in one step) */
253      "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
254      "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
255
256      /* vp8_filter &= ~hev; */
257      "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
258      "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
259
260      /* vps1 = vp8_signed_char_clamp(ps1 + vp8_filter); */
261      "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
262      "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
263
264      /* vqs1 = vp8_signed_char_clamp(qs1 - vp8_filter); */
265      "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
266      "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
267
268      : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
269        [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
270        [vqs1_r] "+r"(vqs1_r)
271
272      : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
273
274  /* Create quad-bytes from halfword pairs */
275  vqs0_l = vqs0_l & HWM;
276  vqs1_l = vqs1_l & HWM;
277  vps0_l = vps0_l & HWM;
278  vps1_l = vps1_l & HWM;
279
280  __asm__ __volatile__(
281      "shrl.ph      %[vqs0_r],       %[vqs0_r],       8               \n\t"
282      "shrl.ph      %[vps0_r],       %[vps0_r],       8               \n\t"
283      "shrl.ph      %[vqs1_r],       %[vqs1_r],       8               \n\t"
284      "shrl.ph      %[vps1_r],       %[vps1_r],       8               \n\t"
285
286      : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
287        [vqs0_r] "+r"(vqs0_r)
288      :);
289
290  vqs0 = vqs0_l | vqs0_r;
291  vqs1 = vqs1_l | vqs1_r;
292  vps0 = vps0_l | vps0_r;
293  vps1 = vps1_l | vps1_r;
294
295  *ps0 = vps0 ^ N128;
296  *ps1 = vps1 ^ N128;
297  *qs0 = vqs0 ^ N128;
298  *qs1 = vqs1 ^ N128;
299}
300
301void vp8_loop_filter_horizontal_edge_mips(unsigned char *s, int p,
302                                          unsigned int flimit,
303                                          unsigned int limit,
304                                          unsigned int thresh, int count) {
305  uint32_t mask;
306  uint32_t hev;
307  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
308  unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
309  (void)count;
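  /* count is unused: this function always filters one 16-pixel macroblock
   * edge as four unrolled 4-pixel blocks. */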
310
311  mask = 0;
312  hev = 0;
313  p1 = 0;
314  p2 = 0;
315  p3 = 0;
316  p4 = 0;
317
318  /* prefetch data for store */
319  prefetch_store_lf(s);
320
321  /* loop filter designed to work using chars so that we can make maximum use
322   * of 8 bit simd instructions.
323   */
324
325  sm1 = s - (p << 2);
326  s0 = s - p - p - p;
327  s1 = s - p - p;
328  s2 = s - p;
329  s3 = s;
330  s4 = s + p;
331  s5 = s + p + p;
332  s6 = s + p + p + p;
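  /* sm1..s6 walk the eight rows around the edge at s, i.e. p3 p2 p1 p0 | q0
   * q1 q2 q3 in filter terms; only the four rows at s1..s4 (p1 p0 q0 q1) are
   * ever written back. */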
333
334  /* load quad-byte vectors
335   * memory is 4 byte aligned
336   */
337  p1 = *((uint32_t *)(s1));
338  p2 = *((uint32_t *)(s2));
339  p3 = *((uint32_t *)(s3));
340  p4 = *((uint32_t *)(s4));
341
342  /* if (p1 - p4 == 0) and (p2 - p3 == 0)
343   * mask will be zero and filtering is not needed
344   */
345  if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
346    pm1 = *((uint32_t *)(sm1));
347    p0 = *((uint32_t *)(s0));
348    p5 = *((uint32_t *)(s5));
349    p6 = *((uint32_t *)(s6));
350
351    vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
352                             thresh, &hev, &mask);
353
    /* if mask == 0, filtering is not needed */
355    if (mask) {
356      /* filtering */
357      vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
358
359      /* unpack processed 4x4 neighborhood */
360      *((uint32_t *)s1) = p1;
361      *((uint32_t *)s2) = p2;
362      *((uint32_t *)s3) = p3;
363      *((uint32_t *)s4) = p4;
364    }
365  }
366
367  sm1 += 4;
368  s0 += 4;
369  s1 += 4;
370  s2 += 4;
371  s3 += 4;
372  s4 += 4;
373  s5 += 4;
374  s6 += 4;
375
376  /* load quad-byte vectors
377   * memory is 4 byte aligned
378   */
379  p1 = *((uint32_t *)(s1));
380  p2 = *((uint32_t *)(s2));
381  p3 = *((uint32_t *)(s3));
382  p4 = *((uint32_t *)(s4));
383
384  /* if (p1 - p4 == 0) and (p2 - p3 == 0)
385   * mask will be zero and filtering is not needed
386   */
387  if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
388    pm1 = *((uint32_t *)(sm1));
389    p0 = *((uint32_t *)(s0));
390    p5 = *((uint32_t *)(s5));
391    p6 = *((uint32_t *)(s6));
392
393    vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
394                             thresh, &hev, &mask);
395
    /* if mask == 0, filtering is not needed */
397    if (mask) {
398      /* filtering */
399      vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
400
401      /* unpack processed 4x4 neighborhood */
402      *((uint32_t *)s1) = p1;
403      *((uint32_t *)s2) = p2;
404      *((uint32_t *)s3) = p3;
405      *((uint32_t *)s4) = p4;
406    }
407  }
408
409  sm1 += 4;
410  s0 += 4;
411  s1 += 4;
412  s2 += 4;
413  s3 += 4;
414  s4 += 4;
415  s5 += 4;
416  s6 += 4;
417
418  /* load quad-byte vectors
419   * memory is 4 byte aligned
420   */
421  p1 = *((uint32_t *)(s1));
422  p2 = *((uint32_t *)(s2));
423  p3 = *((uint32_t *)(s3));
424  p4 = *((uint32_t *)(s4));
425
426  /* if (p1 - p4 == 0) and (p2 - p3 == 0)
427   * mask will be zero and filtering is not needed
428   */
429  if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
430    pm1 = *((uint32_t *)(sm1));
431    p0 = *((uint32_t *)(s0));
432    p5 = *((uint32_t *)(s5));
433    p6 = *((uint32_t *)(s6));
434
435    vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
436                             thresh, &hev, &mask);
437
    /* if mask == 0, filtering is not needed */
439    if (mask) {
440      /* filtering */
441      vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
442
443      /* unpack processed 4x4 neighborhood */
444      *((uint32_t *)s1) = p1;
445      *((uint32_t *)s2) = p2;
446      *((uint32_t *)s3) = p3;
447      *((uint32_t *)s4) = p4;
448    }
449  }
450
451  sm1 += 4;
452  s0 += 4;
453  s1 += 4;
454  s2 += 4;
455  s3 += 4;
456  s4 += 4;
457  s5 += 4;
458  s6 += 4;
459
460  /* load quad-byte vectors
461   * memory is 4 byte aligned
462   */
463  p1 = *((uint32_t *)(s1));
464  p2 = *((uint32_t *)(s2));
465  p3 = *((uint32_t *)(s3));
466  p4 = *((uint32_t *)(s4));
467
468  /* if (p1 - p4 == 0) and (p2 - p3 == 0)
469   * mask will be zero and filtering is not needed
470   */
471  if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
472    pm1 = *((uint32_t *)(sm1));
473    p0 = *((uint32_t *)(s0));
474    p5 = *((uint32_t *)(s5));
475    p6 = *((uint32_t *)(s6));
476
477    vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
478                             thresh, &hev, &mask);
479
    /* if mask == 0, filtering is not needed */
481    if (mask) {
482      /* filtering */
483      vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
484
485      /* unpack processed 4x4 neighborhood */
486      *((uint32_t *)s1) = p1;
487      *((uint32_t *)s2) = p2;
488      *((uint32_t *)s3) = p3;
489      *((uint32_t *)s4) = p4;
490    }
491  }
492}
493
494void vp8_loop_filter_uvhorizontal_edge_mips(unsigned char *s, int p,
495                                            unsigned int flimit,
496                                            unsigned int limit,
497                                            unsigned int thresh, int count) {
498  uint32_t mask;
499  uint32_t hev;
500  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
501  unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
502  (void)count;
503
504  mask = 0;
505  hev = 0;
506  p1 = 0;
507  p2 = 0;
508  p3 = 0;
509  p4 = 0;
510
511  /* loop filter designed to work using chars so that we can make maximum use
512   * of 8 bit simd instructions.
513   */
514
515  sm1 = s - (p << 2);
516  s0 = s - p - p - p;
517  s1 = s - p - p;
518  s2 = s - p;
519  s3 = s;
520  s4 = s + p;
521  s5 = s + p + p;
522  s6 = s + p + p + p;
523
524  /* load quad-byte vectors
525   * memory is 4 byte aligned
526   */
527  p1 = *((uint32_t *)(s1));
528  p2 = *((uint32_t *)(s2));
529  p3 = *((uint32_t *)(s3));
530  p4 = *((uint32_t *)(s4));
531
532  /* if (p1 - p4 == 0) and (p2 - p3 == 0)
533   * mask will be zero and filtering is not needed
534   */
535  if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
536    pm1 = *((uint32_t *)(sm1));
537    p0 = *((uint32_t *)(s0));
538    p5 = *((uint32_t *)(s5));
539    p6 = *((uint32_t *)(s6));
540
541    vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
542                             thresh, &hev, &mask);
543
    /* if mask == 0, filtering is not needed */
545    if (mask) {
546      /* filtering */
547      vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
548
549      /* unpack processed 4x4 neighborhood */
550      *((uint32_t *)s1) = p1;
551      *((uint32_t *)s2) = p2;
552      *((uint32_t *)s3) = p3;
553      *((uint32_t *)s4) = p4;
554    }
555  }
556
557  sm1 += 4;
558  s0 += 4;
559  s1 += 4;
560  s2 += 4;
561  s3 += 4;
562  s4 += 4;
563  s5 += 4;
564  s6 += 4;
565
566  /* load quad-byte vectors
567   * memory is 4 byte aligned
568   */
569  p1 = *((uint32_t *)(s1));
570  p2 = *((uint32_t *)(s2));
571  p3 = *((uint32_t *)(s3));
572  p4 = *((uint32_t *)(s4));
573
574  /* if (p1 - p4 == 0) and (p2 - p3 == 0)
575   * mask will be zero and filtering is not needed
576   */
577  if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
578    pm1 = *((uint32_t *)(sm1));
579    p0 = *((uint32_t *)(s0));
580    p5 = *((uint32_t *)(s5));
581    p6 = *((uint32_t *)(s6));
582
583    vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
584                             thresh, &hev, &mask);
585
    /* if mask == 0, filtering is not needed */
587    if (mask) {
588      /* filtering */
589      vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
590
591      /* unpack processed 4x4 neighborhood */
592      *((uint32_t *)s1) = p1;
593      *((uint32_t *)s2) = p2;
594      *((uint32_t *)s3) = p3;
595      *((uint32_t *)s4) = p4;
596    }
597  }
598}
599
600void vp8_loop_filter_vertical_edge_mips(unsigned char *s, int p,
601                                        const unsigned int flimit,
602                                        const unsigned int limit,
603                                        const unsigned int thresh, int count) {
604  int i;
605  uint32_t mask, hev;
606  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
607  unsigned char *s1, *s2, *s3, *s4;
608  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
609
610  hev = 0;
611  mask = 0;
612  i = 0;
613  pm1 = 0;
614  p0 = 0;
615  p1 = 0;
616  p2 = 0;
617  p3 = 0;
618  p4 = 0;
619  p5 = 0;
620  p6 = 0;
621
622  /* loop filter designed to work using chars so that we can make maximum use
623   * of 8 bit simd instructions.
624   */
625
  /* apply filter on 4 pixels at a time */
627  do {
628    /* prefetch data for store */
629    prefetch_store_lf(s + p);
630
631    s1 = s;
632    s2 = s + p;
633    s3 = s2 + p;
634    s4 = s3 + p;
635    s = s4 + p;
636
637    /* load quad-byte vectors
638     * memory is 4 byte aligned
639     */
640    p2 = *((uint32_t *)(s1 - 4));
641    p6 = *((uint32_t *)(s1));
642    p1 = *((uint32_t *)(s2 - 4));
643    p5 = *((uint32_t *)(s2));
644    p0 = *((uint32_t *)(s3 - 4));
645    p4 = *((uint32_t *)(s3));
646    pm1 = *((uint32_t *)(s4 - 4));
647    p3 = *((uint32_t *)(s4));
648
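    /* The two asm blocks below transpose the 8x4 byte tile just loaded so
     * that each of pm1..p6 ends up holding one 4-pixel column, letting the
     * same quad-byte mask/filter helpers be reused for vertical edges. */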
649    /* transpose pm1, p0, p1, p2 */
650    __asm__ __volatile__(
651        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
652        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
653        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
654        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
655
656        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
657        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
658        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
659        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
660
661        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
662        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
663        "append         %[p1],      %[sec3],    16          \n\t"
664        "append         %[pm1],     %[sec4],    16          \n\t"
665
666        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
667          [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
668          [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
669        :);
670
671    /* transpose p3, p4, p5, p6 */
672    __asm__ __volatile__(
673        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
674        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
675        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
676        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
677
678        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
679        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
680        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
681        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
682
683        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
684        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
685        "append         %[p5],      %[sec3],    16          \n\t"
686        "append         %[p3],      %[sec4],    16          \n\t"
687
688        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
689          [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
690          [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
691        :);
692
693    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
694     * mask will be zero and filtering is not needed
695     */
696    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
697      vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
698                               thresh, &hev, &mask);
699
      /* if mask == 0, filtering is not needed */
701      if (mask) {
702        /* filtering */
703        vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
704
705        /* unpack processed 4x4 neighborhood
706         * don't use transpose on output data
707         * because memory isn't aligned
708         */
709        __asm__ __volatile__(
710            "sb         %[p4],  1(%[s4])    \n\t"
711            "sb         %[p3],  0(%[s4])    \n\t"
712            "sb         %[p2], -1(%[s4])    \n\t"
713            "sb         %[p1], -2(%[s4])    \n\t"
714            :
715            : [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2),
716              [p1] "r"(p1));
717
718        __asm__ __volatile__(
719            "srl        %[p4], %[p4], 8     \n\t"
720            "srl        %[p3], %[p3], 8     \n\t"
721            "srl        %[p2], %[p2], 8     \n\t"
722            "srl        %[p1], %[p1], 8     \n\t"
723            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
724            :);
725
726        __asm__ __volatile__(
727            "sb         %[p4],  1(%[s3])    \n\t"
728            "sb         %[p3],  0(%[s3])    \n\t"
729            "sb         %[p2], -1(%[s3])    \n\t"
730            "sb         %[p1], -2(%[s3])    \n\t"
731            : [p1] "+r"(p1)
732            : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
733
734        __asm__ __volatile__(
735            "srl        %[p4], %[p4], 8     \n\t"
736            "srl        %[p3], %[p3], 8     \n\t"
737            "srl        %[p2], %[p2], 8     \n\t"
738            "srl        %[p1], %[p1], 8     \n\t"
739            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
740            :);
741
742        __asm__ __volatile__(
743            "sb         %[p4],  1(%[s2])    \n\t"
744            "sb         %[p3],  0(%[s2])    \n\t"
745            "sb         %[p2], -1(%[s2])    \n\t"
746            "sb         %[p1], -2(%[s2])    \n\t"
747            :
748            : [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2),
749              [p1] "r"(p1));
750
751        __asm__ __volatile__(
752            "srl        %[p4], %[p4], 8     \n\t"
753            "srl        %[p3], %[p3], 8     \n\t"
754            "srl        %[p2], %[p2], 8     \n\t"
755            "srl        %[p1], %[p1], 8     \n\t"
756            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
757            :);
758
759        __asm__ __volatile__(
760            "sb         %[p4],  1(%[s1])    \n\t"
761            "sb         %[p3],  0(%[s1])    \n\t"
762            "sb         %[p2], -1(%[s1])    \n\t"
763            "sb         %[p1], -2(%[s1])    \n\t"
764            :
765            : [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2),
766              [p1] "r"(p1));
767      }
768    }
769
770    s1 = s;
771    s2 = s + p;
772    s3 = s2 + p;
773    s4 = s3 + p;
774    s = s4 + p;
775
776    /* load quad-byte vectors
777     * memory is 4 byte aligned
778     */
779    p2 = *((uint32_t *)(s1 - 4));
780    p6 = *((uint32_t *)(s1));
781    p1 = *((uint32_t *)(s2 - 4));
782    p5 = *((uint32_t *)(s2));
783    p0 = *((uint32_t *)(s3 - 4));
784    p4 = *((uint32_t *)(s3));
785    pm1 = *((uint32_t *)(s4 - 4));
786    p3 = *((uint32_t *)(s4));
787
788    /* transpose pm1, p0, p1, p2 */
789    __asm__ __volatile__(
790        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
791        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
792        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
793        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
794
795        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
796        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
797        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
798        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
799
800        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
801        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
802        "append         %[p1],      %[sec3],    16          \n\t"
803        "append         %[pm1],     %[sec4],    16          \n\t"
804
805        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
806          [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
807          [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
808        :);
809
810    /* transpose p3, p4, p5, p6 */
811    __asm__ __volatile__(
812        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
813        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
814        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
815        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
816
817        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
818        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
819        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
820        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
821
822        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
823        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
824        "append         %[p5],      %[sec3],    16          \n\t"
825        "append         %[p3],      %[sec4],    16          \n\t"
826
827        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
828          [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
829          [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
830        :);
831
832    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
833     * mask will be zero and filtering is not needed
834     */
835    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
836      vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
837                               thresh, &hev, &mask);
838
      /* if mask == 0, filtering is not needed */
840      if (mask) {
841        /* filtering */
842        vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
843
844        /* unpack processed 4x4 neighborhood
845         * don't use transpose on output data
846         * because memory isn't aligned
847         */
848        __asm__ __volatile__(
849            "sb         %[p4],  1(%[s4])    \n\t"
850            "sb         %[p3],  0(%[s4])    \n\t"
851            "sb         %[p2], -1(%[s4])    \n\t"
852            "sb         %[p1], -2(%[s4])    \n\t"
853            :
854            : [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2),
855              [p1] "r"(p1));
856
857        __asm__ __volatile__(
858            "srl        %[p4], %[p4], 8     \n\t"
859            "srl        %[p3], %[p3], 8     \n\t"
860            "srl        %[p2], %[p2], 8     \n\t"
861            "srl        %[p1], %[p1], 8     \n\t"
862            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
863            :);
864
865        __asm__ __volatile__(
866            "sb         %[p4],  1(%[s3])    \n\t"
867            "sb         %[p3],  0(%[s3])    \n\t"
868            "sb         %[p2], -1(%[s3])    \n\t"
869            "sb         %[p1], -2(%[s3])    \n\t"
870            : [p1] "+r"(p1)
871            : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
872
873        __asm__ __volatile__(
874            "srl        %[p4], %[p4], 8     \n\t"
875            "srl        %[p3], %[p3], 8     \n\t"
876            "srl        %[p2], %[p2], 8     \n\t"
877            "srl        %[p1], %[p1], 8     \n\t"
878            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
879            :);
880
881        __asm__ __volatile__(
882            "sb         %[p4],  1(%[s2])    \n\t"
883            "sb         %[p3],  0(%[s2])    \n\t"
884            "sb         %[p2], -1(%[s2])    \n\t"
885            "sb         %[p1], -2(%[s2])    \n\t"
886            :
887            : [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2),
888              [p1] "r"(p1));
889
890        __asm__ __volatile__(
891            "srl        %[p4], %[p4], 8     \n\t"
892            "srl        %[p3], %[p3], 8     \n\t"
893            "srl        %[p2], %[p2], 8     \n\t"
894            "srl        %[p1], %[p1], 8     \n\t"
895            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
896            :);
897
898        __asm__ __volatile__(
899            "sb         %[p4],  1(%[s1])    \n\t"
900            "sb         %[p3],  0(%[s1])    \n\t"
901            "sb         %[p2], -1(%[s1])    \n\t"
902            "sb         %[p1], -2(%[s1])    \n\t"
903            :
904            : [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2),
905              [p1] "r"(p1));
906      }
907    }
908
909    i += 8;
  } while (i < count);
913}
914
915void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p,
916                                          unsigned int flimit,
917                                          unsigned int limit,
918                                          unsigned int thresh, int count) {
919  uint32_t mask, hev;
920  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
921  unsigned char *s1, *s2, *s3, *s4;
922  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
923  (void)count;
924
925  /* loop filter designed to work using chars so that we can make maximum use
926   * of 8 bit simd instructions.
927   */
928
  /* apply filter on 4 pixels at a time */
930
931  s1 = s;
932  s2 = s + p;
933  s3 = s2 + p;
934  s4 = s3 + p;
935
  /* load quad-byte vectors
   * memory is 4 byte aligned
   */
939  p2 = *((uint32_t *)(s1 - 4));
940  p6 = *((uint32_t *)(s1));
941  p1 = *((uint32_t *)(s2 - 4));
942  p5 = *((uint32_t *)(s2));
943  p0 = *((uint32_t *)(s3 - 4));
944  p4 = *((uint32_t *)(s3));
945  pm1 = *((uint32_t *)(s4 - 4));
946  p3 = *((uint32_t *)(s4));
947
948  /* transpose pm1, p0, p1, p2 */
949  __asm__ __volatile__(
950      "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
951      "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
952      "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
953      "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
954
955      "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
956      "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
957      "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
958      "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
959
960      "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
961      "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
962      "append         %[p1],      %[sec3],    16          \n\t"
963      "append         %[pm1],     %[sec4],    16          \n\t"
964
965      : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
966        [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
967        [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
968      :);
969
970  /* transpose p3, p4, p5, p6 */
971  __asm__ __volatile__(
972      "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
973      "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
974      "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
975      "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
976
977      "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
978      "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
979      "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
980      "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
981
982      "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
983      "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
984      "append         %[p5],      %[sec3],    16          \n\t"
985      "append         %[p3],      %[sec4],    16          \n\t"
986
987      : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
988        [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
989        [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
990      :);
991
  /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   * mask will be zero and filtering is not needed
   */
995  if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
996    vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
997                             thresh, &hev, &mask);
998
    /* if mask == 0, filtering is not needed */
1000    if (mask) {
1001      /* filtering */
1002      vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
1003
1004      /* unpack processed 4x4 neighborhood
1005       * don't use transpose on output data
1006       * because memory isn't aligned
1007       */
1008      __asm__ __volatile__(
1009          "sb         %[p4],  1(%[s4])    \n\t"
1010          "sb         %[p3],  0(%[s4])    \n\t"
1011          "sb         %[p2], -1(%[s4])    \n\t"
1012          "sb         %[p1], -2(%[s4])    \n\t"
1013          :
1014          :
1015          [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), [p1] "r"(p1));
1016
1017      __asm__ __volatile__(
1018          "srl        %[p4], %[p4], 8     \n\t"
1019          "srl        %[p3], %[p3], 8     \n\t"
1020          "srl        %[p2], %[p2], 8     \n\t"
1021          "srl        %[p1], %[p1], 8     \n\t"
1022          : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1023          :);
1024
1025      __asm__ __volatile__(
1026          "sb         %[p4],  1(%[s3])    \n\t"
1027          "sb         %[p3],  0(%[s3])    \n\t"
1028          "sb         %[p2], -1(%[s3])    \n\t"
1029          "sb         %[p1], -2(%[s3])    \n\t"
1030          : [p1] "+r"(p1)
1031          : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
1032
1033      __asm__ __volatile__(
1034          "srl        %[p4], %[p4], 8     \n\t"
1035          "srl        %[p3], %[p3], 8     \n\t"
1036          "srl        %[p2], %[p2], 8     \n\t"
1037          "srl        %[p1], %[p1], 8     \n\t"
1038          : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1039          :);
1040
1041      __asm__ __volatile__(
1042          "sb         %[p4],  1(%[s2])    \n\t"
1043          "sb         %[p3],  0(%[s2])    \n\t"
1044          "sb         %[p2], -1(%[s2])    \n\t"
1045          "sb         %[p1], -2(%[s2])    \n\t"
1046          :
1047          :
1048          [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), [p1] "r"(p1));
1049
1050      __asm__ __volatile__(
1051          "srl        %[p4], %[p4], 8     \n\t"
1052          "srl        %[p3], %[p3], 8     \n\t"
1053          "srl        %[p2], %[p2], 8     \n\t"
1054          "srl        %[p1], %[p1], 8     \n\t"
1055          : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1056          :);
1057
1058      __asm__ __volatile__(
1059          "sb         %[p4],  1(%[s1])    \n\t"
1060          "sb         %[p3],  0(%[s1])    \n\t"
1061          "sb         %[p2], -1(%[s1])    \n\t"
1062          "sb         %[p1], -2(%[s1])    \n\t"
1063          :
1064          :
1065          [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), [p1] "r"(p1));
1066    }
1067  }
1068
1069  s1 = s4 + p;
1070  s2 = s1 + p;
1071  s3 = s2 + p;
1072  s4 = s3 + p;
1073
1074  /* load quad-byte vectors
1075   * memory is 4 byte aligned
1076   */
1077  p2 = *((uint32_t *)(s1 - 4));
1078  p6 = *((uint32_t *)(s1));
1079  p1 = *((uint32_t *)(s2 - 4));
1080  p5 = *((uint32_t *)(s2));
1081  p0 = *((uint32_t *)(s3 - 4));
1082  p4 = *((uint32_t *)(s3));
1083  pm1 = *((uint32_t *)(s4 - 4));
1084  p3 = *((uint32_t *)(s4));
1085
1086  /* transpose pm1, p0, p1, p2 */
1087  __asm__ __volatile__(
1088      "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
1089      "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
1090      "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
1091      "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
1092
1093      "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
1094      "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
1095      "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
1096      "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
1097
1098      "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
1099      "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
1100      "append         %[p1],      %[sec3],    16          \n\t"
1101      "append         %[pm1],     %[sec4],    16          \n\t"
1102
1103      : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
1104        [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
1105        [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
1106      :);
1107
1108  /* transpose p3, p4, p5, p6 */
1109  __asm__ __volatile__(
1110      "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
1111      "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
1112      "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
1113      "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
1114
1115      "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
1116      "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
1117      "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
1118      "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
1119
1120      "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
1121      "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
1122      "append         %[p5],      %[sec3],    16          \n\t"
1123      "append         %[p3],      %[sec4],    16          \n\t"
1124
1125      : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
1126        [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
1127        [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
1128      :);
1129
1130  /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1131   * mask will be zero and filtering is not needed
1132   */
1133  if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
1134    vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1135                             thresh, &hev, &mask);
1136
    /* if mask == 0, filtering is not needed */
1138    if (mask) {
1139      /* filtering */
1140      vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
1141
1142      /* unpack processed 4x4 neighborhood
1143       * don't use transpose on output data
1144       * because memory isn't aligned
1145       */
1146      __asm__ __volatile__(
1147          "sb         %[p4],  1(%[s4])    \n\t"
1148          "sb         %[p3],  0(%[s4])    \n\t"
1149          "sb         %[p2], -1(%[s4])    \n\t"
1150          "sb         %[p1], -2(%[s4])    \n\t"
1151          :
1152          :
1153          [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), [p1] "r"(p1));
1154
1155      __asm__ __volatile__(
1156          "srl        %[p4], %[p4], 8     \n\t"
1157          "srl        %[p3], %[p3], 8     \n\t"
1158          "srl        %[p2], %[p2], 8     \n\t"
1159          "srl        %[p1], %[p1], 8     \n\t"
1160          : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1161          :);
1162
1163      __asm__ __volatile__(
1164          "sb         %[p4],  1(%[s3])    \n\t"
1165          "sb         %[p3],  0(%[s3])    \n\t"
1166          "sb         %[p2], -1(%[s3])    \n\t"
1167          "sb         %[p1], -2(%[s3])    \n\t"
1168          : [p1] "+r"(p1)
1169          : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
1170
1171      __asm__ __volatile__(
1172          "srl        %[p4], %[p4], 8     \n\t"
1173          "srl        %[p3], %[p3], 8     \n\t"
1174          "srl        %[p2], %[p2], 8     \n\t"
1175          "srl        %[p1], %[p1], 8     \n\t"
1176          : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1177          :);
1178
1179      __asm__ __volatile__(
1180          "sb         %[p4],  1(%[s2])    \n\t"
1181          "sb         %[p3],  0(%[s2])    \n\t"
1182          "sb         %[p2], -1(%[s2])    \n\t"
1183          "sb         %[p1], -2(%[s2])    \n\t"
1184          :
1185          :
1186          [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), [p1] "r"(p1));
1187
1188      __asm__ __volatile__(
1189          "srl        %[p4], %[p4], 8     \n\t"
1190          "srl        %[p3], %[p3], 8     \n\t"
1191          "srl        %[p2], %[p2], 8     \n\t"
1192          "srl        %[p1], %[p1], 8     \n\t"
1193          : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1194          :);
1195
1196      __asm__ __volatile__(
1197          "sb         %[p4],  1(%[s1])    \n\t"
1198          "sb         %[p3],  0(%[s1])    \n\t"
1199          "sb         %[p2], -1(%[s1])    \n\t"
1200          "sb         %[p1], -2(%[s1])    \n\t"
1201          :
1202          :
1203          [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), [p1] "r"(p1));
1204    }
1205  }
1206}
1207
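/* For reference, the scalar macroblock (wide) filter vectorized below is
 * roughly (illustrative sketch):
 *
 *   vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
 *   vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)) & mask;
 *   Filter2    = vp8_filter & hev;
 *   Filter1    = vp8_signed_char_clamp(Filter2 + 4) >> 3;
 *   Filter2    = vp8_signed_char_clamp(Filter2 + 3) >> 3;
 *   qs0 = vp8_signed_char_clamp(qs0 - Filter1);
 *   ps0 = vp8_signed_char_clamp(ps0 + Filter2);
 *   Filter2    = vp8_filter & ~hev;
 *   u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);  qs0 -= u; ps0 += u;
 *   u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);  qs1 -= u; ps1 += u;
 *   u = vp8_signed_char_clamp((63 + Filter2 *  9) >> 7);  qs2 -= u; ps2 += u;
 *
 * where each +/- in the last three lines is itself saturating, and samples
 * are converted to signed form (^ 0x80) on entry and back on exit.
 */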
1208/* inputs & outputs are quad-byte vectors */
1209static __inline void vp8_mbfilter_mips(uint32_t mask, uint32_t hev,
1210                                       uint32_t *ps2, uint32_t *ps1,
1211                                       uint32_t *ps0, uint32_t *qs0,
1212                                       uint32_t *qs1, uint32_t *qs2) {
1213  int32_t vps2, vps1, vps0, vqs0, vqs1, vqs2;
1214  int32_t vps2_l, vps1_l, vps0_l, vqs0_l, vqs1_l, vqs2_l;
1215  int32_t vps2_r, vps1_r, vps0_r, vqs0_r, vqs1_r, vqs2_r;
1216  uint32_t HWM, vp8_filter_l, vp8_filter_r, mask_l, mask_r, hev_l, hev_r,
1217      subr_r, subr_l;
1218  uint32_t Filter2_l, Filter2_r, t1, t2, Filter1_l, Filter1_r, invhev_l,
1219      invhev_r;
1220  uint32_t N128, R63;
1221  uint32_t u1_l, u1_r, u2_l, u2_r, u3_l, u3_r;
1222
1223  R63 = 0x003F003F;
1224  HWM = 0xFF00FF00;
1225  N128 = 0x80808080;
1226  t1 = 0x03000300;
1227  t2 = 0x04000400;
1228
1229  vps0 = (*ps0) ^ N128;
1230  vps1 = (*ps1) ^ N128;
1231  vps2 = (*ps2) ^ N128;
1232  vqs0 = (*qs0) ^ N128;
1233  vqs1 = (*qs1) ^ N128;
1234  vqs2 = (*qs2) ^ N128;
1235
  /* use halfword pairs instead of quad-bytes for better accuracy */
1237  vps0_l = vps0 & HWM;
1238  vps0_r = vps0 << 8;
1239  vps0_r = vps0_r & HWM;
1240
1241  vqs0_l = vqs0 & HWM;
1242  vqs0_r = vqs0 << 8;
1243  vqs0_r = vqs0_r & HWM;
1244
1245  vps1_l = vps1 & HWM;
1246  vps1_r = vps1 << 8;
1247  vps1_r = vps1_r & HWM;
1248
1249  vqs1_l = vqs1 & HWM;
1250  vqs1_r = vqs1 << 8;
1251  vqs1_r = vqs1_r & HWM;
1252
1253  vqs2_l = vqs2 & HWM;
1254  vqs2_r = vqs2 << 8;
1255  vqs2_r = vqs2_r & HWM;
1256
1257  __asm__ __volatile__(
1258      /* qs0 - ps0 */
1259      "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
1260      "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
1261
1262      /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
1263      "subq_s.ph    %[vp8_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
1264      "subq_s.ph    %[vp8_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
1265
1266      : [vp8_filter_l] "=&r"(vp8_filter_l), [vp8_filter_r] "=r"(vp8_filter_r),
1267        [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r)
1268      : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
1269        [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
1270        [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r));
1271
1272  vps2_l = vps2 & HWM;
1273  vps2_r = vps2 << 8;
1274  vps2_r = vps2_r & HWM;
1275
1276  /* add outer taps if we have high edge variance */
1277  __asm__ __volatile__(
1278      /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
1279      "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
1280      "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
1281      "and          %[mask_l],       %[HWM],          %[mask]         \n\t"
1282      "sll          %[mask_r],       %[mask],         8               \n\t"
1283      "and          %[mask_r],       %[HWM],          %[mask_r]       \n\t"
1284      "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
1285      "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
1286      "and          %[hev_l],        %[HWM],          %[hev]          \n\t"
1287      "sll          %[hev_r],        %[hev],          8               \n\t"
1288      "and          %[hev_r],        %[HWM],          %[hev_r]        \n\t"
1289      "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
1290      "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
1291
1292      /* vp8_filter &= mask; */
1293      "and          %[vp8_filter_l], %[vp8_filter_l], %[mask_l]       \n\t"
1294      "and          %[vp8_filter_r], %[vp8_filter_r], %[mask_r]       \n\t"
1295
1296      /* Filter2 = vp8_filter & hev; */
1297      "and          %[Filter2_l],    %[vp8_filter_l], %[hev_l]        \n\t"
1298      "and          %[Filter2_r],    %[vp8_filter_r], %[hev_r]        \n\t"
1299
1300      : [vp8_filter_l] "+r"(vp8_filter_l), [vp8_filter_r] "+r"(vp8_filter_r),
1301        [hev_l] "=&r"(hev_l), [hev_r] "=&r"(hev_r), [mask_l] "=&r"(mask_l),
1302        [mask_r] "=&r"(mask_r), [Filter2_l] "=&r"(Filter2_l),
1303        [Filter2_r] "=&r"(Filter2_r)
1304      : [subr_l] "r"(subr_l), [subr_r] "r"(subr_r), [HWM] "r"(HWM),
1305        [hev] "r"(hev), [mask] "r"(mask));
1306
1307  /* save bottom 3 bits so that we round one side +4 and the other +3 */
1308  __asm__ __volatile__(
1309      /* Filter1 = vp8_signed_char_clamp(Filter2 + 4) >>= 3; */
1310      "addq_s.ph    %[Filter1_l],    %[Filter2_l],    %[t2]           \n\t"
1311      "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
1312      "addq_s.ph    %[Filter1_r],    %[Filter2_r],    %[t2]           \n\t"
1313
1314      /* Filter2 = vp8_signed_char_clamp(Filter2 + 3) >>= 3; */
1315      "addq_s.ph    %[Filter2_l],    %[Filter2_l],    %[t1]           \n\t"
1316      "addq_s.ph    %[Filter2_r],    %[Filter2_r],    %[t1]           \n\t"
1317
1318      "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
1319      "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
1320
1321      "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
1322      "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
1323      "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
1324      "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
1325      "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
1326
1327      /* qs0 = vp8_signed_char_clamp(qs0 - Filter1); */
1328      "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
1329      "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
1330
1331      /* ps0 = vp8_signed_char_clamp(ps0 + Filter2); */
1332      "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
1333      "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
1334
1335      : [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r),
1336        [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
1337        [Filter2_l] "+r"(Filter2_l), [Filter2_r] "+r"(Filter2_r),
1338        [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
1339        [vqs0_r] "+r"(vqs0_r)
1340      : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), [hev_l] "r"(hev_l),
1341        [hev_r] "r"(hev_r));
1342
1343  /* only apply wider filter if not high edge variance */
1344  __asm__ __volatile__(
1345      /* vp8_filter &= ~hev; */
1346      "and          %[Filter2_l],    %[vp8_filter_l], %[invhev_l]     \n\t"
1347      "and          %[Filter2_r],    %[vp8_filter_r], %[invhev_r]     \n\t"
1348
1349      "shra.ph      %[Filter2_l],    %[Filter2_l],    8               \n\t"
1350      "shra.ph      %[Filter2_r],    %[Filter2_r],    8               \n\t"
1351
1352      : [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r)
1353      : [vp8_filter_l] "r"(vp8_filter_l), [vp8_filter_r] "r"(vp8_filter_r),
1354        [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
1355
1356  /* roughly 3/7th difference across boundary */
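  /* The block below builds u3 = 9 * Filter2 (8x + x), u2 = 18 * Filter2
   * (2 * u3) and u1 = 27 * Filter2 (u3 + u2), adds the +63 rounding bias
   * (R63) to each and shifts right by 7 (u3 is shifted in the 1/7th block
   * further down), i.e. (63 + Filter2 * k) >> 7 for k = 27, 18, 9; the
   * saturating addq_s/subq_s updates provide the clamping. */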
1357  __asm__ __volatile__(
1358      "shll.ph      %[u3_l],         %[Filter2_l],    3               \n\t"
1359      "shll.ph      %[u3_r],         %[Filter2_r],    3               \n\t"
1360
1361      "addq.ph      %[u3_l],         %[u3_l],         %[Filter2_l]    \n\t"
1362      "addq.ph      %[u3_r],         %[u3_r],         %[Filter2_r]    \n\t"
1363
1364      "shll.ph      %[u2_l],         %[u3_l],         1               \n\t"
1365      "shll.ph      %[u2_r],         %[u3_r],         1               \n\t"
1366
1367      "addq.ph      %[u1_l],         %[u3_l],         %[u2_l]         \n\t"
1368      "addq.ph      %[u1_r],         %[u3_r],         %[u2_r]         \n\t"
1369
1370      "addq.ph      %[u2_l],         %[u2_l],         %[R63]          \n\t"
1371      "addq.ph      %[u2_r],         %[u2_r],         %[R63]          \n\t"
1372
1373      "addq.ph      %[u3_l],         %[u3_l],         %[R63]          \n\t"
1374      "addq.ph      %[u3_r],         %[u3_r],         %[R63]          \n\t"
1375
1376      /* vp8_signed_char_clamp((63 + Filter2 * 27) >> 7)
1377       * vp8_signed_char_clamp((63 + Filter2 * 18) >> 7)
1378       */
1379      "addq.ph      %[u1_l],         %[u1_l],         %[R63]          \n\t"
1380      "addq.ph      %[u1_r],         %[u1_r],         %[R63]          \n\t"
1381      "shra.ph      %[u1_l],         %[u1_l],         7               \n\t"
1382      "shra.ph      %[u1_r],         %[u1_r],         7               \n\t"
1383      "shra.ph      %[u2_l],         %[u2_l],         7               \n\t"
1384      "shra.ph      %[u2_r],         %[u2_r],         7               \n\t"
1385      "shll.ph      %[u1_l],         %[u1_l],         8               \n\t"
1386      "shll.ph      %[u1_r],         %[u1_r],         8               \n\t"
1387      "shll.ph      %[u2_l],         %[u2_l],         8               \n\t"
1388      "shll.ph      %[u2_r],         %[u2_r],         8               \n\t"
1389
1390      /* vqs0 = vp8_signed_char_clamp(qs0 - u); */
1391      "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[u1_l]         \n\t"
1392      "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[u1_r]         \n\t"
1393
1394      /* vps0 = vp8_signed_char_clamp(ps0 + u); */
1395      "addq_s.ph    %[vps0_l],       %[vps0_l],       %[u1_l]         \n\t"
1396      "addq_s.ph    %[vps0_r],       %[vps0_r],       %[u1_r]         \n\t"
1397
1398      : [u1_l] "=&r"(u1_l), [u1_r] "=&r"(u1_r), [u2_l] "=&r"(u2_l),
1399        [u2_r] "=&r"(u2_r), [u3_l] "=&r"(u3_l), [u3_r] "=&r"(u3_r),
1400        [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
1401        [vqs0_r] "+r"(vqs0_r)
1402      : [R63] "r"(R63), [Filter2_l] "r"(Filter2_l), [Filter2_r] "r"(Filter2_r));
1403
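  /* roughly 2/7th difference across boundary */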
1404  __asm__ __volatile__(
1405      /* vqs1 = vp8_signed_char_clamp(qs1 - u); */
1406      "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[u2_l]         \n\t"
1407      "addq_s.ph    %[vps1_l],       %[vps1_l],       %[u2_l]         \n\t"
1408
1409      /* vps1 = vp8_signed_char_clamp(ps1 + u); */
1410      "addq_s.ph    %[vps1_r],       %[vps1_r],       %[u2_r]         \n\t"
1411      "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[u2_r]         \n\t"
1412
1413      : [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
1414        [vqs1_r] "+r"(vqs1_r)
1415      : [u2_l] "r"(u2_l), [u2_r] "r"(u2_r));
1416
1417  /* roughly 1/7th difference across boundary */
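  /* u3 already holds 63 + Filter2 * 9 from the block above; only the
   * arithmetic shift right by 7 and the byte-aligning shift left by 8 remain.
   */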
  __asm__ __volatile__(
      /* u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); */
      "shra.ph      %[u3_l],         %[u3_l],         7               \n\t"
      "shra.ph      %[u3_r],         %[u3_r],         7               \n\t"
      "shll.ph      %[u3_l],         %[u3_l],         8               \n\t"
      "shll.ph      %[u3_r],         %[u3_r],         8               \n\t"

      /* vqs2 = vp8_signed_char_clamp(qs2 - u); */
      "subq_s.ph    %[vqs2_l],       %[vqs2_l],       %[u3_l]         \n\t"
      "subq_s.ph    %[vqs2_r],       %[vqs2_r],       %[u3_r]         \n\t"

      /* vps2 = vp8_signed_char_clamp(ps2 + u); */
      "addq_s.ph    %[vps2_l],       %[vps2_l],       %[u3_l]         \n\t"
      "addq_s.ph    %[vps2_r],       %[vps2_r],       %[u3_r]         \n\t"

      : [u3_l] "+r"(u3_l), [u3_r] "+r"(u3_r), [vps2_l] "+r"(vps2_l),
        [vps2_r] "+r"(vps2_r), [vqs2_l] "+r"(vqs2_l), [vqs2_r] "+r"(vqs2_r)
      :);

  /* Create quad-bytes from halfword pairs */
  __asm__ __volatile__(
      "and          %[vqs0_l],       %[vqs0_l],       %[HWM]          \n\t"
      "shrl.ph      %[vqs0_r],       %[vqs0_r],       8               \n\t"

      "and          %[vps0_l],       %[vps0_l],       %[HWM]          \n\t"
      "shrl.ph      %[vps0_r],       %[vps0_r],       8               \n\t"

      "and          %[vqs1_l],       %[vqs1_l],       %[HWM]          \n\t"
      "shrl.ph      %[vqs1_r],       %[vqs1_r],       8               \n\t"

      "and          %[vps1_l],       %[vps1_l],       %[HWM]          \n\t"
      "shrl.ph      %[vps1_r],       %[vps1_r],       8               \n\t"

      "and          %[vqs2_l],       %[vqs2_l],       %[HWM]          \n\t"
      "shrl.ph      %[vqs2_r],       %[vqs2_r],       8               \n\t"

      "and          %[vps2_l],       %[vps2_l],       %[HWM]          \n\t"
      "shrl.ph      %[vps2_r],       %[vps2_r],       8               \n\t"

      "or           %[vqs0_r],       %[vqs0_l],       %[vqs0_r]       \n\t"
      "or           %[vps0_r],       %[vps0_l],       %[vps0_r]       \n\t"
      "or           %[vqs1_r],       %[vqs1_l],       %[vqs1_r]       \n\t"
      "or           %[vps1_r],       %[vps1_l],       %[vps1_r]       \n\t"
      "or           %[vqs2_r],       %[vqs2_l],       %[vqs2_r]       \n\t"
      "or           %[vps2_r],       %[vps2_l],       %[vps2_r]       \n\t"

      : [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
        [vqs1_r] "+r"(vqs1_r), [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r),
        [vqs0_l] "+r"(vqs0_l), [vqs0_r] "+r"(vqs0_r), [vqs2_l] "+r"(vqs2_l),
        [vqs2_r] "+r"(vqs2_r), [vps2_r] "+r"(vps2_r), [vps2_l] "+r"(vps2_l)
      : [HWM] "r"(HWM));

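  /* The XOR with N128 (set up earlier in this function) converts the filtered
   * values back from the signed domain used by the filter math to unsigned
   * pixel values.
   */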
  *ps0 = vps0_r ^ N128;
  *ps1 = vps1_r ^ N128;
  *ps2 = vps2_r ^ N128;
  *qs0 = vqs0_r ^ N128;
  *qs1 = vqs1_r ^ N128;
  *qs2 = vqs2_r ^ N128;
}

void vp8_mbloop_filter_horizontal_edge_mips(unsigned char *s, int p,
                                            unsigned int flimit,
                                            unsigned int limit,
                                            unsigned int thresh, int count) {
  int i;
  uint32_t mask, hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;

  mask = 0;
  hev = 0;
  i = 0;
  p1 = 0;
  p2 = 0;
  p3 = 0;
  p4 = 0;

  /* the loop filter is designed to work on chars so that we can make maximum
   * use of 8-bit SIMD instructions.
   */

  sm1 = s - (p << 2);
  s0 = s - p - p - p;
  s1 = s - p - p;
  s2 = s - p;
  s3 = s;
  s4 = s + p;
  s5 = s + p + p;
  s6 = s + p + p + p;
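  /* sm1 .. s6 point at the eight consecutive rows straddling the edge:
   * sm1 = s - 4 * p is the topmost row and s6 = s + 3 * p the bottom one,
   * so the edge being filtered lies between s2 (s - p) and s3 (s).
   */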

  /* prefetch data for load */
  prefetch_load_lf(s + p);

  /* apply filter on 4 pixels at the same time */
  do {
    /* load quad-byte vectors
     * memory is 4 byte aligned
     */
    p1 = *((uint32_t *)(s1));
    p2 = *((uint32_t *)(s2));
    p3 = *((uint32_t *)(s3));
    p4 = *((uint32_t *)(s4));

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     * mask will be zero and filtering is not needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      pm1 = *((uint32_t *)(sm1));
      p0 = *((uint32_t *)(s0));
      p5 = *((uint32_t *)(s5));
      p6 = *((uint32_t *)(s6));

      vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                               thresh, &hev, &mask);

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);

        /* unpack processed 4x4 neighborhood
         * memory is 4 byte aligned
         */
        *((uint32_t *)s0) = p0;
        *((uint32_t *)s1) = p1;
        *((uint32_t *)s2) = p2;
        *((uint32_t *)s3) = p3;
        *((uint32_t *)s4) = p4;
        *((uint32_t *)s5) = p5;
      }
    }

    sm1 += 4;
    s0 += 4;
    s1 += 4;
    s2 += 4;
    s3 += 4;
    s4 += 4;
    s5 += 4;
    s6 += 4;

    /* load quad-byte vectors
     * memory is 4 byte aligned
     */
    p1 = *((uint32_t *)(s1));
    p2 = *((uint32_t *)(s2));
    p3 = *((uint32_t *)(s3));
    p4 = *((uint32_t *)(s4));

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     * mask will be zero and filtering is not needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      pm1 = *((uint32_t *)(sm1));
      p0 = *((uint32_t *)(s0));
      p5 = *((uint32_t *)(s5));
      p6 = *((uint32_t *)(s6));

      vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                               thresh, &hev, &mask);

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);

        /* unpack processed 4x4 neighborhood
         * memory is 4 byte aligned
         */
        *((uint32_t *)s0) = p0;
        *((uint32_t *)s1) = p1;
        *((uint32_t *)s2) = p2;
        *((uint32_t *)s3) = p3;
        *((uint32_t *)s4) = p4;
        *((uint32_t *)s5) = p5;
      }
    }

    sm1 += 4;
    s0 += 4;
    s1 += 4;
    s2 += 4;
    s3 += 4;
    s4 += 4;
    s5 += 4;
    s6 += 4;

    i += 8;
  } while (i < count);
}

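/* Chroma variant of the horizontal MB edge filter.  The U and V planes are
 * only 8 pixels wide here, so the two 4-pixel groups are handled straight
 * through instead of looping and the count argument is ignored.
 */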
void vp8_mbloop_filter_uvhorizontal_edge_mips(unsigned char *s, int p,
                                              unsigned int flimit,
                                              unsigned int limit,
                                              unsigned int thresh, int count) {
  uint32_t mask, hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
  (void)count;

  mask = 0;
  hev = 0;
  p1 = 0;
  p2 = 0;
  p3 = 0;
  p4 = 0;

  /* the loop filter is designed to work on chars so that we can make maximum
   * use of 8-bit SIMD instructions.
   */

  sm1 = s - (p << 2);
  s0 = s - p - p - p;
  s1 = s - p - p;
  s2 = s - p;
  s3 = s;
  s4 = s + p;
  s5 = s + p + p;
  s6 = s + p + p + p;

  /* load quad-byte vectors
   * memory is 4 byte aligned
   */
  p1 = *((uint32_t *)(s1));
  p2 = *((uint32_t *)(s2));
  p3 = *((uint32_t *)(s3));
  p4 = *((uint32_t *)(s4));

  /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   * mask will be zero and filtering is not needed
   */
  if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
    pm1 = *((uint32_t *)(sm1));
    p0 = *((uint32_t *)(s0));
    p5 = *((uint32_t *)(s5));
    p6 = *((uint32_t *)(s6));

    vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                             thresh, &hev, &mask);

    /* if mask == 0, filtering is not needed */
    if (mask) {
      /* filtering */
      vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);

      /* unpack processed 4x4 neighborhood
       * memory is 4 byte aligned
       */
      *((uint32_t *)s0) = p0;
      *((uint32_t *)s1) = p1;
      *((uint32_t *)s2) = p2;
      *((uint32_t *)s3) = p3;
      *((uint32_t *)s4) = p4;
      *((uint32_t *)s5) = p5;
    }
  }

  sm1 += 4;
  s0 += 4;
  s1 += 4;
  s2 += 4;
  s3 += 4;
  s4 += 4;
  s5 += 4;
  s6 += 4;

  /* load quad-byte vectors
   * memory is 4 byte aligned
   */
  p1 = *((uint32_t *)(s1));
  p2 = *((uint32_t *)(s2));
  p3 = *((uint32_t *)(s3));
  p4 = *((uint32_t *)(s4));

  /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   * mask will be zero and filtering is not needed
   */
  if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
    pm1 = *((uint32_t *)(sm1));
    p0 = *((uint32_t *)(s0));
    p5 = *((uint32_t *)(s5));
    p6 = *((uint32_t *)(s6));

    vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                             thresh, &hev, &mask);

    /* if mask == 0, filtering is not needed */
    if (mask) {
      /* filtering */
      vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);

      /* unpack processed 4x4 neighborhood
       * memory is 4 byte aligned
       */
      *((uint32_t *)s0) = p0;
      *((uint32_t *)s1) = p1;
      *((uint32_t *)s2) = p2;
      *((uint32_t *)s3) = p3;
      *((uint32_t *)s4) = p4;
      *((uint32_t *)s5) = p5;
    }
  }
}

void vp8_mbloop_filter_vertical_edge_mips(unsigned char *s, int p,
                                          unsigned int flimit,
                                          unsigned int limit,
                                          unsigned int thresh, int count) {
  int i;
  uint32_t mask, hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  unsigned char *s1, *s2, *s3, *s4;
  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;

  mask = 0;
  hev = 0;
  i = 0;
  pm1 = 0;
  p0 = 0;
  p1 = 0;
  p2 = 0;
  p3 = 0;
  p4 = 0;
  p5 = 0;
  p6 = 0;

  /* the loop filter is designed to work on chars so that we can make maximum
   * use of 8-bit SIMD instructions.
   */

  /* apply filter on 4 pixels at the same time */
  do {
    s1 = s;
    s2 = s + p;
    s3 = s2 + p;
    s4 = s3 + p;
    s = s4 + p;

    /* load quad-byte vectors
     * memory is 4 byte aligned
     */
    p2 = *((uint32_t *)(s1 - 4));
    p6 = *((uint32_t *)(s1));
    p1 = *((uint32_t *)(s2 - 4));
    p5 = *((uint32_t *)(s2));
    p0 = *((uint32_t *)(s3 - 4));
    p4 = *((uint32_t *)(s3));
    pm1 = *((uint32_t *)(s4 - 4));
    p3 = *((uint32_t *)(s4));

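    /* Each pair of loads above picks up the 4 pixels to the left (sX - 4) and
     * the 4 pixels to the right (sX) of the vertical edge for one of four
     * consecutive rows.  The two transposes below rearrange this 4x8
     * neighborhood into column vectors so the same packed filter code used
     * for horizontal edges can be applied here.
     */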
    /* transpose pm1, p0, p1, p2 */
    __asm__ __volatile__(
        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"

        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
        "append         %[p1],      %[sec3],    16          \n\t"
        "append         %[pm1],     %[sec4],    16          \n\t"

        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
          [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);

    /* transpose p3, p4, p5, p6 */
    __asm__ __volatile__(
        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"

        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
        "append         %[p5],      %[sec3],    16          \n\t"
        "append         %[p3],      %[sec4],    16          \n\t"

        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
          [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     * mask will be zero and filtering is not needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                               thresh, &hev, &mask);

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);

        /* don't use transpose on output data
         * because memory isn't aligned
         */
        __asm__ __volatile__(
            "sb         %[p5],  2(%[s4])        \n\t"
            "sb         %[p4],  1(%[s4])        \n\t"
            "sb         %[p3],  0(%[s4])        \n\t"
            "sb         %[p2], -1(%[s4])        \n\t"
            "sb         %[p1], -2(%[s4])        \n\t"
            "sb         %[p0], -3(%[s4])        \n\t"
            :
            : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4),
              [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));

        __asm__ __volatile__(
            "srl        %[p5], %[p5], 8         \n\t"
            "srl        %[p4], %[p4], 8         \n\t"
            "srl        %[p3], %[p3], 8         \n\t"
            "srl        %[p2], %[p2], 8         \n\t"
            "srl        %[p1], %[p1], 8         \n\t"
            "srl        %[p0], %[p0], 8         \n\t"
            : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
              [p1] "+r"(p1), [p0] "+r"(p0)
            :);

        __asm__ __volatile__(
            "sb         %[p5],  2(%[s3])        \n\t"
            "sb         %[p4],  1(%[s3])        \n\t"
            "sb         %[p3],  0(%[s3])        \n\t"
            "sb         %[p2], -1(%[s3])        \n\t"
            "sb         %[p1], -2(%[s3])        \n\t"
            "sb         %[p0], -3(%[s3])        \n\t"
            :
            : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3),
              [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));

        __asm__ __volatile__(
            "srl        %[p5], %[p5], 8         \n\t"
            "srl        %[p4], %[p4], 8         \n\t"
            "srl        %[p3], %[p3], 8         \n\t"
            "srl        %[p2], %[p2], 8         \n\t"
            "srl        %[p1], %[p1], 8         \n\t"
            "srl        %[p0], %[p0], 8         \n\t"
            : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
              [p1] "+r"(p1), [p0] "+r"(p0)
            :);

        __asm__ __volatile__(
            "sb         %[p5],  2(%[s2])        \n\t"
            "sb         %[p4],  1(%[s2])        \n\t"
            "sb         %[p3],  0(%[s2])        \n\t"
            "sb         %[p2], -1(%[s2])        \n\t"
            "sb         %[p1], -2(%[s2])        \n\t"
            "sb         %[p0], -3(%[s2])        \n\t"
            :
            : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2),
              [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));

        __asm__ __volatile__(
            "srl        %[p5], %[p5], 8         \n\t"
            "srl        %[p4], %[p4], 8         \n\t"
            "srl        %[p3], %[p3], 8         \n\t"
            "srl        %[p2], %[p2], 8         \n\t"
            "srl        %[p1], %[p1], 8         \n\t"
            "srl        %[p0], %[p0], 8         \n\t"
            : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
              [p1] "+r"(p1), [p0] "+r"(p0)
            :);

        __asm__ __volatile__(
            "sb         %[p5],  2(%[s1])        \n\t"
            "sb         %[p4],  1(%[s1])        \n\t"
            "sb         %[p3],  0(%[s1])        \n\t"
            "sb         %[p2], -1(%[s1])        \n\t"
            "sb         %[p1], -2(%[s1])        \n\t"
            "sb         %[p0], -3(%[s1])        \n\t"
            :
            : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1),
              [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
      }
    }

    i += 4;
  } while (i < count);
}

void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p,
                                            unsigned int flimit,
                                            unsigned int limit,
                                            unsigned int thresh, int count) {
  uint32_t mask, hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  unsigned char *s1, *s2, *s3, *s4;
  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
  (void)count;

  mask = 0;
  hev = 0;
  pm1 = 0;
  p0 = 0;
  p1 = 0;
  p2 = 0;
  p3 = 0;
  p4 = 0;
  p5 = 0;
  p6 = 0;

  /* the loop filter is designed to work on chars so that we can make maximum
   * use of 8-bit SIMD instructions.
   */

  /* apply filter on 4 pixels at the same time */

  s1 = s;
  s2 = s + p;
  s3 = s2 + p;
  s4 = s3 + p;

  /* prefetch data for load */
  prefetch_load_lf(s + 2 * p);

  /* load quad-byte vectors
   * memory is 4 byte aligned
   */
  p2 = *((uint32_t *)(s1 - 4));
  p6 = *((uint32_t *)(s1));
  p1 = *((uint32_t *)(s2 - 4));
  p5 = *((uint32_t *)(s2));
  p0 = *((uint32_t *)(s3 - 4));
  p4 = *((uint32_t *)(s3));
  pm1 = *((uint32_t *)(s4 - 4));
  p3 = *((uint32_t *)(s4));

  /* transpose pm1, p0, p1, p2 */
  __asm__ __volatile__(
      "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
      "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
      "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
      "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"

      "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
      "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
      "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
      "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

      "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
      "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
      "append         %[p1],      %[sec3],    16          \n\t"
      "append         %[pm1],     %[sec4],    16          \n\t"

      : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
        [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
        [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
      :);

  /* transpose p3, p4, p5, p6 */
  __asm__ __volatile__(
      "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
      "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
      "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
      "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"

      "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
      "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
      "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
      "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

      "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
      "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
      "append         %[p5],      %[sec3],    16          \n\t"
      "append         %[p3],      %[sec4],    16          \n\t"

      : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
        [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
        [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
      :);

  /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   * mask will be zero and filtering is not needed
   */
  if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
    vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                             thresh, &hev, &mask);

    /* if mask == 0, filtering is not needed */
    if (mask) {
      /* filtering */
      vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);

      /* don't use transpose on output data
       * because memory isn't aligned
       */
      __asm__ __volatile__(
          "sb         %[p5],  2(%[s4])        \n\t"
          "sb         %[p4],  1(%[s4])        \n\t"
          "sb         %[p3],  0(%[s4])        \n\t"
          "sb         %[p2], -1(%[s4])        \n\t"
          "sb         %[p1], -2(%[s4])        \n\t"
          "sb         %[p0], -3(%[s4])        \n\t"
          :
          : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4),
            [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));

      __asm__ __volatile__(
          "srl        %[p5], %[p5], 8         \n\t"
          "srl        %[p4], %[p4], 8         \n\t"
          "srl        %[p3], %[p3], 8         \n\t"
          "srl        %[p2], %[p2], 8         \n\t"
          "srl        %[p1], %[p1], 8         \n\t"
          "srl        %[p0], %[p0], 8         \n\t"
          : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
            [p1] "+r"(p1), [p0] "+r"(p0)
          :);

      __asm__ __volatile__(
          "sb         %[p5],  2(%[s3])        \n\t"
          "sb         %[p4],  1(%[s3])        \n\t"
          "sb         %[p3],  0(%[s3])        \n\t"
          "sb         %[p2], -1(%[s3])        \n\t"
          "sb         %[p1], -2(%[s3])        \n\t"
          "sb         %[p0], -3(%[s3])        \n\t"
          :
          : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3),
            [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));

      __asm__ __volatile__(
          "srl        %[p5], %[p5], 8         \n\t"
          "srl        %[p4], %[p4], 8         \n\t"
          "srl        %[p3], %[p3], 8         \n\t"
          "srl        %[p2], %[p2], 8         \n\t"
          "srl        %[p1], %[p1], 8         \n\t"
          "srl        %[p0], %[p0], 8         \n\t"
          : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
            [p1] "+r"(p1), [p0] "+r"(p0)
          :);

      __asm__ __volatile__(
          "sb         %[p5],  2(%[s2])        \n\t"
          "sb         %[p4],  1(%[s2])        \n\t"
          "sb         %[p3],  0(%[s2])        \n\t"
          "sb         %[p2], -1(%[s2])        \n\t"
          "sb         %[p1], -2(%[s2])        \n\t"
          "sb         %[p0], -3(%[s2])        \n\t"
          :
          : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2),
            [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));

      __asm__ __volatile__(
          "srl        %[p5], %[p5], 8         \n\t"
          "srl        %[p4], %[p4], 8         \n\t"
          "srl        %[p3], %[p3], 8         \n\t"
          "srl        %[p2], %[p2], 8         \n\t"
          "srl        %[p1], %[p1], 8         \n\t"
          "srl        %[p0], %[p0], 8         \n\t"
          : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
            [p1] "+r"(p1), [p0] "+r"(p0)
          :);

      __asm__ __volatile__(
          "sb         %[p5],  2(%[s1])        \n\t"
          "sb         %[p4],  1(%[s1])        \n\t"
          "sb         %[p3],  0(%[s1])        \n\t"
          "sb         %[p2], -1(%[s1])        \n\t"
          "sb         %[p1], -2(%[s1])        \n\t"
          "sb         %[p0], -3(%[s1])        \n\t"
          :
          : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1),
            [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
    }
  }

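  /* Advance to the second group of four rows; the chroma block is 8 pixels
   * tall, so this second pass covers the remaining half of the edge.
   */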
  s1 = s4 + p;
  s2 = s1 + p;
  s3 = s2 + p;
  s4 = s3 + p;

  /* load quad-byte vectors
   * memory is 4 byte aligned
   */
  p2 = *((uint32_t *)(s1 - 4));
  p6 = *((uint32_t *)(s1));
  p1 = *((uint32_t *)(s2 - 4));
  p5 = *((uint32_t *)(s2));
  p0 = *((uint32_t *)(s3 - 4));
  p4 = *((uint32_t *)(s3));
  pm1 = *((uint32_t *)(s4 - 4));
  p3 = *((uint32_t *)(s4));

  /* transpose pm1, p0, p1, p2 */
  __asm__ __volatile__(
      "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
      "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
      "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
      "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"

      "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
      "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
      "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
      "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

      "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
      "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
      "append         %[p1],      %[sec3],    16          \n\t"
      "append         %[pm1],     %[sec4],    16          \n\t"

      : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
        [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
        [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
      :);

  /* transpose p3, p4, p5, p6 */
  __asm__ __volatile__(
      "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
      "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
      "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
      "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"

      "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
      "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
      "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
      "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

      "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
      "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
      "append         %[p5],      %[sec3],    16          \n\t"
      "append         %[p3],      %[sec4],    16          \n\t"

      : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
        [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
        [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
      :);

  /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   * mask will be zero and filtering is not needed
   */
  if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
    vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                             thresh, &hev, &mask);

    /* if mask == 0, filtering is not needed */
    if (mask) {
      /* filtering */
      vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);

      /* don't use transpose on output data
       * because memory isn't aligned
       */
      __asm__ __volatile__(
          "sb         %[p5],  2(%[s4])        \n\t"
          "sb         %[p4],  1(%[s4])        \n\t"
          "sb         %[p3],  0(%[s4])        \n\t"
          "sb         %[p2], -1(%[s4])        \n\t"
          "sb         %[p1], -2(%[s4])        \n\t"
          "sb         %[p0], -3(%[s4])        \n\t"
          :
          : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4),
            [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));

      __asm__ __volatile__(
          "srl        %[p5], %[p5], 8         \n\t"
          "srl        %[p4], %[p4], 8         \n\t"
          "srl        %[p3], %[p3], 8         \n\t"
          "srl        %[p2], %[p2], 8         \n\t"
          "srl        %[p1], %[p1], 8         \n\t"
          "srl        %[p0], %[p0], 8         \n\t"
          : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
            [p1] "+r"(p1), [p0] "+r"(p0)
          :);

      __asm__ __volatile__(
          "sb         %[p5],  2(%[s3])        \n\t"
          "sb         %[p4],  1(%[s3])        \n\t"
          "sb         %[p3],  0(%[s3])        \n\t"
          "sb         %[p2], -1(%[s3])        \n\t"
          "sb         %[p1], -2(%[s3])        \n\t"
          "sb         %[p0], -3(%[s3])        \n\t"
          :
          : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3),
            [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));

      __asm__ __volatile__(
          "srl        %[p5], %[p5], 8         \n\t"
          "srl        %[p4], %[p4], 8         \n\t"
          "srl        %[p3], %[p3], 8         \n\t"
          "srl        %[p2], %[p2], 8         \n\t"
          "srl        %[p1], %[p1], 8         \n\t"
          "srl        %[p0], %[p0], 8         \n\t"
          : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
            [p1] "+r"(p1), [p0] "+r"(p0)
          :);

      __asm__ __volatile__(
          "sb         %[p5],  2(%[s2])        \n\t"
          "sb         %[p4],  1(%[s2])        \n\t"
          "sb         %[p3],  0(%[s2])        \n\t"
          "sb         %[p2], -1(%[s2])        \n\t"
          "sb         %[p1], -2(%[s2])        \n\t"
          "sb         %[p0], -3(%[s2])        \n\t"
          :
          : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2),
            [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));

      __asm__ __volatile__(
          "srl        %[p5], %[p5], 8         \n\t"
          "srl        %[p4], %[p4], 8         \n\t"
          "srl        %[p3], %[p3], 8         \n\t"
          "srl        %[p2], %[p2], 8         \n\t"
          "srl        %[p1], %[p1], 8         \n\t"
          "srl        %[p0], %[p0], 8         \n\t"
          : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
            [p1] "+r"(p1), [p0] "+r"(p0)
          :);

      __asm__ __volatile__(
          "sb         %[p5],  2(%[s1])        \n\t"
          "sb         %[p4],  1(%[s1])        \n\t"
          "sb         %[p3],  0(%[s1])        \n\t"
          "sb         %[p2], -1(%[s1])        \n\t"
          "sb         %[p1], -2(%[s1])        \n\t"
          "sb         %[p0], -3(%[s1])        \n\t"
          :
          : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1),
            [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
    }
  }
}

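/* The entry points below replicate the per-segment filter parameters from
 * loop_filter_info into quad-byte vectors and then call the edge filters:
 * mblim is used for the outer macroblock edges (mbh/mbv) and blim for the
 * inner block edges (bh/bv).
 */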
/* Horizontal MB filtering */
void vp8_loop_filter_mbh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
                               unsigned char *v_ptr, int y_stride,
                               int uv_stride, loop_filter_info *lfi) {
  unsigned int thresh_vec, flimit_vec, limit_vec;
  unsigned char thresh, flimit, limit, flimit_temp;

  /* use direct values instead of pointers */
  limit = *(lfi->lim);
  flimit_temp = *(lfi->mblim);
  thresh = *(lfi->hev_thr);
  flimit = flimit_temp;

  /* create quad-byte */
  __asm__ __volatile__(
      "replv.qb       %[thresh_vec], %[thresh]    \n\t"
      "replv.qb       %[flimit_vec], %[flimit]    \n\t"
      "replv.qb       %[limit_vec],  %[limit]     \n\t"
      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));

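  /* replv.qb above replicates each scalar threshold byte into all four byte
   * lanes of a register, so the edge filters below can test four pixels at a
   * time.
   */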
  vp8_mbloop_filter_horizontal_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec,
                                         thresh_vec, 16);

  if (u_ptr) {
    vp8_mbloop_filter_uvhorizontal_edge_mips(u_ptr, uv_stride, flimit_vec,
                                             limit_vec, thresh_vec, 0);
  }

  if (v_ptr) {
    vp8_mbloop_filter_uvhorizontal_edge_mips(v_ptr, uv_stride, flimit_vec,
                                             limit_vec, thresh_vec, 0);
  }
}

/* Vertical MB Filtering */
void vp8_loop_filter_mbv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
                               unsigned char *v_ptr, int y_stride,
                               int uv_stride, loop_filter_info *lfi) {
  unsigned int thresh_vec, flimit_vec, limit_vec;
  unsigned char thresh, flimit, limit, flimit_temp;

  /* use direct values instead of pointers */
  limit = *(lfi->lim);
  flimit_temp = *(lfi->mblim);
  thresh = *(lfi->hev_thr);
  flimit = flimit_temp;

  /* create quad-byte */
  __asm__ __volatile__(
      "replv.qb       %[thresh_vec], %[thresh]    \n\t"
      "replv.qb       %[flimit_vec], %[flimit]    \n\t"
      "replv.qb       %[limit_vec],  %[limit]     \n\t"
      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));

  vp8_mbloop_filter_vertical_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec,
                                       thresh_vec, 16);

  if (u_ptr)
    vp8_mbloop_filter_uvvertical_edge_mips(u_ptr, uv_stride, flimit_vec,
                                           limit_vec, thresh_vec, 0);

  if (v_ptr)
    vp8_mbloop_filter_uvvertical_edge_mips(v_ptr, uv_stride, flimit_vec,
                                           limit_vec, thresh_vec, 0);
}

/* Horizontal B Filtering */
void vp8_loop_filter_bh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
                              unsigned char *v_ptr, int y_stride, int uv_stride,
                              loop_filter_info *lfi) {
  unsigned int thresh_vec, flimit_vec, limit_vec;
  unsigned char thresh, flimit, limit, flimit_temp;

  /* use direct values instead of pointers */
  limit = *(lfi->lim);
  flimit_temp = *(lfi->blim);
  thresh = *(lfi->hev_thr);
  flimit = flimit_temp;

  /* create quad-byte */
  __asm__ __volatile__(
      "replv.qb       %[thresh_vec], %[thresh]    \n\t"
      "replv.qb       %[flimit_vec], %[flimit]    \n\t"
      "replv.qb       %[limit_vec],  %[limit]     \n\t"
      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));

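  /* Filter the three inner horizontal block edges of the 16x16 luma
   * macroblock, located 4, 8 and 12 rows below y_ptr.
   */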
  vp8_loop_filter_horizontal_edge_mips(y_ptr + 4 * y_stride, y_stride,
                                       flimit_vec, limit_vec, thresh_vec, 16);
  vp8_loop_filter_horizontal_edge_mips(y_ptr + 8 * y_stride, y_stride,
                                       flimit_vec, limit_vec, thresh_vec, 16);
  vp8_loop_filter_horizontal_edge_mips(y_ptr + 12 * y_stride, y_stride,
                                       flimit_vec, limit_vec, thresh_vec, 16);

  if (u_ptr)
    vp8_loop_filter_uvhorizontal_edge_mips(
        u_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);

  if (v_ptr)
    vp8_loop_filter_uvhorizontal_edge_mips(
        v_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
}

/* Vertical B Filtering */
void vp8_loop_filter_bv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
                              unsigned char *v_ptr, int y_stride, int uv_stride,
                              loop_filter_info *lfi) {
  unsigned int thresh_vec, flimit_vec, limit_vec;
  unsigned char thresh, flimit, limit, flimit_temp;

  /* use direct values instead of pointers */
  limit = *(lfi->lim);
  flimit_temp = *(lfi->blim);
  thresh = *(lfi->hev_thr);
  flimit = flimit_temp;

  /* create quad-byte */
  __asm__ __volatile__(
      "replv.qb       %[thresh_vec], %[thresh]    \n\t"
      "replv.qb       %[flimit_vec], %[flimit]    \n\t"
      "replv.qb       %[limit_vec],  %[limit]     \n\t"
      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));

  vp8_loop_filter_vertical_edge_mips(y_ptr + 4, y_stride, flimit_vec, limit_vec,
                                     thresh_vec, 16);
  vp8_loop_filter_vertical_edge_mips(y_ptr + 8, y_stride, flimit_vec, limit_vec,
                                     thresh_vec, 16);
  vp8_loop_filter_vertical_edge_mips(y_ptr + 12, y_stride, flimit_vec,
                                     limit_vec, thresh_vec, 16);

  if (u_ptr)
    vp8_loop_filter_uvvertical_edge_mips(u_ptr + 4, uv_stride, flimit_vec,
                                         limit_vec, thresh_vec, 0);

  if (v_ptr)
    vp8_loop_filter_uvvertical_edge_mips(v_ptr + 4, uv_stride, flimit_vec,
                                         limit_vec, thresh_vec, 0);
}

#endif