1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <stdlib.h>
12
13#include "./vp9_rtcd.h"
14#include "vp9/common/vp9_common.h"
15#include "vp9/common/vp9_loopfilter.h"
16#include "vp9/common/vp9_onyxc_int.h"
17#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
18#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
19#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
20#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
21
22#if HAVE_DSPR2
/* Horizontal 16-wide loop filter, MIPS DSPr2 version.
 *
 * Filters across a horizontal block edge: the eight rows above the edge
 * (p7..p0) and the eight rows at/below it (q0..q7).  s points at the first
 * row below the edge (the q0 row); pitch is the row stride in bytes.
 * blimit/limit/thresh each point at a single threshold byte that is
 * replicated into all four byte lanes of a 32-bit word, so four pixel
 * columns are evaluated per iteration.  The loop runs 2 * count times and
 * advances s by 4 each time, i.e. 8 * count pixel columns in total.
 */
void vp9_lpf_horizontal_16_dspr2(unsigned char *s,
                                 int pitch,
                                 const uint8_t *blimit,
                                 const uint8_t *limit,
                                 const uint8_t *thresh,
                                 int count) {
  uint32_t  mask;
  uint32_t  hev, flat, flat2;   /* per-byte-lane filter-selection masks */
  uint8_t   i;
  /* Row pointers: sp7..sp0 walk down to the edge, sq0..sq7 continue below. */
  uint8_t   *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0;
  uint8_t   *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7;
  uint32_t  thresh_vec, flimit_vec, limit_vec;   /* thresholds, one copy per lane */
  uint32_t  uflimit, ulimit, uthresh;
  /* Four packed 8-bit pixels per row (one 32-bit word = 4 columns). */
  uint32_t  p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  uint32_t  p1_f0, p0_f0, q0_f0, q1_f0;   /* narrow filter (f0) outputs, packed bytes */
  /* _l / _r: upper / lower pixel pairs of each row widened for the wider
   * filters (the srl-by-16 lane extraction below shows each half carries
   * two 16-bit lanes). */
  uint32_t  p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
  uint32_t  q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
  uint32_t  p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
  uint32_t  q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
  uint32_t  p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;   /* mbfilter (f1) outputs */
  uint32_t  q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;

  uflimit = *blimit;
  ulimit  = *limit;
  uthresh = *thresh;

  /* create quad-byte */
  __asm__ __volatile__ (
      "replv.qb       %[thresh_vec],    %[uthresh]      \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]      \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]       \n\t"

      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
        [limit_vec] "=r" (limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );

  /* prefetch data for store */
  vp9_prefetch_store(s);

  for (i = 0; i < (2 * count); i++) {
    /* Set up the 16 row pointers for this 4-column segment. */
    sp7 = s - (pitch << 3);
    sp6 = sp7 + pitch;
    sp5 = sp6 + pitch;
    sp4 = sp5 + pitch;
    sp3 = sp4 + pitch;
    sp2 = sp3 + pitch;
    sp1 = sp2 + pitch;
    sp0 = sp1 + pitch;
    sq0 = s;
    sq1 = s + pitch;
    sq2 = sq1 + pitch;
    sq3 = sq2 + pitch;
    sq4 = sq3 + pitch;
    sq5 = sq4 + pitch;
    sq6 = sq5 + pitch;
    sq7 = sq6 + pitch;

    /* Load 4 packed pixels from each of the 8 rows above the edge.
     * NOTE(review): word loads assume s and pitch give 4-byte-aligned
     * rows — confirm against callers. */
    __asm__ __volatile__ (
        "lw     %[p7],      (%[sp7])            \n\t"
        "lw     %[p6],      (%[sp6])            \n\t"
        "lw     %[p5],      (%[sp5])            \n\t"
        "lw     %[p4],      (%[sp4])            \n\t"
        "lw     %[p3],      (%[sp3])            \n\t"
        "lw     %[p2],      (%[sp2])            \n\t"
        "lw     %[p1],      (%[sp1])            \n\t"
        "lw     %[p0],      (%[sp0])            \n\t"

        : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
          [p7] "=&r" (p7), [p6] "=&r" (p6), [p5] "=&r" (p5), [p4] "=&r" (p4)
        : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
          [sp4] "r" (sp4), [sp5] "r" (sp5), [sp6] "r" (sp6), [sp7] "r" (sp7)
    );

    /* Load 4 packed pixels from each of the 8 rows at/below the edge. */
    __asm__ __volatile__ (
        "lw     %[q0],      (%[sq0])            \n\t"
        "lw     %[q1],      (%[sq1])            \n\t"
        "lw     %[q2],      (%[sq2])            \n\t"
        "lw     %[q3],      (%[sq3])            \n\t"
        "lw     %[q4],      (%[sq4])            \n\t"
        "lw     %[q5],      (%[sq5])            \n\t"
        "lw     %[q6],      (%[sq6])            \n\t"
        "lw     %[q7],      (%[sq7])            \n\t"

        : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0),
          [q7] "=&r" (q7), [q6] "=&r" (q6), [q5] "=&r" (q5), [q4] "=&r" (q4)
        : [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0),
          [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6), [sq7] "r" (sq7)
    );

    /* Compute per-lane decision masks for all four columns at once:
     * mask (filter on/off), hev (high edge variance) and flat (inner
     * flatness) from p3..q3, then flat2 (outer flatness) from p7..q7. */
    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
                                        p1, p0, p3, p2, q0, q1, q2, q3,
                                        &hev, &mask, &flat);

    vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);

    /* f0 */
    /* No lane is flat (both arms require flat == 0) but some lane needs
     * filtering: the narrow filter alone applies, so store whole words.
     * Presumably vp9_filter1_dspr2 leaves masked-off lanes unchanged —
     * confirm in vp9_loopfilter_filters_dspr2.h. */
    if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
        ((flat2 != 0) && (flat == 0) && (mask != 0))) {
      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      __asm__ __volatile__ (
          "sw       %[p1_f0],   (%[sp1])            \n\t"
          "sw       %[p0_f0],   (%[sp0])            \n\t"
          "sw       %[q0_f0],   (%[sq0])            \n\t"
          "sw       %[q1_f0],   (%[sq1])            \n\t"

          :
          : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
            [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
            [sp1] "r" (sp1), [sp0] "r" (sp0),
            [sq0] "r" (sq0), [sq1] "r" (sq1)
      );
    } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
               (mask == 0xFFFFFFFF)) {
      /* f2 */
      /* Every lane is fully flat: run the wide mb filter on all four
       * columns (split into left/right halves) and store whole words. */
      PACK_LEFT_0TO3()
      PACK_LEFT_4TO7()
      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
                              &p3_l, &p2_l, &p1_l, &p0_l,
                              &q0_l, &q1_l, &q2_l, &q3_l,
                              &q4_l, &q5_l, &q6_l, &q7_l);

      PACK_RIGHT_0TO3()
      PACK_RIGHT_4TO7()
      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
                              &p3_r, &p2_r, &p1_r, &p0_r,
                              &q0_r, &q1_r, &q2_r, &q3_r,
                              &q4_r, &q5_r, &q6_r, &q7_r);

      /* Repack the widened halves back into the byte-packed p/q words. */
      COMBINE_LEFT_RIGHT_0TO2()
      COMBINE_LEFT_RIGHT_3TO6()

      __asm__ __volatile__ (
          "sw         %[p6], (%[sp6])    \n\t"
          "sw         %[p5], (%[sp5])    \n\t"
          "sw         %[p4], (%[sp4])    \n\t"
          "sw         %[p3], (%[sp3])    \n\t"
          "sw         %[p2], (%[sp2])    \n\t"
          "sw         %[p1], (%[sp1])    \n\t"
          "sw         %[p0], (%[sp0])    \n\t"

          :
          : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3),
            [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
            [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), [sp3] "r" (sp3),
            [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
      );

      __asm__ __volatile__ (
          "sw         %[q6], (%[sq6])    \n\t"
          "sw         %[q5], (%[sq5])    \n\t"
          "sw         %[q4], (%[sq4])    \n\t"
          "sw         %[q3], (%[sq3])    \n\t"
          "sw         %[q2], (%[sq2])    \n\t"
          "sw         %[q1], (%[sq1])    \n\t"
          "sw         %[q0], (%[sq0])    \n\t"

          :
          : [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4), [q3] "r" (q3),
            [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
            [sq6] "r" (sq6), [sq5] "r" (sq5), [sq4] "r" (sq4), [sq3] "r" (sq3),
            [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
      );
    } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
      /* f1 */
      /* All lanes inner-flat but none outer-flat: run the 8-pixel mb
       * filter on every column and store whole words. */
      /* left 2 element operation */
      PACK_LEFT_0TO3()
      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
                         &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
                         &q0_r, &q1_r, &q2_r, &q3_r);

      COMBINE_LEFT_RIGHT_0TO2()

      __asm__ __volatile__ (
          "sw         %[p2], (%[sp2])    \n\t"
          "sw         %[p1], (%[sp1])    \n\t"
          "sw         %[p0], (%[sp0])    \n\t"
          "sw         %[q0], (%[sq0])    \n\t"
          "sw         %[q1], (%[sq1])    \n\t"
          "sw         %[q2], (%[sq2])    \n\t"

          :
          : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
            [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2),
            [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
            [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
      );
    } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
      /* f0+f1 */
      /* Mixed lanes, no outer flatness: compute both the narrow (f0) and
       * mb (f1) results, then store byte-by-byte, picking per lane:
       * f1 where mask & flat is set, else f0 where mask is set. */
      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      /* left 2 element operation */
      PACK_LEFT_0TO3()
      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
                         &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
                         &q0_r, &q1_r, &q2_r, &q3_r);

      /* Lane 0 (byte offset 0): low halfword of the _r values. */
      if (mask & flat & 0x000000FF) {
        __asm__ __volatile__ (
            "sb         %[p2_r],  (%[sp2])    \n\t"
            "sb         %[p1_r],  (%[sp1])    \n\t"
            "sb         %[p0_r],  (%[sp0])    \n\t"
            "sb         %[q0_r],  (%[sq0])    \n\t"
            "sb         %[q1_r],  (%[sq1])    \n\t"
            "sb         %[q2_r],  (%[sq2])    \n\t"

            :
            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
      } else if (mask & 0x000000FF) {
        __asm__ __volatile__ (
            "sb         %[p1_f0],  (%[sp1])    \n\t"
            "sb         %[p0_f0],  (%[sp0])    \n\t"
            "sb         %[q0_f0],  (%[sq0])    \n\t"
            "sb         %[q1_f0],  (%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
      }

      /* Shift the next lane into the low byte: _r values hold 16-bit
       * lanes (srl 16), f0 values hold 8-bit lanes (srl 8). */
      __asm__ __volatile__ (
          "srl      %[p2_r],    %[p2_r],    16      \n\t"
          "srl      %[p1_r],    %[p1_r],    16      \n\t"
          "srl      %[p0_r],    %[p0_r],    16      \n\t"
          "srl      %[q0_r],    %[q0_r],    16      \n\t"
          "srl      %[q1_r],    %[q1_r],    16      \n\t"
          "srl      %[q2_r],    %[q2_r],    16      \n\t"
          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"

          : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
            [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      /* Lane 1 (byte offset +1). */
      if (mask & flat & 0x0000FF00) {
        __asm__ __volatile__ (
            "sb         %[p2_r],  +1(%[sp2])    \n\t"
            "sb         %[p1_r],  +1(%[sp1])    \n\t"
            "sb         %[p0_r],  +1(%[sp0])    \n\t"
            "sb         %[q0_r],  +1(%[sq0])    \n\t"
            "sb         %[q1_r],  +1(%[sq1])    \n\t"
            "sb         %[q2_r],  +1(%[sq2])    \n\t"

            :
            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
      } else if (mask & 0x0000FF00) {
        __asm__ __volatile__ (
            "sb         %[p1_f0],  +1(%[sp1])    \n\t"
            "sb         %[p0_f0],  +1(%[sp0])    \n\t"
            "sb         %[q0_f0],  +1(%[sq0])    \n\t"
            "sb         %[q1_f0],  +1(%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
      }

      /* Advance f0 values to lane 2; lanes 2-3 of f1 come from _l below. */
      __asm__ __volatile__ (
          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"

          : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      /* Lane 2 (byte offset +2): low halfword of the _l values. */
      if (mask & flat & 0x00FF0000) {
        __asm__ __volatile__ (
            "sb         %[p2_l],  +2(%[sp2])    \n\t"
            "sb         %[p1_l],  +2(%[sp1])    \n\t"
            "sb         %[p0_l],  +2(%[sp0])    \n\t"
            "sb         %[q0_l],  +2(%[sq0])    \n\t"
            "sb         %[q1_l],  +2(%[sq1])    \n\t"
            "sb         %[q2_l],  +2(%[sq2])    \n\t"

            :
            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
      } else if (mask & 0x00FF0000) {
        __asm__ __volatile__ (
            "sb         %[p1_f0],  +2(%[sp1])    \n\t"
            "sb         %[p0_f0],  +2(%[sp0])    \n\t"
            "sb         %[q0_f0],  +2(%[sq0])    \n\t"
            "sb         %[q1_f0],  +2(%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
      }

      /* Expose lane 3 in the _l halves and in the f0 bytes. */
      __asm__ __volatile__ (
          "srl      %[p2_l],    %[p2_l],    16      \n\t"
          "srl      %[p1_l],    %[p1_l],    16      \n\t"
          "srl      %[p0_l],    %[p0_l],    16      \n\t"
          "srl      %[q0_l],    %[q0_l],    16      \n\t"
          "srl      %[q1_l],    %[q1_l],    16      \n\t"
          "srl      %[q2_l],    %[q2_l],    16      \n\t"
          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"

          : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
            [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      /* Lane 3 (byte offset +3). */
      if (mask & flat & 0xFF000000) {
        __asm__ __volatile__ (
            "sb         %[p2_l],  +3(%[sp2])    \n\t"
            "sb         %[p1_l],  +3(%[sp1])    \n\t"
            "sb         %[p0_l],  +3(%[sp0])    \n\t"
            "sb         %[q0_l],  +3(%[sq0])    \n\t"
            "sb         %[q1_l],  +3(%[sq1])    \n\t"
            "sb         %[q2_l],  +3(%[sq2])    \n\t"

            :
            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
      } else if (mask & 0xFF000000) {
        __asm__ __volatile__ (
            "sb         %[p1_f0],  +3(%[sp1])    \n\t"
            "sb         %[p0_f0],  +3(%[sp0])    \n\t"
            "sb         %[q0_f0],  +3(%[sq0])    \n\t"
            "sb         %[q1_f0],  +3(%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
      }
    } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
      /* f0 + f1 + f2 */
      /* Fully mixed lanes: compute all three filter strengths, then per
       * lane pick wide (mask & flat & flat2), else f1 (mask & flat),
       * else f0 (mask). */
      /* f0  function */
      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      /* f1  function */
      /* left 2 element operation */
      PACK_LEFT_0TO3()
      vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
                          q0_l, q1_l, q2_l, q3_l,
                          &p2_l_f1, &p1_l_f1, &p0_l_f1,
                          &q0_l_f1, &q1_l_f1, &q2_l_f1);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
                          q0_r, q1_r, q2_r, q3_r,
                          &p2_r_f1, &p1_r_f1, &p0_r_f1,
                          &q0_r_f1, &q1_r_f1, &q2_r_f1);

      /* f2  function */
      /* The wide filter overwrites the packed _l/_r halves in place,
       * which is why the f1 results were saved to *_f1 first. */
      PACK_LEFT_4TO7()
      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
                              &p3_l, &p2_l, &p1_l, &p0_l,
                              &q0_l, &q1_l, &q2_l, &q3_l,
                              &q4_l, &q5_l, &q6_l, &q7_l);

      PACK_RIGHT_4TO7()
      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
                              &p3_r, &p2_r, &p1_r, &p0_r,
                              &q0_r, &q1_r, &q2_r, &q3_r,
                              &q4_r, &q5_r, &q6_r, &q7_r);

      /* Lane 0 (byte offset 0). */
      if (mask & flat & flat2 & 0x000000FF) {
        __asm__ __volatile__ (
            "sb         %[p6_r],  (%[sp6])    \n\t"
            "sb         %[p5_r],  (%[sp5])    \n\t"
            "sb         %[p4_r],  (%[sp4])    \n\t"
            "sb         %[p3_r],  (%[sp3])    \n\t"
            "sb         %[p2_r],  (%[sp2])    \n\t"
            "sb         %[p1_r],  (%[sp1])    \n\t"
            "sb         %[p0_r],  (%[sp0])    \n\t"

            :
            : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
              [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
              [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4),
              [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1),
              [p0_r] "r" (p0_r), [sp0] "r" (sp0)
        );

        __asm__ __volatile__ (
            "sb         %[q0_r],  (%[sq0])    \n\t"
            "sb         %[q1_r],  (%[sq1])    \n\t"
            "sb         %[q2_r],  (%[sq2])    \n\t"
            "sb         %[q3_r],  (%[sq3])    \n\t"
            "sb         %[q4_r],  (%[sq4])    \n\t"
            "sb         %[q5_r],  (%[sq5])    \n\t"
            "sb         %[q6_r],  (%[sq6])    \n\t"

            :
            : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
              [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
              [q6_r] "r" (q6_r),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2),
              [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
              [sq6] "r" (sq6)
        );
      } else if (mask & flat & 0x000000FF) {
        __asm__ __volatile__ (
            "sb         %[p2_r_f1],  (%[sp2])    \n\t"
            "sb         %[p1_r_f1],  (%[sp1])    \n\t"
            "sb         %[p0_r_f1],  (%[sp0])    \n\t"
            "sb         %[q0_r_f1],  (%[sq0])    \n\t"
            "sb         %[q1_r_f1],  (%[sq1])    \n\t"
            "sb         %[q2_r_f1],  (%[sq2])    \n\t"

            :
            : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
              [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
              [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
      } else if (mask & 0x000000FF) {
        __asm__ __volatile__ (
            "sb         %[p1_f0],  (%[sp1])    \n\t"
            "sb         %[p0_f0],  (%[sp0])    \n\t"
            "sb         %[q0_f0],  (%[sq0])    \n\t"
            "sb         %[q1_f0],  (%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
              [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
      }

      /* Shift all wide-filter _r results to expose lane 1. */
      __asm__ __volatile__ (
          "srl        %[p6_r], %[p6_r], 16     \n\t"
          "srl        %[p5_r], %[p5_r], 16     \n\t"
          "srl        %[p4_r], %[p4_r], 16     \n\t"
          "srl        %[p3_r], %[p3_r], 16     \n\t"
          "srl        %[p2_r], %[p2_r], 16     \n\t"
          "srl        %[p1_r], %[p1_r], 16     \n\t"
          "srl        %[p0_r], %[p0_r], 16     \n\t"
          "srl        %[q0_r], %[q0_r], 16     \n\t"
          "srl        %[q1_r], %[q1_r], 16     \n\t"
          "srl        %[q2_r], %[q2_r], 16     \n\t"
          "srl        %[q3_r], %[q3_r], 16     \n\t"
          "srl        %[q4_r], %[q4_r], 16     \n\t"
          "srl        %[q5_r], %[q5_r], 16     \n\t"
          "srl        %[q6_r], %[q6_r], 16     \n\t"

          : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
            [q3_r] "+r" (q3_r), [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r),
            [p6_r] "+r" (p6_r), [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r),
            [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r),
            [q6_r] "+r" (q6_r), [p0_r] "+r" (p0_r)
          :
      );

      /* Likewise advance the f1 halfword lanes and f0 byte lanes. */
      __asm__ __volatile__ (
          "srl        %[p2_r_f1], %[p2_r_f1], 16     \n\t"
          "srl        %[p1_r_f1], %[p1_r_f1], 16     \n\t"
          "srl        %[p0_r_f1], %[p0_r_f1], 16     \n\t"
          "srl        %[q0_r_f1], %[q0_r_f1], 16     \n\t"
          "srl        %[q1_r_f1], %[q1_r_f1], 16     \n\t"
          "srl        %[q2_r_f1], %[q2_r_f1], 16     \n\t"
          "srl        %[p1_f0],   %[p1_f0],   8      \n\t"
          "srl        %[p0_f0],   %[p0_f0],   8      \n\t"
          "srl        %[q0_f0],   %[q0_f0],   8      \n\t"
          "srl        %[q1_f0],   %[q1_f0],   8      \n\t"

          : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1),
            [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1),
            [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      /* Lane 1 (byte offset +1). */
      if (mask & flat & flat2 & 0x0000FF00) {
        __asm__ __volatile__ (
            "sb         %[p6_r],  +1(%[sp6])    \n\t"
            "sb         %[p5_r],  +1(%[sp5])    \n\t"
            "sb         %[p4_r],  +1(%[sp4])    \n\t"
            "sb         %[p3_r],  +1(%[sp3])    \n\t"
            "sb         %[p2_r],  +1(%[sp2])    \n\t"
            "sb         %[p1_r],  +1(%[sp1])    \n\t"
            "sb         %[p0_r],  +1(%[sp0])    \n\t"

            :
            : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
              [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
              [p0_r] "r" (p0_r), [sp6] "r" (sp6), [sp5] "r" (sp5),
              [sp4] "r" (sp4), [sp3] "r" (sp3),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
        );

        __asm__ __volatile__ (
            "sb         %[q0_r],  +1(%[sq0])    \n\t"
            "sb         %[q1_r],  +1(%[sq1])    \n\t"
            "sb         %[q2_r],  +1(%[sq2])    \n\t"
            "sb         %[q3_r],  +1(%[sq3])    \n\t"
            "sb         %[q4_r],  +1(%[sq4])    \n\t"
            "sb         %[q5_r],  +1(%[sq5])    \n\t"
            "sb         %[q6_r],  +1(%[sq6])    \n\t"

            :
            : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
              [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
              [q6_r] "r" (q6_r), [sq0] "r" (sq0), [sq1] "r" (sq1),
              [sq2] "r" (sq2), [sq3] "r" (sq3),
              [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
        );
      } else if (mask & flat & 0x0000FF00) {
        __asm__ __volatile__ (
            "sb         %[p2_r_f1],  +1(%[sp2])    \n\t"
            "sb         %[p1_r_f1],  +1(%[sp1])    \n\t"
            "sb         %[p0_r_f1],  +1(%[sp0])    \n\t"
            "sb         %[q0_r_f1],  +1(%[sq0])    \n\t"
            "sb         %[q1_r_f1],  +1(%[sq1])    \n\t"
            "sb         %[q2_r_f1],  +1(%[sq2])    \n\t"

            :
            : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
              [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
              [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
      } else if (mask & 0x0000FF00) {
        __asm__ __volatile__ (
            "sb         %[p1_f0],  +1(%[sp1])    \n\t"
            "sb         %[p0_f0],  +1(%[sp0])    \n\t"
            "sb         %[q0_f0],  +1(%[sq0])    \n\t"
            "sb         %[q1_f0],  +1(%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
              [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
      }

      /* Advance f0 values to lane 2; lanes 2-3 of f1/f2 use _l values. */
      __asm__ __volatile__ (
          "srl        %[p1_f0], %[p1_f0], 8     \n\t"
          "srl        %[p0_f0], %[p0_f0], 8     \n\t"
          "srl        %[q0_f0], %[q0_f0], 8     \n\t"
          "srl        %[q1_f0], %[q1_f0], 8     \n\t"

          : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      /* Lane 2 (byte offset +2). */
      if (mask & flat & flat2 & 0x00FF0000) {
        __asm__ __volatile__ (
            "sb         %[p6_l],  +2(%[sp6])    \n\t"
            "sb         %[p5_l],  +2(%[sp5])    \n\t"
            "sb         %[p4_l],  +2(%[sp4])    \n\t"
            "sb         %[p3_l],  +2(%[sp3])    \n\t"
            "sb         %[p2_l],  +2(%[sp2])    \n\t"
            "sb         %[p1_l],  +2(%[sp1])    \n\t"
            "sb         %[p0_l],  +2(%[sp0])    \n\t"

            :
            : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
              [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
              [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5),
              [sp4] "r" (sp4), [sp3] "r" (sp3),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
        );

        __asm__ __volatile__ (
            "sb         %[q0_l],  +2(%[sq0])    \n\t"
            "sb         %[q1_l],  +2(%[sq1])    \n\t"
            "sb         %[q2_l],  +2(%[sq2])    \n\t"
            "sb         %[q3_l],  +2(%[sq3])    \n\t"
            "sb         %[q4_l],  +2(%[sq4])    \n\t"
            "sb         %[q5_l],  +2(%[sq5])    \n\t"
            "sb         %[q6_l],  +2(%[sq6])    \n\t"

            :
            : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
              [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
              [q6_l] "r" (q6_l), [sq0] "r" (sq0), [sq1] "r" (sq1),
              [sq2] "r" (sq2), [sq3] "r" (sq3),
              [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
        );
      } else if (mask & flat & 0x00FF0000) {
        __asm__ __volatile__ (
            "sb         %[p2_l_f1],  +2(%[sp2])    \n\t"
            "sb         %[p1_l_f1],  +2(%[sp1])    \n\t"
            "sb         %[p0_l_f1],  +2(%[sp0])    \n\t"
            "sb         %[q0_l_f1],  +2(%[sq0])    \n\t"
            "sb         %[q1_l_f1],  +2(%[sq1])    \n\t"
            "sb         %[q2_l_f1],  +2(%[sq2])    \n\t"

            :
            : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
              [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
              [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
      } else if (mask & 0x00FF0000) {
        __asm__ __volatile__ (
            "sb         %[p1_f0],  +2(%[sp1])    \n\t"
            "sb         %[p0_f0],  +2(%[sp0])    \n\t"
            "sb         %[q0_f0],  +2(%[sq0])    \n\t"
            "sb         %[q1_f0],  +2(%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
              [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
      }

      /* Shift all wide-filter _l results to expose lane 3. */
      __asm__ __volatile__ (
          "srl      %[p6_l],    %[p6_l],    16   \n\t"
          "srl      %[p5_l],    %[p5_l],    16   \n\t"
          "srl      %[p4_l],    %[p4_l],    16   \n\t"
          "srl      %[p3_l],    %[p3_l],    16   \n\t"
          "srl      %[p2_l],    %[p2_l],    16   \n\t"
          "srl      %[p1_l],    %[p1_l],    16   \n\t"
          "srl      %[p0_l],    %[p0_l],    16   \n\t"
          "srl      %[q0_l],    %[q0_l],    16   \n\t"
          "srl      %[q1_l],    %[q1_l],    16   \n\t"
          "srl      %[q2_l],    %[q2_l],    16   \n\t"
          "srl      %[q3_l],    %[q3_l],    16   \n\t"
          "srl      %[q4_l],    %[q4_l],    16   \n\t"
          "srl      %[q5_l],    %[q5_l],    16   \n\t"
          "srl      %[q6_l],    %[q6_l],    16   \n\t"

          : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
            [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l),
            [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l),
            [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l),
            [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l)
          :
      );

      /* Advance the f1 _l halfword lanes and f0 byte lanes to lane 3. */
      __asm__ __volatile__ (
          "srl      %[p2_l_f1],   %[p2_l_f1],   16   \n\t"
          "srl      %[p1_l_f1],   %[p1_l_f1],   16   \n\t"
          "srl      %[p0_l_f1],   %[p0_l_f1],   16   \n\t"
          "srl      %[q0_l_f1],   %[q0_l_f1],   16   \n\t"
          "srl      %[q1_l_f1],   %[q1_l_f1],   16   \n\t"
          "srl      %[q2_l_f1],   %[q2_l_f1],   16   \n\t"
          "srl      %[p1_f0],     %[p1_f0],     8    \n\t"
          "srl      %[p0_f0],     %[p0_f0],     8    \n\t"
          "srl      %[q0_f0],     %[q0_f0],     8    \n\t"
          "srl      %[q1_f0],     %[q1_f0],     8    \n\t"

          : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1),
            [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1),
            [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      /* Lane 3 (byte offset +3). */
      if (mask & flat & flat2 & 0xFF000000) {
        __asm__ __volatile__ (
            "sb     %[p6_l],    +3(%[sp6])    \n\t"
            "sb     %[p5_l],    +3(%[sp5])    \n\t"
            "sb     %[p4_l],    +3(%[sp4])    \n\t"
            "sb     %[p3_l],    +3(%[sp3])    \n\t"
            "sb     %[p2_l],    +3(%[sp2])    \n\t"
            "sb     %[p1_l],    +3(%[sp1])    \n\t"
            "sb     %[p0_l],    +3(%[sp0])    \n\t"

            :
            : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
              [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
              [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5),
              [sp4] "r" (sp4), [sp3] "r" (sp3), [sp2] "r" (sp2),
              [sp1] "r" (sp1), [sp0] "r" (sp0)
        );

        __asm__ __volatile__ (
            "sb     %[q0_l],    +3(%[sq0])    \n\t"
            "sb     %[q1_l],    +3(%[sq1])    \n\t"
            "sb     %[q2_l],    +3(%[sq2])    \n\t"
            "sb     %[q3_l],    +3(%[sq3])    \n\t"
            "sb     %[q4_l],    +3(%[sq4])    \n\t"
            "sb     %[q5_l],    +3(%[sq5])    \n\t"
            "sb     %[q6_l],    +3(%[sq6])    \n\t"

            :
            : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l),
              [q2_l] "r" (q2_l), [q3_l] "r" (q3_l),
              [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2),
              [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
              [q6_l] "r" (q6_l), [sq6] "r" (sq6)
        );
      } else if (mask & flat & 0xFF000000) {
        __asm__ __volatile__ (
            "sb     %[p2_l_f1],     +3(%[sp2])    \n\t"
            "sb     %[p1_l_f1],     +3(%[sp1])    \n\t"
            "sb     %[p0_l_f1],     +3(%[sp0])    \n\t"
            "sb     %[q0_l_f1],     +3(%[sq0])    \n\t"
            "sb     %[q1_l_f1],     +3(%[sq1])    \n\t"
            "sb     %[q2_l_f1],     +3(%[sq2])    \n\t"

            :
            : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
              [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
              [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
      } else if (mask & 0xFF000000) {
        __asm__ __volatile__ (
            "sb     %[p1_f0],   +3(%[sp1])    \n\t"
            "sb     %[p0_f0],   +3(%[sp0])    \n\t"
            "sb     %[q0_f0],   +3(%[sq0])    \n\t"
            "sb     %[q1_f0],   +3(%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
      }
    }

    /* Advance to the next 4-pixel-wide segment. */
    s = s + 4;
  }
}
795#endif  // #if HAVE_DSPR2
796