1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <stdlib.h>
12
13#include "./vpx_dsp_rtcd.h"
14#include "vpx/vpx_integer.h"
15#include "vpx_dsp/mips/common_dspr2.h"
16#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
17#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
18#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
19#include "vpx_mem/vpx_mem.h"
20
21#if HAVE_DSPR2
22void vpx_lpf_horizontal_16_dspr2(unsigned char *s,
23                                 int pitch,
24                                 const uint8_t *blimit,
25                                 const uint8_t *limit,
26                                 const uint8_t *thresh,
27                                 int count) {
28  uint32_t  mask;
29  uint32_t  hev, flat, flat2;
30  uint8_t   i;
31  uint8_t   *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0;
32  uint8_t   *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7;
33  uint32_t  thresh_vec, flimit_vec, limit_vec;
34  uint32_t  uflimit, ulimit, uthresh;
35  uint32_t  p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
36  uint32_t  p1_f0, p0_f0, q0_f0, q1_f0;
37  uint32_t  p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
38  uint32_t  q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
39  uint32_t  p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
40  uint32_t  q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
41  uint32_t  p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
42  uint32_t  q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
43
44  uflimit = *blimit;
45  ulimit  = *limit;
46  uthresh = *thresh;
47
48  /* create quad-byte */
49  __asm__ __volatile__ (
50      "replv.qb       %[thresh_vec],    %[uthresh]      \n\t"
51      "replv.qb       %[flimit_vec],    %[uflimit]      \n\t"
52      "replv.qb       %[limit_vec],     %[ulimit]       \n\t"
53
54      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
55        [limit_vec] "=r" (limit_vec)
56      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
57  );
58
59  /* prefetch data for store */
60  prefetch_store(s);
61
62  for (i = 0; i < (2 * count); i++) {
63    sp7 = s - (pitch << 3);
64    sp6 = sp7 + pitch;
65    sp5 = sp6 + pitch;
66    sp4 = sp5 + pitch;
67    sp3 = sp4 + pitch;
68    sp2 = sp3 + pitch;
69    sp1 = sp2 + pitch;
70    sp0 = sp1 + pitch;
71    sq0 = s;
72    sq1 = s + pitch;
73    sq2 = sq1 + pitch;
74    sq3 = sq2 + pitch;
75    sq4 = sq3 + pitch;
76    sq5 = sq4 + pitch;
77    sq6 = sq5 + pitch;
78    sq7 = sq6 + pitch;
79
80    __asm__ __volatile__ (
81        "lw     %[p7],      (%[sp7])            \n\t"
82        "lw     %[p6],      (%[sp6])            \n\t"
83        "lw     %[p5],      (%[sp5])            \n\t"
84        "lw     %[p4],      (%[sp4])            \n\t"
85        "lw     %[p3],      (%[sp3])            \n\t"
86        "lw     %[p2],      (%[sp2])            \n\t"
87        "lw     %[p1],      (%[sp1])            \n\t"
88        "lw     %[p0],      (%[sp0])            \n\t"
89
90        : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
91          [p7] "=&r" (p7), [p6] "=&r" (p6), [p5] "=&r" (p5), [p4] "=&r" (p4)
92        : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
93          [sp4] "r" (sp4), [sp5] "r" (sp5), [sp6] "r" (sp6), [sp7] "r" (sp7)
94    );
95
96    __asm__ __volatile__ (
97        "lw     %[q0],      (%[sq0])            \n\t"
98        "lw     %[q1],      (%[sq1])            \n\t"
99        "lw     %[q2],      (%[sq2])            \n\t"
100        "lw     %[q3],      (%[sq3])            \n\t"
101        "lw     %[q4],      (%[sq4])            \n\t"
102        "lw     %[q5],      (%[sq5])            \n\t"
103        "lw     %[q6],      (%[sq6])            \n\t"
104        "lw     %[q7],      (%[sq7])            \n\t"
105
106        : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0),
107          [q7] "=&r" (q7), [q6] "=&r" (q6), [q5] "=&r" (q5), [q4] "=&r" (q4)
108        : [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0),
109          [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6), [sq7] "r" (sq7)
110    );
111
112    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
113                                    p1, p0, p3, p2, q0, q1, q2, q3,
114                                    &hev, &mask, &flat);
115
116    flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
117
118    /* f0 */
119    if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
120        ((flat2 != 0) && (flat == 0) && (mask != 0))) {
121      filter1_dspr2(mask, hev, p1, p0, q0, q1,
122                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
123
124      __asm__ __volatile__ (
125          "sw       %[p1_f0],   (%[sp1])            \n\t"
126          "sw       %[p0_f0],   (%[sp0])            \n\t"
127          "sw       %[q0_f0],   (%[sq0])            \n\t"
128          "sw       %[q1_f0],   (%[sq1])            \n\t"
129
130          :
131          : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
132            [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
133            [sp1] "r" (sp1), [sp0] "r" (sp0),
134            [sq0] "r" (sq0), [sq1] "r" (sq1)
135      );
136    } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
137               (mask == 0xFFFFFFFF)) {
138      /* f2 */
139      PACK_LEFT_0TO3()
140      PACK_LEFT_4TO7()
141      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
142                          &p3_l, &p2_l, &p1_l, &p0_l,
143                          &q0_l, &q1_l, &q2_l, &q3_l,
144                          &q4_l, &q5_l, &q6_l, &q7_l);
145
146      PACK_RIGHT_0TO3()
147      PACK_RIGHT_4TO7()
148      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
149                          &p3_r, &p2_r, &p1_r, &p0_r,
150                          &q0_r, &q1_r, &q2_r, &q3_r,
151                          &q4_r, &q5_r, &q6_r, &q7_r);
152
153      COMBINE_LEFT_RIGHT_0TO2()
154      COMBINE_LEFT_RIGHT_3TO6()
155
156      __asm__ __volatile__ (
157          "sw         %[p6], (%[sp6])    \n\t"
158          "sw         %[p5], (%[sp5])    \n\t"
159          "sw         %[p4], (%[sp4])    \n\t"
160          "sw         %[p3], (%[sp3])    \n\t"
161          "sw         %[p2], (%[sp2])    \n\t"
162          "sw         %[p1], (%[sp1])    \n\t"
163          "sw         %[p0], (%[sp0])    \n\t"
164
165          :
166          : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3),
167            [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
168            [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), [sp3] "r" (sp3),
169            [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
170      );
171
172      __asm__ __volatile__ (
173          "sw         %[q6], (%[sq6])    \n\t"
174          "sw         %[q5], (%[sq5])    \n\t"
175          "sw         %[q4], (%[sq4])    \n\t"
176          "sw         %[q3], (%[sq3])    \n\t"
177          "sw         %[q2], (%[sq2])    \n\t"
178          "sw         %[q1], (%[sq1])    \n\t"
179          "sw         %[q0], (%[sq0])    \n\t"
180
181          :
182          : [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4), [q3] "r" (q3),
183            [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
184            [sq6] "r" (sq6), [sq5] "r" (sq5), [sq4] "r" (sq4), [sq3] "r" (sq3),
185            [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
186      );
187    } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
188      /* f1 */
189      /* left 2 element operation */
190      PACK_LEFT_0TO3()
191      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
192                     &q0_l, &q1_l, &q2_l, &q3_l);
193
194      /* right 2 element operation */
195      PACK_RIGHT_0TO3()
196      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
197                     &q0_r, &q1_r, &q2_r, &q3_r);
198
199      COMBINE_LEFT_RIGHT_0TO2()
200
201      __asm__ __volatile__ (
202          "sw         %[p2], (%[sp2])    \n\t"
203          "sw         %[p1], (%[sp1])    \n\t"
204          "sw         %[p0], (%[sp0])    \n\t"
205          "sw         %[q0], (%[sq0])    \n\t"
206          "sw         %[q1], (%[sq1])    \n\t"
207          "sw         %[q2], (%[sq2])    \n\t"
208
209          :
210          : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
211            [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2),
212            [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
213            [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
214      );
215    } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
216      /* f0+f1 */
217      filter1_dspr2(mask, hev, p1, p0, q0, q1,
218                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
219
220      /* left 2 element operation */
221      PACK_LEFT_0TO3()
222      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
223                     &q0_l, &q1_l, &q2_l, &q3_l);
224
225      /* right 2 element operation */
226      PACK_RIGHT_0TO3()
227      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
228                     &q0_r, &q1_r, &q2_r, &q3_r);
229
230      if (mask & flat & 0x000000FF) {
231        __asm__ __volatile__ (
232            "sb         %[p2_r],  (%[sp2])    \n\t"
233            "sb         %[p1_r],  (%[sp1])    \n\t"
234            "sb         %[p0_r],  (%[sp0])    \n\t"
235            "sb         %[q0_r],  (%[sq0])    \n\t"
236            "sb         %[q1_r],  (%[sq1])    \n\t"
237            "sb         %[q2_r],  (%[sq2])    \n\t"
238
239            :
240            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
241              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
242              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
243              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
244        );
245      } else if (mask & 0x000000FF) {
246        __asm__ __volatile__ (
247            "sb         %[p1_f0],  (%[sp1])    \n\t"
248            "sb         %[p0_f0],  (%[sp0])    \n\t"
249            "sb         %[q0_f0],  (%[sq0])    \n\t"
250            "sb         %[q1_f0],  (%[sq1])    \n\t"
251
252            :
253            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
254              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
255              [sp1] "r" (sp1), [sp0] "r" (sp0),
256              [sq0] "r" (sq0), [sq1] "r" (sq1)
257        );
258      }
259
260      __asm__ __volatile__ (
261          "srl      %[p2_r],    %[p2_r],    16      \n\t"
262          "srl      %[p1_r],    %[p1_r],    16      \n\t"
263          "srl      %[p0_r],    %[p0_r],    16      \n\t"
264          "srl      %[q0_r],    %[q0_r],    16      \n\t"
265          "srl      %[q1_r],    %[q1_r],    16      \n\t"
266          "srl      %[q2_r],    %[q2_r],    16      \n\t"
267          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
268          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
269          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
270          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
271
272          : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
273            [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
274            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
275            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
276          :
277      );
278
279      if (mask & flat & 0x0000FF00) {
280        __asm__ __volatile__ (
281            "sb         %[p2_r],  +1(%[sp2])    \n\t"
282            "sb         %[p1_r],  +1(%[sp1])    \n\t"
283            "sb         %[p0_r],  +1(%[sp0])    \n\t"
284            "sb         %[q0_r],  +1(%[sq0])    \n\t"
285            "sb         %[q1_r],  +1(%[sq1])    \n\t"
286            "sb         %[q2_r],  +1(%[sq2])    \n\t"
287
288            :
289            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
290              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
291              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
292              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
293        );
294      } else if (mask & 0x0000FF00) {
295        __asm__ __volatile__ (
296            "sb         %[p1_f0],  +1(%[sp1])    \n\t"
297            "sb         %[p0_f0],  +1(%[sp0])    \n\t"
298            "sb         %[q0_f0],  +1(%[sq0])    \n\t"
299            "sb         %[q1_f0],  +1(%[sq1])    \n\t"
300
301            :
302            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
303              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
304              [sp1] "r" (sp1), [sp0] "r" (sp0),
305              [sq0] "r" (sq0), [sq1] "r" (sq1)
306        );
307      }
308
309      __asm__ __volatile__ (
310          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
311          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
312          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
313          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
314
315          : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
316            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
317          :
318      );
319
320      if (mask & flat & 0x00FF0000) {
321        __asm__ __volatile__ (
322            "sb         %[p2_l],  +2(%[sp2])    \n\t"
323            "sb         %[p1_l],  +2(%[sp1])    \n\t"
324            "sb         %[p0_l],  +2(%[sp0])    \n\t"
325            "sb         %[q0_l],  +2(%[sq0])    \n\t"
326            "sb         %[q1_l],  +2(%[sq1])    \n\t"
327            "sb         %[q2_l],  +2(%[sq2])    \n\t"
328
329            :
330            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
331              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
332              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
333              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
334        );
335      } else if (mask & 0x00FF0000) {
336        __asm__ __volatile__ (
337            "sb         %[p1_f0],  +2(%[sp1])    \n\t"
338            "sb         %[p0_f0],  +2(%[sp0])    \n\t"
339            "sb         %[q0_f0],  +2(%[sq0])    \n\t"
340            "sb         %[q1_f0],  +2(%[sq1])    \n\t"
341
342            :
343            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
344              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
345              [sp1] "r" (sp1), [sp0] "r" (sp0),
346              [sq0] "r" (sq0), [sq1] "r" (sq1)
347        );
348      }
349
350      __asm__ __volatile__ (
351          "srl      %[p2_l],    %[p2_l],    16      \n\t"
352          "srl      %[p1_l],    %[p1_l],    16      \n\t"
353          "srl      %[p0_l],    %[p0_l],    16      \n\t"
354          "srl      %[q0_l],    %[q0_l],    16      \n\t"
355          "srl      %[q1_l],    %[q1_l],    16      \n\t"
356          "srl      %[q2_l],    %[q2_l],    16      \n\t"
357          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
358          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
359          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
360          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
361
362          : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
363            [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
364            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
365            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
366          :
367      );
368
369      if (mask & flat & 0xFF000000) {
370        __asm__ __volatile__ (
371            "sb         %[p2_l],  +3(%[sp2])    \n\t"
372            "sb         %[p1_l],  +3(%[sp1])    \n\t"
373            "sb         %[p0_l],  +3(%[sp0])    \n\t"
374            "sb         %[q0_l],  +3(%[sq0])    \n\t"
375            "sb         %[q1_l],  +3(%[sq1])    \n\t"
376            "sb         %[q2_l],  +3(%[sq2])    \n\t"
377
378            :
379            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
380              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
381              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
382              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
383        );
384      } else if (mask & 0xFF000000) {
385        __asm__ __volatile__ (
386            "sb         %[p1_f0],  +3(%[sp1])    \n\t"
387            "sb         %[p0_f0],  +3(%[sp0])    \n\t"
388            "sb         %[q0_f0],  +3(%[sq0])    \n\t"
389            "sb         %[q1_f0],  +3(%[sq1])    \n\t"
390
391            :
392            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
393              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
394              [sp1] "r" (sp1), [sp0] "r" (sp0),
395              [sq0] "r" (sq0), [sq1] "r" (sq1)
396        );
397      }
398    } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
399      /* f0 + f1 + f2 */
400      /* f0  function */
401      filter1_dspr2(mask, hev, p1, p0, q0, q1,
402                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
403
404      /* f1  function */
405      /* left 2 element operation */
406      PACK_LEFT_0TO3()
407      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
408                      q0_l, q1_l, q2_l, q3_l,
409                      &p2_l_f1, &p1_l_f1, &p0_l_f1,
410                      &q0_l_f1, &q1_l_f1, &q2_l_f1);
411
412      /* right 2 element operation */
413      PACK_RIGHT_0TO3()
414      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
415                      q0_r, q1_r, q2_r, q3_r,
416                      &p2_r_f1, &p1_r_f1, &p0_r_f1,
417                      &q0_r_f1, &q1_r_f1, &q2_r_f1);
418
419      /* f2  function */
420      PACK_LEFT_4TO7()
421      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
422                          &p3_l, &p2_l, &p1_l, &p0_l,
423                          &q0_l, &q1_l, &q2_l, &q3_l,
424                          &q4_l, &q5_l, &q6_l, &q7_l);
425
426      PACK_RIGHT_4TO7()
427      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
428                          &p3_r, &p2_r, &p1_r, &p0_r,
429                          &q0_r, &q1_r, &q2_r, &q3_r,
430                          &q4_r, &q5_r, &q6_r, &q7_r);
431
432      if (mask & flat & flat2 & 0x000000FF) {
433        __asm__ __volatile__ (
434            "sb         %[p6_r],  (%[sp6])    \n\t"
435            "sb         %[p5_r],  (%[sp5])    \n\t"
436            "sb         %[p4_r],  (%[sp4])    \n\t"
437            "sb         %[p3_r],  (%[sp3])    \n\t"
438            "sb         %[p2_r],  (%[sp2])    \n\t"
439            "sb         %[p1_r],  (%[sp1])    \n\t"
440            "sb         %[p0_r],  (%[sp0])    \n\t"
441
442            :
443            : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
444              [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
445              [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4),
446              [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1),
447              [p0_r] "r" (p0_r), [sp0] "r" (sp0)
448        );
449
450        __asm__ __volatile__ (
451            "sb         %[q0_r],  (%[sq0])    \n\t"
452            "sb         %[q1_r],  (%[sq1])    \n\t"
453            "sb         %[q2_r],  (%[sq2])    \n\t"
454            "sb         %[q3_r],  (%[sq3])    \n\t"
455            "sb         %[q4_r],  (%[sq4])    \n\t"
456            "sb         %[q5_r],  (%[sq5])    \n\t"
457            "sb         %[q6_r],  (%[sq6])    \n\t"
458
459            :
460            : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
461              [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
462              [q6_r] "r" (q6_r),
463              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2),
464              [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
465              [sq6] "r" (sq6)
466        );
467      } else if (mask & flat & 0x000000FF) {
468        __asm__ __volatile__ (
469            "sb         %[p2_r_f1],  (%[sp2])    \n\t"
470            "sb         %[p1_r_f1],  (%[sp1])    \n\t"
471            "sb         %[p0_r_f1],  (%[sp0])    \n\t"
472            "sb         %[q0_r_f1],  (%[sq0])    \n\t"
473            "sb         %[q1_r_f1],  (%[sq1])    \n\t"
474            "sb         %[q2_r_f1],  (%[sq2])    \n\t"
475
476            :
477            : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
478              [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
479              [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
480              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
481              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
482        );
483      } else if (mask & 0x000000FF) {
484        __asm__ __volatile__ (
485            "sb         %[p1_f0],  (%[sp1])    \n\t"
486            "sb         %[p0_f0],  (%[sp0])    \n\t"
487            "sb         %[q0_f0],  (%[sq0])    \n\t"
488            "sb         %[q1_f0],  (%[sq1])    \n\t"
489
490            :
491            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
492              [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
493              [sq0] "r" (sq0), [sq1] "r" (sq1)
494        );
495      }
496
497      __asm__ __volatile__ (
498          "srl        %[p6_r], %[p6_r], 16     \n\t"
499          "srl        %[p5_r], %[p5_r], 16     \n\t"
500          "srl        %[p4_r], %[p4_r], 16     \n\t"
501          "srl        %[p3_r], %[p3_r], 16     \n\t"
502          "srl        %[p2_r], %[p2_r], 16     \n\t"
503          "srl        %[p1_r], %[p1_r], 16     \n\t"
504          "srl        %[p0_r], %[p0_r], 16     \n\t"
505          "srl        %[q0_r], %[q0_r], 16     \n\t"
506          "srl        %[q1_r], %[q1_r], 16     \n\t"
507          "srl        %[q2_r], %[q2_r], 16     \n\t"
508          "srl        %[q3_r], %[q3_r], 16     \n\t"
509          "srl        %[q4_r], %[q4_r], 16     \n\t"
510          "srl        %[q5_r], %[q5_r], 16     \n\t"
511          "srl        %[q6_r], %[q6_r], 16     \n\t"
512
513          : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
514            [q3_r] "+r" (q3_r), [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r),
515            [p6_r] "+r" (p6_r), [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r),
516            [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r),
517            [q6_r] "+r" (q6_r), [p0_r] "+r" (p0_r)
518          :
519      );
520
521      __asm__ __volatile__ (
522          "srl        %[p2_r_f1], %[p2_r_f1], 16     \n\t"
523          "srl        %[p1_r_f1], %[p1_r_f1], 16     \n\t"
524          "srl        %[p0_r_f1], %[p0_r_f1], 16     \n\t"
525          "srl        %[q0_r_f1], %[q0_r_f1], 16     \n\t"
526          "srl        %[q1_r_f1], %[q1_r_f1], 16     \n\t"
527          "srl        %[q2_r_f1], %[q2_r_f1], 16     \n\t"
528          "srl        %[p1_f0],   %[p1_f0],   8      \n\t"
529          "srl        %[p0_f0],   %[p0_f0],   8      \n\t"
530          "srl        %[q0_f0],   %[q0_f0],   8      \n\t"
531          "srl        %[q1_f0],   %[q1_f0],   8      \n\t"
532
533          : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1),
534            [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1),
535            [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1),
536            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
537            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
538          :
539      );
540
541      if (mask & flat & flat2 & 0x0000FF00) {
542        __asm__ __volatile__ (
543            "sb         %[p6_r],  +1(%[sp6])    \n\t"
544            "sb         %[p5_r],  +1(%[sp5])    \n\t"
545            "sb         %[p4_r],  +1(%[sp4])    \n\t"
546            "sb         %[p3_r],  +1(%[sp3])    \n\t"
547            "sb         %[p2_r],  +1(%[sp2])    \n\t"
548            "sb         %[p1_r],  +1(%[sp1])    \n\t"
549            "sb         %[p0_r],  +1(%[sp0])    \n\t"
550
551            :
552            : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
553              [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
554              [p0_r] "r" (p0_r), [sp6] "r" (sp6), [sp5] "r" (sp5),
555              [sp4] "r" (sp4), [sp3] "r" (sp3),
556              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
557        );
558
559        __asm__ __volatile__ (
560            "sb         %[q0_r],  +1(%[sq0])    \n\t"
561            "sb         %[q1_r],  +1(%[sq1])    \n\t"
562            "sb         %[q2_r],  +1(%[sq2])    \n\t"
563            "sb         %[q3_r],  +1(%[sq3])    \n\t"
564            "sb         %[q4_r],  +1(%[sq4])    \n\t"
565            "sb         %[q5_r],  +1(%[sq5])    \n\t"
566            "sb         %[q6_r],  +1(%[sq6])    \n\t"
567
568            :
569            : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
570              [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
571              [q6_r] "r" (q6_r), [sq0] "r" (sq0), [sq1] "r" (sq1),
572              [sq2] "r" (sq2), [sq3] "r" (sq3),
573              [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
574        );
575      } else if (mask & flat & 0x0000FF00) {
576        __asm__ __volatile__ (
577            "sb         %[p2_r_f1],  +1(%[sp2])    \n\t"
578            "sb         %[p1_r_f1],  +1(%[sp1])    \n\t"
579            "sb         %[p0_r_f1],  +1(%[sp0])    \n\t"
580            "sb         %[q0_r_f1],  +1(%[sq0])    \n\t"
581            "sb         %[q1_r_f1],  +1(%[sq1])    \n\t"
582            "sb         %[q2_r_f1],  +1(%[sq2])    \n\t"
583
584            :
585            : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
586              [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
587              [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
588              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
589              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
590        );
591      } else if (mask & 0x0000FF00) {
592        __asm__ __volatile__ (
593            "sb         %[p1_f0],  +1(%[sp1])    \n\t"
594            "sb         %[p0_f0],  +1(%[sp0])    \n\t"
595            "sb         %[q0_f0],  +1(%[sq0])    \n\t"
596            "sb         %[q1_f0],  +1(%[sq1])    \n\t"
597
598            :
599            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
600              [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
601              [sq0] "r" (sq0), [sq1] "r" (sq1)
602        );
603      }
604
605      __asm__ __volatile__ (
606          "srl        %[p1_f0], %[p1_f0], 8     \n\t"
607          "srl        %[p0_f0], %[p0_f0], 8     \n\t"
608          "srl        %[q0_f0], %[q0_f0], 8     \n\t"
609          "srl        %[q1_f0], %[q1_f0], 8     \n\t"
610
611          : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
612            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
613          :
614      );
615
616      if (mask & flat & flat2 & 0x00FF0000) {
617        __asm__ __volatile__ (
618            "sb         %[p6_l],  +2(%[sp6])    \n\t"
619            "sb         %[p5_l],  +2(%[sp5])    \n\t"
620            "sb         %[p4_l],  +2(%[sp4])    \n\t"
621            "sb         %[p3_l],  +2(%[sp3])    \n\t"
622            "sb         %[p2_l],  +2(%[sp2])    \n\t"
623            "sb         %[p1_l],  +2(%[sp1])    \n\t"
624            "sb         %[p0_l],  +2(%[sp0])    \n\t"
625
626            :
627            : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
628              [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
629              [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5),
630              [sp4] "r" (sp4), [sp3] "r" (sp3),
631              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
632        );
633
634        __asm__ __volatile__ (
635            "sb         %[q0_l],  +2(%[sq0])    \n\t"
636            "sb         %[q1_l],  +2(%[sq1])    \n\t"
637            "sb         %[q2_l],  +2(%[sq2])    \n\t"
638            "sb         %[q3_l],  +2(%[sq3])    \n\t"
639            "sb         %[q4_l],  +2(%[sq4])    \n\t"
640            "sb         %[q5_l],  +2(%[sq5])    \n\t"
641            "sb         %[q6_l],  +2(%[sq6])    \n\t"
642
643            :
644            : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
645              [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
646              [q6_l] "r" (q6_l), [sq0] "r" (sq0), [sq1] "r" (sq1),
647              [sq2] "r" (sq2), [sq3] "r" (sq3),
648              [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
649        );
650      } else if (mask & flat & 0x00FF0000) {
651        __asm__ __volatile__ (
652            "sb         %[p2_l_f1],  +2(%[sp2])    \n\t"
653            "sb         %[p1_l_f1],  +2(%[sp1])    \n\t"
654            "sb         %[p0_l_f1],  +2(%[sp0])    \n\t"
655            "sb         %[q0_l_f1],  +2(%[sq0])    \n\t"
656            "sb         %[q1_l_f1],  +2(%[sq1])    \n\t"
657            "sb         %[q2_l_f1],  +2(%[sq2])    \n\t"
658
659            :
660            : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
661              [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
662              [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
663              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
664              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
665        );
666      } else if (mask & 0x00FF0000) {
667        __asm__ __volatile__ (
668            "sb         %[p1_f0],  +2(%[sp1])    \n\t"
669            "sb         %[p0_f0],  +2(%[sp0])    \n\t"
670            "sb         %[q0_f0],  +2(%[sq0])    \n\t"
671            "sb         %[q1_f0],  +2(%[sq1])    \n\t"
672
673            :
674            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
675              [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
676              [sq0] "r" (sq0), [sq1] "r" (sq1)
677        );
678      }
679
680      __asm__ __volatile__ (
681          "srl      %[p6_l],    %[p6_l],    16   \n\t"
682          "srl      %[p5_l],    %[p5_l],    16   \n\t"
683          "srl      %[p4_l],    %[p4_l],    16   \n\t"
684          "srl      %[p3_l],    %[p3_l],    16   \n\t"
685          "srl      %[p2_l],    %[p2_l],    16   \n\t"
686          "srl      %[p1_l],    %[p1_l],    16   \n\t"
687          "srl      %[p0_l],    %[p0_l],    16   \n\t"
688          "srl      %[q0_l],    %[q0_l],    16   \n\t"
689          "srl      %[q1_l],    %[q1_l],    16   \n\t"
690          "srl      %[q2_l],    %[q2_l],    16   \n\t"
691          "srl      %[q3_l],    %[q3_l],    16   \n\t"
692          "srl      %[q4_l],    %[q4_l],    16   \n\t"
693          "srl      %[q5_l],    %[q5_l],    16   \n\t"
694          "srl      %[q6_l],    %[q6_l],    16   \n\t"
695
696          : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
697            [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l),
698            [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l),
699            [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l),
700            [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l)
701          :
702      );
703
704      __asm__ __volatile__ (
705          "srl      %[p2_l_f1],   %[p2_l_f1],   16   \n\t"
706          "srl      %[p1_l_f1],   %[p1_l_f1],   16   \n\t"
707          "srl      %[p0_l_f1],   %[p0_l_f1],   16   \n\t"
708          "srl      %[q0_l_f1],   %[q0_l_f1],   16   \n\t"
709          "srl      %[q1_l_f1],   %[q1_l_f1],   16   \n\t"
710          "srl      %[q2_l_f1],   %[q2_l_f1],   16   \n\t"
711          "srl      %[p1_f0],     %[p1_f0],     8    \n\t"
712          "srl      %[p0_f0],     %[p0_f0],     8    \n\t"
713          "srl      %[q0_f0],     %[q0_f0],     8    \n\t"
714          "srl      %[q1_f0],     %[q1_f0],     8    \n\t"
715
716          : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1),
717            [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1),
718            [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1),
719            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
720            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
721          :
722      );
723
724      if (mask & flat & flat2 & 0xFF000000) {
725        __asm__ __volatile__ (
726            "sb     %[p6_l],    +3(%[sp6])    \n\t"
727            "sb     %[p5_l],    +3(%[sp5])    \n\t"
728            "sb     %[p4_l],    +3(%[sp4])    \n\t"
729            "sb     %[p3_l],    +3(%[sp3])    \n\t"
730            "sb     %[p2_l],    +3(%[sp2])    \n\t"
731            "sb     %[p1_l],    +3(%[sp1])    \n\t"
732            "sb     %[p0_l],    +3(%[sp0])    \n\t"
733
734            :
735            : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
736              [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
737              [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5),
738              [sp4] "r" (sp4), [sp3] "r" (sp3), [sp2] "r" (sp2),
739              [sp1] "r" (sp1), [sp0] "r" (sp0)
740        );
741
742        __asm__ __volatile__ (
743            "sb     %[q0_l],    +3(%[sq0])    \n\t"
744            "sb     %[q1_l],    +3(%[sq1])    \n\t"
745            "sb     %[q2_l],    +3(%[sq2])    \n\t"
746            "sb     %[q3_l],    +3(%[sq3])    \n\t"
747            "sb     %[q4_l],    +3(%[sq4])    \n\t"
748            "sb     %[q5_l],    +3(%[sq5])    \n\t"
749            "sb     %[q6_l],    +3(%[sq6])    \n\t"
750
751            :
752            : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l),
753              [q2_l] "r" (q2_l), [q3_l] "r" (q3_l),
754              [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
755              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2),
756              [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
757              [q6_l] "r" (q6_l), [sq6] "r" (sq6)
758        );
759      } else if (mask & flat & 0xFF000000) {
760        __asm__ __volatile__ (
761            "sb     %[p2_l_f1],     +3(%[sp2])    \n\t"
762            "sb     %[p1_l_f1],     +3(%[sp1])    \n\t"
763            "sb     %[p0_l_f1],     +3(%[sp0])    \n\t"
764            "sb     %[q0_l_f1],     +3(%[sq0])    \n\t"
765            "sb     %[q1_l_f1],     +3(%[sq1])    \n\t"
766            "sb     %[q2_l_f1],     +3(%[sq2])    \n\t"
767
768            :
769            : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
770              [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
771              [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
772              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
773              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
774        );
775      } else if (mask & 0xFF000000) {
776        __asm__ __volatile__ (
777            "sb     %[p1_f0],   +3(%[sp1])    \n\t"
778            "sb     %[p0_f0],   +3(%[sp0])    \n\t"
779            "sb     %[q0_f0],   +3(%[sq0])    \n\t"
780            "sb     %[q1_f0],   +3(%[sq1])    \n\t"
781
782            :
783            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
784              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
785              [sp1] "r" (sp1), [sp0] "r" (sp0),
786              [sq0] "r" (sq0), [sq1] "r" (sq1)
787        );
788      }
789    }
790
791    s = s + 4;
792  }
793}
794#endif  // #if HAVE_DSPR2
795