// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of speed-critical encoding functions.
//
// Author(s): Darko Laus (darko.laus@imgtec.com)
//            Mirko Raus (mirko.raus@imgtec.com)
14
15#include "src/dsp/dsp.h"
16
17#if defined(WEBP_USE_MIPS_DSP_R2)
18
19#include "src/dsp/mips_macro.h"
20#include "src/enc/cost_enc.h"
21#include "src/enc/vp8i_enc.h"
22
// Multiplier constants handed to the inline assembly of ITransformOne() as
// the [kC1] and [kC2] operands. kC1 evaluates to 85627 (20091 + 65536).
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
25
// Emits four add/subtract pairs on packed halfwords (addq.ph / subq.ph).
// For k = 0..3, in this order:
//   O(2k)   = I(2k) + I(2k+1)
//   O(2k+1) = I(2k) - I(2k+1)
// Each argument is the name of an asm operand (it is stringified into
// "%[name]"). Because the eight instructions are emitted strictly in
// sequence, an output may reuse the name of an input only when no later
// instruction of the macro still reads that input.
// O - output
// I - input (macro doesn't change it)
#define ADD_SUB_HALVES_X4(O0, O1, O2, O3, O4, O5, O6, O7,                      \
                          I0, I1, I2, I3, I4, I5, I6, I7)                      \
  "addq.ph          %[" #O0 "],   %[" #I0 "],  %[" #I1 "]     \n\t"            \
  "subq.ph          %[" #O1 "],   %[" #I0 "],  %[" #I1 "]     \n\t"            \
  "addq.ph          %[" #O2 "],   %[" #I2 "],  %[" #I3 "]     \n\t"            \
  "subq.ph          %[" #O3 "],   %[" #I2 "],  %[" #I3 "]     \n\t"            \
  "addq.ph          %[" #O4 "],   %[" #I4 "],  %[" #I5 "]     \n\t"            \
  "subq.ph          %[" #O5 "],   %[" #I4 "],  %[" #I5 "]     \n\t"            \
  "addq.ph          %[" #O6 "],   %[" #I6 "],  %[" #I7 "]     \n\t"            \
  "subq.ph          %[" #O7 "],   %[" #I6 "],  %[" #I7 "]     \n\t"
38
// Replaces each of the eight named operands, in place, by the result of
// absq_s.ph on itself (absolute value of both packed halfwords; the "_s"
// form is the saturating variant in the MIPS DSP instruction set).
// Each argument is the name of an asm operand, stringified into "%[name]".
// IO - input/output
#define ABS_X8(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7)                         \
  "absq_s.ph        %[" #IO0 "],   %[" #IO0 "]                \n\t"            \
  "absq_s.ph        %[" #IO1 "],   %[" #IO1 "]                \n\t"            \
  "absq_s.ph        %[" #IO2 "],   %[" #IO2 "]                \n\t"            \
  "absq_s.ph        %[" #IO3 "],   %[" #IO3 "]                \n\t"            \
  "absq_s.ph        %[" #IO4 "],   %[" #IO4 "]                \n\t"            \
  "absq_s.ph        %[" #IO5 "],   %[" #IO5 "]                \n\t"            \
  "absq_s.ph        %[" #IO6 "],   %[" #IO6 "]                \n\t"            \
  "absq_s.ph        %[" #IO7 "],   %[" #IO7 "]                \n\t"
49
// Accumulates eight packed-halfword dot products in accumulator $ac0 and
// returns the low 32 bits of the total in O0.
//
// dpa.w.ph $ac0 temp0 ,temp1
//  $ac += temp0[31..16] * temp1[31..16] + temp0[15..0] * temp1[15..0]
// dpax.w.ph $ac0 temp0 ,temp1
//  $ac += temp0[31..16] * temp1[15..0] + temp0[15..0] * temp1[31..16]
//
// "mult $ac0, $zero, $zero" clears the accumulator first; "mflo" then reads
// its low word. Operand pairing, in emission order:
//   dpa (I2, I0),  dpax (I5, I6),  dpa (I8, I9),  dpax (I11, I4),
//   dpa (I12, I7), dpax (I13, I1), dpa (I14, I3), dpax (I15, I10)
// O0 is written only by the final instruction, so it may name one of the
// inputs. The macro modifies $ac0, so the enclosing asm statement has to
// list "hi" and "lo" as clobbers.
// O - output
// I - input (macro doesn't change it)
#define MUL_HALF(O0, I0, I1, I2, I3, I4, I5, I6, I7,                           \
                 I8, I9, I10, I11, I12, I13, I14, I15)                         \
    "mult            $ac0,      $zero,     $zero              \n\t"            \
    "dpa.w.ph        $ac0,      %[" #I2 "],  %[" #I0 "]       \n\t"            \
    "dpax.w.ph       $ac0,      %[" #I5 "],  %[" #I6 "]       \n\t"            \
    "dpa.w.ph        $ac0,      %[" #I8 "],  %[" #I9 "]       \n\t"            \
    "dpax.w.ph       $ac0,      %[" #I11 "], %[" #I4 "]       \n\t"            \
    "dpa.w.ph        $ac0,      %[" #I12 "], %[" #I7 "]       \n\t"            \
    "dpax.w.ph       $ac0,      %[" #I13 "], %[" #I1 "]       \n\t"            \
    "dpa.w.ph        $ac0,      %[" #I14 "], %[" #I3 "]       \n\t"            \
    "dpax.w.ph       $ac0,      %[" #I15 "], %[" #I10 "]      \n\t"            \
    "mflo            %[" #O0 "],  $ac0                        \n\t"
68
// Output-operand list for an asm statement that uses temp1..temp17.
// Starts with a ':' -free list fragment from OUTPUT_EARLY_CLOBBER_REGS_10()
// (defined outside this file section; presumably it covers temp1..temp10 and
// the leading ':' -- confirm in mips_macro.h) and appends temp11..temp17.
// "=&r" marks each variable as a write-only register operand that may be
// written before all inputs are consumed (GCC early-clobber).
#define OUTPUT_EARLY_CLOBBER_REGS_17()                                         \
  OUTPUT_EARLY_CLOBBER_REGS_10(),                                              \
  [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13),         \
  [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16),         \
  [temp17]"=&r"(temp17)
74
// macro for one horizontal pass in FTransform
// temp0..temp15 holds tmp[0]..tmp[15]
// A - row index; one 32-bit word is loaded at byte offset BPS * A from the
//     pointers stored in args[0] and args[1]
// TEMP0..TEMP3 - registers for corresponding tmp elements
//
// The two loaded words are widened from bytes to packed halfwords and
// subtracted (args[0] data minus args[1] data). After the rotr/addq.ph/
// subq.ph steps, with
//   a0 = low  half of the packed sum,        a1 = high half of the packed sum,
//   a3 = low  half of the packed difference, a2 = high half of the difference,
// the macro leaves:
//   TEMP0 = (a0 + a1) << 3
//   TEMP1 = (a2 * c2217 + a3 * c5352 + 1812) >> 9
//   TEMP2 = (a0 - a1) << 3
//   TEMP3 = (a3 * c2217 - a2 * c5352 + 937) >> 9
// NOTE(review): which pixel ends up in which halfword depends on the byte
// order of the target; not verified here.
// Uses temp16..temp19 as scratch and reads the operands args, c2217, c5352.
// The loads use "lw", so the addressed words are presumably expected to be
// 4-byte aligned -- confirm against callers.
#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3)                         \
  "lw              %[" #TEMP0 "],   0(%[args])                          \n\t"  \
  "lw              %[" #TEMP1 "],   4(%[args])                          \n\t"  \
  "lw              %[" #TEMP2 "],   " XSTR(BPS) "*" #A "(%[" #TEMP0 "]) \n\t"  \
  "lw              %[" #TEMP3 "],   " XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t"  \
  "preceu.ph.qbl   %[" #TEMP0 "],   %[" #TEMP2 "]                       \n\t"  \
  "preceu.ph.qbl   %[" #TEMP1 "],   %[" #TEMP3 "]                       \n\t"  \
  "preceu.ph.qbr   %[" #TEMP2 "],   %[" #TEMP2 "]                       \n\t"  \
  "preceu.ph.qbr   %[" #TEMP3 "],   %[" #TEMP3 "]                       \n\t"  \
  "subq.ph         %[" #TEMP0 "],   %[" #TEMP0 "],   %[" #TEMP1 "]      \n\t"  \
  "subq.ph         %[" #TEMP2 "],   %[" #TEMP2 "],   %[" #TEMP3 "]      \n\t"  \
  "rotr            %[" #TEMP0 "],   %[" #TEMP0 "],   16                 \n\t"  \
  "addq.ph         %[" #TEMP1 "],   %[" #TEMP2 "],   %[" #TEMP0 "]      \n\t"  \
  "subq.ph         %[" #TEMP3 "],   %[" #TEMP2 "],   %[" #TEMP0 "]      \n\t"  \
  "seh             %[" #TEMP0 "],   %[" #TEMP1 "]                       \n\t"  \
  "sra             %[temp16],     %[" #TEMP1 "],   16                   \n\t"  \
  "seh             %[temp19],     %[" #TEMP3 "]                         \n\t"  \
  "sra             %[" #TEMP3 "],   %[" #TEMP3 "],   16                 \n\t"  \
  "subu            %[" #TEMP2 "],   %[" #TEMP0 "],   %[temp16]          \n\t"  \
  "addu            %[" #TEMP0 "],   %[" #TEMP0 "],   %[temp16]          \n\t"  \
  "mul             %[temp17],     %[temp19],     %[c2217]               \n\t"  \
  "mul             %[temp18],     %[" #TEMP3 "],   %[c5352]             \n\t"  \
  "mul             %[" #TEMP1 "],   %[temp19],     %[c5352]             \n\t"  \
  "mul             %[temp16],     %[" #TEMP3 "],   %[c2217]             \n\t"  \
  "sll             %[" #TEMP2 "],   %[" #TEMP2 "],   3                  \n\t"  \
  "sll             %[" #TEMP0 "],   %[" #TEMP0 "],   3                  \n\t"  \
  "subu            %[" #TEMP3 "],   %[temp17],     %[temp18]            \n\t"  \
  "addu            %[" #TEMP1 "],   %[temp16],     %[" #TEMP1 "]        \n\t"  \
  "addiu           %[" #TEMP3 "],   %[" #TEMP3 "],   937                \n\t"  \
  "addiu           %[" #TEMP1 "],   %[" #TEMP1 "],   1812               \n\t"  \
  "sra             %[" #TEMP3 "],   %[" #TEMP3 "],   9                  \n\t"  \
  "sra             %[" #TEMP1 "],   %[" #TEMP1 "],   9                  \n\t"
111
// macro for one vertical pass in FTransform
// temp0..temp15 holds tmp[0]..tmp[15]
// A..D - offsets in bytes to store to out buffer
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
//
// With a0 = TEMP0 + TEMP12, a1 = TEMP4 + TEMP8, a2 = TEMP4 - TEMP8 and
// a3 = TEMP0 - TEMP12, four halfwords are stored relative to %[temp20]:
//   offset A: (a0 + a1 + 7) >> 4
//   offset B: ((a2 * c2217 + a3 * c5352 + 12000) >> 16) + (a3 != 0)
//   offset C: (a0 - a1 + 7) >> 4
//   offset D: (a3 * c2217 - a2 * c5352 + 51000) >> 16
// The "+ (a3 != 0)" term is realised by movn; 51000 is added in two steps
// (30000 + 21000), presumably because addiu only takes a signed 16-bit
// immediate. All four TEMP inputs are overwritten; temp16..temp19 are scratch.
#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)                 \
  "addu            %[temp16],     %[" #TEMP0 "],   %[" #TEMP12 "]   \n\t"      \
  "subu            %[temp19],     %[" #TEMP0 "],   %[" #TEMP12 "]   \n\t"      \
  "addu            %[temp17],     %[" #TEMP4 "],   %[" #TEMP8 "]    \n\t"      \
  "subu            %[temp18],     %[" #TEMP4 "],   %[" #TEMP8 "]    \n\t"      \
  "mul             %[" #TEMP8 "],   %[temp19],     %[c2217]         \n\t"      \
  "mul             %[" #TEMP12 "],  %[temp18],     %[c2217]         \n\t"      \
  "mul             %[" #TEMP4 "],   %[temp19],     %[c5352]         \n\t"      \
  "mul             %[temp18],     %[temp18],     %[c5352]           \n\t"      \
  "addiu           %[temp16],     %[temp16],     7                  \n\t"      \
  "addu            %[" #TEMP0 "],   %[temp16],     %[temp17]        \n\t"      \
  "sra             %[" #TEMP0 "],   %[" #TEMP0 "],   4              \n\t"      \
  "addu            %[" #TEMP12 "],  %[" #TEMP12 "],  %[" #TEMP4 "]  \n\t"      \
  "subu            %[" #TEMP4 "],   %[temp16],     %[temp17]        \n\t"      \
  "sra             %[" #TEMP4 "],   %[" #TEMP4 "],   4              \n\t"      \
  "addiu           %[" #TEMP8 "],   %[" #TEMP8 "],   30000          \n\t"      \
  "addiu           %[" #TEMP12 "],  %[" #TEMP12 "],  12000          \n\t"      \
  "addiu           %[" #TEMP8 "],   %[" #TEMP8 "],   21000          \n\t"      \
  "subu            %[" #TEMP8 "],   %[" #TEMP8 "],   %[temp18]      \n\t"      \
  "sra             %[" #TEMP12 "],  %[" #TEMP12 "],  16             \n\t"      \
  "sra             %[" #TEMP8 "],   %[" #TEMP8 "],   16             \n\t"      \
  "addiu           %[temp16],     %[" #TEMP12 "],  1                \n\t"      \
  "movn            %[" #TEMP12 "],  %[temp16],     %[temp19]        \n\t"      \
  "sh              %[" #TEMP0 "],   " #A "(%[temp20])               \n\t"      \
  "sh              %[" #TEMP4 "],   " #C "(%[temp20])               \n\t"      \
  "sh              %[" #TEMP8 "],   " #D "(%[temp20])               \n\t"      \
  "sh              %[" #TEMP12 "],  " #B "(%[temp20])               \n\t"
143
144static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
145                                 int16_t* out) {
146  const int c2217 = 2217;
147  const int c5352 = 5352;
148  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
149  int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
150  int temp17, temp18, temp19, temp20;
151  const int* const args[3] =
152      { (const int*)src, (const int*)ref, (const int*)out };
153
154  __asm__ volatile (
155    HORIZONTAL_PASS(0, temp0,  temp1,  temp2,  temp3)
156    HORIZONTAL_PASS(1, temp4,  temp5,  temp6,  temp7)
157    HORIZONTAL_PASS(2, temp8,  temp9,  temp10, temp11)
158    HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
159    "lw            %[temp20],     8(%[args])                  \n\t"
160    VERTICAL_PASS(0,  8, 16, 24, temp0, temp4, temp8,  temp12)
161    VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9,  temp13)
162    VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
163    VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
164    OUTPUT_EARLY_CLOBBER_REGS_18(),
165      [temp0]"=&r"(temp0), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
166    : [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
167    : "memory", "hi", "lo"
168  );
169}
170
171#undef VERTICAL_PASS
172#undef HORIZONTAL_PASS
173
174static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
175                                      uint8_t* dst) {
176  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
177  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
178
179  __asm__ volatile (
180    "ulw              %[temp1],   0(%[in])                 \n\t"
181    "ulw              %[temp2],   16(%[in])                \n\t"
182    LOAD_IN_X2(temp5, temp6, 24, 26)
183    ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
184    LOAD_IN_X2(temp1, temp2, 8, 10)
185    MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
186                  temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
187                  temp13, temp11, temp14, temp12)
188    INSERT_HALF_X2(temp8, temp7, temp10, temp9)
189    "ulw              %[temp17],  4(%[in])                 \n\t"
190    "ulw              %[temp18],  20(%[in])                \n\t"
191    ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
192    ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
193    ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
194    LOAD_IN_X2(temp17, temp18, 12, 14)
195    LOAD_IN_X2(temp9, temp10, 28, 30)
196    MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
197                  temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
198                  temp15, temp4, temp16, temp17)
199    INSERT_HALF_X2(temp11, temp12, temp13, temp14)
200    ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
201    ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
202
203    // horizontal
204    SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
205    INSERT_HALF_X2(temp1, temp6, temp5, temp2)
206    SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
207    "repl.ph          %[temp2],   0x4                      \n\t"
208    INSERT_HALF_X2(temp3, temp8, temp17, temp4)
209    "addq.ph          %[temp1],   %[temp1],  %[temp2]      \n\t"
210    "addq.ph          %[temp6],   %[temp6],  %[temp2]      \n\t"
211    ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
212    ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
213    MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
214                  temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
215                  temp6, temp17, temp8, temp18)
216    MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
217                  temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
218                  temp18, temp12, temp17, temp16)
219    INSERT_HALF_X2(temp1, temp3, temp9, temp13)
220    INSERT_HALF_X2(temp6, temp8, temp11, temp15)
221    SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
222                   temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
223                   temp6)
224    PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
225                          temp16, temp11, temp10, temp15, temp14)
226    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, ref,
227                        0, 0, 0, 0,
228                        0, 1, 2, 3,
229                        BPS)
230    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
231                            temp11, temp10, temp11, temp14, temp15)
232    STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
233                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
234                     dst, 0, 1, 2, 3, BPS)
235
236    OUTPUT_EARLY_CLOBBER_REGS_18()
237    : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2), [ref]"r"(ref)
238    : "memory", "hi", "lo"
239  );
240}
241
// Inverse-transforms one 4x4 block, or two when 'do_two' is non-zero.
// The optional second block sits 4 pixels to the right of the first
// (ref + 4, dst + 4) and uses the next 16 coefficients (in + 16).
static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in,
                                 uint8_t* dst, int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
}
249
250static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
251                              const uint16_t* const w) {
252  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
253  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
254
255  __asm__ volatile (
256    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, a,
257                        0, 0, 0, 0,
258                        0, 1, 2, 3,
259                        BPS)
260    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9,temp10, temp11,
261                            temp12, temp1, temp2, temp3, temp4)
262    ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
263                      temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
264    PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
265                          temp7, temp2, temp4, temp6, temp8)
266    ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
267                      temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
268    ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
269                      temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
270    ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
271                      temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
272    ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
273    LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
274                        0, 4, 8, 12,
275                        0, 0, 0, 0,
276                        0)
277    LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
278                        0, 4, 8, 12,
279                        1, 1, 1, 1,
280                        16)
281    MUL_HALF(temp17, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
282             temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
283    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, b,
284                        0, 0, 0, 0,
285                        0, 1, 2, 3,
286                        BPS)
287    CONVERT_2_BYTES_TO_HALF(temp5,temp6, temp7, temp8, temp9,temp10, temp11,
288                            temp12, temp1, temp2, temp3, temp4)
289    ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
290                      temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
291    PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
292                          temp7, temp2, temp4, temp6, temp8)
293    ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
294                      temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
295    ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
296                      temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
297    ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
298                      temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
299    ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
300    LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
301                        0, 4, 8, 12,
302                        0, 0, 0, 0,
303                        0)
304    LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
305                        0, 4, 8, 12,
306                        1, 1, 1, 1,
307                        16)
308    MUL_HALF(temp3, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
309             temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
310    OUTPUT_EARLY_CLOBBER_REGS_17()
311    : [a]"r"(a), [b]"r"(b), [w]"r"(w)
312    : "memory", "hi", "lo"
313  );
314  return abs(temp3 - temp17) >> 5;
315}
316
317static int Disto16x16_MIPSdspR2(const uint8_t* const a,
318                                const uint8_t* const b,
319                                const uint16_t* const w) {
320  int D = 0;
321  int x, y;
322  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
323    for (x = 0; x < 16; x += 4) {
324      D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w);
325    }
326  }
327  return D;
328}
329
//------------------------------------------------------------------------------
// Intra predictions
332
// Stores the 32-bit operand %[value] across row J of the block at %[dst]
// (row J starts at byte offset J * BPS): 8 bytes always, and 8 more when
// SIZE is 16. The 8-versus-16 choice is made by the assembler's ".if"
// directive, so SIZE must stringify to something the assembler can evaluate.
// "usw" is used for the stores, so %[dst] need not be word aligned.
#define FILL_PART(J, SIZE)                                            \
    "usw        %[value],  0+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
    "usw        %[value],  4+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
  ".if " #SIZE " == 16                                     \n\t"      \
    "usw        %[value],  8+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
    "usw        %[value], 12+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
  ".endif                                                  \n\t"
340
// Fills a SIZE x SIZE block (SIZE is 8 or 16) at DST with the byte VALUE.
// VALUE is copied into a local int, its low byte is replicated into all four
// byte lanes by replv.qb, and FILL_PART then writes rows 0..7 (and rows
// 8..15 when SIZE is 16) with row stride BPS.
// DST and VALUE are each evaluated once. The statement is wrapped in
// do { } while (0) so it can be used like a function call.
#define FILL_8_OR_16(DST, VALUE, SIZE) do {                         \
  int value = (VALUE);                                              \
  __asm__ volatile (                                                \
    "replv.qb   %[value],  %[value]                      \n\t"      \
    FILL_PART( 0, SIZE)                                             \
    FILL_PART( 1, SIZE)                                             \
    FILL_PART( 2, SIZE)                                             \
    FILL_PART( 3, SIZE)                                             \
    FILL_PART( 4, SIZE)                                             \
    FILL_PART( 5, SIZE)                                             \
    FILL_PART( 6, SIZE)                                             \
    FILL_PART( 7, SIZE)                                             \
  ".if " #SIZE " == 16                                   \n\t"      \
    FILL_PART( 8, 16)                                               \
    FILL_PART( 9, 16)                                               \
    FILL_PART(10, 16)                                               \
    FILL_PART(11, 16)                                               \
    FILL_PART(12, 16)                                               \
    FILL_PART(13, 16)                                               \
    FILL_PART(14, 16)                                               \
    FILL_PART(15, 16)                                               \
  ".endif                                                \n\t"      \
    : [value]"+&r"(value)                                           \
    : [dst]"r"((DST))                                               \
    : "memory"                                                      \
  );                                                                \
} while (0)
368
369#define VERTICAL_PRED(DST, TOP, SIZE)                                          \
370static WEBP_INLINE void VerticalPred##SIZE(uint8_t* (DST),                     \
371                                           const uint8_t* (TOP)) {             \
372  int j;                                                                       \
373  if ((TOP)) {                                                                 \
374    for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE));       \
375  } else {                                                                     \
376    FILL_8_OR_16((DST), 127, (SIZE));                                          \
377  }                                                                            \
378}
379
380VERTICAL_PRED(dst, top, 8)
381VERTICAL_PRED(dst, top, 16)
382
383#undef VERTICAL_PRED
384
385#define HORIZONTAL_PRED(DST, LEFT, SIZE)                                       \
386static WEBP_INLINE void HorizontalPred##SIZE(uint8_t* (DST),                   \
387                                             const uint8_t* (LEFT)) {          \
388  if (LEFT) {                                                                  \
389    int j;                                                                     \
390    for (j = 0; j < (SIZE); ++j) {                                             \
391      memset((DST) + j * BPS, (LEFT)[j], (SIZE));                              \
392    }                                                                          \
393  } else {                                                                     \
394    FILL_8_OR_16((DST), 129, (SIZE));                                          \
395  }                                                                            \
396}
397
398HORIZONTAL_PRED(dst, left, 8)
399HORIZONTAL_PRED(dst, left, 16)
400
401#undef HORIZONTAL_PRED
402
// Processes eight pixels held as bytes in %[temp0] and %[temp1]:
//   1. widen the bytes to packed halfwords (upper pairs go to temp2/temp3,
//      lower pairs stay in temp0/temp1),
//   2. add the packed halfword operand %[leftY_1] to every pair,
//   3. shift left by 7 with saturation (shll_s.ph) and narrow back to bytes
//      with unsigned saturation (precrqu_s.qb.ph).
// The net effect of step 3 is presumably a clamp of each sum to 0..255;
// see the MIPS DSP ASE manual for the exact saturation rules.
// Results are left in %[temp0] and %[temp1]; %[temp2] and %[temp3] are
// scratch. The operand names are fixed, not macro parameters.
#define CLIPPING()                                                             \
  "preceu.ph.qbl   %[temp2],   %[temp0]                  \n\t"                 \
  "preceu.ph.qbr   %[temp0],   %[temp0]                  \n\t"                 \
  "preceu.ph.qbl   %[temp3],   %[temp1]                  \n\t"                 \
  "preceu.ph.qbr   %[temp1],   %[temp1]                  \n\t"                 \
  "addu.ph         %[temp2],   %[temp2],   %[leftY_1]    \n\t"                 \
  "addu.ph         %[temp0],   %[temp0],   %[leftY_1]    \n\t"                 \
  "addu.ph         %[temp3],   %[temp3],   %[leftY_1]    \n\t"                 \
  "addu.ph         %[temp1],   %[temp1],   %[leftY_1]    \n\t"                 \
  "shll_s.ph       %[temp2],   %[temp2],   7             \n\t"                 \
  "shll_s.ph       %[temp0],   %[temp0],   7             \n\t"                 \
  "shll_s.ph       %[temp3],   %[temp3],   7             \n\t"                 \
  "shll_s.ph       %[temp1],   %[temp1],   7             \n\t"                 \
  "precrqu_s.qb.ph %[temp0],   %[temp2],   %[temp0]      \n\t"                 \
  "precrqu_s.qb.ph %[temp1],   %[temp3],   %[temp1]      \n\t"
418
// Produces one row of the prediction computed by CLIPPING():
//   dst[x] = saturate(top[x] + left[y] - left[-1])   for x = 0..SIZE-1.
// left[y] is packed into both halfwords of leftY_1 and the packed left[-1]
// (left_1) is subtracted from it before the per-pixel add in CLIPPING().
// Relies on two names from the enclosing scope that are NOT parameters:
//   y      - current row index
//   left_1 - left[-1] packed into both halfwords
// Eight pixels are handled per CLIPPING() call; the second group of eight
// is assembled only when SIZE is 16 (assembler ".if").
#define CLIP_8B_TO_DST(DST, LEFT, TOP, SIZE) do {                              \
  int leftY_1 = ((int)(LEFT)[y] << 16) + (LEFT)[y];                            \
  int temp0, temp1, temp2, temp3;                                              \
  __asm__ volatile (                                                           \
    "replv.ph        %[leftY_1], %[leftY_1]              \n\t"                 \
    "ulw             %[temp0],   0(%[top])               \n\t"                 \
    "ulw             %[temp1],   4(%[top])               \n\t"                 \
    "subu.ph         %[leftY_1], %[leftY_1], %[left_1]   \n\t"                 \
    CLIPPING()                                                                 \
    "usw             %[temp0],   0(%[dst])               \n\t"                 \
    "usw             %[temp1],   4(%[dst])               \n\t"                 \
  ".if " #SIZE " == 16                                   \n\t"                 \
    "ulw             %[temp0],   8(%[top])               \n\t"                 \
    "ulw             %[temp1],   12(%[top])              \n\t"                 \
    CLIPPING()                                                                 \
    "usw             %[temp0],   8(%[dst])               \n\t"                 \
    "usw             %[temp1],   12(%[dst])              \n\t"                 \
  ".endif                                                \n\t"                 \
    : [leftY_1]"+&r"(leftY_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),       \
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)                                 \
    : [left_1]"r"(left_1), [top]"r"((TOP)), [dst]"r"((DST))                    \
    : "memory"                                                                 \
  );                                                                           \
} while (0)
443
// Runs CLIP_8B_TO_DST for each of the SIZE rows.
// Declares the 'y' and 'left_1' variables that CLIP_8B_TO_DST picks up from
// its surrounding scope; left_1 holds LEFT[-1] in both halfwords.
// Side effect: DST is advanced by BPS per row, so after the macro it points
// SIZE rows below where it started. DST must therefore be a modifiable
// lvalue.
#define CLIP_TO_DST(DST, LEFT, TOP, SIZE) do {                                 \
  int y;                                                                       \
  const int left_1 = ((int)(LEFT)[-1] << 16) + (LEFT)[-1];                     \
  for (y = 0; y < (SIZE); ++y) {                                               \
    CLIP_8B_TO_DST((DST), (LEFT), (TOP), (SIZE));                              \
    (DST) += BPS;                                                              \
  }                                                                            \
} while (0)
452
453#define TRUE_MOTION(DST, LEFT, TOP, SIZE)                                      \
454static WEBP_INLINE void TrueMotion##SIZE(uint8_t* (DST), const uint8_t* (LEFT),\
455                                         const uint8_t* (TOP)) {               \
456  if ((LEFT) != NULL) {                                                        \
457    if ((TOP) != NULL) {                                                       \
458      CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE));                               \
459    } else {                                                                   \
460      HorizontalPred##SIZE((DST), (LEFT));                                     \
461    }                                                                          \
462  } else {                                                                     \
463    /* true motion without left samples (hence: with default 129 value)    */  \
464    /* is equivalent to VE prediction where you just copy the top samples. */  \
465    /* Note that if top samples are not available, the default value is    */  \
466    /* then 129, and not 127 as in the VerticalPred case.                  */  \
467    if ((TOP) != NULL) {                                                       \
468      VerticalPred##SIZE((DST), (TOP));                                        \
469    } else {                                                                   \
470      FILL_8_OR_16((DST), 129, (SIZE));                                        \
471    }                                                                          \
472  }                                                                            \
473}
474
475TRUE_MOTION(dst, left, top, 8)
476TRUE_MOTION(dst, left, top, 16)
477
478#undef TRUE_MOTION
479#undef CLIP_TO_DST
480#undef CLIP_8B_TO_DST
481#undef CLIPPING
482
483static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left,
484                                 const uint8_t* top) {
485  int DC, DC1;
486  int temp0, temp1, temp2, temp3;
487
488  __asm__ volatile(
489    "beqz        %[top],   2f                  \n\t"
490    LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, top,
491                        0, 4, 8, 12,
492                        0, 0, 0, 0,
493                        0)
494    "raddu.w.qb  %[temp0], %[temp0]            \n\t"
495    "raddu.w.qb  %[temp1], %[temp1]            \n\t"
496    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
497    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
498    "addu        %[temp0], %[temp0], %[temp1]  \n\t"
499    "addu        %[temp2], %[temp2], %[temp3]  \n\t"
500    "addu        %[DC],    %[temp0], %[temp2]  \n\t"
501    "move        %[DC1],   %[DC]               \n\t"
502    "beqz        %[left],  1f                  \n\t"
503    LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
504                        0, 4, 8, 12,
505                        0, 0, 0, 0,
506                        0)
507    "raddu.w.qb  %[temp0], %[temp0]            \n\t"
508    "raddu.w.qb  %[temp1], %[temp1]            \n\t"
509    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
510    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
511    "addu        %[temp0], %[temp0], %[temp1]  \n\t"
512    "addu        %[temp2], %[temp2], %[temp3]  \n\t"
513    "addu        %[DC1],   %[temp0], %[temp2]  \n\t"
514  "1:                                          \n\t"
515    "addu        %[DC],   %[DC],     %[DC1]    \n\t"
516    "j           3f                            \n\t"
517  "2:                                          \n\t"
518    "beqz        %[left],  4f                  \n\t"
519    LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
520                        0, 4, 8, 12,
521                        0, 0, 0, 0,
522                        0)
523    "raddu.w.qb  %[temp0], %[temp0]            \n\t"
524    "raddu.w.qb  %[temp1], %[temp1]            \n\t"
525    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
526    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
527    "addu        %[temp0], %[temp0], %[temp1]  \n\t"
528    "addu        %[temp2], %[temp2], %[temp3]  \n\t"
529    "addu        %[DC],    %[temp0], %[temp2]  \n\t"
530    "addu        %[DC],    %[DC],    %[DC]     \n\t"
531  "3:                                          \n\t"
532    "shra_r.w    %[DC],    %[DC],    5         \n\t"
533    "j           5f                            \n\t"
534  "4:                                          \n\t"
535    "li          %[DC],    0x80                \n\t"
536  "5:                                          \n\t"
537    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
538      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
539    : [left]"r"(left), [top]"r"(top)
540    : "memory"
541  );
542
543  FILL_8_OR_16(dst, DC, 16);
544}
545
// Computes the DC predictor of an 8x8 block and hands it to
// FILL_8_OR_16(dst, DC, 8) (macro defined elsewhere in this file).
//   top  : 8 samples above the block, or NULL when unavailable.
//   left : 8 samples left of the block, or NULL when unavailable.
// With both edges: DC = (sum(top[0..7]) + sum(left[0..7]) + 8) >> 4.
// With a single edge its sum is counted twice in that formula.
// With no edge at all DC is the constant 0x80.
static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left,
                                const uint8_t* top) {
  int DC, DC1;
  int temp0, temp1, temp2, temp3;

  __asm__ volatile(
    // top == NULL: continue at label 2 (left-only or no-edge cases).
    "beqz        %[top],   2f                  \n\t"
    // DC = sum of the 8 top bytes (raddu.w.qb adds the 4 bytes of a word).
    "ulw         %[temp0], 0(%[top])           \n\t"
    "ulw         %[temp1], 4(%[top])           \n\t"
    "raddu.w.qb  %[temp0], %[temp0]            \n\t"
    "raddu.w.qb  %[temp1], %[temp1]            \n\t"
    "addu        %[DC],    %[temp0], %[temp1]  \n\t"
    // Default DC1 to the top sum so a missing left edge doubles the top sum.
    "move        %[DC1],   %[DC]               \n\t"
    "beqz        %[left],  1f                  \n\t"
    // DC1 = sum of the 8 left bytes.
    "ulw         %[temp2], 0(%[left])          \n\t"
    "ulw         %[temp3], 4(%[left])          \n\t"
    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
    "addu        %[DC1],   %[temp2], %[temp3]  \n\t"
  "1:                                          \n\t"
    "addu        %[DC],    %[DC],    %[DC1]    \n\t"
    "j           3f                            \n\t"
  "2:                                          \n\t"
    // No top edge. Without a left edge either, use the constant at label 4.
    "beqz        %[left],  4f                  \n\t"
    // Left edge only: DC = 2 * sum of the 8 left bytes.
    "ulw         %[temp2], 0(%[left])          \n\t"
    "ulw         %[temp3], 4(%[left])          \n\t"
    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
    "addu        %[DC],    %[temp2], %[temp3]  \n\t"
    "addu        %[DC],    %[DC],    %[DC]     \n\t"
  "3:                                          \n\t"
    // Rounding shift: DC = (DC + 8) >> 4.
    "shra_r.w    %[DC], %[DC], 4               \n\t"
    "j           5f                            \n\t"
  "4:                                          \n\t"
    "li          %[DC], 0x80                   \n\t"
  "5:                                          \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
    : [left]"r"(left), [top]"r"(top)
    : "memory"
  );

  FILL_8_OR_16(dst, DC, 8);
}
590
// 4x4 DC prediction. Sums the four bytes at top[0..3] and the four bytes at
// top[-5..-2], computes (sum + 4) >> 3, and writes that value into every
// byte of four 4-byte rows of 'dst' (rows are BPS bytes apart).
static void DC4(uint8_t* dst, const uint8_t* top) {
  int temp0, temp1;
  __asm__ volatile(
    "ulw          %[temp0],   0(%[top])               \n\t"
    "ulw          %[temp1],   -5(%[top])              \n\t"
    // raddu.w.qb: add the four unsigned bytes of each word.
    "raddu.w.qb   %[temp0],   %[temp0]                \n\t"
    "raddu.w.qb   %[temp1],   %[temp1]                \n\t"
    "addu         %[temp0],   %[temp0],    %[temp1]   \n\t"
    // Round and divide by 8.
    "addiu        %[temp0],   %[temp0],    4          \n\t"
    "srl          %[temp0],   %[temp0],    3          \n\t"
    // Replicate the low byte into all four bytes, then store it to each row.
    "replv.qb     %[temp0],   %[temp0]                \n\t"
    "usw          %[temp0],   0*" XSTR(BPS) "(%[dst]) \n\t"
    "usw          %[temp0],   1*" XSTR(BPS) "(%[dst]) \n\t"
    "usw          %[temp0],   2*" XSTR(BPS) "(%[dst]) \n\t"
    "usw          %[temp0],   3*" XSTR(BPS) "(%[dst]) \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
611
// 4x4 "TM" predictor (presumably TrueMotion, going by the name).
// Reads top[0..3], top[-1] and the four bytes at top[-5..-2]. Every output
// pixel is built as top[x] + (one byte of top[-5..-2]) - top[-1], and one
// 4-byte word is stored per row at dst + {0, 1, 2, 3} * BPS.
// NOTE(review): the shll_s.ph-by-7 / precrqu_s.qb.ph pair appears to be what
// clamps each 16-bit result to an unsigned byte -- confirm against the DSP
// ASE manual.
static void TM4(uint8_t* dst, const uint8_t* top) {
  int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5;
  const int c35 = 0xff00ff;
  __asm__ volatile (
    "lbu              %[temp1],  0(%[top])                     \n\t"
    "lbu              %[a10],    1(%[top])                     \n\t"
    "lbu              %[temp2],  2(%[top])                     \n\t"
    "lbu              %[a32],    3(%[top])                     \n\t"
    "ulw              %[temp0],  -5(%[top])                    \n\t"
    "lbu              %[temp4],  -1(%[top])                    \n\t"
    // a10 = {top[1], top[0]} and a32 = {top[3], top[2]} as 16-bit lanes.
    "append           %[a10],    %[temp1],   16                \n\t"
    "append           %[a32],    %[temp2],   16                \n\t"
    // temp4 = top[-1] in both lanes.
    "replv.ph         %[temp4],  %[temp4]                      \n\t"
    // Split the word read at top[-5] into the upper byte of each halfword
    // (temp1) and the lower byte of each halfword (temp0, masked with c35),
    // then subtract top[-1] from every lane.
    "shrl.ph          %[temp1],  %[temp0],   8                 \n\t"
    "and              %[temp0],  %[temp0],   %[c35]            \n\t"
    "subu.ph          %[temp1],  %[temp1],   %[temp4]          \n\t"
    "subu.ph          %[temp0],  %[temp0],   %[temp4]          \n\t"
    // Broadcast each of the four differences into both lanes of a register:
    // temp2/temp3 take the upper lanes, temp4/temp5 the lower lanes.
    "srl              %[temp2],  %[temp1],   16                \n\t"
    "srl              %[temp3],  %[temp0],   16                \n\t"
    "replv.ph         %[temp2],  %[temp2]                      \n\t"
    "replv.ph         %[temp3],  %[temp3]                      \n\t"
    "replv.ph         %[temp4],  %[temp1]                      \n\t"
    "replv.ph         %[temp5],  %[temp0]                      \n\t"
    // Rows 0 and 1: add the top pairs, then pack the lanes back to bytes.
    "addu.ph          %[temp0],  %[temp3],   %[a10]            \n\t"
    "addu.ph          %[temp1],  %[temp3],   %[a32]            \n\t"
    "addu.ph          %[temp3],  %[temp2],   %[a10]            \n\t"
    "addu.ph          %[temp2],  %[temp2],   %[a32]            \n\t"
    "shll_s.ph        %[temp0],  %[temp0],   7                 \n\t"
    "shll_s.ph        %[temp1],  %[temp1],   7                 \n\t"
    "shll_s.ph        %[temp3],  %[temp3],   7                 \n\t"
    "shll_s.ph        %[temp2],  %[temp2],   7                 \n\t"
    "precrqu_s.qb.ph  %[temp0],  %[temp1],   %[temp0]          \n\t"
    "precrqu_s.qb.ph  %[temp1],  %[temp2],   %[temp3]          \n\t"
    // Rows 2 and 3: same computation with the lower-lane differences.
    "addu.ph          %[temp2],  %[temp5],   %[a10]            \n\t"
    "addu.ph          %[temp3],  %[temp5],   %[a32]            \n\t"
    "addu.ph          %[temp5],  %[temp4],   %[a10]            \n\t"
    "addu.ph          %[temp4],  %[temp4],   %[a32]            \n\t"
    "shll_s.ph        %[temp2],  %[temp2],   7                 \n\t"
    "shll_s.ph        %[temp3],  %[temp3],   7                 \n\t"
    "shll_s.ph        %[temp4],  %[temp4],   7                 \n\t"
    "shll_s.ph        %[temp5],  %[temp5],   7                 \n\t"
    "precrqu_s.qb.ph  %[temp2],  %[temp3],   %[temp2]          \n\t"
    "precrqu_s.qb.ph  %[temp3],  %[temp4],   %[temp5]          \n\t"
    "usw              %[temp1],  0*" XSTR(BPS) "(%[dst])       \n\t"
    "usw              %[temp0],  1*" XSTR(BPS) "(%[dst])       \n\t"
    "usw              %[temp3],  2*" XSTR(BPS) "(%[dst])       \n\t"
    "usw              %[temp2],  3*" XSTR(BPS) "(%[dst])       \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [a10]"=&r"(a10), [a32]"=&r"(a32)
    : [c35]"r"(c35), [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
666
// 4x4 "VE" predictor (presumably vertical, going by the name), with
// smoothing: runs the 3-tap filter (a + 2 * b + c + 2) >> 2 over the six
// bytes top[-1..4], which yields four bytes, and stores that same 4-byte word
// to each of the four rows of 'dst' (rows are BPS bytes apart).
static void VE4(uint8_t* dst, const uint8_t* top) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  __asm__ volatile(
    // temp0 = bytes top[-1..2], temp1 = bytes top[3..4].
    "ulw             %[temp0],   -1(%[top])              \n\t"
    "ulh             %[temp1],   3(%[top])               \n\t"
    // Zero-extend the bytes to 16-bit lanes (two bytes per register).
    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
    "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
    "preceu.ph.qbr   %[temp4],   %[temp1]                \n\t"
    // Build the "middle tap" pairs that straddle two registers, then double.
    "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
    "packrl.ph       %[temp6],   %[temp4],    %[temp3]   \n\t"
    "shll.ph         %[temp5],   %[temp5],    1          \n\t"
    "shll.ph         %[temp6],   %[temp6],    1          \n\t"
    // a + 2 * b + c per lane, then rounding shift right by 2.
    "addq.ph         %[temp2],   %[temp5],    %[temp2]   \n\t"
    "addq.ph         %[temp6],   %[temp6],    %[temp4]   \n\t"
    "addq.ph         %[temp2],   %[temp2],    %[temp3]   \n\t"
    "addq.ph         %[temp6],   %[temp6],    %[temp3]   \n\t"
    "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
    "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
    // Pack the four 16-bit results into four bytes and replicate on all rows.
    "precr.qb.ph     %[temp4],   %[temp6],    %[temp2]   \n\t"
    "usw             %[temp4],   0*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp4],   1*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp4],   2*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp4],   3*" XSTR(BPS) "(%[dst]) \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
697
// 4x4 "HE" predictor (presumably horizontal, going by the name), with
// smoothing: filters the bytes top[-5..-1] with (a + 2 * b + c + 2) >> 2,
// using top[-5] twice for the last output. That gives four values, and each
// row of 'dst' is filled with one of them replicated into its four bytes.
static void HE4(uint8_t* dst, const uint8_t* top) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  __asm__ volatile(
    // temp0 = bytes top[-4..-1], temp1 = top[-5].
    "ulw             %[temp0],   -4(%[top])              \n\t"
    "lbu             %[temp1],   -5(%[top])              \n\t"
    // Zero-extend to 16-bit lanes; temp4 holds top[-5] in both lanes.
    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
    "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
    "replv.ph        %[temp4],   %[temp1]                \n\t"
    // Middle taps (pairs straddling two registers), doubled.
    "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
    "packrl.ph       %[temp6],   %[temp2],    %[temp4]   \n\t"
    "shll.ph         %[temp5],   %[temp5],    1          \n\t"
    "shll.ph         %[temp6],   %[temp6],    1          \n\t"
    // a + 2 * b + c per lane, then rounding shift right by 2.
    "addq.ph         %[temp3],   %[temp3],    %[temp5]   \n\t"
    "addq.ph         %[temp3],   %[temp3],    %[temp2]   \n\t"
    "addq.ph         %[temp2],   %[temp2],    %[temp6]   \n\t"
    "addq.ph         %[temp2],   %[temp2],    %[temp4]   \n\t"
    "shra_r.ph       %[temp3],   %[temp3],    2          \n\t"
    "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
    // Replicate each of the four results into a full word: lower lanes first
    // (temp0, temp1), then the upper lanes after shifting them down.
    "replv.qb        %[temp0],   %[temp3]                \n\t"
    "replv.qb        %[temp1],   %[temp2]                \n\t"
    "srl             %[temp3],   %[temp3],    16         \n\t"
    "srl             %[temp2],   %[temp2],    16         \n\t"
    "replv.qb        %[temp3],   %[temp3]                \n\t"
    "replv.qb        %[temp2],   %[temp2]                \n\t"
    "usw             %[temp3],   0*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp0],   1*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp2],   2*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp1],   3*" XSTR(BPS) "(%[dst]) \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
733
// 4x4 "RD" predictor (presumably the down-right diagonal mode; confirm
// against the C reference implementation).
// Reads the bytes top[-5..3]. All outputs are 3-tap filtered values
// (a + 2 * b + c + 2) >> 2. Six of them are computed two 16-bit lanes at a
// time, and a seventh is computed in scalar code from top[1], top[2] and
// top[3]. Rows 3 and 1 are stored first. Rows 2 and 0 are those same words
// shifted by one byte with one more filtered value inserted ('prepend').
static void RD4(uint8_t* dst, const uint8_t* top) {
  int temp0, temp1, temp2, temp3, temp4, temp5;
  int temp6, temp7, temp8, temp9, temp10, temp11;
  __asm__ volatile(
    // temp0 = bytes top[-5..-2], temp1 = bytes top[-1..2].
    "ulw             %[temp0],    -5(%[top])               \n\t"
    "ulw             %[temp1],    -1(%[top])               \n\t"
    // Zero-extend all eight bytes to 16-bit lanes.
    "preceu.ph.qbl   %[temp2],    %[temp0]                 \n\t"
    "preceu.ph.qbr   %[temp3],    %[temp0]                 \n\t"
    "preceu.ph.qbr   %[temp4],    %[temp1]                 \n\t"
    "preceu.ph.qbl   %[temp5],    %[temp1]                 \n\t"
    // Middle-tap pairs straddling adjacent registers.
    "packrl.ph       %[temp6],    %[temp2],    %[temp3]    \n\t"
    "packrl.ph       %[temp7],    %[temp4],    %[temp2]    \n\t"
    "packrl.ph       %[temp8],    %[temp5],    %[temp4]    \n\t"
    // temp9, temp10, temp11 = (a + 2 * b + c + 2) >> 2 per lane.
    "shll.ph         %[temp6],    %[temp6],    1           \n\t"
    "addq.ph         %[temp9],    %[temp2],    %[temp6]    \n\t"
    "shll.ph         %[temp7],    %[temp7],    1           \n\t"
    "addq.ph         %[temp9],    %[temp9],    %[temp3]    \n\t"
    "shll.ph         %[temp8],    %[temp8],    1           \n\t"
    "shra_r.ph       %[temp9],    %[temp9],    2           \n\t"
    "addq.ph         %[temp10],   %[temp4],    %[temp7]    \n\t"
    "addq.ph         %[temp11],   %[temp5],    %[temp8]    \n\t"
    "addq.ph         %[temp10],   %[temp10],   %[temp2]    \n\t"
    "addq.ph         %[temp11],   %[temp11],   %[temp4]    \n\t"
    "shra_r.ph       %[temp10],   %[temp10],   2           \n\t"
    "shra_r.ph       %[temp11],   %[temp11],   2           \n\t"
    // Scalar tap: temp0 = (top[3] + 2 * top[2] + top[1] + 2) >> 2.
    "lbu             %[temp0],    3(%[top])                \n\t"
    "lbu             %[temp1],    2(%[top])                \n\t"
    "lbu             %[temp2],    1(%[top])                \n\t"
    "sll             %[temp1],    %[temp1],    1           \n\t"
    "addu            %[temp0],    %[temp0],    %[temp1]    \n\t"
    "addu            %[temp0],    %[temp0],    %[temp2]    \n\t"
    "precr.qb.ph     %[temp9],    %[temp10],   %[temp9]    \n\t"
    "shra_r.w        %[temp0],    %[temp0],    2           \n\t"
    "precr.qb.ph     %[temp10],   %[temp11],   %[temp10]   \n\t"
    "usw             %[temp9],    3*" XSTR(BPS) "(%[dst])  \n\t"
    "usw             %[temp10],   1*" XSTR(BPS) "(%[dst])  \n\t"
    // Shift each stored word down by one byte and insert a new top byte.
    "prepend         %[temp9],    %[temp11],   8           \n\t"
    "prepend         %[temp10],   %[temp0],    8           \n\t"
    "usw             %[temp9],    2*" XSTR(BPS) "(%[dst])  \n\t"
    "usw             %[temp10],   0*" XSTR(BPS) "(%[dst])  \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
782
// 4x4 "VR" predictor (presumably vertical-right; confirm against the C
// reference implementation).
// Reads the words at top[-4..-1] and top[0..3]. Two kinds of filtered values
// are computed, two 16-bit lanes at a time:
//  - 2-tap rounded averages (addqh_r.ph), kept in temp4 / temp5;
//  - 3-tap values (a + 2 * b + c + 2) >> 2 (adds + shll.ph, then
//    shra_r.ph by 2), kept in temp3 / temp1 / temp6.
// Rows 0 and 1 are packed and stored first. Rows 2 and 3 reuse the same
// registers, shifted by one byte with one byte of temp6 inserted ('append').
static void VR4(uint8_t* dst, const uint8_t* top) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
    "ulw              %[temp0],   -4(%[top])              \n\t"
    "ulw              %[temp1],   0(%[top])               \n\t"
    // Zero-extend bytes to 16-bit lanes. The first word is split into its
    // left/right byte pairs, the second into alternating bytes (qbla/qbra).
    "preceu.ph.qbl    %[temp2],   %[temp0]                \n\t"
    "preceu.ph.qbr    %[temp0],   %[temp0]                \n\t"
    "preceu.ph.qbla   %[temp3],   %[temp1]                \n\t"
    "preceu.ph.qbra   %[temp1],   %[temp1]                \n\t"
    "packrl.ph        %[temp7],   %[temp3],    %[temp2]   \n\t"
    "addqh_r.ph       %[temp4],   %[temp1],    %[temp3]   \n\t"
    "move             %[temp6],   %[temp1]                \n\t"
    "append           %[temp1],   %[temp2],    16         \n\t"
    "shll.ph          %[temp9],   %[temp6],    1          \n\t"
    "addqh_r.ph       %[temp5],   %[temp7],    %[temp6]   \n\t"
    "shll.ph          %[temp8],   %[temp7],    1          \n\t"
    "addu.ph          %[temp3],   %[temp7],    %[temp3]   \n\t"
    "addu.ph          %[temp1],   %[temp1],    %[temp6]   \n\t"
    "packrl.ph        %[temp7],   %[temp2],    %[temp0]   \n\t"
    "addu.ph          %[temp6],   %[temp0],    %[temp2]   \n\t"
    "addu.ph          %[temp3],   %[temp3],    %[temp9]   \n\t"
    "addu.ph          %[temp1],   %[temp1],    %[temp8]   \n\t"
    "shll.ph          %[temp7],   %[temp7],    1          \n\t"
    "shra_r.ph        %[temp3],   %[temp3],    2          \n\t"
    "shra_r.ph        %[temp1],   %[temp1],    2          \n\t"
    "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
    "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
    // Interleave lanes and pack them down to bytes for rows 0 and 1.
    "precrq.ph.w      %[temp8],   %[temp4],    %[temp5]   \n\t"
    "append           %[temp4],   %[temp5],    16         \n\t"
    "precrq.ph.w      %[temp2],   %[temp3],    %[temp1]   \n\t"
    "append           %[temp3],   %[temp1],    16         \n\t"
    "precr.qb.ph      %[temp8],   %[temp8],    %[temp4]   \n\t"
    "precr.qb.ph      %[temp3],   %[temp2],    %[temp3]   \n\t"
    "usw              %[temp8],   0*" XSTR(BPS) "(%[dst]) \n\t"
    "usw              %[temp3],   1*" XSTR(BPS) "(%[dst]) \n\t"
    // Rows 3 and 2: shift the row-1 / row-0 words left by one byte and insert
    // the low byte of temp6's lower / upper lane respectively.
    "append           %[temp3],   %[temp6],    8          \n\t"
    "srl              %[temp6],   %[temp6],    16         \n\t"
    "append           %[temp8],   %[temp6],    8          \n\t"
    "usw              %[temp3],   3*" XSTR(BPS) "(%[dst]) \n\t"
    "usw              %[temp8],   2*" XSTR(BPS) "(%[dst]) \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
832
// 4x4 "LD" predictor (presumably the down-left diagonal mode; confirm
// against the C reference implementation).
// Reads the bytes top[0..7]. Six 3-tap values (a + 2 * b + c + 2) >> 2 are
// computed two 16-bit lanes at a time. A seventh value is computed in scalar
// code from the two most-significant bytes of the word read at top[4], with
// the most-significant byte weighted three times. Rows 0 and 2 are stored
// first. Rows 1 and 3 are those same words shifted by one byte with one more
// filtered value inserted ('prepend').
static void LD4(uint8_t* dst, const uint8_t* top) {
  int temp0, temp1, temp2, temp3, temp4, temp5;
  int temp6, temp7, temp8, temp9, temp10, temp11;
  __asm__ volatile(
    // temp0 = bytes top[0..3], temp1 = bytes top[4..7].
    "ulw             %[temp0],    0(%[top])               \n\t"
    "ulw             %[temp1],    4(%[top])               \n\t"
    // Zero-extend all eight bytes to 16-bit lanes.
    "preceu.ph.qbl   %[temp2],    %[temp0]                \n\t"
    "preceu.ph.qbr   %[temp3],    %[temp0]                \n\t"
    "preceu.ph.qbr   %[temp4],    %[temp1]                \n\t"
    "preceu.ph.qbl   %[temp5],    %[temp1]                \n\t"
    // Middle-tap pairs straddling adjacent registers.
    "packrl.ph       %[temp6],    %[temp2],    %[temp3]   \n\t"
    "packrl.ph       %[temp7],    %[temp4],    %[temp2]   \n\t"
    "packrl.ph       %[temp8],    %[temp5],    %[temp4]   \n\t"
    // temp9, temp10, temp11 = (a + 2 * b + c + 2) >> 2 per lane.
    "shll.ph         %[temp6],    %[temp6],    1          \n\t"
    "addq.ph         %[temp9],    %[temp2],    %[temp6]   \n\t"
    "shll.ph         %[temp7],    %[temp7],    1          \n\t"
    "addq.ph         %[temp9],    %[temp9],    %[temp3]   \n\t"
    "shll.ph         %[temp8],    %[temp8],    1          \n\t"
    "shra_r.ph       %[temp9],    %[temp9],    2          \n\t"
    "addq.ph         %[temp10],   %[temp4],    %[temp7]   \n\t"
    "addq.ph         %[temp11],   %[temp5],    %[temp8]   \n\t"
    "addq.ph         %[temp10],   %[temp10],   %[temp2]   \n\t"
    "addq.ph         %[temp11],   %[temp11],   %[temp4]   \n\t"
    "shra_r.ph       %[temp10],   %[temp10],   2          \n\t"
    "shra_r.ph       %[temp11],   %[temp11],   2          \n\t"
    // Scalar tap: 2 * (most-significant byte of the second word) plus the sum
    // of that word's two upper bytes (raddu.w.qb over the expanded lanes),
    // then a rounding shift right by 2.
    "srl             %[temp1],    %[temp1],    24         \n\t"
    "sll             %[temp1],    %[temp1],    1          \n\t"
    "raddu.w.qb      %[temp5],    %[temp5]                \n\t"
    "precr.qb.ph     %[temp9],    %[temp10],   %[temp9]   \n\t"
    "precr.qb.ph     %[temp10],   %[temp11],   %[temp10]  \n\t"
    "addu            %[temp1],    %[temp1],    %[temp5]   \n\t"
    "shra_r.w        %[temp1],    %[temp1],    2          \n\t"
    "usw             %[temp9],    0*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp10],   2*" XSTR(BPS) "(%[dst]) \n\t"
    // Shift each stored word down by one byte and insert a new top byte.
    "prepend         %[temp9],    %[temp11],   8          \n\t"
    "prepend         %[temp10],   %[temp1],    8          \n\t"
    "usw             %[temp9],    1*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp10],   3*" XSTR(BPS) "(%[dst]) \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
879
// 4x4 "VL" predictor (presumably vertical-left; confirm against the C
// reference implementation).
// Reads the words at top[0..3] and top[4..7]. Two kinds of filtered values
// are computed, two 16-bit lanes at a time:
//  - 2-tap rounded averages (addqh_r.ph), kept in temp4 / temp5;
//  - 3-tap values (a + 2 * b + c + 2) >> 2 (adds + shll.ph, then
//    shra_r.ph by 2), kept in temp2 / temp0 / temp6.
// Rows 0 and 1 are packed and stored first. Rows 2 and 3 reuse the same
// registers, shifted by one byte with one byte of temp6 inserted
// ('prepend').
static void VL4(uint8_t* dst, const uint8_t* top) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
    "ulw              %[temp0],   0(%[top])               \n\t"
    "ulw              %[temp1],   4(%[top])               \n\t"
    // Zero-extend bytes to 16-bit lanes. The first word is split into
    // alternating bytes (qbla/qbra), the second into left/right byte pairs.
    "preceu.ph.qbla   %[temp2],   %[temp0]                \n\t"
    "preceu.ph.qbra   %[temp0],   %[temp0]                \n\t"
    "preceu.ph.qbl    %[temp3],   %[temp1]                \n\t"
    "preceu.ph.qbr    %[temp1],   %[temp1]                \n\t"
    "addqh_r.ph       %[temp4],   %[temp0],    %[temp2]   \n\t"
    "packrl.ph        %[temp7],   %[temp1],    %[temp0]   \n\t"
    "precrq.ph.w      %[temp6],   %[temp1],    %[temp2]   \n\t"
    "shll.ph          %[temp9],   %[temp2],    1          \n\t"
    "addqh_r.ph       %[temp5],   %[temp7],    %[temp2]   \n\t"
    "shll.ph          %[temp8],   %[temp7],    1          \n\t"
    "addu.ph          %[temp2],   %[temp2],    %[temp6]   \n\t"
    "addu.ph          %[temp0],   %[temp0],    %[temp7]   \n\t"
    "packrl.ph        %[temp7],   %[temp3],    %[temp1]   \n\t"
    "addu.ph          %[temp6],   %[temp1],    %[temp3]   \n\t"
    "addu.ph          %[temp2],   %[temp2],    %[temp8]   \n\t"
    "addu.ph          %[temp0],   %[temp0],    %[temp9]   \n\t"
    "shll.ph          %[temp7],   %[temp7],    1          \n\t"
    "shra_r.ph        %[temp2],   %[temp2],    2          \n\t"
    "shra_r.ph        %[temp0],   %[temp0],    2          \n\t"
    "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
    "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
    // Interleave lanes and pack them down to bytes for rows 0 and 1.
    "precrq.ph.w      %[temp8],   %[temp5],    %[temp4]   \n\t"
    "append           %[temp5],   %[temp4],    16         \n\t"
    "precrq.ph.w      %[temp3],   %[temp2],    %[temp0]   \n\t"
    "append           %[temp2],   %[temp0],    16         \n\t"
    "precr.qb.ph      %[temp8],   %[temp8],    %[temp5]   \n\t"
    "precr.qb.ph      %[temp3],   %[temp3],    %[temp2]   \n\t"
    "usw              %[temp8],   0*" XSTR(BPS) "(%[dst]) \n\t"
    // Rows 2 and 3: shift the row-0 / row-1 words down by one byte and insert
    // the low byte of temp6's lower / upper lane as the new top byte.
    "prepend          %[temp8],   %[temp6],    8          \n\t"
    "usw              %[temp3],   1*" XSTR(BPS) "(%[dst]) \n\t"
    "srl              %[temp6],   %[temp6],    16         \n\t"
    "prepend          %[temp3],   %[temp6],    8          \n\t"
    "usw              %[temp8],   2*" XSTR(BPS) "(%[dst]) \n\t"
    "usw              %[temp3],   3*" XSTR(BPS) "(%[dst]) \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
928
// 4x4 "HD" predictor (presumably horizontal-down; confirm against the C
// reference implementation).
// Reads the words at top[-5..-2] and top[-1..2]. The arithmetic stage (up to
// the last shra_r.ph) is the same instruction sequence as in VL4, applied to
// these inputs: 2-tap rounded averages in temp4 / temp5 and 3-tap values
// (a + 2 * b + c + 2) >> 2 in temp2 / temp0 / temp6. Each of the four rows is
// then packed from the low bytes of two lane registers (precr.qb.ph) and
// stored at dst + row * BPS.
static void HD4(uint8_t* dst, const uint8_t* top) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
    "ulw              %[temp0],   -5(%[top])              \n\t"
    "ulw              %[temp1],   -1(%[top])              \n\t"
    // Zero-extend bytes to 16-bit lanes. The first word is split into
    // alternating bytes (qbla/qbra), the second into left/right byte pairs.
    "preceu.ph.qbla   %[temp2],   %[temp0]                \n\t"
    "preceu.ph.qbra   %[temp0],   %[temp0]                \n\t"
    "preceu.ph.qbl    %[temp3],   %[temp1]                \n\t"
    "preceu.ph.qbr    %[temp1],   %[temp1]                \n\t"
    "addqh_r.ph       %[temp4],   %[temp0],    %[temp2]   \n\t"
    "packrl.ph        %[temp7],   %[temp1],    %[temp0]   \n\t"
    "precrq.ph.w      %[temp6],   %[temp1],    %[temp2]   \n\t"
    "shll.ph          %[temp9],   %[temp2],    1          \n\t"
    "addqh_r.ph       %[temp5],   %[temp7],    %[temp2]   \n\t"
    "shll.ph          %[temp8],   %[temp7],    1          \n\t"
    "addu.ph          %[temp2],   %[temp2],    %[temp6]   \n\t"
    "addu.ph          %[temp0],   %[temp0],    %[temp7]   \n\t"
    "packrl.ph        %[temp7],   %[temp3],    %[temp1]   \n\t"
    "addu.ph          %[temp6],   %[temp1],    %[temp3]   \n\t"
    "addu.ph          %[temp2],   %[temp2],    %[temp8]   \n\t"
    "addu.ph          %[temp0],   %[temp0],    %[temp9]   \n\t"
    "shll.ph          %[temp7],   %[temp7],    1          \n\t"
    "shra_r.ph        %[temp2],   %[temp2],    2          \n\t"
    "shra_r.ph        %[temp0],   %[temp0],    2          \n\t"
    "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
    "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
    // Rows 0 and 1: built from the upper lanes of the filtered registers.
    "precrq.ph.w      %[temp1],   %[temp2],    %[temp5]   \n\t"
    "precrq.ph.w      %[temp3],   %[temp0],    %[temp4]   \n\t"
    "precr.qb.ph      %[temp7],   %[temp6],    %[temp1]   \n\t"
    "precr.qb.ph      %[temp6],   %[temp1],    %[temp3]   \n\t"
    "usw              %[temp7],   0*" XSTR(BPS) "(%[dst]) \n\t"
    "usw              %[temp6],   1*" XSTR(BPS) "(%[dst]) \n\t"
    // Rows 2 and 3: same packing, now bringing in the lower lanes.
    "append           %[temp2],   %[temp5],    16         \n\t"
    "append           %[temp0],   %[temp4],    16         \n\t"
    "precr.qb.ph      %[temp5],   %[temp3],    %[temp2]   \n\t"
    "precr.qb.ph      %[temp4],   %[temp2],    %[temp0]   \n\t"
    "usw              %[temp5],   2*" XSTR(BPS) "(%[dst]) \n\t"
    "usw              %[temp4],   3*" XSTR(BPS) "(%[dst]) \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
976
// 4x4 "HU" predictor (presumably horizontal-up; confirm against the C
// reference implementation).
// Reads only the word at top[-5..-2]. Row 3 is filled with that word's
// least-significant byte replicated four times (top[-5] on a little-endian
// target). Rows 0..2 mix 2-tap rounded averages (addqh_r.ph) and 3-tap values
// (a + 2 * b + c + 2) >> 2 of the loaded bytes.
static void HU4(uint8_t* dst, const uint8_t* top) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  __asm__ volatile (
    "ulw             %[temp0],   -5(%[top])              \n\t"
    // Zero-extend the four bytes to 16-bit lanes (left pair / right pair).
    "preceu.ph.qbl   %[temp1],   %[temp0]                \n\t"
    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
    "packrl.ph       %[temp3],   %[temp1],    %[temp2]   \n\t"
    // temp7 = lowest byte replicated in all four bytes (stored as row 3).
    "replv.qb        %[temp7],   %[temp2]                \n\t"
    // 2-tap rounded averages.
    "addqh_r.ph      %[temp4],   %[temp1],    %[temp3]   \n\t"
    "addqh_r.ph      %[temp5],   %[temp3],    %[temp2]   \n\t"
    // 3-tap sums, followed by the rounding shift right by 2.
    "shll.ph         %[temp6],   %[temp3],    1          \n\t"
    "addu.ph         %[temp3],   %[temp2],    %[temp3]   \n\t"
    "addu.ph         %[temp6],   %[temp1],    %[temp6]   \n\t"
    "shll.ph         %[temp0],   %[temp2],    1          \n\t"
    "addu.ph         %[temp6],   %[temp6],    %[temp2]   \n\t"
    "addu.ph         %[temp0],   %[temp3],    %[temp0]   \n\t"
    "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
    "shra_r.ph       %[temp0],   %[temp0],    2          \n\t"
    // Interleave the averages and 3-tap values and pack them down to bytes.
    "packrl.ph       %[temp3],   %[temp6],    %[temp5]   \n\t"
    "precrq.ph.w     %[temp2],   %[temp6],    %[temp4]   \n\t"
    "append          %[temp0],   %[temp5],    16         \n\t"
    "precr.qb.ph     %[temp3],   %[temp3],    %[temp2]   \n\t"
    "usw             %[temp3],   0*" XSTR(BPS) "(%[dst]) \n\t"
    "precr.qb.ph     %[temp1],   %[temp7],    %[temp0]   \n\t"
    "usw             %[temp7],   3*" XSTR(BPS) "(%[dst]) \n\t"
    // Row 1 combines the lower half of row 2 with the upper half of row 0.
    "packrl.ph       %[temp2],   %[temp1],    %[temp3]   \n\t"
    "usw             %[temp1],   2*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp2],   1*" XSTR(BPS) "(%[dst]) \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
1012
1013//------------------------------------------------------------------------------
1014// Chroma 8x8 prediction (paragraph 12.2)
1015
1016static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
1017                                       const uint8_t* top) {
1018  // U block
1019  DCMode8(C8DC8 + dst, left, top);
1020  VerticalPred8(C8VE8 + dst, top);
1021  HorizontalPred8(C8HE8 + dst, left);
1022  TrueMotion8(C8TM8 + dst, left, top);
1023  // V block
1024  dst += 8;
1025  if (top) top += 8;
1026  if (left) left += 16;
1027  DCMode8(C8DC8 + dst, left, top);
1028  VerticalPred8(C8VE8 + dst, top);
1029  HorizontalPred8(C8HE8 + dst, left);
1030  TrueMotion8(C8TM8 + dst, left, top);
1031}
1032
1033//------------------------------------------------------------------------------
// Luma 16x16 prediction (paragraph 12.3)
1035
1036static void Intra16Preds_MIPSdspR2(uint8_t* dst,
1037                                   const uint8_t* left, const uint8_t* top) {
1038  DCMode16(I16DC16 + dst, left, top);
1039  VerticalPred16(I16VE16 + dst, top);
1040  HorizontalPred16(I16HE16 + dst, left);
1041  TrueMotion16(I16TM16 + dst, left, top);
1042}
1043
1044// Left samples are top[-5 .. -2], top_left is top[-1], top are
1045// located at top[0..3], and top right is top[4..7]
1046static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
1047  DC4(I4DC4 + dst, top);
1048  TM4(I4TM4 + dst, top);
1049  VE4(I4VE4 + dst, top);
1050  HE4(I4HE4 + dst, top);
1051  RD4(I4RD4 + dst, top);
1052  VR4(I4VR4 + dst, top);
1053  LD4(I4LD4 + dst, top);
1054  VL4(I4VL4 + dst, top);
1055  HD4(I4HD4 + dst, top);
1056  HU4(I4HU4 + dst, top);
1057}
1058
1059//------------------------------------------------------------------------------
1060// Metric
1061
1062#if !defined(WORK_AROUND_GCC)
1063
// Emits the assembly that adds, to accumulator $ac0, the squared differences
// of the four bytes found at byte offset A from %[a] and from %[b]:
//  - both words are read with 'lw' (NOTE(review): unlike the 'ulw' used by
//    the predictors, this presumably needs a + A and b + A to be
//    word-aligned -- confirm with the callers);
//  - the bytes are zero-extended to 16-bit lanes (preceu.ph.qbr / .qbl);
//  - the lanes are subtracted, and dpa.w.ph multiplies each difference pair
//    by itself and accumulates the products into $ac0.
// Relies on the operands temp0..temp3, a and b of the enclosing asm
// statement. A is stringified (#A) straight into the instruction text.
#define GET_SSE_INNER(A)                                                  \
  "lw               %[temp0],    " #A "(%[a])                  \n\t"      \
  "lw               %[temp1],    " #A "(%[b])                  \n\t"      \
  "preceu.ph.qbr    %[temp2],    %[temp0]                      \n\t"      \
  "preceu.ph.qbl    %[temp0],    %[temp0]                      \n\t"      \
  "preceu.ph.qbr    %[temp3],    %[temp1]                      \n\t"      \
  "preceu.ph.qbl    %[temp1],    %[temp1]                      \n\t"      \
  "subq.ph          %[temp2],    %[temp2],    %[temp3]         \n\t"      \
  "subq.ph          %[temp0],    %[temp0],    %[temp1]         \n\t"      \
  "dpa.w.ph         $ac0,        %[temp2],    %[temp2]         \n\t"      \
  "dpa.w.ph         $ac0,        %[temp0],    %[temp0]         \n\t"
1075
// Accumulates the squared differences for four 4-byte groups, at byte
// offsets A, B, C and D. The arguments are not stringified at this level, so
// macros inside them (e.g. BPS) are expanded before GET_SSE_INNER turns each
// offset into assembly text.
#define GET_SSE(A, B, C, D)               \
  GET_SSE_INNER(A)                        \
  GET_SSE_INNER(B)                        \
  GET_SSE_INNER(C)                        \
  GET_SSE_INNER(D)
1081
// Returns the sum of squared differences between the 16x16 byte blocks at
// 'a' and 'b'. Both blocks are read as 16 rows of 16 bytes, with consecutive
// rows BPS bytes apart. The sum is accumulated in $ac0 (hence the "hi" / "lo"
// clobbers) and its low 32 bits are returned.
static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
    // 0 * 0: clears the hi/lo accumulator pair before accumulating.
    "mult   $zero,    $zero                            \n\t"
    // One GET_SSE per row: byte offsets 0, 4, 8 and 12 within the row.
    GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
    GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
    GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
    GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
    GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
    GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
    GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
    GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
    GET_SSE( 8 * BPS, 4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS)
    GET_SSE( 9 * BPS, 4 +  9 * BPS, 8 +  9 * BPS, 12 +  9 * BPS)
    GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
    GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
    GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
    GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
    GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
    GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
    // Fetch the low word of the accumulator as the result.
    "mflo   %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
    : "memory", "hi", "lo"
  );
  return count;
}
1111
// Returns the sum of squared differences between the byte blocks at 'a' and
// 'b', read as 8 rows of 16 bytes with consecutive rows BPS bytes apart. The
// sum is accumulated in $ac0 and its low 32 bits are returned.
static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
    // 0 * 0: clears the hi/lo accumulator pair before accumulating.
    "mult   $zero,    $zero                            \n\t"
    // One GET_SSE per row: byte offsets 0, 4, 8 and 12 within the row.
    GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
    GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
    GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
    GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
    GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
    GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
    GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
    GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
    // Fetch the low word of the accumulator as the result.
    "mflo   %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
    : "memory", "hi", "lo"
  );
  return count;
}
1133
1134static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
1135  int count;
1136  int temp0, temp1, temp2, temp3;
1137  __asm__ volatile (
1138    "mult   $zero,    $zero                            \n\t"
1139    GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
1140    GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
1141    GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
1142    GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)
1143    "mflo   %[count]                                   \n\t"
1144    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
1145      [temp3]"=&r"(temp3), [count]"=&r"(count)
1146    : [a]"r"(a), [b]"r"(b)
1147    : "memory", "hi", "lo"
1148  );
1149  return count;
1150}
1151
1152static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
1153  int count;
1154  int temp0, temp1, temp2, temp3;
1155  __asm__ volatile (
1156    "mult   $zero,    $zero                            \n\t"
1157    GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)
1158    "mflo   %[count]                                   \n\t"
1159    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
1160      [temp3]"=&r"(temp3), [count]"=&r"(count)
1161    : [a]"r"(a), [b]"r"(b)
1162    : "memory", "hi", "lo"
1163  );
1164  return count;
1165}
1166
1167#undef GET_SSE
1168#undef GET_SSE_INNER
1169
1170#endif  // !WORK_AROUND_GCC
1171
1172#undef FILL_8_OR_16
1173#undef FILL_PART
1174#undef OUTPUT_EARLY_CLOBBER_REGS_17
1175#undef MUL_HALF
1176#undef ABS_X8
1177#undef ADD_SUB_HALVES_X4
1178
1179//------------------------------------------------------------------------------
1180// Quantization
1181//
1182
// macro for one pass through for loop in QuantizeBlock reading 2 values at time
// (one 32-bit word of in[] holds the coefficients j and j + 1).
// QUANTDIV macro inlined
// J  - offset in bytes (kZigzag[n] * 2) of the pair in the 16-bit arrays
//      (ppin, ppsharpen, ppq, ppiq)
// K  - offset in bytes (kZigzag[n] * 4) of the pair in the 32-bit arrays
//      (ppzthresh, ppbias)
// N  - offset in bytes into pout where the level of the coefficient at J goes
// N1 - offset in bytes into pout where the level of the coefficient at J + 2
//      goes. This is generally NOT N + 2, see e.g. QUANTIZE_ONE(12, 24, 14, 24).
//
// Flow: coeff = |in| + sharpen is compared per halfword against zthresh and
// one of four paths is taken:
//   0:            no coefficient is above its threshold, everything is zeroed
//   1:            both are above, both are quantized with paired instructions
//   2:            only the coefficient held in the upper halfword is above
//   fall-through: only the coefficient held in the lower halfword is above
// A quantized level is ((coeff * iq + bias) >> 17), clamped to max_level, with
// the sign of the input restored. The level is stored to pout, level * q is
// written back to ppin and the level is ORed into 'ret'.
// In path 1 the clamp value must be taken from max_level1 (max_level
// replicated in both halfwords): the upper halfword of the scalar max_level is
// zero, so picking from it would turn an over-range upper level into 0.
// NOTE(review): loading zthresh with 'lhu' at K / K+4 and treating the lower
// halfword of the 'ulw' result as the coefficient at J assumes a little-endian
// target -- confirm.
#define QUANTIZE_ONE(J, K, N, N1)                                         \
  "ulw         %[temp1],     " #J "(%[ppin])                 \n\t"        \
  "ulw         %[temp2],     " #J "(%[ppsharpen])            \n\t"        \
  "lhu         %[temp3],     " #K "(%[ppzthresh])            \n\t"        \
  "lhu         %[temp6],     " #K "+4(%[ppzthresh])          \n\t"        \
  "absq_s.ph   %[temp4],     %[temp1]                        \n\t"        \
  "ins         %[temp3],     %[temp6],         16,       16  \n\t"        \
  "addu.ph     %[coeff],     %[temp4],         %[temp2]      \n\t"        \
  "shra.ph     %[sign],      %[temp1],         15            \n\t"        \
  "li          %[level],     0x10001                         \n\t"        \
  "cmp.lt.ph   %[temp3],     %[coeff]                        \n\t"        \
  "lhu         %[temp1],     " #J "(%[ppiq])                 \n\t"        \
  "pick.ph     %[temp5],     %[level],         $0            \n\t"        \
  "lw          %[temp2],     " #K "(%[ppbias])               \n\t"        \
  "beqz        %[temp5],     0f                              \n\t"        \
  "lhu         %[temp3],     " #J "(%[ppq])                  \n\t"        \
  "beq         %[temp5],     %[level],         1f            \n\t"        \
  "andi        %[temp5],     %[temp5],         0x1           \n\t"        \
  "andi        %[temp4],     %[coeff],         0xffff        \n\t"        \
  "beqz        %[temp5],     2f                              \n\t"        \
  "mul         %[level],     %[temp4],         %[temp1]      \n\t"        \
  "sh          $0,           " #J "+2(%[ppin])               \n\t"        \
  "sh          $0,           " #N1 "(%[pout])                \n\t"        \
  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
  "sra         %[level],     %[level],         17            \n\t"        \
  "slt         %[temp4],     %[max_level],     %[level]      \n\t"        \
  "movn        %[level],     %[max_level],     %[temp4]      \n\t"        \
  "andi        %[temp6],     %[sign],          0xffff        \n\t"        \
  "xor         %[level],     %[level],         %[temp6]      \n\t"        \
  "subu        %[level],     %[level],         %[temp6]      \n\t"        \
  "mul         %[temp5],     %[level],         %[temp3]      \n\t"        \
  "or          %[ret],       %[ret],           %[level]      \n\t"        \
  "sh          %[level],     " #N "(%[pout])                 \n\t"        \
  "sh          %[temp5],     " #J "(%[ppin])                 \n\t"        \
  "j           3f                                            \n\t"        \
"2:                                                          \n\t"        \
  "lhu         %[temp1],     " #J "+2(%[ppiq])               \n\t"        \
  "srl         %[temp5],     %[coeff],         16            \n\t"        \
  "mul         %[level],     %[temp5],         %[temp1]      \n\t"        \
  "lw          %[temp2],     " #K "+4(%[ppbias])             \n\t"        \
  "lhu         %[temp3],     " #J "+2(%[ppq])                \n\t"        \
  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
  "sra         %[level],     %[level],         17            \n\t"        \
  "srl         %[temp6],     %[sign],          16            \n\t"        \
  "slt         %[temp4],     %[max_level],     %[level]      \n\t"        \
  "movn        %[level],     %[max_level],     %[temp4]      \n\t"        \
  "xor         %[level],     %[level],         %[temp6]      \n\t"        \
  "subu        %[level],     %[level],         %[temp6]      \n\t"        \
  "mul         %[temp5],     %[level],         %[temp3]      \n\t"        \
  "sh          $0,           " #J "(%[ppin])                 \n\t"        \
  "sh          $0,           " #N "(%[pout])                 \n\t"        \
  "or          %[ret],       %[ret],           %[level]      \n\t"        \
  "sh          %[temp5],     " #J "+2(%[ppin])               \n\t"        \
  "sh          %[level],     " #N1 "(%[pout])                \n\t"        \
  "j           3f                                            \n\t"        \
"1:                                                          \n\t"        \
  "lhu         %[temp1],     " #J "(%[ppiq])                 \n\t"        \
  "lw          %[temp2],     " #K "(%[ppbias])               \n\t"        \
  "ulw         %[temp3],     " #J "(%[ppq])                  \n\t"        \
  "andi        %[temp5],     %[coeff],         0xffff        \n\t"        \
  "srl         %[temp0],     %[coeff],         16            \n\t"        \
  "lhu         %[temp6],     " #J "+2(%[ppiq])               \n\t"        \
  "lw          %[coeff],     " #K "+4(%[ppbias])             \n\t"        \
  "mul         %[level],     %[temp5],         %[temp1]      \n\t"        \
  "mul         %[temp4],     %[temp0],         %[temp6]      \n\t"        \
  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
  "addu        %[temp4],     %[temp4],         %[coeff]      \n\t"        \
  "precrq.ph.w %[level],     %[temp4],         %[level]      \n\t"        \
  "shra.ph     %[level],     %[level],         1             \n\t"        \
  "cmp.lt.ph   %[max_level1],%[level]                        \n\t"        \
  "pick.ph     %[level],     %[max_level1],    %[level]      \n\t"        \
  "xor         %[level],     %[level],         %[sign]       \n\t"        \
  "subu.ph     %[level],     %[level],         %[sign]       \n\t"        \
  "mul.ph      %[temp3],     %[level],         %[temp3]      \n\t"        \
  "or          %[ret],       %[ret],           %[level]      \n\t"        \
  "sh          %[level],     " #N "(%[pout])                 \n\t"        \
  "srl         %[level],     %[level],         16            \n\t"        \
  "sh          %[level],     " #N1 "(%[pout])                \n\t"        \
  "usw         %[temp3],     " #J "(%[ppin])                 \n\t"        \
  "j           3f                                            \n\t"        \
"0:                                                          \n\t"        \
  "sh          $0,           " #N "(%[pout])                 \n\t"        \
  "sh          $0,           " #N1 "(%[pout])                \n\t"        \
  "usw         $0,           " #J "(%[ppin])                 \n\t"        \
"3:                                                          \n\t"
1274
1275static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
1276                                   const VP8Matrix* const mtx) {
1277  int temp0, temp1, temp2, temp3, temp4, temp5,temp6;
1278  int sign, coeff, level;
1279  int max_level = MAX_LEVEL;
1280  int max_level1 = max_level << 16 | max_level;
1281  int ret = 0;
1282
1283  int16_t* ppin             = &in[0];
1284  int16_t* pout             = &out[0];
1285  const uint16_t* ppsharpen = &mtx->sharpen_[0];
1286  const uint32_t* ppzthresh = &mtx->zthresh_[0];
1287  const uint16_t* ppq       = &mtx->q_[0];
1288  const uint16_t* ppiq      = &mtx->iq_[0];
1289  const uint32_t* ppbias    = &mtx->bias_[0];
1290
1291  __asm__ volatile (
1292    QUANTIZE_ONE( 0,  0,  0,  2)
1293    QUANTIZE_ONE( 4,  8, 10, 12)
1294    QUANTIZE_ONE( 8, 16,  4,  8)
1295    QUANTIZE_ONE(12, 24, 14, 24)
1296    QUANTIZE_ONE(16, 32,  6, 16)
1297    QUANTIZE_ONE(20, 40, 22, 26)
1298    QUANTIZE_ONE(24, 48, 18, 20)
1299    QUANTIZE_ONE(28, 56, 28, 30)
1300
1301    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
1302      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
1303      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
1304      [sign]"=&r"(sign), [coeff]"=&r"(coeff),
1305      [level]"=&r"(level), [temp6]"=&r"(temp6), [ret]"+&r"(ret)
1306    : [ppin]"r"(ppin), [pout]"r"(pout), [max_level1]"r"(max_level1),
1307      [ppiq]"r"(ppiq), [max_level]"r"(max_level),
1308      [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
1309      [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
1310    : "memory", "hi", "lo"
1311  );
1312
1313  return (ret != 0);
1314}
1315
1316static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
1317                                     const VP8Matrix* const mtx) {
1318  int nz;
1319  nz  = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
1320  nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
1321  return nz;
1322}
1323
1324#undef QUANTIZE_ONE
1325
// macro for one horizontal pass in FTransformWHT
// temp0..temp7 holds tmp[0]..tmp[15]
// A, B, C, D - offset in bytes to load from in buffer
// TEMP0, TEMP1 - registers for corresponding tmp elements
//
// With a, b, c, d the signed halfwords loaded from offsets A, B, C, D, the
// macro packs them as {a, b} and {c, d}, forms their paired difference and sum,
// regroups the halves (precrq.ph.w / append) and leaves, as {upper, lower}
// halfwords:
//   TEMP0 = { (a - c) + (b - d), (a + c) + (b + d) }
//   TEMP1 = { (a + c) - (b + d), (a - c) - (b - d) }   (after the final rotr)
// temp8 and temp9 are used as scratch registers.
#define HORIZONTAL_PASS_WHT(A, B, C, D, TEMP0, TEMP1)                          \
  "lh              %[" #TEMP0 "],  " #A "(%[in])            \n\t"              \
  "lh              %[" #TEMP1 "],  " #B "(%[in])            \n\t"              \
  "lh              %[temp8],     " #C "(%[in])              \n\t"              \
  "lh              %[temp9],     " #D "(%[in])              \n\t"              \
  "ins             %[" #TEMP1 "],  %[" #TEMP0 "],  16,  16  \n\t"              \
  "ins             %[temp9],     %[temp8],     16,  16      \n\t"              \
  "subq.ph         %[temp8],     %[" #TEMP1 "],  %[temp9]   \n\t"              \
  "addq.ph         %[temp9],     %[" #TEMP1 "],  %[temp9]   \n\t"              \
  "precrq.ph.w     %[" #TEMP0 "],  %[temp8],     %[temp9]   \n\t"              \
  "append          %[temp8],     %[temp9],     16           \n\t"              \
  "subq.ph         %[" #TEMP1 "],  %[" #TEMP0 "],  %[temp8] \n\t"              \
  "addq.ph         %[" #TEMP0 "],  %[" #TEMP0 "],  %[temp8] \n\t"              \
  "rotr            %[" #TEMP1 "],  %[" #TEMP1 "],  16       \n\t"
1344
// macro for one vertical pass in FTransformWHT
// temp0..temp7 holds tmp[0]..tmp[15]
// A, B, C, D - offsets in bytes to store to out buffer
// TEMP0, TEMP2, TEMP4 and TEMP6 - registers for corresponding tmp elements
//
// Works on two columns at once (one per halfword). With
//   a0 = TEMP0 + TEMP4, a1 = TEMP2 + TEMP6,
//   a2 = TEMP2 - TEMP6, a3 = TEMP0 - TEMP4
// it stores (unaligned word stores, results halved by addqh/subqh):
//   out[A] = (a0 + a1) >> 1      out[B] = (a3 + a2) >> 1
//   out[C] = (a3 - a2) >> 1      out[D] = (a0 - a1) >> 1
// The four TEMP registers are overwritten; temp8 and temp9 hold a0 and a1.
#define VERTICAL_PASS_WHT(A, B, C, D, TEMP0, TEMP2, TEMP4, TEMP6)              \
  "addq.ph         %[temp8],     %[" #TEMP0 "],  %[" #TEMP4 "]    \n\t"        \
  "addq.ph         %[temp9],     %[" #TEMP2 "],  %[" #TEMP6 "]    \n\t"        \
  "subq.ph         %[" #TEMP2 "],  %[" #TEMP2 "],  %[" #TEMP6 "]  \n\t"        \
  "subq.ph         %[" #TEMP6 "],  %[" #TEMP0 "],  %[" #TEMP4 "]  \n\t"        \
  "addqh.ph        %[" #TEMP0 "],  %[temp8],     %[temp9]         \n\t"        \
  "subqh.ph        %[" #TEMP4 "],  %[" #TEMP6 "],  %[" #TEMP2 "]  \n\t"        \
  "addqh.ph        %[" #TEMP2 "],  %[" #TEMP2 "],  %[" #TEMP6 "]  \n\t"        \
  "subqh.ph        %[" #TEMP6 "],  %[temp8],     %[temp9]         \n\t"        \
  "usw             %[" #TEMP0 "],  " #A "(%[out])                 \n\t"        \
  "usw             %[" #TEMP2 "],  " #B "(%[out])                 \n\t"        \
  "usw             %[" #TEMP4 "],  " #C "(%[out])                 \n\t"        \
  "usw             %[" #TEMP6 "],  " #D "(%[out])                 \n\t"
1362
1363static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
1364  int temp0, temp1, temp2, temp3, temp4;
1365  int temp5, temp6, temp7, temp8, temp9;
1366
1367  __asm__ volatile (
1368    HORIZONTAL_PASS_WHT(  0,  32,  64,  96, temp0, temp1)
1369    HORIZONTAL_PASS_WHT(128, 160, 192, 224, temp2, temp3)
1370    HORIZONTAL_PASS_WHT(256, 288, 320, 352, temp4, temp5)
1371    HORIZONTAL_PASS_WHT(384, 416, 448, 480, temp6, temp7)
1372    VERTICAL_PASS_WHT(0,  8, 16, 24, temp0, temp2, temp4, temp6)
1373    VERTICAL_PASS_WHT(4, 12, 20, 28, temp1, temp3, temp5, temp7)
1374    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
1375      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
1376      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
1377      [temp9]"=&r"(temp9)
1378    : [in]"r"(in), [out]"r"(out)
1379    : "memory"
1380  );
1381}
1382
1383#undef VERTICAL_PASS_WHT
1384#undef HORIZONTAL_PASS_WHT
1385
// macro for converting coefficients to bin
// convert 8 coeffs at time
// A, B, C, D - offsets in bytes to load from out buffer
//
// Steps, applied to each of the eight 16-bit coefficients (two per register):
//   1. absq_s.ph          : saturating absolute value
//   2. shra.ph 3          : divide by 8
//   3. shll_s.ph 10 then
//      shrl.ph 10         : the saturating left shift followed by the logical
//                           right shift caps the value at 31 (0x7fff >> 10)
//   4. shll.ph 2          : scale the bin index to a byte offset (4-byte bins)
// The halfwords are then split with 'ext', added to the base address %[dist],
// and each addressed 32-bit counter is incremented with lw / addiu / sw.
// temp4..temp7 receive the lower-halfword addresses, temp0..temp3 are reused
// for the upper-halfword addresses and temp8 holds the counter being updated.
// The loads, address computations and stores of different coefficients are
// interleaved.
#define CONVERT_COEFFS_TO_BIN(A, B, C, D)                                      \
  "ulw        %[temp0],  " #A "(%[out])                \n\t"                   \
  "ulw        %[temp1],  " #B "(%[out])                \n\t"                   \
  "ulw        %[temp2],  " #C "(%[out])                \n\t"                   \
  "ulw        %[temp3],  " #D "(%[out])                \n\t"                   \
  "absq_s.ph  %[temp0],  %[temp0]                      \n\t"                   \
  "absq_s.ph  %[temp1],  %[temp1]                      \n\t"                   \
  "absq_s.ph  %[temp2],  %[temp2]                      \n\t"                   \
  "absq_s.ph  %[temp3],  %[temp3]                      \n\t"                   \
  "shra.ph    %[temp0],  %[temp0],    3                \n\t"                   \
  "shra.ph    %[temp1],  %[temp1],    3                \n\t"                   \
  "shra.ph    %[temp2],  %[temp2],    3                \n\t"                   \
  "shra.ph    %[temp3],  %[temp3],    3                \n\t"                   \
  "shll_s.ph  %[temp0],  %[temp0],    10               \n\t"                   \
  "shll_s.ph  %[temp1],  %[temp1],    10               \n\t"                   \
  "shll_s.ph  %[temp2],  %[temp2],    10               \n\t"                   \
  "shll_s.ph  %[temp3],  %[temp3],    10               \n\t"                   \
  "shrl.ph    %[temp0],  %[temp0],    10               \n\t"                   \
  "shrl.ph    %[temp1],  %[temp1],    10               \n\t"                   \
  "shrl.ph    %[temp2],  %[temp2],    10               \n\t"                   \
  "shrl.ph    %[temp3],  %[temp3],    10               \n\t"                   \
  "shll.ph    %[temp0],  %[temp0],    2                \n\t"                   \
  "shll.ph    %[temp1],  %[temp1],    2                \n\t"                   \
  "shll.ph    %[temp2],  %[temp2],    2                \n\t"                   \
  "shll.ph    %[temp3],  %[temp3],    2                \n\t"                   \
  "ext        %[temp4],  %[temp0],    0,       16      \n\t"                   \
  "ext        %[temp0],  %[temp0],    16,      16      \n\t"                   \
  "addu       %[temp4],  %[temp4],    %[dist]          \n\t"                   \
  "addu       %[temp0],  %[temp0],    %[dist]          \n\t"                   \
  "ext        %[temp5],  %[temp1],    0,       16      \n\t"                   \
  "lw         %[temp8],  0(%[temp4])                   \n\t"                   \
  "ext        %[temp1],  %[temp1],    16,      16      \n\t"                   \
  "addu       %[temp5],  %[temp5],    %[dist]          \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp4])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp0])                   \n\t"                   \
  "addu       %[temp1],  %[temp1],    %[dist]          \n\t"                   \
  "ext        %[temp6],  %[temp2],    0,       16      \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp0])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp5])                   \n\t"                   \
  "ext        %[temp2],  %[temp2],    16,      16      \n\t"                   \
  "addu       %[temp6],  %[temp6],    %[dist]          \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp5])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp1])                   \n\t"                   \
  "addu       %[temp2],  %[temp2],    %[dist]          \n\t"                   \
  "ext        %[temp7],  %[temp3],    0,       16      \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp1])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp6])                   \n\t"                   \
  "ext        %[temp3],  %[temp3],    16,      16      \n\t"                   \
  "addu       %[temp7],  %[temp7],    %[dist]          \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp6])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp2])                   \n\t"                   \
  "addu       %[temp3],  %[temp3],    %[dist]          \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp2])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp7])                   \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp7])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp3])                   \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp3])                   \n\t"
1454
1455static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred,
1456                                       int start_block, int end_block,
1457                                       VP8Histogram* const histo) {
1458  int j;
1459  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
1460  const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH;
1461  for (j = start_block; j < end_block; ++j) {
1462    int16_t out[16];
1463    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
1464
1465    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
1466
1467    // Convert coefficients to bin.
1468    __asm__ volatile (
1469      CONVERT_COEFFS_TO_BIN( 0,  4,  8, 12)
1470      CONVERT_COEFFS_TO_BIN(16, 20, 24, 28)
1471      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
1472        [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
1473        [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
1474      : [dist]"r"(distribution), [out]"r"(out), [max_coeff]"r"(max_coeff)
1475      : "memory"
1476    );
1477  }
1478  VP8SetHistogramData(distribution, histo);
1479}
1480
1481#undef CONVERT_COEFFS_TO_BIN
1482
1483//------------------------------------------------------------------------------
1484// Entry point
1485
1486extern void VP8EncDspInitMIPSdspR2(void);
1487
1488WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
1489  VP8FTransform = FTransform_MIPSdspR2;
1490  VP8FTransformWHT = FTransformWHT_MIPSdspR2;
1491  VP8ITransform = ITransform_MIPSdspR2;
1492
1493  VP8TDisto4x4 = Disto4x4_MIPSdspR2;
1494  VP8TDisto16x16 = Disto16x16_MIPSdspR2;
1495
1496  VP8EncPredLuma16 = Intra16Preds_MIPSdspR2;
1497  VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2;
1498  VP8EncPredLuma4 = Intra4Preds_MIPSdspR2;
1499
1500#if !defined(WORK_AROUND_GCC)
1501  VP8SSE16x16 = SSE16x16_MIPSdspR2;
1502  VP8SSE8x8 = SSE8x8_MIPSdspR2;
1503  VP8SSE16x8 = SSE16x8_MIPSdspR2;
1504  VP8SSE4x4 = SSE4x4_MIPSdspR2;
1505#endif
1506
1507  VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2;
1508  VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2;
1509
1510  VP8CollectHistogram = CollectHistogram_MIPSdspR2;
1511}
1512
1513#else  // !WEBP_USE_MIPS_DSP_R2
1514
// Built without WEBP_USE_MIPS_DSP_R2: the init symbol is still provided.
// NOTE(review): WEBP_DSP_INIT_STUB presumably expands to an empty
// VP8EncDspInitMIPSdspR2() -- confirm in src/dsp/dsp.h.
WEBP_DSP_INIT_STUB(VP8EncDspInitMIPSdspR2)
1516
1517#endif  // WEBP_USE_MIPS_DSP_R2
1518