// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of speed-critical encoding functions.
//
// Author(s): Djordje Pesut    (djordje.pesut@imgtec.com)
//            Jovan Zelincevic (jovan.zelincevic@imgtec.com)
//            Slobodan Prijic  (slobodan.prijic@imgtec.com)
15af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
16af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#include "./dsp.h"
17af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
18af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if defined(WEBP_USE_MIPS32)
19af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
20af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#include "../enc/vp8enci.h"
21af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#include "../enc/cost.h"
22af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Defines WORK_AROUND_GCC for one specific toolchain combination.
// NOTE(review): LOCAL_GCC_VERSION is not defined in this file (presumably it
// comes from dsp.h) and 0x409 looks like an encoded "GCC 4.9" -- confirm.
// No use of WORK_AROUND_GCC is visible in this part of the file.
#if defined(__GNUC__) && defined(__ANDROID__) && LOCAL_GCC_VERSION == 0x409
#define WORK_AROUND_GCC
#endif
26af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Fixed-point multipliers used by the inverse-transform passes below: every
// product with one of these constants is followed by an arithmetic shift
// right by 16, so a constant c stands for the factor c / 65536.
//   kC1 = 65536 + 20091 = 85627   (factor 1 + 20091 / 65536)
//   kC2 = 35468                   (factor 35468 / 65536)
// NOTE(review): these look like the usual VP8 inverse-DCT constants
// (RFC 6386) -- confirm.
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
29af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Macro for one vertical pass in ITransformOne.
// The MUL macro is inlined: MUL(a, b) is "mul" followed by "sra ..., 16".
// Computes, for the four signed 16-bit inputs x0 (offset A), x8 (offset B),
// x4 (offset C) and x12 (offset D), all relative to the address in temp20:
//   a = x0 + x8                        b = x0 - x8
//   c = MUL(x4, kC2) - MUL(x12, kC1)   d = MUL(x4, kC1) + MUL(x12, kC2)
//   TEMP0 = a + d, TEMP1 = b + c, TEMP2 = b - c, TEMP3 = a - d
// Once all four passes are done, temp0..temp15 hold tmp[0]..tmp[15].
// A..D         - offsets in bytes to load from the in buffer
// TEMP0..TEMP3 - registers receiving the corresponding tmp elements
// TEMP4        - scratch register holding 'a'; it must differ from
//                TEMP0..TEMP3. It is written only after the four loads, so
//                it may be temp20 when no later pass loads through temp20.
// temp16..temp19 are clobbered.
#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3)        \
  "lh      %[temp16],      "#A"(%[temp20])                 \n\t"            \
  "lh      %[temp18],      "#B"(%[temp20])                 \n\t"            \
  "lh      %[temp17],      "#C"(%[temp20])                 \n\t"            \
  "lh      %[temp19],      "#D"(%[temp20])                 \n\t"            \
  "addu    %["#TEMP4"],    %[temp16],      %[temp18]       \n\t"            \
  "subu    %[temp16],      %[temp16],      %[temp18]       \n\t"            \
  "mul     %["#TEMP0"],    %[temp17],      %[kC2]          \n\t"            \
  "mul     %[temp18],      %[temp19],      %[kC1]          \n\t"            \
  "mul     %[temp17],      %[temp17],      %[kC1]          \n\t"            \
  "mul     %[temp19],      %[temp19],      %[kC2]          \n\t"            \
  "sra     %["#TEMP0"],    %["#TEMP0"],    16              \n\t"            \
  "sra     %[temp18],      %[temp18],      16              \n\t"            \
  "sra     %[temp17],      %[temp17],      16              \n\t"            \
  "sra     %[temp19],      %[temp19],      16              \n\t"            \
  "subu    %["#TEMP2"],    %["#TEMP0"],    %[temp18]       \n\t"            \
  "addu    %["#TEMP3"],    %[temp17],      %[temp19]       \n\t"            \
  "addu    %["#TEMP0"],    %["#TEMP4"],    %["#TEMP3"]     \n\t"            \
  "addu    %["#TEMP1"],    %[temp16],      %["#TEMP2"]     \n\t"            \
  "subu    %["#TEMP2"],    %[temp16],      %["#TEMP2"]     \n\t"            \
  "subu    %["#TEMP3"],    %["#TEMP4"],    %["#TEMP3"]     \n\t"
57af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Macro for one horizontal pass in ITransformOne.
// MUL and STORE macros are inlined.
// a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
// On entry temp0..temp15 hold tmp[0]..tmp[15]. With t0 = TEMP0 + 4,
// t4 = TEMP4, t8 = TEMP8 and t12 = TEMP12 the pass computes
//   a = t0 + t8                        b = t0 - t8
//   c = MUL(t4, kC2) - MUL(t12, kC1)   d = MUL(t4, kC1) + MUL(t12, kC2)
// then shifts a + d, b + c, b - c, a - d right by 3, adds the reference bytes
// at offsets A, B, C, D, clamps to [0, 255] and stores the four bytes.
// A..D - offsets in bytes to load from the ref buffer and store to the dst
//        buffer
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers holding the corresponding tmp
//        elements; they are overwritten with the four output values
// The ref pointer is loaded from 0(args) and the dst pointer from 8(args).
// temp16..temp20 are clobbered.
#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)            \
  "addiu   %["#TEMP0"],    %["#TEMP0"],    4               \n\t"            \
  "addu    %[temp16],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
  "subu    %[temp17],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
  "mul     %["#TEMP0"],    %["#TEMP4"],    %[kC2]          \n\t"            \
  "mul     %["#TEMP8"],    %["#TEMP12"],   %[kC1]          \n\t"            \
  "mul     %["#TEMP4"],    %["#TEMP4"],    %[kC1]          \n\t"            \
  "mul     %["#TEMP12"],   %["#TEMP12"],   %[kC2]          \n\t"            \
  "sra     %["#TEMP0"],    %["#TEMP0"],    16              \n\t"            \
  "sra     %["#TEMP8"],    %["#TEMP8"],    16              \n\t"            \
  "sra     %["#TEMP4"],    %["#TEMP4"],    16              \n\t"            \
  "sra     %["#TEMP12"],   %["#TEMP12"],   16              \n\t"            \
  "subu    %[temp18],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
  "addu    %[temp19],      %["#TEMP4"],    %["#TEMP12"]    \n\t"            \
  "addu    %["#TEMP0"],    %[temp16],      %[temp19]       \n\t"            \
  "addu    %["#TEMP4"],    %[temp17],      %[temp18]       \n\t"            \
  "subu    %["#TEMP8"],    %[temp17],      %[temp18]       \n\t"            \
  "subu    %["#TEMP12"],   %[temp16],      %[temp19]       \n\t"            \
  "lw      %[temp20],      0(%[args])                      \n\t"            \
  "sra     %["#TEMP0"],    %["#TEMP0"],    3               \n\t"            \
  "sra     %["#TEMP4"],    %["#TEMP4"],    3               \n\t"            \
  "sra     %["#TEMP8"],    %["#TEMP8"],    3               \n\t"            \
  "sra     %["#TEMP12"],   %["#TEMP12"],   3               \n\t"            \
  "lbu     %[temp16],      "#A"(%[temp20])                 \n\t"            \
  "lbu     %[temp17],      "#B"(%[temp20])                 \n\t"            \
  "lbu     %[temp18],      "#C"(%[temp20])                 \n\t"            \
  "lbu     %[temp19],      "#D"(%[temp20])                 \n\t"            \
  "addu    %["#TEMP0"],    %[temp16],      %["#TEMP0"]     \n\t"            \
  "addu    %["#TEMP4"],    %[temp17],      %["#TEMP4"]     \n\t"            \
  "addu    %["#TEMP8"],    %[temp18],      %["#TEMP8"]     \n\t"            \
  "addu    %["#TEMP12"],   %[temp19],      %["#TEMP12"]    \n\t"            \
  "slt     %[temp16],      %["#TEMP0"],    $zero           \n\t"            \
  "slt     %[temp17],      %["#TEMP4"],    $zero           \n\t"            \
  "slt     %[temp18],      %["#TEMP8"],    $zero           \n\t"            \
  "slt     %[temp19],      %["#TEMP12"],   $zero           \n\t"            \
  "movn    %["#TEMP0"],    $zero,          %[temp16]       \n\t"            \
  "movn    %["#TEMP4"],    $zero,          %[temp17]       \n\t"            \
  "movn    %["#TEMP8"],    $zero,          %[temp18]       \n\t"            \
  "movn    %["#TEMP12"],   $zero,          %[temp19]       \n\t"            \
  "addiu   %[temp20],      $zero,          255             \n\t"            \
  "slt     %[temp16],      %["#TEMP0"],    %[temp20]       \n\t"            \
  "slt     %[temp17],      %["#TEMP4"],    %[temp20]       \n\t"            \
  "slt     %[temp18],      %["#TEMP8"],    %[temp20]       \n\t"            \
  "slt     %[temp19],      %["#TEMP12"],   %[temp20]       \n\t"            \
  "movz    %["#TEMP0"],    %[temp20],      %[temp16]       \n\t"            \
  "movz    %["#TEMP4"],    %[temp20],      %[temp17]       \n\t"            \
  "lw      %[temp16],      8(%[args])                      \n\t"            \
  "movz    %["#TEMP8"],    %[temp20],      %[temp18]       \n\t"            \
  "movz    %["#TEMP12"],   %[temp20],      %[temp19]       \n\t"            \
  "sb      %["#TEMP0"],    "#A"(%[temp16])                 \n\t"            \
  "sb      %["#TEMP4"],    "#B"(%[temp16])                 \n\t"            \
  "sb      %["#TEMP8"],    "#C"(%[temp16])                 \n\t"            \
  "sb      %["#TEMP12"],   "#D"(%[temp16])                 \n\t"
117af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
118af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Does one or two inverse transforms.
119af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
120af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                      uint8_t* dst) {
121af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
122af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
123af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
124af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int* args[3] = {(const int*)ref, (const int*)in, (const int*)dst};
125af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
126af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  __asm__ volatile(
127af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "lw      %[temp20],      4(%[args])                      \n\t"
128af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    VERTICAL_PASS(0, 16,  8, 24, temp4,  temp0,  temp1,  temp2,  temp3)
129af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    VERTICAL_PASS(2, 18, 10, 26, temp8,  temp4,  temp5,  temp6,  temp7)
130af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    VERTICAL_PASS(4, 20, 12, 28, temp12, temp8,  temp9,  temp10, temp11)
131af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    VERTICAL_PASS(6, 22, 14, 30, temp20, temp12, temp13, temp14, temp15)
132af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
133af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    HORIZONTAL_PASS( 0,  1,  2,  3, temp0, temp4, temp8,  temp12)
134af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    HORIZONTAL_PASS(16, 17, 18, 19, temp1, temp5, temp9,  temp13)
135af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    HORIZONTAL_PASS(32, 33, 34, 35, temp2, temp6, temp10, temp14)
136af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    HORIZONTAL_PASS(48, 49, 50, 51, temp3, temp7, temp11, temp15)
137af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
138af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
139af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
140af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
141af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
142af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
143af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
144af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
145af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [args]"r"(args), [kC1]"r"(kC1), [kC2]"r"(kC2)
146af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : "memory", "hi", "lo"
147af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  );
148af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
149af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Does one or two inverse transforms. The block at (ref, in, dst) is always
// processed; when 'do_two' is non-zero a second one is processed as well,
// located 4 bytes further in 'ref' and 'dst' and 16 coefficients further in
// 'in'.
static void ITransform(const uint8_t* ref, const int16_t* in,
                       uint8_t* dst, int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
}

#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
160af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Macro for one pass through the for loop in QuantizeBlock.
// The QUANTDIV macro is inlined: level = (coeff * iq + bias) >> 17.
// J - offset in bytes (kZigzag[n] * 2), used for in, sharpen, iq and q
// K - offset in bytes (kZigzag[n] * 4), used for zthresh and bias
// N - offset in bytes (n * 2), used for out
// Per coefficient x = in[J]:
//   sign  = x >> 15                      (0 or -1)
//   coeff = abs(x) + sharpen[J]
//   if (coeff > zthresh[K]) {
//     level = min((coeff * iq[J] + bias[K]) >> 17, max_level)
//     level = sign ? -level : level
//     in[J] = level * q[J]
//   } else {
//     level = 0; in[J] = 0
//   }
//   out[N] = level
// The branch targets the next "2:" label (2f), so the macro can be expanded
// several times inside one asm statement.
#define QUANTIZE_ONE(J, K, N)                                               \
  "lh           %[temp0],       "#J"(%[ppin])                       \n\t"   \
  "lhu          %[temp1],       "#J"(%[ppsharpen])                  \n\t"   \
  "lw           %[temp2],       "#K"(%[ppzthresh])                  \n\t"   \
  "sra          %[sign],        %[temp0],           15              \n\t"   \
  "xor          %[coeff],       %[temp0],           %[sign]         \n\t"   \
  "subu         %[coeff],       %[coeff],           %[sign]         \n\t"   \
  "addu         %[coeff],       %[coeff],           %[temp1]        \n\t"   \
  "slt          %[temp4],       %[temp2],           %[coeff]        \n\t"   \
  "addiu        %[temp5],       $zero,              0               \n\t"   \
  "addiu        %[level],       $zero,              0               \n\t"   \
  "beqz         %[temp4],       2f                                  \n\t"   \
  "lhu          %[temp1],       "#J"(%[ppiq])                       \n\t"   \
  "lw           %[temp2],       "#K"(%[ppbias])                     \n\t"   \
  "lhu          %[temp3],       "#J"(%[ppq])                        \n\t"   \
  "mul          %[level],       %[coeff],           %[temp1]        \n\t"   \
  "addu         %[level],       %[level],           %[temp2]        \n\t"   \
  "sra          %[level],       %[level],           17              \n\t"   \
  "slt          %[temp4],       %[max_level],       %[level]        \n\t"   \
  "movn         %[level],       %[max_level],       %[temp4]        \n\t"   \
  "xor          %[level],       %[level],           %[sign]         \n\t"   \
  "subu         %[level],       %[level],           %[sign]         \n\t"   \
  "mul          %[temp5],       %[level],           %[temp3]        \n\t"   \
"2:                                                                 \n\t"   \
  "sh           %[temp5],       "#J"(%[ppin])                       \n\t"   \
  "sh           %[level],       "#N"(%[pout])                       \n\t"
192af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
193af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int QuantizeBlock(int16_t in[16], int16_t out[16],
194af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                         const VP8Matrix* const mtx) {
195af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int temp0, temp1, temp2, temp3, temp4, temp5;
196af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int sign, coeff, level, i;
197af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int max_level = MAX_LEVEL;
198af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
199af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int16_t* ppin             = &in[0];
200af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int16_t* pout             = &out[0];
201af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint16_t* ppsharpen = &mtx->sharpen_[0];
202af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint32_t* ppzthresh = &mtx->zthresh_[0];
203af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint16_t* ppq       = &mtx->q_[0];
204af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint16_t* ppiq      = &mtx->iq_[0];
205af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint32_t* ppbias    = &mtx->bias_[0];
206af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
207af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  __asm__ volatile(
208af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    QUANTIZE_ONE( 0,  0,  0)
209af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    QUANTIZE_ONE( 2,  4,  2)
210af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    QUANTIZE_ONE( 8, 16,  4)
211af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    QUANTIZE_ONE(16, 32,  6)
212af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    QUANTIZE_ONE(10, 20,  8)
213af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    QUANTIZE_ONE( 4,  8, 10)
214af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    QUANTIZE_ONE( 6, 12, 12)
215af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    QUANTIZE_ONE(12, 24, 14)
216af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    QUANTIZE_ONE(18, 36, 16)
217af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    QUANTIZE_ONE(24, 48, 18)
218af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    QUANTIZE_ONE(26, 52, 20)
219af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    QUANTIZE_ONE(20, 40, 22)
220af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    QUANTIZE_ONE(14, 28, 24)
221af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    QUANTIZE_ONE(22, 44, 26)
222af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    QUANTIZE_ONE(28, 56, 28)
223af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    QUANTIZE_ONE(30, 60, 30)
224af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
225af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
226af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
227af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
228af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [sign]"=&r"(sign), [coeff]"=&r"(coeff),
229af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [level]"=&r"(level)
230af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [pout]"r"(pout), [ppin]"r"(ppin),
231af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [ppiq]"r"(ppiq), [max_level]"r"(max_level),
232af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
233af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
234af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : "memory", "hi", "lo"
235af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  );
236af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
237af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // moved out from macro to increase possibility for earlier breaking
238af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  for (i = 15; i >= 0; i--) {
239af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    if (out[i]) return 1;
240af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
241af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return 0;
242af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
243af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
244af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#undef QUANTIZE_ONE
245af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Macro for one horizontal pass in Disto4x4 (TTransform).
// Two calls of function TTransform are merged into a single one: the same
// butterfly is applied to four bytes of buffer 'a' and four bytes of buffer
// 'b'. For bytes p, q, r, s loaded from offsets A, B, C, D it stores, as
// 32-bit words in the tmp buffer:
//   E (E1): (p + r) + (q + s)      F (F1): (p - r) + (q - s)
//   G (G1): (p - r) - (q - s)      H (H1): (p + r) - (q + s)
// A..D - offsets in bytes to load from a and b buffers
// E..H - offsets in bytes to store first results (from a) to tmp buffer
// E1..H1 - offsets in bytes to store second results (from b) to tmp buffer
// temp0..temp8 are clobbered.
#define HORIZONTAL_PASS(A, B, C, D, E, F, G, H, E1, F1, G1, H1)   \
  "lbu    %[temp0],  "#A"(%[a])              \n\t"                \
  "lbu    %[temp1],  "#B"(%[a])              \n\t"                \
  "lbu    %[temp2],  "#C"(%[a])              \n\t"                \
  "lbu    %[temp3],  "#D"(%[a])              \n\t"                \
  "lbu    %[temp4],  "#A"(%[b])              \n\t"                \
  "lbu    %[temp5],  "#B"(%[b])              \n\t"                \
  "lbu    %[temp6],  "#C"(%[b])              \n\t"                \
  "lbu    %[temp7],  "#D"(%[b])              \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp2]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
  "addu   %[temp2],  %[temp1],    %[temp3]   \n\t"                \
  "subu   %[temp1],  %[temp1],    %[temp3]   \n\t"                \
  "addu   %[temp3],  %[temp4],    %[temp6]   \n\t"                \
  "subu   %[temp4],  %[temp4],    %[temp6]   \n\t"                \
  "addu   %[temp6],  %[temp5],    %[temp7]   \n\t"                \
  "subu   %[temp5],  %[temp5],    %[temp7]   \n\t"                \
  "addu   %[temp7],  %[temp8],    %[temp2]   \n\t"                \
  "subu   %[temp2],  %[temp8],    %[temp2]   \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
  "addu   %[temp1],  %[temp3],    %[temp6]   \n\t"                \
  "subu   %[temp3],  %[temp3],    %[temp6]   \n\t"                \
  "addu   %[temp6],  %[temp4],    %[temp5]   \n\t"                \
  "subu   %[temp4],  %[temp4],    %[temp5]   \n\t"                \
  "sw     %[temp7],  "#E"(%[tmp])            \n\t"                \
  "sw     %[temp2],  "#H"(%[tmp])            \n\t"                \
  "sw     %[temp8],  "#F"(%[tmp])            \n\t"                \
  "sw     %[temp0],  "#G"(%[tmp])            \n\t"                \
  "sw     %[temp1],  "#E1"(%[tmp])           \n\t"                \
  "sw     %[temp3],  "#H1"(%[tmp])           \n\t"                \
  "sw     %[temp6],  "#F1"(%[tmp])           \n\t"                \
  "sw     %[temp4],  "#G1"(%[tmp])           \n\t"
284af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// macro for one vertical pass in Disto4x4 (TTransform)
// two calls of function TTransform are merged into single one
// since only one accu is available in mips32r1 instruction set,
//   the second call of function TTransform is done first and
//   the first one after that.
//   const int sum1 = TTransform(a, w);
//   const int sum2 = TTransform(b, w);
//   return abs(sum2 - sum1) >> 5;
//   (sum2 - sum1) is calculated with madds (sum2) and msubs (sum1)
// A..D - offsets in bytes to load first results from tmp buffer
// A1..D1 - offsets in bytes to load second results from tmp buffer
// E..H - offsets in bytes to load from w buffer
297af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H)     \
298af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "lw     %[temp0],  "#A1"(%[tmp])           \n\t"                \
299af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "lw     %[temp1],  "#C1"(%[tmp])           \n\t"                \
300af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "lw     %[temp2],  "#B1"(%[tmp])           \n\t"                \
301af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "lw     %[temp3],  "#D1"(%[tmp])           \n\t"                \
302af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
303af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
304af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
305af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "subu   %[temp2],  %[temp2],    %[temp3]   \n\t"                \
306af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "addu   %[temp3],  %[temp8],    %[temp1]   \n\t"                \
307af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "subu   %[temp8],  %[temp8],    %[temp1]   \n\t"                \
308af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "addu   %[temp1],  %[temp0],    %[temp2]   \n\t"                \
309af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
310af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "sra    %[temp4],  %[temp3],    31         \n\t"                \
311af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "sra    %[temp5],  %[temp1],    31         \n\t"                \
312af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "sra    %[temp6],  %[temp0],    31         \n\t"                \
313af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "sra    %[temp7],  %[temp8],    31         \n\t"                \
314af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "xor    %[temp3],  %[temp3],    %[temp4]   \n\t"                \
315af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "xor    %[temp1],  %[temp1],    %[temp5]   \n\t"                \
316af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "xor    %[temp0],  %[temp0],    %[temp6]   \n\t"                \
317af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "xor    %[temp8],  %[temp8],    %[temp7]   \n\t"                \
318af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "subu   %[temp3],  %[temp3],    %[temp4]   \n\t"                \
319af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "subu   %[temp1],  %[temp1],    %[temp5]   \n\t"                \
320af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "subu   %[temp0],  %[temp0],    %[temp6]   \n\t"                \
321af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "subu   %[temp8],  %[temp8],    %[temp7]   \n\t"                \
322af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "lhu    %[temp4],  "#E"(%[w])              \n\t"                \
323af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "lhu    %[temp5],  "#F"(%[w])              \n\t"                \
324af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "lhu    %[temp6],  "#G"(%[w])              \n\t"                \
325af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "lhu    %[temp7],  "#H"(%[w])              \n\t"                \
326af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "madd   %[temp4],  %[temp3]                \n\t"                \
327af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "madd   %[temp5],  %[temp1]                \n\t"                \
328af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "madd   %[temp6],  %[temp0]                \n\t"                \
329af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "madd   %[temp7],  %[temp8]                \n\t"                \
330af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "lw     %[temp0],  "#A"(%[tmp])            \n\t"                \
331af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "lw     %[temp1],  "#C"(%[tmp])            \n\t"                \
332af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "lw     %[temp2],  "#B"(%[tmp])            \n\t"                \
333af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "lw     %[temp3],  "#D"(%[tmp])            \n\t"                \
334af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
335af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
336af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
337af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "subu   %[temp2],  %[temp2],    %[temp3]   \n\t"                \
338af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "addu   %[temp3],  %[temp8],    %[temp1]   \n\t"                \
339af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "subu   %[temp1],  %[temp8],    %[temp1]   \n\t"                \
340af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "addu   %[temp8],  %[temp0],    %[temp2]   \n\t"                \
341af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
342af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "sra    %[temp2],  %[temp3],    31         \n\t"                \
343af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "xor    %[temp3],  %[temp3],    %[temp2]   \n\t"                \
344af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "subu   %[temp3],  %[temp3],    %[temp2]   \n\t"                \
345af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "msub   %[temp4],  %[temp3]                \n\t"                \
346af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "sra    %[temp2],  %[temp8],    31         \n\t"                \
347af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "sra    %[temp3],  %[temp0],    31         \n\t"                \
348af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "sra    %[temp4],  %[temp1],    31         \n\t"                \
349af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "xor    %[temp8],  %[temp8],    %[temp2]   \n\t"                \
350af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "xor    %[temp0],  %[temp0],    %[temp3]   \n\t"                \
351af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "xor    %[temp1],  %[temp1],    %[temp4]   \n\t"                \
352af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "subu   %[temp8],  %[temp8],    %[temp2]   \n\t"                \
353af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "subu   %[temp0],  %[temp0],    %[temp3]   \n\t"                \
354af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "subu   %[temp1],  %[temp1],    %[temp4]   \n\t"                \
355af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "msub   %[temp5],  %[temp8]                \n\t"                \
356af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "msub   %[temp6],  %[temp0]                \n\t"                \
357af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "msub   %[temp7],  %[temp1]                \n\t"
358af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
359af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int Disto4x4(const uint8_t* const a, const uint8_t* const b,
360af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                    const uint16_t* const w) {
361af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int tmp[32];
362af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
363af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
364af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  __asm__ volatile(
365af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    HORIZONTAL_PASS( 0,  1,  2,  3,    0,  4,  8, 12,    64,  68,  72,  76)
366af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    HORIZONTAL_PASS(16, 17, 18, 19,   16, 20, 24, 28,    80,  84,  88,  92)
367af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    HORIZONTAL_PASS(32, 33, 34, 35,   32, 36, 40, 44,    96, 100, 104, 108)
368af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    HORIZONTAL_PASS(48, 49, 50, 51,   48, 52, 56, 60,   112, 116, 120, 124)
369af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "mthi   $zero                             \n\t"
370af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "mtlo   $zero                             \n\t"
371af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    VERTICAL_PASS( 0, 16, 32, 48,     64, 80,  96, 112,   0,  8, 16, 24)
372af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    VERTICAL_PASS( 4, 20, 36, 52,     68, 84, 100, 116,   2, 10, 18, 26)
373af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    VERTICAL_PASS( 8, 24, 40, 56,     72, 88, 104, 120,   4, 12, 20, 28)
374af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    VERTICAL_PASS(12, 28, 44, 60,     76, 92, 108, 124,   6, 14, 22, 30)
375af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "mflo   %[temp0]                          \n\t"
376af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "sra    %[temp1],  %[temp0],  31          \n\t"
377af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "xor    %[temp0],  %[temp0],  %[temp1]    \n\t"
378af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "subu   %[temp0],  %[temp0],  %[temp1]    \n\t"
379af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "sra    %[temp0],  %[temp0],  5           \n\t"
380af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
381af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
382af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
383af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
384af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [a]"r"(a), [b]"r"(b), [w]"r"(w), [tmp]"r"(tmp)
385af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : "memory", "hi", "lo"
386af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  );
387af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
388af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return temp0;
389af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
390af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
391af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#undef VERTICAL_PASS
392af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#undef HORIZONTAL_PASS
393af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
394af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int Disto16x16(const uint8_t* const a, const uint8_t* const b,
395af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                      const uint16_t* const w) {
396af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int D = 0;
397af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int x, y;
398af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
399af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    for (x = 0; x < 16; x += 4) {
400af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      D += Disto4x4(a + x + y, b + x + y, w);
401af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    }
402af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
403af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return D;
404af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
405af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Macro for one horizontal pass in FTransform.
// temp0..temp15 hold tmp[0]..tmp[15] once all four passes are done.
// A..D - offsets in bytes to load from src and ref buffers
// TEMP0..TEMP3 - registers for corresponding tmp elements
// The src and ref pointers are fetched from args[0] and args[1] (byte offsets
// 0 and 4 of 'args'); TEMP1 and TEMP2 temporarily hold those pointers before
// being overwritten with results. temp16..temp20 are scratch.
// With d0..d3 = src[A..D] - ref[A..D] and
//   a0 = d0 + d3, a1 = d1 + d2, a2 = d1 - d2, a3 = d0 - d3
// the macro leaves:
//   TEMP0 = (a0 + a1) << 3
//   TEMP1 = (a2 * c2217 + a3 * c5352 + 1812) >> 9
//   TEMP2 = (a0 - a1) << 3
//   TEMP3 = (a3 * c2217 - a2 * c5352 +  937) >> 9
#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP1, TEMP2, TEMP3) \
  "lw     %["#TEMP1"],  0(%[args])                     \n\t"    \
  "lw     %["#TEMP2"],  4(%[args])                     \n\t"    \
  "lbu    %[temp16],    "#A"(%["#TEMP1"])              \n\t"    \
  "lbu    %[temp17],    "#A"(%["#TEMP2"])              \n\t"    \
  "lbu    %[temp18],    "#B"(%["#TEMP1"])              \n\t"    \
  "lbu    %[temp19],    "#B"(%["#TEMP2"])              \n\t"    \
  "subu   %[temp20],    %[temp16],    %[temp17]        \n\t"    \
  "lbu    %[temp16],    "#C"(%["#TEMP1"])              \n\t"    \
  "lbu    %[temp17],    "#C"(%["#TEMP2"])              \n\t"    \
  "subu   %["#TEMP0"],  %[temp18],    %[temp19]        \n\t"    \
  "lbu    %[temp18],    "#D"(%["#TEMP1"])              \n\t"    \
  "lbu    %[temp19],    "#D"(%["#TEMP2"])              \n\t"    \
  "subu   %["#TEMP1"],  %[temp16],    %[temp17]        \n\t"    \
  "subu   %["#TEMP2"],  %[temp18],    %[temp19]        \n\t"    \
  "addu   %["#TEMP3"],  %[temp20],    %["#TEMP2"]      \n\t"    \
  "subu   %["#TEMP2"],  %[temp20],    %["#TEMP2"]      \n\t"    \
  "addu   %[temp20],    %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
  "subu   %["#TEMP0"],  %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
  "mul    %[temp16],    %["#TEMP2"],  %[c5352]         \n\t"    \
  "mul    %[temp17],    %["#TEMP2"],  %[c2217]         \n\t"    \
  "mul    %[temp18],    %["#TEMP0"],  %[c5352]         \n\t"    \
  "mul    %[temp19],    %["#TEMP0"],  %[c2217]         \n\t"    \
  "addu   %["#TEMP1"],  %["#TEMP3"],  %[temp20]        \n\t"    \
  "subu   %[temp20],    %["#TEMP3"],  %[temp20]        \n\t"    \
  "sll    %["#TEMP0"],  %["#TEMP1"],  3                \n\t"    \
  "sll    %["#TEMP2"],  %[temp20],    3                \n\t"    \
  "addiu  %[temp16],    %[temp16],    1812             \n\t"    \
  "addiu  %[temp17],    %[temp17],    937              \n\t"    \
  "addu   %[temp16],    %[temp16],    %[temp19]        \n\t"    \
  "subu   %[temp17],    %[temp17],    %[temp18]        \n\t"    \
  "sra    %["#TEMP1"],  %[temp16],    9                \n\t"    \
  "sra    %["#TEMP3"],  %[temp17],    9                \n\t"
443af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Macro for one vertical pass in FTransform.
// temp0..temp15 hold tmp[0]..tmp[15] on entry.
// A..D - offsets in bytes to store to out buffer (base pointer in temp20)
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
// With
//   a0 = TEMP0 + TEMP12, a1 = TEMP4 + TEMP8,
//   a2 = TEMP4 - TEMP8,  a3 = TEMP0 - TEMP12
// the macro stores the following 16-bit values:
//   A: (a0 + a1 + 7) >> 4
//   B: ((a2 * c2217 + a3 * c5352 + 12000) >> 16) + (a3 != 0)
//   C: (a0 - a1 + 7) >> 4
//   D: (a3 * c2217 - a2 * c5352 + 51000) >> 16
// The 51000 rounding constant is added in two steps (30000 + 21000) because
// it does not fit the signed 16-bit immediate of addiu.
// temp16..temp19 are scratch; the TEMP* registers are overwritten.
#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)  \
  "addu   %[temp16],    %["#TEMP0"],  %["#TEMP12"]     \n\t"    \
  "subu   %[temp19],    %["#TEMP0"],  %["#TEMP12"]     \n\t"    \
  "addu   %[temp17],    %["#TEMP4"],  %["#TEMP8"]      \n\t"    \
  "subu   %[temp18],    %["#TEMP4"],  %["#TEMP8"]      \n\t"    \
  "mul    %["#TEMP8"],  %[temp19],    %[c2217]         \n\t"    \
  "mul    %["#TEMP12"], %[temp18],    %[c2217]         \n\t"    \
  "mul    %["#TEMP4"],  %[temp19],    %[c5352]         \n\t"    \
  "mul    %[temp18],    %[temp18],    %[c5352]         \n\t"    \
  "addiu  %[temp16],    %[temp16],    7                \n\t"    \
  "addu   %["#TEMP0"],  %[temp16],    %[temp17]        \n\t"    \
  "sra    %["#TEMP0"],  %["#TEMP0"],  4                \n\t"    \
  "addu   %["#TEMP12"], %["#TEMP12"], %["#TEMP4"]      \n\t"    \
  "subu   %["#TEMP4"],  %[temp16],    %[temp17]        \n\t"    \
  "sra    %["#TEMP4"],  %["#TEMP4"],  4                \n\t"    \
  "addiu  %["#TEMP8"],  %["#TEMP8"],  30000            \n\t"    \
  "addiu  %["#TEMP12"], %["#TEMP12"], 12000            \n\t"    \
  "addiu  %["#TEMP8"],  %["#TEMP8"],  21000            \n\t"    \
  "subu   %["#TEMP8"],  %["#TEMP8"],  %[temp18]        \n\t"    \
  "sra    %["#TEMP12"], %["#TEMP12"], 16               \n\t"    \
  "sra    %["#TEMP8"],  %["#TEMP8"],  16               \n\t"    \
  "addiu  %[temp16],    %["#TEMP12"], 1                \n\t"    \
  "movn   %["#TEMP12"], %[temp16],    %[temp19]        \n\t"    \
  "sh     %["#TEMP0"],  "#A"(%[temp20])                \n\t"    \
  "sh     %["#TEMP4"],  "#C"(%[temp20])                \n\t"    \
  "sh     %["#TEMP8"],  "#D"(%[temp20])                \n\t"    \
  "sh     %["#TEMP12"], "#B"(%[temp20])                \n\t"
475af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
476af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
477af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
478af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
479af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int temp17, temp18, temp19, temp20;
480af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int c2217 = 2217;
481af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int c5352 = 5352;
482af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int* const args[3] =
483af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      { (const int*)src, (const int*)ref, (const int*)out };
484af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
485af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  __asm__ volatile(
486af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    HORIZONTAL_PASS( 0,  1,  2,  3, temp0,  temp1,  temp2,  temp3)
487af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    HORIZONTAL_PASS(16, 17, 18, 19, temp4,  temp5,  temp6,  temp7)
488af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    HORIZONTAL_PASS(32, 33, 34, 35, temp8,  temp9,  temp10, temp11)
489af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    HORIZONTAL_PASS(48, 49, 50, 51, temp12, temp13, temp14, temp15)
490af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "lw   %[temp20],    8(%[args])                     \n\t"
491af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    VERTICAL_PASS(0,  8, 16, 24, temp0, temp4, temp8,  temp12)
492af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9,  temp13)
493af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
494af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
495af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
496af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
497af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
498af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
499af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
500af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
501af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
502af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
503af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
504af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : "memory", "hi", "lo"
505af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  );
506af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
507af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
508af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#undef VERTICAL_PASS
509af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#undef HORIZONTAL_PASS
510af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
511af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Forward declaration.
512af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Aroraextern int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res);
513af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
514af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Aroraint VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res) {
515af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int n = res->first;
516af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
517af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int p0 = res->prob[n][ctx0][0];
518af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint16_t* t = res->cost[n][ctx0];
519af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int cost;
520af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int const_2 = 2;
521af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int const_255 = 255;
522af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int const_max_level = MAX_VARIABLE_LEVEL;
523af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int res_cost;
524af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int res_prob;
525af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int res_coeffs;
526af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int res_last;
527af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int v_reg;
528af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int b_reg;
529af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int ctx_reg;
530af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int cost_add, temp_1, temp_2, temp_3;
531af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
532af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  if (res->last < 0) {
533af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    return VP8BitCost(0, p0);
534af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
535af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
536af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
537af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
538af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  res_cost = (int)res->cost;
539af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  res_prob = (int)res->prob;
540af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  res_coeffs = (int)res->coeffs;
541af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  res_last = (int)res->last;
542af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
543af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  __asm__ volatile(
544af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    ".set   push                                                           \n\t"
545af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    ".set   noreorder                                                      \n\t"
546af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
547af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "sll    %[temp_1],     %[n],              1                            \n\t"
548af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "addu   %[res_coeffs], %[res_coeffs],     %[temp_1]                    \n\t"
549af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "slt    %[temp_2],     %[n],              %[res_last]                  \n\t"
550af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "bnez   %[temp_2],     1f                                              \n\t"
551af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    " li    %[cost_add],   0                                               \n\t"
552af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "b      2f                                                             \n\t"
553af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    " nop                                                                  \n\t"
554af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  "1:                                                                      \n\t"
555af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "lh     %[v_reg],      0(%[res_coeffs])                                \n\t"
556af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "addu   %[b_reg],      %[n],              %[VP8EncBands]               \n\t"
557af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "move   %[temp_1],     %[const_max_level]                              \n\t"
558af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "addu   %[cost],       %[cost],           %[cost_add]                  \n\t"
559af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "negu   %[temp_2],     %[v_reg]                                        \n\t"
560af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "slti   %[temp_3],     %[v_reg],          0                            \n\t"
561af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "movn   %[v_reg],      %[temp_2],         %[temp_3]                    \n\t"
562af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "lbu    %[b_reg],      1(%[b_reg])                                     \n\t"
563af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "li     %[cost_add],   0                                               \n\t"
564af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
565af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "sltiu  %[temp_3],     %[v_reg],          2                            \n\t"
566af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "move   %[ctx_reg],    %[v_reg]                                        \n\t"
567af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "movz   %[ctx_reg],    %[const_2],        %[temp_3]                    \n\t"
568af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    //  cost += VP8LevelCost(t, v);
569af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "slt    %[temp_3],     %[v_reg],          %[const_max_level]           \n\t"
570af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "movn   %[temp_1],     %[v_reg],          %[temp_3]                    \n\t"
571af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "sll    %[temp_2],     %[v_reg],          1                            \n\t"
572af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "addu   %[temp_2],     %[temp_2],         %[VP8LevelFixedCosts]        \n\t"
573af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "lhu    %[temp_2],     0(%[temp_2])                                    \n\t"
574af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "sll    %[temp_1],     %[temp_1],         1                            \n\t"
575af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "addu   %[temp_1],     %[temp_1],         %[t]                         \n\t"
576af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "lhu    %[temp_3],     0(%[temp_1])                                    \n\t"
577af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "addu   %[cost],       %[cost],           %[temp_2]                    \n\t"
578af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
579af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    //  t = res->cost[b][ctx];
580af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "sll    %[temp_1],     %[ctx_reg],        7                            \n\t"
581af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "sll    %[temp_2],     %[ctx_reg],        3                            \n\t"
582af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "addu   %[cost],       %[cost],           %[temp_3]                    \n\t"
583af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "addu   %[temp_1],     %[temp_1],         %[temp_2]                    \n\t"
584af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "sll    %[temp_2],     %[b_reg],          3                            \n\t"
585af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "sll    %[temp_3],     %[b_reg],          5                            \n\t"
586af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "sub    %[temp_2],     %[temp_3],         %[temp_2]                    \n\t"
587af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "sll    %[temp_3],     %[temp_2],         4                            \n\t"
588af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "addu   %[temp_1],     %[temp_1],         %[temp_3]                    \n\t"
589af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "addu   %[temp_2],     %[temp_2],         %[res_cost]                  \n\t"
590af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "addiu  %[n],          %[n],              1                            \n\t"
591af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "addu   %[t],          %[temp_1],         %[temp_2]                    \n\t"
592af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "slt    %[temp_1],     %[n],              %[res_last]                  \n\t"
593af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "bnez   %[temp_1],     1b                                              \n\t"
594af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    " addiu %[res_coeffs], %[res_coeffs],     2                            \n\t"
595af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora   "2:                                                                     \n\t"
596af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
597af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    ".set   pop                                                            \n\t"
598af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [cost]"+r"(cost), [t]"+r"(t), [n]"+r"(n), [v_reg]"=&r"(v_reg),
599af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [ctx_reg]"=&r"(ctx_reg), [b_reg]"=&r"(b_reg), [cost_add]"=&r"(cost_add),
600af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp_1]"=&r"(temp_1), [temp_2]"=&r"(temp_2), [temp_3]"=&r"(temp_3)
601af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [const_2]"r"(const_2), [const_255]"r"(const_255), [res_last]"r"(res_last),
602af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [VP8EntropyCost]"r"(VP8EntropyCost), [VP8EncBands]"r"(VP8EncBands),
603af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [const_max_level]"r"(const_max_level), [res_prob]"r"(res_prob),
604af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_coeffs]"r"(res_coeffs),
605af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [res_cost]"r"(res_cost)
606af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : "memory"
607af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  );
608af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
609af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // Last coefficient is always non-zero
610af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  {
611af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int v = abs(res->coeffs[n]);
612af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    assert(v != 0);
613af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    cost += VP8LevelCost(t, v);
614af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    if (n < 15) {
615af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      const int b = VP8EncBands[n + 1];
616af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      const int ctx = (v == 1) ? 1 : 2;
617af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      const int last_p0 = res->prob[b][ctx][0];
618af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      cost += VP8BitCost(0, last_p0);
619af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    }
620af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
621af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return cost;
622af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
623af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Emits the assembly for four byte positions. The bytes at offsets
// OFF0..OFF3 are loaded from both %[a] and %[b], subtracted pairwise, and
// each difference is multiplied by itself with 'madd', which adds the
// product to the hi/lo accumulator. The enclosing asm statement must provide
// the operands %[a], %[b] and the scratch registers %[temp0]..%[temp7].
#define GET_SSE_INNER(OFF0, OFF1, OFF2, OFF3)                   \
  "lbu     %[temp0],    "#OFF0"(%[a])                   \n\t"   \
  "lbu     %[temp1],    "#OFF0"(%[b])                   \n\t"   \
  "lbu     %[temp2],    "#OFF1"(%[a])                   \n\t"   \
  "lbu     %[temp3],    "#OFF1"(%[b])                   \n\t"   \
  "lbu     %[temp4],    "#OFF2"(%[a])                   \n\t"   \
  "lbu     %[temp5],    "#OFF2"(%[b])                   \n\t"   \
  "lbu     %[temp6],    "#OFF3"(%[a])                   \n\t"   \
  "lbu     %[temp7],    "#OFF3"(%[b])                   \n\t"   \
  "subu    %[temp0],    %[temp0],     %[temp1]       \n\t"      \
  "subu    %[temp2],    %[temp2],     %[temp3]       \n\t"      \
  "subu    %[temp4],    %[temp4],     %[temp5]       \n\t"      \
  "subu    %[temp6],    %[temp6],     %[temp7]       \n\t"      \
  "madd    %[temp0],    %[temp0]                     \n\t"      \
  "madd    %[temp2],    %[temp2]                     \n\t"      \
  "madd    %[temp4],    %[temp4]                     \n\t"      \
  "madd    %[temp6],    %[temp6]                     \n\t"
641af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Expands GET_SSE_INNER once for each of the four start offsets W0..W3.
// Every expansion handles the four consecutive bytes W, W + 1, W + 2 and
// W + 3, so one GET_SSE accumulates 16 squared byte differences. The offset
// expressions are stringified and evaluated by the assembler.
#define GET_SSE(W0, W1, W2, W3)               \
  GET_SSE_INNER(W0, W0 + 1, W0 + 2, W0 + 3)   \
  GET_SSE_INNER(W1, W1 + 1, W1 + 2, W1 + 3)   \
  GET_SSE_INNER(W2, W2 + 1, W2 + 2, W2 + 3)   \
  GET_SSE_INNER(W3, W3 + 1, W3 + 2, W3 + 3)
647af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
648af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if !defined(WORK_AROUND_GCC)
649af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int SSE16x16(const uint8_t* a, const uint8_t* b) {
650af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int count;
651af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
652af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
653af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  __asm__ volatile(
654af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     "mult   $zero,    $zero                            \n\t"
655af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
656af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE(  0,   4,   8,  12)
657af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE( 16,  20,  24,  28)
658af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE( 32,  36,  40,  44)
659af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE( 48,  52,  56,  60)
660af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE( 64,  68,  72,  76)
661af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE( 80,  84,  88,  92)
662af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE( 96, 100, 104, 108)
663af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE(112, 116, 120, 124)
664af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE(128, 132, 136, 140)
665af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE(144, 148, 152, 156)
666af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE(160, 164, 168, 172)
667af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE(176, 180, 184, 188)
668af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE(192, 196, 200, 204)
669af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE(208, 212, 216, 220)
670af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE(224, 228, 232, 236)
671af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE(240, 244, 248, 252)
672af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
673af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "mflo    %[count]                                   \n\t"
674af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
675af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
676af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
677af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [a]"r"(a), [b]"r"(b)
678af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : "memory", "hi" , "lo"
679af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  );
680af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return count;
681af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
682af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
683af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int SSE16x8(const uint8_t* a, const uint8_t* b) {
684af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int count;
685af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
686af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
687af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  __asm__ volatile(
688af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     "mult   $zero,    $zero                            \n\t"
689af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
690af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE(  0,   4,   8,  12)
691af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE( 16,  20,  24,  28)
692af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE( 32,  36,  40,  44)
693af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE( 48,  52,  56,  60)
694af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE( 64,  68,  72,  76)
695af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE( 80,  84,  88,  92)
696af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE( 96, 100, 104, 108)
697af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE(112, 116, 120, 124)
698af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
699af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "mflo    %[count]                                   \n\t"
700af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
701af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
702af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
703af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [a]"r"(a), [b]"r"(b)
704af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : "memory", "hi" , "lo"
705af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  );
706af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return count;
707af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
708af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
709af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int SSE8x8(const uint8_t* a, const uint8_t* b) {
710af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int count;
711af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
712af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
713af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  __asm__ volatile(
714af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     "mult   $zero,    $zero                            \n\t"
715af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
716af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE( 0,   4,  16,  20)
717af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE(32,  36,  48,  52)
718af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE(64,  68,  80,  84)
719af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE(96, 100, 112, 116)
720af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
721af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "mflo    %[count]                                   \n\t"
722af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
723af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
724af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
725af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [a]"r"(a), [b]"r"(b)
726af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : "memory", "hi" , "lo"
727af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  );
728af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return count;
729af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
730af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
731af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int SSE4x4(const uint8_t* a, const uint8_t* b) {
732af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int count;
733af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
734af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
735af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  __asm__ volatile(
736af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     "mult   $zero,    $zero                            \n\t"
737af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
738af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora     GET_SSE(0, 16, 32, 48)
739af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
740af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    "mflo    %[count]                                   \n\t"
741af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
742af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
743af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
744af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : [a]"r"(a), [b]"r"(b)
745af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    : "memory", "hi" , "lo"
746af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  );
747af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return count;
748af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
749af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
750af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif  // WORK_AROUND_GCC
751af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Retire the SSE helper macros under the names they were actually defined
// with (GET_SSE / GET_SSE_INNER) so they do not stay visible past this point.
#undef GET_SSE
#undef GET_SSE_INNER
754af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
755af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif  // WEBP_USE_MIPS32
756af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
757af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora//------------------------------------------------------------------------------
758af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Entry point
759af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
extern void VP8EncDspInitMIPS32(void);

// Points the encoder's DSP function pointers at the implementations defined
// in this file. The body is empty when WEBP_USE_MIPS32 is not defined, in
// which case the pointers are left untouched.
void VP8EncDspInitMIPS32(void) {
#if defined(WEBP_USE_MIPS32)
  // Hooks installed for every MIPS32 build.
  VP8FTransform = FTransform;
  VP8ITransform = ITransform;
  VP8EncQuantizeBlock = QuantizeBlock;
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;

#if !defined(WORK_AROUND_GCC)
  // The SSE kernels are compiled out when WORK_AROUND_GCC is defined.
  VP8SSE4x4 = SSE4x4;
  VP8SSE8x8 = SSE8x8;
  VP8SSE16x8 = SSE16x8;
  VP8SSE16x16 = SSE16x16;
#endif  // !WORK_AROUND_GCC
#endif  // WEBP_USE_MIPS32
}
777