/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
11df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include "./vp8_rtcd.h"
12df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include "vpx_ports/mem.h"
13df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include "vpx_ports/asmdefs_mmi.h"
14df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
/* clang-format off */
/* TRANSPOSE_4H: transpose a 4x4 matrix of 16-bit elements held in four
   64-bit registers (one row of four halfwords per register).

   This is a fragment of an inline-asm template: it expands to a sequence of
   string literals and must be placed inside an __asm__ statement that names
   the operands listed below.

   Input:    ftmp1, ftmp2, ftmp3, ftmp4 (rows 0..3)
   Output:   ftmp1, ftmp2, ftmp3, ftmp4 (columns 0..3 of the input)
   Requires: ftmp0 must hold 0 on entry; it is read but not written.
   Clobbers: tmp0 (general-purpose register, loaded with the pshufh selector
             0x93), ftmp10 (receives that selector), and ftmp5~ftmp9 (used
             for intermediate values).

   Callers must not keep live values in tmp0 or ftmp5~ftmp10 across the
   macro.
 */
#define TRANSPOSE_4H                                         \
  MMI_LI(%[tmp0], 0x93)                                      \
  "mtc1       %[tmp0],    %[ftmp10]                    \n\t" \
  "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp0]         \n\t" \
  "punpcklhw  %[ftmp9],   %[ftmp2],   %[ftmp0]         \n\t" \
  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
  "or         %[ftmp5],   %[ftmp5],   %[ftmp9]         \n\t" \
  "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp0]         \n\t" \
  "punpckhhw  %[ftmp9],   %[ftmp2],   %[ftmp0]         \n\t" \
  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
  "or         %[ftmp6],   %[ftmp6],   %[ftmp9]         \n\t" \
  "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp0]         \n\t" \
  "punpcklhw  %[ftmp9],   %[ftmp4],   %[ftmp0]         \n\t" \
  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
  "or         %[ftmp7],   %[ftmp7],   %[ftmp9]         \n\t" \
  "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp0]         \n\t" \
  "punpckhhw  %[ftmp9],   %[ftmp4],   %[ftmp0]         \n\t" \
  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
  "or         %[ftmp8],   %[ftmp8],   %[ftmp9]         \n\t" \
  "punpcklwd  %[ftmp1],   %[ftmp5],   %[ftmp7]         \n\t" \
  "punpckhwd  %[ftmp2],   %[ftmp5],   %[ftmp7]         \n\t" \
  "punpcklwd  %[ftmp3],   %[ftmp6],   %[ftmp8]         \n\t" \
  "punpckhwd  %[ftmp4],   %[ftmp6],   %[ftmp8]         \n\t"
/* clang-format on */
45df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
46df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
47df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint64_t tmp[1];
48df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16_t *ip = input;
49df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
50df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#if _MIPS_SIM == _ABIO32
51df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp0 asm("$f0");
52df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp1 asm("$f2");
53df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp2 asm("$f4");
54df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp3 asm("$f6");
55df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp4 asm("$f8");
56df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp5 asm("$f10");
57df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp6 asm("$f12");
58df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp7 asm("$f14");
59df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp8 asm("$f16");
60df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp9 asm("$f18");
61df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp10 asm("$f20");
62df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp11 asm("$f22");
63df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp12 asm("$f24");
64df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#else
65df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp0 asm("$f0");
66df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp1 asm("$f1");
67df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp2 asm("$f2");
68df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp3 asm("$f3");
69df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp4 asm("$f4");
70df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp5 asm("$f5");
71df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp6 asm("$f6");
72df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp7 asm("$f7");
73df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp8 asm("$f8");
74df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp9 asm("$f9");
75df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp10 asm("$f10");
76df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp11 asm("$f11");
77df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  register double ftmp12 asm("$f12");
78df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#endif  // _MIPS_SIM == _ABIO32
79df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
80df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
81df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL };
82df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL };
83df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL };
84df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL };
85df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL };
86df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL };
87df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL };
88df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL };
89df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL };
90df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL };
91df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
92df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  __asm__ volatile (
93df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "xor        %[ftmp0],   %[ftmp0],      %[ftmp0]         \n\t"
94df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gsldlc1    %[ftmp1],   0x07(%[ip])                     \n\t"
95df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gsldrc1    %[ftmp1],   0x00(%[ip])                     \n\t"
96df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    MMI_ADDU(%[ip], %[ip], %[pitch])
97df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gsldlc1    %[ftmp2],   0x07(%[ip])                     \n\t"
98df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gsldrc1    %[ftmp2],   0x00(%[ip])                     \n\t"
99df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    MMI_ADDU(%[ip], %[ip], %[pitch])
100df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gsldlc1    %[ftmp3],   0x07(%[ip])                     \n\t"
101df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gsldrc1    %[ftmp3],   0x00(%[ip])                     \n\t"
102df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    MMI_ADDU(%[ip], %[ip], %[pitch])
103df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gsldlc1    %[ftmp4],   0x07(%[ip])                     \n\t"
104df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gsldrc1    %[ftmp4],   0x00(%[ip])                     \n\t"
105df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    MMI_ADDU(%[ip], %[ip], %[pitch])
106df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    TRANSPOSE_4H
107df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
108df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "ldc1       %[ftmp11],  %[ff_ph_8]                      \n\t"
109df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // f1 + f4
110df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp4]        \n\t"
111df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // a1
112df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmullh     %[ftmp5],   %[ftmp5],       %[ftmp11]       \n\t"
113df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // f2 + f3
114df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]        \n\t"
115df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // b1
116df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp11]       \n\t"
117df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // f2 - f3
118df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubh      %[ftmp7],   %[ftmp2],       %[ftmp3]        \n\t"
119df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // c1
120df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp11]       \n\t"
121df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // f1 - f4
122df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubh      %[ftmp8],   %[ftmp1],       %[ftmp4]        \n\t"
123df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // d1
124df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp11]       \n\t"
125df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // op[0] = a1 + b1
126df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]        \n\t"
127df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // op[2] = a1 - b1
128df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubh      %[ftmp3],   %[ftmp5],       %[ftmp6]        \n\t"
129df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
130df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
131df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    MMI_LI(%[tmp0], 0x0c)
132df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "mtc1       %[tmp0],    %[ftmp11]                       \n\t"
133df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "ldc1       %[ftmp12],  %[ff_pw_14500]                  \n\t"
134df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "punpcklhw  %[ftmp9],   %[ftmp7],       %[ftmp8]        \n\t"
135df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmaddhw    %[ftmp5],   %[ftmp9],       %[ff_ph_op1]    \n\t"
136df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "punpckhhw  %[ftmp9],   %[ftmp7],       %[ftmp8]        \n\t"
137df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmaddhw    %[ftmp6],   %[ftmp9],       %[ff_ph_op1]    \n\t"
138df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp5],   %[ftmp5],       %[ftmp12]       \n\t"
139df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp6],   %[ftmp6],       %[ftmp12]       \n\t"
140df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psraw      %[ftmp5],   %[ftmp5],       %[ftmp11]       \n\t"
141df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psraw      %[ftmp6],   %[ftmp6],       %[ftmp11]       \n\t"
142df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "packsswh   %[ftmp2],   %[ftmp5],       %[ftmp6]        \n\t"
143df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
144df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12
145df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "ldc1       %[ftmp12],  %[ff_pw_7500]                   \n\t"
146df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "punpcklhw  %[ftmp9],   %[ftmp8],       %[ftmp7]        \n\t"
147df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmaddhw    %[ftmp5],   %[ftmp9],       %[ff_ph_op3]    \n\t"
148df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "punpckhhw  %[ftmp9],   %[ftmp8],       %[ftmp7]        \n\t"
149df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmaddhw    %[ftmp6],   %[ftmp9],       %[ff_ph_op3]    \n\t"
150df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp5],   %[ftmp5],       %[ftmp12]       \n\t"
151df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp6],   %[ftmp6],       %[ftmp12]       \n\t"
152df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psraw      %[ftmp5],   %[ftmp5],       %[ftmp11]       \n\t"
153df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psraw      %[ftmp6],   %[ftmp6],       %[ftmp11]       \n\t"
154df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "packsswh   %[ftmp4],   %[ftmp5],       %[ftmp6]        \n\t"
155df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    TRANSPOSE_4H
156df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
157df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp4]        \n\t"
158df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]        \n\t"
159df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubh      %[ftmp7],   %[ftmp2],       %[ftmp3]        \n\t"
160df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubh      %[ftmp8],   %[ftmp1],       %[ftmp4]        \n\t"
161df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
162df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pcmpeqh    %[ftmp0],   %[ftmp8],       %[ftmp0]        \n\t"
163df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "ldc1       %[ftmp9],   %[ff_ph_01]                     \n\t"
164df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddh      %[ftmp0],   %[ftmp0],       %[ftmp9]        \n\t"
165df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
166df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]        \n\t"
167df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubh      %[ftmp2],   %[ftmp5],       %[ftmp6]        \n\t"
168df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "ldc1       %[ftmp9],   %[ff_ph_07]                     \n\t"
169df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddh      %[ftmp1],   %[ftmp1],       %[ftmp9]        \n\t"
170df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddh      %[ftmp2],   %[ftmp2],       %[ftmp9]        \n\t"
171df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    MMI_LI(%[tmp0], 0x04)
172df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
173df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]        \n\t"
174df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psrah      %[ftmp2],   %[ftmp2],       %[ftmp9]        \n\t"
175df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
176df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    MMI_LI(%[tmp0], 0x10)
177df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
178df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "ldc1       %[ftmp12],  %[ff_pw_12000]                  \n\t"
179df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "punpcklhw  %[ftmp5],   %[ftmp7],       %[ftmp8]        \n\t"
180df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmaddhw    %[ftmp10],  %[ftmp5],       %[ff_ph_op1]    \n\t"
181df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "punpckhhw  %[ftmp5],   %[ftmp7],       %[ftmp8]        \n\t"
182df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmaddhw    %[ftmp11],  %[ftmp5],       %[ff_ph_op1]    \n\t"
183df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp10],  %[ftmp10],      %[ftmp12]       \n\t"
184df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp11],  %[ftmp11],      %[ftmp12]       \n\t"
185df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psraw      %[ftmp10],  %[ftmp10],      %[ftmp9]        \n\t"
186df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psraw      %[ftmp11],  %[ftmp11],      %[ftmp9]        \n\t"
187df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "packsswh   %[ftmp3],   %[ftmp10],      %[ftmp11]       \n\t"
188df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]        \n\t"
189df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
190df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "ldc1       %[ftmp12],  %[ff_pw_51000]                  \n\t"
191df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "punpcklhw  %[ftmp5],   %[ftmp8],       %[ftmp7]        \n\t"
192df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmaddhw    %[ftmp10],  %[ftmp5],       %[ff_ph_op3]    \n\t"
193df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "punpckhhw  %[ftmp5],   %[ftmp8],       %[ftmp7]        \n\t"
194df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmaddhw    %[ftmp11],  %[ftmp5],       %[ff_ph_op3]    \n\t"
195df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp10],  %[ftmp10],      %[ftmp12]       \n\t"
196df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp11],  %[ftmp11],      %[ftmp12]       \n\t"
197df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psraw      %[ftmp10],  %[ftmp10],      %[ftmp9]        \n\t"
198df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psraw      %[ftmp11],  %[ftmp11],      %[ftmp9]        \n\t"
199df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "packsswh   %[ftmp4],   %[ftmp10],      %[ftmp11]       \n\t"
200df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
201df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gssdlc1    %[ftmp1],   0x07(%[output])                 \n\t"
202df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gssdrc1    %[ftmp1],   0x00(%[output])                 \n\t"
203df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gssdlc1    %[ftmp3],   0x0f(%[output])                 \n\t"
204df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gssdrc1    %[ftmp3],   0x08(%[output])                 \n\t"
205df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gssdlc1    %[ftmp2],   0x17(%[output])                 \n\t"
206df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gssdrc1    %[ftmp2],   0x10(%[output])                 \n\t"
207df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gssdlc1    %[ftmp4],   0x1f(%[output])                 \n\t"
208df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gssdrc1    %[ftmp4],   0x18(%[output])                 \n\t"
209df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
210df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2),
211df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5),
212df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8),
213df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11),
214df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip)
215df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07),
216df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3),
217df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500),
218df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000),
219df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217),
220df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output)
221df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    : "memory"
222df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  );
223df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
224df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
/* vp8_short_fdct8x4_mmi: forward DCT of an 8x4 region, computed as two
 * independent 4x4 transforms.
 *
 * input:  start of the region; the second 4x4 block begins four int16_t
 *         samples further along the same rows.
 * output: receives 32 coefficients; the second block's 16 coefficients
 *         follow the first block's 16.
 * pitch:  row stride, forwarded unchanged to vp8_short_fdct4x4_mmi.
 */
void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
  /* Left 4x4 block. */
  vp8_short_fdct4x4_mmi(input, output, pitch);
  /* Right 4x4 block. */
  vp8_short_fdct4x4_mmi(input + 4, output + 16, pitch);
}
229df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
230df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
231df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  double ftmp[13];
232df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint32_t tmp[1];
233df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
234df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL };
235df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL };
236df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL };
237df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
238df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  __asm__ volatile (
239df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    MMI_LI(%[tmp0], 0x02)
240df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
241df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
242df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
243df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
244df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gsldrc1    %[ftmp1],   0x00(%[ip])                         \n\t"
245df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    MMI_ADDU(%[ip], %[ip], %[pitch])
246df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gsldlc1    %[ftmp2],   0x07(%[ip])                         \n\t"
247df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gsldrc1    %[ftmp2],   0x00(%[ip])                         \n\t"
248df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    MMI_ADDU(%[ip], %[ip], %[pitch])
249df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gsldlc1    %[ftmp3],   0x07(%[ip])                         \n\t"
250df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gsldrc1    %[ftmp3],   0x00(%[ip])                         \n\t"
251df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    MMI_ADDU(%[ip], %[ip], %[pitch])
252df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gsldlc1    %[ftmp4],   0x07(%[ip])                         \n\t"
253df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gsldrc1    %[ftmp4],   0x00(%[ip])                         \n\t"
254df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    TRANSPOSE_4H
255df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
256df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psllh      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
257df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psllh      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
258df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psllh      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
259df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psllh      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
260df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // a
261df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
262df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // d
263df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddh      %[ftmp6],   %[ftmp2],       %[ftmp4]            \n\t"
264df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // c
265df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubh      %[ftmp7],   %[ftmp2],       %[ftmp4]            \n\t"
266df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // b
267df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubh      %[ftmp8],   %[ftmp1],       %[ftmp3]            \n\t"
268df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
269df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // a + d
270df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"
271df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // b + c
272df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddh      %[ftmp2],   %[ftmp8],       %[ftmp7]            \n\t"
273df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // b - c
274df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubh      %[ftmp3],   %[ftmp8],       %[ftmp7]            \n\t"
275df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // a - d
276df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubh      %[ftmp4],   %[ftmp5],       %[ftmp6]            \n\t"
277df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
278df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pcmpeqh    %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
279df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddh      %[ftmp6],   %[ftmp6],       %[ff_ph_01]         \n\t"
280df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddh      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
281df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    TRANSPOSE_4H
282df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
283df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // op[2], op[0]
284df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmaddhw    %[ftmp5],   %[ftmp1],       %[ff_pw_01]         \n\t"
285df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // op[3], op[1]
286df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmaddhw    %[ftmp1],   %[ftmp1],       %[ff_pw_mask]       \n\t"
287df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
288df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // op[6], op[4]
289df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmaddhw    %[ftmp6],   %[ftmp2],       %[ff_pw_01]         \n\t"
290df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // op[7], op[5]
291df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmaddhw    %[ftmp2],   %[ftmp2],       %[ff_pw_mask]       \n\t"
292df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
293df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // op[10], op[8]
294df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmaddhw    %[ftmp7],   %[ftmp3],       %[ff_pw_01]         \n\t"
295df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // op[11], op[9]
296df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmaddhw    %[ftmp3],   %[ftmp3],       %[ff_pw_mask]       \n\t"
297df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
298df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // op[14], op[12]
299df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmaddhw    %[ftmp8],   %[ftmp4],       %[ff_pw_01]         \n\t"
300df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // op[15], op[13]
301df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pmaddhw    %[ftmp4],   %[ftmp4],       %[ff_pw_mask]       \n\t"
302df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
303df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // a1, a3
304df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp9],   %[ftmp5],       %[ftmp7]            \n\t"
305df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // d1, d3
306df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp10],  %[ftmp6],       %[ftmp8]            \n\t"
307df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // c1, c3
308df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubw      %[ftmp11],  %[ftmp6],       %[ftmp8]            \n\t"
309df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // b1, b3
310df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubw      %[ftmp12],  %[ftmp5],       %[ftmp7]            \n\t"
311df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
312df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // a1 + d1, a3 + d3
313df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp5],   %[ftmp9],       %[ftmp10]           \n\t"
314df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // b1 + c1, b3 + c3
315df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp6],   %[ftmp12],      %[ftmp11]           \n\t"
316df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // b1 - c1, b3 - c3
317df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubw      %[ftmp7],   %[ftmp12],      %[ftmp11]           \n\t"
318df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // a1 - d1, a3 - d3
319df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubw      %[ftmp8],   %[ftmp9],       %[ftmp10]           \n\t"
320df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
321df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // a2, a4
322df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp9],   %[ftmp1],       %[ftmp3]            \n\t"
323df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // d2, d4
324df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp10],  %[ftmp2],       %[ftmp4]            \n\t"
325df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // c2, c4
326df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubw      %[ftmp11],  %[ftmp2],       %[ftmp4]            \n\t"
327df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // b2, b4
328df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubw      %[ftmp12],  %[ftmp1],       %[ftmp3]            \n\t"
329df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
330df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // a2 + d2, a4 + d4
331df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp1],   %[ftmp9],       %[ftmp10]           \n\t"
332df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // b2 + c2, b4 + c4
333df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp2],   %[ftmp12],      %[ftmp11]           \n\t"
334df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // b2 - c2, b4 - c4
335df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubw      %[ftmp3],   %[ftmp12],      %[ftmp11]           \n\t"
336df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    // a2 - d2, a4 - d4
337df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psubw      %[ftmp4],   %[ftmp9],       %[ftmp10]           \n\t"
338df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
339df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    MMI_LI(%[tmp0], 0x03)
340df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
341df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
342df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp1]            \n\t"
343df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
344df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp1],   %[ftmp1],       %[ftmp9]            \n\t"
345df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp1],   %[ftmp1],       %[ff_pw_03]         \n\t"
346df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psraw      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
347df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
348df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp2]            \n\t"
349df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
350df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp2],   %[ftmp2],       %[ftmp9]            \n\t"
351df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp2],   %[ftmp2],       %[ff_pw_03]         \n\t"
352df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psraw      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
353df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
354df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp3]            \n\t"
355df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
356df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
357df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp3],   %[ftmp3],       %[ff_pw_03]         \n\t"
358df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psraw      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
359df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
360df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp4]            \n\t"
361df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
362df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp4],   %[ftmp4],       %[ftmp9]            \n\t"
363df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp4],   %[ftmp4],       %[ff_pw_03]         \n\t"
364df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psraw      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
365df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
366df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp5]            \n\t"
367df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
368df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp5],   %[ftmp5],       %[ftmp9]            \n\t"
369df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp5],   %[ftmp5],       %[ff_pw_03]         \n\t"
370df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psraw      %[ftmp5],   %[ftmp5],       %[ftmp11]           \n\t"
371df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
372df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp6]            \n\t"
373df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
374df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
375df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp6],   %[ftmp6],       %[ff_pw_03]         \n\t"
376df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psraw      %[ftmp6],   %[ftmp6],       %[ftmp11]           \n\t"
377df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
378df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp7]            \n\t"
379df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
380df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp7],   %[ftmp7],       %[ftmp9]            \n\t"
381df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp7],   %[ftmp7],       %[ff_pw_03]         \n\t"
382df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psraw      %[ftmp7],   %[ftmp7],       %[ftmp11]           \n\t"
383df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
384df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp8]            \n\t"
385df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
386df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp8],   %[ftmp8],       %[ftmp9]            \n\t"
387df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "paddw      %[ftmp8],   %[ftmp8],       %[ff_pw_03]         \n\t"
388df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "psraw      %[ftmp8],   %[ftmp8],       %[ftmp11]           \n\t"
389df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
390df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "packsswh   %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
391df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "packsswh   %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
392df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "packsswh   %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
393df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "packsswh   %[ftmp4],   %[ftmp4],       %[ftmp8]            \n\t"
394df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
395df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    MMI_LI(%[tmp0], 0x72)
396df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
397df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pshufh     %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
398df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
399df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
400df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
401df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
402df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gssdlc1    %[ftmp1],   0x07(%[op])                         \n\t"
403df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gssdrc1    %[ftmp1],   0x00(%[op])                         \n\t"
404df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gssdlc1    %[ftmp2],   0x0f(%[op])                         \n\t"
405df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gssdrc1    %[ftmp2],   0x08(%[op])                         \n\t"
406df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gssdlc1    %[ftmp3],   0x17(%[op])                         \n\t"
407df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gssdrc1    %[ftmp3],   0x10(%[op])                         \n\t"
408df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gssdlc1    %[ftmp4],   0x1f(%[op])                         \n\t"
409df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    "gssdrc1    %[ftmp4],   0x18(%[op])                         \n\t"
410df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
411df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
412df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
413df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
414df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
415df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
416df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ftmp12]"=&f"(ftmp[12]),
417df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [tmp0]"=&r"(tmp[0]),
418df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ip]"+&r"(input)
419df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    : [op]"r"(output),
420df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ff_pw_01]"f"(ff_pw_01),          [pitch]"r"((mips_reg)pitch),
421df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ff_pw_03]"f"(ff_pw_03),          [ff_pw_mask]"f"(ff_pw_mask),
422df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      [ff_ph_01]"f"(ff_ph_01)
423df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    : "memory"
424df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  );
425df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
426