1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12#include "vpx_ports/config.h"
13#include "dequantize.h"
14#include "idct.h"
15#include "vpx_mem/vpx_mem.h"
16
17DECLARE_ALIGNED(8, const unsigned char, cma[512]) = {
180, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
220, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
2312, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
2433, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
2554, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
2675, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
2796, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
28114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
29131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147,
30148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
31165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
32182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198,
33199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215,
34216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232,
35233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
36250, 251, 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
37255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
38255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
39255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
40255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
41255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
42255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
43255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
44};
45
/*
 * Prototypes for routines defined in another translation unit.  Each takes an
 * input buffer of shorts, an output buffer of shorts and a pitch.
 */
void vp8_short_idct4x4llm_mips(short *input, short *output, int pitch);
void vp8_short_idct4x4llm_1_mips(short *input, short *output, int pitch);
48
49
/*
 * Dequantize a block of 16 coefficients, pass the result through
 * vp8_short_idct4x4llm_mips(), add the transform output to the predictor and
 * store the sums, clamped to 0..255, in dest.
 *
 *   input  - 16 coefficients; the buffer is cleared before returning
 *   dq     - 16 factors multiplied element-wise with input
 *   pred   - predictor pixels, 4 per row, consecutive rows `pitch` bytes apart
 *   dest   - output pixels, 4 per row, consecutive rows `stride` bytes apart
 */
void vp8_dequant_idct_add_mips(short *input, short *dq, unsigned char *pred,
                            unsigned char *dest, int pitch, int stride)
{
    short output[16];
    short input_temp[16];
    const short *diff_ptr = output;
    unsigned int in1, dq1, x1, in2, dq2;
    int sum[4];
    int row, col, i;

    /*
     * Coefficients 0 and 1 are handled as a pair: the two halfwords of each
     * operand are packed into one register with `append` and multiplied with
     * `mul.ph`; the two products are then stored to input_temp[1] and
     * input_temp[0].
     *
     * The asm reads input/dq and writes input_temp through plain register
     * operands, so the "memory" clobber is needed to tell the compiler that
     * memory is read and modified here.
     */
    __asm__ __volatile__ (
        "lh            %[in1], 0(%[input])                \n\t"
        "lh            %[dq1], 0(%[dq])                   \n\t"
        "lh            %[in2], 2(%[input])                \n\t"
        "lh            %[dq2], 2(%[dq])                   \n\t"
        "append        %[in1], %[in2],          16        \n\t"
        "append        %[dq1], %[dq2],          16        \n\t"
        "mul.ph        %[x1],  %[dq1],          %[in1]    \n\t"
        "sh            %[x1],  2(%[input_temp])           \n\t"
        "srl           %[x1],  %[x1], 16                  \n\t"
        "sh            %[x1],  0(%[input_temp])           \n\t"

        : [x1] "=&r" (x1), [in1] "=&r" (in1), [dq1] "=&r" (dq1),
          [in2] "=&r" (in2), [dq2] "=&r" (dq2)
        : [dq] "r" (dq), [input] "r" (input),
          [input_temp] "r" (input_temp)
        : "memory"
    );

    /* Remaining coefficients are dequantized in plain C. */
    for (i = 2; i < 16; i++)
    {
        input_temp[i] = (short)(dq[i] * input[i]);
    }

    /*
     * Original note: "the idct halves ( >> 1) the pitch".
     * NOTE(review): 4 is passed here while the rows of `output` are read
     * 4 shorts apart below - confirm the pitch units expected by
     * vp8_short_idct4x4llm_mips.
     */
    vp8_short_idct4x4llm_mips(input_temp, output, 4);

    /* Clear the 16 coefficients (32 bytes with 2-byte shorts). */
    vpx_memset(input, 0, 16 * sizeof(*input));

    for (row = 0; row < 4; row++)
    {
        /* Read the whole predictor row before writing any output pixel. */
        for (col = 0; col < 4; col++)
        {
            sum[col] = diff_ptr[col] + pred[col];
        }

        /*
         * Clamp explicitly instead of indexing a lookup table: the transform
         * output is not bounded here, and a table lookup with an
         * out-of-range index would read outside the table.
         */
        for (col = 0; col < 4; col++)
        {
            int v = sum[col];

            if (v < 0)
                v = 0;
            else if (v > 255)
                v = 255;

            dest[col] = (unsigned char)v;
        }

        dest += stride;
        diff_ptr += 4;
        pred += pitch;
    }
}
108
109
/*
 * Dequantize a block of coefficients whose first (index 0) value is supplied
 * directly by the caller, pass the result through
 * vp8_short_idct4x4llm_mips(), add the transform output to the predictor and
 * store the sums, clamped to 0..255, in dest.
 *
 *   input  - coefficients; elements 1..15 are used, and the whole 16-element
 *            buffer is cleared before returning
 *   dq     - factors multiplied element-wise with input (elements 1..15)
 *   pred   - predictor pixels, 4 per row, consecutive rows `pitch` bytes apart
 *   dest   - output pixels, 4 per row, consecutive rows `stride` bytes apart
 *   Dc     - value used as coefficient 0 (truncated to short), taking the
 *            place of input[0] * dq[0]
 */
void vp8_dequant_dc_idct_add_mips(short *input, short *dq, unsigned char *pred,
                               unsigned char *dest, int pitch, int stride,
                               int Dc)
{
    short output[16];
    short input_temp[16];
    const short *diff_ptr = output;
    unsigned int in1, dq1, x1, in2, dq2;
    int sum[4];
    int row, col, i;

    input_temp[0] = (short)Dc;

    /*
     * Coefficients 1 and 2 are handled as a pair: the two halfwords of each
     * operand are packed into one register with `append` and multiplied with
     * `mul.ph`; the two products are then stored to input_temp[2] and
     * input_temp[1].
     *
     * The asm reads input/dq and writes input_temp through plain register
     * operands, so the "memory" clobber is needed to tell the compiler that
     * memory is read and modified here.
     */
    __asm__ __volatile__ (
        "lh            %[in1], 2(%[input])                \n\t"
        "lh            %[dq1], 2(%[dq])                   \n\t"
        "lh            %[in2], 4(%[input])                \n\t"
        "lh            %[dq2], 4(%[dq])                   \n\t"
        "append        %[in1], %[in2],          16        \n\t"
        "append        %[dq1], %[dq2],          16        \n\t"
        "mul.ph        %[x1],  %[dq1],          %[in1]    \n\t"
        "sh            %[x1],  4(%[input_temp])           \n\t"
        "srl           %[x1],  %[x1], 16                  \n\t"
        "sh            %[x1],  2(%[input_temp])           \n\t"

        : [x1] "=&r" (x1), [in1] "=&r" (in1), [dq1] "=&r" (dq1),
          [in2] "=&r" (in2), [dq2] "=&r" (dq2)
        : [dq] "r" (dq), [input] "r" (input),
          [input_temp] "r" (input_temp)
        : "memory"
    );

    /* Remaining coefficients are dequantized in plain C. */
    for (i = 3; i < 16; i++)
    {
        input_temp[i] = (short)(dq[i] * input[i]);
    }

    /*
     * NOTE(review): 4 is passed as the pitch while the rows of `output` are
     * read 4 shorts apart below - confirm the pitch units expected by
     * vp8_short_idct4x4llm_mips.
     */
    vp8_short_idct4x4llm_mips(input_temp, output, 4);

    /* Clear the 16 coefficients (32 bytes with 2-byte shorts). */
    vpx_memset(input, 0, 16 * sizeof(*input));

    for (row = 0; row < 4; row++)
    {
        /* Read the whole predictor row before writing any output pixel. */
        for (col = 0; col < 4; col++)
        {
            sum[col] = diff_ptr[col] + pred[col];
        }

        /*
         * Clamp explicitly instead of indexing a lookup table: the transform
         * output is not bounded here, and a table lookup with an
         * out-of-range index would read outside the table.
         */
        for (col = 0; col < 4; col++)
        {
            int v = sum[col];

            if (v < 0)
                v = 0;
            else if (v > 255)
                v = 255;

            dest[col] = (unsigned char)v;
        }

        dest += stride;
        diff_ptr += 4;
        pred += pitch;
    }
}