1/*
2 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "vp8_rtcd.h"
12
13#if HAVE_DSPR2
14#define CROP_WIDTH 256
15
16/******************************************************************************
17 * Notes:
18 *
19 * This implementation makes use of 16 bit fixed point version of two multiply
20 * constants:
21 *         1.   sqrt(2) * cos (pi/8)
22 *         2.   sqrt(2) * sin (pi/8)
23 * Since the first constant is bigger than 1, to maintain the same 16 bit
24 * fixed point precision as the second one, we use a trick of
25 *         x * a = x + x*(a-1)
26 * so
27 *         x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
28 ****************************************************************************/
29extern unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
30static const int cospi8sqrt2minus1 = 20091;
31static const int sinpi8sqrt2 = 35468;
32
/* Issue a data-prefetch hint for the cache line containing *src.
 * Declared static inline: a bare C99 "inline" provides no external
 * definition in this translation unit, so a non-inlined call (e.g. at
 * -O0) would fail to link.  static also keeps this per-file helper out
 * of the global namespace. */
static inline void prefetch_load_short(short *src) {
  __asm__ __volatile__("pref   0,  0(%[src])   \n\t" : : [src] "r"(src));
}
36
37void vp8_short_idct4x4llm_dspr2(short *input, unsigned char *pred_ptr,
38                                int pred_stride, unsigned char *dst_ptr,
39                                int dst_stride) {
40  int r, c;
41  int a1, b1, c1, d1;
42  short output[16];
43  short *ip = input;
44  short *op = output;
45  int temp1, temp2;
46  int shortpitch = 4;
47
48  int c2, d2;
49  int temp3, temp4;
50  unsigned char *cm = ff_cropTbl + CROP_WIDTH;
51
52  /* prepare data for load */
53  prefetch_load_short(ip + 8);
54
55  /* first loop is unrolled */
56  a1 = ip[0] + ip[8];
57  b1 = ip[0] - ip[8];
58
59  temp1 = (ip[4] * sinpi8sqrt2) >> 16;
60  temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
61  c1 = temp1 - temp2;
62
63  temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
64  temp2 = (ip[12] * sinpi8sqrt2) >> 16;
65  d1 = temp1 + temp2;
66
67  temp3 = (ip[5] * sinpi8sqrt2) >> 16;
68  temp4 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
69  c2 = temp3 - temp4;
70
71  temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
72  temp4 = (ip[13] * sinpi8sqrt2) >> 16;
73  d2 = temp3 + temp4;
74
75  op[0] = a1 + d1;
76  op[12] = a1 - d1;
77  op[4] = b1 + c1;
78  op[8] = b1 - c1;
79
80  a1 = ip[1] + ip[9];
81  b1 = ip[1] - ip[9];
82
83  op[1] = a1 + d2;
84  op[13] = a1 - d2;
85  op[5] = b1 + c2;
86  op[9] = b1 - c2;
87
88  a1 = ip[2] + ip[10];
89  b1 = ip[2] - ip[10];
90
91  temp1 = (ip[6] * sinpi8sqrt2) >> 16;
92  temp2 = ip[14] + ((ip[14] * cospi8sqrt2minus1) >> 16);
93  c1 = temp1 - temp2;
94
95  temp1 = ip[6] + ((ip[6] * cospi8sqrt2minus1) >> 16);
96  temp2 = (ip[14] * sinpi8sqrt2) >> 16;
97  d1 = temp1 + temp2;
98
99  temp3 = (ip[7] * sinpi8sqrt2) >> 16;
100  temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
101  c2 = temp3 - temp4;
102
103  temp3 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
104  temp4 = (ip[15] * sinpi8sqrt2) >> 16;
105  d2 = temp3 + temp4;
106
107  op[2] = a1 + d1;
108  op[14] = a1 - d1;
109  op[6] = b1 + c1;
110  op[10] = b1 - c1;
111
112  a1 = ip[3] + ip[11];
113  b1 = ip[3] - ip[11];
114
115  op[3] = a1 + d2;
116  op[15] = a1 - d2;
117  op[7] = b1 + c2;
118  op[11] = b1 - c2;
119
120  ip = output;
121
122  /* prepare data for load */
123  prefetch_load_short(ip + shortpitch);
124
125  /* second loop is unrolled */
126  a1 = ip[0] + ip[2];
127  b1 = ip[0] - ip[2];
128
129  temp1 = (ip[1] * sinpi8sqrt2) >> 16;
130  temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
131  c1 = temp1 - temp2;
132
133  temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
134  temp2 = (ip[3] * sinpi8sqrt2) >> 16;
135  d1 = temp1 + temp2;
136
137  temp3 = (ip[5] * sinpi8sqrt2) >> 16;
138  temp4 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
139  c2 = temp3 - temp4;
140
141  temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
142  temp4 = (ip[7] * sinpi8sqrt2) >> 16;
143  d2 = temp3 + temp4;
144
145  op[0] = (a1 + d1 + 4) >> 3;
146  op[3] = (a1 - d1 + 4) >> 3;
147  op[1] = (b1 + c1 + 4) >> 3;
148  op[2] = (b1 - c1 + 4) >> 3;
149
150  a1 = ip[4] + ip[6];
151  b1 = ip[4] - ip[6];
152
153  op[4] = (a1 + d2 + 4) >> 3;
154  op[7] = (a1 - d2 + 4) >> 3;
155  op[5] = (b1 + c2 + 4) >> 3;
156  op[6] = (b1 - c2 + 4) >> 3;
157
158  a1 = ip[8] + ip[10];
159  b1 = ip[8] - ip[10];
160
161  temp1 = (ip[9] * sinpi8sqrt2) >> 16;
162  temp2 = ip[11] + ((ip[11] * cospi8sqrt2minus1) >> 16);
163  c1 = temp1 - temp2;
164
165  temp1 = ip[9] + ((ip[9] * cospi8sqrt2minus1) >> 16);
166  temp2 = (ip[11] * sinpi8sqrt2) >> 16;
167  d1 = temp1 + temp2;
168
169  temp3 = (ip[13] * sinpi8sqrt2) >> 16;
170  temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
171  c2 = temp3 - temp4;
172
173  temp3 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
174  temp4 = (ip[15] * sinpi8sqrt2) >> 16;
175  d2 = temp3 + temp4;
176
177  op[8] = (a1 + d1 + 4) >> 3;
178  op[11] = (a1 - d1 + 4) >> 3;
179  op[9] = (b1 + c1 + 4) >> 3;
180  op[10] = (b1 - c1 + 4) >> 3;
181
182  a1 = ip[12] + ip[14];
183  b1 = ip[12] - ip[14];
184
185  op[12] = (a1 + d2 + 4) >> 3;
186  op[15] = (a1 - d2 + 4) >> 3;
187  op[13] = (b1 + c2 + 4) >> 3;
188  op[14] = (b1 - c2 + 4) >> 3;
189
190  ip = output;
191
192  for (r = 0; r < 4; ++r) {
193    for (c = 0; c < 4; ++c) {
194      short a = ip[c] + pred_ptr[c];
195      dst_ptr[c] = cm[a];
196    }
197
198    ip += 4;
199    dst_ptr += dst_stride;
200    pred_ptr += pred_stride;
201  }
202}
203
/* DC-only inverse transform: round the DC coefficient to
 * a1 = (input_dc + 4) >> 3, add it to every pixel of the 4x4 prediction
 * block, and store the clamped result.  |a1| is replicated into all four
 * bytes of a register (replv.qb) so each row is processed as one 32-bit
 * SIMD operation; the saturating byte add/sub (addu_s.qb / subu_s.qb)
 * provides the [0, 255] clamp.
 * NOTE(review): the word loads/stores rely on each pred/dst row being
 * 4-byte aligned, as the inline comments below state -- confirm with
 * callers. */
void vp8_dc_only_idct_add_dspr2(short input_dc, unsigned char *pred_ptr,
                                int pred_stride, unsigned char *dst_ptr,
                                int dst_stride) {
  int a1;
  int i, absa1;
  int t2, vector_a1, vector_a;

  /* a1 = ((input_dc + 4) >> 3); */
  __asm__ __volatile__(
      "addi  %[a1], %[input_dc], 4   \n\t"
      "sra   %[a1], %[a1],       3   \n\t"
      : [a1] "=r"(a1)
      : [input_dc] "r"(input_dc));

  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned
     */
    /* broadcast |a1| into all four byte lanes of vector_a1 */
    __asm__ __volatile__(
        "abs        %[absa1],     %[a1]         \n\t"
        "replv.qb   %[vector_a1], %[absa1]      \n\t"
        : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
        : [a1] "r"(a1));

    /* use (a1 - predptr[c]) instead a1 + predptr[c] */
    for (i = 4; i--;) {
      /* one 4-pixel row per iteration: load, saturating subtract of the
       * DC magnitude, store, then advance both row pointers */
      __asm__ __volatile__(
          "lw             %[t2],       0(%[pred_ptr])                     \n\t"
          "add            %[pred_ptr], %[pred_ptr],    %[pred_stride]     \n\t"
          "subu_s.qb      %[vector_a], %[t2],          %[vector_a1]       \n\t"
          "sw             %[vector_a], 0(%[dst_ptr])                      \n\t"
          "add            %[dst_ptr],  %[dst_ptr],     %[dst_stride]      \n\t"
          : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a),
            [dst_ptr] "+&r"(dst_ptr), [pred_ptr] "+&r"(pred_ptr)
          : [dst_stride] "r"(dst_stride), [pred_stride] "r"(pred_stride),
            [vector_a1] "r"(vector_a1));
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned
     */
    /* non-negative DC: broadcast a1 directly */
    __asm__ __volatile__("replv.qb       %[vector_a1], %[a1]     \n\t"
                         : [vector_a1] "=r"(vector_a1)
                         : [a1] "r"(a1));

    for (i = 4; i--;) {
      /* one 4-pixel row per iteration: load, saturating add of the DC
       * value, store, then advance both row pointers */
      __asm__ __volatile__(
          "lw             %[t2],       0(%[pred_ptr])                 \n\t"
          "add            %[pred_ptr], %[pred_ptr],    %[pred_stride] \n\t"
          "addu_s.qb      %[vector_a], %[vector_a1],   %[t2]          \n\t"
          "sw             %[vector_a], 0(%[dst_ptr])                  \n\t"
          "add            %[dst_ptr],  %[dst_ptr],     %[dst_stride]  \n\t"
          : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a),
            [dst_ptr] "+&r"(dst_ptr), [pred_ptr] "+&r"(pred_ptr)
          : [dst_stride] "r"(dst_stride), [pred_stride] "r"(pred_stride),
            [vector_a1] "r"(vector_a1));
    }
  }
}
263
/* Inverse 4x4 Walsh-Hadamard transform of the second-order (DC) block.
 * The 16 transformed values are scattered into mb_dqcoeff at a stride of
 * 16 shorts, i.e. into the DC slot of each of the macroblock's 16
 * dequantized 4x4 coefficient blocks.
 *
 * Pass 1 transforms columns into a temporary buffer; pass 2 transforms
 * its rows in place, adding the +3 rounding bias before the >>3 scale. */
void vp8_short_inv_walsh4x4_dspr2(short *input, short *mb_dqcoeff) {
  short tmp[16];
  int i;
  int a, b, c, d;

  prefetch_load_short(input);

  /* pass 1: columns */
  for (i = 0; i < 4; ++i) {
    a = input[i] + input[i + 12];
    b = input[i + 4] + input[i + 8];
    c = input[i + 4] - input[i + 8];
    d = input[i] - input[i + 12];

    tmp[i] = a + b;
    tmp[i + 4] = c + d;
    tmp[i + 8] = a - b;
    tmp[i + 12] = d - c;
  }

  prefetch_load_short(tmp);

  /* pass 2: rows, in place, with rounding */
  for (i = 0; i < 16; i += 4) {
    a = tmp[i] + tmp[i + 3] + 3;
    b = tmp[i + 1] + tmp[i + 2];
    c = tmp[i + 1] - tmp[i + 2];
    d = tmp[i] - tmp[i + 3] + 3;

    tmp[i] = (a + b) >> 3;
    tmp[i + 1] = (d + c) >> 3;
    tmp[i + 2] = (a - b) >> 3;
    tmp[i + 3] = (d - c) >> 3;
  }

  /* scatter: one result per 4x4 block's DC position */
  for (i = 0; i < 16; ++i) {
    mb_dqcoeff[i * 16] = tmp[i];
  }
}
318
/* DC-only inverse Walsh-Hadamard transform: every output of the 4x4
 * WHT equals a1 = (input[0] + 3) >> 3, so a1 is stored directly into the
 * DC position of each of the 16 dequantized coefficient blocks.  The
 * halfword stores are 32 bytes apart (16 shorts), matching the
 * mb_dqcoeff[i * 16] layout used by vp8_short_inv_walsh4x4_dspr2 above. */
void vp8_short_inv_walsh4x4_1_dspr2(short *input, short *mb_dqcoeff) {
  int a1;

  a1 = ((input[0] + 3) >> 3);

  /* 16 unrolled halfword stores at byte offsets 0, 32, ..., 480 */
  __asm__ __volatile__(
      "sh             %[a1], 0(%[mb_dqcoeff])                    \n\t"
      "sh             %[a1], 32(%[mb_dqcoeff])                   \n\t"
      "sh             %[a1], 64(%[mb_dqcoeff])                   \n\t"
      "sh             %[a1], 96(%[mb_dqcoeff])                   \n\t"
      "sh             %[a1], 128(%[mb_dqcoeff])                  \n\t"
      "sh             %[a1], 160(%[mb_dqcoeff])                  \n\t"
      "sh             %[a1], 192(%[mb_dqcoeff])                  \n\t"
      "sh             %[a1], 224(%[mb_dqcoeff])                  \n\t"
      "sh             %[a1], 256(%[mb_dqcoeff])                  \n\t"
      "sh             %[a1], 288(%[mb_dqcoeff])                  \n\t"
      "sh             %[a1], 320(%[mb_dqcoeff])                  \n\t"
      "sh             %[a1], 352(%[mb_dqcoeff])                  \n\t"
      "sh             %[a1], 384(%[mb_dqcoeff])                  \n\t"
      "sh             %[a1], 416(%[mb_dqcoeff])                  \n\t"
      "sh             %[a1], 448(%[mb_dqcoeff])                  \n\t"
      "sh             %[a1], 480(%[mb_dqcoeff])                  \n\t"

      :
      : [a1] "r"(a1), [mb_dqcoeff] "r"(mb_dqcoeff));
}
345
346#endif
347