1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <stdio.h>
13
14#include "./vpx_dsp_rtcd.h"
15#include "vpx_dsp/mips/convolve_common_dspr2.h"
16#include "vpx_dsp/vpx_convolve.h"
17#include "vpx_dsp/vpx_dsp_common.h"
18#include "vpx_ports/mem.h"
19
20#if HAVE_DSPR2
21static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
22                                     uint8_t *dst, int32_t dst_stride,
23                                     const int16_t *filter_y, int32_t w,
24                                     int32_t h) {
25  int32_t x, y;
26  const uint8_t *src_ptr;
27  uint8_t *dst_ptr;
28  uint8_t *cm = vpx_ff_cropTbl;
29  uint32_t vector4a = 64;
30  uint32_t load1, load2;
31  uint32_t p1, p2;
32  uint32_t scratch1;
33  uint32_t store1, store2;
34  int32_t Temp1, Temp2;
35  const int16_t *filter = &filter_y[3];
36  uint32_t filter45;
37
38  filter45 = ((const int32_t *)filter)[0];
39
40  for (y = h; y--;) {
41    /* prefetch data to cache memory */
42    prefetch_store(dst + dst_stride);
43
44    for (x = 0; x < w; x += 4) {
45      src_ptr = src + x;
46      dst_ptr = dst + x;
47
48      __asm__ __volatile__(
49          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
50          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
51          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
52
53          "mtlo             %[vector4a],  $ac0                            \n\t"
54          "mtlo             %[vector4a],  $ac1                            \n\t"
55          "mtlo             %[vector4a],  $ac2                            \n\t"
56          "mtlo             %[vector4a],  $ac3                            \n\t"
57          "mthi             $zero,        $ac0                            \n\t"
58          "mthi             $zero,        $ac1                            \n\t"
59          "mthi             $zero,        $ac2                            \n\t"
60          "mthi             $zero,        $ac3                            \n\t"
61
62          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
63          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
64
65          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
66          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
67
68          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
69          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
70
71          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
72          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
73
74          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
75          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
76
77          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
78          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
79
80          "extp             %[Temp1],     $ac0,           31              \n\t"
81          "extp             %[Temp2],     $ac1,           31              \n\t"
82
83          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
84          "extp             %[Temp1],     $ac2,           31              \n\t"
85
86          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
87          "extp             %[Temp2],     $ac3,           31              \n\t"
88
89          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
90          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
91
92          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
93          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
94
95          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
96          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
97
98          : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
99            [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
100            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
101            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
102          : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
103            [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
104    }
105
106    /* Next row... */
107    src += src_stride;
108    dst += dst_stride;
109  }
110}
111
112static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
113                                      uint8_t *dst, int32_t dst_stride,
114                                      const int16_t *filter_y, int32_t h) {
115  int32_t x, y;
116  const uint8_t *src_ptr;
117  uint8_t *dst_ptr;
118  uint8_t *cm = vpx_ff_cropTbl;
119  uint32_t vector4a = 64;
120  uint32_t load1, load2;
121  uint32_t p1, p2;
122  uint32_t scratch1;
123  uint32_t store1, store2;
124  int32_t Temp1, Temp2;
125  const int16_t *filter = &filter_y[3];
126  uint32_t filter45;
127
128  filter45 = ((const int32_t *)filter)[0];
129
130  for (y = h; y--;) {
131    /* prefetch data to cache memory */
132    prefetch_store(dst + dst_stride);
133
134    for (x = 0; x < 64; x += 4) {
135      src_ptr = src + x;
136      dst_ptr = dst + x;
137
138      __asm__ __volatile__(
139          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
140          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
141          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
142
143          "mtlo             %[vector4a],  $ac0                            \n\t"
144          "mtlo             %[vector4a],  $ac1                            \n\t"
145          "mtlo             %[vector4a],  $ac2                            \n\t"
146          "mtlo             %[vector4a],  $ac3                            \n\t"
147          "mthi             $zero,        $ac0                            \n\t"
148          "mthi             $zero,        $ac1                            \n\t"
149          "mthi             $zero,        $ac2                            \n\t"
150          "mthi             $zero,        $ac3                            \n\t"
151
152          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
153          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
154
155          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
156          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
157
158          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
159          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
160
161          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
162          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
163
164          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
165          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
166
167          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
168          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
169
170          "extp             %[Temp1],     $ac0,           31              \n\t"
171          "extp             %[Temp2],     $ac1,           31              \n\t"
172
173          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
174          "extp             %[Temp1],     $ac2,           31              \n\t"
175
176          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
177          "extp             %[Temp2],     $ac3,           31              \n\t"
178
179          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
180          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
181
182          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
183          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
184
185          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
186          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
187
188          : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
189            [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
190            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
191            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
192          : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
193            [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
194    }
195
196    /* Next row... */
197    src += src_stride;
198    dst += dst_stride;
199  }
200}
201
202void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
203                              uint8_t *dst, ptrdiff_t dst_stride,
204                              const InterpKernel *filter, int x0_q4,
205                              int32_t x_step_q4, int y0_q4, int y_step_q4,
206                              int w, int h) {
207  const int16_t *const filter_y = filter[y0_q4];
208  uint32_t pos = 38;
209
210  assert(y_step_q4 == 16);
211
212  /* bit positon for extract from acc */
213  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
214                       :
215                       : [pos] "r"(pos));
216
217  prefetch_store(dst);
218
219  switch (w) {
220    case 4:
221    case 8:
222    case 16:
223    case 32:
224      convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
225                               h);
226      break;
227    case 64:
228      prefetch_store(dst + 32);
229      convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
230      break;
231    default:
232      vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
233                           x_step_q4, y0_q4, y_step_q4, w, h);
234      break;
235  }
236}
237#endif
238