1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "./vp9_rtcd.h"
12#include "vp9/common/vp9_common.h"
13
/*
 * Transform kernels and register helpers defined outside this file.
 *
 * As the call sites in this file use them, each pass1/pass2 pair handles one
 * 8-line half of the 16x16 block:
 *   - pass1 processes the even elements 0, 2, ..., 14 and stores its stage 6
 *     result in `output` (the callers pass an output_stride of 8).
 *   - pass2 processes the odd elements 1, 3, ..., 15 and combines them with
 *     `pass1Output` to produce the stage 7 result. The callers pass
 *     skip_adding == 0 for the row transform, where the result is kept in
 *     `output`, and skip_adding == 1 for the column transform, where the
 *     result is added to `dest`.
 */
extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input,
                                               int16_t *output,
                                               int output_stride);
extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
                                               int16_t *output,
                                               int16_t *pass1Output,
                                               int16_t skip_adding,
                                               uint8_t *dest,
                                               int dest_stride);

/*
 * Same pass1/pass2 interface; vp9_short_idct10_16x16_add_neon uses these for
 * the row transform of the upper 8 rows only.
 */
extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input,
                                                  int16_t *output,
                                                  int output_stride);
extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src,
                                                  int16_t *output,
                                                  int16_t *pass1Output,
                                                  int16_t skip_adding,
                                                  uint8_t *dest,
                                                  int dest_stride);

/*
 * Save / restore the d8-d15 register values around the kernels above.
 * Declared with (void) so that calls are checked against a real prototype;
 * an empty `()` parameter list leaves the parameters unspecified in C.
 */
extern void save_neon_registers(void);
extern void restore_neon_registers(void);
34
35
/*
 * 16x16 inverse DCT driver: runs the row transform on `input`, then the
 * column transform on the row result, adding the final values to the
 * destination data at `dest` (`dest_stride` is forwarded to the kernels
 * unchanged).
 *
 * Both transforms are done in two 8-line halves. For every half, pass1
 * processes the even elements 0, 2, ..., 14 and leaves its stage 6 result in
 * a scratch buffer; pass2 then processes the odd elements 1, 3, ..., 15 and
 * combines them with that scratch data to produce the stage 7 result.
 */
void vp9_short_idct16x16_add_neon(int16_t *input,
                                  uint8_t *dest, int dest_stride) {
  enum { kDim = 16, kHalf = 8, kPass1Stride = 8 };
  int16_t even_stage6[kDim * kDim] = {0};  // pass1 scratch, reused per half
  int16_t rows_done[kDim * kDim] = {0};    // output of the row transform
  int half;

  // Keep the d8-d15 register values for the duration of the transform.
  save_neon_registers();

  /* Row transform: upper 8 rows (half == 0), then lower 8 rows (half == 1).
   * The fourth pass2 argument is 0 here: the stage 7 result is saved into
   * rows_done instead of being added to the destination. */
  for (half = 0; half < 2; ++half) {
    int16_t *const rows_in = input + half * kHalf * kDim;

    vp9_short_idct16x16_add_neon_pass1(rows_in, even_stage6, kPass1Stride);
    vp9_short_idct16x16_add_neon_pass2(rows_in + 1,
                                       rows_done + half * kHalf,
                                       even_stage6,
                                       0,
                                       dest,
                                       dest_stride);
  }

  /* Column transform: left 8 columns (half == 0), then right 8 columns
   * (half == 1). The fourth pass2 argument is 1 here: the stage 7 result is
   * added to the destination data, 8 pixels further right for the second
   * half. */
  for (half = 0; half < 2; ++half) {
    int16_t *const cols_in = rows_done + half * kHalf * kDim;

    vp9_short_idct16x16_add_neon_pass1(cols_in, even_stage6, kPass1Stride);
    vp9_short_idct16x16_add_neon_pass2(cols_in + 1,
                                       rows_done + half * kHalf,
                                       even_stage6,
                                       1,
                                       dest + half * kHalf,
                                       dest_stride);
  }

  // Put the saved d8-d15 register values back.
  restore_neon_registers();
}
109
/*
 * 16x16 inverse DCT driver for the "idct10" case, where the lower 8 rows of
 * `input` are all 0s and their row transform is skipped.
 *
 * The upper 8 rows go through the idct10 pass1/pass2 kernels; the column
 * transform then uses the regular 16x16 kernels on both 8-column halves and
 * adds the final values to the destination data at `dest` (`dest_stride` is
 * forwarded to the kernels unchanged).
 */
void vp9_short_idct10_16x16_add_neon(int16_t *input,
                                  uint8_t *dest, int dest_stride) {
  enum { kDim = 16, kHalf = 8, kPass1Stride = 8 };
  int16_t even_stage6[kDim * kDim] = {0};  // pass1 scratch, reused per half
  int16_t rows_done[kDim * kDim] = {0};    // output of the row transform
  int half;

  // Keep the d8-d15 register values for the duration of the transform.
  save_neon_registers();

  /* Row transform, upper 8 rows only. pass1 processes the even elements
   * 0, 2, ..., 14 and saves the stage 6 result in even_stage6; pass2
   * processes the odd elements 1, 3, ..., 15, combines them with even_stage6
   * and saves the stage 7 result into rows_done (fourth argument 0: nothing
   * is added to the destination yet). */
  vp9_short_idct10_16x16_add_neon_pass1(input, even_stage6, kPass1Stride);
  vp9_short_idct10_16x16_add_neon_pass2(input + 1,
                                        rows_done,
                                        even_stage6,
                                        0,
                                        dest,
                                        dest_stride);

  /* No row transform for the lower 8 rows: they are all 0s. */

  /* Column transform with the regular 16x16 kernels: left 8 columns
   * (half == 0), then right 8 columns (half == 1). The fourth pass2 argument
   * is 1 here: the stage 7 result is added to the destination data, 8 pixels
   * further right for the second half. */
  for (half = 0; half < 2; ++half) {
    int16_t *const cols_in = rows_done + half * kHalf * kDim;

    vp9_short_idct16x16_add_neon_pass1(cols_in, even_stage6, kPass1Stride);
    vp9_short_idct16x16_add_neon_pass2(cols_in + 1,
                                       rows_done + half * kHalf,
                                       even_stage6,
                                       1,
                                       dest + half * kHalf,
                                       dest_stride);
  }

  // Put the saved d8-d15 register values back.
  restore_neon_registers();
}
170