1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "./vp9_rtcd.h"
12#include "vp9/common/vp9_common.h"
13
/*
 * Building blocks of the 16x16 inverse DCT.  Only the prototypes live in this
 * file; the bodies are provided elsewhere (NOTE(review): presumably the NEON
 * assembly sources - confirm).
 *
 * As used by the callers in this file, a "pass1" call handles the
 * even-indexed elements (0, 2, ..., 14) and leaves its stage 6 result in
 * `output`; the matching "pass2" call handles the odd-indexed elements
 * (1, 3, ..., 15), combines them with `pass1Output` and produces the stage 7
 * result.
 *
 * NOTE(review): despite its name, callers pass skip_adding == 0 when the
 * result is only stored to `output`, and skip_adding == 1 when the result is
 * added to `dest` - confirm against the implementation.
 */
void vp9_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output,
                                      int output_stride);
void vp9_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
                                      int16_t *pass1Output,
                                      int16_t skip_adding, uint8_t *dest,
                                      int dest_stride);

/* Same pair of passes, used for the row stage of the "10" transform. */
void vp9_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output,
                                     int output_stride);
void vp9_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output,
                                     int16_t *pass1Output,
                                     int16_t skip_adding, uint8_t *dest,
                                     int dest_stride);
32
/*
 * On ARM NEON, registers d8-d15 are callee-saved, so their values have to be
 * preserved.  The transforms in this file call vp9_push_neon() with an
 * int64_t[8] buffer before doing any work and vp9_pop_neon() with the same
 * buffer once they are done.
 */
void vp9_push_neon(int64_t *store);
void vp9_pop_neon(int64_t *store);
36
/*
 * 16x16 inverse DCT over a full coefficient block.  The column stage is run
 * with the "add" flag set, i.e. its result is added to the destination data.
 *
 * input:        coefficient block; the upper 8 rows are read starting at
 *               input and the lower 8 rows starting at input + 8 * 16.
 * dest:         destination data; the left 8 columns go through dest and the
 *               right 8 columns through dest + 8.
 * dest_stride:  handed unchanged to every pass2 call.
 *
 * The transform is a row stage followed by a column stage, each split into
 * two halves of 8 lines.  Every half is one pass1 call (even elements
 * 0, 2, ..., 14; stage 6 result kept in a scratch buffer) followed by one
 * pass2 call (odd elements 1, 3, ..., 15, combined with the scratch buffer
 * into the stage 7 result).
 */
void vp9_idct16x16_256_add_neon(const int16_t *input,
                                uint8_t *dest, int dest_stride) {
  int64_t neon_regs[8];
  int16_t stage6_buf[16 * 16] = {0};
  int16_t row_pass_out[16 * 16] = {0};

  /* Where each 8-line half is read from, and where pass2 writes to. */
  const int16_t *const upper_rows = input;
  const int16_t *const lower_rows = input + 8 * 16;
  const int16_t *const left_cols = row_pass_out;
  const int16_t *const right_cols = row_pass_out + 8 * 16;
  int16_t *const out_first = row_pass_out;
  int16_t *const out_second = row_pass_out + 8;
  const int stage6_stride = 8;

  /* Stash d8-d15 for the duration of the transform. */
  vp9_push_neon(neon_regs);

  /* Row stage, upper 8 rows: even elements into stage6_buf, then the odd
   * elements are merged in and the stage 7 result lands in row_pass_out. */
  vp9_idct16x16_256_add_neon_pass1(upper_rows, stage6_buf, stage6_stride);
  vp9_idct16x16_256_add_neon_pass2(upper_rows + 1, out_first, stage6_buf,
                                   0, dest, dest_stride);

  /* Row stage, lower 8 rows: same two steps, stored 8 elements further into
   * row_pass_out. */
  vp9_idct16x16_256_add_neon_pass1(lower_rows, stage6_buf, stage6_stride);
  vp9_idct16x16_256_add_neon_pass2(lower_rows + 1, out_second, stage6_buf,
                                   0, dest, dest_stride);

  /* Column stage, left 8 columns: the flag is now 1, so the stage 7 result
   * is added to the destination data. */
  vp9_idct16x16_256_add_neon_pass1(left_cols, stage6_buf, stage6_stride);
  vp9_idct16x16_256_add_neon_pass2(left_cols + 1, out_first, stage6_buf,
                                   1, dest, dest_stride);

  /* Column stage, right 8 columns: as above, targeting dest + 8. */
  vp9_idct16x16_256_add_neon_pass1(right_cols, stage6_buf, stage6_stride);
  vp9_idct16x16_256_add_neon_pass2(right_cols + 1, out_second, stage6_buf,
                                   1, dest + 8, dest_stride);

  /* Put d8-d15 back. */
  vp9_pop_neon(neon_regs);
}
111
/*
 * 16x16 inverse DCT for blocks whose lower 8 rows are all zero, so the row
 * stage is only run on the upper 8 rows.  The column stage is run with the
 * "add" flag set, i.e. its result is added to the destination data.
 * NOTE(review): the "10" in the name presumably refers to the number of
 * coefficients that may be non-zero - confirm with the caller.
 *
 * input:        coefficient block; only the upper 8 rows (starting at input)
 *               are read.
 * dest:         destination data; the left 8 columns go through dest and the
 *               right 8 columns through dest + 8.
 * dest_stride:  handed unchanged to every pass2 call.
 *
 * Every half-stage is one pass1 call (even elements 0, 2, ..., 14; stage 6
 * result kept in a scratch buffer) followed by one pass2 call (odd elements
 * 1, 3, ..., 15, combined with the scratch buffer into the stage 7 result).
 * The row stage uses the "_10_" helpers, the column stage the "_256_" ones.
 */
void vp9_idct16x16_10_add_neon(const int16_t *input,
                               uint8_t *dest, int dest_stride) {
  int64_t neon_regs[8];
  int16_t stage6_buf[16 * 16] = {0};
  /* NOTE(review): the zero fill presumably stands in for the half of the row
   * stage that is skipped below - confirm before removing it. */
  int16_t row_pass_out[16 * 16] = {0};

  /* Where each column half is read from, and where pass2 writes to. */
  const int16_t *const left_cols = row_pass_out;
  const int16_t *const right_cols = row_pass_out + 8 * 16;
  int16_t *const out_first = row_pass_out;
  int16_t *const out_second = row_pass_out + 8;
  const int stage6_stride = 8;

  /* Stash d8-d15 for the duration of the transform. */
  vp9_push_neon(neon_regs);

  /* Row stage, upper 8 rows: even elements into stage6_buf, then the odd
   * elements are merged in and the stage 7 result lands in row_pass_out. */
  vp9_idct16x16_10_add_neon_pass1(input, stage6_buf, stage6_stride);
  vp9_idct16x16_10_add_neon_pass2(input + 1, out_first, stage6_buf,
                                  0, dest, dest_stride);

  /* No row stage for the lower 8 rows: they are all zero. */

  /* Column stage, left 8 columns: the flag is now 1, so the stage 7 result
   * is added to the destination data. */
  vp9_idct16x16_256_add_neon_pass1(left_cols, stage6_buf, stage6_stride);
  vp9_idct16x16_256_add_neon_pass2(left_cols + 1, out_first, stage6_buf,
                                   1, dest, dest_stride);

  /* Column stage, right 8 columns: as above, targeting dest + 8. */
  vp9_idct16x16_256_add_neon_pass1(right_cols, stage6_buf, stage6_stride);
  vp9_idct16x16_256_add_neon_pass2(right_cols + 1, out_second, stage6_buf,
                                   1, dest + 8, dest_stride);

  /* Put d8-d15 back. */
  vp9_pop_neon(neon_regs);
}
173