/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
1053a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org
1153a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org#include "./vp9_rtcd.h"
1253a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org#include "vp9/common/vp9_common.h"
1353a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org
/*
 * Two-pass 16x16 inverse-DCT workers. They are only declared here; their
 * definitions live outside this file.
 *
 * pass1: processes the even elements (0, 2, 4, ..., 14) of |input| and stores
 *        the stage 6 result in |output|. The callers in this file always
 *        pass 8 for |output_stride|.
 * pass2: processes the odd elements (1, 3, 5, ..., 15) of |src| and combines
 *        them with the pass1 result in |pass1Output| to form the stage 7
 *        result. According to the call sites in this file, skip_adding == 0
 *        stores that result in |output|, while skip_adding == 1 adds it to
 *        the destination data at |dest| (using |dest_stride|).
 *
 * The _10_ variants share the same signatures; in this file they are used
 * only for the row pass of vp9_idct16x16_10_add_neon().
 */
void vp9_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output,
                                      int output_stride);
void vp9_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
                                      int16_t *pass1Output,
                                      int16_t skip_adding, uint8_t *dest,
                                      int dest_stride);
void vp9_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output,
                                     int output_stride);
void vp9_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output,
                                     int16_t *pass1Output,
                                     int16_t skip_adding, uint8_t *dest,
                                     int dest_stride);
32ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org
/*
 * On ARM NEON, d8-d15 are callee-saved registers, so their values have to be
 * preserved around the transform. vp9_push_neon() saves them into |store| and
 * vp9_pop_neon() restores them from it; the callers in this file hand over an
 * int64_t[8] buffer.
 */
extern void vp9_push_neon(int64_t *store);
extern void vp9_pop_neon(int64_t *store);
36ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org
/*
 * 16x16 inverse DCT driver: runs the two-pass worker over the upper and lower
 * 8 rows, then over the left and right 8 columns, adding the column results
 * to the destination data.
 *
 *   input       - coefficient block, read as a 16x16 array of int16_t.
 *   dest        - destination data the final result is added to.
 *   dest_stride - stride of |dest|, forwarded unchanged to the pass2 worker.
 */
void vp9_idct16x16_256_add_neon(const int16_t *input,
                                uint8_t *dest, int dest_stride) {
  int64_t neon_backup[8];              /* holds d8-d15 for the whole call */
  int16_t even_stage[16 * 16] = {0};   /* stage 6 result written by pass1 */
  int16_t row_result[16 * 16] = {0};   /* stage 7 result of the row passes */

  /* Start of the second 8-line half inside each 16x16 buffer. */
  const int16_t *const lower_rows = input + 8 * 16;
  const int16_t *const right_cols = row_result + 8 * 16;

  /* d8-d15 are callee-saved: stash them before the workers run. */
  vp9_push_neon(neon_backup);

  /*
   * Rows, upper 8.
   * pass1 takes the even elements (0, 2, ..., 14) and leaves the stage 6
   * result in even_stage; pass2 takes the odd elements (1, 3, ..., 15),
   * merges them with even_stage and, with skip_adding == 0, saves the
   * stage 7 result in row_result.
   */
  vp9_idct16x16_256_add_neon_pass1(input, even_stage, 8);
  vp9_idct16x16_256_add_neon_pass2(input + 1, row_result, even_stage, 0,
                                   dest, dest_stride);

  /* Rows, lower 8: same two passes, output offset by 8 in row_result. */
  vp9_idct16x16_256_add_neon_pass1(lower_rows, even_stage, 8);
  vp9_idct16x16_256_add_neon_pass2(lower_rows + 1, row_result + 8, even_stage,
                                   0, dest, dest_stride);

  /*
   * Columns, left 8.
   * Same even/odd split, now reading the row-pass output. With
   * skip_adding == 1 the stage 7 result is added to the destination data.
   */
  vp9_idct16x16_256_add_neon_pass1(row_result, even_stage, 8);
  vp9_idct16x16_256_add_neon_pass2(row_result + 1, row_result, even_stage, 1,
                                   dest, dest_stride);

  /* Columns, right 8: result is added to the destination starting at +8. */
  vp9_idct16x16_256_add_neon_pass1(right_cols, even_stage, 8);
  vp9_idct16x16_256_add_neon_pass2(right_cols + 1, row_result + 8, even_stage,
                                   1, dest + 8, dest_stride);

  /* Put d8-d15 back before returning to the caller. */
  vp9_pop_neon(neon_backup);
}
11153a13f1fa964820f7a8f9d3932a6f3c0433f8bf5fgalligan@chromium.org
/*
 * 16x16 inverse DCT driver for blocks whose lower 8 rows are all zero: only
 * the upper 8 rows get a row pass (using the _10_ workers), after which both
 * column halves are transformed with the _256_ workers and added to the
 * destination data.
 *
 *   input       - coefficient block, read as a 16x16 array of int16_t.
 *   dest        - destination data the final result is added to.
 *   dest_stride - stride of |dest|, forwarded unchanged to the pass2 worker.
 */
void vp9_idct16x16_10_add_neon(const int16_t *input,
                               uint8_t *dest, int dest_stride) {
  int64_t neon_backup[8];              /* holds d8-d15 for the whole call */
  int16_t even_stage[16 * 16] = {0};   /* stage 6 result written by pass1 */
  int16_t row_result[16 * 16] = {0};   /* stage 7 result of the row pass */

  /* Start of the second 8-line half of the row-pass output. */
  const int16_t *const right_cols = row_result + 8 * 16;

  /* d8-d15 are callee-saved: stash them before the workers run. */
  vp9_push_neon(neon_backup);

  /*
   * Rows, upper 8.
   * pass1 takes the even elements (0, 2, ..., 14) and leaves the stage 6
   * result in even_stage; pass2 takes the odd elements (1, 3, ..., 15),
   * merges them with even_stage and, with skip_adding == 0, saves the
   * stage 7 result in row_result.
   */
  vp9_idct16x16_10_add_neon_pass1(input, even_stage, 8);
  vp9_idct16x16_10_add_neon_pass2(input + 1, row_result, even_stage, 0,
                                  dest, dest_stride);

  /*
   * Rows, lower 8: skipped because they are all 0s. No row pass is issued
   * with row_result + 8 as output here, so whatever that pass would have
   * written presumably stays at the zero initialiser of row_result.
   */

  /*
   * Columns, left 8.
   * Same even/odd split, now reading the row-pass output through the _256_
   * workers. With skip_adding == 1 the stage 7 result is added to the
   * destination data.
   */
  vp9_idct16x16_256_add_neon_pass1(row_result, even_stage, 8);
  vp9_idct16x16_256_add_neon_pass2(row_result + 1, row_result, even_stage, 1,
                                   dest, dest_stride);

  /* Columns, right 8: result is added to the destination starting at +8. */
  vp9_idct16x16_256_add_neon_pass1(right_cols, even_stage, 8);
  vp9_idct16x16_256_add_neon_pass2(right_cols + 1, row_result + 8, even_stage,
                                   1, dest + 8, dest_stride);

  /* Put d8-d15 back before returning to the caller. */
  vp9_pop_neon(neon_backup);
}
173