1/* 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "./vp9_rtcd.h" 12#include "vp9/common/vp9_common.h" 13 14extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input, 15 int16_t *output, 16 int output_stride); 17extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src, 18 int16_t *output, 19 int16_t *pass1Output, 20 int16_t skip_adding, 21 uint8_t *dest, 22 int dest_stride); 23extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input, 24 int16_t *output, 25 int output_stride); 26extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src, 27 int16_t *output, 28 int16_t *pass1Output, 29 int16_t skip_adding, 30 uint8_t *dest, 31 int dest_stride); 32extern void save_neon_registers(); 33extern void restore_neon_registers(); 34 35 36void vp9_short_idct16x16_add_neon(int16_t *input, 37 uint8_t *dest, int dest_stride) { 38 int16_t pass1_output[16*16] = {0}; 39 int16_t row_idct_output[16*16] = {0}; 40 41 // save d8-d15 register values. 42 save_neon_registers(); 43 44 /* Parallel idct on the upper 8 rows */ 45 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the 46 // stage 6 result in pass1_output. 47 vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8); 48 49 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines 50 // with result in pass1(pass1_output) to calculate final result in stage 7 51 // which will be saved into row_idct_output. 52 vp9_short_idct16x16_add_neon_pass2(input+1, 53 row_idct_output, 54 pass1_output, 55 0, 56 dest, 57 dest_stride); 58 59 /* Parallel idct on the lower 8 rows */ 60 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the 61 // stage 6 result in pass1_output. 62 vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8); 63 64 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines 65 // with result in pass1(pass1_output) to calculate final result in stage 7 66 // which will be saved into row_idct_output. 67 vp9_short_idct16x16_add_neon_pass2(input+8*16+1, 68 row_idct_output+8, 69 pass1_output, 70 0, 71 dest, 72 dest_stride); 73 74 /* Parallel idct on the left 8 columns */ 75 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the 76 // stage 6 result in pass1_output. 77 vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8); 78 79 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines 80 // with result in pass1(pass1_output) to calculate final result in stage 7. 81 // Then add the result to the destination data. 82 vp9_short_idct16x16_add_neon_pass2(row_idct_output+1, 83 row_idct_output, 84 pass1_output, 85 1, 86 dest, 87 dest_stride); 88 89 /* Parallel idct on the right 8 columns */ 90 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the 91 // stage 6 result in pass1_output. 92 vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); 93 94 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines 95 // with result in pass1(pass1_output) to calculate final result in stage 7. 96 // Then add the result to the destination data. 97 vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1, 98 row_idct_output+8, 99 pass1_output, 100 1, 101 dest+8, 102 dest_stride); 103 104 // restore d8-d15 register values. 105 restore_neon_registers(); 106 107 return; 108} 109 110void vp9_short_idct10_16x16_add_neon(int16_t *input, 111 uint8_t *dest, int dest_stride) { 112 int16_t pass1_output[16*16] = {0}; 113 int16_t row_idct_output[16*16] = {0}; 114 115 // save d8-d15 register values. 116 save_neon_registers(); 117 118 /* Parallel idct on the upper 8 rows */ 119 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the 120 // stage 6 result in pass1_output. 121 vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8); 122 123 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines 124 // with result in pass1(pass1_output) to calculate final result in stage 7 125 // which will be saved into row_idct_output. 126 vp9_short_idct10_16x16_add_neon_pass2(input+1, 127 row_idct_output, 128 pass1_output, 129 0, 130 dest, 131 dest_stride); 132 133 /* Skip Parallel idct on the lower 8 rows as they are all 0s */ 134 135 /* Parallel idct on the left 8 columns */ 136 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the 137 // stage 6 result in pass1_output. 138 vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8); 139 140 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines 141 // with result in pass1(pass1_output) to calculate final result in stage 7. 142 // Then add the result to the destination data. 143 vp9_short_idct16x16_add_neon_pass2(row_idct_output+1, 144 row_idct_output, 145 pass1_output, 146 1, 147 dest, 148 dest_stride); 149 150 /* Parallel idct on the right 8 columns */ 151 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the 152 // stage 6 result in pass1_output. 153 vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); 154 155 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines 156 // with result in pass1(pass1_output) to calculate final result in stage 7. 157 // Then add the result to the destination data. 158 vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1, 159 row_idct_output+8, 160 pass1_output, 161 1, 162 dest+8, 163 dest_stride); 164 165 // restore d8-d15 register values. 166 restore_neon_registers(); 167 168 return; 169} 170