1/* 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "./vp9_rtcd.h" 12#include "vp9/common/vp9_common.h" 13 14void vp9_idct16x16_256_add_neon_pass1(const int16_t *input, 15 int16_t *output, 16 int output_stride); 17void vp9_idct16x16_256_add_neon_pass2(const int16_t *src, 18 int16_t *output, 19 int16_t *pass1Output, 20 int16_t skip_adding, 21 uint8_t *dest, 22 int dest_stride); 23void vp9_idct16x16_10_add_neon_pass1(const int16_t *input, 24 int16_t *output, 25 int output_stride); 26void vp9_idct16x16_10_add_neon_pass2(const int16_t *src, 27 int16_t *output, 28 int16_t *pass1Output, 29 int16_t skip_adding, 30 uint8_t *dest, 31 int dest_stride); 32 33/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ 34extern void vp9_push_neon(int64_t *store); 35extern void vp9_pop_neon(int64_t *store); 36 37void vp9_idct16x16_256_add_neon(const int16_t *input, 38 uint8_t *dest, int dest_stride) { 39 int64_t store_reg[8]; 40 int16_t pass1_output[16*16] = {0}; 41 int16_t row_idct_output[16*16] = {0}; 42 43 // save d8-d15 register values. 44 vp9_push_neon(store_reg); 45 46 /* Parallel idct on the upper 8 rows */ 47 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the 48 // stage 6 result in pass1_output. 49 vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8); 50 51 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines 52 // with result in pass1(pass1_output) to calculate final result in stage 7 53 // which will be saved into row_idct_output. 54 vp9_idct16x16_256_add_neon_pass2(input+1, 55 row_idct_output, 56 pass1_output, 57 0, 58 dest, 59 dest_stride); 60 61 /* Parallel idct on the lower 8 rows */ 62 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the 63 // stage 6 result in pass1_output. 64 vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8); 65 66 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines 67 // with result in pass1(pass1_output) to calculate final result in stage 7 68 // which will be saved into row_idct_output. 69 vp9_idct16x16_256_add_neon_pass2(input+8*16+1, 70 row_idct_output+8, 71 pass1_output, 72 0, 73 dest, 74 dest_stride); 75 76 /* Parallel idct on the left 8 columns */ 77 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the 78 // stage 6 result in pass1_output. 79 vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); 80 81 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines 82 // with result in pass1(pass1_output) to calculate final result in stage 7. 83 // Then add the result to the destination data. 84 vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, 85 row_idct_output, 86 pass1_output, 87 1, 88 dest, 89 dest_stride); 90 91 /* Parallel idct on the right 8 columns */ 92 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the 93 // stage 6 result in pass1_output. 94 vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); 95 96 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines 97 // with result in pass1(pass1_output) to calculate final result in stage 7. 98 // Then add the result to the destination data. 99 vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, 100 row_idct_output+8, 101 pass1_output, 102 1, 103 dest+8, 104 dest_stride); 105 106 // restore d8-d15 register values. 107 vp9_pop_neon(store_reg); 108 109 return; 110} 111 112void vp9_idct16x16_10_add_neon(const int16_t *input, 113 uint8_t *dest, int dest_stride) { 114 int64_t store_reg[8]; 115 int16_t pass1_output[16*16] = {0}; 116 int16_t row_idct_output[16*16] = {0}; 117 118 // save d8-d15 register values. 119 vp9_push_neon(store_reg); 120 121 /* Parallel idct on the upper 8 rows */ 122 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the 123 // stage 6 result in pass1_output. 124 vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8); 125 126 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines 127 // with result in pass1(pass1_output) to calculate final result in stage 7 128 // which will be saved into row_idct_output. 129 vp9_idct16x16_10_add_neon_pass2(input+1, 130 row_idct_output, 131 pass1_output, 132 0, 133 dest, 134 dest_stride); 135 136 /* Skip Parallel idct on the lower 8 rows as they are all 0s */ 137 138 /* Parallel idct on the left 8 columns */ 139 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the 140 // stage 6 result in pass1_output. 141 vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); 142 143 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines 144 // with result in pass1(pass1_output) to calculate final result in stage 7. 145 // Then add the result to the destination data. 146 vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, 147 row_idct_output, 148 pass1_output, 149 1, 150 dest, 151 dest_stride); 152 153 /* Parallel idct on the right 8 columns */ 154 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the 155 // stage 6 result in pass1_output. 156 vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); 157 158 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines 159 // with result in pass1(pass1_output) to calculate final result in stage 7. 160 // Then add the result to the destination data. 161 vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, 162 row_idct_output+8, 163 pass1_output, 164 1, 165 dest+8, 166 dest_stride); 167 168 // restore d8-d15 register values. 169 vp9_pop_neon(store_reg); 170 171 return; 172} 173