armVCM4P10_TransformResidual4x4_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_TransformResidual4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//
;// Description:
;// Transform Residual 4x4 Coefficients
;//
;//


;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

;// Import symbols required from other files
;// (For example tables)
;// No IMPORT directives are present in this file.


;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;// Guarding implementation by the processor name

    IF  CortexA8

;//--------------------------------------------------------------------
;// Register aliases
;//
;// Several aliases below are bound to the same physical register on
;// purpose: each stage of the transform reuses the registers of an
;// earlier stage once that stage's values are no longer needed.
;//--------------------------------------------------------------------

;// ARM core registers: the two pointer arguments
pDst                RN  0
pSrc                RN  1

;// D0-D3 : block data, viewed as four lanes of signed 16-bit each.
;//         The same four registers carry the loaded input (dIn*),
;//         the first-pass result (df*) and the final output (dh*).
dIn0                DN  D0.S16
dIn1                DN  D1.S16
dIn2                DN  D2.S16
dIn3                DN  D3.S16

df0                 DN  D0.S16
df1                 DN  D1.S16
df2                 DN  D2.S16
df3                 DN  D3.S16

dh0                 DN  D0.S16
dh1                 DN  D1.S16
dh2                 DN  D2.S16
dh3                 DN  D3.S16

;// Q0/Q1 : the same D0-D3 storage seen as two quadwords of 32-bit
;//         elements (Q0 = D0:D1, Q1 = D2:D3)
qf01                QN  Q0.32
qf23                QN  Q1.32

;// D4 : constant zero operand
dZero               DN  D4.S16

;// D5-D8 : intermediate terms of the first pass (de*) and of the
;//         second pass (dg*)
de0                 DN  D5.S16
de1                 DN  D6.S16
de2                 DN  D7.S16
de3                 DN  D8.S16

dg0                 DN  D5.S16
dg1                 DN  D6.S16
dg2                 DN  D7.S16
dg3                 DN  D8.S16

;// D7/D8 : temporaries for the halved operands; they share storage
;//         with de2/dg2 (D7) and de3/dg3 (D8)
dIn1RS              DN  D7.S16
dIn3RS              DN  D8.S16
df1RS               DN  D7.S16
df3RS               DN  D8.S16


    ;// Allocate stack memory required by the function
    ;// (no allocation directives follow)

;//--------------------------------------------------------------------
;// armVCM4P10_TransformResidual4x4
;//
;// Runs the same four-point butterfly twice over a 4x4 block of signed
;// 16-bit values (with a 4x4 transpose in between), then applies a
;// rounding right shift by 6 to every element.
;//
;//   pSrc : base address of the load  (16 halfwords are read)
;//   pDst : base address of the store (16 halfwords are written)
;//   NEON registers touched: D0-D8
;//
;// Butterfly, applied lane-wise across four registers x0..x3:
;//       a0 = x0 + x2                  a1 = x0 - x2
;//       a2 = (x1 >> 1) - x3           a3 = x1 + (x3 >> 1)
;//       y0 = a0 + a3                  y1 = a1 + a2
;//       y2 = a1 - a2                  y3 = a0 - a3
;//--------------------------------------------------------------------

        ;// NOTE(review): the trailing "d8" argument presumably asks the
        ;// M_START macro to preserve NEON registers up to D8, which this
        ;// routine overwrites - confirm against armCOMM_s.h.
        M_START armVCM4P10_TransformResidual4x4, ,d8

        ;//------------------------------------------------------------
        ;// Set up the zero operand and fetch the block
        ;//------------------------------------------------------------

        ;// A halving add against zero is used below as an arithmetic
        ;// shift right by one.
        VMOV        dZero, #0

        ;// De-interleaving load: register dInK receives elements
        ;// K, K+4, K+8, K+12 of the source, i.e. lane j of dInK is
        ;// pSrc[4*j + K]. The block therefore arrives already
        ;// transposed with respect to its memory order.
        VLD4        {dIn0, dIn1, dIn2, dIn3}, [pSrc]

        ;//------------------------------------------------------------
        ;// First pass (d -> e -> f)
        ;//------------------------------------------------------------

        VHADD       dIn1RS, dIn1, dZero         ;// d1 >> 1
        VHADD       dIn3RS, dIn3, dZero         ;// d3 >> 1
        VADD        de0, dIn0, dIn2             ;// e0 = d0 + d2
        VSUB        de1, dIn0, dIn2             ;// e1 = d0 - d2

        ;// de2/dIn1RS and de3/dIn3RS name the same physical registers,
        ;// so each halved operand is consumed and overwritten in place.
        VSUB        de2, dIn1RS, dIn3           ;// e2 = (d1 >> 1) - d3
        VADD        de3, dIn1, dIn3RS           ;// e3 = d1 + (d3 >> 1)

        ;// df0..df3 reuse the input registers; every dIn value has
        ;// been read by this point.
        VADD        df0, de0, de3               ;// f0 = e0 + e3
        VADD        df1, de1, de2               ;// f1 = e1 + e2
        VSUB        df2, de1, de2               ;// f2 = e1 - e2
        VSUB        df3, de0, de3               ;// f3 = e0 - e3

        ;//------------------------------------------------------------
        ;// Transpose the 4x4 intermediate held in df0..df3
        ;//------------------------------------------------------------

        ;// Swap 16-bit elements inside each register pair, then swap
        ;// 32-bit elements between the two quadword views of the same
        ;// four registers.
        VTRN        df2, df3
        VTRN        df0, df1
        VTRN        qf01, qf23

        ;//------------------------------------------------------------
        ;// Second pass (f -> g -> h), same butterfly as the first
        ;//------------------------------------------------------------

        VHADD       df1RS, df1, dZero           ;// f1 >> 1
        VHADD       df3RS, df3, dZero           ;// f3 >> 1
        VADD        dg0, df0, df2               ;// g0 = f0 + f2
        VSUB        dg1, df0, df2               ;// g1 = f0 - f2
        VSUB        dg2, df1RS, df3             ;// g2 = (f1 >> 1) - f3
        VADD        dg3, df1, df3RS             ;// g3 = f1 + (f3 >> 1)
        VADD        dh0, dg0, dg3               ;// h0 = g0 + g3
        VADD        dh1, dg1, dg2               ;// h1 = g1 + g2
        VSUB        dh2, dg1, dg2               ;// h2 = g1 - g2
        VSUB        dh3, dg0, dg3               ;// h3 = g0 - g3

        ;//------------------------------------------------------------
        ;// Final scaling: h = (h + 32) >> 6 (rounding shift right)
        ;//------------------------------------------------------------

        VRSHR       dh0, dh0, #6
        VRSHR       dh1, dh1, #6
        VRSHR       dh2, dh2, #6
        VRSHR       dh3, dh3, #6

        ;//------------------------------------------------------------
        ;// Write dh0..dh3 back to back starting at pDst
        ;//------------------------------------------------------------

        VST1        {dh0, dh1, dh2, dh3}, [pDst]

        ;// No instruction in this routine sets a return value.

End

        ;// Write function tail
        M_END

    ENDIF                                       ;//CortexA8

    END