armVCM4P10_TransformResidual4x4_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
;//
;//
;// File Name:  armVCM4P10_TransformResidual4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// Transform Residual 4x4 Coefficients
;//
;//


;// Standard headers (types and the M_* helper macros used below)

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

;// No symbols are imported from other files.

;// Debugging level (disabled)
;//DEBUG_ON    SETL {TRUE}


;// Everything from here to ENDIF is assembled only when the CortexA8
;// variant is selected.

        IF  CortexA8

;//--------------------------------------------------------------------
;// Register aliases
;//
;// Several aliases deliberately name the same physical register: each
;// stage overwrites the registers of a stage whose values are no longer
;// needed.  All D aliases are typed as signed 16-bit lanes.
;//--------------------------------------------------------------------

;// ARM core registers (function arguments)
pDst            RN  0                   ;// where the 4x4 result is written
pSrc            RN  1                   ;// where the 4x4 input is read from

;// Input block as loaded by VLD4 (one de-interleaved stream per register)
dSrc0           DN  D0.S16
dSrc1           DN  D1.S16
dSrc2           DN  D2.S16
dSrc3           DN  D3.S16

;// All-zero vector, second operand of the halving adds
dAllZero        DN  D4.S16

;// First pass: halved inputs and e0..e3 (the halves live in D7/D8 and
;// are then overwritten in place by e2/e3)
dSrc1Half       DN  D7.S16
dSrc3Half       DN  D8.S16
dRowE0          DN  D5.S16
dRowE1          DN  D6.S16
dRowE2          DN  D7.S16
dRowE3          DN  D8.S16

;// First pass results f0..f3, plus Q views of the same registers for
;// the 32-bit step of the transpose
dRowF0          DN  D0.S16
dRowF1          DN  D1.S16
dRowF2          DN  D2.S16
dRowF3          DN  D3.S16
qRowF01         QN  Q0.32
qRowF23         QN  Q1.32

;// Second pass inputs (D0-D3 after the transpose), halves and e0..e3
dColIn0         DN  D0.S16
dColIn1         DN  D1.S16
dColIn2         DN  D2.S16
dColIn3         DN  D3.S16
dColIn1Half     DN  D7.S16
dColIn3Half     DN  D8.S16
dColE0          DN  D5.S16
dColE1          DN  D6.S16
dColE2          DN  D7.S16
dColE3          DN  D8.S16

;// Final results
dOut0           DN  D0.S16
dOut1           DN  D1.S16
dOut2           DN  D2.S16
dOut3           DN  D3.S16


;//--------------------------------------------------------------------
;// armVCM4P10_TransformResidual4x4
;//
;// Reads sixteen 16-bit values from [pSrc], runs the butterfly
;//
;//     e0 = d0 + d2            f0 = e0 + e3
;//     e1 = d0 - d2            f1 = e1 + e2
;//     e2 = (d1 >> 1) - d3     f2 = e1 - e2
;//     e3 = d1 + (d3 >> 1)     f3 = e0 - e3
;//
;// once along each dimension of the 4x4 block, rounds every result with
;// (x + 32) >> 6 and writes sixteen 16-bit values to [pDst].
;//
;// In:    r0 = pDst, r1 = pSrc
;// Uses:  D0-D8.  No stack memory is allocated by this code itself.
;//        d8 is passed to M_START, presumably so that the macro
;//        preserves it for the caller -- see armCOMM_s.h.
;// NOTE(review): the row/column wording below assumes the block is
;//        stored row by row; confirm against the callers.
;//
;// Steps:
;//   1. Load the block into four D registers, de-interleaved, so that
;//      it arrives already transposed
;//   2. First butterfly pass (row operations, one row per lane)
;//   3. Transpose the 4x4 intermediate result
;//   4. Second butterfly pass (column operations)
;//   5. Round, then store the whole block with a single VST1
;//--------------------------------------------------------------------

        M_START armVCM4P10_TransformResidual4x4, ,d8

        ;// Step 1: de-interleaving load; dSrcN receives elements
        ;// N, N+4, N+8 and N+12 of the sixteen values at [pSrc]
        VLD4    {dSrc0,dSrc1,dSrc2,dSrc3},[pSrc]

        ;// Halving-add against zero yields a signed shift right by one
        VMOV    dAllZero,#0


        ;//-----------------------------------------------------------
        ;// Step 2: row operations (each lane handles one row)
        ;//-----------------------------------------------------------

        VADD    dRowE0,dSrc0,dSrc2              ;// e0 = d0 + d2
        VSUB    dRowE1,dSrc0,dSrc2              ;// e1 = d0 - d2
        VHADD   dSrc1Half,dSrc1,dAllZero        ;// d1 >> 1
        VHADD   dSrc3Half,dSrc3,dAllZero        ;// d3 >> 1
        VSUB    dRowE2,dSrc1Half,dSrc3          ;// e2 = (d1 >> 1) - d3
        VADD    dRowE3,dSrc1,dSrc3Half          ;// e3 = d1 + (d3 >> 1)
        VADD    dRowF0,dRowE0,dRowE3            ;// f0 = e0 + e3
        VADD    dRowF1,dRowE1,dRowE2            ;// f1 = e1 + e2
        VSUB    dRowF2,dRowE1,dRowE2            ;// f2 = e1 - e2
        VSUB    dRowF3,dRowE0,dRowE3            ;// f3 = e0 - e3


        ;//-----------------------------------------------------------
        ;// Step 3: 4x4 transpose of f0..f3 -- two 16-bit lane swaps
        ;// followed by one 32-bit swap across the Q register pair
        ;//-----------------------------------------------------------

        VTRN    dRowF0,dRowF1
        VTRN    dRowF2,dRowF3
        VTRN    qRowF01,qRowF23


        ;//-----------------------------------------------------------
        ;// Step 4: column operations (same butterfly; here d0..d3 are
        ;// the transposed first-pass results in D0..D3)
        ;//-----------------------------------------------------------

        VADD    dColE0,dColIn0,dColIn2          ;// e0 = d0 + d2
        VSUB    dColE1,dColIn0,dColIn2          ;// e1 = d0 - d2
        VHADD   dColIn1Half,dColIn1,dAllZero    ;// d1 >> 1
        VHADD   dColIn3Half,dColIn3,dAllZero    ;// d3 >> 1
        VSUB    dColE2,dColIn1Half,dColIn3      ;// e2 = (d1 >> 1) - d3
        VADD    dColE3,dColIn1,dColIn3Half      ;// e3 = d1 + (d3 >> 1)
        VADD    dOut0,dColE0,dColE3             ;// out0 = e0 + e3
        VADD    dOut1,dColE1,dColE2             ;// out1 = e1 + e2
        VSUB    dOut2,dColE1,dColE2             ;// out2 = e1 - e2
        VSUB    dOut3,dColE0,dColE3             ;// out3 = e0 - e3


        ;//-----------------------------------------------------------
        ;// Step 5: final value = (x + 32) >> 6, done as a rounding
        ;// shift right by 6, then one contiguous store
        ;//-----------------------------------------------------------

        VRSHR   dOut0,dOut0,#6
        VRSHR   dOut1,dOut1,#6
        VRSHR   dOut2,dOut2,#6
        VRSHR   dOut3,dOut3,#6

        VST1    {dOut0,dOut1,dOut2,dOut3},[pDst]


        ;// No instruction here writes a return value.

End


        ;// Function tail
        M_END

        ENDIF                                   ;// CortexA8

        END