armVCM4P10_TransformResidual4x4_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  armVCM4P10_TransformResidual4x4_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   12290
21;// Date:       Wednesday, April 9, 2008
22;//
23;//
24;//
25;//
26;// Description:
27;// Transform Residual 4x4 Coefficients
28;//
29;//
30
31
32;// Include standard headers
33
34        INCLUDE omxtypes_s.h
35        INCLUDE armCOMM_s.h
36
37        M_VARIANTS CortexA8
38
39;// Import symbols required from other files
40;// (For example tables)
41
42
43
44
45;// Set debugging level
46;//DEBUG_ON    SETL {TRUE}
47
48
49
50;// Guarding implementation by the processor name
51
52
53
54
55
56
57
58
59;// Guarding implementation by the processor name
60
61    IF  CortexA8
62
;// Register aliases used by armVCM4P10_TransformResidual4x4.
;// Several aliases below deliberately name the same physical register,
;// so the order of the instructions that use them is significant.
63;// ARM Registers
64
65;//Input Registers
66pDst                RN  0
67pSrc                RN  1
68
69
70;// Neon Registers
71
72;// Packed input coefficients (16-bit signed lanes), filled by VLD4
73dIn0                DN  D0.S16
74dIn1                DN  D1.S16
75dIn2                DN  D2.S16
76dIn3                DN  D3.S16
77
78;// Intermediate calculations
;// dZero is set to all-zero lanes; a halving add (VHADD) with it gives x>>1.
;// de2/dIn1RS/dg2/df1RS all name D7 and de3/dIn3RS/dg3/df3RS all name D8:
;// the halved value is consumed and overwritten by the same instruction.
;// df0-df3 reuse D0-D3 (the inputs are no longer read at that point);
;// qf01/qf23 are the Q-register views of D0:D1 and D2:D3 with 32-bit lanes.
79dZero               DN  D4.S16
80de0                 DN  D5.S16
81de1                 DN  D6.S16
82de2                 DN  D7.S16
83de3                 DN  D8.S16
84dIn1RS              DN  D7.S16
85dIn3RS              DN  D8.S16
86df0                 DN  D0.S16
87df1                 DN  D1.S16
88df2                 DN  D2.S16
89df3                 DN  D3.S16
90qf01                QN  Q0.32
91qf23                QN  Q1.32
92dg0                 DN  D5.S16
93dg1                 DN  D6.S16
94dg2                 DN  D7.S16
95dg3                 DN  D8.S16
96df1RS               DN  D7.S16
97df3RS               DN  D8.S16
98
99;// Output values (D0-D3 reused once more)
100dh0                 DN  D0.S16
101dh1                 DN  D1.S16
102dh2                 DN  D2.S16
103dh3                 DN  D3.S16
104
105
106    ;// Allocate stack memory required by the function (none is declared here)
107
108
    ;//-------------------------------------------------------------------
    ;// armVCM4P10_TransformResidual4x4
    ;//
    ;// Runs a two-pass 4x4 butterfly (rows, then columns) over a block of
    ;// sixteen 16-bit signed values and stores (result + 32) >> 6 for each.
    ;// Presumably the H.264 (MPEG-4 Part 10) residual inverse transform,
    ;// judging by the file name -- confirm against the caller.
    ;//
    ;// In:    pDst (r0) - address that receives 16 x 16-bit results (32 bytes)
    ;//        pSrc (r1) - address of the 16 x 16-bit input values (32 bytes)
    ;// Out:   memory at pDst; r0 is not written in the visible body
    ;// Uses:  D0-D8. The "d8" argument to M_START presumably makes the
    ;//        macro preserve D8 for the caller -- confirm in armCOMM_s.h.
    ;// Notes: VADD/VSUB are the non-saturating forms, so intermediates
    ;//        wrap within their 16-bit lanes. Neither address carries an
    ;//        alignment qualifier.
    ;//-------------------------------------------------------------------
109    ;// Write function header
110        M_START armVCM4P10_TransformResidual4x4, ,d8
111
112        ;******************************************************************
113        ;// The strategy used in implementing the transform is as follows:*
114        ;// Load the 4x4 block into four D registers with VLD4, which     *
115        ;// de-interleaves it (the block arrives already transposed)      *
116        ;// Perform the row operations (on columns) using SIMD            *
117        ;// Transpose the 4x4 result matrix                               *
118        ;// Perform the column operations                                 *
119        ;// Store the 4x4 block at one go                                 *
120        ;******************************************************************
121
122        ;// Load all 16 values de-interleaved: dInN receives elements N, N+4, N+8, N+12
123
124        VLD4    {dIn0,dIn1,dIn2,dIn3},[pSrc]
125
126        VMOV    dZero,#0                                    ;// All-zero operand for VHADD, used to right shift by 1
127
128
129        ;****************************************
130        ;// Row Operations (Performed on columns)
131        ;****************************************
132
133
134        VADD        de0,dIn0,dIn2                       ;//  e0 = d0 + d2
135        VSUB        de1,dIn0,dIn2                        ;//  e1 = d0 - d2
136        VHADD       dIn1RS,dIn1,dZero                   ;// (d1>>1): halving add with dZero, a register holding 0
137        VHADD       dIn3RS,dIn3,dZero                   ;// (d3>>1)
138        VSUB        de2,dIn1RS,dIn3                     ;//  e2 = (d1>>1) - d3   (overwrites dIn1RS, same register)
139        VADD        de3,dIn1,dIn3RS                        ;//  e3 = d1 + (d3>>1)   (overwrites dIn3RS, same register)
140        VADD        df0,de0,de3                         ;//  f0 = e0 + e3
141        VADD        df1,de1,de2                            ;//  f1 = e1 + e2
142        VSUB        df2,de1,de2                            ;//  f2 = e1 - e2
143        VSUB        df3,de0,de3                            ;//  f3 = e0 - e3
144
145
146
147        ;*****************************************************************
148        ;// Transpose the resultant matrix
149        ;*****************************************************************
150
151        VTRN    df0,df1                                     ;// 16-bit step: exchange odd lanes of df0 with even lanes of df1
152        VTRN    df2,df3                                     ;// same for df2/df3
153        VTRN    qf01,qf23                                   ;// 32-bit step across D0:D1 and D2:D3 completes the 4x4 transpose
154
155
156        ;*******************************
157        ;// Column Operations
158        ;*******************************
159
160
161        VADD        dg0,df0,df2                         ;//  g0 = f0 + f2
162        VSUB        dg1,df0,df2                            ;//  g1 = f0 - f2
163        VHADD       df1RS,df1,dZero                     ;// (f1>>1): halving add with dZero, a register holding 0
164        VHADD       df3RS,df3,dZero                     ;// (f3>>1)
165        VSUB        dg2,df1RS,df3                       ;//  g2 = (f1>>1) - f3   (overwrites df1RS, same register)
166        VADD        dg3,df1,df3RS                        ;//  g3 = f1 + (f3>>1)   (overwrites df3RS, same register)
167        VADD        dh0,dg0,dg3                         ;//  h0 = g0 + g3
168        VADD        dh1,dg1,dg2                            ;//  h1 = g1 + g2
169        VSUB        dh2,dg1,dg2                            ;//  h2 = g1 - g2
170        VSUB        dh3,dg0,dg3                            ;//  h3 = g0 - g3
171
172
173        ;************************************************
174        ;// Calculate final value (colOp[i][j] + 32)>>6
175        ;************************************************
176
177        VRSHR       dh0,#6                              ;// rounding shift right, in place
178        VRSHR       dh1,#6
179        VRSHR       dh2,#6
180        VRSHR       dh3,#6
181
182
183        ;***************************
184        ;// Store all 16 results (dh0-dh3) contiguously at pDst
185        ;***************************
186
187        VST1   {dh0,dh1,dh2,dh3},[pDst]
188
189
190        ;// Set return value (no instruction sets one here)
191
192End
193
194
195        ;// Write function tail
196        M_END
197
198    ENDIF                                                           ;//CortexA8
199
200    END
201