;// armVCM4P10_TransformResidual4x4_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
;//
;//
;// File Name:  armVCM4P10_TransformResidual4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// Transform Residual 4x4 Coefficients
;//
;//


;// Include standard headers

;// Project headers. M_VARIANTS (and M_START / M_END used further down) are
;// not defined in this file; presumably they come from armCOMM_s.h -- confirm.
        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        ;// Names the CortexA8 variant; the implementation below is wrapped
        ;// in "IF CortexA8 ... ENDIF".
        M_VARIANTS CortexA8

;// Import symbols required from other files
;// (For example tables)




;// Set debugging level
;//DEBUG_ON    SETL {TRUE}



;// Guarding implementation by the processor name









;// Guarding implementation by the processor name

;// Everything from here to the matching ENDIF is assembled only when the
;// CortexA8 variant is selected.
    IF  CortexA8

;// ARM Registers

;//Input Registers
pDst                RN  0                   ;// r0: where the 4x4 result is stored
pSrc                RN  1                   ;// r1: where the 4x4 input is loaded from


;// Neon Registers
;//
;// NOTE: the aliases below intentionally overlap.  D0-D3 are reused for the
;// input (dIn*), the row-pass output (df*) and the final output (dh*);
;// D5-D8 are reused for both sets of intermediates (de*, dg*); and D7/D8
;// additionally hold the halved operands (dIn1RS/dIn3RS, df1RS/df3RS) just
;// before they are overwritten by de2/de3 or dg2/dg3.  The instruction
;// order in the routine relies on this, so do not reorder without checking
;// which physical register each alias names.

;// Packed input values, one D register per VLD4 de-interleave slot
dIn0                DN  D0.S16
dIn1                DN  D1.S16
dIn2                DN  D2.S16
dIn3                DN  D3.S16

;// Intermediate calculations
dZero               DN  D4.S16              ;// all-zero operand for the VHADD halving trick
de0                 DN  D5.S16              ;// first-pass butterflies e0..e3
de1                 DN  D6.S16
de2                 DN  D7.S16
de3                 DN  D8.S16
dIn1RS              DN  D7.S16              ;// dIn1 >> 1 (same register as de2)
dIn3RS              DN  D8.S16              ;// dIn3 >> 1 (same register as de3)
df0                 DN  D0.S16              ;// first-pass results f0..f3 (overwrite dIn0..dIn3)
df1                 DN  D1.S16
df2                 DN  D2.S16
df3                 DN  D3.S16
qf01                QN  Q0.32               ;// D0:D1 viewed as 32-bit lanes for the transpose
qf23                QN  Q1.32               ;// D2:D3 viewed as 32-bit lanes for the transpose
dg0                 DN  D5.S16              ;// second-pass butterflies g0..g3
dg1                 DN  D6.S16
dg2                 DN  D7.S16
dg3                 DN  D8.S16
df1RS               DN  D7.S16              ;// df1 >> 1 (same register as dg2)
df3RS               DN  D8.S16              ;// df3 >> 1 (same register as dg3)

;// Output values
dh0                 DN  D0.S16
dh1                 DN  D1.S16
dh2                 DN  D2.S16
dh3                 DN  D3.S16


    ;// Allocate stack memory required by the function
    ;// (this routine declares no stack locals)


    ;//-----------------------------------------------------------------
    ;// armVCM4P10_TransformResidual4x4
    ;//
    ;// Reads sixteen signed 16-bit values from pSrc, applies the same
    ;// four-point add/subtract/halve butterfly twice with a transpose in
    ;// between, rounds each result with (x + 32) >> 6 and writes the
    ;// sixteen 16-bit results to pDst.
    ;//
    ;// In:    r0 = pDst   32 bytes are written with a single VST1
    ;//        r1 = pSrc   32 bytes are read with a single VLD4
    ;// Out:   no value is set in r0 by the code visible here
    ;// Uses:  D0-D8.  d8 is passed to M_START, presumably so the macro
    ;//        preserves the registers this routine overwrites -- confirm
    ;//        against the macro definition.
    ;// Notes: all arithmetic is plain 16-bit VADD/VSUB/VHADD; nothing is
    ;//        widened or saturated before the final rounding shift.
    ;//-----------------------------------------------------------------

    ;// Write function header
        M_START armVCM4P10_TransformResidual4x4, ,d8

        ;******************************************************************
        ;// The strategy used in implementing the transform is as follows:*
        ;// Load the 4x4 block into 4 D registers; the de-interleaving    *
        ;//   VLD4 delivers it already transposed                         *
        ;// Perform the row operations (on columns) using SIMD            *
        ;// Transpose the 4x4 result matrix                               *
        ;// Perform the column operations                                 *
        ;// Round, then store the 4x4 block in one go                     *
        ;******************************************************************

        ;// Load all the 4x4 values in transposed form:
        ;// lane k of dIn<j> receives 16-bit element (4*k + j) of the source

        VLD4    {dIn0,dIn1,dIn2,dIn3},[pSrc]

        VMOV    dZero,#0                                    ;// Used to right shift by 1 (via VHADD)


        ;****************************************
        ;// Row Operations (Performed on columns)
        ;****************************************


        VADD        de0,dIn0,dIn2                       ;//  e0 = d0 + d2
        VSUB        de1,dIn0,dIn2                       ;//  e1 = d0 - d2
        VHADD       dIn1RS,dIn1,dZero                   ;//  d1>>1, computed as (d1 + 0) >> 1
        VHADD       dIn3RS,dIn3,dZero                   ;//  d3>>1, computed as (d3 + 0) >> 1
        VSUB        de2,dIn1RS,dIn3                     ;//  e2 = (d1>>1) - d3   (overwrites dIn1RS, same D7)
        VADD        de3,dIn1,dIn3RS                     ;//  e3 = d1 + (d3>>1)   (overwrites dIn3RS, same D8)
        VADD        df0,de0,de3                         ;//  f0 = e0 + e3        (dIn* are dead from here on)
        VADD        df1,de1,de2                         ;//  f1 = e1 + e2
        VSUB        df2,de1,de2                         ;//  f2 = e1 - e2
        VSUB        df3,de0,de3                         ;//  f3 = e0 - e3



        ;*****************************************************************
        ;// Transpose the resultant matrix
        ;// (two 16-bit 2x2 swaps, then one 32-bit swap across Q0/Q1)
        ;*****************************************************************

        VTRN    df0,df1
        VTRN    df2,df3
        VTRN    qf01,qf23


        ;*******************************
        ;// Column Operations
        ;// (same butterfly as above, applied to the transposed data)
        ;*******************************


        VADD        dg0,df0,df2                         ;//  g0 = f0 + f2
        VSUB        dg1,df0,df2                         ;//  g1 = f0 - f2
        VHADD       df1RS,df1,dZero                     ;//  f1>>1, computed as (f1 + 0) >> 1
        VHADD       df3RS,df3,dZero                     ;//  f3>>1, computed as (f3 + 0) >> 1
        VSUB        dg2,df1RS,df3                       ;//  g2 = (f1>>1) - f3   (overwrites df1RS, same D7)
        VADD        dg3,df1,df3RS                       ;//  g3 = f1 + (f3>>1)   (overwrites df3RS, same D8)
        VADD        dh0,dg0,dg3                         ;//  h0 = g0 + g3
        VADD        dh1,dg1,dg2                         ;//  h1 = g1 + g2
        VSUB        dh2,dg1,dg2                         ;//  h2 = g1 - g2
        VSUB        dh3,dg0,dg3                         ;//  h3 = g0 - g3


        ;************************************************
        ;// Calculate final value (colOp[i][j] + 32)>>6
        ;// (VRSHR is the rounding shift, so the +32 is implicit)
        ;************************************************

        VRSHR       dh0,#6
        VRSHR       dh1,#6
        VRSHR       dh2,#6
        VRSHR       dh3,#6


        ;***************************
        ;// Store all the 4x4 values
        ;// (dh0..dh3 are written back-to-back, 8 bytes each)
        ;***************************

        VST1   {dh0,dh1,dh2,dh3},[pDst]


        ;// Set return value
        ;// (nothing is written to r0 here)

End


        ;// Write function tail
        M_END

    ENDIF                                                           ;//CortexA8

    END