armVCM4P10_TransformResidual4x4_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  armVCM4P10_TransformResidual4x4_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   9641
21;// Date:       Thursday, February 7, 2008
22;//
23;//
24;//
25;//
26;// Description:
27;// Transform Residual 4x4 Coefficients
28;//
29;//
30
31
32;// Include standard headers
33
34        INCLUDE omxtypes_s.h
35        INCLUDE armCOMM_s.h
36
37        M_VARIANTS ARM1136JS
38
39;// Import symbols required from other files
40;// (For example tables)
41
42
43
44
45;// Set debugging level
46;//DEBUG_ON    SETL {TRUE}
47
48
49
50;// Guarding implementation by the processor name
51
52    IF  ARM1136JS
53
54;// Input registers
55pDst                RN  0                   ;// Destination pointer: base address of the final STMIA
56pSrc                RN  1                   ;// Source pointer: base address of the initial LDMIA (r1 is reused as constZero afterwards)
57
58;// Output registers (none are defined for this function)
59
60
61;// Local scratch registers. Each group below names the values of one phase of the transform.
62;// The groups reuse the same physical registers (r2-r12, r14), so an alias is only meaningful while its phase is live.
63;// Packed input pixels: two 16-bit Src elements per 32-bit register, in LDMIA order
64in00                RN  2                   ;// Src[0] & Src[1]
65in02                RN  3                   ;// Src[2] & Src[3]
66in10                RN  4                   ;// Src[4] & Src[5]
67in12                RN  5                   ;// Src[6] & Src[7]
68in20                RN  6                   ;// Src[8] & Src[9]
69in22                RN  7                   ;// Src[10] & Src[11]
70in30                RN  8                   ;// Src[12] & Src[13]
71in32                RN  9                   ;// Src[14] & Src[15]
72
73;// Transposed input used by the row operations (rows to cols); contents shown as [high halfword : low halfword]
74trRow00             RN  2                   ;// [Src4  : Src0]
75trRow10             RN  10                  ;// [Src5  : Src1]
76trRow02             RN  3                   ;// [Src12 : Src8]
77trRow12             RN  5                   ;// [Src13 : Src9]
78trRow20             RN  11                  ;// [Src6  : Src2]
79trRow30             RN  12                  ;// [Src7  : Src3]
80trRow32             RN  14                  ;// [Src15 : Src11]
81trRow22             RN  7                   ;// [Src14 : Src10]
82
83;// Intermediate calculations of the row operations (per halfword lane)
84e0                  RN  4                   ;// d0 + d2
85e1                  RN  6                   ;// d0 - d2
86e2                  RN  8                   ;// (d1>>1) - d3
87e3                  RN  9                   ;// d1 + (d3>>1)
88constZero           RN  1                   ;// Holds 0 for the SHADD16 halving trick; same register as pSrc
89
90;// Row operated pixels (xx0 registers: first group of row operations, xx2 registers: second group)
91rowOp00             RN  2                   ;// e0 + e3
92rowOp10             RN  10                  ;// e1 + e2
93rowOp20             RN  11                  ;// e1 - e2
94rowOp30             RN  12                  ;// e0 - e3
95rowOp02             RN  3                   ;// e0 + e3
96rowOp12             RN  5                   ;// e1 + e2
97rowOp22             RN  7                   ;// e1 - e2
98rowOp32             RN  14                  ;// e0 - e3
99
100;// Transposed row-operation results used by the column operations
101trCol00             RN  2
102trCol02             RN  3
103trCol10             RN  4
104trCol12             RN  5
105trCol20             RN  6
106trCol22             RN  7
107trCol30             RN  8
108trCol32             RN  9
109
110;// Intermediate calculations of the column operations (same butterfly as e0..e3)
111g0                  RN  10                  ;// d0 + d2
112g1                  RN  11                  ;// d0 - d2
113g2                  RN  12                  ;// (d1>>1) - d3
114g3                  RN  14                  ;// d1 + (d3>>1)
115
116;// Column operated pixels (written over the trCol registers of the same name suffix)
117colOp00             RN  2
118colOp02             RN  3
119colOp10             RN  4
120colOp12             RN  5
121colOp20             RN  6
122colOp22             RN  7
123colOp30             RN  8
124colOp32             RN  9
125
126;// Constants for the final (x + 32) >> 6 stage; they reuse the g0..g3 registers
127temp1               RN  10                  ;// Temporary scratch variable (not referenced by any instruction in this file)
128const1              RN  11                  ;// Loaded with 0x00208020 (rounding value plus low-halfword bias)
129const2              RN  12                  ;// Loaded with 0x200 (removes the low-halfword bias after the shift)
130mask                RN  14                  ;// Loaded with 0xffff03ff (clears the top 6 bits of the low halfword)
131
132;// Output pixels (same registers as colOp; stored to pDst in this order)
133out00               RN  2
134out02               RN  3
135out10               RN  4
136out12               RN  5
137out20               RN  6
138out22               RN  7
139out30               RN  8
140out32               RN  9
141
142
143
144    ;// Allocate stack memory required by the function (none is requested here)
145
146    ;// NOTE(review): M_START/M_END are macros from armCOMM_s.h (not visible here). The r11 argument presumably asks for
147    ;// registers up to r11 to be preserved. r14 is used as scratch below, so the return address must be saved too - confirm.
148        M_START armVCM4P10_TransformResidual4x4,r11
149
150        ;******************************************************************
151        ;// The strategy used in implementing the transform is as follows:*
152        ;// Load the 4x4 block into 8 registers                           *
153        ;// Transpose the 4x4 matrix                                      *
154        ;// Perform the row operations (on columns) using SIMD            *
155        ;// Transpose the 4x4 result matrix                               *
156        ;// Perform the column operations                                 *
157        ;// Store the 4x4 block in one go                                 *
158        ;******************************************************************
159
160        ;// Load all 16 values of the 4x4 block (8 words, two 16-bit values per word)
161
162        LDMIA   pSrc,{in00,in02,in10,in12,in20,in22,in30,in32}
163
164        MOV       constZero,#0                                     ;// constZero = 0: SHADD16 x,constZero yields x>>1 per halfword. Overwrites pSrc (r1), which is no longer needed.
165        ;LDR       constZero,=0x00000000
166
167        ;*****************************************************************
168        ;//
169        ;// Transpose the matrix in order to perform row ops as column ops
170        ;// Input:   in[][] = original matrix
171        ;// Output:  trRow[][]= transposed matrix
172        ;// Step1: Obtain the LL part of the transposed matrix
173        ;// Step2: Obtain the HL part
174        ;// Step3: Obtain the LH part
175        ;// Step4: Obtain the HH part
176        ;//
177        ;*****************************************************************
178
179        ;// LL 2x2 transposed matrix
180        ;//   d0 d1 - -
181        ;//   d4 d5 - -
182        ;//   -  -  - -
183        ;//   -  -  - -
184
185        PKHTB   trRow10,in10,in00,ASR #16               ;// trRow10 = [hi(in10) : hi(in00)] = [Src5 : Src1]
186        PKHBT   trRow00,in00,in10,LSL #16               ;// trRow00 = [lo(in10) : lo(in00)] = [Src4 : Src0]
187
188        ;// HL 2x2 transposed matrix
189        ;//    -   -   - -
190        ;//    -   -   - -
191        ;//    d8  d9  - -
192        ;//   d12 d13  - -
193
194
195         PKHTB   trRow30,in12,in02,ASR #16              ;// trRow30 = [hi(in12) : hi(in02)] = [Src7 : Src3]
196         PKHBT   trRow20,in02,in12,LSL #16              ;// trRow20 = [lo(in12) : lo(in02)] = [Src6 : Src2]
197
198        ;// LH 2x2 transposed matrix
199        ;//   - - d2 d3
200        ;//   - - d6 d7
201        ;//   - - -  -
202        ;//   - - -  -
203
204        PKHBT   trRow02,in20,in30,LSL #16               ;// trRow02 = [lo(in30) : lo(in20)] = [Src12 : Src8]
205        PKHTB   trRow12,in30,in20,ASR #16               ;// trRow12 = [hi(in30) : hi(in20)] = [Src13 : Src9]
206
207
208
209
210        ;// HH 2x2 transposed matrix
211        ;//    - -   -   -
212        ;//    - -   -   -
213        ;//    - -  d10 d11
214        ;//    - -  d14 d15
215
216        PKHTB   trRow32,in32,in22,ASR #16               ;// trRow32 = [hi(in32) : hi(in22)] = [Src15 : Src11]
217        PKHBT   trRow22,in22,in32,LSL #16               ;// trRow22 = [lo(in32) : lo(in22)] = [Src14 : Src10]; in22 (r7) is read before being overwritten
218
219
220        ;****************************************
221        ;// Row Operations (Performed on columns)
222        ;****************************************
223
224
225        ;// SIMD operations on first two columns(two rows of the original matrix)
226        ;// Each halfword lane holds one row: d0..d3 are that row's four inputs, f0..f3 its four outputs
227
228        SADD16      e0, trRow00,trRow20                   ;//  e0 = d0 + d2
229        SSUB16    e1, trRow00,trRow20                   ;//  e1 = d0 - d2
230        SHADD16   e2, trRow10,constZero                 ;//  e2 = d1>>1 (halving add with constZero, which holds 0)
231        SHADD16   e3, trRow30,constZero                 ;//  e3 = d3>>1 (both halvings are issued before either result is used, to avoid pipeline stalls)
232        SSUB16    e2, e2, trRow30                       ;//  e2 = (d1>>1) - d3
233        SADD16    e3, e3, trRow10                       ;//  e3 = d1 + (d3>>1)
234        SADD16    rowOp00, e0, e3                       ;//  f0 = e0 + e3
235        SADD16    rowOp10, e1, e2                       ;//  f1 = e1 + e2
236        SSUB16    rowOp20, e1, e2                       ;//  f2 = e1 - e2
237        SSUB16    rowOp30, e0, e3                       ;//  f3 = e0 - e3
238
239        ;// SIMD operations on next two columns(next two rows of the original matrix)
240
241        SADD16      e0, trRow02,trRow22                   ;//  e0 = d0 + d2
242        SSUB16    e1, trRow02,trRow22                   ;//  e1 = d0 - d2
243        SHADD16   e2, trRow12,constZero                 ;//  e2 = d1>>1 (halving add with constZero, which holds 0)
244        SHADD16   e3, trRow32,constZero                 ;//  e3 = d3>>1
245        SSUB16    e2, e2, trRow32                       ;//  e2 = (d1>>1) - d3
246        SADD16    e3, e3, trRow12                       ;//  e3 = d1 + (d3>>1)
247        SADD16    rowOp02, e0, e3                       ;//  f0 = e0 + e3
248        SADD16    rowOp12, e1, e2                       ;//  f1 = e1 + e2
249        SSUB16    rowOp22, e1, e2                       ;//  f2 = e1 - e2
250        SSUB16    rowOp32, e0, e3                       ;//  f3 = e0 - e3
251
252
253        ;*****************************************************************
254        ;// Transpose the resultant matrix
255        ;// Input:  rowOp[][]
256        ;// Output: trCol[][]
257        ;*****************************************************************
258
259        ;// LL 2x2 transposed matrix
260        ;//   d0 d1 - -
261        ;//   d4 d5 - -
262        ;//   -  -  - -
263        ;//   -  -  - -
264
265        PKHTB   trCol10,rowOp10,rowOp00,ASR #16           ;// trCol10 = [hi(rowOp10) : hi(rowOp00)]
266        PKHBT   trCol00,rowOp00,rowOp10,LSL #16           ;// trCol00 = [lo(rowOp10) : lo(rowOp00)]
267
268        ;// HL 2x2 transposed matrix
269        ;//    -   -   - -
270        ;//    -   -   - -
271        ;//    d8  d9  - -
272        ;//   d12 d13  - -
273
274
275         PKHTB   trCol30,rowOp12,rowOp02,ASR #16          ;// trCol30 = [hi(rowOp12) : hi(rowOp02)]
276         PKHBT   trCol20,rowOp02,rowOp12,LSL #16          ;// trCol20 = [lo(rowOp12) : lo(rowOp02)]
277
278        ;// LH 2x2 transposed matrix
279        ;//   - - d2 d3
280        ;//   - - d6 d7
281        ;//   - - -  -
282        ;//   - - -  -
283
284        PKHBT   trCol02,rowOp20,rowOp30,LSL #16           ;// trCol02 = [lo(rowOp30) : lo(rowOp20)]; rowOp02 (r3) was consumed by the HL step above
285        PKHTB   trCol12,rowOp30,rowOp20,ASR #16           ;// trCol12 = [hi(rowOp30) : hi(rowOp20)]; rowOp12 (r5) was consumed by the HL step above
286
287
288
289
290        ;// HH 2x2 transposed matrix
291        ;//    - -   -   -
292        ;//    - -   -   -
293        ;//    - -  d10 d11
294        ;//    - -  d14 d15
295
296        PKHTB   trCol32,rowOp32,rowOp22,ASR #16            ;// trCol32 = [hi(rowOp32) : hi(rowOp22)]
297        PKHBT   trCol22,rowOp22,rowOp32,LSL #16            ;// trCol22 = [lo(rowOp32) : lo(rowOp22)]; rowOp22 (r7) is read before being overwritten
298
299
300        ;*******************************
301        ;// Column Operations
302        ;*******************************
303
304
305        ;// SIMD operations on first two columns
306        ;// Same butterfly as the row operations; each result overwrites a trCol register that is already dead
307
308        SADD16      g0, trCol00,trCol20                     ;// g0 = d0 + d2
309        SSUB16    g1, trCol00,trCol20                     ;// g1 = d0 - d2
310        SHADD16   g2, trCol10,constZero                     ;// g2 = d1>>1 (halving add with constZero, which holds 0)
311        SHADD16   g3, trCol30,constZero                     ;// g3 = d3>>1
312        SSUB16    g2, g2, trCol30                           ;// g2 = (d1>>1) - d3
313        SADD16    g3, g3, trCol10                           ;// g3 = d1 + (d3>>1)
314        SADD16    colOp00, g0, g3                           ;// g0 + g3
315        SADD16    colOp10, g1, g2                           ;// g1 + g2
316        SSUB16    colOp20, g1, g2                           ;// g1 - g2
317        SSUB16    colOp30, g0, g3                           ;// g0 - g3
318
319        ;// SIMD operations on next two columns
320
321        SADD16      g0, trCol02,trCol22                     ;// g0 = d0 + d2
322        SSUB16    g1, trCol02,trCol22                     ;// g1 = d0 - d2
323        SHADD16   g2, trCol12,constZero                     ;// g2 = d1>>1 (halving add with constZero, which holds 0)
324        SHADD16   g3, trCol32,constZero                     ;// g3 = d3>>1
325        SSUB16    g2, g2, trCol32                           ;// g2 = (d1>>1) - d3
326        SADD16    g3, g3, trCol12                           ;// g3 = d1 + (d3>>1)
327        SADD16    colOp02, g0, g3                           ;// g0 + g3
328        SADD16    colOp12, g1, g2                           ;// g1 + g2
329        SSUB16    colOp22, g1, g2                           ;// g1 - g2
330        SSUB16    colOp32, g0, g3                           ;// g0 - g3
331
332
333
334
335
336        ;************************************************
337        ;// Calculate final value (colOp[i][j] + 32)>>6
338        ;************************************************
339
340        ;// const1: Serves dual purpose
341        ;// (1) Adds 32 (0x0020) to both the lower and the higher 16 bits of the SIMD result
342        ;// (2) Additionally adds 32768 (0x8000) to the lower 16 bits, biasing that value to an unsigned number
343
344        LDR     const1, =0x00208020
345
346        LDR     mask, =0xffff03ff                       ;// Clears the top 6 bits of the low halfword, where the 32-bit ASR #6 shifts in bits from the high halfword
347
348        ;// const2(#512): used to convert the lower 16bit number back to a signed value (512 = 32768 >> 6)
349
350        MOV     const2,#0x200                           ;// const2 = 2^9; its high halfword is 0, so SSUB16 leaves the high result unchanged
351
352        ;// First Row
353
354        SADD16    colOp00, colOp00, const1              ;// high += 32, low += 32 + 32768
355        SADD16    colOp02, colOp02, const1
356        AND     colOp00, mask, colOp00, ASR #6          ;// shift the whole word right by 6, then drop the bits that crossed into the low halfword
357        AND     colOp02, mask, colOp02, ASR #6
358        SSUB16  out00,colOp00,const2                    ;// low -= 512 removes the shifted bias; high -= 0
359        SSUB16  out02,colOp02,const2
360
361
362        ;// Second Row (same sequence as the first row)
363
364        SADD16    colOp10, colOp10, const1
365        SADD16    colOp12, colOp12, const1
366        AND     colOp10, mask, colOp10, ASR #6
367        AND     colOp12, mask, colOp12, ASR #6
368        SSUB16  out10,colOp10,const2
369        SSUB16  out12,colOp12,const2
370
371
372        ;// Third Row (same sequence as the first row)
373
374        SADD16    colOp20, colOp20, const1
375        SADD16    colOp22, colOp22, const1
376        AND     colOp20, mask, colOp20, ASR #6
377        AND     colOp22, mask, colOp22, ASR #6
378        SSUB16  out20,colOp20,const2
379        SSUB16  out22,colOp22,const2
380
381
382        ;// Fourth Row (same sequence as the first row)
383
384        SADD16    colOp30, colOp30, const1
385        SADD16    colOp32, colOp32, const1
386        AND     colOp30, mask, colOp30, ASR #6
387        AND     colOp32, mask, colOp32, ASR #6
388        SSUB16  out30,colOp30,const2
389        SSUB16  out32,colOp32,const2
390
391
392
393
394        ;***************************
395        ;// Store all the 4x4 pixels
396        ;***************************
397
398        STMIA   pDst,{out00,out02,out10,out12,out20,out22,out30,out32}
399
400
401
402        ;// Set return value (no instruction here writes r0, so it still holds pDst)
403
404End
405
406
407        ;// Write function tail
408        M_END
409
410    ENDIF                                                           ;//ARM1136JS
411
412
413
414
415
416
417
418;// Guarding implementation by the processor name
419
420
421    END
422