;//
;// Provenance: armVCM4P10_TransformResidual4x4_s.s
;//             revision 0c1bc742181ded4930842b46e9507372f0b1b963
;//
;// File Name:  armVCM4P10_TransformResidual4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// Transform Residual 4x4 Coefficients
;//
;//


;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Declare the processor variants this file provides code for.
;// M_VARIANTS is not defined in this file (presumably it comes from
;// armCOMM_s.h); the IF below tests the ARM1136JS symbol.

        M_VARIANTS ARM1136JS

;// Import symbols required from other files
;// (For example tables)
;// This file contains no IMPORT directives.


;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;// Guarding implementation by the processor name

    IF  ARM1136JS

;//Input Registers
pDst                RN  0
pSrc                RN  1

;//Output Registers


;//Local Scratch Registers
;//
;// Several names below are bound to the same physical register on purpose.
;// Each group names what r2-r12/r14 hold at one stage of the routine, so a
;// name is only meaningful between the instruction that writes it and the
;// last instruction that reads it. Every register holds two 16-bit values.

;// Packed Input pixels
in00                RN  2                   ;// Src[0] & Src[1]
in02                RN  3                   ;// Src[2] & Src[3]
in10                RN  4                   ;// Src[4] & Src[5]
in12                RN  5                   ;// Src[6] & Src[7]
in20                RN  6                   ;// Src[8] & Src[9]
in22                RN  7                   ;// Src[10] & Src[11]
in30                RN  8                   ;// Src[12] & Src[13]
in32                RN  9                   ;// Src[14] & Src[15]

;// Transpose for Row operations (Rows to cols)
trRow00             RN  2
trRow10             RN  10
trRow02             RN  3
trRow12             RN  5
trRow20             RN  11
trRow30             RN  12
trRow32             RN  14
trRow22             RN  7

;// Intermediate calculations (row pass)
e0                  RN  4
e1                  RN  6
e2                  RN  8
e3                  RN  9
constZero           RN  1                   ;// Shares r1 with pSrc; set to 0 after the block is loaded

;// Row operated pixels
rowOp00             RN  2
rowOp10             RN  10
rowOp20             RN  11
rowOp30             RN  12
rowOp02             RN  3
rowOp12             RN  5
rowOp22             RN  7
rowOp32             RN  14

;// Transpose for column operations
trCol00             RN  2
trCol02             RN  3
trCol10             RN  4
trCol12             RN  5
trCol20             RN  6
trCol22             RN  7
trCol30             RN  8
trCol32             RN  9

;// Intermediate calculations (column pass)
g0                  RN  10
g1                  RN  11
g2                  RN  12
g3                  RN  14

;// Column operated pixels
colOp00             RN  2
colOp02             RN  3
colOp10             RN  4
colOp12             RN  5
colOp20             RN  6
colOp22             RN  7
colOp30             RN  8
colOp32             RN  9


temp1               RN  10                  ;// Temporary scratch variable (not referenced by the code in this file)
const1              RN  11                  ;// Rounding/bias constant for the final stage
const2              RN  12                  ;// Bias removal constant for the final stage
mask                RN  14                  ;// Clears the bits shifted across the halfword boundary

;// Output pixels
out00               RN  2
out02               RN  3
out10               RN  4
out12               RN  5
out20               RN  6
out22               RN  7
out30               RN  8
out32               RN  9



130    ;// Allocate stack memory required by the function
131
132
133    ;// Write function header
134        M_START armVCM4P10_TransformResidual4x4,r11
135
136        ;******************************************************************
137        ;// The strategy used in implementing the transform is as follows:*
138        ;// Load the 4x4 block into 8 registers                           *
139        ;// Transpose the 4x4 matrix                                      *
140        ;// Perform the row operations (on columns) using SIMD            *
141        ;// Transpose the 4x4 result matrix                               *
142        ;// Perform the coloumn operations                                *
143        ;// Store the 4x4 block at one go                                 *
144        ;******************************************************************
145
146        ;// Load all the 4x4 pixels
147
148        LDMIA   pSrc,{in00,in02,in10,in12,in20,in22,in30,in32}
149
150        MOV       constZero,#0                                     ;// Used to right shift by 1
151        ;LDR       constZero,=0x00000000
152
153        ;*****************************************************************
154        ;//
155        ;// Transpose the matrix inorder to perform row ops as coloumn ops
156        ;// Input:   in[][] = original matrix
157        ;// Output:  trRow[][]= transposed matrix
158        ;// Step1: Obtain the LL part of the transposed matrix
159        ;// Step2: Obtain the HL part
160        ;// step3: Obtain the LH part
161        ;// Step4: Obtain the HH part
162        ;//
163        ;*****************************************************************
164
165        ;// LL 2x2 transposed matrix
166        ;//   d0 d1 - -
167        ;//   d4 d5 - -
168        ;//   -  -  - -
169        ;//   -  -  - -
170
171        PKHTB   trRow10,in10,in00,ASR #16               ;// [5 4] = [f5:f1]
172        PKHBT   trRow00,in00,in10,LSL #16               ;// [1 0] = [f4:f0]
173
174        ;// HL 2x2 transposed matrix
175        ;//    -   -   - -
176        ;//    -   -   - -
177        ;//    d8  d9  - -
178        ;//   d12 d13  - -
179
180
181         PKHTB   trRow30,in12,in02,ASR #16              ;// [13 12] = [7 3]
182         PKHBT   trRow20,in02,in12,LSL #16              ;// [9 8] = [6 2]
183
184        ;// LH 2x2 transposed matrix
185        ;//   - - d2 d3
186        ;//   - - d6 d7
187        ;//   - - -  -
188        ;//   - - -  -
189
190        PKHBT   trRow02,in20,in30,LSL #16               ;// [3 2] = [f12:f8]
191        PKHTB   trRow12,in30,in20,ASR #16               ;// [7 6] = [f13:f9]
192
193
194
195
196        ;// HH 2x2 transposed matrix
197        ;//    - -   -   -
198        ;//    - -   -   -
199        ;//    - -  d10 d11
200        ;//    - -  d14 d15
201
202        PKHTB   trRow32,in32,in22,ASR #16               ;// [15 14] = [15 11]
203        PKHBT   trRow22,in22,in32,LSL #16               ;// [11 10] = [14 10]
204
205
206        ;****************************************
207        ;// Row Operations (Performed on columns)
208        ;****************************************
209
210
211        ;// SIMD operations on first two columns(two rows of the original matrix)
212
213
214        SADD16      e0, trRow00,trRow20                   ;//  e0 = d0 + d2
215        SSUB16    e1, trRow00,trRow20                   ;//  e1 = d0 - d2
216        SHADD16   e2, trRow10,constZero                 ;// (f1>>1) constZero is a register holding 0
217        SHADD16   e3, trRow30,constZero                 ;//  avoid pipeline stalls for e2 and e3
218        SSUB16    e2, e2, trRow30                       ;//  e2 = (d1>>1) - d3
219        SADD16    e3, e3, trRow10                       ;//  e3 = d1 + (d3>>1)
220        SADD16    rowOp00, e0, e3                       ;//  f0 = e0 + e3
221        SADD16    rowOp10, e1, e2                       ;//  f1 = e1 + e2
222        SSUB16    rowOp20, e1, e2                       ;//  f2 = e1 - e2
223        SSUB16    rowOp30, e0, e3                       ;//  f3 = e0 - e3
224
225        ;// SIMD operations on next two columns(next two rows of the original matrix)
226
227        SADD16      e0, trRow02,trRow22
228        SSUB16    e1, trRow02,trRow22
229        SHADD16   e2, trRow12,constZero                 ;//(f1>>1) constZero is a register holding 0
230        SHADD16   e3, trRow32,constZero
231        SSUB16    e2, e2, trRow32
232        SADD16    e3, e3, trRow12
233        SADD16    rowOp02, e0, e3
234        SADD16    rowOp12, e1, e2
235        SSUB16    rowOp22, e1, e2
236        SSUB16    rowOp32, e0, e3
237
238
239        ;*****************************************************************
240        ;// Transpose the resultant matrix
241        ;// Input:  rowOp[][]
242        ;// Output: trCol[][]
243        ;*****************************************************************
244
245        ;// LL 2x2 transposed matrix
246        ;//   d0 d1 - -
247        ;//   d4 d5 - -
248        ;//   -  -  - -
249        ;//   -  -  - -
250
251        PKHTB   trCol10,rowOp10,rowOp00,ASR #16           ;// [5 4] = [f5:f1]
252        PKHBT   trCol00,rowOp00,rowOp10,LSL #16           ;// [1 0] = [f4:f0]
253
254        ;// HL 2x2 transposed matrix
255        ;//    -   -   - -
256        ;//    -   -   - -
257        ;//    d8  d9  - -
258        ;//   d12 d13  - -
259
260
261         PKHTB   trCol30,rowOp12,rowOp02,ASR #16          ;// [13 12] = [7 3]
262         PKHBT   trCol20,rowOp02,rowOp12,LSL #16          ;// [9 8] = [6 2]
263
264        ;// LH 2x2 transposed matrix
265        ;//   - - d2 d3
266        ;//   - - d6 d7
267        ;//   - - -  -
268        ;//   - - -  -
269
270        PKHBT   trCol02,rowOp20,rowOp30,LSL #16           ;// [3 2] = [f12:f8]
271        PKHTB   trCol12,rowOp30,rowOp20,ASR #16           ;// [7 6] = [f13:f9]
272
273
274
275
276        ;// HH 2x2 transposed matrix
277        ;//    - -   -   -
278        ;//    - -   -   -
279        ;//    - -  d10 d11
280        ;//    - -  d14 d15
281
282        PKHTB   trCol32,rowOp32,rowOp22,ASR #16            ;// [15 14] = [15 11]
283        PKHBT   trCol22,rowOp22,rowOp32,LSL #16            ;// [11 10] = [14 10]
284
285
286        ;*******************************
287        ;// Coloumn Operations
288        ;*******************************
289
290
291        ;// SIMD operations on first two columns
292
293
294        SADD16      g0, trCol00,trCol20
295        SSUB16    g1, trCol00,trCol20
296        SHADD16   g2, trCol10,constZero                     ;// (f1>>1) constZero is a register holding 0
297        SHADD16   g3, trCol30,constZero
298        SSUB16    g2, g2, trCol30
299        SADD16    g3, g3, trCol10
300        SADD16    colOp00, g0, g3
301        SADD16    colOp10, g1, g2
302        SSUB16    colOp20, g1, g2
303        SSUB16    colOp30, g0, g3
304
305        ;// SIMD operations on next two columns
306
307        SADD16      g0, trCol02,trCol22
308        SSUB16    g1, trCol02,trCol22
309        SHADD16   g2, trCol12,constZero                     ;// (f1>>1) constZero is a register holding 0
310        SHADD16   g3, trCol32,constZero
311        SSUB16    g2, g2, trCol32
312        SADD16    g3, g3, trCol12
313        SADD16    colOp02, g0, g3
314        SADD16    colOp12, g1, g2
315        SSUB16    colOp22, g1, g2
316        SSUB16    colOp32, g0, g3
317
318
319
320
321
322        ;************************************************
323        ;// Calculate final value (colOp[i][j] + 32)>>6
324        ;************************************************
325
326        ;// const1: Serves dual purpose
327        ;// (1) Add #32 to both the lower and higher 16bits of the SIMD result
328        ;// (2) Convert the lower 16 bit value to an unsigned number (Add 32768)
329
330        LDR     const1, =0x00208020
331
332        LDR     mask, =0xffff03ff                       ;// Used to mask the down shifted 6 bits
333
334        ;// const2(#512): used to convert the lower 16bit number back to signed value
335
336        MOV     const2,#0x200                           ;// const2 = 2^9
337
338        ;// First Row
339
340        SADD16    colOp00, colOp00, const1
341        SADD16    colOp02, colOp02, const1
342        AND     colOp00, mask, colOp00, ASR #6
343        AND     colOp02, mask, colOp02, ASR #6
344        SSUB16  out00,colOp00,const2
345        SSUB16  out02,colOp02,const2
346
347
348        ;// Second Row
349
350        SADD16    colOp10, colOp10, const1
351        SADD16    colOp12, colOp12, const1
352        AND     colOp10, mask, colOp10, ASR #6
353        AND     colOp12, mask, colOp12, ASR #6
354        SSUB16  out10,colOp10,const2
355        SSUB16  out12,colOp12,const2
356
357
358        ;// Third Row
359
360        SADD16    colOp20, colOp20, const1
361        SADD16    colOp22, colOp22, const1
362        AND     colOp20, mask, colOp20, ASR #6
363        AND     colOp22, mask, colOp22, ASR #6
364        SSUB16  out20,colOp20,const2
365        SSUB16  out22,colOp22,const2
366
367
368        ;// Fourth Row
369
370        SADD16    colOp30, colOp30, const1
371        SADD16    colOp32, colOp32, const1
372        AND     colOp30, mask, colOp30, ASR #6
373        AND     colOp32, mask, colOp32, ASR #6
374        SSUB16  out30,colOp30,const2
375        SSUB16  out32,colOp32,const2
376
377
378
379
380        ;***************************
381        ;// Store all the 4x4 pixels
382        ;***************************
383
384        STMIA   pDst,{out00,out02,out10,out12,out20,out22,out30,out32}
385
386
387
388        ;// Set return value
389
390End
391
392
393        ;// Write function tail
394        M_END
395
396    ENDIF                                                           ;//ARM1136JS
397
398
399
400
401
402
403
404;// Guarding implementation by the processor name
405
406
407    END