omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
1;//
2;// (c) Copyright 2007 ARM Limited. All Rights Reserved.
3;//
4;// Description:
5;// H.264 inverse quantize and transform module
6;//
7;//
8
9
10
11;// Include standard headers
12
13        INCLUDE omxtypes_s.h
14        INCLUDE armCOMM_s.h
15
16;// Import symbols required from other files
17;// (For example tables)
18
19        IMPORT armVCM4P10_UnpackBlock4x4
20        IMPORT armVCM4P10_TransformResidual4x4
21        IMPORT armVCM4P10_QPDivTable
22        IMPORT armVCM4P10_VMatrixU16
23        IMPORT armVCM4P10_QPModuloTable
24
25    M_VARIANTS ARM1136JS, ARM1136JS_U
26
27;// Set debugging level
28;//DEBUG_ON    SETL {TRUE}
29
30
;// ---------------------------------------------------------------------------
;// Static function: armVCM4P10_DequantLumaAC4x4              (ARM1136JS build)
;//
;// Scales a 4x4 block of 16-bit coefficients in place:
;//
;//     pSrcDst[i] = low16( pSrcDst[i] * (V[col(i)] << Shift) ),   i = 0..15
;//
;// where
;//     Shift = armVCM4P10_QPDivTable[QP]                 (QP / 6)
;//     V     = the three U16 entries of armVCM4P10_VMatrixU16 found at byte
;//             offset armVCM4P10_QPModuloTable[QP]       ((QP % 6) * 6)
;//     col() = 0,2,0,2 on the 1st and 3rd rows, 2,1,2,1 on the 2nd and 4th
;//
;// In  : r0 = pointer to the 16 coefficients (read and written as 8 words)
;//       r1 = QP
;// Out : the block at r0 is overwritten with the scaled coefficients
;//
;// The code below writes r1-r12 and r14.
;// NOTE(review): M_START ...,r11 is expected to save/restore r4-r11 and lr
;//               (macro defined in armCOMM_s.h) - confirm there.
;// ---------------------------------------------------------------------------

    IF  ARM1136JS

;// Arguments
pCoefBlk        RN  0
qpIn            RN  1

;// Table look-up registers; all of them are recycled once the scale
;// factors have been built
pModTab         RN  5
pDivTab         RN  4
pScaleRow       RN  2
rowOffset       RN  6
qpShift         RN  3
scaleTmp        RN  14

;// Scale factors, already shifted left by Shift
scale01         RN  1                   ;// [V1<<Shift | V0<<Shift]
scale2          RN  4                   ;// [    0     | V2<<Shift]

;// Coefficient pairs: top halfword = odd index, bottom halfword = even index
coef01          RN  5
coef23          RN  6
coef45          RN  7
coef67          RN  8
coef89          RN  9
coefAB          RN  10
coefCD          RN  11
coefEF          RN  12

;// Products of the odd-index coefficients, parked until they are packed
prodOddA        RN  2
prodOddB        RN  3
prodOddC        RN  14


        ;// No stack buffer is needed by this function
        M_START armVCM4P10_DequantLumaAC4x4,r11

        ;// Addresses of the three look-up tables
        LDR     pModTab,=armVCM4P10_QPModuloTable
        LDR     pDivTab,=armVCM4P10_QPDivTable
        LDR     pScaleRow,=armVCM4P10_VMatrixU16

        LDRSB   rowOffset,[pModTab,qpIn]            ;// byte offset of the V row: (QP%6) * 6
        LDRSB   qpShift,[pDivTab,qpIn]              ;// Shift = QP / 6

        ;// Fetch V0, V1, V2 one halfword at a time (no unaligned word access)
        LDRH    scale01,[pScaleRow,rowOffset]!      ;// scale01  = [ 0|V0], pScaleRow -> selected row
        LDRH    scaleTmp,[pScaleRow,#2]             ;// scaleTmp = [ 0|V1]
        LDRH    scale2,[pScaleRow,#4]               ;// scale2   = [ 0|V2]
        ORR     scale01,scale01,scaleTmp,LSL #16    ;// scale01  = [V1|V0]

        ;// All 16 coefficients in one go, two per register
        LDMIA   pCoefBlk,{coef01,coef23,coef45,coef67,coef89,coefAB,coefCD,coefEF}

        ;// Shift is stated to lie in [0,8], so each shifted V entry still
        ;// fits in its own halfword and the packed pair [V1|V0] can be
        ;// shifted with a single LSL.
        LSL     scale01,scale01,qpShift
        LSL     scale2,scale2,qpShift

        ;// The 16 multiplications are fully unrolled.  Only V0, V1 and V2 are
        ;// ever needed, so no further table access takes place.  Every
        ;// product is reduced to 16 bits when PKHBT packs two of them into
        ;// one word.  The PKHBT of one pair is issued among the multiplies
        ;// of the following pair to avoid pipeline stalls.

        ;// ---- 1st row: coefficients 0..3, factors V0,V2,V0,V2 ----
        SMULTB  prodOddA,coef01,scale2              ;// c[1] * (V2<<Shift)
        SMULBB  coef01,coef01,scale01               ;// c[0] * (V0<<Shift)

        SMULTB  prodOddB,coef23,scale2              ;// c[3] * (V2<<Shift)
        SMULBB  coef23,coef23,scale01               ;// c[2] * (V0<<Shift)

        PKHBT   coef01,coef01,prodOddA,LSL #16      ;// coef01 = [c1'|c0']

        ;// ---- 2nd row: coefficients 4..7, factors V2,V1,V2,V1 ----
        SMULTT  prodOddA,coef45,scale01             ;// c[5] * (V1<<Shift)
        SMULBB  coef45,coef45,scale2                ;// c[4] * (V2<<Shift)

        PKHBT   coef23,coef23,prodOddB,LSL #16      ;// coef23 = [c3'|c2']
        SMULTT  prodOddB,coef67,scale01             ;// c[7] * (V1<<Shift)
        SMULBB  coef67,coef67,scale2                ;// c[6] * (V2<<Shift)

        PKHBT   coef45,coef45,prodOddA,LSL #16      ;// coef45 = [c5'|c4']

        ;// ---- 3rd row: coefficients 8..11, factors V0,V2,V0,V2 ----
        SMULTB  prodOddA,coef89,scale2              ;// c[9]  * (V2<<Shift)
        SMULBB  coef89,coef89,scale01               ;// c[8]  * (V0<<Shift)

        PKHBT   coef67,coef67,prodOddB,LSL #16      ;// coef67 = [c7'|c6']
        SMULTB  prodOddB,coefAB,scale2              ;// c[11] * (V2<<Shift)
        SMULBB  coefAB,coefAB,scale01               ;// c[10] * (V0<<Shift)

        PKHBT   coef89,coef89,prodOddA,LSL #16      ;// coef89 = [c9'|c8']

        ;// ---- 4th row: coefficients 12..15, factors V2,V1,V2,V1 ----
        SMULTT  prodOddA,coefCD,scale01             ;// c[13] * (V1<<Shift)
        SMULBB  coefCD,coefCD,scale2                ;// c[12] * (V2<<Shift)

        ;// prodOddB still holds c[11]', so a third holding register is used
        SMULTT  prodOddC,coefEF,scale01             ;// c[15] * (V1<<Shift)
        SMULBB  coefEF,coefEF,scale2                ;// c[14] * (V2<<Shift)

        PKHBT   coefAB,coefAB,prodOddB,LSL #16      ;// coefAB = [c11'|c10']
        PKHBT   coefCD,coefCD,prodOddA,LSL #16      ;// coefCD = [c13'|c12']
        PKHBT   coefEF,coefEF,prodOddC,LSL #16      ;// coefEF = [c15'|c14']

        ;// Write the whole scaled block back over the input
        STMIA   pCoefBlk,{coef01,coef23,coef45,coef67,coef89,coefAB,coefCD,coefEF}

        ;// No return value is set

        M_END

    ENDIF                                                    ;//ARM1136JS
176
177
;// ---------------------------------------------------------------------------
;// Static function: armVCM4P10_DequantLumaAC4x4            (ARM1136JS_U build)
;//
;// Guarding implementation by the processor name.
;//
;// Scales a 4x4 block of 16-bit coefficients in place:
;//
;//     pSrcDst[i] = low16( pSrcDst[i] * (pVRow[PosToVCol(i)] << Shift) )
;//
;// with Shift = armVCM4P10_QPDivTable[QP] (QP / 6) and pVRow pointing at the
;// three U16 entries of armVCM4P10_VMatrixU16 located at byte offset
;// armVCM4P10_QPModuloTable[QP] ((QP % 6) * 6).
;//
;// In  : r0 = pSrcDst, 16 coefficients (read and written as 8 words)
;//       r1 = QP
;// Out : the block at r0 is overwritten with the scaled coefficients
;//
;// Unlike the ARM1136JS variant, pVRow[0] and pVRow[1] are fetched with a
;// single word load whose address is a multiple of 6 bytes from the table
;// base, i.e. it is not always word aligned.
;// NOTE(review): this presumes the _U variant is only built for targets
;//               with unaligned word access enabled - confirm.
;// NOTE(review): M_START ...,r11 is expected to save/restore r4-r11 and lr
;//               (macro defined in armCOMM_s.h) - confirm there.
;// ---------------------------------------------------------------------------

    IF  ARM1136JS_U

;//Input Registers
pSrcDst       RN  0
QP            RN  1


;//Output Registers


;//Local Scratch Registers
pQPdiv          RN  4
pQPmod          RN  5
pVRow           RN  2
QPmod           RN  6
shift           RN  3
rowLuma01       RN  1
rowLuma23       RN  4

SrcDst00        RN  5
SrcDst02        RN  6
SrcDst10        RN  7
SrcDst12        RN  8
SrcDst20        RN  9
SrcDst22        RN  10
SrcDst30        RN  11
SrcDst32        RN  12

temp1           RN  2
temp2           RN  3
temp3           RN  14


        ;// No stack memory is required by this function

        ;// Write function header
        M_START armVCM4P10_DequantLumaAC4x4,r11

        LDR    pQPmod,=armVCM4P10_QPModuloTable
        LDR    pQPdiv,=armVCM4P10_QPDivTable
        LDR    pVRow,=armVCM4P10_VMatrixU16

        LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6
        LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6

        LDR    rowLuma01,[pVRow,QPmod]!             ;// rowLuma01 = [0b|0a]

        ;// Only pVRow[2] is needed here: every use of rowLuma23 below reads
        ;// its bottom halfword only.  A word load at this offset would also
        ;// fetch the halfword that follows the three-entry row (for the last
        ;// row that is whatever lies behind it, presumably past the end of
        ;// the table) and would be a second unaligned access for three of the
        ;// six rows.  A halfword load yields the same results without either.
        LDRH   rowLuma23,[pVRow,#4]                 ;// rowLuma23 = [00|0c]

        ;// Load all the 16 'src' values
        LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;//*********************************************************************************************
        ;//
        ;// 'Shift' ranges between [0,8]
        ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
        ;//
        ;//*********************************************************************************************

        LSL    rowLuma01,rowLuma01,shift
        LSL    rowLuma23,rowLuma23,shift


        ;//**********************************************************************************************
        ;//
        ;// The idea is to unroll the loop completely
        ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
        ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
        ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
        ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
        ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift which fits into 16 bits (above)
        ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
        ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
        ;//
        ;// We then pack the two 16 bit multiplication results into a word and store all words at one go
        ;//
        ;//**********************************************************************************************


        ;// Row 1

        SMULTB  temp1,SrcDst00,rowLuma23                    ;// pSrcDst[1] * (pVRow[2]<<Shift)
        SMULBB  SrcDst00,SrcDst00,rowLuma01                 ;// pSrcDst[0] * (pVRow[0]<<Shift)

        SMULTB  temp2,SrcDst02,rowLuma23                    ;// pSrcDst[3] * (pVRow[2]<<Shift)
        SMULBB  SrcDst02,SrcDst02,rowLuma01                 ;// pSrcDst[2] * (pVRow[0]<<Shift)

        PKHBT   SrcDst00,SrcDst00,temp1,LSL #16             ;// Pack the first two product values


        ;// Row 2
        SMULTT  temp1,SrcDst10,rowLuma01                    ;// pSrcDst[5] * (pVRow[1]<<Shift)
        SMULBB  SrcDst10,SrcDst10,rowLuma23                 ;// pSrcDst[4] * (pVRow[2]<<Shift)

        PKHBT   SrcDst02,SrcDst02,temp2,LSL #16             ;// Pack the next two product values
        SMULTT  temp2,SrcDst12,rowLuma01                    ;// pSrcDst[7] * (pVRow[1]<<Shift)
        SMULBB  SrcDst12,SrcDst12,rowLuma23                 ;// pSrcDst[6] * (pVRow[2]<<Shift)

        PKHBT   SrcDst10,SrcDst10,temp1,LSL #16             ;// Pack the next two product values


        ;// Row 3

        SMULTB  temp1,SrcDst20,rowLuma23                    ;// pSrcDst[9] * (pVRow[2]<<Shift)
        SMULBB  SrcDst20,SrcDst20,rowLuma01                 ;// pSrcDst[8] * (pVRow[0]<<Shift)

        PKHBT   SrcDst12,SrcDst12,temp2,LSL #16             ;// Pack the next two product values
        SMULTB  temp2,SrcDst22,rowLuma23                    ;// pSrcDst[11] * (pVRow[2]<<Shift)
        SMULBB  SrcDst22,SrcDst22,rowLuma01                 ;// pSrcDst[10] * (pVRow[0]<<Shift)

        PKHBT   SrcDst20,SrcDst20,temp1,LSL #16             ;// Pack the next two product values



        ;// Row 4

        SMULTT  temp1,SrcDst30,rowLuma01                    ;// pSrcDst[13] * (pVRow[1]<<Shift)
        SMULBB  SrcDst30,SrcDst30,rowLuma23                 ;// pSrcDst[12] * (pVRow[2]<<Shift)

        ;// temp2 still holds the pSrcDst[11] product, hence temp3 here
        SMULTT  temp3,SrcDst32,rowLuma01                    ;// pSrcDst[15] * (pVRow[1]<<Shift)
        SMULBB  SrcDst32,SrcDst32,rowLuma23                 ;// pSrcDst[14] * (pVRow[2]<<Shift)

        PKHBT   SrcDst22,SrcDst22,temp2,LSL #16             ;// Pack the remaining product values
        PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
        PKHBT   SrcDst32,SrcDst32,temp3,LSL #16


        ;// Store all 16 scaled values back over the input block
        STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;// No return value is set



        ;// Write function tail
        M_END

    ENDIF                                                    ;//ARM1136JS_U
319
320
321
322
323
;// ---------------------------------------------------------------------------
;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
;//                                                           (ARM1136JS build)
;//
;// Builds a 4x4 block of 16-bit residuals in a 32-byte stack buffer, adds it
;// to a 4x4 block of 8-bit prediction samples and stores the clipped 8-bit
;// result.
;//
;// In  : r0      = ppSrc    (handed unchanged to armVCM4P10_UnpackBlock4x4)
;//       r1      = pPred    (4 prediction bytes per row, read as one word)
;//       r2      = pDC      (may be 0 when AC != 0)
;//       r3      = pDst     (4 output bytes per row, written as one word)
;//       stack   = predStep, dstStep, QP, AC   (4 bytes each, in this order)
;// Out : r0      = OMX_Sts_NoErr (the only value ever returned)
;//
;// AC != 0 : unpack -> armVCM4P10_DequantLumaAC4x4 -> if pDC != 0 replace
;//           residual[0] by *pDC -> armVCM4P10_TransformResidual4x4
;// AC == 0 : all 16 residuals = (*pDC + 32) >> 6; pDC is dereferenced
;//           without a null check on this path
;//
;// NOTE(review): r4 and r7-r9 must survive the three BL calls; this relies
;//               on the callees preserving them - confirm for the imported
;//               routines.
;// NOTE(review): no ARM1136JS_U counterpart of this function is present in
;//               this file.
;// ---------------------------------------------------------------------------

    IF  ARM1136JS

;// Arguments as they arrive
ppSrc           RN  0
pPred           RN  1
pDC             RN  2
pDst            RN  3

;// Return value
result          RN  0

;// Values that must outlive the helper calls
pResid          RN  4                   ;// walks the 32-byte residual buffer
pPredRow        RN  7                   ;// copy of pPred, advanced row by row
pDCKeep         RN  8                   ;// copy of pDC
pDstRow         RN  9                   ;// copy of pDst, advanced row by row

;// Set-up phase
acFlag          RN  5                   ;// AC, read from the stack
callArg0        RN  0
callArg1        RN  1
qpArg           RN  1                   ;// QP, read from the stack
dcPair          RN  10                  ;// DC value, later [DC|DC]
dcPairDup       RN  11                  ;// second register of the STRD pair

;// Add-prediction loop
predStride      RN  1
dstStride       RN  10
rowsLeft        RN  0
predWord        RN  8                   ;// four prediction bytes [d c b a]
predEven        RN  3                   ;// [0 c 0 a]
predOdd         RN  5                   ;// [0 d 0 b]
resLow          RN  6                   ;// residuals [B A] as loaded
resOdd          RN  11                  ;// residuals [D C] as loaded, then [D B]
resEven         RN  2                   ;// residuals [C A]
pixEven         RN  12
pixOdd          RN  14


        ;// 8-byte aligned scratch buffer for the 16 residuals
        M_ALLOC8 pBuffer, 32

        M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11

        ;// Arguments passed on the stack
        M_ARG   predStepOnStack, 4
        M_ARG   dstStepOnStack,4
        M_ARG   QPOnStack, 4
        M_ARG   ACOnStack,4

        M_ADR   pResid,pBuffer
        M_LDR   acFlag,ACOnStack

        ;// r1-r3 are reused for the calls below, so keep copies
        MOV     pPredRow,pPred
        MOV     pDCKeep,pDC
        MOV     pDstRow,pDst

        CMP     acFlag,#0
        BEQ     DcOnlyResidual

        ;// ---- AC path ------------------------------------------------------
        MOV     callArg1,pResid                     ;// r1 = residual buffer, r0 is still ppSrc
        BL      armVCM4P10_UnpackBlock4x4

        M_LDR   qpArg,QPOnStack                     ;// r1 = QP
        MOV     callArg0,pResid                     ;// r0 = residual buffer
        BL      armVCM4P10_DequantLumaAC4x4

        ;// A caller-supplied DC value, if any, overrides residual[0].  The
        ;// two MOVs leave the flags of the CMP untouched.
        CMP     pDCKeep,#0
        LDRSHNE dcPair,[pDCKeep]
        MOV     callArg0,pResid                     ;// r0 = residual buffer
        MOV     callArg1,pResid                     ;// r1 = residual buffer
        STRHNE  dcPair,[pResid]

        BL      armVCM4P10_TransformResidual4x4
        B       AddPrediction


        ;// ---- DC-only path -------------------------------------------------
DcOnlyResidual
        LDRSH   dcPair,[pDCKeep]
        ADD     dcPair,dcPair,#32
        ASR     dcPair,dcPair,#6                    ;// (DC + 32) >> 6
        PKHBT   dcPair,dcPair,dcPair,LSL #16        ;// same value in both halfwords
        MOV     dcPairDup,dcPair                    ;// STRD stores two registers
        STRD    dcPair,[pResid,#0]                  ;// residual[0..3]
        STRD    dcPair,[pResid,#8]                  ;// residual[4..7]
        STRD    dcPair,[pResid,#16]                 ;// residual[8..11]
        STRD    dcPair,[pResid,#24]                 ;// residual[12..15]


        ;// ---- residual + prediction, four rows -----------------------------
AddPrediction
        M_LDR   predStride,predStepOnStack
        M_LDR   dstStride,dstStepOnStack

        ;// Data for the first row is fetched ahead of the loop; inside the
        ;// loop each pass fetches the data of the following row.
        LDMIA   pResid!,{resLow,resOdd}
        MOV     rowsLeft,#4
        LDR     predWord,[pPredRow]

RowLoop
        ;// The flags set here drive the two conditional loads and the BGT;
        ;// no instruction in between writes N, Z, C or V.
        SUBS    rowsLeft,rowsLeft,#1
        ADD     pPredRow,pPredRow,predStride        ;// next prediction row

        ;// Regroup into even and odd columns so that they line up with the
        ;// byte-unpacked prediction halves
        PKHBT   resEven,resLow,resOdd,LSL #16       ;// resEven = [C A]
        PKHTB   resOdd,resOdd,resLow,ASR #16        ;// resOdd  = [D B]

        UXTB16  predEven,predWord                   ;// predEven = [0c0a]
        UXTB16  predOdd,predWord,ROR #8             ;// predOdd  = [0d0b]

        LDRGT   predWord,[pPredRow]                 ;// prediction of the next row, if any

        QADD16  pixOdd,resOdd,predOdd               ;// saturating 16-bit adds
        QADD16  pixEven,resEven,predEven

        USAT16  pixOdd,#8,pixOdd                    ;// clip to 0..255
        USAT16  pixEven,#8,pixEven

        LDMGTIA pResid!,{resLow,resOdd}             ;// residuals of the next row, if any

        ORR     pixEven,pixEven,pixOdd,LSL #8       ;// four bytes [d c b a]
        STR     pixEven,[pDstRow]

        ADD     pDstRow,pDstRow,dstStride           ;// next destination row
        BGT     RowLoop


        ;// Return status
        MOV     result,#OMX_Sts_NoErr

End

        M_END

    ENDIF                                                    ;//ARM1136JS
471
472
473;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
474
475;// Guarding implementation by the processor name
476
477
478
479
480    END
481