1;//
2;//
3;// File Name:  omxVCM4P10_PredictIntraChroma_8x8_s.s
4;// OpenMAX DL: v1.0.2
5;// Revision:   12290
6;// Date:       Wednesday, April 9, 2008
7;//
8;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
9;//
10;//
11;//
12
13
14        INCLUDE omxtypes_s.h
15        INCLUDE armCOMM_s.h
16
;// Export the dispatch table symbol defined below
        EXPORT armVCM4P10_pIndexTable8x8

;// Define the processor variants supported by this file

         M_VARIANTS CortexA8

     AREA table, DATA
;//-------------------------------------------------------
;// This table implements a C-style switch statement in asm
;// using two levels of indexing: the prediction mode
;// selects a word in this table, and that word is the
;// address of the handler label which is loaded into pc.
;// Entry order: DC, HOR, VERT, PLANE.
;//-------------------------------------------------------

    M_TABLE armVCM4P10_pIndexTable8x8
    DCD  OMX_VC_CHROMA_DC,     OMX_VC_CHROMA_HOR
    DCD  OMX_VC_CHROMA_VERT,   OMX_VC_CHROMA_PLANE

;//-------------------------------------------------------
;// Signed 16-bit constants used by the plane predictor.
;//   Halfwords 0..3  : { 3, 2, 1, 4 }  weights applied to
;//                     the four edge differences when the
;//                     horizontal and vertical gradients
;//                     are accumulated.
;//   Halfwords 4..11 : { -3 .. 4 }  per-column / per-row
;//                     position factors multiplied by the
;//                     gradients b and c.
;// The routine reads the first 4 halfwords with one load
;// (with write-back) and the remaining 8 with a second.
;//-------------------------------------------------------

    M_TABLE armVCM4P10_MultiplierTableChroma8x8,1
    DCW   3, 2, 1,4
    DCW  -3,-2,-1,0
    DCW   1, 2, 3,4
37
38
39
40    IF CortexA8
41
;//--------------------------------------------
;// Scratch variable
;//
;// NOTE: 'return' shares r0 with pSrcLeft, so the
;// status code may only be written once the left
;// edge pointer is no longer needed on that path.
;//--------------------------------------------

pc              RN 15
return          RN 0
pTable          RN 8

;//--------------------------------------------
;// Input Arguments
;//
;// r0-r3 arrive in registers; leftStep, dstStep,
;// predMode and availability are loaded from the
;// stack into r4-r7 by the function prologue code.
;// pMultiplierTable reuses r2 (pSrcAboveLeft) and
;// is only valid after the above-left sample has
;// been loaded.
;//--------------------------------------------
pSrcLeft        RN 0    ;// input pointer
pSrcAbove       RN 1    ;// input pointer
pSrcAboveLeft   RN 2    ;// input pointer
pDst            RN 3    ;// output pointer
leftStep        RN 4    ;// input variable
dstStep         RN 5    ;// input variable
predMode        RN 6    ;// input variable
availability    RN 7    ;// input variable
pMultiplierTable    RN  2

pTmp            RN 9    ;// second row pointer (odd rows)
step            RN 10   ;// 2 * leftStep or 2 * dstStep

;//---------------------
;// Neon Registers
;//
;// The aliases below are grouped per prediction
;// mode. Different groups (and some names inside
;// the PLANE group) intentionally map to the same
;// physical D/Q registers; only one meaning is
;// live at any point in the code.
;//---------------------

;// OMX_VC_CHROMA_HOR
;// One D register per output row, each filled with a replicated left sample

dLeftVal0       DN  D0.8
dLeftVal1       DN  D1.8
dLeftVal2       DN  D2.8
dLeftVal3       DN  D3.8
dLeftVal4       DN  D4.8
dLeftVal5       DN  D5.8
dLeftVal6       DN  D6.8
dLeftVal7       DN  D7.8

;// OMX_VC_CHROMA_VERT

dAboveVal       DN  D0.U8

;// OMX_VC_CHROMA_DC

dLeftVal        DN  D1.U8
dSumAboveValU16 DN  D2.U16
dSumAboveValU32 DN  D3.U32
dSumAboveValU8  DN  D3.U8
dSumLeftValU16  DN  D2.U16
dSumLeftValU32  DN  D1.U32
dSumLeftValU8   DN  D1.U8
dSumAboveLeft   DN  D2.U32
dSumAboveLeftU8 DN  D2.U8
dIndexRow0U8    DN  D5.U8
dIndexRow0      DN  D5.U64
dIndexRow4U8    DN  D6.U8
dIndexRow4      DN  D6.U64
dDstRow0        DN  D0.U8
dDstRow4        DN  D4.U8
dConst128U8     DN  D0.U8

;// OMX_VC_CHROMA_PLANE

dRevAboveVal    DN  D3.U8
dRevAboveValU64 DN  D3.U64
dAboveLeftVal   DN  D2.U8
qAbove7minus0   QN  Q3.S16
qAboveDiff      QN  Q2.S16
dIndex          DN  D8.U8
dDiffAboveU8    DN  D9.U8
dDiffAboveS16   DN  D9.S16
dAboveDiff0U8   DN  D4.U8
dAboveDiff0U64  DN  D4.U64
dAbove7minus0U8 DN  D6.U8
dMultiplier     DN  D10.S16
dHorPred        DN  D11.S16
dRevLeftVal     DN  D3.U8
dRevLeftValU64  DN  D3.U64
qLeft7minus0    QN  Q7.S16
qLeftDiff       QN  Q6.S16
dDiffLeftU8     DN  D16.U8
dDiffLeftS16    DN  D16.S16
dLeftDiff0U8    DN  D12.U8
dLeftDiff0U64   DN  D12.U64
dLeft7minus0U8  DN  D14.U8
dVerPred        DN  D3.S16
dHVValS16       DN  D3.S16
dHVValS32       DN  D3.S32
dHVTempS32      DN  D2.S32
qA              QN  Q0.S16
qB              QN  Q2.S16
qC              QN  Q3.S16
qMultiplier     QN  Q5.S16
dMultiplier0    DN  D10.S16
dMultiplier1    DN  D11.S16
;// Per-row replicated c term (qC0..qC7) and per-row sums (qSum0..qSum7)
;// share the same Q registers: each sum overwrites its own c term.
qC0             QN  Q0.S16
qC1             QN  Q1.S16
qC2             QN  Q4.S16
qC3             QN  Q5.S16
qC4             QN  Q6.S16
qC5             QN  Q7.S16
qC6             QN  Q8.S16
qC7             QN  Q9.S16
qSum0           QN  Q0.S16
qSum1           QN  Q1.S16
qSum2           QN  Q4.S16
qSum3           QN  Q5.S16
qSum4           QN  Q6.S16
qSum5           QN  Q7.S16
qSum6           QN  Q8.S16
qSum7           QN  Q9.S16
;// Final 8-bit rows; D0..D7 so that the HOR path can reuse the PLANE store
dSum0           DN  D0.U8
dSum1           DN  D1.U8
dSum2           DN  D2.U8
dSum3           DN  D3.U8
dSum4           DN  D4.U8
dSum5           DN  D5.U8
dSum6           DN  D6.U8
dSum7           DN  D7.U8
162
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntraChroma_8x8 starts
;//
;// Writes an 8x8 block of predicted chroma samples to pDst, derived from the
;// neighbouring left column, upper row and upper-left sample.
;//
;// In  (registers): r0 = pSrcLeft, r1 = pSrcAbove, r2 = pSrcAboveLeft, r3 = pDst
;// In  (stack)    : LeftStep, DstStep, PredMode, Availability (4 bytes each)
;// Out            : r0 = OMX_Sts_NoErr on every path through this routine
;//
;// predMode selects one of four handlers through armVCM4P10_pIndexTable8x8:
;//   DC    - block averages; uses 'availability' to pick left/upper/both/none
;//   VERT  - every row is a copy of pSrcAbove[0..7]
;//   HOR   - row y is filled with pSrcLeft[y*leftStep]
;//   PLANE - linear plane fitted through the edge samples
;//
;// NOTE(review): predMode is used as a table index without a range check, and
;// only the DC handler consults 'availability'; callers are presumably expected
;// to pass valid arguments - confirm against the API specification.
;//-----------------------------------------------------------------------------------------------

        ;// Write function header
        ;// (r10, d15: highest core/NEON registers handed to the M_START macro,
        ;//  presumably the upper bound of the registers it preserves - see armCOMM_s.h)
        M_START omxVCM4P10_PredictIntraChroma_8x8, r10, d15

        ;// Define stack arguments
        M_ARG    LeftStep,     4
        M_ARG    DstStep,      4
        M_ARG    PredMode,     4
        M_ARG    Availability, 4

        LDR      pTable,=armVCM4P10_pIndexTable8x8   ;// Load index table for switch case

        ;// Load argument from the stack
        M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg
        M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
        M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg
        M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg


        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode

;//-----------------------------------------------------------------------------------------------
;// DC prediction
;//-----------------------------------------------------------------------------------------------
OMX_VC_CHROMA_DC

        TST     availability, #OMX_VC_LEFT
        BEQ     DCChroma8x8LeftNotAvailable

        ;// Walk the left column with two pointers (even rows / odd rows),
        ;// each advancing by two rows per load.
        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep

        ;// Load Left Edge
        VLD1    {dLeftVal[0]},[pSrcLeft],step               ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal[1]},[pTmp],step                   ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal[2]},[pSrcLeft],step               ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal[3]},[pTmp],step                   ;// pSrcLeft[3*leftStep]
        VLD1    {dLeftVal[4]},[pSrcLeft],step               ;// pSrcLeft[4*leftStep]
        VLD1    {dLeftVal[5]},[pTmp],step                   ;// pSrcLeft[5*leftStep]
        VLD1    {dLeftVal[6]},[pSrcLeft],step               ;// pSrcLeft[6*leftStep]
        VLD1    {dLeftVal[7]},[pTmp]                        ;// pSrcLeft[7*leftStep]

        TST     availability, #OMX_VC_UPPER
        BEQ     DCChroma8x8LeftOnlyAvailable

        ;// Load Upper Edge also
        VLD1     {dAboveVal},[pSrcAbove]                    ;// pSrcAbove[0 to 7]

        ;// r0 (pSrcLeft) is no longer needed, so it can now hold the status
        MOV      return, #OMX_Sts_NoErr                     ;// returnNoError

        VPADDL   dSumAboveValU16, dAboveVal                 ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL   dSumAboveValU32, dSumAboveValU16           ;// pSrcAbove[ 4+5+6+7 |  0+1+2+3 ]

        VPADDL   dSumLeftValU16, dLeftVal                   ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL   dSumLeftValU32, dSumLeftValU16             ;// pSrcLeft[ 4+5+6+7 |  0+1+2+3 ]

        VADD     dSumAboveLeft,dSumAboveValU32,dSumLeftValU32
        VRSHR    dSumAboveLeft,dSumAboveLeft,#3             ;// Sum = (Sum + 4) >> 3
        VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// Sum = (Sum + 2) >> 2
        VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// Sum = (Sum + 2) >> 2

        ;// Build byte-lookup indices that pick, per 4-pixel half row, which of the
        ;// three averages (both / above only / left only) is replicated.
        VMOV     dIndexRow0U8,#0x0c
        VMOV     dIndexRow4U8,#0x04
        VSHL     dIndexRow0,dIndexRow0,#32                  ;// index0 = 0x0c0c0c0c00000000
        VSHR     dIndexRow4,dIndexRow4,#32                  ;// index4 = 0x0000000004040404
        VADD     dIndexRow4U8,dIndexRow4U8,dIndexRow0U8     ;// index4 = 0x0c0c0c0c04040404
        VTBL     dDstRow0,{dSumAboveLeftU8,dSumAboveValU8},dIndexRow0U8  ;// rows 0-3: [ above(4..7) | both(0..3) ]
        VTBL     dDstRow4,{dSumLeftValU8,dSumAboveLeftU8},dIndexRow4U8   ;// rows 4-7: [ both(4..7)  | left(4..7) ]

        ;// Store rows 0-3 from dDstRow0 and rows 4-7 from dDstRow4
DCChroma8x8LeftStore
        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep

        VST1     {dDstRow0},[pDst],step                  ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1     {dDstRow0},[pTmp],step                  ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1     {dDstRow0},[pDst],step                  ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1     {dDstRow0},[pTmp],step                  ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1     {dDstRow4},[pDst],step                  ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1     {dDstRow4},[pTmp],step                  ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1     {dDstRow4},[pDst],step                  ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1     {dDstRow4},[pTmp]                       ;// pDst[7*dstStep+x] :0<= x <= 7

        M_EXIT


        ;// Left edge loaded, upper edge not available
DCChroma8x8LeftOnlyAvailable

        MOV      return, #OMX_Sts_NoErr

        VPADDL   dSumLeftValU16, dLeftVal                   ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL   dSumLeftValU32, dSumLeftValU16             ;// pSrcLeft[ 4+5+6+7 |  0+1+2+3 ]
        VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// Sum = (Sum + 2) >> 2

        VDUP     dDstRow0,dSumLeftValU8[0]                  ;// rows 0-3: average of left rows 0..3
        VDUP     dDstRow4,dSumLeftValU8[4]                  ;// rows 4-7: average of left rows 4..7

        B        DCChroma8x8LeftStore


        ;// Left edge not available
DCChroma8x8LeftNotAvailable

        TST     availability, #OMX_VC_UPPER
        BEQ     DCChroma8x8NoneAvailable

        ;// Load Upper Edge
        VLD1     {dAboveVal},[pSrcAbove]                    ;// pSrcAbove[0 to 7]
        MOV      return, #OMX_Sts_NoErr                     ;// returnNoError

        VPADDL   dSumAboveValU16, dAboveVal                 ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL   dSumAboveValU32, dSumAboveValU16           ;// pSrcAbove[ 4+5+6+7 |  0+1+2+3 ]
        VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// Sum = (Sum + 2) >> 2
        VMOV     dIndexRow0U8,#0x04
        VSHL     dIndexRow0,dIndexRow0,#32                  ;// index = 0x0404040400000000
        VTBL     dDstRow0,{dSumAboveValU8},dIndexRow0U8     ;// all rows: [ above(4..7) | above(0..3) ]

        B        DCChroma8x8UpperStore


        ;// Neither edge available: fill the block with the constant 128
DCChroma8x8NoneAvailable

        VMOV     dConst128U8,#0x80                          ;// 0x8080808080808080
        MOV      return, #OMX_Sts_NoErr                     ;// returnNoError

        ;// Store the same 8 bytes (dDstRow0 / D0) to all eight rows.
        ;// Also used by the VERT handler and the none-available DC case,
        ;// both of which leave their row in D0.
DCChroma8x8UpperStore

        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep

        VST1     {dDstRow0},[pDst],step                  ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1     {dDstRow0},[pTmp],step                  ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1     {dDstRow0},[pDst],step                  ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1     {dDstRow0},[pTmp],step                  ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1     {dDstRow0},[pDst],step                  ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1     {dDstRow0},[pTmp],step                  ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1     {dDstRow0},[pDst],step                  ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1     {dDstRow0},[pTmp]                       ;// pDst[7*dstStep+x] :0<= x <= 7

        M_EXIT


;//-----------------------------------------------------------------------------------------------
;// Vertical prediction: every row is a copy of the row above the block
;//-----------------------------------------------------------------------------------------------
OMX_VC_CHROMA_VERT

        VLD1     {dAboveVal},[pSrcAbove]                    ;// pSrcAbove[x]      :0<= x <= 7
        MOV      return, #OMX_Sts_NoErr

        B        DCChroma8x8UpperStore


;//-----------------------------------------------------------------------------------------------
;// Horizontal prediction: row y is filled with the left neighbour of row y
;//-----------------------------------------------------------------------------------------------
OMX_VC_CHROMA_HOR

        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep

        ;// Load each left sample replicated across all 8 lanes of its own D register
        VLD1    {dLeftVal0[]},[pSrcLeft],step           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal1[]},[pTmp],step               ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal2[]},[pSrcLeft],step           ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal3[]},[pTmp],step               ;// pSrcLeft[3*leftStep]
        VLD1    {dLeftVal4[]},[pSrcLeft],step           ;// pSrcLeft[4*leftStep]
        VLD1    {dLeftVal5[]},[pTmp],step               ;// pSrcLeft[5*leftStep]
        VLD1    {dLeftVal6[]},[pSrcLeft],step           ;// pSrcLeft[6*leftStep]
        VLD1    {dLeftVal7[]},[pTmp]                    ;// pSrcLeft[7*leftStep]

        ;// D0..D7 now hold rows 0..7; the PLANE store writes exactly those registers
        B        DCChroma8x8PlaneStore


;//-----------------------------------------------------------------------------------------------
;// Plane prediction
;//   H = sum over k=1..4 of k * (above[3+k] - above[3-k])   (above[-1] is the above-left sample)
;//   V = same expression on the left column
;//   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5,  a = 16 * (above[7] + left[7])
;//   pDst[y][x] = clip_u8((a + b*(x-3) + c*(y-3) + 16) >> 5)
;//-----------------------------------------------------------------------------------------------
OMX_VC_CHROMA_PLANE
        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep

        VLD1    {dAboveVal},[pSrcAbove]                     ;// pSrcAbove[x]      :0<= x <= 7
        VLD1    {dAboveLeftVal[0]},[pSrcAboveLeft]          ;// pSrcAboveLeft[0] into lane 0 only

        VLD1    {dLeftVal[0]},[pSrcLeft],step               ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal[1]},[pTmp],step                   ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal[2]},[pSrcLeft],step               ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal[3]},[pTmp],step                   ;// pSrcLeft[3*leftStep]
        VLD1    {dLeftVal[4]},[pSrcLeft],step               ;// pSrcLeft[4*leftStep]
        VLD1    {dLeftVal[5]},[pTmp],step                   ;// pSrcLeft[5*leftStep]
        VLD1    {dLeftVal[6]},[pSrcLeft],step               ;// pSrcLeft[6*leftStep]
        VLD1    {dLeftVal[7]},[pTmp]                        ;// pSrcLeft[7*leftStep]


        VREV64  dRevAboveVal,dAboveVal                      ;// Reverse order of bytes: lane 0 = pSrcAbove[7] ... lane 7 = pSrcAbove[0]
        VSUBL   qAbove7minus0,dRevAboveVal,dAboveLeftVal    ;// qAbove7minus0[0] = pSrcAbove[7] - pSrcAboveLeft[0] (other lanes unused)
        VSHR    dRevAboveValU64,dRevAboveValU64,#8          ;// lane 0 = pSrcAbove[6] ... lane 6 = pSrcAbove[0], lane 7 = 0
        VSUBL   qAboveDiff,dRevAboveVal,dAboveVal           ;// pSrcAbove[6] - pSrcAbove[0]
                                                            ;// pSrcAbove[5] - pSrcAbove[1]
                                                            ;// pSrcAbove[4] - pSrcAbove[2]

        VREV64  dRevLeftVal,dLeftVal                        ;// Reverse order of bytes: lane 0 = pSrcLeft[7] ... lane 7 = pSrcLeft[0]
        VSUBL   qLeft7minus0,dRevLeftVal,dAboveLeftVal      ;// qLeft7minus0[0] = pSrcLeft[7] - pSrcAboveLeft[0] (other lanes unused)
        VSHR    dRevLeftValU64,dRevLeftValU64,#8            ;// lane 0 = pSrcLeft[6] ... lane 6 = pSrcLeft[0], lane 7 = 0
        VSUBL   qLeftDiff,dRevLeftVal,dLeftVal              ;// pSrcLeft[6] - pSrcLeft[0]
                                                            ;// pSrcLeft[5] - pSrcLeft[1]
                                                            ;// pSrcLeft[4] - pSrcLeft[2]

        ;// r2 (pSrcAboveLeft) has been consumed above and is reused as the table pointer
        LDR     pMultiplierTable,=armVCM4P10_MultiplierTableChroma8x8   ;// Used to calculate Hval & Vval
        VSHL    dAboveDiff0U64,dAboveDiff0U64,#16                       ;// make room so VEXT can append the 7-(-1) term
        VEXT    dDiffAboveU8,dAboveDiff0U8,dAbove7minus0U8,#2           ;// pSrcAbove[ 7-(-1) | 4-2 | 5-1 | 6-0 ]
        VLD1    {dMultiplier},[pMultiplierTable]!                       ;// [ 4 | 1 | 2 | 3 ], pointer advances to the -3..4 row
        VSHL    dLeftDiff0U64,dLeftDiff0U64,#16
        VEXT    dDiffLeftU8,dLeftDiff0U8,dLeft7minus0U8,#2              ;// pSrcLeft[ 7-(-1) | 4-2 | 5-1 | 6-0 ]


        VMUL    dHorPred,dDiffAboveS16,dMultiplier                      ;// pSrcAbove[ 4*(7-(-1)) | 1*(4-2) | 2*(5-1) | 3*(6-0) ]
        VMUL    dVerPred,dDiffLeftS16,dMultiplier                       ;// same weighting on the left column
        VPADD   dHVValS16,dHorPred,dVerPred                             ;// pairwise partial sums: [ V hi | V lo | H hi | H lo ]


        VPADDL  dHVValS32,dHVValS16                                     ;// [V|H] in 32 bits each
        VSHL    dHVTempS32,dHVValS32,#4                                 ;// 17*H = 16*H + H = (H<<4)+H
        VADD    dHVValS32,dHVValS32,dHVTempS32                          ;// [ 17*V  | 17*H ]in 32 bits each
        VLD1    {dMultiplier0,dMultiplier1},[pMultiplierTable]          ;// qMultiplier = [ 4|3|2|1|0|-1|-2|-3 ]
        VRSHR   dHVValS32,dHVValS32,#5                                  ;// [c|b]; read below through S16 lanes 2 and 0
        VADDL   qA,dAboveVal,dLeftVal                                   ;// lane 7 = pSrcAbove[7] + pSrcLeft[7*leftStep]
        VDUP    qA,qA[7]
        VSHL    qA,qA,#4                                                ;// [a|a|a|a|a|a|a|a]
        VDUP    qB,dHVValS16[0]                                         ;// [b|b|b|b|b|b|b|b]
        VDUP    qC,dHVValS16[2]                                         ;// [c|c|c|c|c|c|c|c]


        VMUL    qB,qB,qMultiplier                                       ;// b*(x-3) for x = 0..7
        VMUL    qC,qC,qMultiplier                                       ;// c*(y-3) for y = 0..7
        VADD    qB,qB,qA                                                ;// a + b*(x-3): row-independent part

        ;// Replicate the row term c*(y-3) across a full Q register for each row y
        VDUP    qC0,qC[0]
        VDUP    qC1,qC[1]
        VDUP    qC2,qC[2]
        VDUP    qC3,qC[3]
        VDUP    qC4,qC[4]
        VDUP    qC5,qC[5]
        VDUP    qC6,qC[6]
        VDUP    qC7,qC[7]

        ;// Sum for row y = a + b*(x-3) + c*(y-3)
        VADD    qSum0,qB,qC0
        VADD    qSum1,qB,qC1
        VADD    qSum2,qB,qC2
        VADD    qSum3,qB,qC3
        VADD    qSum4,qB,qC4
        VADD    qSum5,qB,qC5
        VADD    qSum6,qB,qC6
        VADD    qSum7,qB,qC7

        VQRSHRUN dSum0,qSum0,#5                         ;// (OMX_U8)armClip(0,255,(Sum+16)>>5)
        VQRSHRUN dSum1,qSum1,#5
        VQRSHRUN dSum2,qSum2,#5
        VQRSHRUN dSum3,qSum3,#5
        VQRSHRUN dSum4,qSum4,#5
        VQRSHRUN dSum5,qSum5,#5
        VQRSHRUN dSum6,qSum6,#5
        VQRSHRUN dSum7,qSum7,#5

        ;// Store rows 0..7 from D0..D7 (shared by the HOR and PLANE handlers)
DCChroma8x8PlaneStore
        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep

        VST1    {dSum0},[pDst],step                  ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1    {dSum1},[pTmp],step                  ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1    {dSum2},[pDst],step                  ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1    {dSum3},[pTmp],step                  ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1    {dSum4},[pDst],step                  ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1    {dSum5},[pTmp],step                  ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1    {dSum6},[pDst],step                  ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1    {dSum7},[pTmp]                       ;// pDst[7*dstStep+x] :0<= x <= 7

        MOV     return, #OMX_Sts_NoErr
        M_END
430
431        ENDIF ;// CortexA8
432
433        END
434;//-----------------------------------------------------------------------------------------------
435;// omxVCM4P10_PredictIntraChroma_8x8 ends
436;//-----------------------------------------------------------------------------------------------
437