omxVCM4P10_PredictIntraChroma_8x8_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  omxVCM4P10_PredictIntraChroma_8x8_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   12290
21;// Date:       Wednesday, April 9, 2008
22;//
23;//
24;//
25;//
26
27
28        INCLUDE omxtypes_s.h
29        INCLUDE armCOMM_s.h
30
31        EXPORT armVCM4P10_pIndexTable8x8
32
33;// Define the processor variants supported by this file
34
35         M_VARIANTS CortexA8
36
37     AREA table, DATA
38;//-------------------------------------------------------
39;// This table implements a C-style switch in asm by
40;// the method of two levels of indexing: predMode selects
  ;// a table entry, and the entry holds the address of the
  ;// case label that the function then loads into pc.
  ;// Entry order (index 0..3): DC, HOR, VERT, PLANE.
41;//-------------------------------------------------------
42
43    M_TABLE armVCM4P10_pIndexTable8x8
44    DCD  OMX_VC_CHROMA_DC,     OMX_VC_CHROMA_HOR
45    DCD  OMX_VC_CHROMA_VERT,   OMX_VC_CHROMA_PLANE
46
  ;// 16-bit constants for the PLANE case.
  ;//   Row 0 is loaded on its own and multiplies the four edge
  ;//   differences that form the horizontal and vertical sums.
  ;//   Rows 1-2 are loaded together as one 8-lane vector
  ;//   (-3,-2,-1,0,1,2,3,4) and multiply the b and c slopes.
47    M_TABLE armVCM4P10_MultiplierTableChroma8x8,1
48    DCW   3, 2, 1,4
49    DCW  -3,-2,-1,0
50    DCW   1, 2, 3,4
51
52
53
54    IF CortexA8
55
56;//--------------------------------------------
57;// Scratch variable
58;//--------------------------------------------
59
60pc              RN 15
61return          RN 0    ;// status code written before exit; shares r0 with pSrcLeft
62pTable          RN 8    ;// address of armVCM4P10_pIndexTable8x8
63
64;//--------------------------------------------
65;// Input Arguments
66;//--------------------------------------------
67pSrcLeft        RN 0    ;// input pointer: left-edge samples, one every leftStep
68pSrcAbove       RN 1    ;// input pointer: eight samples of the row above
69pSrcAboveLeft   RN 2    ;// input pointer: above-left corner sample (read by PLANE only)
70pDst            RN 3    ;// output pointer: eight rows, one every dstStep
71leftStep        RN 4    ;// input variable, loaded from the stack
72dstStep         RN 5    ;// input variable, loaded from the stack
73predMode        RN 6    ;// input variable, loaded from the stack; index into the jump table
74availability    RN 7    ;// input variable, loaded from the stack; tested against OMX_VC_LEFT / OMX_VC_UPPER
75pMultiplierTable    RN  2
  ;// pMultiplierTable shares r2 with pSrcAboveLeft; the PLANE case loads it
  ;// only after its single read through pSrcAboveLeft.
76
77pTmp            RN 9    ;// second row pointer (base + one step) used for the odd rows
78step            RN 10   ;// twice leftStep or twice dstStep: post-increment for both row pointers
79
80;//---------------------
81;// Neon Registers
82;//---------------------
  ;// The aliases below overlap: each prediction mode gives its own names
  ;// to the same physical D/Q registers, so a name is only meaningful on
  ;// the code path that writes it.
  ;// NOTE(review): D8-D15 are used by the PLANE names; presumably the
  ;// "d15" argument of M_START covers saving them - confirm in armCOMM_s.h.
83
84;// OMX_VC_CHROMA_HOR
  ;// One D register per output row; each is filled with one replicated
  ;// left-edge sample and stored through the dSum0..dSum7 aliases (D0..D7).
85
86dLeftVal0       DN  D0.8
87dLeftVal1       DN  D1.8
88dLeftVal2       DN  D2.8
89dLeftVal3       DN  D3.8
90dLeftVal4       DN  D4.8
91dLeftVal5       DN  D5.8
92dLeftVal6       DN  D6.8
93dLeftVal7       DN  D7.8
94
95;// OMX_VC_CHROMA_VERT
96
97dAboveVal       DN  D0.U8       ;// the eight above samples; also used by the DC and PLANE paths
98
99;// OMX_VC_CHROMA_DC
100
101dLeftVal        DN  D1.U8      ;// eight left samples gathered lane by lane; also used by PLANE
102dSumAboveValU16 DN  D2.U16     ;// pairwise sums of the above samples (temporary)
103dSumAboveValU32 DN  D3.U32     ;// above sums per group of four samples, then their rounded averages
104dSumAboveValU8  DN  D3.U8      ;// byte view of the above averages, used as a VTBL table
105dSumLeftValU16  DN  D2.U16     ;// pairwise sums of the left samples (same register as dSumAboveValU16)
106dSumLeftValU32  DN  D1.U32     ;// left sums per group of four samples, then their rounded averages
107dSumLeftValU8   DN  D1.U8      ;// byte view of the left averages
108dSumAboveLeft   DN  D2.U32     ;// above + left sums per half, then their rounded averages
109dSumAboveLeftU8 DN  D2.U8      ;// byte view of the combined averages
110dIndexRow0U8    DN  D5.U8      ;// VTBL byte indices for rows 0-3
111dIndexRow0      DN  D5.U64
112dIndexRow4U8    DN  D6.U8      ;// VTBL byte indices for rows 4-7
113dIndexRow4      DN  D6.U64
114dDstRow0        DN  D0.U8      ;// output row pattern for rows 0-3 (all rows in the upper-only cases)
115dDstRow4        DN  D4.U8      ;// output row pattern for rows 4-7
116dConst128U8     DN  D0.U8      ;// same register as dDstRow0, so the shared store code writes it directly
117
118;// OMX_VC_CHROMA_PLANE
119
120dRevAboveVal    DN  D3.U8
121dRevAboveValU64 DN  D3.U64
122dAboveLeftVal   DN  D2.U8      ;// only lane 0 is loaded
123qAbove7minus0   QN  Q3.S16
124qAboveDiff      QN  Q2.S16
125dIndex          DN  D8.U8      ;// not referenced by the code in this file
126dDiffAboveU8    DN  D9.U8
127dDiffAboveS16   DN  D9.S16
128dAboveDiff0U8   DN  D4.U8      ;// low half of qAboveDiff
129dAboveDiff0U64  DN  D4.U64
130dAbove7minus0U8 DN  D6.U8      ;// low half of qAbove7minus0
131dMultiplier     DN  D10.S16
132dHorPred        DN  D11.S16
133dRevLeftVal     DN  D3.U8      ;// reuses D3 once the above differences have been formed
134dRevLeftValU64  DN  D3.U64
135qLeft7minus0    QN  Q7.S16
136qLeftDiff       QN  Q6.S16
137dDiffLeftU8     DN  D16.U8
138dDiffLeftS16    DN  D16.S16
139dLeftDiff0U8    DN  D12.U8     ;// low half of qLeftDiff
140dLeftDiff0U64   DN  D12.U64
141dLeft7minus0U8  DN  D14.U8     ;// low half of qLeft7minus0
142dVerPred        DN  D3.S16
143dHVValS16       DN  D3.S16
144dHVValS32       DN  D3.S32
145dHVTempS32      DN  D2.S32
146qA              QN  Q0.S16
147qB              QN  Q2.S16
148qC              QN  Q3.S16
149qMultiplier     QN  Q5.S16     ;// Q5 = D10:D11, i.e. dMultiplier0:dMultiplier1
150dMultiplier0    DN  D10.S16
151dMultiplier1    DN  D11.S16
152qC0             QN  Q0.S16
153qC1             QN  Q1.S16
154qC2             QN  Q4.S16
155qC3             QN  Q5.S16
156qC4             QN  Q6.S16
157qC5             QN  Q7.S16
158qC6             QN  Q8.S16
159qC7             QN  Q9.S16
  ;// qSumN names the same register as qCN: each row sum is formed in place.
160qSum0           QN  Q0.S16
161qSum1           QN  Q1.S16
162qSum2           QN  Q4.S16
163qSum3           QN  Q5.S16
164qSum4           QN  Q6.S16
165qSum5           QN  Q7.S16
166qSum6           QN  Q8.S16
167qSum7           QN  Q9.S16
  ;// dSum0..dSum7 (D0..D7) are the eight rows written by DCChroma8x8PlaneStore.
168dSum0           DN  D0.U8
169dSum1           DN  D1.U8
170dSum2           DN  D2.U8
171dSum3           DN  D3.U8
172dSum4           DN  D4.U8
173dSum5           DN  D5.U8
174dSum6           DN  D6.U8
175dSum7           DN  D7.U8
176
177;//-----------------------------------------------------------------------------------------------
178;// omxVCM4P10_PredictIntraChroma_8x8 starts
   ;//
   ;// Writes an 8x8 block of predicted samples to pDst, using the
   ;// prediction case selected by predMode (DC, HOR, VERT or PLANE).
   ;//
   ;// In:   r0 = pSrcLeft, r1 = pSrcAbove, r2 = pSrcAboveLeft, r3 = pDst
   ;//       stack: leftStep, dstStep, predMode, availability (4 bytes each)
   ;// Out:  r0 = OMX_Sts_NoErr on every path in this file
   ;//
   ;// NOTE(review): predMode indexes the jump table without a range check
   ;// and only the DC case tests availability; presumably the caller
   ;// guarantees valid arguments - confirm against the API contract.
179;//-----------------------------------------------------------------------------------------------
180
181        ;// Write function header
182        M_START omxVCM4P10_PredictIntraChroma_8x8, r10, d15
183
184        ;// Define stack arguments
185        M_ARG    LeftStep,     4
186        M_ARG    DstStep,      4
187        M_ARG    PredMode,     4
188        M_ARG    Availability, 4
189
190        LDR      pTable,=armVCM4P10_pIndexTable8x8   ;// Load index table for switch case
191
192        ;// Load argument from the stack
193        M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg
194        M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
195        M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg
196        M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg
197
198
199        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode (4-byte table entries)
200
201OMX_VC_CHROMA_DC
           ;// DC case. Each 4x4 quadrant of the output is filled with a rounded
           ;// average of edge samples; which edges contribute depends on the
           ;// OMX_VC_LEFT and OMX_VC_UPPER bits of availability.
202
203        TST     availability, #OMX_VC_LEFT
204        BEQ     DCChroma8x8LeftNotAvailable
205
206        ADD     pTmp, pSrcLeft, leftStep
207        ADD     step, leftStep, leftStep
208
209        ;// Load Left Edge: pSrcLeft walks the even rows, pTmp the odd rows
210        VLD1    {dLeftVal[0]},[pSrcLeft],step               ;// pSrcLeft[0*leftStep]
211        VLD1    {dLeftVal[1]},[pTmp],step                   ;// pSrcLeft[1*leftStep]
212        VLD1    {dLeftVal[2]},[pSrcLeft],step               ;// pSrcLeft[2*leftStep]
213        VLD1    {dLeftVal[3]},[pTmp],step                   ;// pSrcLeft[3*leftStep]
214        VLD1    {dLeftVal[4]},[pSrcLeft],step               ;// pSrcLeft[4*leftStep]
215        VLD1    {dLeftVal[5]},[pTmp],step                   ;// pSrcLeft[5*leftStep]
216        VLD1    {dLeftVal[6]},[pSrcLeft],step               ;// pSrcLeft[6*leftStep]
217        VLD1    {dLeftVal[7]},[pTmp]                        ;// pSrcLeft[7*leftStep]
218
219        TST     availability, #OMX_VC_UPPER
220        BEQ     DCChroma8x8LeftOnlyAvailable
221
222        ;// Load Upper Edge also
223        VLD1     dAboveVal,[pSrcAbove]                      ;// pSrcAbove[0 to 7]
224
225        MOV      return, #OMX_Sts_NoErr                     ;// returnNoError
226
227        VPADDL   dSumAboveValU16, dAboveVal                 ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
228        VPADDL   dSumAboveValU32, dSumAboveValU16           ;// pSrcAbove[ 4+5+6+7 |  0+1+2+3 ]
229
230        VPADDL   dSumLeftValU16, dLeftVal                   ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
231        VPADDL   dSumLeftValU32, dSumLeftValU16             ;// pSrcLeft[ 4+5+6+7 |  0+1+2+3 ]
232
233        VADD     dSumAboveLeft,dSumAboveValU32,dSumLeftValU32
234        VRSHR    dSumAboveLeft,dSumAboveLeft,#3             ;// Sum = (Sum + 4) >> 3
235        VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// Sum = (Sum + 2) >> 2
236        VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// Sum = (Sum + 2) >> 2
237
           ;// Each average now sits in byte 0 (first four samples) and byte 4
           ;// (last four samples) of its register. Build VTBL byte indices that
           ;// pick one average per 4-column half from a two-register table,
           ;// where index 8+n addresses byte n of the second register.
238        VMOV     dIndexRow0U8,#0x0c
239        VMOV     dIndexRow4U8,#0x04
240        VSHL     dIndexRow0,dIndexRow0,#32                  ;// index0 = 0x0c0c0c0c00000000
241        VSHR     dIndexRow4,dIndexRow4,#32                  ;// index4 = 0x0000000004040404
242        VADD     dIndexRow4U8,dIndexRow4U8,dIndexRow0U8     ;// index4 = 0x0c0c0c0c04040404
           ;// Rows 0-3: cols 0-3 = avg(above[0..3], left[0..3]), cols 4-7 = avg(above[4..7])
243        VTBL     dDstRow0,{dSumAboveLeftU8,dSumAboveValU8},dIndexRow0U8
           ;// Rows 4-7: cols 0-3 = avg(left[4..7]), cols 4-7 = avg(above[4..7], left[4..7])
244        VTBL     dDstRow4,{dSumLeftValU8,dSumAboveLeftU8},dIndexRow4U8
245
246DCChroma8x8LeftStore
           ;// Shared store for the DC paths that have a left edge: rows 0-3 get
           ;// dDstRow0, rows 4-7 get dDstRow4. pDst walks the even rows and pTmp
           ;// the odd rows, both advancing by two destination steps.
247        ADD     pTmp, pDst, dstStep
248        ADD     step, dstStep, dstStep
249
250        VST1     dDstRow0,[pDst],step                    ;// pDst[0*dstStep+x] :0<= x <= 7
251        VST1     dDstRow0,[pTmp],step                    ;// pDst[1*dstStep+x] :0<= x <= 7
252        VST1     dDstRow0,[pDst],step                    ;// pDst[2*dstStep+x] :0<= x <= 7
253        VST1     dDstRow0,[pTmp],step                    ;// pDst[3*dstStep+x] :0<= x <= 7
254        VST1     dDstRow4,[pDst],step                    ;// pDst[4*dstStep+x] :0<= x <= 7
255        VST1     dDstRow4,[pTmp],step                    ;// pDst[5*dstStep+x] :0<= x <= 7
256        VST1     dDstRow4,[pDst],step                    ;// pDst[6*dstStep+x] :0<= x <= 7
257        VST1     dDstRow4,[pTmp]                         ;// pDst[7*dstStep+x] :0<= x <= 7
258
259        M_EXIT
260
261
262DCChroma8x8LeftOnlyAvailable
           ;// DC with only the left edge: dLeftVal has already been loaded by
           ;// the DC entry code. Rows 0-3 are filled with avg(left[0..3]) and
           ;// rows 4-7 with avg(left[4..7]).
263
264        MOV      return, #OMX_Sts_NoErr
265
266        VPADDL   dSumLeftValU16, dLeftVal                   ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
267        VPADDL   dSumLeftValU32, dSumLeftValU16             ;// pSrcLeft[ 4+5+6+7 |  0+1+2+3 ]
268        VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// Sum = (Sum + 2) >> 2
269
270        VDUP     dDstRow0,dSumLeftValU8[0]                  ;// byte 0 = average of the first four left samples
271        VDUP     dDstRow4,dSumLeftValU8[4]                  ;// byte 4 = average of the last four left samples
272
273        B        DCChroma8x8LeftStore
274
275
276DCChroma8x8LeftNotAvailable
           ;// DC without a left edge. If the upper edge is available, every row
           ;// becomes avg(above[0..3]) in cols 0-3 and avg(above[4..7]) in
           ;// cols 4-7; otherwise control goes to the constant-fill case.
277
278        TST     availability, #OMX_VC_UPPER
279        BEQ     DCChroma8x8NoneAvailable
280
281        ;// Load Upper Edge
282        VLD1     dAboveVal,[pSrcAbove]                      ;// pSrcAbove[0 to 7]
283        MOV      return, #OMX_Sts_NoErr                     ;// returnNoError
284
285        VPADDL   dSumAboveValU16, dAboveVal                 ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
286        VPADDL   dSumAboveValU32, dSumAboveValU16           ;// pSrcAbove[ 4+5+6+7 |  0+1+2+3 ]
287        VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// Sum = (Sum + 2) >> 2
288        VMOV     dIndexRow0U8,#0x04
289        VSHL     dIndexRow0,dIndexRow0,#32                  ;// index = 0x0404040400000000
290        VTBL     dDstRow0,{dSumAboveValU8},dIndexRow0U8     ;// cols 0-3 <- byte 0, cols 4-7 <- byte 4 of the averages
291
292        B        DCChroma8x8UpperStore
293
294
295DCChroma8x8NoneAvailable
           ;// DC with neither edge available: fill the block with 0x80.
           ;// dConst128U8 and dDstRow0 name the same register (D0), so falling
           ;// through into the store below writes the constant.
296
297        VMOV     dConst128U8,#0x80                          ;// 0x8080808080808080 if(count == 0)
298        MOV      return, #OMX_Sts_NoErr                     ;// returnNoError
299
300DCChroma8x8UpperStore
           ;// Shared store that writes the same 8-byte pattern (dDstRow0) to all
           ;// eight rows. Reached from the upper-only DC case, the constant-fill
           ;// case above, and OMX_VC_CHROMA_VERT.
301
302        ADD     pTmp, pDst, dstStep
303        ADD     step, dstStep, dstStep
304
305        VST1     dDstRow0,[pDst],step                    ;// pDst[0*dstStep+x] :0<= x <= 7
306        VST1     dDstRow0,[pTmp],step                    ;// pDst[1*dstStep+x] :0<= x <= 7
307        VST1     dDstRow0,[pDst],step                    ;// pDst[2*dstStep+x] :0<= x <= 7
308        VST1     dDstRow0,[pTmp],step                    ;// pDst[3*dstStep+x] :0<= x <= 7
309        VST1     dDstRow0,[pDst],step                    ;// pDst[4*dstStep+x] :0<= x <= 7
310        VST1     dDstRow0,[pTmp],step                    ;// pDst[5*dstStep+x] :0<= x <= 7
311        VST1     dDstRow0,[pDst],step                    ;// pDst[6*dstStep+x] :0<= x <= 7
312        VST1     dDstRow0,[pTmp]                         ;// pDst[7*dstStep+x] :0<= x <= 7
313
314        M_EXIT
315
316
317OMX_VC_CHROMA_VERT
           ;// Vertical case: every output row is a copy of the eight above
           ;// samples. dAboveVal and dDstRow0 name the same register (D0), so
           ;// the shared all-rows store can be reused unchanged.
318
319        VLD1     dAboveVal,[pSrcAbove]                      ;// pSrcAbove[x]      :0<= x <= 7
320        MOV      return, #OMX_Sts_NoErr
321
322        B        DCChroma8x8UpperStore
323
324
325OMX_VC_CHROMA_HOR
           ;// Horizontal case: row n of the output is left sample n repeated
           ;// eight times. Each load replicates one byte across a whole D
           ;// register (D0..D7), which are the registers that
           ;// DCChroma8x8PlaneStore writes as rows 0..7.
326
327        ADD     pTmp, pSrcLeft, leftStep
328        ADD     step, leftStep, leftStep
329
330        VLD1    {dLeftVal0[]},[pSrcLeft],step           ;// pSrcLeft[0*leftStep]
331        VLD1    {dLeftVal1[]},[pTmp],step               ;// pSrcLeft[1*leftStep]
332        VLD1    {dLeftVal2[]},[pSrcLeft],step           ;// pSrcLeft[2*leftStep]
333        VLD1    {dLeftVal3[]},[pTmp],step               ;// pSrcLeft[3*leftStep]
334        VLD1    {dLeftVal4[]},[pSrcLeft],step           ;// pSrcLeft[4*leftStep]
335        VLD1    {dLeftVal5[]},[pTmp],step               ;// pSrcLeft[5*leftStep]
336        VLD1    {dLeftVal6[]},[pSrcLeft],step           ;// pSrcLeft[6*leftStep]
337        VLD1    {dLeftVal7[]},[pTmp]                    ;// pSrcLeft[7*leftStep]
338
339        B        DCChroma8x8PlaneStore                  ;// the status code is set there, after the stores
340
341
342OMX_VC_CHROMA_PLANE
           ;// Plane case. With A = pSrcAbove[0..7], L = the eight left samples
           ;// and AL = pSrcAboveLeft[0], the code below computes
           ;//   H = 3*(A[6]-A[0]) + 2*(A[5]-A[1]) + 1*(A[4]-A[2]) + 4*(A[7]-AL)
           ;//   V = 3*(L[6]-L[0]) + 2*(L[5]-L[1]) + 1*(L[4]-L[2]) + 4*(L[7]-AL)
           ;//   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5,  a = 16*(A[7] + L[7])
           ;//   pDst[y][x] = clip(0, 255, (a + b*(x-3) + c*(y-3) + 16) >> 5)
           ;// and falls through into DCChroma8x8PlaneStore.
343        ADD     pTmp, pSrcLeft, leftStep
344        ADD     step, leftStep, leftStep
345
346        VLD1    dAboveVal,[pSrcAbove]                       ;// pSrcAbove[x]      :0<= x <= 7
347        VLD1    dAboveLeftVal[0],[pSrcAboveLeft]            ;// only lane 0 is loaded; only lane 0 of the 7-minus-corner results is used
348
349        VLD1    {dLeftVal[0]},[pSrcLeft],step               ;// pSrcLeft[0*leftStep]
350        VLD1    {dLeftVal[1]},[pTmp],step                   ;// pSrcLeft[1*leftStep]
351        VLD1    {dLeftVal[2]},[pSrcLeft],step               ;// pSrcLeft[2*leftStep]
352        VLD1    {dLeftVal[3]},[pTmp],step                   ;// pSrcLeft[3*leftStep]
353        VLD1    {dLeftVal[4]},[pSrcLeft],step               ;// pSrcLeft[4*leftStep]
354        VLD1    {dLeftVal[5]},[pTmp],step                   ;// pSrcLeft[5*leftStep]
355        VLD1    {dLeftVal[6]},[pSrcLeft],step               ;// pSrcLeft[6*leftStep]
356        VLD1    {dLeftVal[7]},[pTmp]                        ;// pSrcLeft[7*leftStep]
357
358
359        VREV64  dRevAboveVal,dAboveVal                      ;// Reverse order of bytes = pSrcAbove[0:1:2:3:4:5:6:7]
360        VSUBL   qAbove7minus0,dRevAboveVal,dAboveLeftVal    ;// qAbove7minus0[0] = pSrcAbove[7] - pSrcAboveLeft[0]
361        VSHR    dRevAboveValU64,dRevAboveValU64,#8          ;// pSrcAbove[X:0:1:2:3:4:5:6]
362        VSUBL   qAboveDiff,dRevAboveVal,dAboveVal           ;// pSrcAbove[6] - pSrcAbove[0]
363                                                            ;// pSrcAbove[5] - pSrcAbove[1]
364                                                            ;// pSrcAbove[4] - pSrcAbove[2]
365
366        VREV64  dRevLeftVal,dLeftVal                        ;// Reverse order of bytes = pSrcLeft[0:1:2:3:4:5:6:7]
367        VSUBL   qLeft7minus0,dRevLeftVal,dAboveLeftVal      ;// qLeft7minus0[0] = pSrcLeft[7] - pSrcAboveLeft[0]
368        VSHR    dRevLeftValU64,dRevLeftValU64,#8            ;// pSrcLeft[X:0:1:2:3:4:5:6]
369        VSUBL   qLeftDiff,dRevLeftVal,dLeftVal              ;// pSrcLeft[6] - pSrcLeft[0]
370                                                            ;// pSrcLeft[5] - pSrcLeft[1]
371                                                            ;// pSrcLeft[4] - pSrcLeft[2]
372
           ;// r2 is reused here: pSrcAboveLeft has already been read above.
           ;// The shift/extract pairs pack the three pair differences and the
           ;// 7-minus-corner difference into one 4-lane vector.
373        LDR     pMultiplierTable,=armVCM4P10_MultiplierTableChroma8x8   ;// Used to calculate Hval & Vval
374        VSHL    dAboveDiff0U64,dAboveDiff0U64,#16
375        VEXT    dDiffAboveU8,dAboveDiff0U8,dAbove7minus0U8,#2           ;// pSrcAbove[ 7-0 | 4-2 | 5-1 | 6-0 ]  (7-0 is pSrcAbove[7] - pSrcAboveLeft[0])
376        VLD1    dMultiplier,[pMultiplierTable]!                         ;// first table row (3,2,1,4); pointer advances to the second row
377        VSHL    dLeftDiff0U64,dLeftDiff0U64,#16
378        VEXT    dDiffLeftU8,dLeftDiff0U8,dLeft7minus0U8,#2              ;// pSrcLeft[ 7-0 | 4-2 | 5-1 | 6-0 ]  (7-0 is pSrcLeft[7] - pSrcAboveLeft[0])
379
380
381        VMUL    dHorPred,dDiffAboveS16,dMultiplier                      ;// pSrcAbove[ 4*(7-0) | 1*(4-2) | 2*(5-1) | 3*(6-0) ]
382        VMUL    dVerPred,dDiffLeftS16,dMultiplier                       ;// same weights applied to the left differences
383        VPADD   dHVValS16,dHorPred,dVerPred                             ;// pairwise sums: two partial H sums, then two partial V sums
384
385
386        VPADDL  dHVValS32,dHVValS16                                     ;// [V|H] in 32 bits each
387        VSHL    dHVTempS32,dHVValS32,#4                                 ;// 17*H = 16*H + H = (H<<4)+H
388        VADD    dHVValS32,dHVValS32,dHVTempS32                          ;// [ 17*V  | 17*H ]in 32 bits each
389        VLD1    {dMultiplier0,dMultiplier1},[pMultiplierTable]          ;// qMultiplier = [ 4|3|2|1|0|-1|-2|-3 ]
390        VRSHR   dHVValS32,dHVValS32,#5                                  ;// [c|b], one per 32-bit lane; the low 16 bits of each are used below
391        VADDL   qA,dAboveVal,dLeftVal
392        VDUP    qA,qA[7]                                                ;// pSrcAbove[7] + pSrcLeft[7*leftStep] in every lane
393        VSHL    qA,qA,#4                                                ;// [a|a|a|a|a|a|a|a]
394        VDUP    qB,dHVValS16[0]                                         ;// [b|b|b|b|b|b|b|b]
395        VDUP    qC,dHVValS16[2]                                         ;// [c|c|c|c|c|c|c|c]
396
397
398        VMUL    qB,qB,qMultiplier                                       ;// lane x = b*(x-3)
399        VMUL    qC,qC,qMultiplier                                       ;// lane y = c*(y-3)
400        VADD    qB,qB,qA                                                ;// lane x = a + b*(x-3): the part shared by all rows
401
           ;// Broadcast each row's c*(y-3) term to a full vector. qC3 is the
           ;// same register as qMultiplier, which is no longer needed here.
402        VDUP    qC0,qC[0]
403        VDUP    qC1,qC[1]
404        VDUP    qC2,qC[2]
405        VDUP    qC3,qC[3]
406        VDUP    qC4,qC[4]
407        VDUP    qC5,qC[5]
408        VDUP    qC6,qC[6]
409        VDUP    qC7,qC[7]
410
           ;// Row y = shared part + that row's vertical term (summed in place).
411        VADD    qSum0,qB,qC0
412        VADD    qSum1,qB,qC1
413        VADD    qSum2,qB,qC2
414        VADD    qSum3,qB,qC3
415        VADD    qSum4,qB,qC4
416        VADD    qSum5,qB,qC5
417        VADD    qSum6,qB,qC6
418        VADD    qSum7,qB,qC7
419
           ;// Round, shift and saturate each row to unsigned bytes in D0..D7.
420        VQRSHRUN dSum0,qSum0,#5                         ;// (OMX_U8)armClip(0,255,(Sum+16)>>5)
421        VQRSHRUN dSum1,qSum1,#5
422        VQRSHRUN dSum2,qSum2,#5
423        VQRSHRUN dSum3,qSum3,#5
424        VQRSHRUN dSum4,qSum4,#5
425        VQRSHRUN dSum5,qSum5,#5
426        VQRSHRUN dSum6,qSum6,#5
427        VQRSHRUN dSum7,qSum7,#5
428
429DCChroma8x8PlaneStore
           ;// Shared store for the cases that produce eight distinct rows:
           ;// writes D0..D7 (dSum0..dSum7) to rows 0..7 of pDst. The PLANE case
           ;// falls through into this label and the HOR case branches here.
           ;// pDst walks the even rows and pTmp the odd rows.
430        ADD     pTmp, pDst, dstStep
431        ADD     step, dstStep, dstStep
432
433        VST1    dSum0,[pDst],step                    ;// pDst[0*dstStep+x] :0<= x <= 7
434        VST1    dSum1,[pTmp],step                    ;// pDst[1*dstStep+x] :0<= x <= 7
435        VST1    dSum2,[pDst],step                    ;// pDst[2*dstStep+x] :0<= x <= 7
436        VST1    dSum3,[pTmp],step                    ;// pDst[3*dstStep+x] :0<= x <= 7
437        VST1    dSum4,[pDst],step                    ;// pDst[4*dstStep+x] :0<= x <= 7
438        VST1    dSum5,[pTmp],step                    ;// pDst[5*dstStep+x] :0<= x <= 7
439        VST1    dSum6,[pDst],step                    ;// pDst[6*dstStep+x] :0<= x <= 7
440        VST1    dSum7,[pTmp]                         ;// pDst[7*dstStep+x] :0<= x <= 7
441
442        MOV     return, #OMX_Sts_NoErr               ;// status for both the HOR and PLANE cases
443        M_END
444
445        ENDIF ;// CortexA8
446
447        END
448;//-----------------------------------------------------------------------------------------------
449;// omxVCM4P10_PredictIntraChroma_8x8 ends
450;//-----------------------------------------------------------------------------------------------
451