;//
;//
;// File Name:  omxVCM4P10_PredictIntra_4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//


        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Define the processor variants supported by this file

        M_VARIANTS CortexA8

;//-------------------------------------------------------
;// This table implements the C switch-case on predMode in
;// assembly using two levels of indexing.
;//-------------------------------------------------------

    M_TABLE armVCM4P10_pSwitchTable4x4
    DCD  OMX_VC_4x4_VERT,     OMX_VC_4x4_HOR
    DCD  OMX_VC_4x4_DC,       OMX_VC_4x4_DIAG_DL
    DCD  OMX_VC_4x4_DIAG_DR,  OMX_VC_4x4_VR
    DCD  OMX_VC_4x4_HD,       OMX_VC_4x4_VL
    DCD  OMX_VC_4x4_HU
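
;// The table holds the case labels in predMode order, so the dispatch
;// below ("LDR pc, [pTable, predMode, LSL #2]") jumps straight to entry
;// predMode. In C terms this is roughly (illustrative sketch only):
;//
;//     switch (predMode) {
;//     case OMX_VC_4x4_VERT: /* entry 0 */ ... break;
;//     case OMX_VC_4x4_HOR:  /* entry 1 */ ... break;
;//     /* ... one entry per mode, ending with OMX_VC_4x4_HU */
;//     }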


        IF CortexA8

;//--------------------------------------------
;// Scratch variable
;//--------------------------------------------
return          RN 0
pTable          RN 8
pc              RN 15

;//--------------------------------------------
;// Declare input registers
;//--------------------------------------------
pSrcLeft        RN 0    ;// input pointer
pSrcAbove       RN 1    ;// input pointer
pSrcAboveLeft   RN 2    ;// input pointer
pDst            RN 3    ;// output pointer
leftStep        RN 4    ;// input variable
dstStep         RN 5    ;// input variable
predMode        RN 6    ;// input variable
availability    RN 7    ;// input variable
pDst1           RN 1
pDst2           RN 4
pDst3           RN 6

pSrcTmp         RN 9
srcStep         RN 10
pDstTmp         RN 11
dstep           RN 12

;//-------------------
;// Neon registers
;//-------------------

;// OMX_VC_4x4_VERT
dAboveU32       DN  D0.U32

;// OMX_VC_4x4_HOR
dLeftVal0       DN  D0.8
dLeftVal1       DN  D1.8
dLeftVal2       DN  D2.8
dLeftVal3       DN  D3.8
dLeftVal0U32    DN  D0.U32
dLeftVal1U32    DN  D1.U32
dLeftVal2U32    DN  D2.U32
dLeftVal3U32    DN  D3.U32

;// OMX_VC_4x4_DC
dLeftVal        DN  D0.U8
dLeftValU32     DN  D0.U32
dSumAboveLeftU16  DN  D1.U16
dSumAboveLeftU32  DN  D1.U32
dSumAboveLeftU64  DN  D1.U64
dSumAboveLeftU8 DN  D1.U8
dSum            DN  D0.U8

dSumLeftValU16  DN  D1.U16
dSumLeftValU32  DN  D1.U32
dSumLeftValU64  DN  D1.U64
dSumLeftValU8   DN  D1.U8

dAboveVal       DN  D0.U8
dSumAboveValU16  DN  D1.U16
dSumAboveValU32  DN  D1.U32
dSumAboveValU64  DN  D1.U64
dSumAboveValU8   DN  D1.U8
dConst128U8     DN  D0.U8


;//OMX_VC_4x4_DIAG_DL

dAbove          DN  D0.U8
dU7             DN  D2.U8
dU3             DN  D2.U8
dAbove0         DN  D3.U8
dAbove1         DN  D4.U8
dAbove2         DN  D5.U8
dTmp            DN  D6.U8
dTmp0           DN  D7.U8
dTmp1           DN  D8.U8
dTmp2           DN  D9.U8
dTmp3           DN  D10.U8
dTmpU32         DN  D6.U32


;//OMX_VC_4x4_DIAG_DR
dLeft           DN  D1.U8
dUL             DN  D2.U8

;//OMX_VC_4x4_VR
dLeft0          DN  D1.U8
dLeft1          DN  D2.U8
dEven0          DN  D3.U8
dEven1          DN  D4.U8
dEven2          DN  D5.U8
dOdd0           DN  D6.U8
dOdd1           DN  D11.U8
dOdd2           DN  D12.U8
dTmp3U32        DN  D10.U32
dTmp2U32        DN  D9.U32


;//OMX_VC_4x4_HD
dTmp1U64        DN  D8.U64
dTmp0U64        DN  D7.U64
dTmpU64         DN  D6.U64
dTmpU32         DN  D6.U32
dTmp1U32        DN  D8.U32

;//OMX_VC_4x4_HU
dL3             DN  D2.U8
dLeftHU0        DN  D3.U8
dLeftHU1        DN  D4.U8
dLeftHU2        DN  D5.U8
dTmp0U32        DN  D7.U32




;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_4x4 starts
;//-----------------------------------------------------------------------------------------------

        ;// Write function header
        M_START omxVCM4P10_PredictIntra_4x4, r12,d12

        ;// Define stack arguments
        M_ARG    LeftStep,     4
        M_ARG    DstStep,      4
        M_ARG    PredMode,     4
        M_ARG    Availability, 4
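
        ;// For reference, the OpenMAX DL C prototype implemented here is
        ;// approximately (see omxVC.h; shown only to document the
        ;// register/stack mapping used below):
        ;//
        ;//   OMXResult omxVCM4P10_PredictIntra_4x4 (
        ;//       const OMX_U8 *pSrcLeft,       /* r0 */
        ;//       const OMX_U8 *pSrcAbove,      /* r1 */
        ;//       const OMX_U8 *pSrcAboveLeft,  /* r2 */
        ;//       OMX_U8       *pDst,           /* r3 */
        ;//       OMX_INT       leftStep,       /* stack arg LeftStep */
        ;//       OMX_INT       dstStep,        /* stack arg DstStep */
        ;//       OMX_VC_M4P10_INTRA_4x4_PRED_MODE predMode, /* stack arg */
        ;//       OMX_S32       availability    /* stack arg Availability */
        ;//   );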


        LDR      pTable,=armVCM4P10_pSwitchTable4x4  ;// Load index table for switch case

        ;// Load arguments from the stack
        M_LDRD   predMode,availability,PredMode     ;// Arg predMode & availability loaded from stack to reg
        M_LDRD   leftStep,dstStep,LeftStep          ;// Arg leftStep & dstStep loaded from stack to reg


        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode


OMX_VC_4x4_HOR

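        ;// Horizontal prediction: each VLD1 {d[]} below replicates one
        ;// left-edge byte across every lane of a D register; only the low
        ;// four bytes are stored per row, so row y of pDst becomes four
        ;// copies of pSrcLeft[y*leftStep].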
        ADD     pSrcTmp, pSrcLeft, leftStep
        ADD     srcStep, leftStep, leftStep
        ;// Load Left Edge
        VLD1    {dLeftVal0[]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal1[]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal2[]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal3[]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]

        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep

        VST1    dLeftVal0U32[0],[pDst],dstep                ;// pDst[0*dstStep+x] : 0<= x <= 3
        VST1    dLeftVal1U32[0],[pDstTmp],dstep             ;// pDst[1*dstStep+x] : 0<= x <= 3
        VST1    dLeftVal2U32[0],[pDst]                      ;// pDst[2*dstStep+x] : 0<= x <= 3
        VST1    dLeftVal3U32[0],[pDstTmp]                   ;// pDst[3*dstStep+x] : 0<= x <= 3

        B        ExitPredict4x4                             ;// Branch to exit code

OMX_VC_4x4_VERT

        ;// Load Upper Edge
        VLD1     dAboveU32[0],[pSrcAbove]
        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep

DCPredict4x4VertStore

        VST1     dAboveU32[0],[pDst],dstep
        VST1     dAboveU32[0],[pDstTmp],dstep
        VST1     dAboveU32[0],[pDst]
        VST1     dAboveU32[0],[pDstTmp]

        B        ExitPredict4x4                             ;// Branch to exit code

OMX_VC_4x4_DC


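        ;// DC prediction fills the whole 4x4 block with one average,
        ;// chosen by edge availability:
        ;//   left and above available : (sum of 8 edge pixels + 4) >> 3
        ;//   only one edge available  : (sum of its 4 pixels + 2) >> 2
        ;//   neither edge available   : the constant 128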
        TST     availability, #OMX_VC_LEFT
        BEQ     DCPredict4x4LeftNotAvailable

        ADD     pSrcTmp, pSrcLeft, leftStep
        ADD     srcStep, leftStep, leftStep
        ;// Load Left Edge
        VLD1    {dLeftVal[0]},[pSrcLeft],srcStep            ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal[1]},[pSrcTmp],srcStep             ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal[2]},[pSrcLeft]                    ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal[3]},[pSrcTmp]                     ;// pSrcLeft[3*leftStep]

        TST     availability, #OMX_VC_UPPER
        BEQ     DCPredict4x4LeftOnlyAvailable

        ;// Load Upper Edge also
        VLD1     dLeftValU32[1],[pSrcAbove]                 ;// pSrcAbove[0 to 3]
        MOV      return, #OMX_Sts_NoErr

        VPADDL   dSumAboveLeftU16, dLeftVal                 ;// [pSrcAbove[2+3 | 0+1] | pSrcLeft[2+3 | 0+1]]
        VPADDL   dSumAboveLeftU32, dSumAboveLeftU16         ;// [pSrcAbove[2+3+0+1] | pSrcLeft[2+3+0+1]]
        VPADDL   dSumAboveLeftU64, dSumAboveLeftU32         ;// [pSrcAbove[2+3+0+1] + pSrcLeft[2+3+0+1]]
        VRSHR    dSumAboveLeftU64,dSumAboveLeftU64,#3       ;// Sum = (Sum + 4) >> 3
        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep
        VDUP     dSum,dSumAboveLeftU8[0]

        B        DCPredict4x4VertStore

DCPredict4x4LeftOnlyAvailable

        MOV      return, #OMX_Sts_NoErr                     ;// returnNoError

        VPADDL   dSumLeftValU16, dLeftVal                   ;// [ XX | pSrcLeft[2+3 | 0+1]]
        VPADDL   dSumLeftValU32, dSumLeftValU16             ;// [ XXXX | pSrcLeft[2+3+0+1]]

        VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// Sum = (Sum + 2) >> 2
        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep
        VDUP     dSum,dSumLeftValU8[0]

        B        DCPredict4x4VertStore

DCPredict4x4LeftNotAvailable

        TST     availability, #OMX_VC_UPPER
        BEQ     DCPredict4x4NoneAvailable

        ;// Load Upper Edge
        VLD1     dAboveU32[0],[pSrcAbove]                   ;// pSrcAbove[0 to 3]
        MOV      return, #OMX_Sts_NoErr

        VPADDL   dSumAboveValU16, dAboveVal                 ;// [ XX | pSrcAbove[2+3 | 0+1]]
        VPADDL   dSumAboveValU32, dSumAboveValU16           ;// [ XXXX | pSrcAbove[2+3+0+1]]

        VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// Sum = (Sum + 2) >> 2
        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep
        VDUP     dSum,dSumAboveValU8[0]

        B        DCPredict4x4VertStore

DCPredict4x4NoneAvailable

        VMOV     dConst128U8,#0x80                          ;// 0x8080808080808080 if(count == 0)
        MOV      return, #OMX_Sts_NoErr

        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep
        B        DCPredict4x4VertStore



OMX_VC_4x4_DIAG_DL

        TST     availability, #OMX_VC_UPPER_RIGHT
        BEQ     DiagDLUpperRightNotAvailable

        VLD1    dAbove0,[pSrcAbove]                     ;// [U7|U6|U5|U4|U3|U2|U1|U0]
        VDUP    dU7, dAbove0[7]                         ;// [U7|U7|U7|U7|U7|U7|U7|U7]
        VEXT    dAbove1, dAbove0, dU7, #1               ;// [U7|U7|U6|U5|U4|U3|U2|U1]
        VEXT    dAbove2, dAbove0, dU7, #2               ;// [U7|U7|U7|U6|U5|U4|U3|U2]
        B       DiagDLPredict4x4Store

DiagDLUpperRightNotAvailable
        VLD1    dAboveU32[1],[pSrcAbove]                ;// [U3|U2|U1|U0|-|-|-|-]
        VDUP    dU3, dAbove[7]                          ;// [U3 U3 U3 U3 U3 U3 U3 U3]

        VEXT    dAbove0, dAbove, dU3, #4                ;// [U3 U3 U3 U3 U3 U2 U1 U0]
        VEXT    dAbove1, dAbove, dU3, #5                ;// [U3 U3 U3 U3 U3 U3 U2 U1]
        VEXT    dAbove2, dAbove, dU3, #6                ;// [U3 U3 U3 U3 U3 U3 U3 U2]

DiagDLPredict4x4Store

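        ;// The VHADD/VRHADD pair below (used throughout this file for the
        ;// 3-tap filter) is exact:
        ;//   VHADD  t = (a + c) >> 1
        ;//   VRHADD r = (t + b + 1) >> 1  ==  (a + 2*b + c + 2) >> 2
        ;// for all 8-bit inputs, with no intermediate overflow.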
        VHADD   dTmp, dAbove0, dAbove2
        VRHADD  dTmp, dTmp, dAbove1                     ;// (a+2*b+c+2)>>2


        VST1    dTmpU32[0],[pDst],dstStep
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst],dstStep
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst],dstStep
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst]

        B        ExitPredict4x4                         ;// Branch to exit code


OMX_VC_4x4_DIAG_DR

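        ;// Diagonal down-right: the seven prediction samples are the 3-tap
        ;// filter of [L3..L0, UL, U0..U3]; each row of pDst is a four-byte
        ;// window of that vector, shifted by one sample per row and written
        ;// bottom row (pDst3) first.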
        ;// Load U0,U1,U2,U3

        VLD1    dAboveU32[0],[pSrcAbove]                ;// [X|X|X|X|U3|U2|U1|U0]

        ;// Load UL,L0,L1,L2,L3                         ;// dLeft = [UL|L0|L1|L2|L3|X|X|X]
        VLD1    {dLeft[7]},[pSrcAboveLeft]
        ADD     pSrcTmp, pSrcLeft, leftStep
        ADD     srcStep, leftStep, leftStep
        ADD     pDst1,pDst,dstStep

        VLD1    {dLeft[6]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
        VLD1    {dLeft[4]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
        VLD1    {dLeft[3]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]


        VEXT    dAbove0,dLeft,dAbove,#3                 ;// [U2|U1|U0|UL|L0|L1|L2|L3]
        ADD     pDst2,pDst1,dstStep
        VEXT    dAbove1,dLeft,dAbove,#4                 ;// [U3|U2|U1|U0|UL|L0|L1|L2]
        ADD     pDst3,pDst2,dstStep
        VEXT    dAbove2,dLeft,dAbove,#5                 ;// [ X|U3|U2|U1|U0|UL|L0|L1]

        VHADD   dTmp, dAbove0, dAbove2
        VRHADD  dTmp, dTmp, dAbove1                     ;// (a+2*b+c+2)>>2


        VST1    dTmpU32[0],[pDst3]                      ;// Store pTmp[0],[1],[2],[3] @ pDst3
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst2]                      ;// Store pTmp[1],[2],[3],[4] @ pDst2
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst1]                      ;// Store pTmp[2],[3],[4],[5] @ pDst1
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst]                       ;// Store pTmp[3],[4],[5],[6] @ pDst

        B        ExitPredict4x4                         ;// Branch to exit code

OMX_VC_4x4_VR

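        ;// Vertical-right: every prediction sample is produced with the same
        ;// VHADD/VRHADD 3-tap idiom; where the standard calls for the 2-tap
        ;// average (a+b+1)>>1 (the top row and its repeats), the operand
        ;// vectors repeat one of the two samples, since
        ;// (a+2*b+a+2)>>2 == (a+b+1)>>1. dTmp0/dTmp1 then hold the even- and
        ;// odd-indexed samples (see the Tmp[] lane comments), which the
        ;// stores interleave across the four rows.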
        ;// Load UL,U0,U1,U2,U3
        VLD1    dAboveU32[0],[pSrcAbove]
        VLD1    dAbove[7],[pSrcAboveLeft]               ;// [UL|X|X|X|U3|U2|U1|U0]

        ;// Load L0,L1,L2                               ;// dLeft0 = [L0|L2|X|X|X|X|X|X]
                                                        ;// dLeft1 = [L1| X|X|X|X|X|X|X]
        VLD1    {dLeft0[7]},[pSrcLeft],leftStep         ;// pSrcLeft[0*leftStep]
        VLD1    {dLeft1[7]},[pSrcLeft],leftStep         ;// pSrcLeft[1*leftStep]
        VLD1    {dLeft0[6]},[pSrcLeft]                  ;// pSrcLeft[2*leftStep]


        VEXT    dOdd2,dAbove,dAbove,#7                  ;// [ x x x U3 U2 U1 U0 UL ]
        VEXT    dEven0,dLeft0,dOdd2,#6                  ;// [ x x x U1 U0 UL L0 L2 ]
        VEXT    dEven1,dLeft1,dOdd2,#7                  ;// [ x x x U2 U1 U0 UL L1 ]
        VEXT    dEven2,dLeft0,dAbove,#7                 ;// [ x x x U3 U2 U1 U0 L0 ]
        VEXT    dOdd0,dLeft1,dAbove,#7                  ;// [ x x x U3 U2 U1 U0 L1 ]
        VEXT    dOdd1,dLeft0,dOdd2,#7                   ;// [ x x x U2 U1 U0 UL L0 ]

        VHADD   dTmp1, dOdd0, dOdd2
        VRHADD  dTmp1, dTmp1, dOdd1                     ;// Tmp[ x x x 9 7 5 3 1 ]

        VHADD   dTmp0, dEven0, dEven2
        VRHADD  dTmp0, dTmp0, dEven1                    ;// Tmp[ x x x 8 6 4 2 0 ]


        VEXT    dTmp3,dTmp1,dTmp1,#1                    ;// Tmp[ x x x x 9 7 5 3 ]
        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep
        VEXT    dTmp2,dTmp0,dTmp0,#1                    ;// Tmp[ x x x x 8 6 4 2 ]


        VST1    dTmp3U32[0],[pDst],dstep                ;// Tmp[9],[7],[5],[3]
        VST1    dTmp2U32[0],[pDstTmp],dstep             ;// Tmp[8],[6],[4],[2]
        VST1    dTmp1U32[0],[pDst],dstep                ;// Tmp[7],[5],[3],[1]
        VST1    dTmp0U32[0],[pDstTmp]                   ;// Tmp[6],[4],[2],[0]

        B        ExitPredict4x4                         ;// Branch to exit code

OMX_VC_4x4_HD

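        ;// Horizontal-down: the ten prediction samples are built from
        ;// [L3..L0, UL, U0..U2]; the 2-tap results ((a+b+1)>>1) and the 3-tap
        ;// results are interleaved with VZIP, and each destination row is a
        ;// four-byte window of that vector shifted by two samples per row.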

        ;// Load U0,U1,U2,U3
        VLD1    dAbove,[pSrcAbove]                      ;// dAbove = [U7|U6|U5|U4|U3|U2|U1|U0]

        ;// Load UL,L0,L1,L2,L3                         ;// dLeft = [UL|L0|L1|L2|L3|X|X|X]
        VLD1    {dLeft[7]},[pSrcAboveLeft]
        ADD     pSrcTmp, pSrcLeft, leftStep
        ADD     srcStep, leftStep, leftStep

        VLD1    {dLeft[6]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
        VLD1    {dLeft[4]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
        VLD1    {dLeft[3]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]

        VEXT    dAbove0,dLeft,dAbove,#3                 ;// [ U2|U1|U0|UL|L0|L1|L2|L3 ]
        VEXT    dAbove1,dLeft,dAbove,#2                 ;// [ U1|U0|UL|L0|L1|L2|L3|X ]
        VEXT    dAbove2,dLeft,dAbove,#1                 ;// [ U0|UL|L0|L1|L2|L3|X|X ]

        VHADD   dTmp0, dAbove0, dAbove2
        VRHADD  dTmp0, dTmp0, dAbove1                   ;// Tmp[ 0 | 1 | 2 | 4 | 6 | 8 | X | X ]


        VRHADD  dTmp1, dAbove1, dAbove0                 ;// (a+b+1)>>1
        VSHL    dTmp1U64,dTmp1U64,#24                   ;// Tmp[ 3|5| 7 |9 | X | X | X | X ]


        VSHL    dTmpU64,dTmp0U64,#16                    ;// Tmp[ 2|4|6|8| X | X | X | X ]
        VZIP    dTmp1,dTmp                              ;// dTmp = [ 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 ]
        VEXT    dTmp0,dTmp0,dTmp0,#6                    ;// Tmp[  X| X| X| X| X| X| 0 | 1 ]
        VEXT    dTmp1,dTmp,dTmp0,#2                     ;// Tmp[ 0 | 1 | 2 | 3 | 4 | 5 | 6 |7 ]

        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep

        VST1    dTmp1U32[1],[pDst],dstep                ;// Store pTmp[0|1|2|3]
        VST1    dTmpU32[1],[pDstTmp],dstep              ;// Store pTmp[2|3|4|5]
        VST1    dTmp1U32[0],[pDst]                      ;// Store pTmp[4|5|6|7]
        VST1    dTmpU32[0],[pDstTmp]                    ;// Store pTmp[6|7|8|9]

        B        ExitPredict4x4                         ;// Branch to exit code

OMX_VC_4x4_VL

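        ;// Vertical-left: rows 0 and 2 use the 2-tap averages of the above
        ;// row (row 2 shifted by one sample), rows 1 and 3 the 3-tap averages
        ;// (row 3 shifted by one sample). When the upper-right neighbours are
        ;// not available, U4..U7 are replaced by copies of U3.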
        TST     availability, #OMX_VC_UPPER_RIGHT
        BEQ     DiagVLUpperRightNotAvailable

        VLD1    dAbove0,[pSrcAbove]                      ;// [U7|U6|U5|U4|U3|U2|U1|U0]
        VEXT    dAbove1,dAbove0,dAbove0,#1               ;// [ X|U7|U6|U5|U4|U3|U2|U1]
        VEXT    dAbove2,dAbove1,dAbove1,#1               ;// [ X| X|U7|U6|U5|U4|U3|U2]

        B       DiagVLPredict4x4Store

DiagVLUpperRightNotAvailable
        VLD1    dAboveU32[1],[pSrcAbove]                 ;// [U3|U2|U1|U0|-|-|-|-]
        VDUP    dU3, dAbove[7]                           ;// [U3 U3 U3 U3 U3 U3 U3 U3]

        VEXT    dAbove0, dAbove, dU3, #4                 ;// [U3 U3 U3 U3 U3 U2 U1 U0]
        VEXT    dAbove1, dAbove, dU3, #5                 ;// [U3 U3 U3 U3 U3 U3 U2 U1]
        VEXT    dAbove2, dAbove, dU3, #6                 ;// [U3 U3 U3 U3 U3 U3 U3 U2]

DiagVLPredict4x4Store

        VRHADD  dTmp0, dAbove1, dAbove0                 ;// (a+b+1)>>1
                                                        ;// Tmp[ X| X| X| 8| 6| 4| 2| 0 ]

        VHADD   dTmp3, dAbove0, dAbove2
        VRHADD  dTmp3, dTmp3, dAbove1                   ;// (a+2*b+c+2)>>2
                                                        ;// Tmp[ X| X| X| 9| 7| 5| 3| 1 ]

        VEXT    dTmp1,dTmp0,dTmp0,#1                    ;// Tmp[ X| X| X| X| 8| 6| 4| 2 ]
        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep
        VEXT    dTmp2,dTmp3,dTmp1,#1                    ;// Tmp[ X| X| X| X| 9| 7| 5| 3 ]

        VST1    dTmp0U32[0],[pDst],dstep                ;// Tmp[6],[4],[2],[0]
        VST1    dTmp3U32[0],[pDstTmp],dstep             ;// Tmp[7],[5],[3],[1]
        VST1    dTmp1U32[0],[pDst]                      ;// Tmp[8],[6],[4],[2]
        VST1    dTmp2U32[0],[pDstTmp]                   ;// Tmp[9],[7],[5],[3]

        B        ExitPredict4x4                         ;// Branch to exit code

OMX_VC_4x4_HU
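        ;// Horizontal-up: the prediction samples interleave the 2-tap and
        ;// 3-tap averages of the left column; samples past the end of the
        ;// column saturate to L3, so the bottom row is all L3. Each
        ;// destination row is a window shifted by two samples.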
        ADD     pSrcTmp, pSrcLeft, leftStep
        ADD     srcStep, leftStep, leftStep

        ;// Load Left Edge                              ;// [L3|L2|L1|L0|X|X|X|X]
        VLD1    {dLeft[4]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
        VLD1    {dLeft[6]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
        VLD1    {dLeft[7]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]

        VDUP    dL3,dLeft[7]                            ;// [L3|L3|L3|L3|L3|L3|L3|L3]

        VEXT    dLeftHU0,dLeft,dL3,#4                   ;// [L3|L3|L3|L3|L3|L2|L1|L0]
        VEXT    dLeftHU1,dLeft,dL3,#5                   ;// [L3|L3|L3|L3|L3|L3|L2|L1]
        VEXT    dLeftHU2,dLeft,dL3,#6                   ;// [L3|L3|L3|L3|L3|L3|L3|L2]

        VHADD   dTmp0, dLeftHU0, dLeftHU2
        VRHADD  dTmp0, dTmp0, dLeftHU1                  ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 5 | 3 | 1 ]

        VRHADD  dTmp1, dLeftHU1, dLeftHU0               ;// (a+b+1)>>1
                                                        ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 4 | 2 | 0 ]

        VZIP    dTmp1,dTmp0                             ;// dTmp1 = Tmp[7| 6| 5| 4| 3| 2| 1| 0]
                                                        ;// dTmp0 = [L3|L3|L3|L3|L3|L3|L3|L3]


        VST1    dTmp1U32[0],[pDst],dstStep              ;// [3|2|1|0]
        VEXT    dTmp1,dTmp1,dTmp1,#2
        VST1    dTmp1U32[0],[pDst],dstStep              ;// [5|4|3|2]
        VEXT    dTmp1,dTmp1,dTmp1,#2
        VST1    dTmp1U32[0],[pDst],dstStep              ;// [7|6|5|4]
        VST1    dTmp0U32[0],[pDst]                      ;// [9|8|7|6]


ExitPredict4x4

        MOV      return,  #OMX_Sts_NoErr
        M_END

        ENDIF ;// CortexA8

        END
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_4x4 ends
;//-----------------------------------------------------------------------------------------------
