1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  omxVCM4P10_PredictIntra_4x4_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   12290
21;// Date:       Wednesday, April 9, 2008
22;//
23;//
24;//
25;//
26
27
28        INCLUDE omxtypes_s.h
29        INCLUDE armCOMM_s.h
30
31;// Define the processor variants supported by this file
32
33         M_VARIANTS CortexA8
34
35;//-------------------------------------------------------
36;// This table implements a C-style switch/case in asm by
37;// the method of two levels of indexing.
38;//-------------------------------------------------------
39
40    M_TABLE armVCM4P10_pSwitchTable4x4              ;// Jump table: one 32-bit case-label address per predMode value
41    DCD  OMX_VC_4x4_VERT,     OMX_VC_4x4_HOR        ;// entries 0, 1
42    DCD  OMX_VC_4x4_DC,       OMX_VC_4x4_DIAG_DL    ;// entries 2, 3
43    DCD  OMX_VC_4x4_DIAG_DR,  OMX_VC_4x4_VR         ;// entries 4, 5
44    DCD  OMX_VC_4x4_HD,       OMX_VC_4x4_VL         ;// entries 6, 7
45    DCD  OMX_VC_4x4_HU                              ;// entry 8
46
47
48        IF CortexA8
49
50;//--------------------------------------------
51;// Scratch variable
52;//--------------------------------------------
53return          RN 0    ;// receives OMX_Sts_NoErr before exit; same register as pSrcLeft
54pTable          RN 8    ;// address of armVCM4P10_pSwitchTable4x4
55pc              RN 15   ;// program counter; loaded from the table to dispatch
56
57;//--------------------------------------------
58;// Declare input registers
59;//--------------------------------------------
60pSrcLeft        RN 0    ;// input pointer
61pSrcAbove       RN 1    ;// input pointer
62pSrcAboveLeft   RN 2    ;// input pointer
63pDst            RN 3    ;// output pointer
64leftStep        RN 4    ;// input variable (loaded from the stack argument LeftStep)
65dstStep         RN 5    ;// input variable (loaded from the stack argument DstStep)
66predMode        RN 6    ;// input variable (loaded from the stack argument PredMode)
67availability    RN 7    ;// input variable (loaded from the stack argument Availability)
68pDst1           RN 1    ;// row pointer pDst + 1*dstStep; reuses pSrcAbove's register
69pDst2           RN 4    ;// row pointer pDst + 2*dstStep; reuses leftStep's register
70pDst3           RN 6    ;// row pointer pDst + 3*dstStep; reuses predMode's register
71
72pSrcTmp         RN 9    ;// second left-column pointer, started at pSrcLeft + leftStep
73srcStep         RN 10   ;// 2*leftStep
74pDstTmp         RN 11   ;// second destination pointer, started at pDst + dstStep
75dstep           RN 12   ;// 2*dstStep
76
77;//-------------------
78;// Neon registers
79;//-------------------
80
81;// OMX_VC_4x4_VERT
82dAboveU32       DN  D0.U32      ;// also the register read by the shared DCPredict4x4VertStore tail
83
84;// OMX_VC_4x4_HOR
85dLeftVal0       DN  D0.8        ;// one D register per output row
86dLeftVal1       DN  D1.8
87dLeftVal2       DN  D2.8
88dLeftVal3       DN  D3.8
89dLeftVal0U32    DN  D0.U32      ;// 32-bit views of the same registers, used for the 4-byte row stores
90dLeftVal1U32    DN  D1.U32
91dLeftVal2U32    DN  D2.U32
92dLeftVal3U32    DN  D3.U32
93
94;// OMX_VC_4x4_DC
95dLeftVal        DN  D0.U8       ;// edge pixels to be summed
96dLeftValU32     DN  D0.U32
97dSumAboveLeftU16  DN  D1.U16    ;// D1 viewed at widening element sizes for the VPADDL chain
98dSumAboveLeftU32  DN  D1.U32
99dSumAboveLeftU64  DN  D1.U64
100dSumAboveLeftU8 DN  D1.U8
101dSum            DN  D0.U8       ;// DC fill value; D0, i.e. the register dAboveU32 names
102
103dSumLeftValU16  DN  D1.U16      ;// same D1 views, named for the left-only case
104dSumLeftValU32  DN  D1.U32
105dSumLeftValU64  DN  D1.U64
106dSumLeftValU8   DN  D1.U8
107
108dAboveVal       DN  D0.U8       ;// same D0/D1 views, named for the above-only case
109dSumAboveValU16  DN  D1.U16
110dSumAboveValU32  DN  D1.U32
111dSumAboveValU64  DN  D1.U64
112dSumAboveValU8   DN  D1.U8
113dConst128U8     DN  D0.U8       ;// constant fill used when no neighbour is available; also D0
114
115
116;//OMX_VC_4x4_DIAG_DL
117
118dAbove          DN  D0.U8
119dU7             DN  D2.U8       ;// dU7 and dU3 are two names for D2
120dU3             DN  D2.U8
121dAbove0         DN  D3.U8       ;// three shifted copies of the edge fed to the 3-tap filter
122dAbove1         DN  D4.U8
123dAbove2         DN  D5.U8
124dTmp            DN  D6.U8
125dTmp0           DN  D7.U8
126dTmp1           DN  D8.U8
127dTmp2            DN  D9.U8
128dTmp3            DN  D10.U8
129dTmpU32         DN  D6.U32
130
131
132;//OMX_VC_4x4_DIAG_DR
133dLeft           DN  D1.U8
134dUL             DN  D2.U8
135
136;//OMX_VC_4x4_VR
137dLeft0          DN  D1.U8       ;// same register as dLeft
138dLeft1          DN  D2.U8
139dEven0          DN  D3.U8       ;// dEven0..2 share D3..D5 with dAbove0..2
140dEven1          DN  D4.U8
141dEven2          DN  D5.U8
142dOdd0           DN  D6.U8       ;// same register as dTmp
143dOdd1           DN  D11.U8
144dOdd2           DN  D12.U8
145dTmp3U32        DN  D10.U32
146dTmp2U32        DN  D9.U32
147
148
149;//OMX_VC_4x4_HD
150dTmp1U64        DN  D8.U64
151dTmp0U64        DN  D7.U64
152dTmpU64         DN  D6.U64
153dTmpU32         DN  D6.U32      ;// repeats the dTmpU32 definition given in the DIAG_DL group
154dTmp1U32        DN  D8.U32
155
156;//OMX_VC_4x4_HU
157dL3             DN  D2.U8       ;// same register as dU3/dU7
158dLeftHU0        DN  D3.U8       ;// dLeftHU0..2 share D3..D5 with dAbove0..2
159dLeftHU1        DN  D4.U8
160dLeftHU2        DN  D5.U8
161dTmp0U32        DN  D7.U32
162
163
164
165
166;//-----------------------------------------------------------------------------------------------
167;// omxVCM4P10_PredictIntra_4x4 starts
168;//-----------------------------------------------------------------------------------------------
169
170        ;// Write function header: 4x4 intra prediction entry point; predMode selects one case label below
171        M_START omxVCM4P10_PredictIntra_4x4, r12,d12
172        ;// NOTE(review): M_START/M_ARG/M_LDRD are macros not defined in this file (presumably armCOMM_s.h) - confirm their semantics there
173        ;// Define stack arguments
174        M_ARG    LeftStep,     4
175        M_ARG    DstStep,      4
176        M_ARG    PredMode,     4
177        M_ARG    Availability, 4
178
179
180        LDR      pTable,=armVCM4P10_pSwitchTable4x4  ;// Load index table for switch case
181
182        ;// Load argument from the stack
183        M_LDRD   predMode,availability,PredMode     ;// Arg predMode & availability loaded from stack to reg
184        M_LDRD   leftStep,dstStep,LeftStep          ;// Arg leftStep & dstStep loaded from stack to reg
185
186        ;// NOTE(review): predMode is used as a table index without a range check; the table has 9 entries
187        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode (4 bytes per table entry)
188
189
190OMX_VC_4x4_HOR
191        ;// Horizontal prediction: each output row is filled with the left-edge pixel of that row
192        ADD     pSrcTmp, pSrcLeft, leftStep                ;// pSrcTmp reads rows 1 and 3, pSrcLeft rows 0 and 2
193        ADD     srcStep, leftStep, leftStep                ;// so each pointer advances two rows per load
194        ;// Load Left Edge
195        VLD1    {dLeftVal0[]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep], replicated to every lane
196        VLD1    {dLeftVal1[]},[pSrcTmp],srcStep            ;//    pSrcLeft[1*leftStep]
197        VLD1    {dLeftVal2[]},[pSrcLeft]                   ;//    pSrcLeft[2*leftStep]
198        VLD1    {dLeftVal3[]},[pSrcTmp]                    ;//    pSrcLeft[3*leftStep]
199
200        ADD     pDstTmp, pDst, dstStep                      ;// same two-pointer scheme for the destination rows
201        ADD     dstep, dstStep, dstStep
202
203        VST1    dLeftVal0U32[0],[pDst],dstep                ;// pDst[0*dstStep+x] :0<= x <= 3
204        VST1    dLeftVal1U32[0],[pDstTmp],dstep             ;// pDst[1*dstStep+x] :0<= x <= 3
205        VST1    dLeftVal2U32[0],[pDst]                      ;// pDst[2*dstStep+x] :0<= x <= 3
206        VST1    dLeftVal3U32[0],[pDstTmp]                   ;// pDst[3*dstStep+x] :0<= x <= 3
207
208        B        ExitPredict4x4                             ;// Branch to exit code
209
210OMX_VC_4x4_VERT
211        ;// Vertical prediction: the four pixels above are copied into each of the four output rows
212        ;// Load Upper Edge
213        VLD1     dAboveU32[0],[pSrcAbove]                   ;// pSrcAbove[0 to 3] into the low 32 bits of D0
214        ADD     pDstTmp, pDst, dstStep                      ;// pDstTmp writes rows 1 and 3, pDst rows 0 and 2
215        ADD     dstep, dstStep, dstStep                     ;// each pointer advances two rows per store
216
217DCPredict4x4VertStore
218        ;// Shared store tail: the DC variants branch here with the fill value in D0 and pDstTmp/dstep already set
219        VST1     dAboveU32[0],[pDst],dstep                  ;// row 0
220        VST1     dAboveU32[0],[pDstTmp],dstep               ;// row 1
221        VST1     dAboveU32[0],[pDst]                        ;// row 2
222        VST1     dAboveU32[0],[pDstTmp]                     ;// row 3
223
224        B        ExitPredict4x4                             ;// Branch to exit code
225
226OMX_VC_4x4_DC
227        ;// DC prediction: fill the block with the rounded mean of whichever edges the availability flags allow,
228        ;// or with 0x80 when neither edge is available. Every variant leaves the fill value in D0.
229        TST     availability, #OMX_VC_LEFT
230        BEQ     DCPredict4x4LeftNotAvailable
231
232        ADD     pSrcTmp, pSrcLeft, leftStep
233        ADD     srcStep, leftStep, leftStep
234        ;// Load Left Edge
235        VLD1    {dLeftVal[0]},[pSrcLeft],srcStep            ;// pSrcLeft[0*leftStep]
236        VLD1    {dLeftVal[1]},[pSrcTmp],srcStep             ;//    pSrcLeft[1*leftStep]
237        VLD1    {dLeftVal[2]},[pSrcLeft]                    ;//    pSrcLeft[2*leftStep]
238        VLD1    {dLeftVal[3]},[pSrcTmp]                     ;//    pSrcLeft[3*leftStep]
239
240        TST     availability, #OMX_VC_UPPER
241        BEQ     DCPredict4x4LeftOnlyAvailable
242
243        ;// Load Upper Edge also
244        VLD1     dLeftValU32[1],[pSrcAbove]                 ;// pSrcAbove[0 to 3] into the upper 32 bits, next to the left pixels
245        MOV      return, #OMX_Sts_NoErr                     ;// overwrites pSrcLeft (same register); its loads are already done
246
247        VPADDL   dSumAboveLeftU16, dLeftVal                 ;// [pSrcAbove[2+3 | 0+1] | pSrcLeft[2+3 | 0+1]]
248        VPADDL   dSumAboveLeftU32, dSumAboveLeftU16         ;// [pSrcAbove[2+3+0+1] | pSrcLeft[2+3+0+1]]
249        VPADDL   dSumAboveLeftU64, dSumAboveLeftU32         ;// [pSrcAbove[2+3+0+1] + pSrcLeft[2+3+0+1]]
250        VRSHR    dSumAboveLeftU64,dSumAboveLeftU64,#3       ;// Sum = (Sum + 4) >> 3
251        ADD     pDstTmp, pDst, dstStep                      ;// set up the pointers expected by the shared store tail
252        ADD     dstep, dstStep, dstStep
253        VDUP     dSum,dSumAboveLeftU8[0]                    ;// replicate the mean byte across D0
254
255        B        DCPredict4x4VertStore
256
257DCPredict4x4LeftOnlyAvailable
258        ;// Only the left edge is usable: mean of the four left pixels already in the low half of D0
259        MOV      return, #OMX_Sts_NoErr                     ;// returnNoError
260
261        VPADDL   dSumLeftValU16, dLeftVal                   ;// [ XX | pSrcLeft[2+3 | 0+1]]
262        VPADDL   dSumLeftValU32, dSumLeftValU16             ;// [ XXXX | pSrcLeft[2+3+0+1]]
263
264        VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// Sum = (Sum + 2) >> 2
265        ADD     pDstTmp, pDst, dstStep
266        ADD     dstep, dstStep, dstStep
267        VDUP     dSum,dSumLeftValU8[0]                      ;// replicate the mean byte across D0
268
269        B        DCPredict4x4VertStore
270
271DCPredict4x4LeftNotAvailable
272        ;// Left edge unusable: fall back to the upper edge alone, or to the constant
273        TST     availability, #OMX_VC_UPPER
274        BEQ     DCPredict4x4NoneAvailable
275
276        ;// Load Upper Edge
277        VLD1     dAboveU32[0],[pSrcAbove]                   ;// pSrcAbove[0 to 3]
278        MOV      return, #OMX_Sts_NoErr
279
280        VPADDL   dSumAboveValU16, dAboveVal                 ;// [ XX | pSrcAbove[2+3 | 0+1]]
281        VPADDL   dSumAboveValU32, dSumAboveValU16           ;// [ XXXX | pSrcAbove[2+3+0+1]]
282
283        VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// Sum = (Sum + 2) >> 2
284        ADD     pDstTmp, pDst, dstStep
285        ADD     dstep, dstStep, dstStep
286        VDUP     dSum,dSumAboveValU8[0]                     ;// replicate the mean byte across D0
287
288        B        DCPredict4x4VertStore
289
290DCPredict4x4NoneAvailable
291        ;// Neither edge usable: every output byte becomes 0x80
292        VMOV     dConst128U8,#0x80                          ;// 0x8080808080808080 if(count == 0)
293        MOV      return, #OMX_Sts_NoErr
294
295        ADD     pDstTmp, pDst, dstStep
296        ADD     dstep, dstStep, dstStep
297        B        DCPredict4x4VertStore
298
299
300
301OMX_VC_4x4_DIAG_DL
302        ;// Diagonal down-left prediction: 3-tap filter over the upper edge; row n is the filtered vector shifted by n bytes
303        TST     availability, #OMX_VC_UPPER_RIGHT
304        BEQ     DiagDLUpperRightNotAvailable
305
306        VLD1    dAbove0,[pSrcAbove]                     ;// [U7|U6|U5|U4|U3|U2|U1|U0]
307        VDUP    dU7, dAbove0[7]                         ;// [U7|U7|U7|U7|U7|U7|U7|U7]
308        VEXT    dAbove1, dAbove0, dU7, #1               ;// [U7|U7|U6|U5|U4|U3|U2|U1]
309        VEXT    dAbove2, dAbove0, dU7, #2               ;// [U7|U7|U7|U6|U5|U4|U3|U2]
310        B       DiagDLPredict4x4Store
311
312DiagDLUpperRightNotAvailable                            ;// only U0..U3 are read; U3 stands in for U4..U7
313        VLD1    dAboveU32[1],[pSrcAbove]                ;// [U3|U2|U1|U0|-|-|-|-]
314        VDUP    dU3, dAbove[7]                          ;// [U3 U3 U3 U3 U3 U3 U3 U3]
315
316        VEXT    dAbove0, dAbove, dU3, #4                ;// [U3 U3 U3 U3 U3 U2 U1 U0]
317        VEXT    dAbove1, dAbove, dU3, #5                ;// [U3 U3 U3 U3 U3 U3 U2 U1]
318        VEXT    dAbove2, dAbove, dU3, #6                ;// [U3 U3 U3 U3 U3 U3 U3 U2]
319
320DiagDLPredict4x4Store
321        ;// a = dAbove0, b = dAbove1, c = dAbove2; halving add then rounding halving add give the 3-tap result
322        VHADD   dTmp, dAbove0, dAbove2
323        VRHADD  dTmp, dTmp, dAbove1                     ;// (a+2*b+c+2)>>2
324
325        ;// Store the low four bytes as a row, then rotate the vector down one byte for the next row
326        VST1    dTmpU32[0],[pDst],dstStep
327        VEXT    dTmp,dTmp,dTmp,#1
328        VST1    dTmpU32[0],[pDst],dstStep
329        VEXT    dTmp,dTmp,dTmp,#1
330        VST1    dTmpU32[0],[pDst],dstStep
331        VEXT    dTmp,dTmp,dTmp,#1
332        VST1    dTmpU32[0],[pDst]
333
334        B        ExitPredict4x4                         ;// Branch to exit code
335
336
337OMX_VC_4x4_DIAG_DR
338        ;// Diagonal down-right prediction: 3-tap filter over the edge L3..L0,UL,U0..U3;
339        ;// rows are stored bottom-up, each one byte further along the filtered vector.
340        ;// Load U0,U1,U2,U3
341
342        VLD1    dAboveU32[0],[pSrcAbove]                ;// [X|X|X|X|U3|U2|U1|U0]
343
344        ;// Load UL,L0,L1,L2,L3                         ;// dLeft = [UL|L0|L1|L2|L3|X|X|X]
345        VLD1    {dLeft[7]},[pSrcAboveLeft]
346        ADD     pSrcTmp, pSrcLeft, leftStep
347        ADD     srcStep, leftStep, leftStep
348        ADD     pDst1,pDst,dstStep                      ;// overwrites pSrcAbove (same register); its load is already done
349
350        VLD1    {dLeft[6]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
351        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
352        VLD1    {dLeft[4]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
353        VLD1    {dLeft[3]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]
354
355
356        VEXT    dAbove0,dLeft,dAbove,#3                 ;// [U2|U1|U0|UL|L0|L1|L2|L3]
357        ADD     pDst2,pDst1,dstStep                     ;// overwrites leftStep (same register); no longer needed
358        VEXT    dAbove1,dLeft,dAbove,#4                 ;// [U3|U2|U1|U0|UL|L0|L1|L2]
359        ADD     pDst3,pDst2,dstStep                     ;// overwrites predMode (same register); dispatch is already done
360        VEXT    dAbove2,dLeft,dAbove,#5                 ;// [ X|U3|U2|U1|U0|UL|L0|L1]
361
362        VHADD   dTmp, dAbove0, dAbove2
363        VRHADD  dTmp, dTmp, dAbove1                     ;// (a+2*b+c+2)>>2
364
365
366        VST1    dTmpU32[0],[pDst3]                      ;// Store pTmp[0],[1],[2],[3] @ pDst3
367        VEXT    dTmp,dTmp,dTmp,#1
368        VST1    dTmpU32[0],[pDst2]                      ;// Store pTmp[1],[2],[3],[4] @ pDst2
369        VEXT    dTmp,dTmp,dTmp,#1
370        VST1    dTmpU32[0],[pDst1]                      ;// Store pTmp[2],[3],[4],[5] @ pDst1
371        VEXT    dTmp,dTmp,dTmp,#1
372        VST1    dTmpU32[0],[pDst]                       ;// Store pTmp[3],[4],[5],[6] @ pDst
373
374        B        ExitPredict4x4                         ;// Branch to exit code
375
376OMX_VC_4x4_VR
377        ;// Vertical-right prediction: two 3-tap filtered vectors (dTmp0, dTmp1) are built from
378        ;// UL, U0..U3 and L0..L2, and each is stored twice at a one-byte offset.
379        ;// Load UL,U0,U1,U2,U3
380        VLD1    dAboveU32[0],[pSrcAbove]
381        VLD1    dAbove[7],[pSrcAboveLeft]               ;// [UL|X|X|X|U3|U2|U1|U0]
382
383        ;// Load L0,L1,L2                               ;// dLeft0 = [L0|L2|X|X|X|X|X|X]
384                                                        ;// dLeft1 = [L1| X|X|X|X|X|X|X]
385        VLD1    {dLeft0[7]},[pSrcLeft],leftStep         ;// pSrcLeft[0*leftStep]
386        VLD1    {dLeft1[7]},[pSrcLeft],leftStep         ;// pSrcLeft[1*leftStep]
387        VLD1    {dLeft0[6]},[pSrcLeft]                  ;// pSrcLeft[2*leftStep]
388
389
390        VEXT    dOdd2,dAbove,dAbove,#7                  ;// [ x x x U3 U2 U1 U0 UL ]
391        VEXT    dEven0,dLeft0,dOdd2,#6                  ;// [ x x x U1 U0 UL L0 L2 ]
392        VEXT    dEven1,dLeft1,dOdd2,#7                  ;// [ x x x U2 U1 U0 UL L1 ]
393        VEXT    dEven2,dLeft0,dAbove,#7                 ;// [ x x x U3 U2 U1 U0 L0 ]
394        VEXT    dOdd0,dLeft1,dAbove,#7                  ;// [ x x x U3 U2 U1 U0 L1 ]
395        VEXT    dOdd1,dLeft0,dOdd2,#7                   ;// [ x x x U2 U1 U0 UL L0 ]
396
397        VHADD   dTmp1, dOdd0, dOdd2
398        VRHADD  dTmp1, dTmp1, dOdd1                     ;// Tmp[ x x x 9 7 5 3 1 ]
399
400        VHADD   dTmp0, dEven0, dEven2
401        VRHADD  dTmp0, dTmp0, dEven1                    ;// Tmp[ x x x 8 6 4 2 0 ]
402
403
404        VEXT    dTmp3,dTmp1,dTmp1,#1                    ;// Tmp[ x x x x 9 7 5 3 ]
405        ADD     pDstTmp, pDst, dstStep                  ;// pDstTmp writes rows 1 and 3, pDst rows 0 and 2
406        ADD     dstep, dstStep, dstStep
407        VEXT    dTmp2,dTmp0,dTmp0,#1                    ;// Tmp[ x x x x 8 6 4 2 ]
408
409
410        VST1    dTmp3U32[0],[pDst],dstep                ;// Tmp[9],[7],[5],[3]
411        VST1    dTmp2U32[0],[pDstTmp],dstep             ;// Tmp[8],[6],[4],[2]
412        VST1    dTmp1U32[0],[pDst],dstep                ;// Tmp[7],[5],[3],[1] (pDst is not read again after this post-increment)
413        VST1    dTmp0U32[0],[pDstTmp]                   ;// Tmp[6],[4],[2],[0]
414
415        B        ExitPredict4x4                         ;// Branch to exit code
416
417OMX_VC_4x4_HD
418        ;// Horizontal-down prediction, built from UL, L0..L3 and the upper edge.
419        ;// The VEXTs below consume only dAbove[0..2], so a 4-byte load from pSrcAbove is sufficient.
420        ;// Load U0,U1,U2,U3
421        VLD1    dAboveU32[0],[pSrcAbove]                ;// dAbove = [X|X|X|X|U3|U2|U1|U0]; nothing past pSrcAbove[3] is read
422
423        ;// Load UL,L0,L1,L2,L3                         ;// dLeft = [UL|L0|L1|L2|L3|X|X|X]
424        VLD1    {dLeft[7]},[pSrcAboveLeft]
425        ADD     pSrcTmp, pSrcLeft, leftStep             ;// pSrcTmp reads rows 1 and 3, pSrcLeft rows 0 and 2
426        ADD     srcStep, leftStep, leftStep
427
428        VLD1    {dLeft[6]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
429        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
430        VLD1    {dLeft[4]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
431        VLD1    {dLeft[3]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]
432
433        VEXT    dAbove0,dLeft,dAbove,#3                 ;// [ U2|U1|U0|UL|L0|L1|L2|L3 ]
434        VEXT    dAbove1,dLeft,dAbove,#2                 ;// [ U1|U0|UL|L0|L1|L2|L3|X ]
435        VEXT    dAbove2,dLeft,dAbove,#1                 ;// [ U0|UL|L0|L1|L2|L3|X|X ]
436
437        VHADD   dTmp0, dAbove0, dAbove2
438        VRHADD  dTmp0, dTmp0, dAbove1                   ;// Tmp[ 0 | 1 | 2 | 4 | 6 | 8 | X | X ]
439
440
441        VRHADD  dTmp1, dAbove1, dAbove0                 ;// (a+b+1)>>1
442        VSHL    dTmp1U64,dTmp1U64,#24                   ;// Tmp[ 3|5| 7 |9 | X | X | X | X ]
443
444
445        VSHL    dTmpU64,dTmp0U64,#16                    ;// Tmp[ 2|4|6|8| X | X | X | X ]
446        VZIP    dTmp1,dTmp                              ;// dTmp = [ 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 ]
447        VEXT    dTmp0,dTmp0,dTmp0,#6                    ;// Tmp[  X| X| X| X| X| X| 0 | 1 ]
448        VEXT    dTmp1,dTmp,dTmp0,#2                     ;// Tmp[ 0 | 1 | 2 | 3 | 4 | 5 | 6 |7 ]
449
450        ADD     pDstTmp, pDst, dstStep                  ;// pDstTmp writes rows 1 and 3, pDst rows 0 and 2
451        ADD     dstep, dstStep, dstStep
452
453        VST1    dTmp1U32[1],[pDst],dstep                ;// Store pTmp[0|1|2|3]
454        VST1    dTmpU32[1],[pDstTmp],dstep              ;// Store pTmp[2|3|4|5]
455        VST1    dTmp1U32[0],[pDst]                      ;// Store pTmp[4|5|6|7]
456        VST1    dTmpU32[0],[pDstTmp]                    ;// Store pTmp[6|7|8|9]
457
458        B        ExitPredict4x4                         ;// Branch to exit code
459
460OMX_VC_4x4_VL
461        ;// Vertical-left prediction from the upper edge: rows 0 and 2 use the 2-tap average,
462        ;// rows 1 and 3 the 3-tap filter; rows 2 and 3 are taken one byte further along.
463        TST     availability, #OMX_VC_UPPER_RIGHT
464        BEQ     DiagVLUpperRightNotAvailable
465
466        VLD1    dAbove0,[pSrcAbove]                      ;// [U7|U6|U5|U4|U3|U2|U1|U0]
467        VEXT    dAbove1,dAbove0,dAbove0,#1               ;// [ X|U7|U6|U5|U4|U3|U2|U1]
468        VEXT    dAbove2,dAbove1,dAbove1,#1               ;// [ X| X|U7|U6|U5|U4|U3|U2]
469
470        B       DiagVLPredict4x4Store
471
472DiagVLUpperRightNotAvailable                             ;// only U0..U3 are read; U3 stands in for U4..U7
473        VLD1    dAboveU32[1],[pSrcAbove]                 ;// [U3|U2|U1|U0|-|-|-|-]
474        VDUP    dU3, dAbove[7]                           ;// [U3 U3 U3 U3 U3 U3 U3 U3]
475
476        VEXT    dAbove0, dAbove, dU3, #4                 ;// [U3 U3 U3 U3 U3 U2 U1 U0]
477        VEXT    dAbove1, dAbove, dU3, #5                 ;// [U3 U3 U3 U3 U3 U3 U2 U1]
478        VEXT    dAbove2, dAbove, dU3, #6                 ;// [U3 U3 U3 U3 U3 U3 U3 U2]
479
480DiagVLPredict4x4Store
481
482        VRHADD  dTmp0, dAbove1, dAbove0                 ;// (a+b+1)>>1
483                                                        ;// Tmp[ X| X| X| 8| 6| 4| 2| 0 ]
484
485        VHADD   dTmp3, dAbove0, dAbove2
486        VRHADD  dTmp3, dTmp3, dAbove1                   ;// (a+2*b+c+2)>>2
487                                                        ;// Tmp[ X| X| X| 9| 7| 5| 3| 1 ]
488
489        VEXT    dTmp1,dTmp0,dTmp0,#1                    ;// Tmp[ X| X| X| X| 8| 6| 4| 2 ]
490        ADD     pDstTmp, pDst, dstStep                  ;// pDstTmp writes rows 1 and 3, pDst rows 0 and 2
491        ADD     dstep, dstStep, dstStep
492        VEXT    dTmp2,dTmp3,dTmp1,#1                    ;// Tmp[ X| X| X| X| 9| 7| 5| 3 ]
493
494        VST1    dTmp0U32[0],[pDst],dstep                ;// Tmp[6],[4],[2],[0]
495        VST1    dTmp3U32[0],[pDstTmp],dstep             ;// Tmp[7],[5],[3],[1]
496        VST1    dTmp1U32[0],[pDst]                      ;// Tmp[8],[6],[4],[2]
497        VST1    dTmp2U32[0],[pDstTmp]                   ;// Tmp[9],[7],[5],[3]
498
499        B        ExitPredict4x4                         ;// Branch to exit code
500
501OMX_VC_4x4_HU                                           ;// Horizontal-up prediction from the left edge only; L3 pads the edge
502        ADD     pSrcTmp, pSrcLeft, leftStep             ;// pSrcTmp reads rows 1 and 3, pSrcLeft rows 0 and 2
503        ADD     srcStep, leftStep, leftStep
504
505        ;// Load Left Edge                              ;// [L3|L2|L1|L0|X|X|X|X]
506        VLD1    {dLeft[4]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
507        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
508        VLD1    {dLeft[6]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
509        VLD1    {dLeft[7]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]
510
511        VDUP    dL3,dLeft[7]                            ;// [L3|L3|L3|L3|L3|L3|L3|L3]
512
513        VEXT    dLeftHU0,dLeft,dL3,#4                   ;// [L3|L3|L3|L3|L3|L2|L1|L0]
514        VEXT    dLeftHU1,dLeft,dL3,#5                   ;// [L3|L3|L3|L3|L3|L3|L2|L1]
515        VEXT    dLeftHU2,dLeft,dL3,#6                   ;// [L3|L3|L3|L3|L3|L3|L3|L2]
516
517        VHADD   dTmp0, dLeftHU0, dLeftHU2
518        VRHADD  dTmp0, dTmp0, dLeftHU1                  ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 5 | 3 | 1 ]
519
520        VRHADD  dTmp1, dLeftHU1, dLeftHU0               ;// (a+b+1)>>1
521                                                        ;//  Tmp[ L3 | L3 | L3 | L3 | L3 | 4 | 2 | 0 ]
522
523        VZIP    dTmp1,dTmp0                             ;// dTmp1 = Tmp[7| 6| 5| 4| 3| 2| 1| 0]
524                                                        ;// dTmp0 = [L3|L3|L3|L3|L3|L3|L3|L3]
525
526        ;// Each row starts two bytes further along dTmp1; the last row comes from the all-L3 register
527        VST1    dTmp1U32[0],[pDst],dstStep              ;// [3|2|1|0]
528        VEXT    dTmp1,dTmp1,dTmp1,#2
529        VST1    dTmp1U32[0],[pDst],dstStep              ;// [5|4|3|2]
530        VEXT    dTmp1,dTmp1,dTmp1,#2
531        VST1    dTmp1U32[0],[pDst],dstStep              ;// [7|6|5|4]
532        VST1    dTmp0U32[0],[pDst]                      ;// [9|8|7|6]; no branch follows: execution falls into the exit code
533
534
535ExitPredict4x4
536        ;// Common exit: every prediction case branches here or falls through into it
537        MOV      return,  #OMX_Sts_NoErr                ;// status value left in r0 for the caller
538        M_END                                           ;// macro not defined in this file; presumably the epilogue paired with M_START - confirm
539
540        ENDIF ;// CortexA8
541
542        END
543;//-----------------------------------------------------------------------------------------------
544;// omxVCM4P10_PredictIntra_4x4 ends
545;//-----------------------------------------------------------------------------------------------
546