omxVCM4P10_PredictIntra_16x16_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
1;//
2;//
3;// File Name:  omxVCM4P10_PredictIntra_16x16_s.s
4;// OpenMAX DL: v1.0.2
5;// Revision:   9641
6;// Date:       Thursday, February 7, 2008
7;//
8;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
9;//
10;//
11;//
12
13        INCLUDE omxtypes_s.h
14        INCLUDE armCOMM_s.h
15
16        M_VARIANTS ARM1136JS
17
18;//-------------------------------------------------------
19;// Jump table emulating a C switch statement: word N holds
20;// the address of the handler run when predMode == N.
21;//-------------------------------------------------------
22
23    M_TABLE armVCM4P10_pIndexTable16x16
24    DCD  OMX_VC_16X16_VERT                       ;// entry 0
    DCD  OMX_VC_16X16_HOR                        ;// entry 1
25    DCD  OMX_VC_16X16_DC                         ;// entry 2
    DCD  OMX_VC_16X16_PLANE                      ;// entry 3
26
27    IF ARM1136JS
28
29;//--------------------------------------------
30;// Numeric constants
31;//--------------------------------------------
32BLK_SIZE        EQU 16            ;// block height in rows; initial value of the row counter
33MUL_CONST0      EQU 0x01010101    ;// multiplying a byte by this copies it into all four bytes
34MUL_CONST1      EQU 0x00060004    ;// packed halfword pair {6, 4}; not referenced in the visible code
35MUL_CONST2      EQU 0x00070005    ;// packed halfword pair {7, 5}; not referenced in the visible code
36MUL_CONST3      EQU 0x00030001    ;// packed halfword pair {3, 1}; not referenced in the visible code
37MASK_CONST      EQU 0x00FF00FF    ;// keeps the low byte of each halfword lane
38
39;//--------------------------------------------
40;// Scratch register aliases.
;// Several names intentionally map to the same physical
;// register; each prediction case only uses the subset it
;// needs, so check for overlap before adding a new use.
41;//--------------------------------------------
42y               RN r12   ;// row counter of the VERT / HOR / DC store loops
43pc              RN r15   ;// program counter, target of the table dispatch
44
45return          RN r0    ;// set to OMX_Sts_NoErr before every exit
46innerCount      RN r0    ;// not referenced in the visible code
47outerCount      RN r1    ;// not referenced in the visible code
48pSrcLeft2       RN r1    ;// second left-column pointer (DC case)
49pDst2           RN r2    ;// second destination row pointer
50sum             RN r6    ;// DC accumulator
51pTable          RN r9    ;// address of the dispatch table
52temp1           RN r10   ;// PLANE loop scratch
53temp2           RN r12   ;// PLANE loop scratch
54cMul1           RN r11   ;// not referenced in the visible code
55cMul2           RN r12   ;// not referenced in the visible code
56count           RN r12   ;// DC: number of available neighbour edges
57dstStepx2       RN r11   ;// destination stride covering two rows
58leftStepx2      RN r14   ;// 2 * leftStep (DC case)
59r0x01010101     RN r10   ;// holds MUL_CONST0 (HOR case)
60r0x00FF00FF     RN r11   ;// holds MASK_CONST (PLANE case)
61
62tVal0           RN r0
63tVal1           RN r1
64tVal2           RN r2
65tVal3           RN r3
66tVal4           RN r4
67tVal5           RN r5
68tVal6           RN r6
69tVal7           RN r7
70tVal8           RN r8
71tVal9           RN r9
72tVal10          RN r10
73tVal11          RN r11
74tVal12          RN r12
75tVal14          RN r14
76
77b               RN r12   ;// PLANE horizontal gradient
78c               RN r14   ;// PLANE vertical gradient (packed {c, c} in the loop)
79
80p2p0            RN r0    ;// PLANE: packed pixel pairs {p[x+2], p[x]} per name
81p3p1            RN r1
82p6p4            RN r2
83p7p5            RN r4
84p10p8           RN r6
85p11p9           RN r7
86p14p12          RN r8
87p15p13          RN r9
88
89p3210           RN r10   ;// not referenced in the visible code
90p7654           RN r10   ;// not referenced in the visible code
91p111098         RN r10   ;// not referenced in the visible code
92p15141312       RN r10   ;// not referenced in the visible code
93
94;//--------------------------------------------
95;// Function arguments (r0-r3 on entry, r4-r7 after
;// being loaded from the stack by the entry code)
96;//--------------------------------------------
97pSrcLeft        RN r0    ;// input pointer
98pSrcAbove       RN r1    ;// input pointer
99pSrcAboveLeft   RN r2    ;// input pointer
100pDst            RN r3    ;// output pointer
101leftStep        RN r4    ;// input variable
102dstStep         RN r5    ;// input variable
103predMode        RN r6    ;// input variable
104availability    RN r7    ;// input variable
105
106;//-----------------------------------------------------------------------------------------------
107;// omxVCM4P10_PredictIntra_16x16 starts
;//
;// Writes a 16x16 predicted block to pDst using the prediction case selected
;// by predMode.
;//
;// Register arguments : pSrcLeft, pSrcAbove, pSrcAboveLeft, pDst
;// Stack arguments    : leftStep, dstStep, predMode, availability (4 bytes each)
;// Return             : every case sets the return register to OMX_Sts_NoErr
;//
;// NOTE(review): predMode is used as a table index without any range check,
;// and availability is only examined by the DC case. Callers presumably have
;// to guarantee valid arguments - confirm against the API contract.
108;//-----------------------------------------------------------------------------------------------
109
110        ;// Write function header (the r11 operand presumably asks the macro
        ;// to preserve registers up to r11 - see armCOMM_s.h to confirm)
111        M_START omxVCM4P10_PredictIntra_16x16, r11
112
113        ;// Define stack arguments, in the order they are passed
114        M_ARG    LeftStep,     4
115        M_ARG    DstStep,      4
116        M_ARG    PredMode,     4
117        M_ARG    Availability, 4
118
119        ;// M_STALL ARM1136JS=4
120
121        LDR      pTable,=armVCM4P10_pIndexTable16x16 ;// Address of the per-mode jump table
122
123        ;// Load the stack arguments into their registers
124        M_LDR    predMode, PredMode                  ;// predMode selects the table entry below
125        M_LDR    leftStep, LeftStep                  ;// byte stride between left-column pixels
126        M_LDR    dstStep,  DstStep                   ;// byte stride between destination rows
127        M_LDR    availability, Availability          ;// neighbour availability flags (tested by DC)
128
129        MOV      y, #BLK_SIZE                        ;// Default row count; VERT relies on it, HOR/DC reload y
130        LDR      pc, [pTable, predMode, LSL #2]      ;// Jump to table[predMode] (4-byte entries)
131
132OMX_VC_16X16_VERT
        ;// Vertical prediction: the 16 bytes at pSrcAbove are copied into
        ;// each of the 16 destination rows, two rows per loop iteration.
        ;// y still holds BLK_SIZE from the dispatch code.
133        ADD      pDst2, pDst, dstStep                ;// pDst2 = second row of each row pair
134        MOV      dstStepx2, dstStep, LSL #1          ;// stride covering two rows
135        LDM      pSrcAbove, {tVal6,tVal7,tVal8,tVal9};// fetch the 16 source bytes once
136
137        ;// M_STALL ARM1136JS=2                       ;// Stall outside the loop
138
139LOOP_VERT
140        STM      pDst,  {tVal6,tVal7,tVal8,tVal9}    ;// even row
141        STM      pDst2, {tVal6,tVal7,tVal8,tVal9}    ;// odd row
142        ADD      pDst,  pDst,  dstStepx2             ;// both pointers move down two rows
143        ADD      pDst2, pDst2, dstStepx2
144        SUBS     y, y, #2                            ;// two rows done
145        BNE      LOOP_VERT                           ;// 8 iterations in total
146        MOV      return, #OMX_Sts_NoErr
147        M_EXIT
148
149
150OMX_VC_16X16_HOR
        ;// Horizontal prediction: every destination row is filled with the
        ;// left-neighbour pixel of that row. Each loop iteration reads four
        ;// left pixels (stepping pSrcLeft by leftStep) and writes four rows:
        ;// pDst handles rows 0 and 2 of the group, pDst2 rows 1 and 3.
        ;//
        ;// All four pixels are loaded at the top of the iteration. The
        ;// previous software-pipelined form preloaded the next two pixels at
        ;// the end of every iteration, so the final pass read
        ;// pSrcLeft[16*leftStep] and pSrcLeft[17*leftStep], outside the
        ;// 16-pixel column.
151
152        ;// M_STALL ARM1136JS=6
153
154        LDR      r0x01010101, =MUL_CONST0            ;// Const to repeat the byte in reg 4 times
155        MOV      y, #4                               ;// 4 iterations, 4 rows each
157        ADD      pDst2, pDst, dstStep                ;// pDst2 = pDst advanced by one row
159        ADD      dstStepx2, dstStep, dstStep         ;// double dstStep
160        SUB      dstStepx2, dstStepx2, #12           ;// minus the 12 bytes already stepped per row
161
162LOOP_HOR
        M_LDRB   tVal6, [pSrcLeft], +leftStep        ;// tVal6 = left pixel of row 4n
        M_LDRB   tVal7, [pSrcLeft], +leftStep        ;// tVal7 = left pixel of row 4n+1
163        M_LDRB   tVal8, [pSrcLeft], +leftStep        ;// tVal8 = left pixel of row 4n+2
164        MUL      tVal6, tVal6, r0x01010101           ;// replicate the val in all the bytes
165        M_LDRB   tVal9, [pSrcLeft], +leftStep        ;// tVal9 = left pixel of row 4n+3
166        MUL      tVal7, tVal7, r0x01010101           ;// replicate the val in all the bytes
167        SUBS     y, y, #1                            ;// y--; nothing below changes the flags
168        STR      tVal6, [pDst],  #+4                 ;// row 4n,   bytes 0 to 3
169        STR      tVal7, [pDst2], #+4                 ;// row 4n+1, bytes 0 to 3
170        STR      tVal6, [pDst],  #+4                 ;// row 4n,   bytes 4 to 7
171        STR      tVal7, [pDst2], #+4                 ;// row 4n+1, bytes 4 to 7
172        MUL      tVal8, tVal8, r0x01010101           ;// replicate the val in all the bytes
173        STR      tVal6, [pDst],  #+4                 ;// row 4n,   bytes 8 to 11
174        STR      tVal7, [pDst2], #+4                 ;// row 4n+1, bytes 8 to 11
175        MUL      tVal9, tVal9, r0x01010101           ;// replicate the val in all the bytes
176        M_STR    tVal6, [pDst], dstStepx2            ;// row 4n,   bytes 12 to 15; pDst  -> row 4n+2
177        M_STR    tVal7, [pDst2], dstStepx2           ;// row 4n+1, bytes 12 to 15; pDst2 -> row 4n+3
178        STR      tVal8, [pDst],  #+4                 ;// row 4n+2, bytes 0 to 3
179        STR      tVal9, [pDst2], #+4                 ;// row 4n+3, bytes 0 to 3
180        STR      tVal8, [pDst],  #+4                 ;// row 4n+2, bytes 4 to 7
181        STR      tVal9, [pDst2], #+4                 ;// row 4n+3, bytes 4 to 7
182        STR      tVal8, [pDst],  #+4                 ;// row 4n+2, bytes 8 to 11
183        STR      tVal9, [pDst2], #+4                 ;// row 4n+3, bytes 8 to 11
184        M_STR    tVal8, [pDst], dstStepx2            ;// row 4n+2, bytes 12 to 15; pDst  -> next group
186        M_STR    tVal9, [pDst2], dstStepx2           ;// row 4n+3, bytes 12 to 15; pDst2 -> next group
188        BNE      LOOP_HOR                            ;// Loop 4 times (flags from SUBS above)
189        MOV      return, #OMX_Sts_NoErr
190        M_EXIT
191
192OMX_VC_16X16_DC
        ;// DC prediction: the whole block is filled with one value.
        ;//   neither edge available : 128
        ;//   one edge available     : (sum of its 16 pixels + 8)  >> 4
        ;//   both edges available   : (sum of all 32 pixels + 16) >> 5
        ;// count tracks how many edges were summed. The upper sum is kept in
        ;// tVal2; the left sum is built in sum, overwriting the copy of the
        ;// upper sum placed there for the upper-only case.
193
194        ;// M_STALL ARM1136JS=2
195
196        MOV      count, #0                           ;// count = 0 edges summed so far
197        TST      availability, #OMX_VC_UPPER         ;// if(availability & #OMX_VC_UPPER)
198        BEQ      TST_LEFT                            ;// Jump to Left if not upper
199        LDM      pSrcAbove,{tVal8,tVal9,tVal10,tVal11};// tVal 8 to 11 = the 16 bytes at pSrcAbove
200        ADD      count, count, #1                    ;// if upper inc count by 1
201
202        ;// M_STALL ARM1136JS=2
203
        ;// Pairwise reduction of the 16 bytes in two 16-bit lanes. Which
        ;// byte lands in which lane does not matter because everything
        ;// is summed in the end.
204        UXTB16   tVal2, tVal8                        ;// bytes 0 and 2 of word 0
205        UXTB16   tVal6, tVal9                        ;// bytes 0 and 2 of word 1
206        UADD16   tVal2, tVal2, tVal6                 ;// lane-wise sum of the two
207        UXTB16   tVal8, tVal8, ROR #8                ;// bytes 1 and 3 of word 0
208        UXTB16   tVal9, tVal9, ROR #8                ;// bytes 1 and 3 of word 1
209        UADD16   tVal8, tVal8, tVal9                 ;// lane-wise sum of the two
210        UADD16   tVal2, tVal2, tVal8                 ;// words 0-1 summed, split over two lanes
211
212        UXTB16   tVal8, tVal10                       ;// bytes 0 and 2 of word 2
213        UXTB16   tVal9, tVal11                       ;// bytes 0 and 2 of word 3
214        UADD16   tVal8, tVal8, tVal9                 ;// lane-wise sum of the two
215        UXTB16   tVal10, tVal10, ROR #8              ;// bytes 1 and 3 of word 2
216        UXTB16   tVal11, tVal11, ROR #8              ;// bytes 1 and 3 of word 3
217        UADD16   tVal10, tVal10, tVal11              ;// lane-wise sum of the two
218        UADD16   tVal8, tVal8, tVal10                ;// words 2-3 summed, split over two lanes
219
220        UADD16   tVal2, tVal2, tVal8                 ;// all 16 bytes summed, split over two lanes
221
222        ;// M_STALL ARM1136JS=1
223
224        ADD      tVal2, tVal2, tVal2, LSR #16        ;// low halfword = full upper sum (high halfword is junk)
225
226        ;// M_STALL ARM1136JS=1
227
228        UXTH     sum, tVal2                          ;// sum = upper sum (used as is when left is unavailable)
229
230TST_LEFT
231        TST      availability, #OMX_VC_LEFT          ;// if(availability & #OMX_VC_LEFT)
232        BEQ      TST_COUNT                           ;// skip the left sum if not available
233        ADD      leftStepx2, leftStep,leftStep       ;// leftStepx2 = 2 * leftStep
234        ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep (reuses pSrcAbove's register)
235
        ;// Left pixels are read through two interleaved pointers, each
        ;// stepping two rows at a time: pSrcLeft reads even rows,
        ;// pSrcLeft2 odd rows.
236        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = left pixel, row 0
237        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = left pixel, row 1
238        M_LDRB   tVal10, [pSrcLeft], +leftStepx2     ;// tVal10= left pixel, row 2
239        M_LDRB   tVal11, [pSrcLeft2],+leftStepx2     ;// tVal11= left pixel, row 3
240        ADD      tVal7, tVal8, tVal9                 ;// tVal7 = rows 0+1
241        ADD      count, count, #1                    ;// Inc Counter if Left is available
242        ADD      tVal6, tVal10, tVal11               ;// tVal6 = rows 2+3 (overwrites sum)
243
244        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = left pixel, row 4
245        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = left pixel, row 5
246        M_LDRB   tVal10, [pSrcLeft], +leftStepx2     ;// tVal10= left pixel, row 6
247        M_LDRB   tVal11, [pSrcLeft2],+leftStepx2     ;// tVal11= left pixel, row 7
248        ADD      sum, tVal7, tVal6                   ;// sum = rows 0..3
249        ADD      tVal8, tVal8, tVal9                 ;// tVal8 = rows 4+5
250        ADD      tVal10, tVal10, tVal11              ;// tVal10= rows 6+7
251        ADD      tVal7, tVal8, tVal10                ;// tVal7 = rows 4..7
252
253
254        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = left pixel, row 8
255        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = left pixel, row 9
256        M_LDRB   tVal10, [pSrcLeft], +leftStepx2     ;// tVal10= left pixel, row 10
257        M_LDRB   tVal11, [pSrcLeft2],+leftStepx2     ;// tVal11= left pixel, row 11
258        ADD      sum, sum, tVal7                     ;// sum = rows 0..7
259        ADD      tVal8, tVal8, tVal9                 ;// tVal8 = rows 8+9
260        ADD      tVal10, tVal10, tVal11              ;// tVal10= rows 10+11
261        ADD      tVal7, tVal8, tVal10                ;// tVal7 = rows 8..11
262
263
264        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = left pixel, row 12
265        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = left pixel, row 13
266        M_LDRB   tVal10, [pSrcLeft], +leftStepx2     ;// tVal10= left pixel, row 14
267        M_LDRB   tVal11, [pSrcLeft2],+leftStepx2     ;// tVal11= left pixel, row 15
268        ADD      sum, sum, tVal7                     ;// sum = rows 0..11
269        ADD      tVal8, tVal8, tVal9                 ;// tVal8 = rows 12+13
270        ADD      tVal10, tVal10, tVal11              ;// tVal10= rows 14+15
271        ADD      tVal7, tVal8, tVal10                ;// tVal7 = rows 12..15
272        ADD      sum, sum, tVal7                     ;// sum = all 16 left pixels
273
274TST_COUNT
275        CMP      count, #0                           ;// if(count == 0)
276        MOVEQ    sum, #128                           ;// sum = 128 if(count == 0)
277        BEQ      TST_COUNT0                          ;// if(count == 0)
278        CMP      count, #1                           ;// EQ: one edge, NE: both edges; flags are kept until the shifts below
279        ADDEQ    sum, sum, #8                        ;// sum += 8 if(count == 1)
280        ADDNE    sum, sum, tVal2                     ;// sum = sumleft + sumupper (tVal2's junk high half is masked below)
281        ADDNE    sum, sum, #16                       ;// sum += 16 if(count == 2)
282
283        ;// M_STALL ARM1136JS=1
284
285        UXTH     sum, sum                            ;// keep the low halfword only; does not alter the flags
286
287        ;// M_STALL ARM1136JS=1
288
289        LSREQ    sum, sum, #4                        ;// sum >> 4 if(count == 1)
290
291        ;// M_STALL ARM1136JS=1
292
293        LSRNE    sum, sum, #5                        ;// sum >> 5 if(count == 2)
294
295TST_COUNT0
296
297        ;// M_STALL ARM1136JS=1
298
299        ORR      sum, sum, sum, LSL #8               ;// DC byte replicated into bytes 0 and 1
300
301        ;// M_STALL ARM1136JS=1
302
303        ORR      tVal6, sum, sum, LSL #16            ;// DC byte replicated in all four bytes
304        CPY      tVal7, tVal6                        ;// tVal7 = tVal6
305        CPY      tVal8, tVal6                        ;// tVal8 = tVal6
306        CPY      tVal9, tVal6                        ;// tVal9 = tVal6
307        ADD      dstStepx2, dstStep, dstStep         ;// double dstStep
308        ADD      pDst2, pDst, dstStep                ;// pDst2 = pDst advanced by one row
309        MOV      y, #BLK_SIZE                        ;// reload the row counter (count shared its register)
310
311LOOP_DC
312        STM      pDst, {tVal6,tVal7,tVal8,tVal9}     ;// even row: 16 bytes of the DC value
313        SUBS     y, y, #2                            ;// two rows per iteration
314        ADD      pDst, pDst, dstStepx2               ;// pDst advanced by two rows
315        STM      pDst2, {tVal6,tVal7,tVal8,tVal9}    ;// odd row: 16 bytes of the DC value
316        ADD      pDst2, pDst2, dstStepx2             ;// pDst2 advanced by two rows
317        BNE      LOOP_DC                             ;// Loop for 8 times
318
319        MOV      return, #OMX_Sts_NoErr
320        M_EXIT
321
322OMX_VC_16X16_PLANE
        ;// Plane prediction. With above[-1] = left[-1] = pSrcAboveLeft[0]:
        ;//   H = sum over k=1..8 of k * (above[7+k] - above[7-k])
        ;//   V = sum over k=1..8 of k * (left[7+k]  - left[7-k])
        ;//   a = 16 * (above[15] + left[15])
        ;//   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
        ;//   pred[y][x] = clip255((a + 16 + b*(x-7) + c*(y-7)) >> 5)
        ;// Row 0 is built as eight packed pairs of signed 16-bit values
        ;// (d + x*b, d = a + 16 - 7*b - 7*c); each loop iteration clips and
        ;// stores one row, then adds c to every lane to step to the next row.
323
324        ;// M_STALL ARM1136JS=3
325        RSB      tVal14, leftStep, leftStep, LSL #4  ;// tVal14 = 15*leftStep
326
327        ;// M_STALL ARM1136JS=2
328        LDRB     tVal10, [pSrcLeft,  tVal14]         ;// tVal10 = pSrcLeft[15*leftStep]
329        LDRB     tVal11, [pSrcAboveLeft]             ;// tVal11 = pSrcAboveLeft[0]
330        LDRB     tVal12, [pSrcAbove, #15]            ;// tVal12 = pSrcAbove[15]
331
332        ADD      tVal2,  tVal12,  tVal10             ;// tVal2  = pSrcAbove[15] + pSrcLeft[15*leftStep]
333        SUB      tVal10, tVal10,  tVal11             ;// tVal10 = V0 = pSrcLeft[15*leftStep] - pSrcAboveLeft[0]
334        SUB      tVal11, tVal12,  tVal11             ;// tVal11 = H0 = pSrcAbove[15] - pSrcAboveLeft[0]
335        MOV      tVal2,  tVal2,   LSL #4             ;// tVal2  = a = 16 * (pSrcAbove[15] + pSrcLeft[15*leftStep])
336
        ;// Accumulate H in tVal11, one weighted difference at a time
337        MOV     tVal11, tVal11, LSL #3              ;// H  = 8*(above[15] - aboveLeft)
338        LDRB    tVal6, [pSrcAbove, #0]
339        LDRB    tVal7, [pSrcAbove, #14]
340        SUB     tVal8, tVal7, tVal6
341        RSB     tVal8, tVal8, tVal8, LSL #3         ;// 7*(above[14] - above[0])
342        ADD     tVal11, tVal11, tVal8
343        LDRB    tVal6, [pSrcAbove, #1]
344        LDRB    tVal7, [pSrcAbove, #13]
345        SUB     tVal8, tVal7, tVal6
346        ADD     tVal8, tVal8, tVal8                 ;// 2*(above[13] - above[1])
347        ADD     tVal8, tVal8, tVal8, LSL #1         ;// 6*(above[13] - above[1])
348        ADD     tVal11, tVal11, tVal8
349        LDRB    tVal6, [pSrcAbove, #2]
350        LDRB    tVal7, [pSrcAbove, #12]
351        SUB     tVal8, tVal7, tVal6
352        ADD     tVal8, tVal8, tVal8, LSL #2         ;// 5*(above[12] - above[2])
353        ADD     tVal11, tVal11, tVal8
354        LDRB    tVal6, [pSrcAbove, #3]
355        LDRB    tVal7, [pSrcAbove, #11]
356        SUB     tVal8, tVal7, tVal6
357        ADD     tVal11, tVal11, tVal8, LSL #2       ;// + 4*(above[11] - above[3])
358        LDRB    tVal6, [pSrcAbove, #4]
359        LDRB    tVal7, [pSrcAbove, #10]
360        SUB     tVal8, tVal7, tVal6
361        ADD     tVal8, tVal8, tVal8, LSL #1         ;// 3*(above[10] - above[4])
362        ADD     tVal11, tVal11, tVal8
363        LDRB    tVal6, [pSrcAbove, #5]
364        LDRB    tVal7, [pSrcAbove, #9]
365        SUB     tVal8, tVal7, tVal6
366        ADD     tVal11, tVal11, tVal8, LSL #1       ;// + 2*(above[9] - above[5])
367        LDRB    tVal6, [pSrcAbove, #6]
368        LDRB    tVal7, [pSrcAbove, #8]
369        SUB     tVal8, tVal7, tVal6                 ;// 1*(above[8] - above[6])
370        ADD     tVal7, tVal11, tVal8                ;// tVal7 = H
371
372        ADD      tVal2,  tVal2,   #16                ;// tVal2  = a + 16
373        MOV      tVal1,  pSrcLeft                    ;// tVal1  = pSrcLeft, walks down from row 0 (pSrcAbove is no longer needed)
374        SUB      tVal9,  tVal14,   leftStep          ;// tVal9  = 14*leftStep
375        ADD      tVal9,  pSrcLeft, tVal9             ;// tVal9  = pSrcLeft + 14*leftStep, walks up from row 14
376
377        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[14*leftStep]
378        M_LDRB   tVal11, [tVal1], +leftStep          ;// tVal11 = pSrcLeft[0]
379        ADD      tVal7,  tVal7,  tVal7,  LSL #2      ;// tVal7  = 5 * H
380        ADD      tVal7,  tVal7,  #32                 ;// tVal7  = 5 * H + 32
381        SUB      tVal8,  tVal8,  tVal11              ;// tVal8  = pSrcLeft[14*leftStep] - pSrcLeft[0]
382        ASR      tVal12, tVal7,  #6                  ;// tVal12 = b = (5 * H + 32) >> 6
383
384        RSB      tVal8,  tVal8,  tVal8,  LSL #3      ;// tVal8  = 7 * (pSrcLeft[14*leftStep] - pSrcLeft[0])
385        ADD      tVal6,  tVal8,  tVal10, LSL #3      ;// tVal6  = V = 8*V0 + 7*(left[14] - left[0])
386        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[13*leftStep]
387        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[leftStep]
388        RSB      tVal7,  tVal12,  tVal12,  LSL #3    ;// tVal7  = 7*b
389        SUB      tVal2,  tVal2,   tVal7              ;// tVal2  = a + 16 - 7*b
390        SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[13*leftStep] - pSrcLeft[leftStep]
391        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[12*leftStep]
392        ADD      tVal7,  tVal7,   tVal7              ;// tVal7  = 2 * (pSrcLeft[13*leftStep] - pSrcLeft[leftStep])
393        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[2*leftStep]
394        ADD      tVal7,  tVal7,   tVal7,  LSL #1     ;// tVal7  = 6 * (pSrcLeft[13*leftStep] - pSrcLeft[leftStep])
395        ADD      tVal6,  tVal6,   tVal7              ;// V += 6 * (left[13] - left[1])
396        SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[12*leftStep] - pSrcLeft[2*leftStep]
397        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[11*leftStep]
398        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[3*leftStep]
399        ADD      tVal7,  tVal7,   tVal7,  LSL #2     ;// tVal7  = 5 * (pSrcLeft[12*leftStep] - pSrcLeft[2*leftStep])
400        ADD      tVal6,  tVal6,   tVal7              ;// V += 5 * (left[12] - left[2])
401        SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[11*leftStep] - pSrcLeft[3*leftStep]
402        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[10*leftStep]
403        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[4*leftStep]
404        ADD      tVal6,  tVal6,   tVal7,  LSL #2     ;// V += 4 * (left[11] - left[3])
405        SUB      dstStep, dstStep, #16               ;// dstStep -= 16: each row's four stores already advance pDst by 16
406        SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[10*leftStep] - pSrcLeft[4*leftStep]
407        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[9*leftStep]
408        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[5*leftStep]
409        ADD      tVal7,  tVal7,   tVal7,  LSL #1     ;// tVal7  = 3 * (pSrcLeft[10*leftStep] - pSrcLeft[4*leftStep])
410        ADD      tVal6,  tVal6,   tVal7              ;// V += 3 * (left[10] - left[4])
411        SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[9*leftStep] - pSrcLeft[5*leftStep]
412        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[8*leftStep]
413        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[6*leftStep]
414        ADD      tVal6,  tVal6,   tVal7,  LSL #1     ;// V += 2 * (left[9] - left[5])
415
416        ;// M_STALL ARM1136JS=1
417        SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[8*leftStep] - pSrcLeft[6*leftStep]
418        ADD      tVal6,  tVal6,   tVal7              ;// V += 1 * (left[8] - left[6]); tVal6 = V
419
420        ;// M_STALL ARM1136JS=1
421        ADD      tVal6,  tVal6,   tVal6,  LSL #2     ;// tVal6  = 5*V
422        ADD      tVal6,  tVal6,   #32                ;// tVal6  = 5*V + 32
423
424        ;// M_STALL ARM1136JS=1
425        ASR      tVal14, tVal6,   #6                 ;// tVal14 = c = (5*V + 32)>>6
426
427        ;// M_STALL ARM1136JS=1
428        RSB      tVal6,  tVal14,  tVal14, LSL #3     ;// tVal6  = 7*c
429        UXTH     tVal14, tVal14                      ;// tVal14 = c with the upper halfword cleared
430        ADD      tVal10, tVal12,  tVal12             ;// tVal10 = 2*b
431        ORR      tVal14, tVal14,  tVal14, LSL #16    ;// tVal14 = {c  ,  c}
432        SUB      tVal6,  tVal2,   tVal6              ;// tVal6  = d = a - 7*b - 7*c + 16 (may be negative)
433        ADD      tVal1,  tVal6,   tVal10             ;// tVal1  = p2 = d + 2*b
434        ADD      tVal10, tVal10,  tVal12             ;// tVal10 = 3*b
        ;// Pack with PKHBT so that only the low halfword of d is used.
        ;// An ORR here would merge d's sign-extension bits into the
        ;// upper lane and turn p2 into 0xFFFF whenever d is negative.
435        PKHBT    tVal0,  tVal6,   tVal1,  LSL #16    ;// tVal0  = p2p0   = pack {p2, p0}
436        UXTH     tVal12, tVal12                      ;// tVal12 = b with the upper halfword cleared
437        UXTH     tVal10, tVal10                      ;// tVal10 = 3*b with the upper halfword cleared
438        ORR      tVal12, tVal12,  tVal12, LSL #16    ;// tVal12 = {b  ,  b}
439        ORR      tVal10, tVal10,  tVal10, LSL #16    ;// tVal10 = {3b , 3b}
440        SADD16   tVal1,  tVal0,   tVal12             ;// tVal1  = p3p1   = p2p0   + {b,b}
441        SADD16   tVal2,  tVal1,   tVal10             ;// tVal2  = p6p4   = p3p1   + {3b,3b}
442        SADD16   tVal4,  tVal2,   tVal12             ;// tVal4  = p7p5   = p6p4   + {b,b}
443        SADD16   tVal6,  tVal4,   tVal10             ;// tVal6  = p10p8  = p7p5   + {3b,3b}
444        SADD16   tVal7,  tVal6,   tVal12             ;// tVal7  = p11p9  = p10p8  + {b,b}
445        SADD16   tVal8,  tVal7,   tVal10             ;// tVal8  = p14p12 = p11p9  + {3b,3b}
446        SADD16   tVal9,  tVal8,   tVal12             ;// tVal9  = p15p13 = p14p12 + {b,b}
447        LDR      r0x00FF00FF,     =MASK_CONST        ;// r0x00FF00FF = 0x00FF00FF; its top 4 bits double as the row counter
448
449LOOP_PLANE
        ;// Per group of four pixels: saturate each lane to 0..8191
        ;// (USAT16 #13), shift right by 5 and mask to get 0..255, step
        ;// the lanes to the next row (+c), then interleave the even and
        ;// odd pixels into one word and store it.
450
451        USAT16   temp2, #13, p3p1
452        USAT16   temp1, #13, p2p0
453        SADD16   p3p1,   p3p1,   c
454        SADD16   p2p0,   p2p0,   c
455        AND      temp2, r0x00FF00FF, temp2, ASR #5
456        AND      temp1, r0x00FF00FF, temp1, ASR #5
457        ORR      temp1, temp1, temp2, LSL #8         ;// bytes = p0, p1, p2, p3
458        STR      temp1, [pDst], #4
459
460        USAT16   temp2, #13, p7p5
461        USAT16   temp1, #13, p6p4
462        SADD16   p7p5,   p7p5,   c
463        SADD16   p6p4,   p6p4,   c
464        AND      temp2, r0x00FF00FF, temp2, ASR #5
465        AND      temp1, r0x00FF00FF, temp1, ASR #5
466        ORR      temp1, temp1, temp2, LSL #8         ;// bytes = p4, p5, p6, p7
467        STR      temp1, [pDst], #4
468
469        USAT16   temp2, #13, p11p9
470        USAT16   temp1, #13, p10p8
471        SADD16   p11p9,  p11p9,  c
472        SADD16   p10p8,  p10p8,  c
473        AND      temp2, r0x00FF00FF, temp2, ASR #5
474        AND      temp1, r0x00FF00FF, temp1, ASR #5
475        ORR      temp1, temp1, temp2, LSL #8         ;// bytes = p8, p9, p10, p11
476        STR      temp1, [pDst], #4
477
478        USAT16   temp2, #13, p15p13
479        USAT16   temp1, #13, p14p12
480        SADD16   p15p13, p15p13, c
481        SADD16   p14p12, p14p12, c
482        AND      temp2, r0x00FF00FF, temp2, ASR #5
483        AND      temp1, r0x00FF00FF, temp1, ASR #5
484        ORR      temp1, temp1, temp2, LSL #8         ;// bytes = p12, p13, p14, p15
485        STR      temp1, [pDst], #4
486
487        ADDS     r0x00FF00FF, r0x00FF00FF, #1<<28     ;// Row counter in the top 4 bits; carry is set on the 16th add
488
489        ADD      pDst, pDst, dstStep                  ;// to the start of the next row (flags untouched)
490
491        BCC      LOOP_PLANE                           ;// Loop for 16 times
492        MOV      return, #OMX_Sts_NoErr
493        M_END
494
495        ENDIF ;// ARM1136JS
496
497
498        END
499;-----------------------------------------------------------------------------------------------
500; omxVCM4P10_PredictIntra_16x16 ends
501;-----------------------------------------------------------------------------------------------
502