omxVCM4P10_PredictIntra_16x16_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  omxVCM4P10_PredictIntra_16x16_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   9641
21;// Date:       Thursday, February 7, 2008
22;//
23;//
24;//
25;//
26
27        INCLUDE omxtypes_s.h
28        INCLUDE armCOMM_s.h
29
30        M_VARIANTS ARM1136JS
31
32;//-------------------------------------------------------
33;// Jump table implementing a C-style switch statement in
34;// assembly by two levels of indexing: predMode indexes this
;// table of case-label addresses, and the selected entry is
;// loaded straight into pc by the function's dispatch
;// instruction (LDR pc, [pTable, predMode, LSL #2]).
;// Entry order: 0 = VERT, 1 = HOR, 2 = DC, 3 = PLANE.
35;//-------------------------------------------------------
36
37    M_TABLE armVCM4P10_pIndexTable16x16
38    DCD  OMX_VC_16X16_VERT, OMX_VC_16X16_HOR
39    DCD  OMX_VC_16X16_DC,   OMX_VC_16X16_PLANE
40
41    IF ARM1136JS
;// Everything from here to the matching ENDIF is assembled only when
;// the ARM1136JS variant symbol is true.
42
43;//--------------------------------------------
44;// Constants
;//   BLK_SIZE    - initial value of the row counter y (16 rows)
;//   MUL_CONST0  - multiplying a byte by this replicates it into all
;//                 four bytes of a word (used by the horizontal case)
;//   MUL_CONST1..3 - not referenced by the code visible in this file
;//   MASK_CONST  - keeps the low byte of each halfword (used by the
;//                 plane case when narrowing 16-bit lanes to bytes)
45;//--------------------------------------------
46BLK_SIZE        EQU 0x10
47MUL_CONST0      EQU 0x01010101
48MUL_CONST1      EQU 0x00060004
49MUL_CONST2      EQU 0x00070005
50MUL_CONST3      EQU 0x00030001
51MASK_CONST      EQU 0x00FF00FF
52
53;//--------------------------------------------
54;// Scratch variable
;//
;// NOTE: many of the names below alias the SAME physical register
;// (e.g. y / temp2 / count / b / tVal12 are all r12, and the tValN
;// names overlap the input-argument names declared further down).
;// An alias may only be used once every other alias of that register
;// is dead; check the RN numbers before adding a new use.
55;//--------------------------------------------
56y               RN 12   ;// row / loop counter
57pc              RN 15   ;// program counter, written by the table dispatch
58
59return          RN 0    ;// status value written just before exit
60innerCount      RN 0
61outerCount      RN 1
62pSrcLeft2       RN 1    ;// DC case: second left-column pointer (odd rows)
63pDst2           RN 2    ;// second destination pointer (odd rows)
64sum             RN 6    ;// DC case: running sum / DC value
65pTable          RN 9    ;// address of armVCM4P10_pIndexTable16x16
66temp1           RN 10   ;// plane loop scratch
67temp2           RN 12   ;// plane loop scratch
68cMul1           RN 11
69cMul2           RN 12
70count           RN 12   ;// DC case: number of available edges
71dstStepx2       RN 11   ;// 2 * dstStep (minus 12 in the horizontal case)
72leftStepx2      RN 14   ;// DC case: 2 * leftStep
73r0x01010101     RN 10   ;// holds MUL_CONST0
74r0x00FF00FF     RN 11   ;// holds MASK_CONST (+ loop count in top 4 bits)
75
;// Generic temporaries: tValN is simply register rN.
76tVal0           RN 0
77tVal1           RN 1
78tVal2           RN 2
79tVal3           RN 3
80tVal4           RN 4
81tVal5           RN 5
82tVal6           RN 6
83tVal7           RN 7
84tVal8           RN 8
85tVal9           RN 9
86tVal10          RN 10
87tVal11          RN 11
88tVal12          RN 12
89tVal14          RN 14
90
91b               RN 12
92c               RN 14   ;// plane loop: packed {c, c} row increment
93
;// Plane loop: each register packs two 16-bit pixel accumulators for
;// the named columns, e.g. p2p0 = {column 2 (high), column 0 (low)}.
94p2p0            RN 0
95p3p1            RN 1
96p6p4            RN 2
97p7p5            RN 4
98p10p8           RN 6
99p11p9           RN 7
100p14p12          RN 8
101p15p13          RN 9
102
103p3210           RN 10
104p7654           RN 10
105p111098         RN 10
106p15141312       RN 10
107
108;//--------------------------------------------
109;// Declare input registers
;// r0-r3 arrive in registers; r4-r7 are filled from the stack
;// arguments by the M_LDR sequence at the top of the function.
110;//--------------------------------------------
111pSrcLeft        RN 0    ;// input pointer
112pSrcAbove       RN 1    ;// input pointer
113pSrcAboveLeft   RN 2    ;// input pointer
114pDst            RN 3    ;// output pointer
115leftStep        RN 4    ;// input variable
116dstStep         RN 5    ;// input variable
117predMode        RN 6    ;// input variable
118availability    RN 7    ;// input variable
119
120;//-----------------------------------------------------------------------------------------------
121;// omxVCM4P10_PredictIntra_16x16 starts
122;//-----------------------------------------------------------------------------------------------
123
124        ;// Write function header
        ;//
        ;// omxVCM4P10_PredictIntra_16x16
        ;//   In registers : r0 = pSrcLeft, r1 = pSrcAbove,
        ;//                  r2 = pSrcAboveLeft, r3 = pDst
        ;//   On the stack : leftStep, dstStep, predMode, availability
        ;//                  (4 bytes each, in that order)
        ;//   Returns      : r0 = OMX_Sts_NoErr on every path in this file
        ;//
        ;// The prologue loads the stack arguments, presets the row counter
        ;// and jumps to one of the four prediction-mode cases through
        ;// armVCM4P10_pIndexTable16x16.
        ;//
        ;// NOTE(review): the "r11" operand presumably tells M_START to
        ;// preserve registers up to r11 (the macro is defined in
        ;// armCOMM_s.h, not visible here) - confirm, since the body also
        ;// writes r12 and r14.
125        M_START omxVCM4P10_PredictIntra_16x16, r11
126
127        ;// Define stack arguments
128        M_ARG    LeftStep,     4
129        M_ARG    DstStep,      4
130        M_ARG    PredMode,     4
131        M_ARG    Availability, 4
132
133        ;// M_STALL ARM1136JS=4
134
135        LDR      pTable,=armVCM4P10_pIndexTable16x16 ;// pTable = address of the case jump table
136
137        ;// Load argument from the stack
138        M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg
139        M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
140        M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg
141        M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg
142
143        MOV      y, #BLK_SIZE                        ;// y = 16 rows (HOR and DC reload it themselves)
        ;// NOTE(review): predMode is used as a table index without any range
        ;// check; a value outside 0..3 loads pc from beyond the 4-entry table.
144        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case selected by predMode
145
OMX_VC_16X16_VERT
        ;// Vertical prediction: every one of the 16 output rows is a copy
        ;// of the 16 bytes at pSrcAbove.  Two row pointers (pDst for even
        ;// rows, pDst2 for odd rows) each advance by 2*dstStep, so one
        ;// loop pass writes two rows.  y enters with the value BLK_SIZE.
        ADD      pDst2, pDst, dstStep                ;// pDst2 = address of row 1
        MOV      dstStepx2, dstStep, LSL #1          ;// dstStepx2 = 2 * dstStep
        LDM      pSrcAbove, {tVal6,tVal7,tVal8,tVal9};// tVal6..9 = pSrcAbove[0..15]

LOOP_VERT
        STM      pDst,  {tVal6,tVal7,tVal8,tVal9}    ;// even row <- above row
        STM      pDst2, {tVal6,tVal7,tVal8,tVal9}    ;// odd row  <- above row
        ADD      pDst,  pDst,  dstStepx2             ;// next even row
        ADD      pDst2, pDst2, dstStepx2             ;// next odd row
        SUBS     y, y, #2                            ;// two rows done
        BNE      LOOP_VERT                           ;// 8 passes = 16 rows
        MOV      return, #OMX_Sts_NoErr
        M_EXIT
162
163
OMX_VC_16X16_HOR
        ;// Horizontal prediction: output row r is 16 copies of the left
        ;// neighbour pixel pSrcLeft[r * leftStep].
        ;//
        ;// Each loop pass handles four rows: it loads four left pixels,
        ;// replicates each into a word (byte * 0x01010101) and stores that
        ;// word four times per row.  pDst walks the even rows and pDst2 the
        ;// odd rows.  The last store of a row post-increments by
        ;// dstStepx2 = 2*dstStep - 12, which together with the three
        ;// preceding "#+4" stores moves the pointer down exactly two rows.
        ;//
        ;// All four loads are issued at the top of the pass, so exactly 16
        ;// bytes of the left column are read.  (The earlier software-
        ;// pipelined form preloaded the next two pixels at the bottom of
        ;// every pass, and on the final pass read rows 16 and 17, i.e.
        ;// beyond the left neighbour column.)

        LDR      r0x01010101, =MUL_CONST0            ;// Const to repeat a byte 4 times in a word
        MOV      y, #4                               ;// 4 passes x 4 rows
        ADD      pDst2, pDst, dstStep                ;// pDst2 = address of row 1
        ADD      dstStepx2, dstStep, dstStep         ;// 2 * dstStep
        SUB      dstStepx2, dstStepx2, #12           ;// 2 * dstStep - 12 (see above)

LOOP_HOR
        M_LDRB   tVal6, [pSrcLeft], +leftStep        ;// tVal6 = left pixel, row 4k
        M_LDRB   tVal7, [pSrcLeft], +leftStep        ;// tVal7 = left pixel, row 4k+1
        M_LDRB   tVal8, [pSrcLeft], +leftStep        ;// tVal8 = left pixel, row 4k+2
        M_LDRB   tVal9, [pSrcLeft], +leftStep        ;// tVal9 = left pixel, row 4k+3
        MUL      tVal6, tVal6, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal7, tVal7, r0x01010101           ;// replicate the val in all the bytes
        SUBS     y, y, #1                            ;// one pass done (flags consumed by BNE;
                                                     ;// nothing below alters them)
        STR      tVal6, [pDst],  #+4                 ;// row 4k  , bytes 0..3
        STR      tVal7, [pDst2], #+4                 ;// row 4k+1, bytes 0..3
        STR      tVal6, [pDst],  #+4                 ;// row 4k  , bytes 4..7
        STR      tVal7, [pDst2], #+4                 ;// row 4k+1, bytes 4..7
        MUL      tVal8, tVal8, r0x01010101           ;// replicate the val in all the bytes
        STR      tVal6, [pDst],  #+4                 ;// row 4k  , bytes 8..11
        STR      tVal7, [pDst2], #+4                 ;// row 4k+1, bytes 8..11
        MUL      tVal9, tVal9, r0x01010101           ;// replicate the val in all the bytes
        M_STR    tVal6, [pDst], dstStepx2            ;// row 4k  , bytes 12..15; pDst  -> row 4k+2
        M_STR    tVal7, [pDst2], dstStepx2           ;// row 4k+1, bytes 12..15; pDst2 -> row 4k+3
        STR      tVal8, [pDst],  #+4                 ;// row 4k+2, bytes 0..3
        STR      tVal9, [pDst2], #+4                 ;// row 4k+3, bytes 0..3
        STR      tVal8, [pDst],  #+4                 ;// row 4k+2, bytes 4..7
        STR      tVal9, [pDst2], #+4                 ;// row 4k+3, bytes 4..7
        STR      tVal8, [pDst],  #+4                 ;// row 4k+2, bytes 8..11
        STR      tVal9, [pDst2], #+4                 ;// row 4k+3, bytes 8..11
        M_STR    tVal8, [pDst], dstStepx2            ;// row 4k+2, bytes 12..15; pDst  -> row 4k+4
        M_STR    tVal9, [pDst2], dstStepx2           ;// row 4k+3, bytes 12..15; pDst2 -> row 4k+5
        BNE      LOOP_HOR                            ;// Loop for 4 times
        MOV      return, #OMX_Sts_NoErr
        M_EXIT
205
206OMX_VC_16X16_DC
        ;// DC prediction: fill the whole block with one value derived from
        ;// the neighbours that 'availability' marks as present:
        ;//   upper and left : (sumAbove + sumLeft + 16) >> 5
        ;//   only one edge  : (that edge's sum + 8) >> 4
        ;//   neither        : 128
        ;// 'count' records how many edges contributed.  The above-row sum
        ;// is kept in tVal2 as well as in sum, because the left-column code
        ;// overwrites sum (tVal6) when it runs.
207
208        ;// M_STALL ARM1136JS=2
209
210        MOV      count, #0                           ;// count = 0
211        TST      availability, #OMX_VC_UPPER         ;// if(availability & #OMX_VC_UPPER)
212        BEQ      TST_LEFT                            ;// Jump to Left if not upper
213        LDM      pSrcAbove,{tVal8,tVal9,tVal10,tVal11};// tVal 8 to 11 = pSrcAbove[0 to 15]
214        ADD      count, count, #1                    ;// if upper inc count by 1
215
216        ;// M_STALL ARM1136JS=2
217
        ;// Sum the 16 bytes as two parallel 16-bit lanes; the lanes are
        ;// only folded together at the very end.
218        UXTB16   tVal2, tVal8                        ;// pSrcAbove[0, 2]
219        UXTB16   tVal6, tVal9                        ;// pSrcAbove[4, 6]
220        UADD16   tVal2, tVal2, tVal6                 ;// pSrcAbove[0, 2] + pSrcAbove[4, 6]
221        UXTB16   tVal8, tVal8, ROR #8                ;// pSrcAbove[1, 3]
222        UXTB16   tVal9, tVal9, ROR #8                ;// pSrcAbove[5, 7]
223        UADD16   tVal8, tVal8, tVal9                 ;// pSrcAbove[1, 3] + pSrcAbove[5, 7]
224        UADD16   tVal2, tVal2, tVal8                 ;// two partial sums covering pSrcAbove[0..7]
225
226        UXTB16   tVal8, tVal10                       ;// pSrcAbove[8, 10]
227        UXTB16   tVal9, tVal11                       ;// pSrcAbove[12, 14]
228        UADD16   tVal8, tVal8, tVal9                 ;// pSrcAbove[8, 10] + pSrcAbove[12, 14]
229        UXTB16   tVal10, tVal10, ROR #8              ;// pSrcAbove[9, 11]
230        UXTB16   tVal11, tVal11, ROR #8              ;// pSrcAbove[13, 15]
231        UADD16   tVal10, tVal10, tVal11              ;// pSrcAbove[9, 11] + pSrcAbove[13, 15]
232        UADD16   tVal8, tVal8, tVal10                ;// two partial sums covering pSrcAbove[8..15]
233
234        UADD16   tVal2, tVal2, tVal8                 ;// two partial sums covering pSrcAbove[0..15]
235
236        ;// M_STALL ARM1136JS=1
237
238        ADD      tVal2, tVal2, tVal2, LSR #16        ;// low halfword = full sum; high halfword still holds a partial sum
239
240        ;// M_STALL ARM1136JS=1
241
242        UXTH     sum, tVal2                          ;// sum = sum(pSrcAbove[0..15])
243
244TST_LEFT
245        TST      availability, #OMX_VC_LEFT
246        BEQ      TST_COUNT
        ;// Left column: two pointers (even rows / odd rows), each stepping
        ;// by 2*leftStep, read the 16 left pixels four at a time.
        ;// availability (r7) is dead from here on and is reused as tVal7.
247        ADD      leftStepx2, leftStep,leftStep       ;// leftStepx2 = 2 * leftStep
248        ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep
249
250        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = left row 0
251        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = left row 1
252        M_LDRB   tVal10, [pSrcLeft], +leftStepx2     ;// tVal10= left row 2
253        M_LDRB   tVal11, [pSrcLeft2],+leftStepx2     ;// tVal11= left row 3
254        ADD      tVal7, tVal8, tVal9                 ;// tVal7 = row0 + row1
255        ADD      count, count, #1                    ;// Inc Counter if Left is available
256        ADD      tVal6, tVal10, tVal11               ;// tVal6 = row2 + row3
257
258        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = left row 4
259        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = left row 5
260        M_LDRB   tVal10, [pSrcLeft], +leftStepx2     ;// tVal10= left row 6
261        M_LDRB   tVal11, [pSrcLeft2],+leftStepx2     ;// tVal11= left row 7
262        ADD      sum, tVal7, tVal6                   ;// sum = rows 0..3 (replaces any above-row sum; that one survives in tVal2)
263        ADD      tVal8, tVal8, tVal9                 ;// tVal8 = row4 + row5
264        ADD      tVal10, tVal10, tVal11              ;// tVal10= row6 + row7
265        ADD      tVal7, tVal8, tVal10                ;// tVal7 = rows 4..7
266
267
268        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = left row 8
269        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = left row 9
270        M_LDRB   tVal10, [pSrcLeft], +leftStepx2     ;// tVal10= left row 10
271        M_LDRB   tVal11, [pSrcLeft2],+leftStepx2     ;// tVal11= left row 11
272        ADD      sum, sum, tVal7                     ;// sum = rows 0..7
273        ADD      tVal8, tVal8, tVal9                 ;// tVal8 = row8 + row9
274        ADD      tVal10, tVal10, tVal11              ;// tVal10= row10 + row11
275        ADD      tVal7, tVal8, tVal10                ;// tVal7 = rows 8..11
276
277
278        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = left row 12
279        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = left row 13
280        M_LDRB   tVal10, [pSrcLeft], +leftStepx2     ;// tVal10= left row 14
281        M_LDRB   tVal11, [pSrcLeft2],+leftStepx2     ;// tVal11= left row 15
282        ADD      sum, sum, tVal7                     ;// sum = rows 0..11
283        ADD      tVal8, tVal8, tVal9                 ;// tVal8 = row12 + row13
284        ADD      tVal10, tVal10, tVal11              ;// tVal10= row14 + row15
285        ADD      tVal7, tVal8, tVal10                ;// tVal7 = rows 12..15
286        ADD      sum, sum, tVal7                     ;// sum = rows 0..15
287
288TST_COUNT
289        CMP      count, #0                           ;// if(count == 0)
290        MOVEQ    sum, #128                           ;// sum = 128 if(count == 0)
291        BEQ      TST_COUNT0                          ;// if(count == 0)
292        CMP      count, #1                           ;// EQ: one edge, NE: both edges (flags stay live down to LSRNE)
293        ADDEQ    sum, sum, #8                        ;// sum += 8 if(count == 1)
294        ADDNE    sum, sum, tVal2                     ;// sum = sumleft + sumupper (tVal2's high halfword is junk here)
295        ADDNE    sum, sum, #16                       ;// sum += 16 if(count == 2)
296
297        ;// M_STALL ARM1136JS=1
298
299        UXTH     sum, sum                            ;// keep the low halfword only (drops tVal2's junk high half)
300
301        ;// M_STALL ARM1136JS=1
302
303        LSREQ    sum, sum, #4                        ;// sum >> 4 if(count == 1)
304
305        ;// M_STALL ARM1136JS=1
306
307        LSRNE    sum, sum, #5                        ;// sum >> 5 if(count == 2)
308
309TST_COUNT0
310
311        ;// M_STALL ARM1136JS=1
312
313        ORR      sum, sum, sum, LSL #8               ;// DC byte replicated into the low halfword
314
315        ;// M_STALL ARM1136JS=1
316
317        ORR      tVal6, sum, sum, LSL #16            ;// sum  replicated in all bytes
318        CPY      tVal7, tVal6                        ;// tVal7 = tVal6
319        CPY      tVal8, tVal6                        ;// tVal8 = tVal6
320        CPY      tVal9, tVal6                        ;// tVal9 = tVal6
321        ADD      dstStepx2, dstStep, dstStep         ;// double dstStep
322        ADD      pDst2, pDst, dstStep                ;// pDst2- pDst advanced by dstStep
323        MOV      y, #BLK_SIZE                        ;// y was reused as 'count' above, so reload the row count
324
325LOOP_DC
326        STM      pDst, {tVal6,tVal7,tVal8,tVal9}     ;// even row: 16 bytes of the DC value
327        SUBS     y, y, #2                            ;// y -= 2 (two rows per pass)
328        ADD      pDst, pDst, dstStepx2               ;// pDst advanced by 2 * dstStep
329        STM      pDst2, {tVal6,tVal7,tVal8,tVal9}    ;// odd row: 16 bytes of the DC value
330        ADD      pDst2, pDst2, dstStepx2             ;// pDst2 advanced by 2 * dstStep
331        BNE      LOOP_DC                             ;// Loop for 8 times
332
333        MOV      return, #OMX_Sts_NoErr
334        M_EXIT
335
OMX_VC_16X16_PLANE
        ;// Plane prediction.  With above[-1] = left[-1] = *pSrcAboveLeft:
        ;//   H = sum over x = 0..7 of (x+1) * (above[8+x] - above[6-x])
        ;//   V = sum over y = 0..7 of (y+1) * (left[8+y]  - left[6-y])
        ;//   a = 16 * (above[15] + left[15])
        ;//   b = (5*H + 32) >> 6
        ;//   c = (5*V + 32) >> 6
        ;//   out[y][x] = ((a + 16 + b*(x-7) + c*(y-7)) >> 5) clamped to 0..255
        ;//
        ;// The 16 per-column accumulators for the current row are kept as
        ;// signed 16-bit lanes, two per register (p2p0, p3p1, ...).  Each
        ;// row is emitted by saturating, shifting and masking the lanes,
        ;// after which every lane is advanced by c for the next row.
        ;//
        ;// The {p2,p0} pair is packed with PKHBT.  d (= p0) is frequently
        ;// negative; a plain ORR of the 32-bit d with (p2 << 16) would OR
        ;// d's sign extension into the upper lane and corrupt p2 and every
        ;// lane derived from it.

        ;// M_STALL ARM1136JS=3
        RSB      tVal14, leftStep, leftStep, LSL #4  ;// tVal14 = 15*leftStep

        ;// M_STALL ARM1136JS=2
        LDRB     tVal10, [pSrcLeft,  tVal14]         ;// tVal10 = pSrcLeft[15*leftStep]
        LDRB     tVal11, [pSrcAboveLeft]             ;// tVal11 = pSrcAboveLeft[0]
        LDRB     tVal12, [pSrcAbove, #15]            ;// tVal12 = pSrcAbove[15]

        ADD      tVal2,  tVal12,  tVal10             ;// tVal2  = pSrcAbove[15] + pSrcLeft[15*leftStep]
        SUB      tVal10, tVal10,  tVal11             ;// tVal10 = V0 = pSrcLeft[15*leftStep] - pSrcAboveLeft[0]
        SUB      tVal11, tVal12,  tVal11             ;// tVal11 = H0 = pSrcAbove[15] - pSrcAboveLeft[0]
        MOV      tVal2,  tVal2,   LSL #4             ;// tVal2  = a = 16 * (pSrcAbove[15] + pSrcLeft[15*leftStep])

        ;// Accumulate H in tVal11 (above[n] written as [n])
        MOV     tVal11, tVal11, LSL #3              ;// 8*([15]-[-1])
        LDRB    tVal6, [pSrcAbove, #0]
        LDRB    tVal7, [pSrcAbove, #14]
        SUB     tVal8, tVal7, tVal6
        RSB     tVal8, tVal8, tVal8, LSL #3         ;// 7*([14]-[0])
        ADD     tVal11, tVal11, tVal8
        LDRB    tVal6, [pSrcAbove, #1]
        LDRB    tVal7, [pSrcAbove, #13]
        SUB     tVal8, tVal7, tVal6
        ADD     tVal8, tVal8, tVal8
        ADD     tVal8, tVal8, tVal8, LSL #1         ;// 6*([13]-[1])
        ADD     tVal11, tVal11, tVal8
        LDRB    tVal6, [pSrcAbove, #2]
        LDRB    tVal7, [pSrcAbove, #12]
        SUB     tVal8, tVal7, tVal6
        ADD     tVal8, tVal8, tVal8, LSL #2         ;// 5*([12]-[2])
        ADD     tVal11, tVal11, tVal8
        LDRB    tVal6, [pSrcAbove, #3]
        LDRB    tVal7, [pSrcAbove, #11]
        SUB     tVal8, tVal7, tVal6
        ADD     tVal11, tVal11, tVal8, LSL #2       ;// + 4*([11]-[3])
        LDRB    tVal6, [pSrcAbove, #4]
        LDRB    tVal7, [pSrcAbove, #10]
        SUB     tVal8, tVal7, tVal6
        ADD     tVal8, tVal8, tVal8, LSL #1         ;// 3*([10]-[4])
        ADD     tVal11, tVal11, tVal8
        LDRB    tVal6, [pSrcAbove, #5]
        LDRB    tVal7, [pSrcAbove, #9]
        SUB     tVal8, tVal7, tVal6
        ADD     tVal11, tVal11, tVal8, LSL #1       ;// + 2*([9]-[5])
        LDRB    tVal6, [pSrcAbove, #6]
        LDRB    tVal7, [pSrcAbove, #8]
        SUB     tVal8, tVal7, tVal6                 ;// 1*([8]-[6])
        ADD     tVal7, tVal11, tVal8                ;// tVal7 = H

        ADD      tVal2,  tVal2,   #16                ;// tVal2  = a + 16
        MOV      tVal1,  pSrcLeft                    ;// tVal1  = pSrcLeft (walks down; pSrcAbove is dead now)
        SUB      tVal9,  tVal14,   leftStep          ;// tVal9  = 14*leftStep
        ADD      tVal9,  pSrcLeft, tVal9             ;// tVal9  = pSrcLeft + 14*leftStep (walks up)

        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[14*leftStep]
        M_LDRB   tVal11, [tVal1], +leftStep          ;// tVal11 = pSrcLeft[0]
        ADD      tVal7,  tVal7,  tVal7,  LSL #2      ;// tVal7  = 5 * H
        ADD      tVal7,  tVal7,  #32                 ;// tVal7  = 5 * H + 32
        SUB      tVal8,  tVal8,  tVal11              ;// tVal8  = pSrcLeft[14*leftStep] - pSrcLeft[0]
        ASR      tVal12, tVal7,  #6                  ;// tVal12 = b = (5 * H + 32) >> 6

        RSB      tVal8,  tVal8,  tVal8,  LSL #3      ;// tVal8  = V1 = 7* (pSrcLeft[14*leftStep]-pSrcLeft[0])
        ADD      tVal6,  tVal8,  tVal10, LSL #3      ;// tVal6  = V = 8*V0 + V1
        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[13*leftStep]
        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[leftStep]
        RSB      tVal7,  tVal12,  tVal12,  LSL #3    ;// tVal7  = 7*b
        SUB      tVal2,  tVal2,   tVal7              ;// tVal2  = a + 16 - 7*b
        SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[13*leftStep] - pSrcLeft[leftStep]
        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[12*leftStep]
        ADD      tVal7,  tVal7,   tVal7              ;// tVal7  = 2 * (pSrcLeft[13*leftStep] - pSrcLeft[leftStep])
        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[2*leftStep]
        ADD      tVal7,  tVal7,   tVal7,  LSL #1     ;// tVal7  = 6 * (pSrcLeft[13*leftStep] - pSrcLeft[leftStep])
        ADD      tVal6,  tVal6,   tVal7              ;// tVal6  = V = V + V2
        SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[12*leftStep] - pSrcLeft[2*leftStep]
        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[11*leftStep]
        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[3*leftStep]
        ADD      tVal7,  tVal7,   tVal7,  LSL #2     ;// tVal7  = 5 * (pSrcLeft[12*leftStep] - pSrcLeft[2*leftStep])
        ADD      tVal6,  tVal6,   tVal7              ;// tVal6  = V = V + V3
        SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[11*leftStep] - pSrcLeft[3*leftStep]
        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[10*leftStep]
        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[4*leftStep]
        ADD      tVal6,  tVal6,   tVal7,  LSL #2     ;// tVal6  = V = V + V4
        SUB      dstStep, dstStep, #16               ;// dstStep = dstStep - 16 (each row store advances pDst by 16)
        SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[10*leftStep] - pSrcLeft[4*leftStep]
        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[9*leftStep]
        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[5*leftStep]
        ADD      tVal7,  tVal7,   tVal7,  LSL #1     ;// tVal7  = 3 * (pSrcLeft[10*leftStep] - pSrcLeft[4*leftStep])
        ADD      tVal6,  tVal6,   tVal7              ;// tVal6  = V = V + V5
        SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[9*leftStep] - pSrcLeft[5*leftStep]
        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[8*leftStep]
        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[6*leftStep]
        ADD      tVal6,  tVal6,   tVal7,  LSL #1     ;// tVal6  = V = V + V6

        ;// M_STALL ARM1136JS=1
        SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[8*leftStep] - pSrcLeft[6*leftStep]
        ADD      tVal6,  tVal6,   tVal7              ;// tVal6  = V = V + V7

        ;// M_STALL ARM1136JS=1
        ADD      tVal6,  tVal6,   tVal6,  LSL #2     ;// tVal6  = 5*V
        ADD      tVal6,  tVal6,   #32                ;// tVal6  = 5*V + 32

        ;// M_STALL ARM1136JS=1
        ASR      tVal14, tVal6,   #6                 ;// tVal14 = c = (5*V + 32)>>6

        ;// M_STALL ARM1136JS=1
        RSB      tVal6,  tVal14,  tVal14, LSL #3     ;// tVal6  = 7*c
        UXTH     tVal14, tVal14                      ;// tVal14 = c with the upper halfword cleared
        ADD      tVal10, tVal12,  tVal12             ;// tVal10 = 2*b
        ORR      tVal14, tVal14,  tVal14, LSL #16    ;// tVal14 = {c  ,  c}
        SUB      tVal6,  tVal2,   tVal6              ;// tVal6  = d = a - 7*b - 7*c + 16 (may be negative)
        ADD      tVal1,  tVal6,   tVal10             ;// tVal1  = pp2 = d + 2*b
        ADD      tVal10, tVal10,  tVal12             ;// tVal10 = 3*b
        PKHBT    tVal0,  tVal6,   tVal1,  LSL #16    ;// tVal0  = p2p0 = {p2[15:0], p0[15:0]}; safe for negative d
        UXTH     tVal12, tVal12                      ;// tVal12 = b with the upper halfword cleared
        UXTH     tVal10, tVal10                      ;// tVal10 = 3*b with the upper halfword cleared
        ORR      tVal12, tVal12,  tVal12, LSL #16    ;// tVal12 = {b  ,  b}
        ORR      tVal10, tVal10,  tVal10, LSL #16    ;// tVal10 = {3b , 3b}
        SADD16   tVal1,  tVal0,   tVal12             ;// tVal1  = p3p1   = p2p0   + {b,b}
        SADD16   tVal2,  tVal1,   tVal10             ;// tVal2  = p6p4   = p3p1   + {3b,3b}
        SADD16   tVal4,  tVal2,   tVal12             ;// tVal4  = p7p5   = p6p4   + {b,b}
        SADD16   tVal6,  tVal4,   tVal10             ;// tVal6  = p10p8  = p7p5   + {3b,3b}
        SADD16   tVal7,  tVal6,   tVal12             ;// tVal7  = p11p9  = p10p8  + {b,b}
        SADD16   tVal8,  tVal7,   tVal10             ;// tVal8  = p14p12 = p11p9  + {3b,3b}
        SADD16   tVal9,  tVal8,   tVal12             ;// tVal9  = p15p13 = p14p12 + {b,b}
        LDR      r0x00FF00FF,     =MASK_CONST        ;// r0x00FF00FF = 0x00FF00FF; its top 4 bits (zero
                                                     ;// here) double as the row counter

LOOP_PLANE
        ;// One row per pass.  For each pair of registers: saturate every
        ;// lane to 0..8191, advance the lanes by c for the next row, then
        ;// shift right by 5 and mask to get one byte per lane and
        ;// interleave odd/even columns into a 4-pixel word.

        USAT16   temp2, #13, p3p1
        USAT16   temp1, #13, p2p0
        SADD16   p3p1,   p3p1,   c
        SADD16   p2p0,   p2p0,   c
        AND      temp2, r0x00FF00FF, temp2, ASR #5
        AND      temp1, r0x00FF00FF, temp1, ASR #5
        ORR      temp1, temp1, temp2, LSL #8         ;// bytes = columns 0,1,2,3
        STR      temp1, [pDst], #4

        USAT16   temp2, #13, p7p5
        USAT16   temp1, #13, p6p4
        SADD16   p7p5,   p7p5,   c
        SADD16   p6p4,   p6p4,   c
        AND      temp2, r0x00FF00FF, temp2, ASR #5
        AND      temp1, r0x00FF00FF, temp1, ASR #5
        ORR      temp1, temp1, temp2, LSL #8         ;// bytes = columns 4,5,6,7
        STR      temp1, [pDst], #4

        USAT16   temp2, #13, p11p9
        USAT16   temp1, #13, p10p8
        SADD16   p11p9,  p11p9,  c
        SADD16   p10p8,  p10p8,  c
        AND      temp2, r0x00FF00FF, temp2, ASR #5
        AND      temp1, r0x00FF00FF, temp1, ASR #5
        ORR      temp1, temp1, temp2, LSL #8         ;// bytes = columns 8,9,10,11
        STR      temp1, [pDst], #4

        USAT16   temp2, #13, p15p13
        USAT16   temp1, #13, p14p12
        SADD16   p15p13, p15p13, c
        SADD16   p14p12, p14p12, c
        AND      temp2, r0x00FF00FF, temp2, ASR #5
        AND      temp1, r0x00FF00FF, temp1, ASR #5
        ORR      temp1, temp1, temp2, LSL #8         ;// bytes = columns 12,13,14,15
        STR      temp1, [pDst], #4

        ADDS     r0x00FF00FF, r0x00FF00FF, #1<<28     ;// Loop counter value in top 4 bits; carry out on the 16th add

        ADD      pDst, pDst, dstStep                  ;// next row (dstStep already reduced by 16); flags untouched

        BCC      LOOP_PLANE                           ;// Loop for 16 times
        MOV      return, #OMX_Sts_NoErr
        M_END
508
509        ENDIF ;// ARM1136JS
510
511
512        END
513;-----------------------------------------------------------------------------------------------
514; omxVCM4P10_PredictIntra_16x16 ends
515;-----------------------------------------------------------------------------------------------
516