omxVCM4P10_PredictIntraChroma_8x8_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  omxVCM4P10_PredictIntraChroma_8x8_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   9641
21;// Date:       Thursday, February 7, 2008
22;//
23;//
24;//
25;//
26
27
28        INCLUDE omxtypes_s.h
29        INCLUDE armCOMM_s.h
30
31        EXPORT armVCM4P10_pIndexTable8x8
32
33;// Define the processor variants supported by this file
34
35         M_VARIANTS ARM1136JS
36
37     AREA table, DATA
38;//-------------------------------------------------------
39;// This table implements a C-style switch statement in asm by
40;// the method of two levels of indexing.
41;//-------------------------------------------------------
;// Each DCD entry below is the address of one handler label inside
;// omxVCM4P10_PredictIntraChroma_8x8. The function loads pc from
;// entry [predMode], so the entry order defines which predMode value
;// selects which handler: 0 = DC, 1 = HOR, 2 = VERT, 3 = PLANE.
42
43    M_TABLE armVCM4P10_pIndexTable8x8
44    DCD  OMX_VC_CHROMA_DC,     OMX_VC_CHROMA_HOR
45    DCD  OMX_VC_CHROMA_VERT,   OMX_VC_CHROMA_PLANE
46
;// Table of 16-bit multipliers. It is not referenced by any instruction
;// visible in this file.
;// NOTE(review): presumably consumed by another variant or left over from
;// one - confirm before removing.
47    M_TABLE armVCM4P10_MultiplierTableChroma8x8,1
48    DCW   3, 2, 1,4
49    DCW  -3,-2,-1,0
50    DCW   1, 2, 3,4
51
52    IF ARM1136JS
53
54;//--------------------------------------------
55;// Constants
56;//--------------------------------------------
57
;// Numeric constants used by the handlers below.
58BLK_SIZE        EQU 0x8                 ;// loop count used for the 8 output rows
59MUL_CONST0      EQU 0x01010101          ;// byte * this = the byte replicated in all 4 byte lanes
60MASK_CONST      EQU 0x00FF00FF          ;// keeps the low byte of each halfword (PLANE case)
61MUL_CONST1      EQU 0x80808080          ;// value stored by TST_COUNT0 (0x80 in every byte)
62
63;//--------------------------------------------
64;// Scratch variable
65;//--------------------------------------------
;// Several names below alias the same physical register (for example r14 is
;// leftStepx2, outerCount, tVal14 and b; r11 is dstStepx2, r0x00FF00FF and
;// tVal11). Writing one name destroys every other name bound to that register,
;// and most scratch names also overlap the input-argument names declared at
;// the end of this section, so each handler relies on its instruction order.
66y               RN 12
67pc              RN 15
68return          RN 0                    ;// status value returned to the caller
69pSrcLeft2       RN 1                    ;// second left-column pointer (pSrcLeft + leftStep)
70pDst2           RN 2                    ;// second destination pointer (pDst + dstStep)
71sum1            RN 6
72sum2            RN 7
73pTable          RN 9                    ;// address of armVCM4P10_pIndexTable8x8
74dstStepx2       RN 11
75leftStepx2      RN 14
76outerCount      RN 14                   ;// row counter of LOOP_PLANE
77r0x01010101     RN 10                   ;// holds MUL_CONST0
78r0x00FF00FF     RN 11                   ;// holds MASK_CONST
79
;// Generic temporaries: tValN is simply rN.
80tVal0           RN 0
81tVal1           RN 1
82tVal2           RN 2
83tVal3           RN 3
84tVal4           RN 4
85tVal5           RN 5
86tVal6           RN 6
87tVal7           RN 7
88tVal8           RN 8
89tVal9           RN 9
90tVal10          RN 10
91tVal11          RN 11
92tVal12          RN 12
93tVal14          RN 14
94
;// PLANE case: horizontal (b) and vertical (c) gradient terms.
95b               RN 14
96c               RN 12
97
;// PLANE case: pairs of column values packed as two 16-bit halfwords,
;// e.g. p2p0 = {column 2 in the top half, column 0 in the bottom half}.
98p2p0            RN 0
99p3p1            RN 1
100p6p4            RN 2
101p7p5            RN 4
102
;// PLANE case: per-row scratch holding the clipped form of the pairs above.
103pp2pp0          RN 6
104pp3pp1          RN 7
105pp6pp4          RN 8
106pp7pp5          RN 9
107
;// PLANE case: four packed output bytes; both names share r10.
108p3210           RN 10
109p7654           RN 10
110
111;//--------------------------------------------
112;// Input Arguments
113;//--------------------------------------------
;// r0-r3 arrive in registers. The names bound to r4-r7 are the registers
;// into which the function loads its four stack arguments with M_LDR.
114pSrcLeft        RN 0    ;// input pointer
115pSrcAbove       RN 1    ;// input pointer
116pSrcAboveLeft   RN 2    ;// input pointer
117pDst            RN 3    ;// output pointer
118leftStep        RN 4    ;// input variable
119dstStep         RN 5    ;// input variable
120predMode        RN 6    ;// input variable
121availability    RN 7    ;// input variable
122
123;//-----------------------------------------------------------------------------------------------
124;// omxVCM4P10_PredictIntraChroma_8x8 starts
125;//-----------------------------------------------------------------------------------------------
126
127        ;// Write function header
;// Entry point and dispatch.
;// Register arguments: r0 = pSrcLeft, r1 = pSrcAbove, r2 = pSrcAboveLeft,
;// r3 = pDst. Stack arguments, in M_ARG order: leftStep, dstStep, predMode,
;// availability. After loading the stack arguments the code jumps to the
;// handler whose address is stored at armVCM4P10_pIndexTable8x8[predMode].
;// Every handler sets return (r0) to OMX_Sts_NoErr before leaving.
;// NOTE(review): predMode is used as a table index without a range check -
;// presumably callers only pass values that have a table entry; confirm.
128        M_START omxVCM4P10_PredictIntraChroma_8x8, r11
129
130        ;// Define stack arguments
131        M_ARG    LeftStep,     4
132        M_ARG    DstStep,      4
133        M_ARG    PredMode,     4
134        M_ARG    Availability, 4
135
136        ;// M_STALL ARM1136JS=4
137
138        LDR      pTable,=armVCM4P10_pIndexTable8x8   ;// Load index table for switch case
139
140
141        ;// Load argument from the stack
142        M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg
143        M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
144        M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg
145        M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg
146
147        MOV      y, #BLK_SIZE                        ;// y (r12) = 8; no handler in this file reads it before overwriting r12
148        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode (4-byte table entries)
149
;// DC prediction. Only the UPPER and LEFT bits of availability are examined.
;// This first path handles "both set"; any other combination branches to
;// TST_UPPER. With upsum1/upsum2 the sums of pSrcAbove[0..3] / [4..7] and
;// leftsum1/leftsum2 the sums of left pixels 0..3 / 4..7, the four 4x4
;// quadrants of the output are filled with:
;//   top-left     = (upsum1 + leftsum1 + 4) >> 3
;//   top-right    = (upsum2 + 2) >> 2
;//   bottom-left  = (leftsum2 + 2) >> 2
;//   bottom-right = (upsum2 + leftsum2 + 4) >> 3
;// Register reuse is order-dependent here: loading tVal4 overwrites leftStep
;// (r4) and writing tVal14 overwrites leftStepx2 (r14), so both must already
;// have served their purpose at that point.
150OMX_VC_CHROMA_DC
151        AND      availability, availability,#(OMX_VC_UPPER + OMX_VC_LEFT)
152        CMP      availability, #(OMX_VC_UPPER + OMX_VC_LEFT) ;// are both UPPER and LEFT set?
153        LDR      r0x01010101, =MUL_CONST0
154        BNE      TST_UPPER                           ;// Jump to Upper if not both
155        LDM      pSrcAbove,{tVal8,tVal9}             ;// tVal 8 to 9 = pSrcAbove[0 to 7]
156
157        ADD      leftStepx2, leftStep,leftStep       ;// leftStepx2 = 2 * leftStep
158        ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep (r1: pSrcAbove is no longer needed)
159
160        ;// M_STALL ARM1136JS=1
161
162        UXTB16   tVal7, tVal8                        ;// pSrcAbove[0, 2]
163        UXTB16   tVal8, tVal8, ROR #8                ;// pSrcAbove[1, 3]
164        UADD16   sum1, tVal7, tVal8                  ;// pSrcAbove[0, 2] + pSrcAbove[1, 3]
165
166        UXTB16   tVal7, tVal9                        ;// pSrcAbove[4, 6]
167        UXTB16   tVal9, tVal9, ROR #8                ;// pSrcAbove[5, 7]
168        UADD16   sum2, tVal7, tVal9                  ;// pSrcAbove[4, 6] + pSrcAbove[5, 7]
169        ADD      sum1, sum1, sum1, LSR #16           ;// sum(pSrcAbove[0] to pSrcAbove[3])
170        ADD      sum2, sum2, sum2, LSR #16           ;// sum(pSrcAbove[4] to pSrcAbove[7])
171        UXTH     sum1, sum1                          ;// upsum1 (Clear the top junk bits)
172        UXTH     sum2, sum2                          ;// upsum2 (Clear the top junk bits)
173
174        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[0]
175        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[1]
176        M_LDRB   tVal4, [pSrcLeft],  +leftStepx2     ;// tVal4 = pSrcLeft[2] (overwrites leftStep)
177        M_LDRB   tVal12,[pSrcLeft2], +leftStepx2     ;// tVal12= pSrcLeft[3]
178        ADD      tVal2, tVal8, tVal9                 ;// tVal2 = tVal8 + tVal9
179
180        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[4]
181        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[5]
182        ADD      tVal14, tVal4, tVal12               ;// tVal14 = tVal4 + tVal12 (overwrites leftStepx2)
183
184        LDRB     tVal4, [pSrcLeft]                   ;// tVal4 = pSrcLeft[6] (no post-increment: leftStepx2 is gone)
185        LDRB     tVal12,[pSrcLeft2]                  ;// tVal12= pSrcLeft[7]
186        ADD      tVal8, tVal8, tVal9                 ;// tVal8 = tVal8 + tVal9
187        ADD      tVal2, tVal2, tVal14                ;// leftsum1  = sum(pSrcLeft[0] to pSrcLeft[3])
188        ADD      tVal4, tVal4, tVal12                ;// tVal4 = tVal4 + tVal12
189        ADD      tVal14, tVal8, tVal4                ;// leftsum2  = sum(pSrcLeft[4] to pSrcLeft[7])
190        ADD      tVal8, tVal14, #2                   ;// tVal8 = leftsum2 + 2
191        ADD      tVal9, sum2,   #2                   ;// tVal9 = upsum2 + 2
192        ADD      sum1,  sum1, tVal2                  ;// sum1 = upsum1 + leftsum1
193        ADD      sum2,  sum2, tVal14                 ;// sum2 = upsum2 + leftsum2
194        ADD      sum1, sum1, #4                      ;// (sum1 + 4)
195        ADD      sum2, sum2, #4                      ;// (sum2 + 4)
196        MOV      sum1,  sum1,  LSR #3                ;// (sum1 + 4)>>3
197        MOV      tVal9, tVal9, LSR #2                ;// (tVal9 + 2)>>2
198        MOV      tVal8, tVal8, LSR #2                ;// (tVal8 + 2)>>2
199        MOV      sum2,  sum2,  LSR #3                ;// (sum2 + 4)>>3
200
201        MUL      tVal0, sum1, r0x01010101            ;// top-left value replicated in all the bytes
202        MUL      tVal1, tVal9,r0x01010101            ;// top-right value replicated in all the bytes
203        MUL      tVal8, tVal8,r0x01010101            ;// bottom-left value replicated in all the bytes
204        MUL      tVal9, sum2, r0x01010101            ;// bottom-right value replicated in all the bytes
205
206        M_STRD   tVal0, tVal1, [pDst], dstStep       ;// row 0 = tVal 0 to 1
207        M_STRD   tVal0, tVal1, [pDst], dstStep       ;// row 1 = tVal 0 to 1
208        M_STRD   tVal0, tVal1, [pDst], dstStep       ;// row 2 = tVal 0 to 1
209        M_STRD   tVal0, tVal1, [pDst], dstStep       ;// row 3 = tVal 0 to 1
210
211        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// row 4 = tVal 8 to 9
212        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// row 5 = tVal 8 to 9
213        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// row 6 = tVal 8 to 9
214        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// row 7 = tVal 8 to 9
215        MOV      return, #OMX_Sts_NoErr
216        M_EXIT
217
;// DC prediction when the masked availability equals OMX_VC_UPPER only.
;// Every row is written as two words: the left word holds
;// (sum(pSrcAbove[0..3]) + 2) >> 2 in each byte and the right word holds
;// (sum(pSrcAbove[4..7]) + 2) >> 2 in each byte. sum1/sum2 are r6/r7, i.e.
;// the same registers as tVal6/tVal7 used by the stores.
;// Any other availability value continues at TST_LEFT.
218TST_UPPER
219
220        ;// M_STALL ARM1136JS=3
221
222        CMP      availability, #OMX_VC_UPPER         ;// is only UPPER set? (availability was masked earlier)
223
224        BNE      TST_LEFT                            ;// Jump to Left if not upper
225        LDM      pSrcAbove,{tVal8,tVal9}             ;// tVal 8 to 9 = pSrcAbove[0 to 7]
226
227        ;// M_STALL ARM1136JS=3
228
229        UXTB16   tVal7, tVal8                        ;// pSrcAbove[0, 2]
230        UXTB16   tVal8, tVal8, ROR #8                ;// pSrcAbove[1, 3]
231        UADD16   sum1,  tVal7, tVal8                 ;// pSrcAbove[0, 2] + pSrcAbove[1, 3]
232
233        UXTB16   tVal7, tVal9                        ;// pSrcAbove[4, 6]
234        UXTB16   tVal9, tVal9, ROR #8                ;// pSrcAbove[5, 7]
235        UADD16   sum2,  tVal7, tVal9                 ;// pSrcAbove[4, 6] + pSrcAbove[5, 7]
236
237        ADD      sum1, sum1, sum1, LSR #16           ;// sum(pSrcAbove[0] to pSrcAbove[3])
238        ADD      sum2, sum2, sum2, LSR #16           ;// sum(pSrcAbove[4] to pSrcAbove[7])
239
240        UXTH     sum1, sum1                          ;// upsum1 (Clear the top junk bits)
241        UXTH     sum2, sum2                          ;// upsum2 (Clear the top junk bits)
242
243        ADD      sum1, sum1, #2                      ;// sum1 + 2
244        ADD      sum2, sum2, #2                      ;// sum2 + 2
245
246        MOV      sum1, sum1, LSR #2                  ;// (sum1 + 2)>>2
247        MOV      sum2, sum2, LSR #2                  ;// (sum2 + 2)>>2
248
249        MUL      sum1, sum1,r0x01010101              ;// replicate the val in all the bytes (result in r6 = tVal6)
250        MUL      sum2, sum2,r0x01010101              ;// replicate the val in all the bytes (result in r7 = tVal7)
251
252        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 0 = tVal 6 to 7
253        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 1 = tVal 6 to 7
254        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 2 = tVal 6 to 7
255        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 3 = tVal 6 to 7
256        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 4 = tVal 6 to 7
257        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 5 = tVal 6 to 7
258        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 6 = tVal 6 to 7
259        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 7 = tVal 6 to 7
260        MOV      return, #OMX_Sts_NoErr
261        M_EXIT
262
;// DC prediction when the masked availability equals OMX_VC_LEFT only.
;// Rows 0-3 are filled with (sum of left pixels 0..3 + 2) >> 2 and rows 4-7
;// with (sum of left pixels 4..7 + 2) >> 2, each replicated across all
;// eight bytes of the row. Any other availability value continues at
;// TST_COUNT0.
;// The left column is walked with two pointers (even rows via pSrcLeft, odd
;// rows via pSrcLeft2), each advancing by 2 * leftStep per load.
263TST_LEFT
264        ;// M_STALL ARM1136JS=3
265
266        CMP      availability, #OMX_VC_LEFT          ;// is only LEFT set?
267        BNE      TST_COUNT0
268        ADD      leftStepx2, leftStep,leftStep       ;// leftStepx2 = 2 * leftStep
269        ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep
270
271        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[0]
272        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[1]
273        M_LDRB   tVal4, [pSrcLeft],  +leftStepx2     ;// tVal4 = pSrcLeft[2] (overwrites leftStep; leftStepx2 is still valid)
274        M_LDRB   tVal12,[pSrcLeft2], +leftStepx2     ;// tVal12= pSrcLeft[3]
275
276        ADD      tVal6, tVal8, tVal9                 ;// tVal6 = tVal8 + tVal9
277
278        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[4]
279        ADD      tVal7, tVal4, tVal12                ;// tVal7 = tVal4 + tVal12
280        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[5]
281        M_LDRB   tVal4, [pSrcLeft],  +leftStepx2     ;// tVal4 = pSrcLeft[6]
282        M_LDRB   tVal12,[pSrcLeft2], +leftStepx2     ;// tVal12= pSrcLeft[7]
283
284        ADD      tVal8, tVal8, tVal9                 ;// tVal8 = tVal8 + tVal9
285        ADD      sum1,  tVal6, tVal7                 ;// sum1  = sum(pSrcLeft[0] to pSrcLeft[3])
286        ADD      tVal4, tVal4, tVal12                ;// tVal4 = tVal4 + tVal12
287        ADD      sum2,  tVal8, tVal4                 ;// sum2  = sum(pSrcLeft[4] to pSrcLeft[7])
288
289        ADD      sum1, sum1, #2                      ;// sum1 + 2
290        ADD      sum2, sum2, #2                      ;// sum2 + 2
291
292        MOV      sum1, sum1, LSR #2                  ;// (sum1 + 2)>>2
293        MOV      sum2, sum2, LSR #2                  ;// (sum2 + 2)>>2
294
295        MUL      tVal6, sum1,r0x01010101             ;// replicate the val in all the bytes
296        MUL      tVal8, sum2,r0x01010101             ;// replicate the val in all the bytes
297
298        ;// M_STALL ARM1136JS=1
299        MOV      tVal7,tVal6                         ;// tVal7 = replicated upper-half value (right word of rows 0-3)
300        MOV      tVal9,tVal8                         ;// tVal9 = replicated lower-half value (right word of rows 4-7)
301
302        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 0 = tVal 6 to 7
303        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 1 = tVal 6 to 7
304        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 2 = tVal 6 to 7
305        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 3 = tVal 6 to 7
306
307        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// row 4 = tVal 8 to 9
308        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// row 5 = tVal 8 to 9
309        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// row 6 = tVal 8 to 9
310        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// row 7 = tVal 8 to 9
311
312        MOV      return, #OMX_Sts_NoErr
313        M_EXIT                                       ;// Macro to exit midway-break frm case
314
;// DC fallback, reached when the masked availability matched none of the
;// three earlier compares (both, upper only, left only). All 64 output bytes
;// are set to 0x80. sum1 is r6, the same register as tVal6 used by the stores.
315TST_COUNT0
316        LDR      sum1, =MUL_CONST1                  ;// sum1 (= tVal6) = 0x80808080
317
318        ;// M_STALL ARM1136JS=2
319
320        MOV      tVal7, sum1                         ;// tVal7 = sum1, so each STRD writes 8 bytes of 0x80
321
322        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 0 = tVal 6 to 7
323        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 1 = tVal 6 to 7
324        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 2 = tVal 6 to 7
325        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 3 = tVal 6 to 7
326        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 4 = tVal 6 to 7
327        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 5 = tVal 6 to 7
328        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 6 = tVal 6 to 7
329        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 7 = tVal 6 to 7
330
331        MOV      return, #OMX_Sts_NoErr
332        M_EXIT                                       ;// Macro to exit midway-break frm case
333
;// Horizontal prediction: every byte of output row i is set to left pixel i.
;// Two source pointers (pSrcLeft for even rows, pSrcLeft2 for odd rows) and
;// two destination pointers (pDst for even rows, pDst2 for odd rows) are
;// used, all stepping two rows at a time. Each row is written as two word
;// stores: the first post-increments by 4, the second by (2 * dstStep - 4),
;// which leaves the pointer at the start of the row two below.
;// availability is not consulted on this path.
334OMX_VC_CHROMA_HOR
335
336        ;// M_STALL ARM1136JS=2
337
338        ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep
339        ADD      leftStepx2, leftStep, leftStep      ;// leftStepx2 = leftStep * 2
340        ADD      pDst2, pDst, dstStep                ;// pDst2 = pDst + dstStep
341        ADD      dstStepx2, dstStep, dstStep         ;// double dstStep
342        SUB      dstStepx2, dstStepx2, #4            ;// double dstStep  minus 4
343        LDR      r0x01010101, =MUL_CONST0            ;// Const to repeat the byte in reg 4 times
344        M_LDRB   tVal6, [pSrcLeft], +leftStepx2      ;// tVal6 = pSrcLeft[0]
345        M_LDRB   tVal7, [pSrcLeft2],+leftStepx2      ;// tVal7 = pSrcLeft[1]
346        M_LDRB   tVal8, [pSrcLeft], +leftStepx2      ;// tVal8 = pSrcLeft[2]
347        M_LDRB   tVal9, [pSrcLeft2],+leftStepx2      ;// tVal9 = pSrcLeft[3]
348        MUL      tVal6, tVal6, r0x01010101           ;// replicate the val in all the bytes
349        MUL      tVal7, tVal7, r0x01010101           ;// replicate the val in all the bytes
350        MUL      tVal8, tVal8, r0x01010101           ;// replicate the val in all the bytes
351        MUL      tVal9, tVal9, r0x01010101           ;// replicate the val in all the bytes
352        STR      tVal6, [pDst],  #+4                 ;// row 0, bytes 0 to 3 = tVal6
353        STR      tVal7, [pDst2], #+4                 ;// row 1, bytes 0 to 3 = tVal7
354        M_STR    tVal6, [pDst],  dstStepx2           ;// row 0, bytes 4 to 7 = tVal6; pDst  -> row 2
355        M_STR    tVal7, [pDst2], dstStepx2           ;// row 1, bytes 4 to 7 = tVal7; pDst2 -> row 3
356        STR      tVal8, [pDst],  #+4                 ;// row 2, bytes 0 to 3 = tVal8
357        STR      tVal9, [pDst2], #+4                 ;// row 3, bytes 0 to 3 = tVal9
358        M_STR    tVal8, [pDst],  dstStepx2           ;// row 2, bytes 4 to 7 = tVal8; pDst  -> row 4
359        M_STR    tVal9, [pDst2], dstStepx2           ;// row 3, bytes 4 to 7 = tVal9; pDst2 -> row 5
360        M_LDRB   tVal6, [pSrcLeft], +leftStepx2      ;// tVal6 = pSrcLeft[4]
361        M_LDRB   tVal7, [pSrcLeft2],+leftStepx2      ;// tVal7 = pSrcLeft[5]
362        M_LDRB   tVal8, [pSrcLeft], +leftStepx2      ;// tVal8 = pSrcLeft[6]
363        M_LDRB   tVal9, [pSrcLeft2],+leftStepx2      ;// tVal9 = pSrcLeft[7]
364        MUL      tVal6, tVal6, r0x01010101           ;// replicate the val in all the bytes
365        MUL      tVal7, tVal7, r0x01010101           ;// replicate the val in all the bytes
366        MUL      tVal8, tVal8, r0x01010101           ;// replicate the val in all the bytes
367        MUL      tVal9, tVal9, r0x01010101           ;// replicate the val in all the bytes
368        STR      tVal6, [pDst],  #+4                 ;// row 4, bytes 0 to 3 = tVal6
369        STR      tVal7, [pDst2], #+4                 ;// row 5, bytes 0 to 3 = tVal7
370        M_STR    tVal6, [pDst],  dstStepx2           ;// row 4, bytes 4 to 7 = tVal6; pDst  -> row 6
371        M_STR    tVal7, [pDst2], dstStepx2           ;// row 5, bytes 4 to 7 = tVal7; pDst2 -> row 7
372        STR      tVal8, [pDst],  #+4                 ;// row 6, bytes 0 to 3 = tVal8
373        STR      tVal9, [pDst2], #+4                 ;// row 7, bytes 0 to 3 = tVal9
374        M_STR    tVal8, [pDst],  dstStepx2           ;// row 6, bytes 4 to 7 = tVal8
375        M_STR    tVal9, [pDst2], dstStepx2           ;// row 7, bytes 4 to 7 = tVal9
376        MOV      return, #OMX_Sts_NoErr
377        M_EXIT
378
;// Vertical prediction: the eight bytes at pSrcAbove are loaded once and
;// written unchanged to each of the eight output rows.
;// availability is not consulted on this path.
379OMX_VC_CHROMA_VERT
380
381        ;// M_STALL ARM1136JS=4
382
383        LDMIA    pSrcAbove, {tVal6,tVal7}            ;// tVal 6 to 7 = pSrcAbove[0 to 7]
384        MOV      return, #OMX_Sts_NoErr              ;// set early; r0 (pSrcLeft) is not needed on this path
385
386        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 0 = tVal 6 to 7
387        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 1 = tVal 6 to 7
388        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 2 = tVal 6 to 7
389        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 3 = tVal 6 to 7
390        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 4 = tVal 6 to 7
391        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 5 = tVal 6 to 7
392        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 6 = tVal 6 to 7
393        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 7 = tVal 6 to 7
394
395        M_EXIT                                       ;// Macro to exit midway-break frm case
396
;// Plane prediction. With A[i] = pSrcAbove[i], L[i] = pSrcLeft[i*leftStep]
;// and AL = pSrcAboveLeft[0], the setup code computes:
;//   a = 16 * (A[7] + L[7])
;//   H = 4*(A[7]-AL) + 3*(A[6]-A[0]) + 2*(A[5]-A[1]) + (A[4]-A[2])
;//   V = 4*(L[7]-AL) + 3*(L[6]-L[0]) + 2*(L[5]-L[1]) + (L[4]-L[2])
;//   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5
;// and each output pixel is
;//   pDst[y][x] = clip_0_255((a + 16 + b*(x-3) + c*(y-3)) >> 5)
;// The eight column values of the current row are kept unclipped as signed
;// 16-bit halfwords in four accumulators (p2p0, p3p1, p6p4, p7p5); each loop
;// iteration emits one row and then adds {c,c} to every accumulator.
;//
;// Two correctness points in this handler:
;//  * {p2,p0} is packed with PKHBT rather than ORR, so that a negative
;//    (sign-extended) p0 cannot set all upper bits and destroy p2.
;//  * Saturation for output is done into the pp* scratch registers. The
;//    accumulators themselves must stay unclipped, otherwise a column that
;//    saturates in one row would yield wrong values in all following rows.
397OMX_VC_CHROMA_PLANE
398
399        ;// M_STALL ARM1136JS=3
400
401        RSB      tVal14, leftStep, leftStep, LSL #3  ;// 7*leftStep
402        LDRB     tVal7, [pSrcAbove, #+7]             ;// pSrcAbove[7]
403        LDRB     tVal6, [pSrcLeft, +tVal14]          ;// pSrcLeft[7*leftStep]
404        LDRB     tVal8, [pSrcAboveLeft]              ;// pSrcAboveLeft[0]
405        LDRB     tVal9, [pSrcAbove, #+6 ]            ;// pSrcAbove[6]
406        LDRB     tVal10,[pSrcAbove]                  ;// pSrcAbove[0]
407        ADD      tVal2, tVal7, tVal6                 ;// pSrcAbove[7] + pSrcLeft[7*leftStep]
408        SUB      tVal6, tVal6, tVal8                 ;// V0 = pSrcLeft[7*leftStep] - pSrcAboveLeft[0]
409        SUB      tVal7, tVal7, tVal8                 ;// H0 = pSrcAbove[7] - pSrcAboveLeft[0]
410        LSL      tVal2, tVal2, #4                    ;// a = 16 * (pSrcAbove[7] + pSrcLeft[7*leftStep])
411        ADD      tVal2, tVal2, #16                   ;// a + 16
412        SUB      tVal9, tVal9,tVal10                 ;// pSrcAbove[6] - pSrcAbove[0]
413        LDRB     tVal8, [pSrcAbove,#+5]              ;// pSrcAbove[5]
414        LDRB     tVal10,[pSrcAbove,#+1]              ;// pSrcAbove[1]
415        ADD      tVal9, tVal9, tVal9, LSL #1         ;// H1 = 3 * (pSrcAbove[6] - pSrcAbove[0])
416        ADD      tVal7, tVal9, tVal7, LSL #2         ;// H = H1 + 4*H0
417        SUB      tVal8, tVal8, tVal10                ;// pSrcAbove[5] - pSrcAbove[1]
418        LDRB     tVal9, [pSrcAbove,#+4]              ;// pSrcAbove[4]
419        LDRB     tVal10,[pSrcAbove,#+2]              ;// pSrcAbove[2]
420        ADD      tVal7, tVal7, tVal8, LSL #1         ;// H = H + H2, H2 = 2 * (pSrcAbove[5] - pSrcAbove[1])
421        SUB      tVal11, tVal14,leftStep             ;// 6*leftStep
422        ADD      tVal11, pSrcLeft, tVal11            ;// pSrcLeft + 6*leftStep (walks upwards)
423        MOV      tVal12, pSrcLeft                    ;// pSrcLeft (walks downwards)
424        SUB      tVal9, tVal9, tVal10                ;// pSrcAbove[4] - pSrcAbove[2]
425        ADD      tVal7, tVal7, tVal9                 ;// H = H + H3
426        M_LDRB   tVal8, [tVal11],-leftStep           ;// pSrcLeft[6*leftStep]
427        M_LDRB   tVal10,[tVal12],+leftStep           ;// pSrcLeft[0]
428        ADD      tVal7, tVal7, tVal7, LSL #4         ;// 17 * H
429        ADD      tVal7, tVal7, #16                   ;// 17 * H + 16
430        SUB      tVal8, tVal8, tVal10                ;// pSrcLeft[6*leftStep] - pSrcLeft[0]
431        ASR      b, tVal7, #5                        ;// b = (17 * H + 16) >> 5 (r14: 7*leftStep is no longer needed)
432        ADD      tVal8, tVal8, tVal8, LSL #1         ;// V1 = 3 * (pSrcLeft[6*leftStep] - pSrcLeft[0])
433        ADD      tVal6, tVal8, tVal6, LSL #2         ;// V = 4*V0 + V1
434        M_LDRB   tVal8, [tVal11],-leftStep           ;// pSrcLeft[5*leftStep]
435        M_LDRB   tVal10,[tVal12],+leftStep           ;// pSrcLeft[leftStep]
436        ADD      tVal7, b, b, LSL #1                 ;// 3*b
437        SUB      tVal2, tVal2, tVal7                 ;// a + 16 - 3*b
438        SUB      tVal7, tVal8, tVal10                ;// pSrcLeft[5*leftStep] - pSrcLeft[leftStep]
439        M_LDRB   tVal8, [tVal11],-leftStep           ;// pSrcLeft[4*leftStep]
440        M_LDRB   tVal10,[tVal12],+leftStep           ;// pSrcLeft[2*leftStep]
441        ADD      tVal6, tVal6, tVal7, LSL #1         ;// V = V + V2
442        LDR      r0x00FF00FF, =MASK_CONST            ;// r0x00FF00FF = 0x00FF00FF (r11: left pointer no longer needed)
443        SUB      tVal7, tVal8, tVal10                ;// pSrcLeft[4*leftStep] - pSrcLeft[2*leftStep]
444        ADD      tVal6, tVal6, tVal7                 ;// V = V + V3
445        SUB      dstStep, dstStep, #4                ;// dstStep - 4 (the first word store of each row advances pDst by 4)
446        ADD      tVal6, tVal6, tVal6, LSL #4         ;// 17*V
447        ADD      tVal6, tVal6, #16                   ;// 17*V + 16
448
449        ;// M_STALL ARM1136JS=1
450
451        ASR      c, tVal6, #5                        ;// c = (17*V + 16)>>5
452
453        ;// M_STALL ARM1136JS=1
454
455        ADD      tVal6, c, c, LSL #1                 ;// 3*c
456        UXTH     c, c                                ;// only in half word
457        SUB      tVal6, tVal2, tVal6                 ;// p0 = a - 3*b - 3*c + 16 (signed, may be negative)
458        ORR      c, c, c, LSL #16                    ;// {c,c}
459        ADD      tVal7, b, b                         ;// 2b
460        ADD      tVal2, tVal6, tVal7                 ;// p2 = p0 + 2*b
461        ADD      tVal7, tVal7, b                     ;// 3b
462        PKHBT    p2p0,   tVal6,  tVal2,  LSL #16     ;// p2p0   = pack {p2, p0}; only the low 16 bits of p0 are taken
463        UXTH     b, b
464        UXTH     tVal7, tVal7
465        ORR      b, b, b, LSL #16                    ;// {b,b}
466        ORR      tVal7, tVal7, tVal7, LSL #16        ;// {3b,3b}
467        SADD16   p3p1,   p2p0, b                     ;// p3p1   = p2p0 + {b,b}
468        SADD16   p6p4,   p3p1, tVal7                 ;// p6p4   = p3p1 + {3b,3b}
469        SADD16   p7p5,   p6p4, b                     ;// p7p5   = p6p4 + {b,b}
470        MOV      outerCount, #BLK_SIZE               ;// Outer Loop Count (r14: {b,b} is no longer needed)
471
472LOOP_PLANE
473
474        USAT16   pp7pp5, #13, p7p5                    ;// clip13(p7) clip13(p5), accumulator left untouched
475        USAT16   pp6pp4, #13, p6p4                    ;// clip13(p6) clip13(p4), accumulator left untouched
476        USAT16   pp3pp1, #13, p3p1                    ;// clip13(p3) clip13(p1), accumulator left untouched
477        USAT16   pp2pp0, #13, p2p0                    ;// clip13(p2) clip13(p0), accumulator left untouched
478
479        AND      pp7pp5, r0x00FF00FF, pp7pp5, ASR #5  ;// clip8(p7) clip8(p5)
480        AND      pp6pp4, r0x00FF00FF, pp6pp4, ASR #5  ;// clip8(p6) clip8(p4)
481        AND      pp3pp1, r0x00FF00FF, pp3pp1, ASR #5  ;// clip8(p3) clip8(p1)
482        AND      pp2pp0, r0x00FF00FF, pp2pp0, ASR #5  ;// clip8(p2) clip8(p0)
483
484        SUBS     outerCount, outerCount, #1           ;// outerCount-- (flags consumed by BNE below)
485
486        ORR      p3210, pp2pp0, pp3pp1, LSL #8        ;// pack {p3,p2, p1, p0}
487        STR      p3210, [pDst], #4                    ;// store {pDst[0] to pDst[3]}
488
489        ORR      p7654, pp6pp4, pp7pp5, LSL #8        ;// pack {p7,p6, p5, p4}
490        M_STR    p7654, [pDst], dstStep               ;// store {pDst[4] to pDst[7]}, then move to the next row
491
492        SADD16   p7p5,   p7p5,   c                    ;// {p7 + c}, {p5 + c}
493        SADD16   p6p4,   p6p4,   c                    ;// {p6 + c}, {p4 + c}
494        SADD16   p3p1,   p3p1,   c                    ;// {p3 + c}, {p1 + c}
495        SADD16   p2p0,   p2p0,   c                    ;// {p2 + c}, {p0 + c}
496
497        BNE      LOOP_PLANE                           ;// Loop for 8 times
498        MOV      return, #OMX_Sts_NoErr
499        M_END
500
501        ENDIF ;// ARM1136JS
502
503
504
505        END
506;//-----------------------------------------------------------------------------------------------
507;// omxVCM4P10_PredictIntraChroma_8x8 ends
508;//-----------------------------------------------------------------------------------------------
509