omxVCM4P10_PredictIntra_16x16_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
1;//
2;//
3;// File Name:  omxVCM4P10_PredictIntra_16x16_s.s
4;// OpenMAX DL: v1.0.2
5;// Revision:   12290
6;// Date:       Wednesday, April 9, 2008
7;//
8;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
9;//
10;//
11;//
12
13        INCLUDE omxtypes_s.h
14        INCLUDE armCOMM_s.h
15
16        M_VARIANTS CortexA8
17
18
19;//-------------------------------------------------------
20;// This table implements a C-style switch in asm: the dispatch
21;// code loads the word at index predMode straight into pc.
;// The order of the entries therefore fixes which predMode value
;// selects which case label.
22;//-------------------------------------------------------
23
24    M_TABLE armVCM4P10_pIndexTable16x16
25    DCD  OMX_VC_16X16_VERT, OMX_VC_16X16_HOR         ;// table index 0, table index 1
26    DCD  OMX_VC_16X16_DC,   OMX_VC_16X16_PLANE       ;// table index 2, table index 3
27
28
29    IF CortexA8
30
31    M_TABLE armVCM4P10_MultiplierTable16x16,1        ;// 3 x 8 halfwords read in order by the PLANE case; the ",1" argument is interpreted by the M_TABLE macro (armCOMM_s.h, not shown)
32    DCW   7,  6,  5,  4,  3,  2,  1,  8              ;// weights multiplied with the eight edge differences when forming H and V
33    DCW   0,  1,  2,  3,  4,  5,  6,  7              ;// column indices 0..7, multiplied by b
34    DCW   8,  9, 10, 11, 12, 13, 14, 15              ;// column indices 8..15, multiplied by b
35
36;//--------------------------------------------
37;// Constants
;// Only BLK_SIZE is referenced by the code in this file.
38;//--------------------------------------------
39BLK_SIZE        EQU 0x10                           ;// 16: initial value of the row counter y
40MUL_CONST0      EQU 0x01010101                     ;// not referenced in this file
41MUL_CONST1      EQU 0x00060004                     ;// not referenced in this file
42MUL_CONST2      EQU 0x00070005                     ;// not referenced in this file
43MUL_CONST3      EQU 0x00030001                     ;// not referenced in this file
44MASK_CONST      EQU 0x00FF00FF                     ;// not referenced in this file
45
46;//--------------------------------------------
47;// Scratch variable
;// Several names below share one core register; each
;// pair is used by different prediction cases.
48;//--------------------------------------------
49y               RN 12                              ;// row counter for the HOR and PLANE loops
50pc              RN 15                              ;// program counter, written by the table dispatch
51
52return          RN 0                               ;// status value set before exit (same register as pSrcLeft)
53pTable          RN 9                               ;// address of the predMode jump table
54count           RN 11                              ;// DC case: number of available neighbour edges (0, 1 or 2); shares r11 with pTmp2
55pMultTable      RN 9                               ;// PLANE case: address of the multiplier table (shares r9 with pTable)
56; ----------------------------------------------
57; Neon registers
; Many names are typed views of the same physical register
; (Qn overlaps D2n and D2n+1), so the order in which the code
; writes and reads them matters.
58; ----------------------------------------------
59qAbove          QN Q0.U8                            ;// 16 pixels of the row above
60qLeft           QN Q1.U8                            ;// 16 pixels of the left column (one lane per row)
61qSum8           QN Q0.U16                           ;// DC: 8 pairwise sums
62dSum80          DN D0.U16                           ;// DC: low half of qSum8
63dSum81          DN D1.U16                           ;// DC: high half of qSum8
64dSum4           DN D0.U16                           ;// DC: 4 partial sums
65dSum2           DN D0.U32                           ;// DC: 2 partial sums
66dSum1           DN D0.U64                           ;// not referenced in this file
67qOut            QN Q3.U8                            ;// 16-byte output row (D6:D7)
68dSumLeft        DN D6.U64                           ;// DC: sum of the 16 left pixels (low half of qOut)
69dSumAbove       DN D7.U64                           ;// DC: sum of the 16 above pixels (high half of qOut)
70dSum            DN D8.U64                           ;// DC: rounded average
71dSum0           DN D8.U8[0]                         ;// DC: low byte of dSum, broadcast to the output row
72
73qH              QN Q11.S32                          ;// PLANE: four 32-bit partial products for H
74qV              QN Q12.S32                          ;// PLANE: four 32-bit partial products for V
75qA              QN Q11.S16                          ;// PLANE: term a (reuses Q11 once b and c are extracted)
76qB              QN Q6.S16                           ;// PLANE: b replicated to 8 lanes
77qC              QN Q7.S16                           ;// PLANE: c replicated to 8 lanes
78
79qB0             QN Q5.S16                           ;// PLANE: b * [0..7]
80qB1             QN Q6.S16                           ;// PLANE: b * [8..15] (same register as qB)
81dA1             DN D23.S16                          ;// high half of qA
82
83dH0             DN D22.S32                          ;// halves of qH
84dH1             DN D23.S32
85dV0             DN D24.S32                          ;// halves of qV
86dV1             DN D25.S32
87
88qHV             QN Q11.S64                          ;// PLANE: [V | H], later [c | b], as 64-bit lanes
89qHV0            QN Q11.S32                          ;// 32-bit view of qHV
90qHV1            QN Q12.S64                          ;// PLANE: temporary for the *5 and *7 steps
91
92dHV00           DN D22.S32                          ;// low half of qHV0
93dHV01           DN D23.S32                          ;// high half of qHV0
94
95dHV0            DN D22.S16[0]                       ;// low 16 bits of b
96dHV1            DN D23.S16[0]                       ;// low 16 bits of c
97dHV10           DN D24.S64                          ;// 7*b
98dHV11           DN D25.S64                          ;// 7*c
99
100qSum0           QN Q0.S16                           ;// PLANE: running row values for columns 0..7
101qSum1           QN Q1.S16                           ;// PLANE: running row values for columns 8..15
102
103dOut0           DN D6.U8                            ;// low half of qOut
104dOut1           DN D7.U8                            ;// high half of qOut
105
106dLeft0          DN D2.U8                            ;// low half of qLeft (rows 0..7)
107dLeft1          DN D3.U8                            ;// high half of qLeft (rows 8..15)
108qConst          QN Q13.S16                          ;// PLANE: a - 7*(b+c) replicated
109
110dAbove0         DN D0.U8                            ;// low half of qAbove (columns 0..7)
111dAbove1         DN D1.U8                            ;// high half of qAbove (columns 8..15)
112
113dRevLeft64      DN D12.U64                          ;// 64-bit view of dRevLeft, used for the byte shift
114dRevLeft        DN D12.U8                           ;// left pixels 8..15 in reversed lane order
115dRevAbove64     DN D5.U64                           ;// 64-bit view of dRevAbove, used for the byte shift
116dRevAbove       DN D5.U8                            ;// above pixels 8..15 in reversed lane order
117qLeftDiff       QN Q8.S16                           ;// PLANE: left-column differences
118dLeftDiff1      DN D17.S16                          ;// high half of qLeftDiff
119dLeftDiff64     DN D17.S64                          ;// 64-bit view of dLeftDiff1
120qDiffLeft       QN Q8.S16                           ;// not referenced in this file
121qDiffAbove      QN Q4.S16                           ;// not referenced in this file
122dAboveDiff1     DN D9.S16                           ;// high half of qAboveDiff
123dAboveDiff64    DN D9.S64                           ;// 64-bit view of dAboveDiff1
124qAboveDiff      QN Q4.S16                           ;// PLANE: above-row differences
125
126dAboveLeft      DN D4.U8                            ;// lane 0 holds the above-left pixel
127
128dDiffLeft0      DN D16.S16                          ;// halves of qLeftDiff as multiplied into V
129dDiffLeft1      DN D17.S16
130dDiffAbove0     DN D8.S16                           ;// halves of qAboveDiff as multiplied into H
131dDiffAbove1     DN D9.S16
132
133qLeft15minus0   QN Q7.S16                           ;// lane 0 = pSrcLeft[15] - above-left (Q7 is reused later as qC)
134dLeft15minus0   DN D14.S16                          ;// low half of qLeft15minus0
135qAbove15minus0  QN Q3.S16                           ;// lane 0 = pSrcAbove[15] - above-left (Q3 is reused later as qOut)
136dAbove15minus0  DN D6.S16                           ;// low half of qAbove15minus0
137
138qMultiplier     QN Q10.S16                          ;// first row of the multiplier table
139qMultiplier0    QN Q10.S16                          ;// second row of the multiplier table (same register)
140qMultiplier1    QN Q12.S16                          ;// third row of the multiplier table
141dMultiplier0    DN D20.S16                          ;// halves of qMultiplier
142dMultiplier1    DN D21.S16
143
144dBPlusCMult7    DN D1.S64                           ;// 7*b + 7*c
145dBPlusCMult7S16 DN D1.S16                           ;// 16-bit view of the same register
146
147qTmp            QN Q0.U8                            ;// HOR: second row buffer
148
149;//--------------------------------------------
150;// Declare input registers
;// r0-r3 hold the four pointer arguments on entry;
;// r4-r7 are filled from the stack by the function body.
151;//--------------------------------------------
152pSrcLeft        RN 0    ;// input pointer
153pSrcAbove       RN 1    ;// input pointer
154pSrcAboveLeft   RN 2    ;// input pointer
155pDst            RN 3    ;// output pointer
156leftStep        RN 4    ;// input variable
157dstStep         RN 5    ;// input variable
158predMode        RN 6    ;// input variable
159availability    RN 7    ;// input variable
160
161pTmp            RN 8    ;// second walking pointer (odd rows)
162step            RN 10   ;// doubled stride used when two pointers interleave rows
163pTmp2           RN 11   ;// HOR case: second destination pointer (shares r11 with count)
164
165;//-----------------------------------------------------------------------------------------------
166;// omxVCM4P10_PredictIntra_16x16 starts
;//
;// Writes 16 rows of 16 bytes to pDst using the prediction case selected
;// by predMode (VERT, HOR, DC or PLANE).
;//
;// Inputs:
;//   r0 pSrcLeft       left neighbours, one byte read per row, rows leftStep apart
;//   r1 pSrcAbove      16 bytes of the row above
;//   r2 pSrcAboveLeft  one byte, read only by the PLANE case
;//   r3 pDst           destination, rows dstStep apart
;//   stack             LeftStep, DstStep, PredMode, Availability (4 bytes each)
;// Output:
;//   r0 = OMX_Sts_NoErr on every path.
;//
;// NOTE(review): nothing is validated here. predMode is used unchecked as an
;// index into the 4-entry jump table, and only the DC case tests availability.
;// Callers presumably guarantee valid arguments - confirm.
167;//-----------------------------------------------------------------------------------------------
168
169        ;// Write function header
170        M_START omxVCM4P10_PredictIntra_16x16, r11, d15     ;// macro from armCOMM_s.h (not shown); r11/d15 presumably name the highest registers to preserve - confirm
171
172        ;// Define stack arguments
173        M_ARG    LeftStep,     4
174        M_ARG    DstStep,      4
175        M_ARG    PredMode,     4
176        M_ARG    Availability, 4
177
178        ;// M_STALL ARM1136JS=4
179
180        LDR      pTable,=armVCM4P10_pIndexTable16x16 ;// Load index table for switch case
181
182        ;// Load argument from the stack
183        M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg
184        M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
185        M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg
186        M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg
187
188        MOV      y, #BLK_SIZE                        ;// Row counter = 16 (consumed by the HOR and PLANE loops)
189        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode
190
        ;// VERT: copy the row above into all 16 destination rows.
        ;// Two pointers walk even and odd rows, each advancing by 2*dstStep.
191OMX_VC_16X16_VERT
192        VLD1    qAbove,  [pSrcAbove]                 ;// 16 bytes of the row above
193        ADD     pTmp, pDst, dstStep                  ;// pTmp -> row 1
194        ADD     step, dstStep, dstStep               ;// step = 2*dstStep
195        VST1    qAbove, [pDst], step                 ;// row 0
196        VST1    qAbove, [pTmp], step                 ;// row 1
197        VST1    qAbove, [pDst], step
198        VST1    qAbove, [pTmp], step
199        VST1    qAbove, [pDst], step
200        VST1    qAbove, [pTmp], step
201        VST1    qAbove, [pDst], step
202        VST1    qAbove, [pTmp], step
203        VST1    qAbove, [pDst], step
204        VST1    qAbove, [pTmp], step
205        VST1    qAbove, [pDst], step
206        VST1    qAbove, [pTmp], step
207        VST1    qAbove, [pDst], step
208        VST1    qAbove, [pTmp], step
209        VST1    qAbove, [pDst]                       ;// row 14
210        VST1    qAbove, [pTmp]                       ;// row 15
211        MOV     return, #OMX_Sts_NoErr               ;// returnNoError
212        M_EXIT
213
        ;// HOR: each destination row is its left-neighbour byte replicated 16 times.
        ;// Even rows go through pSrcLeft/pDst, odd rows through pTmp/pTmp2.
214OMX_VC_16X16_HOR
215        ADD     pTmp, pSrcLeft, leftStep             ;// pTmp -> left pixel of row 1
216        ADD     leftStep, leftStep, leftStep         ;// leftStep = 2*leftStep
217        ADD     pTmp2, pDst, dstStep                 ;// pTmp2 -> destination row 1
218        ADD     dstStep, dstStep, dstStep            ;// dstStep = 2*dstStep
219LoopHor
220        VLD1     {qLeft[]}, [pSrcLeft], leftStep     ;// load one byte into all 16 lanes (even row)
221        VLD1     {qTmp[]}, [pTmp], leftStep          ;// load one byte into all 16 lanes (odd row)
222        SUBS     y, y, #8                            ;// 8 rows are written per pass; flags are consumed by the BNE below
223        VST1     qLeft, [pDst], dstStep
224        VST1     qTmp, [pTmp2], dstStep
225        VLD1     {qLeft[]}, [pSrcLeft], leftStep
226        VLD1     {qTmp[]}, [pTmp], leftStep
227        VST1     qLeft, [pDst], dstStep
228        VST1     qTmp, [pTmp2], dstStep
229        VLD1     {qLeft[]}, [pSrcLeft], leftStep
230        VLD1     {qTmp[]}, [pTmp], leftStep
231        VST1     qLeft, [pDst], dstStep
232        VST1     qTmp, [pTmp2], dstStep
233        VLD1     {qLeft[]}, [pSrcLeft], leftStep
234        VLD1     {qTmp[]}, [pTmp], leftStep
235        VST1     qLeft, [pDst], dstStep
236        VST1     qTmp, [pTmp2], dstStep
237
238        BNE      LoopHor                                  ;// Two passes of 8 rows each = 16 rows
239        MOV      return, #OMX_Sts_NoErr
240        M_EXIT
241
        ;// DC: fill the block with the rounded mean of the available neighbours:
        ;//   left only or upper only : (sum + 8)  >> 4
        ;//   both                    : (sumLeft + sumAbove + 16) >> 5
        ;//   neither                 : 128
242OMX_VC_16X16_DC
243        MOV      count, #0                                 ;// count = 0
244        TST      availability, #OMX_VC_LEFT
245        BEQ      UpperOrNoneAvailable                      ;// Jump to Upper if not left
246
247        ADD     pTmp, pSrcLeft, leftStep                   ;// pTmp -> left pixel of row 1
248        ADD     step, leftStep, leftStep                   ;// step = 2*leftStep
249
        ;// Gather the 16 left pixels, one per lane (even rows via pSrcLeft, odd via pTmp)
250        VLD1    {qLeft[0]}, [pSrcLeft],step
251        VLD1    {qLeft[1]}, [pTmp],step
252        VLD1    {qLeft[2]}, [pSrcLeft],step
253        VLD1    {qLeft[3]}, [pTmp],step
254        VLD1    {qLeft[4]}, [pSrcLeft],step
255        VLD1    {qLeft[5]}, [pTmp],step
256        VLD1    {qLeft[6]}, [pSrcLeft],step
257        VLD1    {qLeft[7]}, [pTmp],step
258        VLD1    {qLeft[8]}, [pSrcLeft],step
259        VLD1    {qLeft[9]}, [pTmp],step
260        VLD1    {qLeft[10]},[pSrcLeft],step
261        VLD1    {qLeft[11]},[pTmp],step
262        VLD1    {qLeft[12]},[pSrcLeft],step
263        VLD1    {qLeft[13]},[pTmp],step
264        VLD1    {qLeft[14]},[pSrcLeft],step
265        VLD1    {qLeft[15]},[pTmp]
266
267        VPADDL   qSum8, qLeft                              ;// 16 bytes -> 8 halfword pair sums
268        ADD     count, count, #1                           ;// left edge counted
269        VPADD    dSum4, dSum80, dSum81                     ;// 8 -> 4 sums
270        VPADDL   dSum2, dSum4                              ;// 4 -> 2 sums
271        VPADDL   dSumLeft, dSum2                           ;// 2 -> 1: sum of the 16 left pixels
272        VRSHR    dSum, dSumLeft, #4                        ;// (sumLeft + 8) >> 4, used if only left is available
273
274UpperOrNoneAvailable
275        TST      availability,  #OMX_VC_UPPER              ;// if(availability & #OMX_VC_UPPER)
276        BEQ      BothOrNoneAvailable                       ;// Skip the above-row sum if upper is not available
277        VLD1     qAbove, [pSrcAbove]
278        ADD      count, count, #1                          ;// if upper inc count by 1
279        VPADDL   qSum8, qAbove                             ;// same reduction as for the left column
280        VPADD    dSum4, dSum80, dSum81
281        VPADDL   dSum2, dSum4
282        VPADDL   dSumAbove, dSum2                          ;// sum of the 16 above pixels
283        VRSHR    dSum, dSumAbove, #4                       ;// (sumAbove + 8) >> 4, replaces the left-only value
284
285BothOrNoneAvailable
286        CMP      count, #2                                  ;// check if both available
287        BNE      NoneAvailable
288        VADD     dSum, dSumAbove, dSumLeft
289        VRSHR    dSum, dSum, #5                             ;// (sumAbove + sumLeft + 16) >> 5
290
291
        ;// Reached on every DC path despite the label name.
292NoneAvailable
293        VDUP     qOut, dSum0                                ;// broadcast the mean byte (overwrites dSumLeft/dSumAbove, no longer needed)
294        CMP      count, #0                                  ;// check if none available
295        ADD      pTmp, pDst, dstStep                        ;// pTmp -> row 1 (ADD without S keeps the CMP flags)
296        ADD      step, dstStep, dstStep                     ;// step = 2*dstStep
297        BNE      LoopDC
298        VMOV     qOut, #128                                 ;// no neighbours: use the constant 128
299LoopDC
300        VST1    qOut, [pDst], step                          ;// row 0
301        VST1    qOut, [pTmp], step                          ;// row 1
302        VST1    qOut, [pDst], step
303        VST1    qOut, [pTmp], step
304        VST1    qOut, [pDst], step
305        VST1    qOut, [pTmp], step
306        VST1    qOut, [pDst], step
307        VST1    qOut, [pTmp], step
308        VST1    qOut, [pDst], step
309        VST1    qOut, [pTmp], step
310        VST1    qOut, [pDst], step
311        VST1    qOut, [pTmp], step
312        VST1    qOut, [pDst], step
313        VST1    qOut, [pTmp], step
314        VST1    qOut, [pDst], step                          ;// row 14 (pointer update unused afterwards)
315        VST1    qOut, [pTmp], step                          ;// row 15 (pointer update unused afterwards)
316        MOV     return, #OMX_Sts_NoErr
317        M_EXIT
318
        ;// PLANE: out[y][x] = sat8((a + b*(x-7) + c*(y-7) + 16) >> 5), where
        ;//   H = sum of weight * (above[8+k] - above[6-k]), weights 1..8, above[-1] = above-left
        ;//   V = the same sum over the left column
        ;//   b = (5*H + 32) >> 6, c = (5*V + 32) >> 6, a = 16*(above[15] + left[15])
319OMX_VC_16X16_PLANE
320        LDR     pMultTable, =armVCM4P10_MultiplierTable16x16
321        VLD1    qAbove, [pSrcAbove]                         ;// pSrcAbove[x]      :0<= x <= 15
322        VLD1    dAboveLeft[0],[pSrcAboveLeft]               ;// only lane 0 is loaded; only lane 0 of the results that use it is consumed
323        ADD     pTmp, pSrcLeft, leftStep
324        ADD     step, leftStep, leftStep
325        VLD1    {qLeft[0]},  [pSrcLeft],step
326        VLD1    {qLeft[1]},  [pTmp],step
327        VLD1    {qLeft[2]},  [pSrcLeft],step
328        VLD1    {qLeft[3]},  [pTmp],step
329        VLD1    {qLeft[4]},  [pSrcLeft],step
330        VLD1    {qLeft[5]},  [pTmp],step
331        VLD1    {qLeft[6]},  [pSrcLeft],step
332        VLD1    {qLeft[7]},  [pTmp],step
333        VLD1    {qLeft[8]},  [pSrcLeft],step
334        VLD1    {qLeft[9]},  [pTmp],step
335        VLD1    {qLeft[10]}, [pSrcLeft],step
336        VLD1    {qLeft[11]}, [pTmp],step
337        VLD1    {qLeft[12]}, [pSrcLeft],step
338        VLD1    {qLeft[13]}, [pTmp],step
339        VLD1    {qLeft[14]}, [pSrcLeft],step
340        VLD1    {qLeft[15]}, [pTmp]
341
342        VREV64  dRevAbove, dAbove1                          ;// pSrcAbove[15:14:13:12:11:10:9:8]
343        VSUBL   qAbove15minus0, dRevAbove, dAboveLeft       ;// qAbove15minus0[0] = pSrcAbove[15] - pSrcAboveLeft[0]
344        VSHR    dRevAbove64, dRevAbove64, #8                ;// pSrcAbove[14:13:12:11:10:9:8:X]
345        VSUBL   qAboveDiff, dRevAbove, dAbove0              ;// lane i = pSrcAbove[14-i] - pSrcAbove[i] for i = 0..6; lane 7 is discarded below
346
347        VSHL    dAboveDiff64, dAboveDiff64, #16             ;// move lanes 4..6 up by one halfword, dropping lane 7
348        VEXT    dDiffAbove1, dAboveDiff1, dAbove15minus0, #1 ;// restore lanes 4..6 and append (pSrcAbove[15] - above-left) as lane 7
349
350        VREV64  dRevLeft,dLeft1                             ;// pSrcLeft[15:14:13:12:11:10:9:8]
351        VSUBL   qLeft15minus0,dRevLeft, dAboveLeft          ;// qLeft15minus0[0] = pSrcLeft[15] - pSrcAboveLeft[0]
352        VSHR    dRevLeft64, dRevLeft64, #8                  ;// pSrcLeft[14:13:12:11:10:9:8:X]
353        VSUBL   qLeftDiff,dRevLeft, dLeft0                  ;// lane i = pSrcLeft[14-i] - pSrcLeft[i] for i = 0..6
354
355        ;// Multiplier = [8|1|2|...|6|7]  (lane 7 down to lane 0)
356        VLD1    qMultiplier, [pMultTable]!
357
358        VSHL    dLeftDiff64, dLeftDiff64, #16               ;// same lane fix-up as for the above row
359        VEXT    dDiffLeft1, dLeftDiff1, dLeft15minus0, #1
360
361        VMULL   qH,dDiffAbove0, dMultiplier0                ;// weighted differences, 4 x 32-bit partial sums each
362        VMULL   qV,dDiffLeft0,  dMultiplier0
363        VMLAL   qH,dDiffAbove1, dMultiplier1
364        VMLAL   qV,dDiffLeft1,  dMultiplier1
365
366        VPADD   dHV00,dH1,dH0                               ;// D22 = two partial sums of H
367        VPADD   dHV01,dV1,dV0                               ;// D23 = two partial sums of V
368        VPADDL  qHV, qHV0                                   ;// D22 = H, D23 = V as 64-bit values
369        VSHL    qHV1,qHV,#2                                 ;// 4*H, 4*V
370        VADD    qHV,qHV,qHV1                                ;// 5*H, 5*V
371
372        ;// HV = [c = ((5*V+32)>>6) | b = ((5*H+32)>>6)]
373        VRSHR   qHV,qHV,#6
374
375        ;// HV1 = [c*7|b*7]
376        VSHL    qHV1,qHV,#3                                 ;// 8*b, 8*c
377        VSUB    qHV1,qHV1,qHV                               ;// 8*x - x = 7*x
378
379        ;// Multiplier0 = [0|1|2|...|7]
380        VLD1    qMultiplier0, [pMultTable]!
381        VDUP    qB, dHV0                                    ;// b in all 8 lanes
382        VDUP    qC, dHV1                                    ;// c in all 8 lanes
383
384        VADDL   qA,dAbove1,dLeft1                           ;// lane 7 = pSrcAbove[15] + pSrcLeft[15] (overwrites qHV, already consumed into qB/qC)
385        VSHL    qA,qA, #4                                   ;// * 16
386        VDUP    qA,dA1[3]                                   ;// a = 16*(pSrcAbove[15] + pSrcLeft[15]) in all lanes
387        VADD    dBPlusCMult7, dHV10, dHV11                  ;// 7*b + 7*c (must precede the reload of Q12 below)
388
389        ;// Multiplier1 = [8|9|10|...|15]
390        VLD1    qMultiplier1, [pMultTable]
391        ;// Const = a - 7*(b+c)
392        VDUP    qConst, dBPlusCMult7S16[0]
393        VSUB    qConst, qA, qConst
394
395        ;// B0 = [0*b|1*b|2*b|3*b|......|7*b]
396        VMUL    qB0,qB,qMultiplier0
397
398        ;// B1 = [8*b|9*b|10*b|11*b|....|15*b]  (written over qB, which is not needed afterwards)
399        VMUL    qB1,qB,qMultiplier1
400
401        VADD    qSum0, qB0, qConst                          ;// row 0, columns 0..7 before rounding
402        VADD    qSum1, qB1, qConst                          ;// row 0, columns 8..15 before rounding
403
404        ;// Loops for 16 times
405LoopPlane
406        ;// (b*x + c*y + C)>>5
407        VQRSHRUN dOut0, qSum0,#5                            ;// rounding shift, saturate to unsigned 8 bits
408        VQRSHRUN dOut1, qSum1,#5
409        SUBS     y, y, #1
410        VST1     qOut,[pDst],dstStep
411        VADD     qSum0,qSum0,qC                             ;// next row: add c
412        VADD     qSum1,qSum1,qC
413        BNE      LoopPlane
414
415        MOV      return, #OMX_Sts_NoErr
416
417        M_END                                              ;// macro from armCOMM_s.h (not shown); presumably emits the common exit/return - confirm
418
419        ENDIF ;// CortexA8
420
421        END
422;-----------------------------------------------------------------------------------------------
423; omxVCM4P10_PredictIntra_16x16 ends
424;-----------------------------------------------------------------------------------------------
425