omxVCM4P10_PredictIntra_16x16_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  omxVCM4P10_PredictIntra_16x16_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   12290
21;// Date:       Wednesday, April 9, 2008
22;//
23;//
24;//
25;//
26
27        INCLUDE omxtypes_s.h
28        INCLUDE armCOMM_s.h
29
30        M_VARIANTS CortexA8
31
32
33;//-------------------------------------------------------
34;// Jump table that implements a C-style switch in asm:
35;// the mode value indexes this table, and the word found there is the handler address.
;// Entry order (index 0..3): VERT, HOR, DC, PLANE. The function body loads
;// pc from [table + predMode*4], so this order has to match the numeric
;// mode values supplied in predMode.
36;//-------------------------------------------------------
37
38    M_TABLE armVCM4P10_pIndexTable16x16
39    DCD  OMX_VC_16X16_VERT, OMX_VC_16X16_HOR
40    DCD  OMX_VC_16X16_DC,   OMX_VC_16X16_PLANE
41
42
43    IF CortexA8
44
;// Weights for the plane-prediction handler: three rows of eight 16-bit
;// values. The handler reads the rows in order, one 128-bit VLD1 per row.
45    M_TABLE armVCM4P10_MultiplierTable16x16,1
;// Row 0: weights multiplied with the above/left difference terms when
;//        accumulating H and V (lane 7 carries the weight-8 corner term).
46    DCW   7,  6,  5,  4,  3,  2,  1,  8
;// Row 1: column indices x = 0..7, multiplied by b for the left 8 pixels.
47    DCW   0,  1,  2,  3,  4,  5,  6,  7
;// Row 2: column indices x = 8..15, multiplied by b for the right 8 pixels.
48    DCW   8,  9, 10, 11, 12, 13, 14, 15
49
50;//--------------------------------------------
51;// Constants
52;//--------------------------------------------
53BLK_SIZE        EQU 0x10          ;// 16: initial value of the row counter y
54MUL_CONST0      EQU 0x01010101    ;// NOTE(review): MUL_CONST0..3 and MASK_CONST are not
55MUL_CONST1      EQU 0x00060004    ;// referenced by any instruction visible in this file;
56MUL_CONST2      EQU 0x00070005    ;// presumably left over from a non-NEON variant -
57MUL_CONST3      EQU 0x00030001    ;// confirm before removing.
58MASK_CONST      EQU 0x00FF00FF
59
60;//--------------------------------------------
61;// Scratch variable
62;//--------------------------------------------
63y               RN 12   ;// row counter; set to BLK_SIZE before the mode dispatch
64pc              RN 15
65
66return          RN 0    ;// status code written before returning; same register as pSrcLeft
67pTable          RN 9    ;// base address of the mode jump table
68count           RN 11   ;// DC handler: number of available neighbours (0, 1 or 2); same register as pTmp2
69pMultTable      RN 9    ;// plane handler: cursor into the multiplier table; reuses pTable's register
70; ----------------------------------------------
71; Neon registers
72; ----------------------------------------------
;// Typed aliases for the NEON registers used by the four mode handlers.
;// Many aliases name the same physical register (Qn overlays D2n and D2n+1),
;// so the handlers depend on instruction order to keep live values apart.
;// Overlaps that the code relies on:
;//   Q0  : qAbove, qSum8, qSum0, qTmp       Q1  : qLeft, qSum1
;//   Q3  : qOut, qAbove15minus0 (D6/D7 are also dSumLeft/dSumAbove, dOut0/dOut1)
;//   Q4  : qAboveDiff (its low half D8 is also dSum)
;//   Q6  : qB and qB1                       Q7  : qC and qLeft15minus0
;//   Q10 : qMultiplier and qMultiplier0
;//   Q11 : qH, qA, qHV                      Q12 : qV, qHV1, qMultiplier1
;//   D1  : dAbove1, dSum81 and dBPlusCMult7

;// Source rows and the DC-mode reduction chain
73qAbove          QN Q0.U8
74qLeft           QN Q1.U8
75qSum8           QN Q0.U16
76dSum80          DN D0.U16
77dSum81          DN D1.U16
78dSum4           DN D0.U16
79dSum2           DN D0.U32
80dSum1           DN D0.U64
81qOut            QN Q3.U8
82dSumLeft        DN D6.U64
83dSumAbove       DN D7.U64
84dSum            DN D8.U64
85dSum0           DN D8.U8[0]
86
;// Plane mode: H/V accumulators and the a, b, c coefficients
87qH              QN Q11.S32
88qV              QN Q12.S32
89qA              QN Q11.S16
90qB              QN Q6.S16
91qC              QN Q7.S16
92
93qB0             QN Q5.S16
94qB1             QN Q6.S16
95dA1             DN D23.S16
96
97dH0             DN D22.S32
98dH1             DN D23.S32
99dV0             DN D24.S32
100dV1             DN D25.S32
101
102qHV             QN Q11.S64
103qHV0            QN Q11.S32
104qHV1            QN Q12.S64
105
106dHV00           DN D22.S32
107dHV01           DN D23.S32
108
109dHV0            DN D22.S16[0]
110dHV1            DN D23.S16[0]
111dHV10           DN D24.S64
112dHV11           DN D25.S64
113
;// Plane mode: per-row running sums for the left and right 8 pixels
114qSum0           QN Q0.S16
115qSum1           QN Q1.S16
116
117dOut0           DN D6.U8
118dOut1           DN D7.U8
119
120dLeft0          DN D2.U8
121dLeft1          DN D3.U8
122qConst          QN Q13.S16
123
124dAbove0         DN D0.U8
125dAbove1         DN D1.U8
126
;// Plane mode: reversed neighbour halves and their differences
127dRevLeft64      DN D12.U64
128dRevLeft        DN D12.U8
129dRevAbove64     DN D5.U64
130dRevAbove       DN D5.U8
131qLeftDiff       QN Q8.S16
132dLeftDiff1      DN D17.S16
133dLeftDiff64     DN D17.S64
134qDiffLeft       QN Q8.S16
135qDiffAbove      QN Q4.S16
136dAboveDiff1     DN D9.S16
137dAboveDiff64    DN D9.S64
138qAboveDiff      QN Q4.S16
139
140dAboveLeft      DN D4.U8
141
142dDiffLeft0      DN D16.S16
143dDiffLeft1      DN D17.S16
144dDiffAbove0     DN D8.S16
145dDiffAbove1     DN D9.S16
146
147qLeft15minus0   QN Q7.S16
148dLeft15minus0   DN D14.S16
149qAbove15minus0  QN Q3.S16
150dAbove15minus0  DN D6.S16
151
;// Plane mode: weight rows loaded from armVCM4P10_MultiplierTable16x16
152qMultiplier     QN Q10.S16
153qMultiplier0    QN Q10.S16
154qMultiplier1    QN Q12.S16
155dMultiplier0    DN D20.S16
156dMultiplier1    DN D21.S16
157
158dBPlusCMult7    DN D1.S64
159dBPlusCMult7S16 DN D1.S16
160
;// Horizontal mode: second broadcast register (odd rows)
161qTmp            QN Q0.U8
162
163;//--------------------------------------------
164;// Declare input registers
165;//--------------------------------------------
166pSrcLeft        RN 0    ;// in: first left-neighbour byte; further bytes are leftStep apart
167pSrcAbove       RN 1    ;// in: 16 contiguous bytes of the row above
168pSrcAboveLeft   RN 2    ;// in: single above-left byte (read only by the plane handler)
169pDst            RN 3    ;// out: first output row; 16 bytes are written per row
170leftStep        RN 4    ;// byte stride of the left column (loaded from the stack)
171dstStep         RN 5    ;// byte stride between output rows (loaded from the stack)
172predMode        RN 6    ;// index into armVCM4P10_pIndexTable16x16 (loaded from the stack)
173availability    RN 7    ;// bit mask tested against OMX_VC_LEFT / OMX_VC_UPPER in the DC handler
174
175pTmp            RN 8    ;// second cursor, used for the odd-numbered rows
176step            RN 10   ;// twice a stride, so the two cursors leapfrog each other
177pTmp2           RN 11   ;// horizontal handler: odd-row output cursor; same register as count
178
179;//-----------------------------------------------------------------------------------------------
180;// omxVCM4P10_PredictIntra_16x16 starts
181;//-----------------------------------------------------------------------------------------------
182
183        ;// Write function header
;// omxVCM4P10_PredictIntra_16x16 - entry point and mode dispatch.
;//
;// Register arguments: r0 pSrcLeft, r1 pSrcAbove, r2 pSrcAboveLeft, r3 pDst.
;// Stack arguments (loaded below into r4-r7): leftStep, dstStep, predMode,
;// availability.
;// Each handler writes a 16x16 block of bytes at pDst (rows dstStep apart)
;// and leaves OMX_Sts_NoErr in r0; no other status is produced in this file.
;//
;// NOTE(review): predMode is used unchecked as an index into the 4-entry
;// jump table, and no pointer or availability validation is done here, so a
;// mode outside 0..3 branches to whatever word follows the table - confirm
;// that callers guarantee the range.
;// NOTE(review): the "r11, d15" operands presumably tell M_START which
;// callee-saved core/NEON registers to preserve - verify in armCOMM_s.h.
184        M_START omxVCM4P10_PredictIntra_16x16, r11, d15
185
186        ;// Define stack arguments
187        M_ARG    LeftStep,     4
188        M_ARG    DstStep,      4
189        M_ARG    PredMode,     4
190        M_ARG    Availability, 4
191
192        ;// M_STALL ARM1136JS=4
193
194        LDR      pTable,=armVCM4P10_pIndexTable16x16 ;// Base of the jump table used as the switch
195
196        ;// Load argument from the stack
197        M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg
198        M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
199        M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg
200        M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg
201
202        MOV      y, #BLK_SIZE                        ;// Row counter for the looping handlers (HOR, PLANE)
203        LDR      pc, [pTable, predMode, LSL #2]      ;// Jump to the handler whose address is at table[predMode]
204
;// Vertical mode: each of the 16 output rows is a copy of the 16 bytes at
;// pSrcAbove. Even rows are written through pDst and odd rows through pTmp;
;// both cursors advance by 2*dstStep. The store sequence is fully unrolled.
205OMX_VC_16X16_VERT
206        VLD1    qAbove,  [pSrcAbove]                 ;// the 16 above-neighbour bytes
207        ADD     pTmp, pDst, dstStep                  ;// pTmp = address of row 1
208        ADD     step, dstStep, dstStep               ;// step = 2 * dstStep
209        VST1    qAbove, [pDst], step
210        VST1    qAbove, [pTmp], step
211        VST1    qAbove, [pDst], step
212        VST1    qAbove, [pTmp], step
213        VST1    qAbove, [pDst], step
214        VST1    qAbove, [pTmp], step
215        VST1    qAbove, [pDst], step
216        VST1    qAbove, [pTmp], step
217        VST1    qAbove, [pDst], step
218        VST1    qAbove, [pTmp], step
219        VST1    qAbove, [pDst], step
220        VST1    qAbove, [pTmp], step
221        VST1    qAbove, [pDst], step
222        VST1    qAbove, [pTmp], step
223        VST1    qAbove, [pDst]                       ;// rows 14 and 15: no post-increment needed
224        VST1    qAbove, [pTmp]
225        MOV     return, #OMX_Sts_NoErr               ;// returnNoError
226        M_EXIT
227
;// Horizontal mode: output row r is the left-neighbour byte of row r
;// replicated across all 16 columns. Even rows use pSrcLeft/pDst and odd
;// rows use pTmp/pTmp2. leftStep and dstStep are doubled in place, so from
;// here on they hold two-row strides rather than the caller's values.
228OMX_VC_16X16_HOR
229        ADD     pTmp, pSrcLeft, leftStep             ;// pTmp  = left byte of row 1
230        ADD     leftStep, leftStep, leftStep         ;// leftStep *= 2
231        ADD     pTmp2, pDst, dstStep                 ;// pTmp2 = output row 1
232        ADD     dstStep, dstStep, dstStep            ;// dstStep *= 2
;// Each pass produces 8 rows (4 even + 4 odd); y goes 16 -> 8 -> 0.
233LoopHor
234        VLD1     {qLeft[]}, [pSrcLeft], leftStep     ;// broadcast one left byte to all 16 lanes
235        VLD1     {qTmp[]}, [pTmp], leftStep
236        SUBS     y, y, #8                            ;// sets the flags consumed by BNE below
237        VST1     qLeft, [pDst], dstStep
238        VST1     qTmp, [pTmp2], dstStep
239        VLD1     {qLeft[]}, [pSrcLeft], leftStep
240        VLD1     {qTmp[]}, [pTmp], leftStep
241        VST1     qLeft, [pDst], dstStep
242        VST1     qTmp, [pTmp2], dstStep
243        VLD1     {qLeft[]}, [pSrcLeft], leftStep
244        VLD1     {qTmp[]}, [pTmp], leftStep
245        VST1     qLeft, [pDst], dstStep
246        VST1     qTmp, [pTmp2], dstStep
247        VLD1     {qLeft[]}, [pSrcLeft], leftStep
248        VLD1     {qTmp[]}, [pTmp], leftStep
249        VST1     qLeft, [pDst], dstStep
250        VST1     qTmp, [pTmp2], dstStep
251
252        BNE      LoopHor                                  ;// Two passes of 8 rows each = 16 rows
253        MOV      return, #OMX_Sts_NoErr
254        M_EXIT
255
;// DC mode: fill the whole block with one value chosen by which neighbours
;// the availability mask reports:
;//   left and upper : (sum(left) + sum(above) + 16) >> 5
;//   only one       : (sum of that neighbour + 8) >> 4
;//   neither        : 128
;// count records how many neighbours contributed (0, 1 or 2). The left
;// column is only read when OMX_VC_LEFT is set, and the above row only when
;// OMX_VC_UPPER is set.
256OMX_VC_16X16_DC
257        MOV      count, #0                                 ;// count = 0
258        TST      availability, #OMX_VC_LEFT
259        BEQ      UpperOrNoneAvailable                      ;// Skip the left column if it is not available
260
261        ADD     pTmp, pSrcLeft, leftStep                   ;// pTmp = left byte of row 1
262        ADD     step, leftStep, leftStep                   ;// step = 2 * leftStep
263
        ;// Gather the 16 left-neighbour bytes into qLeft, one lane per row
264        VLD1    {qLeft[0]}, [pSrcLeft],step
265        VLD1    {qLeft[1]}, [pTmp],step
266        VLD1    {qLeft[2]}, [pSrcLeft],step
267        VLD1    {qLeft[3]}, [pTmp],step
268        VLD1    {qLeft[4]}, [pSrcLeft],step
269        VLD1    {qLeft[5]}, [pTmp],step
270        VLD1    {qLeft[6]}, [pSrcLeft],step
271        VLD1    {qLeft[7]}, [pTmp],step
272        VLD1    {qLeft[8]}, [pSrcLeft],step
273        VLD1    {qLeft[9]}, [pTmp],step
274        VLD1    {qLeft[10]},[pSrcLeft],step
275        VLD1    {qLeft[11]},[pTmp],step
276        VLD1    {qLeft[12]},[pSrcLeft],step
277        VLD1    {qLeft[13]},[pTmp],step
278        VLD1    {qLeft[14]},[pSrcLeft],step
279        VLD1    {qLeft[15]},[pTmp]
280
        ;// Horizontal reduction: 16 x u8 -> 8 x u16 -> 4 x u16 -> 2 x u32 -> 1 x u64
281        VPADDL   qSum8, qLeft
282        ADD     count, count, #1                           ;// left neighbour counted
283        VPADD    dSum4, dSum80, dSum81
284        VPADDL   dSum2, dSum4
285        VPADDL   dSumLeft, dSum2                           ;// dSumLeft = sum of the 16 left bytes
286        VRSHR    dSum, dSumLeft, #4                        ;// dSum = (sumLeft + 8) >> 4 (rounding shift)
287
288UpperOrNoneAvailable
289        TST      availability,  #OMX_VC_UPPER              ;// if(availability & #OMX_VC_UPPER)
290        BEQ      BothOrNoneAvailable                       ;// Skip the above row if it is not available
291        VLD1     qAbove, [pSrcAbove]
292        ADD      count, count, #1                          ;// if upper inc count by 1
293        VPADDL   qSum8, qAbove                             ;// same reduction chain as for the left column
294        VPADD    dSum4, dSum80, dSum81
295        VPADDL   dSum2, dSum4
296        VPADDL   dSumAbove, dSum2                          ;// dSumAbove = sum of the 16 above bytes
297        VRSHR    dSum, dSumAbove, #4                       ;// dSum = (sumAbove + 8) >> 4; replaces the left-only value
298
299BothOrNoneAvailable
300        CMP      count, #2                                  ;// check if both available
301        BNE      NoneAvailable
302        VADD     dSum, dSumAbove, dSumLeft
303        VRSHR    dSum, dSum, #5                             ;// dSum = (sumAbove + sumLeft + 16) >> 5
304
305
306NoneAvailable
        ;// qOut shares D6/D7 with dSumLeft/dSumAbove, so the broadcast must
        ;// come after the sums above have been consumed.
307        VDUP     qOut, dSum0                                ;// replicate the low byte of dSum to 16 lanes
308        CMP      count, #0                                  ;// check if none available
309        ADD      pTmp, pDst, dstStep                        ;// pTmp = output row 1
310        ADD      step, dstStep, dstStep                     ;// step = 2 * dstStep
311        BNE      LoopDC
312        VMOV     qOut, #128                                 ;// no neighbours: overwrite the fill value with 128
;// Despite the label name this is not a loop: 16 unrolled row stores,
;// even rows through pDst and odd rows through pTmp.
313LoopDC
314        VST1    qOut, [pDst], step
315        VST1    qOut, [pTmp], step
316        VST1    qOut, [pDst], step
317        VST1    qOut, [pTmp], step
318        VST1    qOut, [pDst], step
319        VST1    qOut, [pTmp], step
320        VST1    qOut, [pDst], step
321        VST1    qOut, [pTmp], step
322        VST1    qOut, [pDst], step
323        VST1    qOut, [pTmp], step
324        VST1    qOut, [pDst], step
325        VST1    qOut, [pTmp], step
326        VST1    qOut, [pDst], step
327        VST1    qOut, [pTmp], step
328        VST1    qOut, [pDst], step
329        VST1    qOut, [pTmp], step
330        MOV     return, #OMX_Sts_NoErr
331        M_EXIT
332
;// Plane mode. With above[] = the 16 bytes at pSrcAbove, left[] = the 16
;// left-column bytes and corner = the byte at pSrcAboveLeft, the code forms
;//   H = sum of weight * (above[8+i] - above[6-i]), weights 1..8, using
;//       corner in place of above[-1]; V is the same over left[]
;//   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
;//   a = 16 * (above[15] + left[15])
;// and writes, for row y and column x, the value
;//   (a + b*(x - 7) + c*(y - 7) + 16) >> 5 saturated to 0..255.
;// All three neighbours are read unconditionally; availability is not
;// consulted in this handler.
333OMX_VC_16X16_PLANE
334        LDR     pMultTable, =armVCM4P10_MultiplierTable16x16
335        VLD1    qAbove, [pSrcAbove]                         ;// pSrcAbove[x], x = 0..15 (16 bytes)
336        VLD1    dAboveLeft[0],[pSrcAboveLeft]               ;// only lane 0 is loaded; other lanes are never used
337        ADD     pTmp, pSrcLeft, leftStep                    ;// pTmp = left byte of row 1
338        ADD     step, leftStep, leftStep                    ;// step = 2 * leftStep
        ;// Gather the 16 left-neighbour bytes into qLeft, one lane per row
339        VLD1    {qLeft[0]},  [pSrcLeft],step
340        VLD1    {qLeft[1]},  [pTmp],step
341        VLD1    {qLeft[2]},  [pSrcLeft],step
342        VLD1    {qLeft[3]},  [pTmp],step
343        VLD1    {qLeft[4]},  [pSrcLeft],step
344        VLD1    {qLeft[5]},  [pTmp],step
345        VLD1    {qLeft[6]},  [pSrcLeft],step
346        VLD1    {qLeft[7]},  [pTmp],step
347        VLD1    {qLeft[8]},  [pSrcLeft],step
348        VLD1    {qLeft[9]},  [pTmp],step
349        VLD1    {qLeft[10]}, [pSrcLeft],step
350        VLD1    {qLeft[11]}, [pTmp],step
351        VLD1    {qLeft[12]}, [pSrcLeft],step
352        VLD1    {qLeft[13]}, [pTmp],step
353        VLD1    {qLeft[14]}, [pSrcLeft],step
354        VLD1    {qLeft[15]}, [pTmp]
355
        ;// Build the eight difference terms for H. Lane i (i = 0..6) of
        ;// qAboveDiff becomes above[14-i] - above[i]; lane 7 is a throwaway.
356        VREV64  dRevAbove, dAbove1                          ;// lanes 0..7 = pSrcAbove[15:14:13:12:11:10:9:8]
357        VSUBL   qAbove15minus0, dRevAbove, dAboveLeft       ;// qAbove15minus0[0] = pSrcAbove[15] - pSrcAboveLeft[0]
358        VSHR    dRevAbove64, dRevAbove64, #8                ;// lanes 0..7 = pSrcAbove[14:13:12:11:10:9:8], then 0
359        VSUBL   qAboveDiff, dRevAbove, dAbove0
360
        ;// Drop the throwaway lane and append the corner term as the new lane 7
361        VSHL    dAboveDiff64, dAboveDiff64, #16
362        VEXT    dDiffAbove1, dAboveDiff1, dAbove15minus0, #1
363
        ;// Same construction for V using the left column
364        VREV64  dRevLeft,dLeft1                             ;// lanes 0..7 = pSrcLeft[15:14:13:12:11:10:9:8]
365        VSUBL   qLeft15minus0,dRevLeft, dAboveLeft          ;// qLeft15minus0[0] = pSrcLeft[15] - pSrcAboveLeft[0]
366        VSHR    dRevLeft64, dRevLeft64, #8                  ;// lanes 0..7 = pSrcLeft[14:13:12:11:10:9:8], then 0
367        VSUBL   qLeftDiff,dRevLeft, dLeft0
368
369        ;// Multiplier = [8|1|2|...|6|7] written from lane 7 down to lane 0 (table row 0)
370        VLD1    qMultiplier, [pMultTable]!
371
372        VSHL    dLeftDiff64, dLeftDiff64, #16
373        VEXT    dDiffLeft1, dLeftDiff1, dLeft15minus0, #1
374
        ;// Weighted products, widened to 32 bits: qH and qV each hold 4 partial sums
375        VMULL   qH,dDiffAbove0, dMultiplier0
376        VMULL   qV,dDiffLeft0,  dMultiplier0
377        VMLAL   qH,dDiffAbove1, dMultiplier1
378        VMLAL   qV,dDiffLeft1,  dMultiplier1
379
        ;// Reduce to the totals: H ends up in D22 and V in D23, one 64-bit lane each
380        VPADD   dHV00,dH1,dH0
381        VPADD   dHV01,dV1,dV0
382        VPADDL  qHV, qHV0
383        VSHL    qHV1,qHV,#2                                 ;// 4*H | 4*V
384        VADD    qHV,qHV,qHV1                                ;// 5*H | 5*V
385
386        ;// HV = [c = ((5*V+32)>>6) | b = ((5*H+32)>>6)]
387        VRSHR   qHV,qHV,#6
388
389        ;// HV1 = [c*7|b*7]
390        VSHL    qHV1,qHV,#3
391        VSUB    qHV1,qHV1,qHV
392
393        ;// Multiplier0 = [0|1|2|...|7] (table row 1)
394        VLD1    qMultiplier0, [pMultTable]!
395        VDUP    qB, dHV0                                    ;// b in all 8 lanes
396        VDUP    qC, dHV1                                    ;// c in all 8 lanes
397
        ;// a = 16 * (above[15] + left[15]). qA reuses Q11, so b and c must
        ;// already have been copied out to qB/qC by the VDUPs above.
398        VADDL   qA,dAbove1,dLeft1
399        VSHL    qA,qA, #4
400        VDUP    qA,dA1[3]
        ;// dBPlusCMult7 reuses D1 (dAbove1), so this must follow the VADDL above
401        VADD    dBPlusCMult7, dHV10, dHV11                  ;// 7*b + 7*c
402
403        ;// Multiplier1 = [8|9|10|...|15] (table row 2)
404        VLD1    qMultiplier1, [pMultTable]
405        ;// Const = a - 7*(b+c)
406        VDUP    qConst, dBPlusCMult7S16[0]
407        VSUB    qConst, qA, qConst
408
409        ;// B0 = [0*b|1*b|2*b|3*b|......|7*b]
410        VMUL    qB0,qB,qMultiplier0
411
412        ;// B1 = [8*b|9*b|10*b|11*b|....|15*b] (overwrites qB, which is not needed afterwards)
413        VMUL    qB1,qB,qMultiplier1
414
        ;// Row 0 sums: Const + b*x for x = 0..7 (qSum0) and x = 8..15 (qSum1)
415        VADD    qSum0, qB0, qConst
416        VADD    qSum1, qB1, qConst
417
418        ;// Runs 16 times, one output row per pass (y counts down from BLK_SIZE)
419LoopPlane
420        ;// (b*x + c*y + Const + 16)>>5, saturated to an unsigned byte
421        VQRSHRUN dOut0, qSum0,#5
422        VQRSHRUN dOut1, qSum1,#5
423        SUBS     y, y, #1
424        VST1     qOut,[pDst],dstStep                        ;// qOut = dOut0:dOut1, one full row
425        VADD     qSum0,qSum0,qC                             ;// advance to the next row: add c
426        VADD     qSum1,qSum1,qC
427        BNE      LoopPlane
428
429        MOV      return, #OMX_Sts_NoErr
430
431        M_END
432
433        ENDIF ;// CortexA8
434
435        END
436;-----------------------------------------------------------------------------------------------
437; omxVCM4P10_PredictIntra_16x16 ends
438;-----------------------------------------------------------------------------------------------
439