omxVCM4P10_InterpolateLuma_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
1;//
2;//
3;// File Name:  omxVCM4P10_InterpolateLuma_s.s
4;// OpenMAX DL: v1.0.2
5;// Revision:   12290
6;// Date:       Wednesday, April 9, 2008
7;//
8;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
9;//
10;//
11;//
12
13;// Function:
14;//     omxVCM4P10_InterpolateLuma
15;//
16;// This function implements omxVCM4P10_InterpolateLuma in v6 assembly.
17;// Performs quarter pel interpolation of inter luma MB.
18;// It's assumed that the frame is already padded when calling this function.
19;// Parameters:
20;// [in]    pSrc        Pointer to the source reference frame buffer
21;// [in]    srcStep     Reference frame step in byte
22;// [in]    dstStep     Destination frame step in byte. Must be multiple of roi.width
23;// [in]    dx          Fractional part of horizontal motion vector
24;//                         component in 1/4 pixel unit; valid in the range [0,3]
25;// [in]    dy          Fractional part of vertical motion vector
26;//                         component in 1/4 pixel unit; valid in the range [0,3]
27;// [in]    roi         Dimension of the interpolation region;the parameters roi.width and roi.height must
28;//                         be equal to either 4, 8, or 16.
29;// [out]   pDst        Pointer to the destination frame buffer.
30;//                   if roi.width==4,  4-byte alignment required
31;//                   if roi.width==8,  8-byte alignment required
32;//                   if roi.width==16, 16-byte alignment required
33;//
34;// Return Value:
35;// If the function runs without error, it returns OMX_Sts_NoErr.
36;// It is assumed that the following conditions are satisfied before calling this function:
37;//     pSrc and pDst are not NULL.
38;//     srcStep and dstStep are >= roi.width.
39;//     dx and dy are in the range [0-3].
40;//     roi.width and roi.height are each one of {4, 8, 16}.
41;//     If roi.width is equal to 4, pDst is 4 byte aligned.
42;//     If roi.width is equal to 8, pDst is 8 byte aligned.
43;//     If roi.width is equal to 16, pDst is 16 byte aligned.
44;//     srcStep and dstStep are multiples of 8.
45;//
46;//
47
48
49        INCLUDE omxtypes_s.h
50        INCLUDE armCOMM_s.h
51
52        M_VARIANTS CortexA8
53
54        EXPORT omxVCM4P10_InterpolateLuma
55
56
57    IF CortexA8
58        IMPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
59        IMPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
60        IMPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
61        IMPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
62    ENDIF
63
64
65
66;// Declare input registers
67pSrc            RN 0
68srcStep         RN 1
69pDst            RN 2
70dstStep         RN 3
71iHeight         RN 4
72iWidth          RN 5
73
74;// Declare other intermediate registers
75idx             RN 6
76idy             RN 7
77index           RN 6
78Temp            RN 12
79pArgs           RN 11
80
81
82    IF CortexA8
83
84        ;//
85        ;// Interpolation of luma is implemented by processing block of pixels, size 4x4 at a time.
86        ;//
87        M_ALLOC4    ppArgs, 16
88
89        ;// Function header
90        M_START omxVCM4P10_InterpolateLuma, r11, d15
91
92pSrcBK          RN 8
93
94;// Declare Neon registers
95dCoeff5         DN 30.S16
96dCoeff20        DN 31.S16
97
98;// Registers used for implementing Horizontal interpolation
99dSrc0c          DN 14.U8
100dSrc1c          DN 16.U8
101dSrc2c          DN 18.U8
102dSrc3c          DN 20.U8
103dSrc0d          DN 15.U8
104dSrc1d          DN 17.U8
105dSrc2d          DN 19.U8
106dSrc3d          DN 21.U8
107dAccH0          DN 22.U8
108dAccH1          DN 24.U8
109dAccH2          DN 26.U8
110dAccH3          DN 28.U8
111dResultH0       DN 22.U32
112dResultH1       DN 24.U32
113dResultH2       DN 26.U32
114dResultH3       DN 28.U32
115
116;// Registers used for implementing Vertical interpolation
117dSrc0           DN 9.U8
118dSrc1           DN 10.U8
119dSrc2           DN 11.U8
120dSrc3           DN 12.U8
121dSrc4           DN 13.U8
122dAccV0          DN 0.U8
123dAccV1          DN 2.U8
124dAccV2          DN 4.U8
125dAccV3          DN 6.U8
126dResultV0       DN 0.U32
127dResultV1       DN 2.U32
128dResultV2       DN 4.U32
129dResultV3       DN 6.U32
130
131;// Registers used for implementing Diagonal interpolation
132dTAcc0          DN 0.U8
133dTAcc1          DN 2.U8
134dTAcc2          DN 4.U8
135dTAcc3          DN 6.U8
136dTRes0          DN 0.32
137dTRes1          DN 2.32
138dTRes2          DN 4.32
139dTRes3          DN 6.32
140dTResult0       DN 14.U8
141dTResult1       DN 16.U8
142dTResult2       DN 18.U8
143dTResult3       DN 20.U8
144dTempP0         DN 18.S16
145dTempP1         DN 19.S16
146dTempQ0         DN 20.S16
147dTempQ1         DN 21.S16
148dTempR0         DN 22.S16
149dTempR1         DN 23.S16
150dTempS0         DN 24.S16
151dTempS1         DN 25.S16
152qTempP01        QN 9.S16
153qTempQ01        QN 10.S16
154qTempR01        QN 11.S16
155qTempS01        QN 12.S16
156
157;// Intermediate values for averaging
158qRes2           QN 7.S16
159qRes3           QN 8.S16
160qRes4           QN 9.S16
161qRes5           QN 10.S16
162qRes6           QN 11.S16
163
164;// For implementing copy
165dDst0            DN 9.32
166dDst1            DN 10.32
167dDst2            DN 11.32
168dDst3            DN 12.32
169
170        ;// Define stack arguments
171        M_ARG       ptridx, 4
172        M_ARG       ptridy, 4
173        M_ARG       ptrWidth, 4
174        M_ARG       ptrHeight, 4
175
176        ;// Load structure elements of roi
177        M_LDR       idx, ptridx
178        M_LDR       idy, ptridy
179        M_LDR       iWidth, ptrWidth
180        M_LDR       iHeight, ptrHeight
181
182        ADD         index, idx, idy, LSL #2                 ;//  [index] = [idy][idx]
183        M_ADR       pArgs, ppArgs
184
185        ;// Move coefficients Neon registers
186        VMOV        dCoeff20, #20
187        VMOV        dCoeff5, #5
188
189Block4x4WidthLoop
190Block4x4HeightLoop
191
192        STM         pArgs, {pSrc,srcStep,pDst,dstStep}
193
194        ;// switch table using motion vector as index
195        ADD         pc, pc, index, LSL #2
196        B           Case_f
197        B           Case_0
198        B           Case_1
199        B           Case_2
200        B           Case_3
201        B           Case_4
202        B           Case_5
203        B           Case_6
204        B           Case_7
205        B           Case_8
206        B           Case_9
207        B           Case_a
208        B           Case_b
209        B           Case_c
210        B           Case_d
211        B           Case_e
212        B           Case_f
213
214Case_0
215        ;// Case G
216        M_PRINTF "Case 0 \n"
217
218        ;// Loads a 4x4 block of .8 and stores as .32
219        ADD         Temp, pSrc, srcStep, LSL #1
220        VLD1        dSrc0, [pSrc], srcStep
221        VLD1        dSrc2, [Temp], srcStep
222        VLD1        dSrc1, [pSrc]
223        VLD1        dSrc3, [Temp]
224
225        ADD         Temp, pDst, dstStep, LSL #1
226        VST1        dDst0[0], [pDst], dstStep
227        VST1        dDst2[0], [Temp], dstStep
228        VST1        dDst1[0], [pDst]
229        VST1        dDst3[0], [Temp]
230        M_ADR       pArgs, ppArgs
231        B           Block4x4LoopEnd
232Case_1
233        ;// Case a
234        M_PRINTF "Case 1 \n"
235
236        SUB         pSrc, pSrc, #2
237        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
238        VRHADD      dAccH0, dAccH0, dSrc0c
239        VRHADD      dAccH2, dAccH2, dSrc2c
240        VRHADD      dAccH1, dAccH1, dSrc1c
241        VRHADD      dAccH3, dAccH3, dSrc3c
242        ADD         Temp, pDst, dstStep, LSL #1
243        VST1        dResultH0[0], [pDst], dstStep
244        VST1        dResultH2[0], [Temp], dstStep
245        VST1        dResultH1[0], [pDst]
246        VST1        dResultH3[0], [Temp]
247        M_ADR       pArgs, ppArgs
248        B           Block4x4LoopEnd
249Case_2
250        ;// Case b
251        M_PRINTF "Case 2 \n"
252
253        SUB         pSrc, pSrc, #2
254        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
255        ADD         Temp, pDst, dstStep, LSL #1
256        VST1        dResultH0[0], [pDst], dstStep
257        VST1        dResultH2[0], [Temp], dstStep
258        VST1        dResultH1[0], [pDst]
259        VST1        dResultH3[0], [Temp]
260        M_ADR       pArgs, ppArgs
261        B           Block4x4LoopEnd
262Case_3
263        ;// Case c
264        M_PRINTF "Case 3 \n"
265
266        SUB         pSrc, pSrc, #2
267        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
268        VRHADD      dAccH0, dAccH0, dSrc0d
269        VRHADD      dAccH2, dAccH2, dSrc2d
270        VRHADD      dAccH1, dAccH1, dSrc1d
271        VRHADD      dAccH3, dAccH3, dSrc3d
272        ADD         Temp, pDst, dstStep, LSL #1
273        VST1        dResultH0[0], [pDst], dstStep
274        VST1        dResultH2[0], [Temp], dstStep
275        VST1        dResultH1[0], [pDst]
276        VST1        dResultH3[0], [Temp]
277        M_ADR       pArgs, ppArgs
278        B           Block4x4LoopEnd
279Case_4
280        ;// Case d
281        M_PRINTF "Case 4 \n"
282
283        SUB         pSrc, pSrc, srcStep, LSL #1
284        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
285        VRHADD      dAccV0, dAccV0, dSrc0
286        VRHADD      dAccV2, dAccV2, dSrc2
287        VRHADD      dAccV1, dAccV1, dSrc1
288        VRHADD      dAccV3, dAccV3, dSrc3
289        ADD         Temp, pDst, dstStep, LSL #1
290        VST1        dResultV0[0], [pDst], dstStep
291        VST1        dResultV2[0], [Temp], dstStep
292        VST1        dResultV1[0], [pDst]
293        VST1        dResultV3[0], [Temp]
294        M_ADR       pArgs, ppArgs
295        B           Block4x4LoopEnd
296Case_5
297        ;// Case e
298        M_PRINTF "Case 5 \n"
299
300        MOV         pSrcBK, pSrc
301        SUB         pSrc, pSrc, srcStep, LSL #1
302        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
303        SUB         pSrc, pSrcBK, #2
304        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
305        VRHADD      dAccH0, dAccH0, dAccV0
306        VRHADD      dAccH2, dAccH2, dAccV2
307        VRHADD      dAccH1, dAccH1, dAccV1
308        VRHADD      dAccH3, dAccH3, dAccV3
309        ADD         Temp, pDst, dstStep, LSL #1
310        VST1        dResultH0[0], [pDst], dstStep
311        VST1        dResultH2[0], [Temp], dstStep
312        VST1        dResultH1[0], [pDst]
313        VST1        dResultH3[0], [Temp]
314
315        M_ADR       pArgs, ppArgs
316        B       Block4x4LoopEnd
317Case_6
318        ;// Case f
319        M_PRINTF "Case 6 \n"
320
321        SUB         pSrc, pSrc, srcStep, LSL #1
322        SUB         pSrc, pSrc, #2
323        BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
324        VQRSHRUN    dTResult0, qRes2, #5
325        VQRSHRUN    dTResult1, qRes3, #5
326        VQRSHRUN    dTResult2, qRes4, #5
327        VQRSHRUN    dTResult3, qRes5, #5
328        VRHADD      dTAcc0, dTAcc0, dTResult0
329        VRHADD      dTAcc2, dTAcc2, dTResult2
330        VRHADD      dTAcc1, dTAcc1, dTResult1
331        VRHADD      dTAcc3, dTAcc3, dTResult3
332        ADD         Temp, pDst, dstStep, LSL #1
333        VST1        dTRes0[0], [pDst], dstStep
334        VST1        dTRes2[0], [Temp], dstStep
335        VST1        dTRes1[0], [pDst]
336        VST1        dTRes3[0], [Temp]
337
338        M_ADR       pArgs, ppArgs
339        B       Block4x4LoopEnd
340Case_7
341        ;// Case g
342        M_PRINTF "Case 7 \n"
343        MOV         pSrcBK, pSrc
344        ADD         pSrc, pSrc, #1
345        SUB         pSrc, pSrc, srcStep, LSL #1
346        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
347        SUB         pSrc, pSrcBK, #2
348        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
349        VRHADD      dAccH0, dAccH0, dAccV0
350        VRHADD      dAccH2, dAccH2, dAccV2
351        VRHADD      dAccH1, dAccH1, dAccV1
352        VRHADD      dAccH3, dAccH3, dAccV3
353        ADD         Temp, pDst, dstStep, LSL #1
354        VST1        dResultH0[0], [pDst], dstStep
355        VST1        dResultH2[0], [Temp], dstStep
356        VST1        dResultH1[0], [pDst]
357        VST1        dResultH3[0], [Temp]
358
359        M_ADR       pArgs, ppArgs
360        B       Block4x4LoopEnd
361Case_8
362        ;// Case h
363        M_PRINTF "Case 8 \n"
364
365        SUB         pSrc, pSrc, srcStep, LSL #1
366        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
367        ADD         Temp, pDst, dstStep, LSL #1
368        VST1        dResultV0[0], [pDst], dstStep
369        VST1        dResultV2[0], [Temp], dstStep
370        VST1        dResultV1[0], [pDst]
371        VST1        dResultV3[0], [Temp]
372        M_ADR       pArgs, ppArgs
373        B           Block4x4LoopEnd
374Case_9
375        ;// Case i
376        M_PRINTF "Case 9 \n"
377        SUB         pSrc, pSrc, srcStep, LSL #1
378        SUB         pSrc, pSrc, #2
379        BL          armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
380        VEXT        dTempP0, dTempP0, dTempP1, #2
381        VEXT        dTempQ0, dTempQ0, dTempQ1, #2
382        VEXT        dTempR0, dTempR0, dTempR1, #2
383        VEXT        dTempS0, dTempS0, dTempS1, #2
384
385        VQRSHRUN    dTResult0, qTempP01, #5
386        VQRSHRUN    dTResult1, qTempQ01, #5
387        VQRSHRUN    dTResult2, qTempR01, #5
388        VQRSHRUN    dTResult3, qTempS01, #5
389
390        VRHADD      dTAcc0, dTAcc0, dTResult0
391        VRHADD      dTAcc2, dTAcc2, dTResult2
392        VRHADD      dTAcc1, dTAcc1, dTResult1
393        VRHADD      dTAcc3, dTAcc3, dTResult3
394        ADD         Temp, pDst, dstStep, LSL #1
395        VST1        dTRes0[0], [pDst], dstStep
396        VST1        dTRes2[0], [Temp], dstStep
397        VST1        dTRes1[0], [pDst]
398        VST1        dTRes3[0], [Temp]
399        M_ADR       pArgs, ppArgs
400        B       Block4x4LoopEnd
401Case_a
402        ;// Case j
403        M_PRINTF "Case a \n"
404
405        SUB         pSrc, pSrc, srcStep, LSL #1
406        SUB         pSrc, pSrc, #2
407        BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
408        ADD         Temp, pDst, dstStep, LSL #1
409        VST1        dTRes0[0], [pDst], dstStep
410        VST1        dTRes2[0], [Temp], dstStep
411        VST1        dTRes1[0], [pDst]
412        VST1        dTRes3[0], [Temp]
413        M_ADR       pArgs, ppArgs
414        B       Block4x4LoopEnd
415Case_b
416        ;// Case k
417        M_PRINTF "Case b \n"
418        SUB         pSrc, pSrc, srcStep, LSL #1
419        SUB         pSrc, pSrc, #2
420        BL          armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
421        VEXT        dTempP0, dTempP0, dTempP1, #3
422        VEXT        dTempQ0, dTempQ0, dTempQ1, #3
423        VEXT        dTempR0, dTempR0, dTempR1, #3
424        VEXT        dTempS0, dTempS0, dTempS1, #3
425
426        VQRSHRUN    dTResult0, qTempP01, #5
427        VQRSHRUN    dTResult1, qTempQ01, #5
428        VQRSHRUN    dTResult2, qTempR01, #5
429        VQRSHRUN    dTResult3, qTempS01, #5
430
431        VRHADD      dTAcc0, dTAcc0, dTResult0
432        VRHADD      dTAcc2, dTAcc2, dTResult2
433        VRHADD      dTAcc1, dTAcc1, dTResult1
434        VRHADD      dTAcc3, dTAcc3, dTResult3
435        ADD         Temp, pDst, dstStep, LSL #1
436        VST1        dTRes0[0], [pDst], dstStep
437        VST1        dTRes2[0], [Temp], dstStep
438        VST1        dTRes1[0], [pDst]
439        VST1        dTRes3[0], [Temp]
440        M_ADR       pArgs, ppArgs
441        B       Block4x4LoopEnd
442Case_c
443        ;// Case n
444        M_PRINTF "Case c \n"
445
446        SUB         pSrc, pSrc, srcStep, LSL #1
447        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
448        VRHADD      dAccV0, dAccV0, dSrc1
449        VRHADD      dAccV2, dAccV2, dSrc3
450        VRHADD      dAccV1, dAccV1, dSrc2
451        VRHADD      dAccV3, dAccV3, dSrc4
452        ADD         Temp, pDst, dstStep, LSL #1
453        VST1        dResultV0[0], [pDst], dstStep
454        VST1        dResultV2[0], [Temp], dstStep
455        VST1        dResultV1[0], [pDst]
456        VST1        dResultV3[0], [Temp]
457        M_ADR       pArgs, ppArgs
458        B           Block4x4LoopEnd
459Case_d
460        ;// Case p
461        M_PRINTF "Case d \n"
462
463        MOV         pSrcBK, pSrc
464        SUB         pSrc, pSrc, srcStep, LSL #1
465        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
466        ADD         pSrc, pSrcBK, srcStep
467        SUB         pSrc, pSrc, #2
468        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
469        VRHADD      dAccH0, dAccH0, dAccV0
470        VRHADD      dAccH2, dAccH2, dAccV2
471        VRHADD      dAccH1, dAccH1, dAccV1
472        VRHADD      dAccH3, dAccH3, dAccV3
473        ADD         Temp, pDst, dstStep, LSL #1
474        VST1        dResultH0[0], [pDst], dstStep
475        VST1        dResultH2[0], [Temp], dstStep
476        VST1        dResultH1[0], [pDst]
477        VST1        dResultH3[0], [Temp]
478        M_ADR       pArgs, ppArgs
479        B       Block4x4LoopEnd
480Case_e
481        ;// Case q
482        M_PRINTF "Case e \n"
483
484        SUB         pSrc, pSrc, srcStep, LSL #1
485        SUB         pSrc, pSrc, #2
486        BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
487        VQRSHRUN    dTResult0, qRes3, #5
488        VQRSHRUN    dTResult1, qRes4, #5
489        VQRSHRUN    dTResult2, qRes5, #5
490        VQRSHRUN    dTResult3, qRes6, #5
491
492        VRHADD      dTAcc0, dTAcc0, dTResult0
493        VRHADD      dTAcc2, dTAcc2, dTResult2
494        VRHADD      dTAcc1, dTAcc1, dTResult1
495        VRHADD      dTAcc3, dTAcc3, dTResult3
496        ADD         Temp, pDst, dstStep, LSL #1
497        VST1        dTRes0[0], [pDst], dstStep
498        VST1        dTRes2[0], [Temp], dstStep
499        VST1        dTRes1[0], [pDst]
500        VST1        dTRes3[0], [Temp]
501        M_ADR       pArgs, ppArgs
502        B       Block4x4LoopEnd
503Case_f
504        ;// Case r
505        M_PRINTF "Case f \n"
506        MOV         pSrcBK, pSrc
507        ADD         pSrc, pSrc, #1
508        SUB         pSrc, pSrc, srcStep, LSL #1
509        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
510        ADD         pSrc, pSrcBK, srcStep
511        SUB         pSrc, pSrc, #2
512        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
513        VRHADD      dAccH0, dAccH0, dAccV0
514        VRHADD      dAccH2, dAccH2, dAccV2
515        VRHADD      dAccH1, dAccH1, dAccV1
516        VRHADD      dAccH3, dAccH3, dAccV3
517        ADD         Temp, pDst, dstStep, LSL #1
518        VST1        dResultH0[0], [pDst], dstStep
519        VST1        dResultH2[0], [Temp], dstStep
520        VST1        dResultH1[0], [pDst]
521        VST1        dResultH3[0], [Temp]
522        M_ADR       pArgs, ppArgs
523
524
525Block4x4LoopEnd
526
527        ;// Width Loop
528        ;//M_ADR       pArgs, ppArgs
529        LDM         pArgs, {pSrc,srcStep,pDst,dstStep}  ;// Load arguments
530        SUBS        iWidth, iWidth, #4
531        ADD         pSrc, pSrc, #4
532        ADD         pDst, pDst, #4
533        BGT         Block4x4WidthLoop
534
535        ;// Height Loop
536        SUBS        iHeight, iHeight, #4
537        M_LDR       iWidth, ptrWidth
538        M_ADR       pArgs, ppArgs
539        ADD         pSrc, pSrc, srcStep, LSL #2
540        ADD         pDst, pDst, dstStep, LSL #2
541        SUB         pSrc, pSrc, iWidth
542        SUB         pDst, pDst, iWidth
543        BGT         Block4x4HeightLoop
544
545EndOfInterpolation
546        MOV         r0, #0
547        M_END
548
549    ENDIF
550        ;// End of CortexA8
551
552    END
553
554