omxVCM4P2_MCReconBlock_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  omxVCM4P2_MCReconBlock_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   12290
21;// Date:       Wednesday, April 9, 2008
22;//
23;//
24;//
25;//
26;// Description:
27;//
28;//
29
30;// Include standard headers
31    INCLUDE omxtypes_s.h
32    INCLUDE armCOMM_s.h
33
34;// Import symbols required from other files
35
36    M_VARIANTS CortexA8
37
38;// ***************************************************************************
39;// ARM1136JS implementation
40;// ***************************************************************************
41
42;// ***************************************************************************
43;// CortexA8 implementation
44;// ***************************************************************************
45    IF  CortexA8
46;// ***************************************************************************
47;// MACRO DEFINITIONS
48;// ***************************************************************************
49    ;// Description:
50    ;// Does interpolation for the case of "IntegerPixel" predictType. Both
51    ;// rounding cases are handled. Just copies a block from pSrc to pDst
52    ;//
53    ;// Syntax:
54    ;// M_MCRECONBLOCK_IntegerPixel
55    ;//
56    ;// Inputs: None
57    ;// Outputs: None
58
59    MACRO
60    M_MCRECONBLOCK_IntegerPixel
    ;// Copies an 8-row block, 8 bytes per row, from pSrc to pDst.
    ;// Both rounding labels mark the same code: a plain copy does no
    ;// averaging, so the rounding flag makes no difference here.
    ;// pSrc and pDst are post-incremented past the block (8 rows each).
61CaseIntegerPixel_Rnd0
62CaseIntegerPixel_Rnd1
63
    ;// Load all eight rows first (one D register = 8 pixels per row);
    ;// pSrc advances by srcStep after every load.
64    VLD1        dRow0, [pSrc], srcStep
65    VLD1        dRow1, [pSrc], srcStep
66    VLD1        dRow2, [pSrc], srcStep
67    VLD1        dRow3, [pSrc], srcStep
68    VLD1        dRow4, [pSrc], srcStep
69    VLD1        dRow5, [pSrc], srcStep
70    VLD1        dRow6, [pSrc], srcStep
71    VLD1        dRow7, [pSrc], srcStep
72
    ;// Store the rows unchanged; @64 is a 64-bit alignment qualifier on the
    ;// pDst address, and pDst advances by dstStep after every store.
73    VST1        dRow0, [pDst@64], dstStep
74    VST1        dRow1, [pDst@64], dstStep
75    VST1        dRow2, [pDst@64], dstStep
76    VST1        dRow3, [pDst@64], dstStep
77    VST1        dRow4, [pDst@64], dstStep
78    VST1        dRow5, [pDst@64], dstStep
79    VST1        dRow6, [pDst@64], dstStep
80    VST1        dRow7, [pDst@64], dstStep
81
82    B           SwitchPredictTypeEnd    ;// jump over the other switch cases
83    MEND
84;// ***************************************************************************
85    ;// Description:
86    ;// Does interpolation for the case of "HalfPixelX" predictType. The two
87    ;// rounding cases are handled by the parameter "$rndVal". Averages between
88    ;// a pixel and pixel right to it, rounding it based on $rndVal. The
89    ;// rounding is implemented by using opCode switching between "VRHADD" and
90    ;// "VHADD" instructions.
91    ;//
92    ;// Syntax:
93    ;// M_MCRECONBLOCK_HalfPixelX $rndVal
94    ;//
95    ;// Inputs:
96    ;//     $rndVal: 0 for rounding and 1 for no rounding
97    ;// Outputs: None
98
99    MACRO
100    M_MCRECONBLOCK_HalfPixelX $rndVal
    ;// Horizontal half-pixel interpolation of an 8x8 block: every output
    ;// pixel is the halved sum of a source pixel and its right neighbour.
    ;// $rndVal selects the instruction at assembly time, so each expansion
    ;// emits one rounding variant under the label CaseHalfPixelX_Rnd$rndVal.
101
102    LCLS M_VHADDR                       ;// macro-local string: mnemonic used for the averaging
103    IF $rndVal = 0
104M_VHADDR SETS "VRHADD"                  ;// rounding halving add
105    ELSE
106M_VHADDR SETS "VHADD"                   ;// truncating halving add
107    ENDIF
108
109CaseHalfPixelX_Rnd$rndVal
110
    ;// For each row: load 16 bytes into the register pair {dRowN, dRowNShft},
    ;// then VEXT #1 extracts bytes 1..8 of that pair into dRowNShft, i.e. the
    ;// row shifted by one pixel. Bytes 9..15 of each load are never used, so
    ;// every row reads 7 bytes more from the source than the result needs.
111    VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep
112    VEXT        dRow0Shft, dRow0, dRow0Shft, #1
113    VLD1        {dRow1, dRow1Shft}, [pSrc], srcStep
114    VEXT        dRow1Shft, dRow1, dRow1Shft, #1
115    VLD1        {dRow2, dRow2Shft}, [pSrc], srcStep
116    VEXT        dRow2Shft, dRow2, dRow2Shft, #1
117    VLD1        {dRow3, dRow3Shft}, [pSrc], srcStep
118    VEXT        dRow3Shft, dRow3, dRow3Shft, #1
119    VLD1        {dRow4, dRow4Shft}, [pSrc], srcStep
120    VEXT        dRow4Shft, dRow4, dRow4Shft, #1
121    VLD1        {dRow5, dRow5Shft}, [pSrc], srcStep
122    VEXT        dRow5Shft, dRow5, dRow5Shft, #1
123    VLD1        {dRow6, dRow6Shft}, [pSrc], srcStep
124    VEXT        dRow6Shft, dRow6, dRow6Shft, #1
125    VLD1        {dRow7, dRow7Shft}, [pSrc], srcStep
126    VEXT        dRow7Shft, dRow7, dRow7Shft, #1
    ;// Average each row with its shifted copy. The store of row N is
    ;// interleaved with the averaging of row N+1, so a store never directly
    ;// follows the instruction that produced its data.
127    $M_VHADDR   dRow0, dRow0, dRow0Shft
128    $M_VHADDR   dRow1, dRow1, dRow1Shft
129    VST1        dRow0, [pDst@64], dstStep
130    $M_VHADDR   dRow2, dRow2, dRow2Shft
131    VST1        dRow1, [pDst@64], dstStep
132    $M_VHADDR   dRow3, dRow3, dRow3Shft
133    VST1        dRow2, [pDst@64], dstStep
134    $M_VHADDR   dRow4, dRow4, dRow4Shft
135    VST1        dRow3, [pDst@64], dstStep
136    $M_VHADDR   dRow5, dRow5, dRow5Shft
137    VST1        dRow4, [pDst@64], dstStep
138    $M_VHADDR   dRow6, dRow6, dRow6Shft
139    VST1        dRow5, [pDst@64], dstStep
140    $M_VHADDR   dRow7, dRow7, dRow7Shft
141    VST1        dRow6, [pDst@64], dstStep
142    VST1        dRow7, [pDst@64], dstStep
143
144    B           SwitchPredictTypeEnd    ;// jump over the other switch cases
145    MEND
146;// ***************************************************************************
147    ;// Description:
148    ;// Does interpolation for the case of "HalfPixelY" predictType. The two
149    ;// rounding cases are handled by the parameter "$rndVal". Averages between
150    ;// a pixel and pixel below it, rounding it based on $rndVal. The
151    ;// rounding is implemented by using opCode switching between "VRHADD" and
152    ;// "VHADD" instructions.
153    ;//
154    ;// Syntax:
155    ;// M_MCRECONBLOCK_HalfPixelY $rndVal
156    ;//
157    ;// Inputs:
158    ;//     $rndVal: 0 for rounding and 1 for no rounding
159    ;// Outputs: None
160
161    MACRO
162    M_MCRECONBLOCK_HalfPixelY $rndVal
    ;// Vertical half-pixel interpolation of an 8x8 block: output row N is
    ;// the halved sum of source rows N and N+1, so nine source rows are read.
    ;// $rndVal selects the instruction at assembly time, so each expansion
    ;// emits one rounding variant under the label CaseHalfPixelY_Rnd$rndVal.
163
164    LCLS M_VHADDR                       ;// macro-local string: mnemonic used for the averaging
165    IF $rndVal = 0
166M_VHADDR SETS "VRHADD"                  ;// rounding halving add
167    ELSE
168M_VHADDR SETS "VHADD"                   ;// truncating halving add
169    ENDIF
170
171CaseHalfPixelY_Rnd$rndVal
    ;// Load source rows 0..7 (8 bytes each); pSrc advances by srcStep per load.
172    VLD1        dRow0, [pSrc], srcStep
173    VLD1        dRow1, [pSrc], srcStep
174    VLD1        dRow2, [pSrc], srcStep
175    VLD1        dRow3, [pSrc], srcStep
176    VLD1        dRow4, [pSrc], srcStep
177    VLD1        dRow5, [pSrc], srcStep
178    VLD1        dRow6, [pSrc], srcStep
179    VLD1        dRow7, [pSrc], srcStep
    ;// Each averaging overwrites dRowN in place. That is safe because row N
    ;// is only needed again by output row N-1, which was computed earlier.
180    $M_VHADDR   dRow0, dRow0, dRow1
181    VLD1        dRow8, [pSrc], srcStep  ;// ninth source row, needed only for output row 7
182    $M_VHADDR   dRow1, dRow1, dRow2
183    VST1        dRow0, [pDst@64], dstStep
184    $M_VHADDR   dRow2, dRow2, dRow3
185    VST1        dRow1, [pDst@64], dstStep
186    $M_VHADDR   dRow3, dRow3, dRow4
187    VST1        dRow2, [pDst@64], dstStep
188    $M_VHADDR   dRow4, dRow4, dRow5
189    VST1        dRow3, [pDst@64], dstStep
190    $M_VHADDR   dRow5, dRow5, dRow6
191    VST1        dRow4, [pDst@64], dstStep
192    $M_VHADDR   dRow6, dRow6, dRow7
193    VST1        dRow5, [pDst@64], dstStep
194    $M_VHADDR   dRow7, dRow7, dRow8
195    VST1        dRow6, [pDst@64], dstStep
196    VST1        dRow7, [pDst@64], dstStep
197
198    B           SwitchPredictTypeEnd    ;// jump over the other switch cases
199    MEND
200;// ***************************************************************************
201    ;// Description:
202    ;// Does interpolation for the case of "HalfPixelXY" predictType. Both
203    ;// rounding cases are handled.
204    ;// Typical computation for a row goes like this
205    ;//     1. VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep ;// Load the row and next 8 bytes
206    ;//     2. VEXT        dRow0Shft, dRow0, dRow0Shft, #1     ;// Generate the shifted row
207    ;//     3. VADDL       qSum0, dRow0, dRow0Shft             ;// Generate the sum of row and shifted row
208    ;//     5. VADD        qSum0, qSum0, qSum1                 ;// Add to the sum of next row (odd row sum has rounding value added to it)
209    ;//     6. VSHRN       dRow0, qSum0, #2                    ;// Divide by 4
210    ;//     7. VST1        dRow0, [pDst@64], dstStep           ;// Store
211    ;// Odd rows undergo following computation after step 3
212    ;//     4. VADD        qSum1, qSum1, qRound
213    ;// This avoids adding the rounding value to each final sum (overall saves 4
214    ;// instructions).
215    ;// There is reuse of registers for qSum6, qSum7 & qSum8. Overall scheduling takes
216    ;// care of this and also minimizes stalls. Rounding value was modified in
217    ;// ARM register rndVal (originally used for rounding flag) before the switch.
218    ;// It is then populated into all lanes in this macro. No branching out to
219    ;// label "SwitchPredictTypeEnd" is required in the end of the macro as these
220    ;// are the last of switch cases.
221    ;//
222    ;// Syntax:
223    ;// M_MCRECONBLOCK_HalfPixelXY
224    ;//
225    ;// Inputs: None
226    ;// Outputs: None
227
228    MACRO
229    M_MCRECONBLOCK_HalfPixelXY
    ;// Diagonal half-pixel interpolation of an 8x8 block: every output pixel
    ;// is (a + b + c + d + round) >> 2 over a 2x2 source neighbourhood.
    ;// Per source row a 16-bit horizontal pair sum qSumN is formed; output
    ;// row N is (qSumN + qSum(N+1)) >> 2. The rounding constant is added only
    ;// to the odd-numbered row sums, and each output row contains exactly one
    ;// odd row sum, so every result receives the constant exactly once.
    ;// rndVal must already hold the rounding constant (not the flag); the
    ;// function body prepares it with RSB before the switch.
230
231CaseHalfPixelXY_Rnd0
232CaseHalfPixelXY_Rnd1
233    VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep    ;// 16 bytes per row; only bytes 0..8 are used
234    VDUP        qRound, rndVal                         ;// copy the rounding constant into every 16-bit lane
235    VLD1        {dRow1, dRow1Shft}, [pSrc], srcStep
236    VEXT        dRow0Shft, dRow0, dRow0Shft, #1        ;// row 0 shifted by one pixel (bytes 1..8)
237    VLD1        {dRow2, dRow2Shft}, [pSrc], srcStep
238    VEXT        dRow1Shft, dRow1, dRow1Shft, #1
239    VLD1        {dRow3, dRow3Shft}, [pSrc], srcStep
240    VEXT        dRow2Shft, dRow2, dRow2Shft, #1
241    VLD1        {dRow4, dRow4Shft}, [pSrc], srcStep
242    VADDL       qSum0, dRow0, dRow0Shft                ;// widening add: horizontal pair sums of row 0
243    VLD1        {dRow5, dRow5Shft}, [pSrc], srcStep
244    VADDL       qSum1, dRow1, dRow1Shft
245    VLD1        {dRow6, dRow6Shft}, [pSrc], srcStep
246    VEXT        dRow3Shft, dRow3, dRow3Shft, #1
247    VLD1        {dRow7, dRow7Shft}, [pSrc], srcStep
248    VEXT        dRow4Shft, dRow4, dRow4Shft, #1
249    VLD1        {dRow8, dRow8Shft}, [pSrc], srcStep    ;// ninth row, needed for output row 7
250    VADD        qSum1, qSum1, qRound                   ;// odd row sum carries the rounding constant
251    VADDL       qSum2, dRow2, dRow2Shft
252    VEXT        dRow5Shft, dRow5, dRow5Shft, #1
253    VADD        qSum0, qSum0, qSum1                    ;// output row 0 = rows 0 + 1 (+ round)
254    VADDL       qSum3, dRow3, dRow3Shft
255    VEXT        dRow6Shft, dRow6, dRow6Shft, #1
256    VADD        qSum1, qSum1, qSum2                    ;// output row 1 = rows 1 + 2 (+ round)
257    VSHRN       dRow0, qSum0, #2                       ;// divide by 4 and narrow back to 8 bits
258    VADDL       qSum4, dRow4, dRow4Shft
259    VSHRN       dRow1, qSum1, #2
260    VADD        qSum3, qSum3, qRound                   ;// odd row sum carries the rounding constant
261    VADDL       qSum5, dRow5, dRow5Shft
262    VST1        dRow0, [pDst@64], dstStep
263    VEXT        dRow7Shft, dRow7, dRow7Shft, #1
264    VST1        dRow1, [pDst@64], dstStep
265    VEXT        dRow8Shft, dRow8, dRow8Shft, #1
266    VADD        qSum5, qSum5, qRound                   ;// odd row sum carries the rounding constant
267    VADD        qSum2, qSum2, qSum3                    ;// output row 2 = rows 2 + 3 (+ round)
268    VADD        qSum3, qSum3, qSum4                    ;// output row 3 = rows 3 + 4 (+ round)
269    VADD        qSum4, qSum4, qSum5                    ;// output row 4 = rows 4 + 5 (+ round)
270    VSHRN       dRow2, qSum2, #2
271    VSHRN       dRow3, qSum3, #2
272    VSHRN       dRow4, qSum4, #2
    ;// qSum6/qSum7/qSum8 are Q0/Q1/Q2, which overlay D0..D5 (dRow0..dRow2Shft).
    ;// Each one is written only after the output row held in the overlapping
    ;// D register has been stored, so the order of the next lines matters.
273    VADDL       qSum6, dRow6, dRow6Shft                ;// overwrites D0/D1; output row 0 already stored
274    VADDL       qSum7, dRow7, dRow7Shft                ;// overwrites D2/D3; output row 1 already stored
275    VST1        dRow2, [pDst@64], dstStep              ;// must precede the qSum8 write below (D4)
276    VADDL       qSum8, dRow8, dRow8Shft                ;// overwrites D4/D5
277    VADD        qSum7, qSum7, qRound                   ;// odd row sum carries the rounding constant
278    VST1        dRow3, [pDst@64], dstStep
279    VST1        dRow4, [pDst@64], dstStep
280    VADD        qSum5, qSum5, qSum6                    ;// output row 5 = rows 5 + 6 (+ round)
281    VADD        qSum6, qSum6, qSum7                    ;// output row 6 = rows 6 + 7 (+ round)
282    VADD        qSum7, qSum7, qSum8                    ;// output row 7 = rows 7 + 8 (+ round)
283    VSHRN       dRow5, qSum5, #2
284    VSHRN       dRow6, qSum6, #2
285    VSHRN       dRow7, qSum7, #2
286    VST1        dRow5, [pDst@64], dstStep
287    VST1        dRow6, [pDst@64], dstStep
288    VST1        dRow7, [pDst@64], dstStep
289
    ;// No branch here: this macro is expanded last, directly in front of the
    ;// SwitchPredictTypeEnd label, so execution falls through to it.
290    MEND
291;// ***************************************************************************
292
293;// Input/Output Registers
294pSrc                  RN 0              ;// source block pointer, advanced by the interpolation
295srcStep               RN 1              ;// byte distance between source rows
296pSrcResidue           RN 2              ;// residue pointer; tested against 0 to skip the residue stage
297pDst                  RN 3              ;// destination pointer, advanced by every stored row
298dstStep               RN 4              ;// byte distance between destination rows (loaded from the stack)
299predictType           RN 5              ;// loaded from the stack, then reused as the switch index
300rndVal                RN 6              ;// rounding flag from the stack, later the XY rounding constant
301
302;// Local Scratch Registers
    ;// Both names share r0 with pSrc; they are written only after the
    ;// interpolation has finished reading the source.
303pDstCopy              RN 0              ;// store pointer for the residue stage
304return                RN 0              ;// function result
305
306;// Neon Registers
    ;// dRowN holds 8 pixels of source row N; dRowNShft is the next D register
    ;// so that {dRowN, dRowNShft} forms a consecutive pair for 16-byte loads.
307dRow0                 DN D0.U8
308dRow0Shft             DN D1.U8
309dRow1                 DN D2.U8
310dRow1Shft             DN D3.U8
311dRow2                 DN D4.U8
312dRow2Shft             DN D5.U8
313dRow3                 DN D6.U8
314dRow3Shft             DN D7.U8
315dRow4                 DN D8.U8
316dRow4Shft             DN D9.U8
317dRow5                 DN D10.U8
318dRow5Shft             DN D11.U8
319dRow6                 DN D12.U8
320dRow6Shft             DN D13.U8
321dRow7                 DN D14.U8
322dRow7Shft             DN D15.U8
323dRow8                 DN D16.U8
324dRow8Shft             DN D17.U8
325
326
    ;// 16-bit horizontal pair sums used by the HalfPixelXY case.
327qSum0                 QN Q9.U16
328qSum1                 QN Q10.U16
329qSum2                 QN Q11.U16
330qSum3                 QN Q12.U16
331qSum4                 QN Q13.U16
332qSum5                 QN Q14.U16
333qSum6                 QN Q0.U16         ;// overlays D0/D1 = dRow0/dRow0Shft
334qSum7                 QN Q1.U16         ;// overlays D2/D3 = dRow1/dRow1Shft
335qSum8                 QN Q2.U16         ;// overlays D4/D5 = dRow2/dRow2Shft
336
337qRound                QN Q15.U16        ;// rounding constant replicated in every lane
338
    ;// Residue stage: reconstructed pixels re-read from pDst. D0..D7 overlay
    ;// dRow0..dRow3Shft, which are no longer live once interpolation is done.
339dDst0                 DN D0.U8
340dDst1                 DN D1.U8
341dDst2                 DN D2.U8
342dDst3                 DN D3.U8
343dDst4                 DN D4.U8
344dDst5                 DN D5.U8
345dDst6                 DN D6.U8
346dDst7                 DN D7.U8
347
    ;// Residue stage: signed 16-bit residue rows. Q4..Q8 overlay D8..D17
    ;// (dRow4..dRow8Shft) and Q9..Q11 overlay qSum0..qSum2.
348qRes0                 QN Q4.S16
349qRes1                 QN Q5.S16
350qRes2                 QN Q6.S16
351qRes3                 QN Q7.S16
352qRes4                 QN Q8.S16
353qRes5                 QN Q9.S16
354qRes6                 QN Q10.S16
355qRes7                 QN Q11.S16
356
357    ;// Function header
358    M_START     omxVCM4P2_MCReconBlock, r6, d15    ;// NOTE(review): r6/d15 presumably name the highest ARM/NEON registers to preserve - confirm in armCOMM_s.h
359    ;// Define stack arguments
    ;// pSrc, srcStep, pSrcResidue and pDst use r0-r3 (see the RN aliases);
    ;// the remaining three arguments are declared here as 4-byte stack slots.
360    M_ARG       Arg_dstStep,        4
361    M_ARG       Arg_predictType,    4
362    M_ARG       Arg_rndVal,         4
363    ;// Load argument from the stack
364    M_LDR       dstStep, Arg_dstStep
365    M_LDR       predictType, Arg_predictType
366    M_LDR       rndVal, Arg_rndVal
367    ADD         predictType, rndVal, predictType, LSL #1    ;// switch index = 2 * predictType + rndVal
368    RSB         rndVal, rndVal, #2              ;// preparing rndVal for HalfPixelXY: constant = 2 - flag
369
370    ;// The following is implementation of switching to different code segments
371    ;// based on different predictType and rndVal flags. The corresponding
372    ;// labels (e.g. CaseIntegerPixel_Rnd0) are embedded in the macros following
373    ;// M_ENDSWITCH (e.g. M_MCRECONBLOCK_IntegerPixel). The macros
374    ;// "M_MCRECONBLOCK_IntegerPixel" and "M_MCRECONBLOCK_HalfPixelXY" each handle
375    ;// both rounding cases in a single code body, whereas the macros
376    ;// "M_MCRECONBLOCK_HalfPixelX" and "M_MCRECONBLOCK_HalfPixelY" are expanded once per rounding case.
377    ;// All these together implement the interpolation functionality
378
    ;// The case order must match the index computed above: two entries per
    ;// predictType, Rnd0 first.
379    M_SWITCH    predictType
380        M_CASE      CaseIntegerPixel_Rnd0
381        M_CASE      CaseIntegerPixel_Rnd1
382        M_CASE      CaseHalfPixelX_Rnd0
383        M_CASE      CaseHalfPixelX_Rnd1
384        M_CASE      CaseHalfPixelY_Rnd0
385        M_CASE      CaseHalfPixelY_Rnd1
386        M_CASE      CaseHalfPixelXY_Rnd0
387        M_CASE      CaseHalfPixelXY_Rnd1
388    M_ENDSWITCH
389
    ;// Case bodies. Every macro except the last ends with a branch to
    ;// SwitchPredictTypeEnd; HalfPixelXY must stay last because it falls
    ;// through into the label.
390    M_MCRECONBLOCK_IntegerPixel
391    M_MCRECONBLOCK_HalfPixelX 0
392    M_MCRECONBLOCK_HalfPixelX 1
393    M_MCRECONBLOCK_HalfPixelY 0
394    M_MCRECONBLOCK_HalfPixelY 1
395    M_MCRECONBLOCK_HalfPixelXY
396SwitchPredictTypeEnd
397
398    ;// After interpolation is done, residue needs to be added. This is done
399    ;// only in case "pSrcResidue" parameter to the function is not NULL.
400    ;// Following is a completely unrolled code to do so. Each row and
401    ;// corresponding residue is loaded and residue is added and value
402    ;// stored
403
404    CMP         pSrcResidue, #0
405    SUBNE       pDst, pDst, dstStep, LSL #3     ;// Restoring pDst: every case stored 8 rows, so rewind by 8 * dstStep
406    MOVNE       pDstCopy, pDst                  ;// separate store pointer; pDst itself is used for the re-loads
407    BEQ         pSrcResidueConditionEnd         ;// no residue: the interpolated block is the final result
408pSrcResidueNotNull
    ;// Per row: re-load the 8 interpolated pixels from pDst, load 8 signed
    ;// 16-bit residues (128-bit aligned access with pointer writeback), add
    ;// with widening (VADDW), then narrow with unsigned saturation (VQMOVUN),
    ;// which clamps each result to 0..255. Loads, arithmetic and stores of
    ;// different rows are interleaved.
409    VLD1        dDst0, [pDst@64], dstStep
410    VLD1        qRes0, [pSrcResidue@128]!
411    VLD1        dDst1, [pDst@64], dstStep
412    VLD1        qRes1, [pSrcResidue@128]!
413    VLD1        dDst2, [pDst@64], dstStep
414    VLD1        qRes2, [pSrcResidue@128]!
415    VADDW       qRes0, qRes0, dDst0
416    VLD1        dDst3, [pDst@64], dstStep
417    VADDW       qRes1, qRes1, dDst1
418    VLD1        qRes3, [pSrcResidue@128]!
419    VADDW       qRes2, qRes2, dDst2
420    VLD1        dDst4, [pDst@64], dstStep
421    VQMOVUN     dDst0, qRes0
422    VLD1        qRes4, [pSrcResidue@128]!
423    VADDW       qRes3, qRes3, dDst3
424    VLD1        dDst5, [pDst@64], dstStep
425    VQMOVUN     dDst1, qRes1
426    VLD1        qRes5, [pSrcResidue@128]!
427    VADDW       qRes4, qRes4, dDst4
428    VLD1        dDst6, [pDst@64], dstStep
429    VQMOVUN     dDst2, qRes2
430    VLD1        qRes6, [pSrcResidue@128]!
431    VADDW       qRes5, qRes5, dDst5
432    VLD1        dDst7, [pDst@64], dstStep
433    VQMOVUN     dDst3, qRes3
434    VLD1        qRes7, [pSrcResidue@128]!
435    VADDW       qRes6, qRes6, dDst6
    ;// All eight rows have been read back by now, so the stores through
    ;// pDstCopy cannot overwrite a row that is still to be loaded.
436    VST1        dDst0, [pDstCopy@64], dstStep
437    VQMOVUN     dDst4, qRes4
438    VST1        dDst1, [pDstCopy@64], dstStep
439    VADDW       qRes7, qRes7, dDst7
440    VST1        dDst2, [pDstCopy@64], dstStep
441    VQMOVUN     dDst5, qRes5
442    VST1        dDst3, [pDstCopy@64], dstStep
443    VQMOVUN     dDst6, qRes6
444    VST1        dDst4, [pDstCopy@64], dstStep
445    VQMOVUN     dDst7, qRes7
446    VST1        dDst5, [pDstCopy@64], dstStep
447    VST1        dDst6, [pDstCopy@64], dstStep
448    VST1        dDst7, [pDstCopy@64], dstStep
449
450pSrcResidueConditionEnd
451    MOV         return, #OMX_Sts_NoErr          ;// the function always reports success; arguments are not validated here
452
453    M_END
454    ENDIF ;// CortexA8
455    END
456;// ***************************************************************************
457;// omxVCM4P2_MCReconBlock ends
458;// ***************************************************************************
459