1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  omxVCM4P2_MCReconBlock_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   9641
21;// Date:       Thursday, February 7, 2008
22;//
23;//
24;//
25;//
26;// Description:
27;//
28;//
29
30;// Include standard headers
31    INCLUDE omxtypes_s.h
32    INCLUDE armCOMM_s.h
33
34;// Import symbols required from other files
35
36    M_VARIANTS ARM1136JS
37
38;// ***************************************************************************
39;// ARM1136JS implementation
40;// ***************************************************************************
41    IF  ARM1136JS
42
43;// ***************************************************************************
44;// MACRO DEFINITIONS
45;// ***************************************************************************
46    ;// Description:
47    ;//
48    ;//   dest[j] = (x[j] + y[j] + round) >> 1,   j=0..3
49    ;//
50    ;// Similar to UHADD8 instruction, but with a rounding value of 1 added to
51    ;// each sum before dividing by two, if round is 1
52    ;//
53    ;// Syntax:
54    ;// M_UHADD8R   $dest, $x, $y, $round, $mask
55    ;//
56    ;// Inputs:
57    ;// $x        four packed bytes,   x[3] :  x[2]  :  x[1]  :  x[0]
58    ;// $y        four packed bytes,   y[3] :  y[2]  :  y[1]  :  y[0]
59    ;// $round    0 if no rounding to be added, 1 if rounding to be done
60    ;// $mask     some register set to 0x80808080
61    ;//
62    ;// Outputs:
63    ;// $dest     four packed bytes,   z[3] :  z[2]  :  z[1]  :  z[0]
64
    ;// Implementation note: UHADD8 computes a truncating average (x+y)>>1.
    ;// The rounding case uses the per-byte identity
    ;//     (x + y + 1) >> 1  ==  ((y - ~x) >> 1) ^ 0x80
    ;// since ~x = 255-x, UHSUB8 yields (x+y-255)>>1, and EOR with the
    ;// 0x80808080 mask adds 128 back into every byte lane.
    ;// The inner IF only picks which of $x/$y to complement so that $y is
    ;// not destroyed when $dest aliases it.
    MACRO
    M_UHADD8R   $dest, $x, $y, $round, $mask
    IF $round = 1
        IF  $dest /= $y
            MVN         $dest, $x                   ;// dest = ~x = 255 - x
            UHSUB8      $dest, $y, $dest            ;// dest = (x + y - 255) >> 1 per byte
            EOR         $dest, $dest, $mask         ;// dest += 128 per byte => rounded avg
        ELSE
            MVN         $dest, $y                   ;// dest aliases $y: complement y instead
            UHSUB8      $dest, $x, $dest            ;// dest = (x + y - 255) >> 1 per byte
            EOR         $dest, $dest, $mask         ;// dest += 128 per byte => rounded avg
        ENDIF
    ELSE
        UHADD8      $dest, $x, $y                   ;// truncating average, no rounding
    ENDIF
    MEND
81;// ***************************************************************************
82    ;// Description:
83    ;// Load 8 bytes from $pSrc (aligned or unaligned locations)
84    ;//
85    ;// Syntax:
86    ;// M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
87    ;//
88    ;// Inputs:
89    ;// $pSrc       4 byte aligned source pointer to an address just less than
90    ;//             or equal to the data location
91    ;// $srcStep    The stride on source
92    ;// $scratch    A scratch register, used internally for temp calculations
93    ;// $offset     Difference of source data location to the source pointer
94    ;//             Use when $offset != 0 (unaligned load)
95    ;//
96    ;// Outputs:
97    ;// $pSrc       In case the macro accepts stride, it increments the pSrc by
98    ;//             that value, else unchanged
99    ;// $out0       four packed bytes,   z[3] :  z[2]  :  z[1]  :  z[0]
100    ;// $out1       four packed bytes,   z[7] :  z[6]  :  z[5]  :  z[4]
101    ;//
102    ;// Note: {$out0, $out1, $scratch} should be registers with ascending
103    ;// register numbering. In case offset is 0, $scratch is not modified.
104
    MACRO
    M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
        IF $offset = 0
            ;// Aligned case: the 8 source bytes sit exactly in two words
            LDM         $pSrc, {$out0, $out1}
            ADD         $pSrc, $pSrc, $srcStep      ;// advance to the next row
        ELSE
            ;// Unaligned case: load 12 bytes and funnel-shift the wanted 8
            ;// bytes out (little-endian: lower address = lower byte lane)
            LDM         $pSrc, {$out0, $out1, $scratch}
            ADD         $pSrc, $pSrc, $srcStep      ;// advance to the next row

            MOV         $out0, $out0, LSR #8 * $offset
            ORR         $out0, $out0, $out1, LSL #(32 - 8 * ($offset))
            MOV         $out1, $out1, LSR #8 * $offset
            ORR         $out1, $out1, $scratch, LSL #(32 - 8 * ($offset))
        ENDIF
    MEND
120
121;// ***************************************************************************
122    ;// Description:
123    ;// Loads three words for X interpolation, update pointer to next row. For
124    ;// X interpolation, given a truncated-4byteAligned source pointer,
    ;// invariably three continuous words are required from there to get the
126    ;// nine bytes from the source pointer for filtering.
127    ;//
128    ;// Syntax:
129    ;// M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
130    ;//
131    ;// Inputs:
132    ;// $pSrc       4 byte aligned source pointer to an address just less than
133    ;//             or equal to the data location
134    ;//
135    ;// $srcStep    The stride on source
136    ;//
137    ;// $offset     Difference of source data location to the source pointer
138    ;//             Use when $offset != 0 (unaligned load)
139    ;//
140    ;// Outputs:
141    ;// $pSrc       Incremented by $srcStep
142    ;//
143    ;// $word0, $word1, $word2, $word3
144    ;//             Three of these are outputs based on the $offset parameter.
145    ;//             The outputs are specifically generated to be processed by
146    ;//             the M_EXT_XINT macro. Following is the illustration to show
147    ;//             how the nine bytes are spanned for different offsets from
148    ;//             notTruncatedForAlignmentSourcePointer.
149    ;//
150    ;//              ------------------------------------------------------
151    ;//             | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
152    ;//             |------------------------------------------------------|
153    ;//             |    0   |       0     | 0123  | 4567  | 8xxx  |       |
154    ;//             |    1   |      -1     | x012  | 3456  | 78xx  |       |
155    ;//             |    2   |      -2     | xx01  | 2345  | 678x  |       |
156    ;//             |    3   |      -3     | xxx0  |       | 1234  | 5678  |
157    ;//              ------------------------------------------------------
158    ;//
159    ;//             where the numbering (0-8) is to designate the 9 bytes from
    ;//             start of a particular row. The illustration doesn't take
    ;//             into account the positioning of bytes within the word and the
162    ;//             macro combination with M_EXT_XINT will work only in little
163    ;//             endian environs
164    ;//
165    ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
166    ;// register numbering
167
    MACRO
    M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
        IF $offset /= 3
            ;// Offsets 0..2: the 9 needed bytes span the first three words
            LDM         $pSrc, {$word0, $word1, $word2}
        ELSE
            ;// Offset 3: bytes 1..8 fall wholly inside the 2nd and 3rd words,
            ;// so load them straight into $word2/$word3 where M_EXT_XINT
            ;// expects them unchanged ($word1 stays free)
            LDM         $pSrc, {$word0, $word2, $word3}
        ENDIF
        ADD         $pSrc, $pSrc, $srcStep          ;// advance to the next row
    MEND
177
178;// ***************************************************************************
179    ;// Description:
180    ;// Extract four registers of four pixels for X interpolation
181    ;//
182    ;// Syntax:
183    ;// M_EXT_XINT $offset, $word0, $word1, $word2, $word3
184    ;//
185    ;// Inputs:
186    ;// $offset     Difference of source data location to the source pointer
187    ;//             Use when $offset != 0 (unaligned load)
188    ;//
189    ;// $word0, $word1, $word2, $word3
190    ;//             Three of these are inputs based on the $offset parameter.
191    ;//             The inputs are specifically selected to be processed by
192    ;//             the M_EXT_XINT macro.
193    ;//
194    ;//              ------------------------------------------------------
195    ;//             | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
196    ;//             |------------------------------------------------------|
197    ;//             |    0   |       0     | 0123  | 4567  | 8xxx  | yyyy  |
198    ;//             |    1   |      -1     | x012  | 3456  | 78xx  | yyyy  |
199    ;//             |    2   |      -2     | xx01  | 2345  | 678x  | yyyy  |
200    ;//             |    3   |      -3     | xxx0  | yyyy  | 1234  | 5678  |
201    ;//              ------------------------------------------------------
202    ;//
203    ;// Outputs:
204    ;// $word0, $word1, $word2, $word3
205    ;//             Bytes from the original source pointer (not truncated for
206    ;//             4 byte alignment) as shown in the table.
207    ;//              -------------------------------
208    ;//             | word0 | word1 | word2 | word3 |
209    ;//             |-------------------------------|
210    ;//             | 0123  | 4567  | 1234  | 5678  |
211    ;//              -------------------------------
212    ;//
213    ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
214    ;// register numbering
215
    MACRO
    M_EXT_XINT $offset, $word0, $word1, $word2, $word3
        ;// Little-endian funnel shifts: on exit $word0:$word1 hold source
        ;// bytes 0..7 and $word2:$word3 hold bytes 1..8 (the one-pixel
        ;// shifted copy needed for horizontal averaging).
        IF $offset = 0
            ; $word0 and $word1 are ok
            ; $word2, $word3 are just 8 shifted versions
            MOV         $word3, $word1, LSR #8
            ORR         $word3, $word3, $word2, LSL #24
            MOV         $word2, $word0, LSR #8
            ORR         $word2, $word2, $word1, LSL #24
        ELIF $offset = 3
            ; $word2 and $word3 are ok (taken care while loading itself)
            ; set $word0 & $word1
            MOV         $word0, $word0, LSR #24
            ORR         $word0, $word0, $word2, LSL #8
            MOV         $word1, $word2, LSR #24
            ORR         $word1, $word1, $word3, LSL #8
        ELSE
            ; Offsets 1 and 2: first align bytes 0..7 into $word0:$word1
            MOV         $word0, $word0, LSR #8 * $offset
            ORR         $word0, $word0, $word1, LSL #(32 - 8 * ($offset))
            MOV         $word1, $word1, LSR #8 * $offset
            ORR         $word1, $word1, $word2, LSL #(32 - 8 * ($offset))

            ; ...then derive bytes 1..8; note $word2 still holds the raw
            ; third loaded word when $word3 is formed
            MOV         $word3, $word1, LSR #8
            ORR         $word3, $word3, $word2, LSL #(32 - 8 * (($offset)+1))
            MOV         $word2, $word0, LSR #8
            ORR         $word2, $word2, $word1, LSL #24
        ENDIF
    MEND
244
245;// ***************************************************************************
246    ;// Description:
247    ;// Computes half-sum and xor of two inputs and puts them in the input
248    ;// registers in that order
249    ;//
250    ;// Syntax:
251    ;// M_HSUM_XOR      $v0, $v1, $tmp
252    ;//
253    ;// Inputs:
254    ;// $v0         a, first input
255    ;// $v1         b, second input
256    ;// $tmp        scratch register
257    ;//
258    ;// Outputs:
259    ;// $v0         (a + b)/2
260    ;// $v1         a ^ b
261
    ;// Note: $tmp must be distinct from $v0 and $v1, and the instruction
    ;// order is significant - the EOR must read the original $v0 before the
    ;// final MOV overwrites it.
    MACRO
    M_HSUM_XOR      $v0, $v1, $tmp
        UHADD8      $tmp, $v0, $v1     ;// s0 = (a + b) >> 1 per byte lane
        EOR         $v1, $v0, $v1      ;// l0 = a ^ b (lsb of each lane = the bit lost above)
        MOV         $v0, $tmp          ;// v0 = s0
    MEND
268;// ***************************************************************************
269    ;// Description:
270    ;// Calculates average of 4 values (a,b,c,d) for HalfPixelXY predict type in
271    ;// mcReconBlock module. Very specific to the implementation of
272    ;// M_MCRECONBLOCK_HalfPixelXY done here. Uses "tmp" as scratch register and
273    ;// "yMask" for mask variable "0x1010101x" set in it. In yMask 4 lsbs are
274    ;// not significant and are used by the callee for row counter (y)
275    ;//
276    ;// Some points to note are:
277    ;// 1. Input is pair of pair-averages and Xors
278    ;// 2. $sum1 and $lsb1 are not modified and hence can be reused in another
279    ;//    running average
280    ;// 3. Output is in the first argument
281    ;//
282    ;// Syntax:
283    ;// M_AVG4         $sum0, $lsb0, $sum1, $lsb1, $rndVal
284    ;//
285    ;// Inputs:
286    ;// $sum0       (a + b) >> 1, where a and b are 1st and 2nd inputs to be averaged
287    ;// $lsb0       (a ^ b)
288    ;// $sum1       (c + d) >> 1. Not modified
289    ;// $lsb1       (c ^ d)       Not modified
290    ;// $rndVal     Assembler Variable. 0 for rounding, 1 for no rounding
291    ;//
292    ;// Outputs:
293    ;// $sum0       (a + b + c + d + 1) / 4 : If no rounding
294    ;//             (a + b + c + d + 2) / 4 : If rounding
295
    MACRO
    M_AVG4          $sum0, $lsb0, $sum1, $lsb1, $rndVal
        LCLS OP1
        LCLS OP2
        ;// The correction bit added to (s0+s1)/2 recovers the lsbs lost in
        ;// the halving adds: ((l0 AND l1) OR (s0^s1)) for rounding, and
        ;// ((l0 OR l1) AND (s0^s1)) for no rounding, masked to bit 0 of
        ;// each byte lane.
        IF $rndVal = 0 ;// rounding case
OP1 SETS "AND"
OP2 SETS "ORR"
        ELSE           ;// Not rounding case
OP1 SETS "ORR"
OP2 SETS "AND"
        ENDIF

        LCLS lsb2
        LCLS sum2
        LCLS dest

lsb2  SETS "tmp"       ;// callee's scratch register "tmp"
sum2  SETS "$lsb0"     ;// $lsb0 is dead after the $OP2 below; reuse it
dest  SETS "$sum0"     ;// result overwrites $sum0

        $OP1        $lsb0, $lsb0, $lsb1          ;// e0 = e0 & e1
        EOR         $lsb2, $sum0, $sum1          ;// e2 = s0 ^ s1
        $OP2        $lsb2, $lsb2, $lsb0          ;// e2 = e2 | e0
        AND         $lsb2, $lsb2, yMask, LSR # 4 ;// e2 &= 0x01010101 (mask half of yMask)
        UHADD8      $sum2, $sum0, $sum1          ;// s2 = (s0 + s1)/2
        UADD8       $dest, $sum2, $lsb2          ;// dest =  s2 + e2
    MEND
323;// ***************************************************************************
324;// Motion compensation handler macros
325;// ***************************************************************************
326    ;// Description:
327    ;// Implement motion compensation routines using the named registers in
328    ;// callee function. Each of the following 4 implement the 4 predict type
329    ;// Each handles 8 cases each ie all the combinations of 4 types of source
330    ;// alignment offsets and 2 types of rounding flag
331    ;//
332    ;// Syntax:
333    ;// M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
334    ;// M_MCRECONBLOCK_HalfPixelX   $rndVal, $offset
335    ;// M_MCRECONBLOCK_HalfPixelY   $rndVal, $offset
336    ;// M_MCRECONBLOCK_HalfPixelXY  $rndVal, $offset
337    ;//
338    ;// Inputs:
339    ;// $rndVal     Assembler Variable. 0 for rounding, 1 for no rounding
340    ;// $offset     $pSrc MOD 4 value. Offset from 4 byte aligned location.
341    ;//
342    ;// Outputs:
343    ;// Outputs come in the named registers of the callee functions
344    ;// The macro loads the data from the source pointer, processes it and
345    ;// stores in the destination pointer. Does the whole prediction cycle
346    ;// of Motion Compensation routine for a particular predictType
347    ;// After this only residue addition to the predicted values remain
348
    MACRO
    M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
    ;// Algorithmic Description:
    ;// This handles motion compensation for IntegerPixel predictType. Both
    ;// rounding cases are handled by the same code base. It is just a copy
    ;// from source to destination. Two lines are done per loop to reduce
    ;// stalls. Loop has been software pipelined as well for that purpose.
    ;//
    ;// M_LOAD_X loads a whole row in two registers and then they are stored
    ;//
    ;// NOTE(review): the pipelined loads run one iteration ahead, so the
    ;// final pass reads two rows past the 8-row block - presumably safe for
    ;// the surrounding frame buffer; confirm against callers.

CaseIntegerPixelRnd0Offset$offset               ;// rounding cannot change a plain copy,
CaseIntegerPixelRnd1Offset$offset               ;// so both rounding cases share this body
    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset
    M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
YloopIntegerPixelOffset$offset
    SUBS        y, y, #2                        ;// two rows per iteration
    STRD        tmp1, tmp2, [pDst], dstStep     ;// store row n
    STRD        tmp3, tmp4, [pDst], dstStep     ;// store row n+1
    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset ;// pipelined loads for
    M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset ;// the next iteration
    BGT         YloopIntegerPixelOffset$offset

    B           SwitchPredictTypeEnd
    MEND
373;// ***************************************************************************
    MACRO
    M_MCRECONBLOCK_HalfPixelX $rndVal, $offset
    ;// Algorithmic Description:
    ;// This handles motion compensation for HalfPixelX predictType. The two
    ;// rounding cases are handled by the different code base and spanned by
    ;// different macro calls. Loop has been software pipelined to reduce
    ;// stalls.
    ;//
    ;// Filtering involves averaging a pixel with the next horizontal pixel.
    ;// M_LOAD_XINT and M_EXT_XINT combination generate 4 registers, 2 with
    ;// all pixels in a row with 4 pixel in each register and another 2
    ;// registers with pixels corresponding to one horizontally shifted pixel
    ;// corresponding to the initial row pixels. These are set of packed
    ;// registers appropriate to do 4 lane SIMD.
    ;// After that M_UHADD8R macro does the averaging taking care of the
    ;// rounding as required
    ;//
    ;// NOTE(review): the pipelined load runs one iteration ahead, reading
    ;// one row past the 8-row block - presumably safe; confirm with callers.

CaseHalfPixelXRnd$rndVal.Offset$offset
    IF $rndVal = 0
        LDR mask, =0x80808080      ;// only M_UHADD8R's rounding path needs the mask
    ENDIF

    M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4  ;// pre-load row 0
YloopHalfPixelXRnd$rndVal.Offset$offset
    SUBS        y, y, #1                            ;// one row per iteration
    M_EXT_XINT  $offset, tmp1, tmp2, tmp3, tmp4     ;// row + 1-pixel-shifted row
    M_UHADD8R   tmp5, tmp1, tmp3, (1-$rndVal), mask ;// rndVal=0 => rounded average
    M_UHADD8R   tmp6, tmp2, tmp4, (1-$rndVal), mask
    STRD        tmp5, tmp6, [pDst], dstStep
    M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4  ;// pipelined next row
    BGT         YloopHalfPixelXRnd$rndVal.Offset$offset

    B           SwitchPredictTypeEnd
    MEND
408;// ***************************************************************************
    MACRO
    M_MCRECONBLOCK_HalfPixelY $rndVal, $offset
    ;// Algorithmic Description:
    ;// This handles motion compensation for HalfPixelY predictType. The two
    ;// rounding cases are handled by the different code base and spanned by
    ;// different macro calls. PreLoading is used to avoid reload of same data.
    ;//
    ;// Filtering involves averaging a pixel with the next vertical pixel.
    ;// M_LOAD_X generates 2 registers with all pixels in a row with 4 pixel in
    ;// each register. These are set of packed registers appropriate to do
    ;// 4 lane SIMD. After that M_UHADD8R macro does the averaging taking care
    ;// of the rounding as required
    ;// Exactly 9 source rows are read for the 8 output rows (one pre-load
    ;// plus two loads per iteration, each row reused once).

CaseHalfPixelYRnd$rndVal.Offset$offset
    IF $rndVal = 0
        LDR mask, =0x80808080      ;// only M_UHADD8R's rounding path needs the mask
    ENDIF

    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset ;// Pre-load row 0
YloopHalfPixelYRnd$rndVal.Offset$offset
    SUBS        y, y, #2                                 ;// two rows per iteration
    ;// Processing one line
    M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset ;// row n+1
    M_UHADD8R   tmp1, tmp1, tmp3, (1-$rndVal), mask      ;// avg(row n, row n+1)
    M_UHADD8R   tmp2, tmp2, tmp4, (1-$rndVal), mask
    STRD        tmp1, tmp2, [pDst], dstStep
    ;// Processing another line
    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset ;// row n+2, becomes next "row n"
    M_UHADD8R   tmp3, tmp3, tmp1, (1-$rndVal), mask      ;// avg(row n+1, row n+2)
    M_UHADD8R   tmp4, tmp4, tmp2, (1-$rndVal), mask
    STRD        tmp3, tmp4, [pDst], dstStep

    BGT         YloopHalfPixelYRnd$rndVal.Offset$offset

    B           SwitchPredictTypeEnd
    MEND
445;// ***************************************************************************
    MACRO
    M_MCRECONBLOCK_HalfPixelXY $rndVal, $offset
    ;// Algorithmic Description:
    ;// This handles motion compensation for HalfPixelXY predictType. The two
    ;// rounding cases are handled by the different code base and spanned by
    ;// different macro calls. PreLoading is used to avoid reload of same data.
    ;//
    ;// Filtering involves averaging a pixel with the next vertical, horizontal
    ;// and right-down diagonal pixels. Just as in HalfPixelX case, M_LOAD_XINT
    ;// and M_EXT_XINT combination generates 4 registers with a row and its
    ;// 1 pixel right shifted version, with 4 pixels in one register. Another
    ;// call of that macro-combination gets another row. Then M_HSUM_XOR is
    ;// called to get mutual half-sum and xor combinations of a row with its
    ;// shifted version as they are inputs to the M_AVG4 macro which computes
    ;// the 4 element average with rounding. Note that it is the half-sum/xor
    ;// values that are preserved for next row as they can be re-used in the
    ;// next call to the M_AVG4 and saves recomputation.
    ;// Due to lack of register, the row counter and a masking value required
    ;// in M_AVG4 are packed into a single register yMask where the last nibble
    ;// holds the row counter values and rest holds the masking variable left
    ;// shifted by 4

CaseHalfPixelXYRnd$rndVal.Offset$offset
    LDR         yMask, =((0x01010101 << 4) + 8) ;// mask in bits [31:4], row count 8 in [3:0]

    M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
    M_EXT_XINT  $offset, t00, t01, t10, t11
    M_HSUM_XOR  t00, t10, tmp               ;// s0, l0
    M_HSUM_XOR  t01, t11, tmp               ;// s0', l0'

YloopHalfPixelXYRnd$rndVal.Offset$offset
    ;// Processing one line
    ;// t00, t01, t10, t11 required from previous loop
    M_LOAD_XINT pSrc, srcStep, $offset, t20, t21, t30, t31 ;// Load c, c', d, d'
    SUB         yMask, yMask, #2            ;// row counter -= 2 (two rows per iteration)
    M_EXT_XINT  $offset, t20, t21, t30, t31
    M_HSUM_XOR  t20, t30, tmp               ;// s1, l1
    M_HSUM_XOR  t21, t31, tmp               ;// s1', l1'
    M_AVG4      t00, t10, t20, t30, $rndVal ;// s0, l0, s1, l1
    M_AVG4      t01, t11, t21, t31, $rndVal ;// s0', l0', s1', l1'
    STRD        t00, t01, [pDst], dstStep   ;// store the average

    ;// Processing another line
    ;// t20, t21, t30, t31 required from above
    M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
    TST         yMask, #7                   ;// Z set once the row counter hits 0
    M_EXT_XINT  $offset, t00, t01, t10, t11
    M_HSUM_XOR  t00, t10, tmp
    M_HSUM_XOR  t01, t11, tmp
    M_AVG4      t20, t30, t00, t10, $rndVal
    M_AVG4      t21, t31, t01, t11, $rndVal
    STRD        t20, t21, [pDst], dstStep

    BGT         YloopHalfPixelXYRnd$rndVal.Offset$offset

    IF $offset/=3 :LOR: $rndVal/=1
        ;// The Rnd1/Offset3 instance is generated last and falls straight
        ;// through to SwitchPredictTypeEnd, so its branch is elided
        B           SwitchPredictTypeEnd
    ENDIF
    MEND
505;// ***************************************************************************
506;// Motion compensation handler macros end here
507;// ***************************************************************************
508    ;// Description:
509    ;// Populates all 4 kinds of offsets "cases" for each predictType and rndVal
510    ;// combination in the "switch" to prediction processing code segment
511    ;//
512    ;// Syntax:
513    ;// M_CASE_OFFSET $rnd, $predictType
514    ;//
515    ;// Inputs:
516    ;// $rnd            0 for rounding, 1 for no rounding
517    ;// $predictType    The prediction mode
518    ;//
519    ;// Outputs:
520    ;// Populated list of "M_CASE"s for the "M_SWITCH" macro
521
    MACRO
    M_CASE_OFFSET $rnd, $predictType
        ;// One jump-table entry per source-alignment offset (pSrc MOD 4)
        M_CASE      Case$predictType.Rnd$rnd.Offset0
        M_CASE      Case$predictType.Rnd$rnd.Offset1
        M_CASE      Case$predictType.Rnd$rnd.Offset2
        M_CASE      Case$predictType.Rnd$rnd.Offset3
    MEND
529;// ***************************************************************************
530    ;// Description:
531    ;// Populates all 2 kinds of rounding "cases" for each predictType in the
532    ;// "switch" to prediction processing code segment
533    ;//
534    ;// Syntax:
    ;// M_CASE_MCRECONBLOCK $predictType
536    ;//
537    ;// Inputs:
538    ;// $predictType    The prediction mode
539    ;//
540    ;// Outputs:
541    ;// Populated list of "M_CASE_OFFSET" macros
542
    MACRO
    M_CASE_MCRECONBLOCK $predictType
        ;// Emit jump-table entries for both rounding variants of $predictType
        M_CASE_OFFSET  0, $predictType ;// 0 for rounding
        M_CASE_OFFSET  1, $predictType ;// 1 for no rounding
    MEND
548;// ***************************************************************************
549    ;// Description:
550    ;// Populates all 8 kinds of rounding and offset combinations handling macros
551    ;// for the specified predictType. In case of "IntegerPixel" predictType,
552    ;// rounding is not required so same code segment handles both cases
553    ;//
554    ;// Syntax:
555    ;// M_MCRECONBLOCK    $predictType
556    ;//
557    ;// Inputs:
558    ;// $predictType    The prediction mode
559    ;//
560    ;// Outputs:
561    ;// Populated list of "M_MCRECONBLOCK_<predictType>" macros for specified
562    ;// predictType. Each
563    ;//                 M_MCRECONBLOCK_<predictType> $rnd, $offset
564    ;// is an code segment (starting with a label indicating the predictType,
565    ;// rounding and offset combination)
566    ;// Four calls of this macro with the 4 prediction modes populate all the 32
567    ;// handlers
568
    MACRO
    M_MCRECONBLOCK $predictType
        ;// Rounding-on (rndVal = 0) handlers, one per alignment offset
        M_MCRECONBLOCK_$predictType 0, 0
        M_MCRECONBLOCK_$predictType 0, 1
        M_MCRECONBLOCK_$predictType 0, 2
        M_MCRECONBLOCK_$predictType 0, 3
    IF "$predictType" /= "IntegerPixel" ;// If not IntegerPixel then rounding makes a difference
        ;// Rounding-off (rndVal = 1) handlers; IntegerPixel instead defines
        ;// both Rnd0 and Rnd1 labels on one shared body
        M_MCRECONBLOCK_$predictType 1, 0
        M_MCRECONBLOCK_$predictType 1, 1
        M_MCRECONBLOCK_$predictType 1, 2
        M_MCRECONBLOCK_$predictType 1, 3
    ENDIF
    MEND
582;// ***************************************************************************
;// Input/Output Registers
;// Several names deliberately alias one physical register; the aliases are
;// live at disjoint phases of the function (see notes below).
pSrc                  RN 0   ;// source block pointer (argument)
srcStep               RN 1   ;// source stride (argument)
arg_pSrcResidue       RN 2   ;// residue pointer argument; spilled to stack at
                             ;// entry because r2 is then reused as dstStep
pSrcResidue           RN 12  ;// residue pointer once reloaded from the stack
pDst                  RN 3   ;// destination pointer (argument)
dstStep               RN 2   ;// destination stride (stack argument, reuses r2)
predictType           RN 10  ;// prediction mode; becomes the 32-way dispatch index
rndVal                RN 11  ;// rounding flag (stack argument)
mask                  RN 11  ;// 0x80808080 for M_UHADD8R; r11 is free once
                             ;// rndVal has been folded into the dispatch index

;// Local Scratch Registers
zero                  RN 12
y                     RN 14  ;// row counter

;// Scratch names used by the IntegerPixel/HalfPixelX/HalfPixelY handlers
tmp1                  RN 4
tmp2                  RN 5
tmp3                  RN 6
tmp4                  RN 7
tmp5                  RN 8
tmp6                  RN 9
tmp7                  RN 10
tmp8                  RN 11
tmp9                  RN 12

;// Register pairs used by the HalfPixelXY handler: tN0/tN1 hold one row
;// (or its half-sum/xor) as loaded by M_LOAD_XINT / produced by M_HSUM_XOR
t00                   RN 4
t01                   RN 5
t10                   RN 6
t11                   RN 7
t20                   RN 8
t21                   RN 9
t30                   RN 10
t31                   RN 11
tmp                   RN 12  ;// also the scratch register referenced inside M_AVG4

yMask                 RN 14  ;// packed (mask << 4) | row counter, see M_AVG4 notes

dst                   RN 1   ;// reuses srcStep during the residue-addition phase
return                RN 0   ;// function result (OMX_Sts_NoErr)
622
    ;// Allocate memory on stack
    M_ALLOC4    Stk_pDst,           4
    M_ALLOC4    Stk_pSrcResidue,    4
    ;// Function header
    M_START     omxVCM4P2_MCReconBlock, r11
    ;// Define stack arguments
    M_ARG       Arg_dstStep,        4
    M_ARG       Arg_predictType,    4
    M_ARG       Arg_rndVal,         4
    ;// Save on stack
    M_STR       pDst, Stk_pDst
    M_STR       arg_pSrcResidue, Stk_pSrcResidue ;// spill r2 before it is reused as dstStep
    ;// Load argument from the stack
    M_LDR       dstStep, Arg_dstStep
    M_LDR       predictType, Arg_predictType
    M_LDR       rndVal, Arg_rndVal

    MOV         y, #8                       ;// 8 rows in the 8x8 block

    ;// Pack the 32-way dispatch index:
    ;//     predictType = (predictType << 3) | (rndVal << 2) | (pSrc & 3)
    AND         tmp1, pSrc, #3
    ORR         predictType, tmp1, predictType, LSL #3
    ORR         predictType, predictType, rndVal, LSL #2
    ;// Truncating source pointer to align to 4 byte location
    BIC         pSrc, pSrc, #3

    ;// Implementation takes care of all combinations of different
    ;// predictTypes, rounding cases and source pointer offsets to alignment
    ;// of 4 bytes in different code bases unless one of these parameter wasn't
    ;// making any difference to the implementation. Below M_CASE_MCRECONBLOCK
    ;// macros branch into 8 M_CASE macros for all combinations of the 2
    ;// rounding cases and 4 offsets of the pSrc pointer to the 4 byte
    ;// alignment.
    M_SWITCH    predictType
        M_CASE_MCRECONBLOCK IntegerPixel
        M_CASE_MCRECONBLOCK HalfPixelX
        M_CASE_MCRECONBLOCK HalfPixelY
        M_CASE_MCRECONBLOCK HalfPixelXY
    M_ENDSWITCH

    ;// The M_MCRECONBLOCK macros populate the code bases by calling all 8
    ;// particular macros (4 in case of IntegerPixel as rounding makes no
    ;// difference there) to generate the code for all cases of rounding and
    ;// offsets. LTORG is used to segment the code as code size bloated beyond
    ;// 4KB.
    M_MCRECONBLOCK IntegerPixel
    M_MCRECONBLOCK HalfPixelX
    LTORG                                  ;// dump literal pool within LDR= reach
    M_MCRECONBLOCK HalfPixelY
    M_MCRECONBLOCK HalfPixelXY
SwitchPredictTypeEnd

    ;// Residue Addition
    ;// This is done in 2 lane SIMD though loads are further optimized and
    ;// 4 bytes are loaded in case of destination buffer. Algorithmic
    ;// details are in inlined comments
    M_LDR       pSrcResidue, Stk_pSrcResidue
    CMP         pSrcResidue, #0            ;// NULL residue => prediction only
    BEQ         pSrcResidueConditionEnd
pSrcResidueNotNull
    M_LDR       pDst, Stk_pDst             ;// rewind pDst to the block start
    MOV         y, #8
    SUB         dstStep, dstStep, #4       ;// first half-row store advances pDst by 4
Yloop_pSrcResidueNotNull
    SUBS        y, y, #1
    ;// First 4 pixels of the row: add 16-bit residues, saturate to [0,255]
    LDR         dst, [pDst]                ;// dst = [dcba]
    LDMIA       pSrcResidue!, {tmp1, tmp2} ;// tmp1=[DC] tmp2=[BA] (16-bit residues)
    PKHBT       tmp3, tmp1, tmp2, LSL #16  ;// Deltaval1 = [C A]
    PKHTB       tmp4, tmp2, tmp1, ASR #16  ;// DeltaVal2 = [D B]
    UXTB16      tmp1, dst                  ;// tmp1 = [0c0a]
    UXTB16      tmp2, dst, ROR #8          ;// tmp2 = [0d0b]
    QADD16      tmp1, tmp1, tmp3           ;// Add and saturate to 16 bits
    QADD16      tmp2, tmp2, tmp4
    USAT16      tmp1, #8, tmp1             ;// clamp each halfword to [0, 255]
    USAT16      tmp2, #8, tmp2             ;// armClip(0, 255, tmp2)
    ORR         tmp1, tmp1, tmp2, LSL #8   ;// tmp1 = [dcba]
    STR         tmp1, [pDst], #4

    ;// Second 4 pixels of the row: same scheme
    LDR         dst, [pDst]
    LDMIA       pSrcResidue!, {tmp1, tmp2}
    PKHBT       tmp3, tmp1, tmp2, LSL #16
    PKHTB       tmp4, tmp2, tmp1, ASR #16
    UXTB16      tmp1, dst
    UXTB16      tmp2, dst, ROR #8
    QADD16      tmp1, tmp1, tmp3
    QADD16      tmp2, tmp2, tmp4
    USAT16      tmp1, #8, tmp1
    USAT16      tmp2, #8, tmp2
    ORR         tmp1, tmp1, tmp2, LSL #8
    STR         tmp1, [pDst], dstStep      ;// dstStep was pre-decremented by 4 above

    BGT         Yloop_pSrcResidueNotNull
pSrcResidueConditionEnd

    MOV         return, #OMX_Sts_NoErr

    M_END
719    ENDIF ;// ARM1136JS
720
721;// ***************************************************************************
722;// CortexA8 implementation
723;// ***************************************************************************
724    END
725;// ***************************************************************************
726;// omxVCM4P2_MCReconBlock ends
727;// ***************************************************************************
728