10c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
278e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// Copyright (C) 2004 ARM Limited
378e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
478e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// Licensed under the Apache License, Version 2.0 (the "License");
578e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// you may not use this file except in compliance with the License.
678e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// You may obtain a copy of the License at
778e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
878e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//      http://www.apache.org/licenses/LICENSE-2.0
978e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
1078e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// Unless required by applicable law or agreed to in writing, software
1178e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// distributed under the License is distributed on an "AS IS" BASIS,
1278e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1378e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// See the License for the specific language governing permissions and
1478e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// limitations under the License.
1578e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
1678e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
170c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
180c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IDCT_s.s
190c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
200c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Inverse DCT module
210c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
220c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
230c1bc742181ded4930842b46e9507372f0b1b963James Dong;// ALGORITHM DESCRIPTION
240c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
250c1bc742181ded4930842b46e9507372f0b1b963James Dong;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
260c1bc742181ded4930842b46e9507372f0b1b963James Dong;// column and then a 1D IDCT for each row.
270c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
280c1bc742181ded4930842b46e9507372f0b1b963James Dong;// The 8-point 1D IDCT is defined by
290c1bc742181ded4930842b46e9507372f0b1b963James Dong;//   f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
300c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
310c1bc742181ded4930842b46e9507372f0b1b963James Dong;//   C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
320c1bc742181ded4930842b46e9507372f0b1b963James Dong;//   c(u,x) = cos( (2x+1)*u*pi/16 )
330c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
340c1bc742181ded4930842b46e9507372f0b1b963James Dong;// We compute the 8-point 1D IDCT using the reverse of
350c1bc742181ded4930842b46e9507372f0b1b963James Dong;// the Arai-Agui-Nakajima flow graph which we split into
360c1bc742181ded4930842b46e9507372f0b1b963James Dong;// 5 stages named in reverse order to identify with the
370c1bc742181ded4930842b46e9507372f0b1b963James Dong;// forward DCT. Direct inversion of the forward formulae
380c1bc742181ded4930842b46e9507372f0b1b963James Dong;// in file FDCT_s.s gives:
390c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
400c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 5:   j(u) = T(u)*A(u)  [ A(u)=4*C(u)*c(u,0) ]
410c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             [ A(0) = 2*sqrt(2)
420c1bc742181ded4930842b46e9507372f0b1b963James Dong;//               A(u) = 4*cos(u*pi/16)  for (u!=0) ]
430c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
440c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 4:   i0 = j0             i1 = j4
450c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i3 = (j2+j6)/2      i2 = (j2-j6)/2
460c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i7 = (j5+j3)/2      i4 = (j5-j3)/2
470c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i5 = (j1+j7)/2      i6 = (j1-j7)/2
480c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
490c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 3:   h0 = (i0+i1)/2      h1 = (i0-i1)/2
500c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h2 = (i2*sqrt2)-i3  h3 = i3
510c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h4 =  cos(pi/8)*i4 + sin(pi/8)*i6
520c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
530c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             [ The above two lines rotate by -(pi/8) ]
540c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h5 = (i5-i7)/sqrt2  h7 = (i5+i7)/2
550c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
560c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 2:   g0 = (h0+h3)/2      g3 = (h0-h3)/2
570c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g1 = (h1+h2)/2      g2 = (h1-h2)/2
580c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g7 = h7             g6 = h6 - h7
590c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g5 = h5 - g6        g4 = h4 - g5
600c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
610c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 1:   f0 = (g0+g7)/2      f7 = (g0-g7)/2
620c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f1 = (g1+g6)/2      f6 = (g1-g6)/2
630c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f2 = (g2+g5)/2      f5 = (g2-g5)/2
640c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f3 = (g3+g4)/2      f4 = (g3-g4)/2
650c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
660c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Note that most coefficients are halved 3 times during the
670c1bc742181ded4930842b46e9507372f0b1b963James Dong;// above calculation. We can rescale the algorithm dividing
680c1bc742181ded4930842b46e9507372f0b1b963James Dong;// the input by 8 to remove the halvings.
690c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
700c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 5:   j(u) = T(u)*A(u)/8
710c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
720c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 4:   i0 = j0             i1 = j4
730c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i3 = j2 + j6        i2 = j2 - j6
740c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i7 = j5 + j3        i4 = j5 - j3
750c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i5 = j1 + j7        i6 = j1 - j7
760c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
770c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 3:   h0 = i0 + i1        h1 = i0 - i1
780c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h2 = (i2*sqrt2)-i3  h3 = i3
790c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
800c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
810c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h5 = (i5-i7)*sqrt2  h7 = i5 + i7
820c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
830c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 2:   g0 = h0 + h3        g3 = h0 - h3
840c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g1 = h1 + h2        g2 = h1 - h2
850c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g7 = h7             g6 = h6 - h7
860c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g5 = h5 - g6        g4 = h4 - g5
870c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
880c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 1:   f0 = g0 + g7        f7 = g0 - g7
890c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f1 = g1 + g6        f6 = g1 - g6
900c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f2 = g2 + g5        f5 = g2 - g5
910c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f3 = g3 + g4        f4 = g3 - g4
920c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
930c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Note:
940c1bc742181ded4930842b46e9507372f0b1b963James Dong;// 1. The scaling by A(u)/8 can often be combined with inverse
950c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    quantization. The column and row scalings can be combined.
960c1bc742181ded4930842b46e9507372f0b1b963James Dong;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
970c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    to the above code but is otherwise identical.
980c1bc742181ded4930842b46e9507372f0b1b963James Dong;// 3. The rotation by -pi/8 can be performed using three multiplies
990c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
1000c1bc742181ded4930842b46e9507372f0b1b963James Dong;//       -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
1010c1bc742181ded4930842b46e9507372f0b1b963James Dong;// 4. If |T(u)|<=1 then from the IDCT definition,
1020c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
1030c1bc742181ded4930842b46e9507372f0b1b963James Dong;//            = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
1040c1bc742181ded4930842b46e9507372f0b1b963James Dong;//            = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
1050c1bc742181ded4930842b46e9507372f0b1b963James Dong;//            = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
1060c1bc742181ded4930842b46e9507372f0b1b963James Dong;//            = (approx)2.64
1070c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
1080c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    The table below shows input patterns generating the maximum
1090c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1
1100c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    InputPattern      Max |f(x)|
1110c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PPPPPPPP        |f0| =  2.64
1120c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PPPMMMMM        |f1| =  2.64
1130c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PPMMMPPP        |f2| =  2.64
1140c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PPMMPPMM        |f3| =  2.64
1150c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PMMPPMMP        |f4| =  2.64
1160c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PMMPMMPM        |f5| =  2.64
1170c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PMPPMPMP        |f6| =  2.64
1180c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PMPMPMPM        |f7| =  2.64
1190c1bc742181ded4930842b46e9507372f0b1b963James Dong;//   Note that this input pattern is the transpose of the
1200c1bc742181ded4930842b46e9507372f0b1b963James Dong;//   corresponding max input pattern for the FDCT.
1210c1bc742181ded4930842b46e9507372f0b1b963James Dong
1220c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Arguments
1230c1bc742181ded4930842b46e9507372f0b1b963James Dong
1240c1bc742181ded4930842b46e9507372f0b1b963James DongpSrc    RN 0    ;// r0: pointer to the source (input) data buffer
1250c1bc742181ded4930842b46e9507372f0b1b963James DongStride  RN 1    ;// r1: destination stride in bytes
1260c1bc742181ded4930842b46e9507372f0b1b963James DongpDest   RN 2    ;// r2: pointer to the destination (output) data buffer
1270c1bc742181ded4930842b46e9507372f0b1b963James DongpScale  RN 3    ;// r3: pointer to the aan-scale table (format selected by $inscale)
1280c1bc742181ded4930842b46e9507372f0b1b963James Dong
1290c1bc742181ded4930842b46e9507372f0b1b963James Dong
1300c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// DCT Inverse Macro
1310c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// The DCT code should be parametrized according
1320c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// to the following inputs:
1330c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $outsize = "u8"  :  8-bit unsigned data saturated (0 to +255)
1340c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//            "s9"  : 16-bit signed data saturated to 9-bit (-256 to +255)
1350c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//            "s16" : 16-bit signed data not saturated (max size ~+/-14273)
1360c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
1370c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//            "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
1380c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
1390c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Inputs:
1400c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pSrc   = r0 = Pointer to input data
1410c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//               Range is -256 to +255 (9-bit)
1420c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Stride = r1 = Destination stride in bytes (distance between output lines)
1430c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pDest  = r2 = Pointer to output data
1440c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
1450c1bc742181ded4930842b46e9507372f0b1b963James Dong
1460c1bc742181ded4930842b46e9507372f0b1b963James Dong
1470c1bc742181ded4930842b46e9507372f0b1b963James Dong
1480c1bc742181ded4930842b46e9507372f0b1b963James Dong        MACRO
1490c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT  $outsize, $inscale, $stride
1500c1bc742181ded4930842b46e9507372f0b1b963James Dong        LCLA    SHIFT
1510c1bc742181ded4930842b46e9507372f0b1b963James Dong
1520c1bc742181ded4930842b46e9507372f0b1b963James Dong
1530c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ARM1136JS
1540c1bc742181ded4930842b46e9507372f0b1b963James Dong
1550c1bc742181ded4930842b46e9507372f0b1b963James Dong;// REGISTER ALLOCATION
1560c1bc742181ded4930842b46e9507372f0b1b963James Dong;// This is hard since we have 8 values, 9 free registers and each
1570c1bc742181ded4930842b46e9507372f0b1b963James Dong;// butterfly requires a temporary register. We also want to
1580c1bc742181ded4930842b46e9507372f0b1b963James Dong;// maintain register order so we can use LDM/STM. The table below
1590c1bc742181ded4930842b46e9507372f0b1b963James Dong;// summarises the register allocation that meets all these criteria.
1600c1bc742181ded4930842b46e9507372f0b1b963James Dong;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
1610c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
1620c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r1  a01     g0  h0
1630c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r4  b01 f0  g1  h1  i0
1640c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r5  a23 f1  g2      i1
1650c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r6  b23 f2  g3  h2  i2
1660c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r7  a45 f3      h3  i3
1670c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r8  b45 f4  g4  h4  i4
1680c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r9  a67 f5  g5  h5  i5
1690c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r10 b67 f6  g6  h6  i6
1700c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r11     f7  g7  h7  i7
1710c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
1720c1bc742181ded4930842b46e9507372f0b1b963James Dongra01    RN 1
1730c1bc742181ded4930842b46e9507372f0b1b963James Dongrb01    RN 4
1740c1bc742181ded4930842b46e9507372f0b1b963James Dongra23    RN 5
1750c1bc742181ded4930842b46e9507372f0b1b963James Dongrb23    RN 6
1760c1bc742181ded4930842b46e9507372f0b1b963James Dongra45    RN 7
1770c1bc742181ded4930842b46e9507372f0b1b963James Dongrb45    RN 8
1780c1bc742181ded4930842b46e9507372f0b1b963James Dongra67    RN 9
1790c1bc742181ded4930842b46e9507372f0b1b963James Dongrb67    RN 10
1800c1bc742181ded4930842b46e9507372f0b1b963James Dongrtmp    RN 11
1810c1bc742181ded4930842b46e9507372f0b1b963James DongcsPiBy8 RN 12   ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
1820c1bc742181ded4930842b46e9507372f0b1b963James DongLoopRR2 RN 14   ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
1830c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Transpose allocation
1840c1bc742181ded4930842b46e9507372f0b1b963James Dongxft     RN ra01
1850c1bc742181ded4930842b46e9507372f0b1b963James Dongxf0     RN rb01
1860c1bc742181ded4930842b46e9507372f0b1b963James Dongxf1     RN ra23
1870c1bc742181ded4930842b46e9507372f0b1b963James Dongxf2     RN rb23
1880c1bc742181ded4930842b46e9507372f0b1b963James Dongxf3     RN ra45
1890c1bc742181ded4930842b46e9507372f0b1b963James Dongxf4     RN rb45
1900c1bc742181ded4930842b46e9507372f0b1b963James Dongxf5     RN ra67
1910c1bc742181ded4930842b46e9507372f0b1b963James Dongxf6     RN rb67
1920c1bc742181ded4930842b46e9507372f0b1b963James Dongxf7     RN rtmp
1930c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 1 allocation
1940c1bc742181ded4930842b46e9507372f0b1b963James Dongxg0     RN xft
1950c1bc742181ded4930842b46e9507372f0b1b963James Dongxg1     RN xf0
1960c1bc742181ded4930842b46e9507372f0b1b963James Dongxg2     RN xf1
1970c1bc742181ded4930842b46e9507372f0b1b963James Dongxg3     RN xf2
1980c1bc742181ded4930842b46e9507372f0b1b963James Dongxgt     RN xf3
1990c1bc742181ded4930842b46e9507372f0b1b963James Dongxg4     RN xf4
2000c1bc742181ded4930842b46e9507372f0b1b963James Dongxg5     RN xf5
2010c1bc742181ded4930842b46e9507372f0b1b963James Dongxg6     RN xf6
2020c1bc742181ded4930842b46e9507372f0b1b963James Dongxg7     RN xf7
2030c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 2 allocation
2040c1bc742181ded4930842b46e9507372f0b1b963James Dongxh0     RN xg0
2050c1bc742181ded4930842b46e9507372f0b1b963James Dongxh1     RN xg1
2060c1bc742181ded4930842b46e9507372f0b1b963James Dongxht     RN xg2
2070c1bc742181ded4930842b46e9507372f0b1b963James Dongxh2     RN xg3
2080c1bc742181ded4930842b46e9507372f0b1b963James Dongxh3     RN xgt
2090c1bc742181ded4930842b46e9507372f0b1b963James Dongxh4     RN xg4
2100c1bc742181ded4930842b46e9507372f0b1b963James Dongxh5     RN xg5
2110c1bc742181ded4930842b46e9507372f0b1b963James Dongxh6     RN xg6
2120c1bc742181ded4930842b46e9507372f0b1b963James Dongxh7     RN xg7
2130c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 3,4 allocation
2140c1bc742181ded4930842b46e9507372f0b1b963James Dongxit     RN xh0
2150c1bc742181ded4930842b46e9507372f0b1b963James Dongxi0     RN xh1
2160c1bc742181ded4930842b46e9507372f0b1b963James Dongxi1     RN xht
2170c1bc742181ded4930842b46e9507372f0b1b963James Dongxi2     RN xh2
2180c1bc742181ded4930842b46e9507372f0b1b963James Dongxi3     RN xh3
2190c1bc742181ded4930842b46e9507372f0b1b963James Dongxi4     RN xh4
2200c1bc742181ded4930842b46e9507372f0b1b963James Dongxi5     RN xh5
2210c1bc742181ded4930842b46e9507372f0b1b963James Dongxi6     RN xh6
2220c1bc742181ded4930842b46e9507372f0b1b963James Dongxi7     RN xh7
2230c1bc742181ded4930842b46e9507372f0b1b963James Dong
2240c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_STR   pDest,  ppDest
2250c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$stride"="s"
2260c1bc742181ded4930842b46e9507372f0b1b963James Dong            M_STR   Stride, pStride
2270c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
2280c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ADR   pDest,  pBlk
2290c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     csPiBy8, =0x30fc7642
2300c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     LoopRR2, =0x00005a82
2310c1bc742181ded4930842b46e9507372f0b1b963James Dong
2320c1bc742181ded4930842b46e9507372f0b1b963James Dongv6_idct_col$_F
2330c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Load even values
2340c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi4, [pSrc], #4  ;// j0
2350c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi5, [pSrc, #4*16-4]  ;// j4
2360c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi6, [pSrc, #2*16-4]  ;// j2
2370c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi7, [pSrc, #6*16-4]  ;// j6
2380c1bc742181ded4930842b46e9507372f0b1b963James Dong
2390c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Scale Even Values
2400c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s16" ;// 16x16 mul
2410c1bc742181ded4930842b46e9507372f0b1b963James DongSHIFT       SETA    12
2420c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi0, [pScale], #4
2430c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi1, [pScale, #4*16-4]
2440c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi2, [pScale, #2*16-4]
2450c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xit, #1<<(SHIFT-1)
2460c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi3, xi0, xi4, xit
2470c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi4, xi0, xi4, xit
2480c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi0, xi1, xi5, xit
2490c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi5, xi1, xi5, xit
2500c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi3, xi3, ASR #SHIFT
2510c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi4, xi3, xi4, LSL #(16-SHIFT)
2520c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi3, [pScale, #6*16-4]
2530c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi1, xi2, xi6, xit
2540c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi6, xi2, xi6, xit
2550c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi0, xi0, ASR #SHIFT
2560c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi5, xi0, xi5, LSL #(16-SHIFT)
2570c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi2, xi3, xi7, xit
2580c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi7, xi3, xi7, xit
2590c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi1, xi1, ASR #SHIFT
2600c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi6, xi1, xi6, LSL #(16-SHIFT)
2610c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi2, xi2, ASR #SHIFT
2620c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi7, xi2, xi7, LSL #(16-SHIFT)
2630c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
2640c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s32" ;// 32x16 mul
2650c1bc742181ded4930842b46e9507372f0b1b963James DongSHIFT       SETA    (12+8-16)
2660c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xit, #1<<(SHIFT-1)
2670c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi0, [pScale], #8
2680c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi1, [pScale, #0*32+4-8]
2690c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi2, [pScale, #4*32-8]
2700c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi3, [pScale, #4*32+4-8]
2710c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi0, xi0, xi4, xit
2720c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi1, xi1, xi4, xit
2730c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi2, xi2, xi5, xit
2740c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi3, xi3, xi5, xit
2750c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi0, xi0, ASR #SHIFT
2760c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi4, xi0, xi1, LSL #(16-SHIFT)
2770c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi2, xi2, ASR #SHIFT
2780c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi5, xi2, xi3, LSL #(16-SHIFT)
2790c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi0, [pScale, #2*32-8]
2800c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi1, [pScale, #2*32+4-8]
2810c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi2, [pScale, #6*32-8]
2820c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi3, [pScale, #6*32+4-8]
2830c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi0, xi0, xi6, xit
2840c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi1, xi1, xi6, xit
2850c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi2, xi2, xi7, xit
2860c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi3, xi3, xi7, xit
2870c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi0, xi0, ASR #SHIFT
2880c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi6, xi0, xi1, LSL #(16-SHIFT)
2890c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi2, xi2, ASR #SHIFT
2900c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi7, xi2, xi3, LSL #(16-SHIFT)
2910c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
2920c1bc742181ded4930842b46e9507372f0b1b963James Dong
2930c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Load odd values
2940c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi0, [pSrc, #1*16-4]      ;// j1
2950c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi1, [pSrc, #7*16-4]      ;// j7
2960c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi2, [pSrc, #5*16-4]      ;// j5
2970c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi3, [pSrc, #3*16-4]      ;// j3
2980c1bc742181ded4930842b46e9507372f0b1b963James Dong
2990c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF  {TRUE}
3000c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// shortcut if odd values 0
3010c1bc742181ded4930842b46e9507372f0b1b963James Dong            TEQ     xi0, #0
3020c1bc742181ded4930842b46e9507372f0b1b963James Dong            TEQEQ   xi1, #0
3030c1bc742181ded4930842b46e9507372f0b1b963James Dong            TEQEQ   xi2, #0
3040c1bc742181ded4930842b46e9507372f0b1b963James Dong            TEQEQ   xi3, #0
3050c1bc742181ded4930842b46e9507372f0b1b963James Dong            BEQ     v6OddZero$_F
3060c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
3070c1bc742181ded4930842b46e9507372f0b1b963James Dong
3080c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Store scaled even values
3090c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pDest, {xi4, xi5, xi6, xi7}
3100c1bc742181ded4930842b46e9507372f0b1b963James Dong
3110c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Scale odd values
3120c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s16"
3130c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Perform AAN Scale
3140c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi4, [pScale, #1*16-4]
3150c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi5, [pScale, #7*16-4]
3160c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi6, [pScale, #5*16-4]
3170c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi7, xi0, xi4, xit
3180c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi0, xi0, xi4, xit
3190c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi4, xi1, xi5, xit
3200c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi1, xi1, xi5, xit
3210c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi7, xi7, ASR #SHIFT
3220c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi0, xi7, xi0, LSL #(16-SHIFT)
3230c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi7, [pScale, #3*16-4]
3240c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi5, xi2, xi6, xit
3250c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi2, xi2, xi6, xit
3260c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi4, xi4, ASR #SHIFT
3270c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi1, xi4, xi1, LSL #(16-SHIFT)
3280c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi6, xi3, xi7, xit
3290c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi3, xi3, xi7, xit
3300c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi5, xi5, ASR #SHIFT
3310c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi2, xi5, xi2, LSL #(16-SHIFT)
3320c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi6, xi6, ASR #SHIFT
3330c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi3, xi6, xi3, LSL #(16-SHIFT)
3340c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
3350c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s32" ;// 32x16 mul
3360c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi4, [pScale, #1*32-8]
3370c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi5, [pScale, #1*32+4-8]
3380c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi6, [pScale, #7*32-8]
3390c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi7, [pScale, #7*32+4-8]
3400c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi4, xi4, xi0, xit
3410c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi5, xi5, xi0, xit
3420c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi6, xi6, xi1, xit
3430c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi7, xi7, xi1, xit
3440c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi4, xi4, ASR #SHIFT
3450c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi0, xi4, xi5, LSL #(16-SHIFT)
3460c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi6, xi6, ASR #SHIFT
3470c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi1, xi6, xi7, LSL #(16-SHIFT)
3480c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi4, [pScale, #5*32-8]
3490c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi5, [pScale, #5*32+4-8]
3500c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi6, [pScale, #3*32-8]
3510c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi7, [pScale, #3*32+4-8]
3520c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi4, xi4, xi2, xit
3530c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi5, xi5, xi2, xit
3540c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi6, xi6, xi3, xit
3550c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi7, xi7, xi3, xit
3560c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi4, xi4, ASR #SHIFT
3570c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi2, xi4, xi5, LSL #(16-SHIFT)
3580c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi6, xi6, ASR #SHIFT
3590c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi3, xi6, xi7, LSL #(16-SHIFT)
3600c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
3610c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Column pass, general path (odd-index terms present).
        ;// Every working register holds two signed 16-bit lanes, so one trip
        ;// through this code handles two lines at once: line "a" lives in the
        ;// bottom halfwords and line "b" in the top halfwords (see the
        ;// transpose at the end, which separates them).
        ;// Here xi0..xi3 carry the odd terms j1, j7, j5, j3 respectively.
3620c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
3630c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi6, xi0, xi1           ;// j1-j7
3640c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
3650c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi4, xi2, xi3           ;// j5-j3
3660c1bc742181ded4930842b46e9507372f0b1b963James Dong
3670c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
3680c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Regroup by line instead of by term: xi0 = line a's (j1-j7, j5-j3)
        ;// pair, xi1 = line b's pair, ready for the dual 16x16 multiplies.
3690c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
3700c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
3710c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Rotation by the packed cos/sin constant in csPiBy8; each result is
        ;// a full 32-bit sum/difference of two products.
3720c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
3730c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
3740c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
3750c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
3760c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Multiply each lane of (i5-i7)/2 by the 16-bit constant kept in the
        ;// bottom halfword of LoopRR2 (its top bits are the loop counter, see
        ;// the ADDS further down). xi1 = line a product, xi3 = line b product.
3770c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  xi1, xi3, LoopRR2
3780c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  xi3, xi3, LoopRR2
3790c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Keep the high 16 bits of each 32-bit rotation result and re-pack
        ;// as [line b : line a].
3800c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
3810c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
3820c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
3830c1bc742181ded4930842b46e9507372f0b1b963James Dong
3840c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// xi0,xi2 now free; xi1,xi3 become free once h5 is packed below
3850c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3, rows 2to3 x1/2
3860c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Shifting xi3 left by one lets a single PKHTB with ASR#15 take
        ;// bits [30:15] of both products, i.e. (product >> 15) in each lane.
3870c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     xi3, xi3, LSL #1
3880c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
        ;// LDRD fills a register pair; the second word is consumed as xi1
        ;// below, so xi1 is presumably xi0's LDRD partner - confirm against
        ;// the register assignments earlier in the file.
3890c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRD    xi0, [pDest, #8]        ;// j2,j6 scaled
3900c1bc742181ded4930842b46e9507372f0b1b963James Dong
3910c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2, rows4to7
        ;// Chained differences: g6 = h6-h7, g5 = h5-g6, g4 = h4-g5.
3920c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg6, xh6, xh7
3930c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg5, xh5, xg6
3940c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg4, xh4, xg5
3950c1bc742181ded4930842b46e9507372f0b1b963James Dong
3960c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
3970c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
3980c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Same constant multiply and >>15 re-pack as for h5 above, applied
        ;// to (j2-j6).
3990c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  xi0, xi2, LoopRR2
4000c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  xi2, xi2, LoopRR2
4010c1bc742181ded4930842b46e9507372f0b1b963James Dong
4020c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     xi2, xi2, LSL #1
4030c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
4040c1bc742181ded4930842b46e9507372f0b1b963James Dong
4050c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// xi0, xi1 now free
4060c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3 rows 0to1 x 1/2
4070c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRD    xi0, [pDest]            ;// j0, j4 scaled
4080c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xh2, xh2, xi3
        ;// Loop counter: the top three bits of LoopRR2 advance by 2 per trip;
        ;// the carry out of bit 31 ends the loop at the BCC below. None of
        ;// the instructions in between writes the carry flag.
4090c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
4100c1bc742181ded4930842b46e9507372f0b1b963James Dong
4110c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xh0, xi0, xi1
4120c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xh1, xi0, xi1
4130c1bc742181ded4930842b46e9507372f0b1b963James Dong
4140c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2 rows 0to3 x 1/2
        ;// NOTE(review): xh3 is not written in this span - presumably it is
        ;// the same register as xi3, which holds (j2+j6)/2; confirm.
4150c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg2, xh1, xh2
4160c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg1, xh1, xh2
4170c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg3, xh0, xh3
4180c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg0, xh0, xh3
4190c1bc742181ded4930842b46e9507372f0b1b963James Dong
4200c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 1 all rows
        ;// Final butterflies: f(k) = g(k) + g(7-k), f(7-k) = g(k) - g(7-k).
        ;// NOTE(review): xg7 is not written in this span - presumably it is
        ;// the same register as xh7; confirm.
4210c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16  xf3, xg3, xg4
4220c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xf4, xg3, xg4
4230c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16  xf2, xg2, xg5
4240c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xf5, xg2, xg5
4250c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16  xf1, xg1, xg6
4260c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xf6, xg1, xg6
4270c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16  xf0, xg0, xg7
4280c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xf7, xg0, xg7
4290c1bc742181ded4930842b46e9507372f0b1b963James Dong
4300c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Transpose, store and loop
        ;// ra* collect the bottom halfwords (line a), rb* the top halfwords
        ;// (line b); each STMIA writes one 16-byte line of eight values and
        ;// advances pDest, so a trip stores 32 bytes.
4310c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra01, xf0, xf1, LSL #16
4320c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb01, xf1, xf0, ASR #16
4330c1bc742181ded4930842b46e9507372f0b1b963James Dong
4340c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra23, xf2, xf3, LSL #16
4350c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb23, xf3, xf2, ASR #16
4360c1bc742181ded4930842b46e9507372f0b1b963James Dong
4370c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra45, xf4, xf5, LSL #16
4380c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb45, xf5, xf4, ASR #16
4390c1bc742181ded4930842b46e9507372f0b1b963James Dong
4400c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra67, xf6, xf7, LSL #16
4410c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pDest!, {ra01, ra23, ra45, ra67}
4420c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb67, xf7, xf6, ASR #16
4430c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pDest!, {rb01, rb23, rb45, rb67}
4440c1bc742181ded4930842b46e9507372f0b1b963James Dong        BCC     v6_idct_col$_F
4450c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Column pass finished: point pSrc 64*2 bytes back from the final
        ;// pDest (the start of the data just written, assuming the loop made
        ;// four 32-byte trips - the counter's start value is set outside this
        ;// span), then reload pDest from ppDest via the M_LDR macro
        ;// (defined elsewhere). When the stride is a run-time value ("s"),
        ;// pScale is reused to hold it for the row pass.
4460c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUB     pSrc, pDest, #(64*2)
4470c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR   pDest, ppDest
4480c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$stride"="s"
4490c1bc742181ded4930842b46e9507372f0b1b963James Dong            M_LDR   pScale, pStride
4500c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
4510c1bc742181ded4930842b46e9507372f0b1b963James Dong        B       v6_idct_row$_F
4520c1bc742181ded4930842b46e9507372f0b1b963James Dong
4530c1bc742181ded4930842b46e9507372f0b1b963James Dongv6OddZero$_F
        ;// Column-pass shortcut that uses only the even terms: j2,j6 arrive in
        ;// xi6,xi7 and the stage-0/4 inputs in xi4,xi5 (the counterparts of
        ;// the "j0, j4 scaled" pair in the general path). xg4..xg7 are never
        ;// read here. Presumably entered when the odd-index terms are all
        ;// zero, as the label suggests - the test itself is outside this span.
        ;// As in the general path, each register holds two 16-bit lanes
        ;// (line a = bottom halfword, line b = top halfword).
4540c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi2, xi6, xi7           ;// (j2-j6)
4550c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi3, xi6, xi7           ;// (j2+j6)/2
4560c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Multiply both lanes of (j2-j6) by the 16-bit constant in the bottom
        ;// halfword of LoopRR2, then pack (product >> 15) of each lane.
4570c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  xi0, xi2, LoopRR2
4580c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  xi2, xi2, LoopRR2
4590c1bc742181ded4930842b46e9507372f0b1b963James Dong
4600c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     xi2, xi2, LSL #1
4610c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
4620c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xh2, xh2, xi3
4630c1bc742181ded4930842b46e9507372f0b1b963James Dong
4640c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// xi0, xi1 now free
4650c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3 rows 0to1 x 1/2
4660c1bc742181ded4930842b46e9507372f0b1b963James Dong
4670c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xh0, xi4, xi5
4680c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xh1, xi4, xi5
4690c1bc742181ded4930842b46e9507372f0b1b963James Dong
4700c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2 rows 0to3 x 1/2
        ;// NOTE(review): xh3 is not written in this span - presumably it is
        ;// the same register as xi3, which holds (j2+j6)/2; confirm.
4710c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg2, xh1, xh2
4720c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg1, xh1, xh2
4730c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg3, xh0, xh3
4740c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg0, xh0, xh3
4750c1bc742181ded4930842b46e9507372f0b1b963James Dong
4760c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 1 all rows
        ;// The general path computes f(k) = g(k) + g(7-k) and
        ;// f(7-k) = g(k) - g(7-k); here the g4..g7 contribution is dropped,
        ;// so each sum/difference pair is just two copies of g0..g3.
4770c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf3, xg3
4780c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf4, xg3
4790c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf2, xg2
4800c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf5, xg2
4810c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf1, xg1
4820c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf6, xg1
4830c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf0, xg0
4840c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf7, xg0
4850c1bc742181ded4930842b46e9507372f0b1b963James Dong
4860c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Transpose
        ;// ra* gather the bottom halfwords (line a), rb* the top halfwords
        ;// (line b).
4870c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra01, xf0, xf1, LSL #16
4880c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb01, xf1, xf0, ASR #16
4890c1bc742181ded4930842b46e9507372f0b1b963James Dong
4900c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra23, xf2, xf3, LSL #16
4910c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb23, xf3, xf2, ASR #16
4920c1bc742181ded4930842b46e9507372f0b1b963James Dong
4930c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra45, xf4, xf5, LSL #16
4940c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb45, xf5, xf4, ASR #16
4950c1bc742181ded4930842b46e9507372f0b1b963James Dong
4960c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra67, xf6, xf7, LSL #16
4970c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb67, xf7, xf6, ASR #16
4980c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Store both 16-byte lines (pDest advances 32 bytes) and step the
        ;// loop counter held in the top three bits of LoopRR2; the carry out
        ;// of bit 31 ends the column loop. The second STMIA does not change
        ;// the flags, so the BCC still tests the ADDS result.
4990c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pDest!, {ra01, ra23, ra45, ra67}
5000c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
5010c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pDest!, {rb01, rb23, rb45, rb67}
5020c1bc742181ded4930842b46e9507372f0b1b963James Dong
5030c1bc742181ded4930842b46e9507372f0b1b963James Dong        BCC     v6_idct_col$_F
        ;// Same hand-over to the row pass as at the end of the general path:
        ;// pSrc = pDest - 64*2 bytes, pDest reloaded from ppDest, and pScale
        ;// loaded with the stride when it is a run-time value ("s").
        ;// Execution then falls through into the label that follows.
5040c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUB     pSrc, pDest, #(64*2)
5050c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR   pDest, ppDest
5060c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$stride"="s"
5070c1bc742181ded4930842b46e9507372f0b1b963James Dong            M_LDR   pScale, pStride
5080c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
5090c1bc742181ded4930842b46e9507372f0b1b963James Dong
5100c1bc742181ded4930842b46e9507372f0b1b963James Dong
5110c1bc742181ded4930842b46e9507372f0b1b963James Dongv6_idct_row$_F
        ;// Row pass. Reads the 16-bit intermediate data at pSrc (term k of
        ;// the current pair of lines is the word at byte offset k*16) and
        ;// writes two finished output lines per trip at pDest. Registers hold
        ;// two 16-bit lanes: line a in the bottom halfwords, line b in the
        ;// top halfwords. The loop counter is the top three bits of LoopRR2.
5120c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3, rows4to7 x1/4
5130c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xit, =0x00010001        ;// rounding constant
5140c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi0, [pSrc, #1*16]      ;// j1
5150c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi1, [pSrc, #7*16]      ;// 4*j7
5160c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi2, [pSrc, #5*16]      ;// j5
5170c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi3, [pSrc, #3*16]      ;// j3
5180c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Two rounding halvings, (x+1)>>1 per lane each, bring 4*j7 down
        ;// to j7.
5190c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi1, xi1, xit           ;// 2*j7
5200c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi1, xi1, xit           ;// j7
5210c1bc742181ded4930842b46e9507372f0b1b963James Dong
5220c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
5230c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi6, xi0, xi1           ;// j1-j7
5240c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
5250c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi4, xi2, xi3           ;// j5-j3
5260c1bc742181ded4930842b46e9507372f0b1b963James Dong
5270c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
5280c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Regroup by line: xi0 = line a's (j1-j7, j5-j3) pair, xi1 = line b's.
5290c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
5300c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
5310c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Rotation by the packed cos/sin constant in csPiBy8 (32-bit results).
5320c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
5330c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
5340c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
5350c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
5360c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// (i5-i7)/2 times the 16-bit constant in LoopRR2's bottom halfword.
5370c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  xi1, xi3, LoopRR2
5380c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  xi3, xi3, LoopRR2
5390c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Keep the high 16 bits of each rotation result, packed [b : a].
5400c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
5410c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
5420c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
5430c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// LSL #1 followed by ASR#15 packs (product >> 15) for both lanes.
5440c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     xi3, xi3, LSL #1
5450c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
5460c1bc742181ded4930842b46e9507372f0b1b963James Dong
5470c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// xi0,xi1,xi2,xi3 now free
5480c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3, rows 2to3 x1/2
5490c1bc742181ded4930842b46e9507372f0b1b963James Dong
5500c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi0, [pSrc, #2*16]      ;// j2
5510c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi1, [pSrc, #6*16]      ;// 2*j6
5520c1bc742181ded4930842b46e9507372f0b1b963James Dong
5530c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2, rows4to7
        ;// Chained differences: g6 = h6-h7, g5 = h5-g6, g4 = h4-g5.
5540c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg6, xh6, xh7
5550c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg5, xh5, xg6
5560c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg4, xh4, xg5
5570c1bc742181ded4930842b46e9507372f0b1b963James Dong
5580c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi1, xi1, xit           ;// j6
5590c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
5600c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
5610c1bc742181ded4930842b46e9507372f0b1b963James Dong
5620c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  xi0, xi2, LoopRR2
5630c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  xi2, xi2, LoopRR2
5640c1bc742181ded4930842b46e9507372f0b1b963James Dong
5650c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     xi2, xi2, LSL #1
5660c1bc742181ded4930842b46e9507372f0b1b963James Dong
5670c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
5680c1bc742181ded4930842b46e9507372f0b1b963James Dong
5690c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// xi0, xi1 now free
5700c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3 rows 0to1 x 1/2
        ;// The j0 load post-increments pSrc by 4 bytes, i.e. on to the next
        ;// pair of 16-bit lanes, so it must come after all other loads.
5710c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi1, [pSrc, #4*16]      ;// j4
5720c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi0, [pSrc], #4         ;// j0
5730c1bc742181ded4930842b46e9507372f0b1b963James Dong
5740c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xh2, xh2, xi3
        ;// Step the loop counter; the carry out of bit 31 is tested by the
        ;// BCC at the very end. No instruction in between writes the carry flag.
5750c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
5760c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Adds 4 to each lane (0x00040004) ahead of the three halving stages
        ;// that follow. NOTE(review): this is a plain 32-bit ADD, not a
        ;// per-lane add, so a bottom-lane value of -4..-1 carries 1 into the
        ;// top lane - confirm that this is acceptable for the required accuracy.
5770c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD     xi0, xi0, xit, LSL #2   ;// ensure correct round
5780c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xh0, xi0, xi1           ;// of DC result
5790c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xh1, xi0, xi1
5800c1bc742181ded4930842b46e9507372f0b1b963James Dong
5810c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2 rows 0to3 x 1/2
        ;// NOTE(review): xh3 is not written in this span - presumably it is
        ;// the same register as xi3, which holds (j2+j6)/2; confirm.
5820c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg2, xh1, xh2
5830c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg1, xh1, xh2
5840c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg3, xh0, xh3
5850c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg0, xh0, xh3
5860c1bc742181ded4930842b46e9507372f0b1b963James Dong
5870c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 1 all rows
        ;// Halving butterflies (the column pass uses the non-halving forms):
        ;// f(k) = (g(k) + g(7-k))/2, f(7-k) = (g(k) - g(7-k))/2.
        ;// NOTE(review): xg7 is not written in this span - presumably it is
        ;// the same register as xh7; confirm.
5880c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xf3, xg3, xg4
5890c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xf4, xg3, xg4
5900c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xf2, xg2, xg5
5910c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xf5, xg2, xg5
5920c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xf1, xg1, xg6
5930c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xf6, xg1, xg6
5940c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xf0, xg0, xg7
5950c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xf7, xg0, xg7
5960c1bc742181ded4930842b46e9507372f0b1b963James Dong
5970c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Saturate
        ;// "u8": clamp each lane to 0..255. "s9": clamp each lane to
        ;// -256..255. Any other $outsize (e.g. "s16") is left unclamped.
5980c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="u8")
5990c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf0, #8, xf0
6000c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf1, #8, xf1
6010c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf2, #8, xf2
6020c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf3, #8, xf3
6030c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf4, #8, xf4
6040c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf5, #8, xf5
6050c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf6, #8, xf6
6060c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf7, #8, xf7
6070c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
6080c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="s9")
6090c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf0, #9, xf0
6100c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf1, #9, xf1
6110c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf2, #9, xf2
6120c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf3, #9, xf3
6130c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf4, #9, xf4
6140c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf5, #9, xf5
6150c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf6, #9, xf6
6160c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf7, #9, xf7
6170c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
6180c1bc742181ded4930842b46e9507372f0b1b963James Dong
6190c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Transpose to Row, Pack and store
        ;// "u8" output: after USAT16 the high byte of every lane is zero, so
        ;// OR-ing in the odd register shifted left by 8 interleaves the bytes
        ;// without overlap. The PKH pairs then split line a (bottom
        ;// halfwords) from line b (top halfwords); in this mode ra01 holds
        ;// pixels 0..3 and ra23 pixels 4..7 of line a (likewise rb01/rb23),
        ;// giving 8 bytes per line.
6200c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="u8")
6210c1bc742181ded4930842b46e9507372f0b1b963James Dong            ORR     xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
6220c1bc742181ded4930842b46e9507372f0b1b963James Dong            ORR     xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
6230c1bc742181ded4930842b46e9507372f0b1b963James Dong            ORR     xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
6240c1bc742181ded4930842b46e9507372f0b1b963James Dong            ORR     xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
6250c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra01, xf0, xf2, LSL #16
6260c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb01, xf2, xf0, ASR #16
6270c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra23, xf4, xf6, LSL #16
6280c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb23, xf6, xf4, ASR #16
6290c1bc742181ded4930842b46e9507372f0b1b963James Dong            STMIA   pDest, {ra01, ra23}
            ;// Advance one output line between and after the two stores:
            ;// by the run-time stride held in pScale when $stride is "s",
            ;// otherwise by the assembly-time constant $stride.
6300c1bc742181ded4930842b46e9507372f0b1b963James Dong            IF "$stride"="s"
6310c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, pScale
6320c1bc742181ded4930842b46e9507372f0b1b963James Dong                STMIA   pDest, {rb01, rb23}
6330c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, pScale
6340c1bc742181ded4930842b46e9507372f0b1b963James Dong            ELSE
6350c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, #($stride)
6360c1bc742181ded4930842b46e9507372f0b1b963James Dong                STMIA   pDest, {rb01, rb23}
6370c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, #($stride)
6380c1bc742181ded4930842b46e9507372f0b1b963James Dong            ENDIF
6390c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
        ;// 16-bit output ("s9" or "s16"): same transpose as the column pass,
        ;// eight halfwords (16 bytes) per line, with the same stride handling.
6400c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="s9"):LOR:("$outsize"="s16")
6410c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra01, xf0, xf1, LSL #16
6420c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb01, xf1, xf0, ASR #16
6430c1bc742181ded4930842b46e9507372f0b1b963James Dong
6440c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra23, xf2, xf3, LSL #16
6450c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb23, xf3, xf2, ASR #16
6460c1bc742181ded4930842b46e9507372f0b1b963James Dong
6470c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra45, xf4, xf5, LSL #16
6480c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb45, xf5, xf4, ASR #16
6490c1bc742181ded4930842b46e9507372f0b1b963James Dong
6500c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra67, xf6, xf7, LSL #16
6510c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb67, xf7, xf6, ASR #16
6520c1bc742181ded4930842b46e9507372f0b1b963James Dong
6530c1bc742181ded4930842b46e9507372f0b1b963James Dong            STMIA   pDest, {ra01, ra23, ra45, ra67}
6540c1bc742181ded4930842b46e9507372f0b1b963James Dong            IF "$stride"="s"
6550c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, pScale
6560c1bc742181ded4930842b46e9507372f0b1b963James Dong                STMIA   pDest, {rb01, rb23, rb45, rb67}
6570c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, pScale
6580c1bc742181ded4930842b46e9507372f0b1b963James Dong            ELSE
6590c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, #($stride)
6600c1bc742181ded4930842b46e9507372f0b1b963James Dong                STMIA   pDest, {rb01, rb23, rb45, rb67}
6610c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, #($stride)
6620c1bc742181ded4930842b46e9507372f0b1b963James Dong            ENDIF
6630c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
6640c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Repeat until the ADDS on LoopRR2 above produced a carry.
6650c1bc742181ded4930842b46e9507372f0b1b963James Dong        BCC     v6_idct_row$_F
6660c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF ;// ARM1136JS
6670c1bc742181ded4930842b46e9507372f0b1b963James Dong
6680c1bc742181ded4930842b46e9507372f0b1b963James Dong
6690c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF CortexA8
6700c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// NEON register naming for the CortexA8 build.
        ;// Src0..Src7 and Tmp are register numbers (7..15) used by the QN/DN
        ;// definitions below.
6710c1bc742181ded4930842b46e9507372f0b1b963James DongSrc0            EQU  7
6720c1bc742181ded4930842b46e9507372f0b1b963James DongSrc1            EQU  8
6730c1bc742181ded4930842b46e9507372f0b1b963James DongSrc2            EQU  9
6740c1bc742181ded4930842b46e9507372f0b1b963James DongSrc3            EQU  10
6750c1bc742181ded4930842b46e9507372f0b1b963James DongSrc4            EQU  11
6760c1bc742181ded4930842b46e9507372f0b1b963James DongSrc5            EQU  12
6770c1bc742181ded4930842b46e9507372f0b1b963James DongSrc6            EQU  13
6780c1bc742181ded4930842b46e9507372f0b1b963James DongSrc7            EQU  14
6790c1bc742181ded4930842b46e9507372f0b1b963James DongTmp             EQU  15
6800c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// "j" names: quadword registers Q7..Q15 viewed as signed 16-bit lanes.
6810c1bc742181ded4930842b46e9507372f0b1b963James DongqXj0            QN Src0.S16
6820c1bc742181ded4930842b46e9507372f0b1b963James DongqXj1            QN Src1.S16
6830c1bc742181ded4930842b46e9507372f0b1b963James DongqXj2            QN Src2.S16
6840c1bc742181ded4930842b46e9507372f0b1b963James DongqXj3            QN Src3.S16
6850c1bc742181ded4930842b46e9507372f0b1b963James DongqXj4            QN Src4.S16
6860c1bc742181ded4930842b46e9507372f0b1b963James DongqXj5            QN Src5.S16
6870c1bc742181ded4930842b46e9507372f0b1b963James DongqXj6            QN Src6.S16
6880c1bc742181ded4930842b46e9507372f0b1b963James DongqXj7            QN Src7.S16
6890c1bc742181ded4930842b46e9507372f0b1b963James DongqXjt            QN Tmp.S16
6900c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Doubleword views of the same registers: D(2n) is the low half and
        ;// D(2n+1) the high half of Qn.
6910c1bc742181ded4930842b46e9507372f0b1b963James DongdXj0lo          DN (Src0*2).S16
6920c1bc742181ded4930842b46e9507372f0b1b963James DongdXj0hi          DN (Src0*2+1).S16
6930c1bc742181ded4930842b46e9507372f0b1b963James DongdXj1lo          DN (Src1*2).S16
6940c1bc742181ded4930842b46e9507372f0b1b963James DongdXj1hi          DN (Src1*2+1).S16
6950c1bc742181ded4930842b46e9507372f0b1b963James DongdXj2lo          DN (Src2*2).S16
6960c1bc742181ded4930842b46e9507372f0b1b963James DongdXj2hi          DN (Src2*2+1).S16
6970c1bc742181ded4930842b46e9507372f0b1b963James DongdXj3lo          DN (Src3*2).S16
6980c1bc742181ded4930842b46e9507372f0b1b963James DongdXj3hi          DN (Src3*2+1).S16
6990c1bc742181ded4930842b46e9507372f0b1b963James DongdXj4lo          DN (Src4*2).S16
7000c1bc742181ded4930842b46e9507372f0b1b963James DongdXj4hi          DN (Src4*2+1).S16
7010c1bc742181ded4930842b46e9507372f0b1b963James DongdXj5lo          DN (Src5*2).S16
7020c1bc742181ded4930842b46e9507372f0b1b963James DongdXj5hi          DN (Src5*2+1).S16
7030c1bc742181ded4930842b46e9507372f0b1b963James DongdXj6lo          DN (Src6*2).S16
7040c1bc742181ded4930842b46e9507372f0b1b963James DongdXj6hi          DN (Src6*2+1).S16
7050c1bc742181ded4930842b46e9507372f0b1b963James DongdXj7lo          DN (Src7*2).S16
7060c1bc742181ded4930842b46e9507372f0b1b963James DongdXj7hi          DN (Src7*2+1).S16
7070c1bc742181ded4930842b46e9507372f0b1b963James DongdXjtlo          DN (Tmp*2).S16
7080c1bc742181ded4930842b46e9507372f0b1b963James DongdXjthi          DN (Tmp*2+1).S16
7090c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// "i" names: each is an alias of one "j" register (a permutation of
        ;// the nine registers, with the "j" temporary taken over by i5 and
        ;// j3's register becoming the "i" temporary). Presumably one name set
        ;// per IDCT stage, mirroring the xi/xh/xg naming of the ARM1136JS
        ;// code - the NEON code that uses them is outside this span.
7100c1bc742181ded4930842b46e9507372f0b1b963James DongqXi0            QN qXj0
7110c1bc742181ded4930842b46e9507372f0b1b963James DongqXi1            QN qXj4
7120c1bc742181ded4930842b46e9507372f0b1b963James DongqXi2            QN qXj2
7130c1bc742181ded4930842b46e9507372f0b1b963James DongqXi3            QN qXj7
7140c1bc742181ded4930842b46e9507372f0b1b963James DongqXi4            QN qXj5
7150c1bc742181ded4930842b46e9507372f0b1b963James DongqXi5            QN qXjt
7160c1bc742181ded4930842b46e9507372f0b1b963James DongqXi6            QN qXj1
7170c1bc742181ded4930842b46e9507372f0b1b963James DongqXi7            QN qXj6
7180c1bc742181ded4930842b46e9507372f0b1b963James DongqXit            QN qXj3
7190c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Doubleword halves of the "i" names, following the same mapping.
7200c1bc742181ded4930842b46e9507372f0b1b963James DongdXi0lo          DN dXj0lo
7210c1bc742181ded4930842b46e9507372f0b1b963James DongdXi0hi          DN dXj0hi
7220c1bc742181ded4930842b46e9507372f0b1b963James DongdXi1lo          DN dXj4lo
7230c1bc742181ded4930842b46e9507372f0b1b963James DongdXi1hi          DN dXj4hi
7240c1bc742181ded4930842b46e9507372f0b1b963James DongdXi2lo          DN dXj2lo
7250c1bc742181ded4930842b46e9507372f0b1b963James DongdXi2hi          DN dXj2hi
7260c1bc742181ded4930842b46e9507372f0b1b963James DongdXi3lo          DN dXj7lo
7270c1bc742181ded4930842b46e9507372f0b1b963James DongdXi3hi          DN dXj7hi
7280c1bc742181ded4930842b46e9507372f0b1b963James DongdXi4lo          DN dXj5lo
7290c1bc742181ded4930842b46e9507372f0b1b963James DongdXi4hi          DN dXj5hi
7300c1bc742181ded4930842b46e9507372f0b1b963James DongdXi5lo          DN dXjtlo
7310c1bc742181ded4930842b46e9507372f0b1b963James DongdXi5hi          DN dXjthi
7320c1bc742181ded4930842b46e9507372f0b1b963James DongdXi6lo          DN dXj1lo
7330c1bc742181ded4930842b46e9507372f0b1b963James DongdXi6hi          DN dXj1hi
7340c1bc742181ded4930842b46e9507372f0b1b963James DongdXi7lo          DN dXj6lo
7350c1bc742181ded4930842b46e9507372f0b1b963James DongdXi7hi          DN dXj6hi
7360c1bc742181ded4930842b46e9507372f0b1b963James DongdXitlo          DN dXj3lo
7370c1bc742181ded4930842b46e9507372f0b1b963James DongdXithi          DN dXj3hi
7380c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// "h" names: aliases of the "i" registers (h0 takes over the "i"
        ;// temporary; i6's register becomes the "h" temporary).
7390c1bc742181ded4930842b46e9507372f0b1b963James DongqXh0            QN qXit
7400c1bc742181ded4930842b46e9507372f0b1b963James DongqXh1            QN qXi0
7410c1bc742181ded4930842b46e9507372f0b1b963James DongqXh2            QN qXi2
7420c1bc742181ded4930842b46e9507372f0b1b963James DongqXh3            QN qXi3
7430c1bc742181ded4930842b46e9507372f0b1b963James DongqXh4            QN qXi7
7440c1bc742181ded4930842b46e9507372f0b1b963James DongqXh5            QN qXi5
7450c1bc742181ded4930842b46e9507372f0b1b963James DongqXh6            QN qXi4
7460c1bc742181ded4930842b46e9507372f0b1b963James DongqXh7            QN qXi1
7470c1bc742181ded4930842b46e9507372f0b1b963James DongqXht            QN qXi6
7480c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Doubleword halves of the "h" names, following the same mapping.
7490c1bc742181ded4930842b46e9507372f0b1b963James DongdXh0lo          DN dXitlo
7500c1bc742181ded4930842b46e9507372f0b1b963James DongdXh0hi          DN dXithi
7510c1bc742181ded4930842b46e9507372f0b1b963James DongdXh1lo          DN dXi0lo
7520c1bc742181ded4930842b46e9507372f0b1b963James DongdXh1hi          DN dXi0hi
7530c1bc742181ded4930842b46e9507372f0b1b963James DongdXh2lo          DN dXi2lo
7540c1bc742181ded4930842b46e9507372f0b1b963James DongdXh2hi          DN dXi2hi
7550c1bc742181ded4930842b46e9507372f0b1b963James DongdXh3lo          DN dXi3lo
7560c1bc742181ded4930842b46e9507372f0b1b963James DongdXh3hi          DN dXi3hi
7570c1bc742181ded4930842b46e9507372f0b1b963James DongdXh4lo          DN dXi7lo
7580c1bc742181ded4930842b46e9507372f0b1b963James DongdXh4hi          DN dXi7hi
7590c1bc742181ded4930842b46e9507372f0b1b963James DongdXh5lo          DN dXi5lo
7600c1bc742181ded4930842b46e9507372f0b1b963James DongdXh5hi          DN dXi5hi
7610c1bc742181ded4930842b46e9507372f0b1b963James DongdXh6lo          DN dXi4lo
7620c1bc742181ded4930842b46e9507372f0b1b963James DongdXh6hi          DN dXi4hi
7630c1bc742181ded4930842b46e9507372f0b1b963James DongdXh7lo          DN dXi1lo
7640c1bc742181ded4930842b46e9507372f0b1b963James DongdXh7hi          DN dXi1hi
7650c1bc742181ded4930842b46e9507372f0b1b963James DongdXhtlo          DN dXi6lo
7660c1bc742181ded4930842b46e9507372f0b1b963James DongdXhthi          DN dXi6hi
7670c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// "g" names: aliases of the "h" registers (g1 takes over the "h"
        ;// temporary; h3's register becomes the "g" temporary). g4..g7 stay
        ;// in the registers of h4..h7. Only quadword names are defined for
        ;// this set in the lines visible here.
7680c1bc742181ded4930842b46e9507372f0b1b963James DongqXg0            QN qXh2
7690c1bc742181ded4930842b46e9507372f0b1b963James DongqXg1            QN qXht
7700c1bc742181ded4930842b46e9507372f0b1b963James DongqXg2            QN qXh1
7710c1bc742181ded4930842b46e9507372f0b1b963James DongqXg3            QN qXh0
7720c1bc742181ded4930842b46e9507372f0b1b963James DongqXg4            QN qXh4
7730c1bc742181ded4930842b46e9507372f0b1b963James DongqXg5            QN qXh5
7740c1bc742181ded4930842b46e9507372f0b1b963James DongqXg6            QN qXh6
7750c1bc742181ded4930842b46e9507372f0b1b963James DongqXg7            QN qXh7
7760c1bc742181ded4930842b46e9507372f0b1b963James DongqXgt            QN qXh3
7770c1bc742181ded4930842b46e9507372f0b1b963James Dong
7780c1bc742181ded4930842b46e9507372f0b1b963James DongqXf0            QN qXg6
7790c1bc742181ded4930842b46e9507372f0b1b963James DongqXf1            QN qXg5
7800c1bc742181ded4930842b46e9507372f0b1b963James DongqXf2            QN qXg4
7810c1bc742181ded4930842b46e9507372f0b1b963James DongqXf3            QN qXgt
7820c1bc742181ded4930842b46e9507372f0b1b963James DongqXf4            QN qXg3
7830c1bc742181ded4930842b46e9507372f0b1b963James DongqXf5            QN qXg2
7840c1bc742181ded4930842b46e9507372f0b1b963James DongqXf6            QN qXg1
7850c1bc742181ded4930842b46e9507372f0b1b963James DongqXf7            QN qXg0
7860c1bc742181ded4930842b46e9507372f0b1b963James DongqXft            QN qXg7
7870c1bc742181ded4930842b46e9507372f0b1b963James Dong
7880c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// 32-bit scratch registers, scale registers and coefficient lanes.
;// Several names share one register number and therefore cannot hold live
;// values at the same time: qXt0/qT0lo = Q1, qXt1/qT0hi = Q2,
;// qScalelo/qTemp0 = Q5, qScalehi/qTemp1 = Q6 (qScale1 below is also Q6).
7890c1bc742181ded4930842b46e9507372f0b1b963James DongqXt0            QN 1.S32
7900c1bc742181ded4930842b46e9507372f0b1b963James DongqXt1            QN 2.S32
7910c1bc742181ded4930842b46e9507372f0b1b963James DongqT0lo           QN 1.S32
7920c1bc742181ded4930842b46e9507372f0b1b963James DongqT0hi           QN 2.S32
7930c1bc742181ded4930842b46e9507372f0b1b963James DongqT1lo           QN 3.S32
7940c1bc742181ded4930842b46e9507372f0b1b963James DongqT1hi           QN 4.S32
7950c1bc742181ded4930842b46e9507372f0b1b963James DongqScalelo        QN 5.S32        ;// used to read post scale values
7960c1bc742181ded4930842b46e9507372f0b1b963James DongqScalehi        QN 6.S32
7970c1bc742181ded4930842b46e9507372f0b1b963James DongqTemp0          QN 5.S32
7980c1bc742181ded4930842b46e9507372f0b1b963James DongqTemp1          QN 6.S32
7990c1bc742181ded4930842b46e9507372f0b1b963James Dong
8000c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// Scale1/Scale2 are Q register numbers; the dScale names are the two D halves
;// (2n and 2n+1) of each of those Q registers, viewed as S16.
8010c1bc742181ded4930842b46e9507372f0b1b963James DongScale1          EQU 6
8020c1bc742181ded4930842b46e9507372f0b1b963James DongScale2          EQU 15
8030c1bc742181ded4930842b46e9507372f0b1b963James DongqScale1         QN Scale1.S16
8040c1bc742181ded4930842b46e9507372f0b1b963James DongqScale2         QN Scale2.S16
8050c1bc742181ded4930842b46e9507372f0b1b963James DongdScale1lo       DN (Scale1*2).S16
8060c1bc742181ded4930842b46e9507372f0b1b963James DongdScale1hi       DN (Scale1*2+1).S16
8070c1bc742181ded4930842b46e9507372f0b1b963James DongdScale2lo       DN (Scale2*2).S16
8080c1bc742181ded4930842b46e9507372f0b1b963James DongdScale2hi       DN (Scale2*2+1).S16
8090c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// D0 holds the multiplier constants; the three names below select single
;// S16 lanes of it for use as scalar operands.
8100c1bc742181ded4930842b46e9507372f0b1b963James DongdCoefs          DN 0.S16        ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
8110c1bc742181ded4930842b46e9507372f0b1b963James DongInvSqrt2        DN dCoefs[0]    ;// 1/sqrt(2) in Q15
8120c1bc742181ded4930842b46e9507372f0b1b963James DongS               DN dCoefs[1]    ;// Sin(PI/8) in Q15
8130c1bc742181ded4930842b46e9507372f0b1b963James DongC               DN dCoefs[2]    ;// Cos(PI/8) in Q15
8140c1bc742181ded4930842b46e9507372f0b1b963James Dong
8150c1bc742181ded4930842b46e9507372f0b1b963James DongpTemp           RN 12           ;// core register r12, used as an integer scratch
8160c1bc742181ded4930842b46e9507372f0b1b963James Dong
8170c1bc742181ded4930842b46e9507372f0b1b963James Dong
8180c1bc742181ded4930842b46e9507372f0b1b963James Dong        IMPORT  armCOMM_IDCTCoef
8190c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// First (X) pass: load the source block, prescale it, then run IStage 3, 2, 1.
;// The four VLD1s fill eight Q registers (128 bytes) from pSrc, each with a
;// 64-bit alignment hint and post-increment writeback.
;// Stage names alias one another's registers (see the QN/DN tables), so the
;// order of the statements below is significant.
8200c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        {qXj0,qXj1}, [pSrc @64]!
8210c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        {qXj2,qXj3}, [pSrc @64]!
8220c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        {qXj4,qXj5}, [pSrc @64]!
8230c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        {qXj6,qXj7}, [pSrc @64]!
8240c1bc742181ded4930842b46e9507372f0b1b963James Dong
8250c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Load PreScale and multiply with Src
8260c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4
8270c1bc742181ded4930842b46e9507372f0b1b963James Dong
8280c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s16"                         ;// 16X16 Mul
8290c1bc742181ded4930842b46e9507372f0b1b963James Dong            M_IDCT_PRESCALE16
8300c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
8310c1bc742181ded4930842b46e9507372f0b1b963James Dong
8320c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s32"                         ;// 32X32 Mul
8330c1bc742181ded4930842b46e9507372f0b1b963James Dong            M_IDCT_PRESCALE32
8340c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
8350c1bc742181ded4930842b46e9507372f0b1b963James Dong
8360c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 3
        ;// NOTE(review): the "/4" and "/2" in the comments on the i5/i7 lines do
        ;// not match the literal VHADD (halving) and VSUB (no halving);
        ;// presumably they track the accumulated scale factor -- confirm.
8370c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRDMULH     qXi2, qXi2, InvSqrt2            ;// i2/sqrt(2)
8380c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXh0, qXi0, qXi1                ;// (i0+i1)/2
8390c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qXh1, qXi0, qXi1                ;// (i0-i1)/2
8400c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXh7, qXi5, qXi7                ;// (i5+i7)/4
8410c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXh5, qXi5, qXi7                ;// (i5-i7)/2
8420c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRDMULH     qXh5, qXh5, InvSqrt2            ;// h5/sqrt(2)
8430c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXh2, qXi2, qXi3                ;// h2 = i2 - i3; h3 stays i3 (same register)
8440c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// h4: widen to 32 bits, accumulate both products, narrow with >>16
8450c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt0, dXi4lo, C                 ;// c*i4
8460c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLAL       qXt0, dXi6lo, S                 ;// c*i4+s*i6
8470c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt1, dXi4hi, C
8480c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLAL       qXt1, dXi6hi, S
8490c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dXh4lo, qXt0, #16               ;// h4
8500c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dXh4hi, qXt1, #16
8510c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// h6: same pattern with the subtracting accumulate
8520c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt0, dXi6lo, C                 ;// c*i6
8530c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLSL       qXt0, dXi4lo, S                 ;// c*i6 - s*i4
8540c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt1, dXi6hi, C
8550c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLSL       qXt1, dXi4hi, S
8560c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dXh6lo, qXt0, #16               ;// h6
8570c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dXh6hi, qXt1, #16
8580c1bc742181ded4930842b46e9507372f0b1b963James Dong
8590c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2
8600c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXg6, qXh6, qXh7        ;// g6 = h6 - h7
8610c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXg5, qXh5, qXg6        ;// g5 = h5 - g6
8620c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXg4, qXh4, qXg5        ;// g4 = h4 - g5
8630c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXg1, qXh1, qXh2        ;// (h1+h2)/2
8640c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qXg2, qXh1, qXh2        ;// (h1-h2)/2
8650c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXg0, qXh0, qXh3        ;// (h0+h3)/2
8660c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qXg3, qXh0, qXh3        ;// (h0-h3)/2
8670c1bc742181ded4930842b46e9507372f0b1b963James Dong
8680c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 1 all rows
8690c1bc742181ded4930842b46e9507372f0b1b963James Dong        VADD        qXf3, qXg3, qXg4        ;// f3 = g3 + g4
8700c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXf4, qXg3, qXg4        ;// f4 = g3 - g4
8710c1bc742181ded4930842b46e9507372f0b1b963James Dong        VADD        qXf2, qXg2, qXg5        ;// f2 = g2 + g5
8720c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXf5, qXg2, qXg5        ;// f5 = g2 - g5
8730c1bc742181ded4930842b46e9507372f0b1b963James Dong        VADD        qXf1, qXg1, qXg6        ;// f1 = g1 + g6
8740c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXf6, qXg1, qXg6        ;// f6 = g1 - g6
8750c1bc742181ded4930842b46e9507372f0b1b963James Dong        VADD        qXf0, qXg0, qXg7        ;// f0 = g0 + g7
8760c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXf7, qXg0, qXg7        ;// f7 = g0 - g7
8770c1bc742181ded4930842b46e9507372f0b1b963James Dong
8780c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Transpose, store and loop
;// Register numbers used by the first-pass transpose.
;// NOTE(review): XTR<n> appears to be the Q register number behind qXf<n>; the
;// Src<n>/Tmp and qXj definitions are outside this excerpt -- confirm.
8790c1bc742181ded4930842b46e9507372f0b1b963James DongXTR0            EQU Src5
8800c1bc742181ded4930842b46e9507372f0b1b963James DongXTR1            EQU Tmp
8810c1bc742181ded4930842b46e9507372f0b1b963James DongXTR2            EQU Src6
8820c1bc742181ded4930842b46e9507372f0b1b963James DongXTR3            EQU Src7
8830c1bc742181ded4930842b46e9507372f0b1b963James DongXTR4            EQU Src3
8840c1bc742181ded4930842b46e9507372f0b1b963James DongXTR5            EQU Src0
8850c1bc742181ded4930842b46e9507372f0b1b963James DongXTR6            EQU Src1
8860c1bc742181ded4930842b46e9507372f0b1b963James DongXTR7            EQU Src2
8870c1bc742181ded4930842b46e9507372f0b1b963James DongXTRt            EQU Src4
8880c1bc742181ded4930842b46e9507372f0b1b963James Dong
8890c1bc742181ded4930842b46e9507372f0b1b963James DongqA0             QN  XTR0.S32  ;// .S32 views of the same registers, for the transpose
8900c1bc742181ded4930842b46e9507372f0b1b963James DongqA1             QN  XTR1.S32
8910c1bc742181ded4930842b46e9507372f0b1b963James DongqA2             QN  XTR2.S32
8920c1bc742181ded4930842b46e9507372f0b1b963James DongqA3             QN  XTR3.S32
8930c1bc742181ded4930842b46e9507372f0b1b963James DongqA4             QN  XTR4.S32
8940c1bc742181ded4930842b46e9507372f0b1b963James DongqA5             QN  XTR5.S32
8950c1bc742181ded4930842b46e9507372f0b1b963James DongqA6             QN  XTR6.S32
8960c1bc742181ded4930842b46e9507372f0b1b963James DongqA7             QN  XTR7.S32
8970c1bc742181ded4930842b46e9507372f0b1b963James Dong
8980c1bc742181ded4930842b46e9507372f0b1b963James DongdB0             DN  XTR0*2+1      ;// upper D half (2n+1) of registers 0-3, for VSWP
8990c1bc742181ded4930842b46e9507372f0b1b963James DongdB1             DN  XTR1*2+1
9000c1bc742181ded4930842b46e9507372f0b1b963James DongdB2             DN  XTR2*2+1
9010c1bc742181ded4930842b46e9507372f0b1b963James DongdB3             DN  XTR3*2+1
9020c1bc742181ded4930842b46e9507372f0b1b963James DongdB4             DN  XTR4*2        ;// lower D half (2n) of registers 4-7, for VSWP
9030c1bc742181ded4930842b46e9507372f0b1b963James DongdB5             DN  XTR5*2
9040c1bc742181ded4930842b46e9507372f0b1b963James DongdB6             DN  XTR6*2
9050c1bc742181ded4930842b46e9507372f0b1b963James DongdB7             DN  XTR7*2
9060c1bc742181ded4930842b46e9507372f0b1b963James Dong
9070c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Transpose in three steps: VTRN on adjacent qXf pairs, VTRN on the
        ;// .S32 views two registers apart, then swap the upper D halves of
        ;// registers 0-3 with the lower D halves of registers 4-7.
9080c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qXf0, qXf1
9090c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qXf2, qXf3
9100c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qXf4, qXf5
9110c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qXf6, qXf7
9120c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qA0, qA2
9130c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qA1, qA3
9140c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qA4, qA6
9150c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qA5, qA7
9160c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dB0, dB4
9170c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dB1, dB5
9180c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dB2, dB6
9190c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dB3, dB7
9200c1bc742181ded4930842b46e9507372f0b1b963James Dong
9210c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// Second (Y) pass register names.
;// qYj<n> reuses the register of the first-pass output qXf<n>; the dYj<n>lo/hi
;// names are D(2*XTR<n>) and D(2*XTR<n>+1) viewed as S16. Stages i, h, g and f
;// are then defined as aliases of the previous stage, with the dY<stage> names
;// following the same permutation as the matching qY<stage> names.
9220c1bc742181ded4930842b46e9507372f0b1b963James DongqYj0            QN qXf0
9230c1bc742181ded4930842b46e9507372f0b1b963James DongqYj1            QN qXf1
9240c1bc742181ded4930842b46e9507372f0b1b963James DongqYj2            QN qXf2
9250c1bc742181ded4930842b46e9507372f0b1b963James DongqYj3            QN qXf3
9260c1bc742181ded4930842b46e9507372f0b1b963James DongqYj4            QN qXf4
9270c1bc742181ded4930842b46e9507372f0b1b963James DongqYj5            QN qXf5
9280c1bc742181ded4930842b46e9507372f0b1b963James DongqYj6            QN qXf6
9290c1bc742181ded4930842b46e9507372f0b1b963James DongqYj7            QN qXf7
9300c1bc742181ded4930842b46e9507372f0b1b963James DongqYjt            QN qXft
9310c1bc742181ded4930842b46e9507372f0b1b963James Dong
9320c1bc742181ded4930842b46e9507372f0b1b963James DongdYj0lo          DN (XTR0*2).S16
9330c1bc742181ded4930842b46e9507372f0b1b963James DongdYj0hi          DN (XTR0*2+1).S16
9340c1bc742181ded4930842b46e9507372f0b1b963James DongdYj1lo          DN (XTR1*2).S16
9350c1bc742181ded4930842b46e9507372f0b1b963James DongdYj1hi          DN (XTR1*2+1).S16
9360c1bc742181ded4930842b46e9507372f0b1b963James DongdYj2lo          DN (XTR2*2).S16
9370c1bc742181ded4930842b46e9507372f0b1b963James DongdYj2hi          DN (XTR2*2+1).S16
9380c1bc742181ded4930842b46e9507372f0b1b963James DongdYj3lo          DN (XTR3*2).S16
9390c1bc742181ded4930842b46e9507372f0b1b963James DongdYj3hi          DN (XTR3*2+1).S16
9400c1bc742181ded4930842b46e9507372f0b1b963James DongdYj4lo          DN (XTR4*2).S16
9410c1bc742181ded4930842b46e9507372f0b1b963James DongdYj4hi          DN (XTR4*2+1).S16
9420c1bc742181ded4930842b46e9507372f0b1b963James DongdYj5lo          DN (XTR5*2).S16
9430c1bc742181ded4930842b46e9507372f0b1b963James DongdYj5hi          DN (XTR5*2+1).S16
9440c1bc742181ded4930842b46e9507372f0b1b963James DongdYj6lo          DN (XTR6*2).S16
9450c1bc742181ded4930842b46e9507372f0b1b963James DongdYj6hi          DN (XTR6*2+1).S16
9460c1bc742181ded4930842b46e9507372f0b1b963James DongdYj7lo          DN (XTR7*2).S16
9470c1bc742181ded4930842b46e9507372f0b1b963James DongdYj7hi          DN (XTR7*2+1).S16
9480c1bc742181ded4930842b46e9507372f0b1b963James DongdYjtlo          DN (XTRt*2).S16
9490c1bc742181ded4930842b46e9507372f0b1b963James DongdYjthi          DN (XTRt*2+1).S16
9500c1bc742181ded4930842b46e9507372f0b1b963James Dong
9510c1bc742181ded4930842b46e9507372f0b1b963James DongqYi0            QN qYj0
9520c1bc742181ded4930842b46e9507372f0b1b963James DongqYi1            QN qYj4
9530c1bc742181ded4930842b46e9507372f0b1b963James DongqYi2            QN qYj2
9540c1bc742181ded4930842b46e9507372f0b1b963James DongqYi3            QN qYj7
9550c1bc742181ded4930842b46e9507372f0b1b963James DongqYi4            QN qYj5
9560c1bc742181ded4930842b46e9507372f0b1b963James DongqYi5            QN qYjt
9570c1bc742181ded4930842b46e9507372f0b1b963James DongqYi6            QN qYj1
9580c1bc742181ded4930842b46e9507372f0b1b963James DongqYi7            QN qYj6
9590c1bc742181ded4930842b46e9507372f0b1b963James DongqYit            QN qYj3
9600c1bc742181ded4930842b46e9507372f0b1b963James Dong
9610c1bc742181ded4930842b46e9507372f0b1b963James DongdYi0lo          DN dYj0lo
9620c1bc742181ded4930842b46e9507372f0b1b963James DongdYi0hi          DN dYj0hi
9630c1bc742181ded4930842b46e9507372f0b1b963James DongdYi1lo          DN dYj4lo
9640c1bc742181ded4930842b46e9507372f0b1b963James DongdYi1hi          DN dYj4hi
9650c1bc742181ded4930842b46e9507372f0b1b963James DongdYi2lo          DN dYj2lo
9660c1bc742181ded4930842b46e9507372f0b1b963James DongdYi2hi          DN dYj2hi
9670c1bc742181ded4930842b46e9507372f0b1b963James DongdYi3lo          DN dYj7lo
9680c1bc742181ded4930842b46e9507372f0b1b963James DongdYi3hi          DN dYj7hi
9690c1bc742181ded4930842b46e9507372f0b1b963James DongdYi4lo          DN dYj5lo
9700c1bc742181ded4930842b46e9507372f0b1b963James DongdYi4hi          DN dYj5hi
9710c1bc742181ded4930842b46e9507372f0b1b963James DongdYi5lo          DN dYjtlo
9720c1bc742181ded4930842b46e9507372f0b1b963James DongdYi5hi          DN dYjthi
9730c1bc742181ded4930842b46e9507372f0b1b963James DongdYi6lo          DN dYj1lo
9740c1bc742181ded4930842b46e9507372f0b1b963James DongdYi6hi          DN dYj1hi
9750c1bc742181ded4930842b46e9507372f0b1b963James DongdYi7lo          DN dYj6lo
9760c1bc742181ded4930842b46e9507372f0b1b963James DongdYi7hi          DN dYj6hi
9770c1bc742181ded4930842b46e9507372f0b1b963James DongdYitlo          DN dYj3lo
9780c1bc742181ded4930842b46e9507372f0b1b963James DongdYithi          DN dYj3hi
9790c1bc742181ded4930842b46e9507372f0b1b963James Dong
9800c1bc742181ded4930842b46e9507372f0b1b963James DongqYh0            QN qYit
9810c1bc742181ded4930842b46e9507372f0b1b963James DongqYh1            QN qYi0
9820c1bc742181ded4930842b46e9507372f0b1b963James DongqYh2            QN qYi2
9830c1bc742181ded4930842b46e9507372f0b1b963James DongqYh3            QN qYi3
9840c1bc742181ded4930842b46e9507372f0b1b963James DongqYh4            QN qYi7
9850c1bc742181ded4930842b46e9507372f0b1b963James DongqYh5            QN qYi5
9860c1bc742181ded4930842b46e9507372f0b1b963James DongqYh6            QN qYi4
9870c1bc742181ded4930842b46e9507372f0b1b963James DongqYh7            QN qYi1
9880c1bc742181ded4930842b46e9507372f0b1b963James DongqYht            QN qYi6
9890c1bc742181ded4930842b46e9507372f0b1b963James Dong
9900c1bc742181ded4930842b46e9507372f0b1b963James DongdYh0lo          DN dYitlo
9910c1bc742181ded4930842b46e9507372f0b1b963James DongdYh0hi          DN dYithi
9920c1bc742181ded4930842b46e9507372f0b1b963James DongdYh1lo          DN dYi0lo
9930c1bc742181ded4930842b46e9507372f0b1b963James DongdYh1hi          DN dYi0hi
9940c1bc742181ded4930842b46e9507372f0b1b963James DongdYh2lo          DN dYi2lo
9950c1bc742181ded4930842b46e9507372f0b1b963James DongdYh2hi          DN dYi2hi
9960c1bc742181ded4930842b46e9507372f0b1b963James DongdYh3lo          DN dYi3lo
9970c1bc742181ded4930842b46e9507372f0b1b963James DongdYh3hi          DN dYi3hi
9980c1bc742181ded4930842b46e9507372f0b1b963James DongdYh4lo          DN dYi7lo
9990c1bc742181ded4930842b46e9507372f0b1b963James DongdYh4hi          DN dYi7hi
10000c1bc742181ded4930842b46e9507372f0b1b963James DongdYh5lo          DN dYi5lo
10010c1bc742181ded4930842b46e9507372f0b1b963James DongdYh5hi          DN dYi5hi
10020c1bc742181ded4930842b46e9507372f0b1b963James DongdYh6lo          DN dYi4lo
10030c1bc742181ded4930842b46e9507372f0b1b963James DongdYh6hi          DN dYi4hi
10040c1bc742181ded4930842b46e9507372f0b1b963James DongdYh7lo          DN dYi1lo
10050c1bc742181ded4930842b46e9507372f0b1b963James DongdYh7hi          DN dYi1hi
10060c1bc742181ded4930842b46e9507372f0b1b963James DongdYhtlo          DN dYi6lo
10070c1bc742181ded4930842b46e9507372f0b1b963James DongdYhthi          DN dYi6hi
10080c1bc742181ded4930842b46e9507372f0b1b963James Dong
10090c1bc742181ded4930842b46e9507372f0b1b963James DongqYg0            QN qYh2
10100c1bc742181ded4930842b46e9507372f0b1b963James DongqYg1            QN qYht
10110c1bc742181ded4930842b46e9507372f0b1b963James DongqYg2            QN qYh1
10120c1bc742181ded4930842b46e9507372f0b1b963James DongqYg3            QN qYh0
10130c1bc742181ded4930842b46e9507372f0b1b963James DongqYg4            QN qYh4
10140c1bc742181ded4930842b46e9507372f0b1b963James DongqYg5            QN qYh5
10150c1bc742181ded4930842b46e9507372f0b1b963James DongqYg6            QN qYh6
10160c1bc742181ded4930842b46e9507372f0b1b963James DongqYg7            QN qYh7
10170c1bc742181ded4930842b46e9507372f0b1b963James DongqYgt            QN qYh3
10180c1bc742181ded4930842b46e9507372f0b1b963James Dong
10190c1bc742181ded4930842b46e9507372f0b1b963James DongqYf0            QN qYg6
10200c1bc742181ded4930842b46e9507372f0b1b963James DongqYf1            QN qYg5
10210c1bc742181ded4930842b46e9507372f0b1b963James DongqYf2            QN qYg4
10220c1bc742181ded4930842b46e9507372f0b1b963James DongqYf3            QN qYgt
10230c1bc742181ded4930842b46e9507372f0b1b963James DongqYf4            QN qYg3
10240c1bc742181ded4930842b46e9507372f0b1b963James DongqYf5            QN qYg2
10250c1bc742181ded4930842b46e9507372f0b1b963James DongqYf6            QN qYg1
10260c1bc742181ded4930842b46e9507372f0b1b963James DongqYf7            QN qYg0
10270c1bc742181ded4930842b46e9507372f0b1b963James DongqYft            QN qYg7
10280c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// Second (Y) pass arithmetic on the transposed first-pass output.
;// Stage names alias one another's registers (see the qY/dY tables), so the
;// order of the statements below is significant.
10290c1bc742181ded4930842b46e9507372f0b1b963James Dong        VRSHR       qYj7, qYj7, #2          ;// rounding shift right by 2
10300c1bc742181ded4930842b46e9507372f0b1b963James Dong        VRSHR       qYj6, qYj6, #1          ;// rounding shift right by 1
10310c1bc742181ded4930842b46e9507372f0b1b963James Dong
10320c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYi5, qYj1, qYj7        ;// i5 = (j1+j7)/2
10330c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYi6, qYj1, qYj7        ;// i6 = j1-j7
10340c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYi3, qYj2, qYj6        ;// i3 = (j2+j6)/2
10350c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYi2, qYj2, qYj6        ;// i2 = j2-j6
10360c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYi7, qYj5, qYj3        ;// i7 = (j5+j3)/2
10370c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYi4, qYj5, qYj3        ;// i4 = j5-j3
10380c1bc742181ded4930842b46e9507372f0b1b963James Dong
10390c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRDMULH     qYi2, qYi2, InvSqrt2    ;// i2/sqrt(2)
10400c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3 rows 0to1 x 1/2
10410c1bc742181ded4930842b46e9507372f0b1b963James Dong
10420c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV         pTemp, #0x4             ;// ensure correct round
10430c1bc742181ded4930842b46e9507372f0b1b963James Dong        VDUP        qScale1, pTemp           ;// of DC result
10440c1bc742181ded4930842b46e9507372f0b1b963James Dong        VADD        qYi0, qYi0, qScale1     ;// add 4 to every lane of i0
10450c1bc742181ded4930842b46e9507372f0b1b963James Dong
10460c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYh0, qYi0, qYi1        ;// (i0+i1)/2
10470c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qYh1, qYi0, qYi1        ;// (i0-i1)/2
10480c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// NOTE(review): the "/4" and "/2" in the next two comments do not match
        ;// the literal VHADD (halving) and VSUB (no halving); presumably they
        ;// track the accumulated scale factor -- confirm.
10490c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYh7, qYi5, qYi7        ;// (i5+i7)/4
10500c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYh5, qYi5, qYi7        ;// (i5-i7)/2
10510c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYh2, qYi2, qYi3        ;// h2 = i2 - i3; h3 stays i3 (same register)
10520c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRDMULH     qYh5, qYh5, InvSqrt2    ;// h5/sqrt(2)
10530c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// h4: widen to 32 bits, accumulate both products, narrow with >>16
10540c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt0, dYi4lo, C         ;// c*i4
10550c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLAL       qXt0, dYi6lo, S         ;// c*i4+s*i6
10560c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt1, dYi4hi, C
10570c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLAL       qXt1, dYi6hi, S
10580c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dYh4lo, qXt0, #16       ;// h4
10590c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dYh4hi, qXt1, #16
10600c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// h6: same pattern with the subtracting accumulate
10610c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt0, dYi6lo, C         ;// c*i6
10620c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLSL       qXt0, dYi4lo, S         ;// c*i6 - s*i4
10630c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt1, dYi6hi, C
10640c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLSL       qXt1, dYi4hi, S
10650c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dYh6lo, qXt0, #16       ;// h6
10660c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dYh6hi, qXt1, #16
10670c1bc742181ded4930842b46e9507372f0b1b963James Dong
10680c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYg6, qYh6, qYh7        ;// g6 = h6 - h7
10690c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYg5, qYh5, qYg6        ;// g5 = h5 - g6
10700c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYg4, qYh4, qYg5        ;// g4 = h4 - g5
10710c1bc742181ded4930842b46e9507372f0b1b963James Dong
10720c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2 rows 0to3 x 1/2
10730c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYg1, qYh1, qYh2        ;// (h1+h2)/2
10740c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qYg2, qYh1, qYh2        ;// (h1-h2)/2
10750c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYg0, qYh0, qYh3        ;// (h0+h3)/2
10760c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qYg3, qYh0, qYh3        ;// (h0-h3)/2
10770c1bc742181ded4930842b46e9507372f0b1b963James Dong
10780c1bc742181ded4930842b46e9507372f0b1b963James Dong
10790c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 1 all rows
        ;// Unlike the first pass, this stage uses the halving add/subtract forms.
10800c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD        qYf3, qYg3, qYg4       ;// f3 = (g3+g4)/2
10810c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB        qYf4, qYg3, qYg4       ;// f4 = (g3-g4)/2
10820c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD        qYf2, qYg2, qYg5       ;// f2 = (g2+g5)/2
10830c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB        qYf5, qYg2, qYg5       ;// f5 = (g2-g5)/2
10840c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD        qYf1, qYg1, qYg6       ;// f1 = (g1+g6)/2
10850c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB        qYf6, qYg1, qYg6       ;// f6 = (g1-g6)/2
10860c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD        qYf0, qYg0, qYg7       ;// f0 = (g0+g7)/2
10870c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB        qYf7, qYg0, qYg7       ;// f7 = (g0-g7)/2
10880c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// Register numbers used by the second-pass transpose.
;// NOTE(review): YTR<n> appears to be the Q register number behind qYf<n>; the
;// Src<n>/Tmp definitions are outside this excerpt -- confirm.
10890c1bc742181ded4930842b46e9507372f0b1b963James DongYTR0            EQU Src0
10900c1bc742181ded4930842b46e9507372f0b1b963James DongYTR1            EQU Src4
10910c1bc742181ded4930842b46e9507372f0b1b963James DongYTR2            EQU Src1
10920c1bc742181ded4930842b46e9507372f0b1b963James DongYTR3            EQU Src2
10930c1bc742181ded4930842b46e9507372f0b1b963James DongYTR4            EQU Src7
10940c1bc742181ded4930842b46e9507372f0b1b963James DongYTR5            EQU Src5
10950c1bc742181ded4930842b46e9507372f0b1b963James DongYTR6            EQU Tmp
10960c1bc742181ded4930842b46e9507372f0b1b963James DongYTR7            EQU Src6
10970c1bc742181ded4930842b46e9507372f0b1b963James DongYTRt            EQU Src3
10980c1bc742181ded4930842b46e9507372f0b1b963James Dong
10990c1bc742181ded4930842b46e9507372f0b1b963James DongqC0             QN  YTR0.S32                ;// .S32 views of the same registers, for the transpose
11000c1bc742181ded4930842b46e9507372f0b1b963James DongqC1             QN  YTR1.S32
11010c1bc742181ded4930842b46e9507372f0b1b963James DongqC2             QN  YTR2.S32
11020c1bc742181ded4930842b46e9507372f0b1b963James DongqC3             QN  YTR3.S32
11030c1bc742181ded4930842b46e9507372f0b1b963James DongqC4             QN  YTR4.S32
11040c1bc742181ded4930842b46e9507372f0b1b963James DongqC5             QN  YTR5.S32
11050c1bc742181ded4930842b46e9507372f0b1b963James DongqC6             QN  YTR6.S32
11060c1bc742181ded4930842b46e9507372f0b1b963James DongqC7             QN  YTR7.S32
11070c1bc742181ded4930842b46e9507372f0b1b963James Dong
11080c1bc742181ded4930842b46e9507372f0b1b963James DongdD0             DN  YTR0*2+1                ;// upper D half (2n+1) of registers 0-3, for VSWP
11090c1bc742181ded4930842b46e9507372f0b1b963James DongdD1             DN  YTR1*2+1
11100c1bc742181ded4930842b46e9507372f0b1b963James DongdD2             DN  YTR2*2+1
11110c1bc742181ded4930842b46e9507372f0b1b963James DongdD3             DN  YTR3*2+1
11120c1bc742181ded4930842b46e9507372f0b1b963James DongdD4             DN  YTR4*2                  ;// lower D half (2n) of registers 4-7, for VSWP
11130c1bc742181ded4930842b46e9507372f0b1b963James DongdD5             DN  YTR5*2
11140c1bc742181ded4930842b46e9507372f0b1b963James DongdD6             DN  YTR6*2
11150c1bc742181ded4930842b46e9507372f0b1b963James DongdD7             DN  YTR7*2
11160c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Transpose in three steps: VTRN on adjacent qYf pairs, VTRN on the
        ;// .S32 views two registers apart, then swap the upper D halves of
        ;// registers 0-3 with the lower D halves of registers 4-7.
11170c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qYf0, qYf1
11180c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qYf2, qYf3
11190c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qYf4, qYf5
11200c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qYf6, qYf7
11210c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qC0, qC2
11220c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qC1, qC3
11230c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qC4, qC6
11240c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qC5, qC7
11250c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dD0, dD4
11260c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dD1, dD5
11270c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dD2, dD6
11280c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dD3, dD7
11290c1bc742181ded4930842b46e9507372f0b1b963James Dong
11300c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// U8-typed D register names used as narrowing destinations for the u8 output.
;// NOTE(review): these are written YTR<n>*2.U8 without parentheses, whereas the
;// S16 names elsewhere in this file use the (XTR<n>*2).S16 form; presumably the
;// intent is the lower D half (2n) of each result register -- confirm that the
;// assembler binds the type suffix as intended.
11310c1bc742181ded4930842b46e9507372f0b1b963James DongdYf0U8          DN YTR0*2.U8
11320c1bc742181ded4930842b46e9507372f0b1b963James DongdYf1U8          DN YTR1*2.U8
11330c1bc742181ded4930842b46e9507372f0b1b963James DongdYf2U8          DN YTR2*2.U8
11340c1bc742181ded4930842b46e9507372f0b1b963James DongdYf3U8          DN YTR3*2.U8
11350c1bc742181ded4930842b46e9507372f0b1b963James DongdYf4U8          DN YTR4*2.U8
11360c1bc742181ded4930842b46e9507372f0b1b963James DongdYf5U8          DN YTR5*2.U8
11370c1bc742181ded4930842b46e9507372f0b1b963James DongdYf6U8          DN YTR6*2.U8
11380c1bc742181ded4930842b46e9507372f0b1b963James DongdYf7U8          DN YTR7*2.U8
11390c1bc742181ded4930842b46e9507372f0b1b963James Dong
11400c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
11410c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Do saturation if outsize is other than S16
11420c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
11430c1bc742181ded4930842b46e9507372f0b1b963James Dong
11440c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="u8")
11450c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Output range [0-255]
            ;// Saturating narrow of each result row into its U8 D register.
            ;// NOTE(review): clamping signed results to [0-255] needs the
            ;// signed-to-unsigned form (VQMOVUN); which encoding is emitted
            ;// here depends on operand datatypes declared outside this
            ;// excerpt -- confirm.
11460c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf0U8, qYf0
11470c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf1U8, qYf1
11480c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf2U8, qYf2
11490c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf3U8, qYf3
11500c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf4U8, qYf4
11510c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf5U8, qYf5
11520c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf6U8, qYf6
11530c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf7U8, qYf7
11540c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
11550c1bc742181ded4930842b46e9507372f0b1b963James Dong
11560c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="s9")
11570c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Output range [-256 to +255]
11580c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf0, qYf0, #16-9
11590c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf1, qYf1, #16-9
11600c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf2, qYf2, #16-9
11610c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf3, qYf3, #16-9
11620c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf4, qYf4, #16-9
11630c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf5, qYf5, #16-9
11640c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf6, qYf6, #16-9
11650c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf7, qYf7, #16-9
11660c1bc742181ded4930842b46e9507372f0b1b963James Dong
11670c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf0, qYf0, #16-9
11680c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf1, qYf1, #16-9
11690c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf2, qYf2, #16-9
11700c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf3, qYf3, #16-9
11710c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf4, qYf4, #16-9
11720c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf5, qYf5, #16-9
11730c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf6, qYf6, #16-9
11740c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf7, qYf7, #16-9
11750c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
11760c1bc742181ded4930842b46e9507372f0b1b963James Dong
11770c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Store output depending on the Stride size
11780c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$stride"="s"
11790c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf0, [pDest @64], Stride
11800c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf1, [pDest @64], Stride
11810c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf2, [pDest @64], Stride
11820c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf3, [pDest @64], Stride
11830c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf4, [pDest @64], Stride
11840c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf5, [pDest @64], Stride
11850c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf6, [pDest @64], Stride
11860c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf7, [pDest @64]
11870c1bc742181ded4930842b46e9507372f0b1b963James Dong        ELSE
11880c1bc742181ded4930842b46e9507372f0b1b963James Dong            IF ("$outsize"="u8")
11890c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf0U8, [pDest @64], #8
11900c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf1U8, [pDest @64], #8
11910c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf2U8, [pDest @64], #8
11920c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf3U8, [pDest @64], #8
11930c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf4U8, [pDest @64], #8
11940c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf5U8, [pDest @64], #8
11950c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf6U8, [pDest @64], #8
11960c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf7U8, [pDest @64]
11970c1bc742181ded4930842b46e9507372f0b1b963James Dong            ELSE
11980c1bc742181ded4930842b46e9507372f0b1b963James Dong                ;// ("$outsize"="s9") or ("$outsize"="s16")
11990c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf0, [pDest @64], #16
12000c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf1, [pDest @64], #16
12010c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf2, [pDest @64], #16
12020c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf3, [pDest @64], #16
12030c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf4, [pDest @64], #16
12040c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf5, [pDest @64], #16
12050c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf6, [pDest @64], #16
12060c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf7, [pDest @64]
12070c1bc742181ded4930842b46e9507372f0b1b963James Dong            ENDIF
12080c1bc742181ded4930842b46e9507372f0b1b963James Dong
12090c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
12100c1bc742181ded4930842b46e9507372f0b1b963James Dong
12110c1bc742181ded4930842b46e9507372f0b1b963James Dong
12120c1bc742181ded4930842b46e9507372f0b1b963James Dong
12130c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF ;// CortexA8
12140c1bc742181ded4930842b46e9507372f0b1b963James Dong
12150c1bc742181ded4930842b46e9507372f0b1b963James Dong
12160c1bc742181ded4930842b46e9507372f0b1b963James Dong
12170c1bc742181ded4930842b46e9507372f0b1b963James Dong        MEND
12180c1bc742181ded4930842b46e9507372f0b1b963James Dong
12190c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Scale TWO input rows with TWO rows of 16 bit scale values
12200c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12210c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
12220c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// input (eight values each) with two rows of scale values. It also
12230c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// loads the next two rows of scale values from pScale when $LastRow is 0.
12240c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12250c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Input Registers:
12260c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12270c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dAlo           - Input D register with first four S16 values of row n
12280c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dAhi           - Input D register with next four S16 values of row n
12290c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dBlo           - Input D register with first four S16 values of row n+1
12300c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dBhi           - Input D register with next four S16 values of row n+1
12310c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pScale          - Pointer to next row of scale values
12320c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qT0lo           - Scratch register: widened product of $dAlo and dScale1lo
12330c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qT0hi           - Scratch register: widened product of $dAhi and dScale1hi
12340c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qT1lo           - Scratch register: widened product of $dBlo and dScale2lo
12350c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qT1hi           - Scratch register: widened product of $dBhi and dScale2hi
12360c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dScale1lo       - Scale values multiplied with $dAlo (row n)
12370c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dScale1hi       - Scale values multiplied with $dAhi (row n)
12380c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dScale2lo       - Scale values multiplied with $dBlo (row n+1)
12390c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dScale2hi       - Scale values multiplied with $dBhi (row n+1)
12400c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12410c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Input Flag
12420c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12430c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $LastRow        - "0": reload qScale1/qScale2 from pScale; any other value: no reload
12440c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12450c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Output Registers:
12460c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12470c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dAlo           - Scaled output values (first four S16 of row n)
12480c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dAhi           - Scaled output values (next four S16 of row n)
12490c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dBlo           - Scaled output values (first four S16 of row n+1)
12500c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dBhi           - Scaled output values (next four S16 of row n+1)
12510c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qScale1         - Next scale row loaded from pScale (only when $LastRow is 0)
12520c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qScale2         - Scale row after that, loaded from pScale (only when $LastRow is 0)
12530c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pScale          - Advanced by 32 bytes when $LastRow is 0, unchanged otherwise
12540c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12550c1bc742181ded4930842b46e9507372f0b1b963James Dong        MACRO
12560c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
12570c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qT0lo, $dAlo, dScale1lo     ;// Widening multiply: row n, first four values
12580c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qT0hi, $dAhi, dScale1hi     ;// Widening multiply: row n, next four values
12590c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qT1lo, $dBlo, dScale2lo     ;// Widening multiply: row n+1, first four values
12600c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qT1hi, $dBhi, dScale2hi     ;// Widening multiply: row n+1, next four values
12610c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$LastRow"="0"
12620c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        qScale1, [pScale], #16  ;// Load scale for row n+2 (issued after the multiplies have read the old scale)
12630c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        qScale2, [pScale], #16  ;// Load scale for row n+3
12640c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
12650c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRSHRN       $dAlo, qT0lo, #12         ;// Rounding shift right by 12, saturating narrow back into the row n input register
12660c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRSHRN       $dAhi, qT0hi, #12
12670c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRSHRN       $dBlo, qT1lo, #12         ;// Same for row n+1
12680c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRSHRN       $dBhi, qT1hi, #12
12690c1bc742181ded4930842b46e9507372f0b1b963James Dong        MEND
12700c1bc742181ded4930842b46e9507372f0b1b963James Dong
12710c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Scale 8x8 block input values with 16 bit scale values
12720c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12730c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// This macro is used to pre-scale an 8x8 block of input.
12740c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// It also does the first-stage transformations of the IDCT.
12750c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12760c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Input Registers:
12770c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12780c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXjnlo          - n th input D register with first four S16 values
12790c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXjnhi          - n th input D register with next four S16 values
12800c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qXjn            - n th input Q register with eight S16 values
12810c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pScale          - Pointer to scale values (read as eight consecutive 16-byte rows)
12820c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12830c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Output Registers:
12840c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12850c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qXin            - n th output Q register with eight S16 output values of 1st stage (qXi2..qXi7 are written explicitly here)
12860c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dCoefs          - AAN constants loaded from armCOMM_IDCTCoef; pSrc is overwritten with that address
12870c1bc742181ded4930842b46e9507372f0b1b963James Dong        MACRO
12880c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_PRESCALE16
12890c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        qScale1, [pScale], #16      ;// Load Pre scale for row 0
12900c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        qScale2, [pScale], #16      ;// Load Pre scale for row 1
12910c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0        ;// Pre scale row 0 & 1
12920c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0        ;// Pre scale row 2 & 3
12930c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0        ;// Pre scale row 4 & 5
12940c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1        ;// Pre scale row 6 & 7 (last pair: no further scale load)
12950c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXi5, qXj1, qXj7            ;// (j1+j7)/2
12960c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXi6, qXj1, qXj7            ;// j1-j7
12970c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR         pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants (overwrites pSrc)
12980c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXi3, qXj2, qXj6            ;// (j2+j6)/2
12990c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXi2, qXj2, qXj6            ;// j2-j6
13000c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
13010c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXi7, qXj5, qXj3            ;// (j5+j3)/2
13020c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXi4, qXj5, qXj3            ;// j5-j3
13030c1bc742181ded4930842b46e9507372f0b1b963James Dong        MEND
13040c1bc742181ded4930842b46e9507372f0b1b963James Dong
13050c1bc742181ded4930842b46e9507372f0b1b963James Dong
13060c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Scale 8x8 block input values with 32 bit scale values
13070c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
13080c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// This macro is used to pre-scale an 8x8 block of input.
13090c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// It also does the first-stage transformations of the IDCT.
13100c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
13110c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Input Registers:
13120c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
13130c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXjnlo          - n th input D register with first four S16 values
13140c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXjnhi          - n th input D register with next four S16 values
13150c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qXjn            - n th input Q register with eight S16 values
13160c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pScale          - Pointer to 32bit scale values in Q23 format
13170c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
13180c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Output Registers:
13190c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
13200c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXinlo          - n th output D register with first four S16 output values of 1st stage
13210c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXinhi          - n th output D register with next four S16 output values of 1st stage
13220c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Also modified: pScale, pTemp, pSrc (ends holding the address of armCOMM_IDCTCoef), dCoefs, qXj4, Q0-Q6
13230c1bc742181ded4930842b46e9507372f0b1b963James Dong        MACRO
13240c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_PRESCALE32
13250c1bc742181ded4930842b46e9507372f0b1b963James DongqScale0lo       QN 0.S32                        ;// Q0/Q1: scale rows 0, 5, 6, 7 and the butterfly results (aliases below)
13260c1bc742181ded4930842b46e9507372f0b1b963James DongqScale0hi       QN 1.S32
13270c1bc742181ded4930842b46e9507372f0b1b963James DongqScale1lo       QN 2.S32                        ;// Q2/Q3: scale rows 1, 2, 3, 4 (aliases below)
13280c1bc742181ded4930842b46e9507372f0b1b963James DongqScale1hi       QN 3.S32
13290c1bc742181ded4930842b46e9507372f0b1b963James DongqScale2lo       QN qScale1lo
13300c1bc742181ded4930842b46e9507372f0b1b963James DongqScale2hi       QN qScale1hi
13310c1bc742181ded4930842b46e9507372f0b1b963James DongqScale3lo       QN qScale1lo
13320c1bc742181ded4930842b46e9507372f0b1b963James DongqScale3hi       QN qScale1hi
13330c1bc742181ded4930842b46e9507372f0b1b963James DongqScale4lo       QN qScale1lo
13340c1bc742181ded4930842b46e9507372f0b1b963James DongqScale4hi       QN qScale1hi
13350c1bc742181ded4930842b46e9507372f0b1b963James DongqScale5lo       QN qScale0lo
13360c1bc742181ded4930842b46e9507372f0b1b963James DongqScale5hi       QN qScale0hi
13370c1bc742181ded4930842b46e9507372f0b1b963James DongqScale6lo       QN qScale0lo
13380c1bc742181ded4930842b46e9507372f0b1b963James DongqScale6hi       QN qScale0hi
13390c1bc742181ded4930842b46e9507372f0b1b963James DongqScale7lo       QN qScale0lo
13400c1bc742181ded4930842b46e9507372f0b1b963James DongqScale7hi       QN qScale0hi
13410c1bc742181ded4930842b46e9507372f0b1b963James Dong
13420c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc0lo         QN 4.S32                        ;// Q4/Q5: widened input rows 0, 2, 3, 4, 7 (aliases below)
13430c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc0hi         QN 5.S32
13440c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc1lo         QN 6.S32                        ;// Q6 + Src4: widened input rows 1, 5, 6 (aliases below)
13450c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc1hi         QN Src4.S32                     ;// NOTE(review): Src4 is defined outside this macro; presumably the register behind qXj4, which would explain the qXj4 reload further down - confirm
13460c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc2lo         QN qSrc0lo
13470c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc2hi         QN qSrc0hi
13480c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc3lo         QN qSrc0lo
13490c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc3hi         QN qSrc0hi
13500c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc4lo         QN qSrc0lo
13510c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc4hi         QN qSrc0hi
13520c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc5lo         QN qSrc1lo
13530c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc5hi         QN qSrc1hi
13540c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc6lo         QN qSrc1lo
13550c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc6hi         QN qSrc1hi
13560c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc7lo         QN qSrc0lo
13570c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc7hi         QN qSrc0hi
13580c1bc742181ded4930842b46e9507372f0b1b963James Dong
13590c1bc742181ded4930842b46e9507372f0b1b963James DongqRes17lo        QN qScale0lo                    ;// Butterfly results reuse Q0/Q1 once the scale held there has been consumed
13600c1bc742181ded4930842b46e9507372f0b1b963James DongqRes17hi        QN qScale0hi
13610c1bc742181ded4930842b46e9507372f0b1b963James DongqRes26lo        QN qScale0lo
13620c1bc742181ded4930842b46e9507372f0b1b963James DongqRes26hi        QN qScale0hi
13630c1bc742181ded4930842b46e9507372f0b1b963James DongqRes53lo        QN qScale0lo
13640c1bc742181ded4930842b46e9507372f0b1b963James DongqRes53hi        QN qScale0hi
13650c1bc742181ded4930842b46e9507372f0b1b963James Dong
13660c1bc742181ded4930842b46e9507372f0b1b963James Dong            ADD         pTemp, pScale, #4*8*7           ;// Address of scale row 7 (8 values x 4 bytes per row)
13670c1bc742181ded4930842b46e9507372f0b1b963James Dong
13680c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Row 0
13690c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale0lo, qScale0hi}, [pScale]!       ;// Scale row 0; pScale advances 32 bytes
13700c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc0lo, dXj0lo, #(12-1)        ;// Widen row 0 to 32 bit and shift left by 11
13710c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc0hi, dXj0hi, #(12-1)
13720c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale1lo, qScale1hi}, [pScale]!       ;// Scale row 1
13730c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo     ;// Saturating rounding doubling multiply, high 32 bits kept
13740c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
13750c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale7lo, qScale7hi}, [pTemp]!        ;// Scale row 7, replaces scale row 0 in Q0/Q1
13760c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc1lo, dXj1lo, #(12-1)
13770c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc1hi, dXj1hi, #(12-1)
13780c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi0lo, qSrc0lo                 ;// Output i0 (narrowed to 16 bit without saturation)
13790c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi0hi, qSrc0hi
13800c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc7lo, dXj7lo, #(12-1)        ;// Q4/Q5 reused for row 7 now that i0 has been written
13810c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc7hi, dXj7hi, #(12-1)
13820c1bc742181ded4930842b46e9507372f0b1b963James Dong            SUB         pTemp, pTemp, #((16*2)+(4*8*1)) ;// Undo the 32-byte writeback and step back one row: scale row 6
13830c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo
13840c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
13850c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo
13860c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
13870c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale2lo, qScale2hi}, [pScale]!       ;// Scale row 2, replaces scale row 1 in Q2/Q3
13880c1bc742181ded4930842b46e9507372f0b1b963James Dong
13890c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Row 1 & 7
13900c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes17lo, qSrc1lo, qSrc7lo      ;// (j1+j7)/2
13910c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes17hi, qSrc1hi, qSrc7hi      ;// (j1+j7)/2
13920c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi5lo, qRes17lo                ;// Output i5
13930c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi5hi, qRes17hi
13940c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes17lo, qSrc1lo, qSrc7lo      ;// j1-j7
13950c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes17hi, qSrc1hi, qSrc7hi      ;// j1-j7
13960c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi6lo, qRes17lo                ;// Output i6
13970c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi6hi, qRes17hi
13980c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc2lo, dXj2lo, #(12-1)
13990c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc2hi, dXj2hi, #(12-1)
14000c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale6lo, qScale6hi}, [pTemp]!        ;// Scale row 6
14010c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc6lo, dXj6lo, #(12-1)
14020c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc6hi, dXj6hi, #(12-1)
14030c1bc742181ded4930842b46e9507372f0b1b963James Dong            SUB         pTemp, pTemp, #((16*2)+(4*8*1)) ;// Undo the writeback and step back one row: scale row 5
14040c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo
14050c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
14060c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo
14070c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
14080c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale3lo, qScale3hi}, [pScale]!       ;// Scale row 3, replaces scale row 2 in Q2/Q3
14090c1bc742181ded4930842b46e9507372f0b1b963James Dong
14100c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Row 2 & 6
14110c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes26lo, qSrc2lo, qSrc6lo      ;// (j2+j6)/2
14120c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes26hi, qSrc2hi, qSrc6hi      ;// (j2+j6)/2
14130c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi3lo, qRes26lo                ;// Output i3
14140c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi3hi, qRes26hi
14150c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes26lo, qSrc2lo, qSrc6lo      ;// j2-j6
14160c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes26hi, qSrc2hi, qSrc6hi      ;// j2-j6
14170c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi2lo, qRes26lo                ;// Output i2
14180c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi2hi, qRes26hi
14190c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc3lo, dXj3lo, #(12-1)
14200c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc3hi, dXj3hi, #(12-1)
14210c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale5lo, qScale5hi}, [pTemp]!        ;// Scale row 5 (pTemp is not read again in this macro)
14220c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc5lo, dXj5lo, #(12-1)
14230c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc5hi, dXj5hi, #(12-1)
14240c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo
14250c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
14260c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo
14270c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi
14280c1bc742181ded4930842b46e9507372f0b1b963James Dong
14290c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Row 3 & 5
14300c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes53lo, qSrc5lo, qSrc3lo      ;// (j5+j3)/2
14310c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes53hi, qSrc5hi, qSrc3hi      ;// (j5+j3)/2
14320c1bc742181ded4930842b46e9507372f0b1b963James Dong            SUB         pSrc, pSrc, #16*2*2             ;// Rewind pSrc by 64 bytes (NOTE(review): presumably back to input row 4 - confirm against the caller's loads)
14330c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi7lo, qRes53lo                ;// Output i7
14340c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi7hi, qRes53hi
14350c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes53lo, qSrc5lo, qSrc3lo      ;// j5-j3
14360c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes53hi, qSrc5hi, qSrc3hi      ;// j5-j3
14370c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        qXj4, [pSrc @64]                ;// Re-read qXj4 from memory, after the last use of qSrc5hi above
14380c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi4lo, qRes53lo                ;// Output i4
14390c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi4hi, qRes53hi
14400c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc4lo, dXj4lo, #(12-1)
14410c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc4hi, dXj4hi, #(12-1)
14420c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale4lo, qScale4hi}, [pScale]        ;// Scale row 4 (no writeback: last scale load)
14430c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants
14440c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo
14450c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
14460c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLDR        dCoefs, [pSrc]                  ;// Load DCT inverse AAN constants
14470c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Row 4
14480c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi1lo, qSrc4lo                 ;// Output i1
14490c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi1hi, qSrc4hi
14500c1bc742181ded4930842b46e9507372f0b1b963James Dong
14510c1bc742181ded4930842b46e9507372f0b1b963James Dong        MEND
14520c1bc742181ded4930842b46e9507372f0b1b963James Dong
14530c1bc742181ded4930842b46e9507372f0b1b963James Dong        END
1454