10c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
278e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// Copyright (C) 2004 ARM Limited
378e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
478e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// Licensed under the Apache License, Version 2.0 (the "License");
578e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// you may not use this file except in compliance with the License.
678e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// You may obtain a copy of the License at
778e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
878e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//      http://www.apache.org/licenses/LICENSE-2.0
978e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
1078e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// Unless required by applicable law or agreed to in writing, software
1178e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// distributed under the License is distributed on an "AS IS" BASIS,
1278e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1378e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// See the License for the specific language governing permissions and
1478e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;// limitations under the License.
1578e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
1678e52bfac041d71ce53b5b13c2abf78af742b09dLajos Molnar;//
170c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
180c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IDCT_s.s
190c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
200c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Inverse DCT module
210c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
220c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
230c1bc742181ded4930842b46e9507372f0b1b963James Dong;// ALGORITHM DESCRIPTION
240c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
250c1bc742181ded4930842b46e9507372f0b1b963James Dong;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
260c1bc742181ded4930842b46e9507372f0b1b963James Dong;// column and then a 1D IDCT for each row.
270c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
280c1bc742181ded4930842b46e9507372f0b1b963James Dong;// The 8-point 1D IDCT is defined by
290c1bc742181ded4930842b46e9507372f0b1b963James Dong;//   f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
300c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
310c1bc742181ded4930842b46e9507372f0b1b963James Dong;//   C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
320c1bc742181ded4930842b46e9507372f0b1b963James Dong;//   c(u,x) = cos( (2x+1)*u*pi/16 )
330c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
340c1bc742181ded4930842b46e9507372f0b1b963James Dong;// We compute the 8-point 1D IDCT using the reverse of
350c1bc742181ded4930842b46e9507372f0b1b963James Dong;// the Arai-Agui-Nakajima flow graph which we split into
360c1bc742181ded4930842b46e9507372f0b1b963James Dong;// 5 stages named in reverse order to identify with the
370c1bc742181ded4930842b46e9507372f0b1b963James Dong;// forward DCT. Direct inversion of the forward formulae
380c1bc742181ded4930842b46e9507372f0b1b963James Dong;// in file FDCT_s.s gives:
390c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
400c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 5:   j(u) = T(u)*A(u)  [ A(u)=4*C(u)*c(u,0) ]
410c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             [ A(0) = 2*sqrt(2)
420c1bc742181ded4930842b46e9507372f0b1b963James Dong;//               A(u) = 4*cos(u*pi/16)  for (u!=0) ]
430c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
440c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 4:   i0 = j0             i1 = j4
450c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i3 = (j2+j6)/2      i2 = (j2-j6)/2
460c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i7 = (j5+j3)/2      i4 = (j5-j3)/2
470c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i5 = (j1+j7)/2      i6 = (j1-j7)/2
480c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
490c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 3:   h0 = (i0+i1)/2      h1 = (i0-i1)/2
500c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h2 = (i2*sqrt2)-i3  h3 = i3
510c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h4 =  cos(pi/8)*i4 + sin(pi/8)*i6
520c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
530c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             [ The above two lines rotate by -(pi/8) ]
540c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h5 = (i5-i7)/sqrt2  h7 = (i5+i7)/2
550c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
560c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 2:   g0 = (h0+h3)/2      g3 = (h0-h3)/2
570c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g1 = (h1+h2)/2      g2 = (h1-h2)/2
580c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g7 = h7             g6 = h6 - h7
590c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g5 = h5 - g6        g4 = h4 - g5
600c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
610c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 1:   f0 = (g0+g7)/2      f7 = (g0-g7)/2
620c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f1 = (g1+g6)/2      f6 = (g1-g6)/2
630c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f2 = (g2+g5)/2      f5 = (g2-g5)/2
640c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f3 = (g3+g4)/2      f4 = (g3-g4)/2
650c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
660c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Note that most coefficients are halved 3 times during the
670c1bc742181ded4930842b46e9507372f0b1b963James Dong;// above calculation. We can rescale the algorithm dividing
680c1bc742181ded4930842b46e9507372f0b1b963James Dong;// the input by 8 to remove the halvings.
690c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
700c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 5:   j(u) = T(u)*A(u)/8
710c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
720c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 4:   i0 = j0             i1 = j4
730c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i3 = j2 + j6        i2 = j2 - j6
740c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i7 = j5 + j3        i4 = j5 - j3
750c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i5 = j1 + j7        i6 = j1 - j7
760c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
770c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 3:   h0 = i0 + i1        h1 = i0 - i1
780c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h2 = (i2*sqrt2)-i3  h3 = i3
790c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
800c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
810c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h5 = (i5-i7)*sqrt2  h7 = i5 + i7
820c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
830c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 2:   g0 = h0 + h3        g3 = h0 - h3
840c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g1 = h1 + h2        g2 = h1 - h2
850c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g7 = h7             g6 = h6 - h7
860c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g5 = h5 - g6        g4 = h4 - g5
870c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
880c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 1:   f0 = g0 + g7        f7 = g0 - g7
890c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f1 = g1 + g6        f6 = g1 - g6
900c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f2 = g2 + g5        f5 = g2 - g5
910c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f3 = g3 + g4        f4 = g3 - g4
920c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
930c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Note:
940c1bc742181ded4930842b46e9507372f0b1b963James Dong;// 1. The scaling by A(u)/8 can often be combined with inverse
950c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    quantization. The column and row scalings can be combined.
960c1bc742181ded4930842b46e9507372f0b1b963James Dong;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
970c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    to the above code but is otherwise identical.
980c1bc742181ded4930842b46e9507372f0b1b963James Dong;// 3. The rotation by -pi/8 can be performed using three multiplies
990c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
1000c1bc742181ded4930842b46e9507372f0b1b963James Dong;//       -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
1010c1bc742181ded4930842b46e9507372f0b1b963James Dong;// 4. If |T(u)|<=1 then from the IDCT definition,
1020c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
1030c1bc742181ded4930842b46e9507372f0b1b963James Dong;//            = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
1040c1bc742181ded4930842b46e9507372f0b1b963James Dong;//            = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
1050c1bc742181ded4930842b46e9507372f0b1b963James Dong;//            = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
1060c1bc742181ded4930842b46e9507372f0b1b963James Dong;//            = (approx)2.64
1070c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
1080c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    The table below shows input patterns generating the maximum
1090c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1
1100c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    InputPattern      Max |f(x)|
1110c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PPPPPPPP        |f0| =  2.64
1120c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PPPMMMMM        |f1| =  2.64
1130c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PPMMMPPP        |f2| =  2.64
1140c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PPMMPPMM        |f3| =  2.64
1150c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PMMPPMMP        |f4| =  2.64
1160c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PMMPMMPM        |f5| =  2.64
1170c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PMPPMPMP        |f6| =  2.64
1180c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PMPMPMPM        |f7| =  2.64
1190c1bc742181ded4930842b46e9507372f0b1b963James Dong;//   Note that this input pattern is the transpose of the
1200c1bc742181ded4930842b46e9507372f0b1b963James Dong;//   corresponding max input pattern for the FDCT.
1210c1bc742181ded4930842b46e9507372f0b1b963James Dong
1220c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Arguments
1230c1bc742181ded4930842b46e9507372f0b1b963James Dong
1240c1bc742181ded4930842b46e9507372f0b1b963James DongpSrc    RN 0    ;// r0: pointer to the source (input) data buffer
1250c1bc742181ded4930842b46e9507372f0b1b963James DongStride  RN 1    ;// r1: destination stride in bytes
1260c1bc742181ded4930842b46e9507372f0b1b963James DongpDest   RN 2    ;// r2: pointer to the destination (output) data buffer
1270c1bc742181ded4930842b46e9507372f0b1b963James DongpScale  RN 3    ;// r3: pointer to the AAN scaling table (format selected by $inscale)
1280c1bc742181ded4930842b46e9507372f0b1b963James Dong
1290c1bc742181ded4930842b46e9507372f0b1b963James Dong
1300c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// DCT Inverse Macro
1310c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// The DCT code should be parametrized according
1320c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// to the following inputs:
1330c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $outsize = "u8"  :  8-bit unsigned data saturated (0 to +255)
1340c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//            "s9"  : 16-bit signed data saturated to 9-bit (-256 to +255)
1350c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//            "s16" : 16-bit signed data not saturated (max size ~+/-14273)
1360c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
1370c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//            "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
1380c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
1390c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Inputs:
1400c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pSrc   = r0 = Pointer to input data
1410c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//               Range is -256 to +255 (9-bit)
1420c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Stride = r1 = Stride between destination lines, in bytes
1430c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pDest  = r2 = Pointer to output data
1440c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
1450c1bc742181ded4930842b46e9507372f0b1b963James Dong
1460c1bc742181ded4930842b46e9507372f0b1b963James Dong
1470c1bc742181ded4930842b46e9507372f0b1b963James Dong
1480c1bc742181ded4930842b46e9507372f0b1b963James Dong        MACRO
1490c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT  $outsize, $inscale, $stride
1500c1bc742181ded4930842b46e9507372f0b1b963James Dong        LCLA    SHIFT
1510c1bc742181ded4930842b46e9507372f0b1b963James Dong
1520c1bc742181ded4930842b46e9507372f0b1b963James Dong
1530c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ARM1136JS
1540c1bc742181ded4930842b46e9507372f0b1b963James Dong
1550c1bc742181ded4930842b46e9507372f0b1b963James Dong;// REGISTER ALLOCATION
1560c1bc742181ded4930842b46e9507372f0b1b963James Dong;// This is hard since we have 8 values, 9 free registers and each
1570c1bc742181ded4930842b46e9507372f0b1b963James Dong;// butterfly requires a temporary register. We also want to
1580c1bc742181ded4930842b46e9507372f0b1b963James Dong;// maintain register order so we can use LDM/STM. The table below
1590c1bc742181ded4930842b46e9507372f0b1b963James Dong;// summarises the register allocation that meets all these criteria.
1600c1bc742181ded4930842b46e9507372f0b1b963James Dong;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
1610c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
1620c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r1  a01     g0  h0
1630c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r4  b01 f0  g1  h1  i0
1640c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r5  a23 f1  g2      i1
1650c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r6  b23 f2  g3  h2  i2
1660c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r7  a45 f3      h3  i3
1670c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r8  b45 f4  g4  h4  i4
1680c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r9  a67 f5  g5  h5  i5
1690c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r10 b67 f6  g6  h6  i6
1700c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r11     f7  g7  h7  i7
1710c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
1720c1bc742181ded4930842b46e9507372f0b1b963James Dongra01    RN 1
1730c1bc742181ded4930842b46e9507372f0b1b963James Dongrb01    RN 4
1740c1bc742181ded4930842b46e9507372f0b1b963James Dongra23    RN 5
1750c1bc742181ded4930842b46e9507372f0b1b963James Dongrb23    RN 6
1760c1bc742181ded4930842b46e9507372f0b1b963James Dongra45    RN 7
1770c1bc742181ded4930842b46e9507372f0b1b963James Dongrb45    RN 8
1780c1bc742181ded4930842b46e9507372f0b1b963James Dongra67    RN 9
1790c1bc742181ded4930842b46e9507372f0b1b963James Dongrb67    RN 10
1800c1bc742181ded4930842b46e9507372f0b1b963James Dongrtmp    RN 11
1810c1bc742181ded4930842b46e9507372f0b1b963James DongcsPiBy8 RN 12   ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
1820c1bc742181ded4930842b46e9507372f0b1b963James DongLoopRR2 RN 14   ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
1830c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Transpose allocation
1840c1bc742181ded4930842b46e9507372f0b1b963James Dongxft     RN ra01
1850c1bc742181ded4930842b46e9507372f0b1b963James Dongxf0     RN rb01
1860c1bc742181ded4930842b46e9507372f0b1b963James Dongxf1     RN ra23
1870c1bc742181ded4930842b46e9507372f0b1b963James Dongxf2     RN rb23
1880c1bc742181ded4930842b46e9507372f0b1b963James Dongxf3     RN ra45
1890c1bc742181ded4930842b46e9507372f0b1b963James Dongxf4     RN rb45
1900c1bc742181ded4930842b46e9507372f0b1b963James Dongxf5     RN ra67
1910c1bc742181ded4930842b46e9507372f0b1b963James Dongxf6     RN rb67
1920c1bc742181ded4930842b46e9507372f0b1b963James Dongxf7     RN rtmp
1930c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 1 allocation
1940c1bc742181ded4930842b46e9507372f0b1b963James Dongxg0     RN xft
1950c1bc742181ded4930842b46e9507372f0b1b963James Dongxg1     RN xf0
1960c1bc742181ded4930842b46e9507372f0b1b963James Dongxg2     RN xf1
1970c1bc742181ded4930842b46e9507372f0b1b963James Dongxg3     RN xf2
1980c1bc742181ded4930842b46e9507372f0b1b963James Dongxgt     RN xf3
1990c1bc742181ded4930842b46e9507372f0b1b963James Dongxg4     RN xf4
2000c1bc742181ded4930842b46e9507372f0b1b963James Dongxg5     RN xf5
2010c1bc742181ded4930842b46e9507372f0b1b963James Dongxg6     RN xf6
2020c1bc742181ded4930842b46e9507372f0b1b963James Dongxg7     RN xf7
2030c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 2 allocation
2040c1bc742181ded4930842b46e9507372f0b1b963James Dongxh0     RN xg0
2050c1bc742181ded4930842b46e9507372f0b1b963James Dongxh1     RN xg1
2060c1bc742181ded4930842b46e9507372f0b1b963James Dongxht     RN xg2
2070c1bc742181ded4930842b46e9507372f0b1b963James Dongxh2     RN xg3
2080c1bc742181ded4930842b46e9507372f0b1b963James Dongxh3     RN xgt
2090c1bc742181ded4930842b46e9507372f0b1b963James Dongxh4     RN xg4
2100c1bc742181ded4930842b46e9507372f0b1b963James Dongxh5     RN xg5
2110c1bc742181ded4930842b46e9507372f0b1b963James Dongxh6     RN xg6
2120c1bc742181ded4930842b46e9507372f0b1b963James Dongxh7     RN xg7
2130c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 3,4 allocation
2140c1bc742181ded4930842b46e9507372f0b1b963James Dongxit     RN xh0
2150c1bc742181ded4930842b46e9507372f0b1b963James Dongxi0     RN xh1
2160c1bc742181ded4930842b46e9507372f0b1b963James Dongxi1     RN xht
2170c1bc742181ded4930842b46e9507372f0b1b963James Dongxi2     RN xh2
2180c1bc742181ded4930842b46e9507372f0b1b963James Dongxi3     RN xh3
2190c1bc742181ded4930842b46e9507372f0b1b963James Dongxi4     RN xh4
2200c1bc742181ded4930842b46e9507372f0b1b963James Dongxi5     RN xh5
2210c1bc742181ded4930842b46e9507372f0b1b963James Dongxi6     RN xh6
2220c1bc742181ded4930842b46e9507372f0b1b963James Dongxi7     RN xh7
2230c1bc742181ded4930842b46e9507372f0b1b963James Dong
2240c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_STR   pDest,  ppDest
2250c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$stride"="s"
2260c1bc742181ded4930842b46e9507372f0b1b963James Dong            M_STR   Stride, pStride
2270c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
2280c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ADR   pDest,  pBlk
2290c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     csPiBy8, =0x30fc7642
2300c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     LoopRR2, =0x00005a82
2310c1bc742181ded4930842b46e9507372f0b1b963James Dong
2320c1bc742181ded4930842b46e9507372f0b1b963James Dongv6_idct_col$_F
2330c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Load even values
2340c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi4, [pSrc], #4  ;// j0
2350c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi5, [pSrc, #4*16-4]  ;// j4
2360c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi6, [pSrc, #2*16-4]  ;// j2
2370c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi7, [pSrc, #6*16-4]  ;// j6
2380c1bc742181ded4930842b46e9507372f0b1b963James Dong
2390c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Scale Even Values
2400c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s16" ;// 16x16 mul
2410c1bc742181ded4930842b46e9507372f0b1b963James DongSHIFT       SETA    12
2420c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi0, [pScale], #4
2430c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi1, [pScale, #4*16-4]
2440c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi2, [pScale, #2*16-4]
2450c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xit, #1<<(SHIFT-1)
2460c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi3, xi0, xi4, xit
2470c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi4, xi0, xi4, xit
2480c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi0, xi1, xi5, xit
2490c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi5, xi1, xi5, xit
2500c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi3, xi3, ASR #SHIFT
2510c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi4, xi3, xi4, LSL #(16-SHIFT)
2520c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi3, [pScale, #6*16-4]
2530c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi1, xi2, xi6, xit
2540c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi6, xi2, xi6, xit
2550c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi0, xi0, ASR #SHIFT
2560c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi5, xi0, xi5, LSL #(16-SHIFT)
2570c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi2, xi3, xi7, xit
2580c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi7, xi3, xi7, xit
2590c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi1, xi1, ASR #SHIFT
2600c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi6, xi1, xi6, LSL #(16-SHIFT)
2610c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi2, xi2, ASR #SHIFT
2620c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi7, xi2, xi7, LSL #(16-SHIFT)
2630c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
2640c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s32" ;// 32x16 mul
2650c1bc742181ded4930842b46e9507372f0b1b963James DongSHIFT       SETA    (12+8-16)
2660c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xit, #1<<(SHIFT-1)
2670c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi0, [pScale], #8
2680c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi1, [pScale, #0*32+4-8]
2690c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi2, [pScale, #4*32-8]
2700c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi3, [pScale, #4*32+4-8]
2710c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi0, xi0, xi4, xit
2720c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi1, xi1, xi4, xit
2730c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi2, xi2, xi5, xit
2740c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi3, xi3, xi5, xit
2750c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi0, xi0, ASR #SHIFT
2760c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi4, xi0, xi1, LSL #(16-SHIFT)
2770c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi2, xi2, ASR #SHIFT
2780c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi5, xi2, xi3, LSL #(16-SHIFT)
2790c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi0, [pScale, #2*32-8]
2800c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi1, [pScale, #2*32+4-8]
2810c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi2, [pScale, #6*32-8]
2820c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi3, [pScale, #6*32+4-8]
2830c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi0, xi0, xi6, xit
2840c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi1, xi1, xi6, xit
2850c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi2, xi2, xi7, xit
2860c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi3, xi3, xi7, xit
2870c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi0, xi0, ASR #SHIFT
2880c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi6, xi0, xi1, LSL #(16-SHIFT)
2890c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi2, xi2, ASR #SHIFT
2900c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi7, xi2, xi3, LSL #(16-SHIFT)
2910c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
2920c1bc742181ded4930842b46e9507372f0b1b963James Dong
2930c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Load odd values
2940c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi0, [pSrc, #1*16-4]      ;// j1
2950c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi1, [pSrc, #7*16-4]      ;// j7
2960c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi2, [pSrc, #5*16-4]      ;// j5
2970c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi3, [pSrc, #3*16-4]      ;// j3
2980c1bc742181ded4930842b46e9507372f0b1b963James Dong
2990c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF  {TRUE}
3000c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// shortcut if odd values 0
3010c1bc742181ded4930842b46e9507372f0b1b963James Dong            TEQ     xi0, #0
3020c1bc742181ded4930842b46e9507372f0b1b963James Dong            TEQEQ   xi1, #0
3030c1bc742181ded4930842b46e9507372f0b1b963James Dong            TEQEQ   xi2, #0
3040c1bc742181ded4930842b46e9507372f0b1b963James Dong            TEQEQ   xi3, #0
3050c1bc742181ded4930842b46e9507372f0b1b963James Dong            BEQ     v6OddZero$_F
3060c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
3070c1bc742181ded4930842b46e9507372f0b1b963James Dong
3080c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Store scaled even values
3090c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pDest, {xi4, xi5, xi6, xi7}
3100c1bc742181ded4930842b46e9507372f0b1b963James Dong
3110c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Scale odd values
3120c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s16"
3130c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Perform AAN Scale
3140c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi4, [pScale, #1*16-4]
3150c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi5, [pScale, #7*16-4]
3160c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi6, [pScale, #5*16-4]
3170c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi7, xi0, xi4, xit
3180c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi0, xi0, xi4, xit
3190c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi4, xi1, xi5, xit
3200c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi1, xi1, xi5, xit
3210c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi7, xi7, ASR #SHIFT
3220c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi0, xi7, xi0, LSL #(16-SHIFT)
3230c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi7, [pScale, #3*16-4]
3240c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi5, xi2, xi6, xit
3250c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi2, xi2, xi6, xit
3260c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi4, xi4, ASR #SHIFT
3270c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi1, xi4, xi1, LSL #(16-SHIFT)
3280c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi6, xi3, xi7, xit
3290c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi3, xi3, xi7, xit
3300c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi5, xi5, ASR #SHIFT
3310c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi2, xi5, xi2, LSL #(16-SHIFT)
3320c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi6, xi6, ASR #SHIFT
3330c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi3, xi6, xi3, LSL #(16-SHIFT)
3340c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
3350c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s32" ;// 32x16 mul
3360c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi4, [pScale, #1*32-8]
3370c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi5, [pScale, #1*32+4-8]
3380c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi6, [pScale, #7*32-8]
3390c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi7, [pScale, #7*32+4-8]
3400c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi4, xi4, xi0, xit
3410c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi5, xi5, xi0, xit
3420c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi6, xi6, xi1, xit
3430c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi7, xi7, xi1, xit
3440c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi4, xi4, ASR #SHIFT
3450c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi0, xi4, xi5, LSL #(16-SHIFT)
3460c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi6, xi6, ASR #SHIFT
3470c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi1, xi6, xi7, LSL #(16-SHIFT)
3480c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi4, [pScale, #5*32-8]
3490c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi5, [pScale, #5*32+4-8]
3500c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi6, [pScale, #3*32-8]
3510c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi7, [pScale, #3*32+4-8]
3520c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi4, xi4, xi2, xit
3530c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi5, xi5, xi2, xit
3540c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi6, xi6, xi3, xit
3550c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi7, xi7, xi3, xit
3560c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi4, xi4, ASR #SHIFT
3570c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi2, xi4, xi5, LSL #(16-SHIFT)
3580c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi6, xi6, ASR #SHIFT
3590c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi3, xi6, xi7, LSL #(16-SHIFT)
3600c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
3610c1bc742181ded4930842b46e9507372f0b1b963James Dong
3620c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xit, =0x00010001        ;// rounding constant
3630c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16 xi5, xi0, xi1           ;// (j1+j7)/2
3640c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi5, xi5, xit
3650c1bc742181ded4930842b46e9507372f0b1b963James Dong
3660c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi6, xi0, xi1           ;// j1-j7
3670c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16 xi7, xi2, xi3           ;// (j5+j3)/2
3680c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi7, xi7, xit
3690c1bc742181ded4930842b46e9507372f0b1b963James Dong
3700c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi4, xi2, xi3           ;// j5-j3
3710c1bc742181ded4930842b46e9507372f0b1b963James Dong
3720c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
3730c1bc742181ded4930842b46e9507372f0b1b963James Dong
3740c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
3750c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
3760c1bc742181ded4930842b46e9507372f0b1b963James Dong
3770c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
3780c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
3790c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
3800c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
3810c1bc742181ded4930842b46e9507372f0b1b963James Dong
3820c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  xi1, xi3, LoopRR2
3830c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  xi3, xi3, LoopRR2
3840c1bc742181ded4930842b46e9507372f0b1b963James Dong
3850c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
3860c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
3870c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
3880c1bc742181ded4930842b46e9507372f0b1b963James Dong
3890c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// xi0,xi1,xi2,xi3 now free
3900c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3, rows 2to3 x1/2
3910c1bc742181ded4930842b46e9507372f0b1b963James Dong
3920c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     xi3, xi3, LSL #1
3930c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
3940c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRD    xi0, [pDest, #8]        ;// j2,j6 scaled
3950c1bc742181ded4930842b46e9507372f0b1b963James Dong
3960c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2, rows4to7
3970c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg6, xh6, xh7
3980c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg5, xh5, xg6
3990c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg4, xh4, xg5
4000c1bc742181ded4930842b46e9507372f0b1b963James Dong
4010c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
4020c1bc742181ded4930842b46e9507372f0b1b963James Dong
4030c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
4040c1bc742181ded4930842b46e9507372f0b1b963James Dong
4050c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  xi0, xi2, LoopRR2
4060c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  xi2, xi2, LoopRR2
4070c1bc742181ded4930842b46e9507372f0b1b963James Dong
4080c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     xi2, xi2, LSL #1
4090c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
4100c1bc742181ded4930842b46e9507372f0b1b963James Dong
4110c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// xi0, xi1 now free
4120c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3 rows 0to1 x 1/2
4130c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRD    xi0, [pDest]            ;// j0, j4 scaled
4140c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xh2, xh2, xi3
4150c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
4160c1bc742181ded4930842b46e9507372f0b1b963James Dong
4170c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xh0, xi0, xi1
4180c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xh1, xi0, xi1
4190c1bc742181ded4930842b46e9507372f0b1b963James Dong
4200c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2 rows 0to3 x 1/2
4210c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg2, xh1, xh2
4220c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg1, xh1, xh2
4230c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg3, xh0, xh3
4240c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg0, xh0, xh3
4250c1bc742181ded4930842b46e9507372f0b1b963James Dong
4260c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 1 all rows
4270c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16  xf3, xg3, xg4
4280c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xf4, xg3, xg4
4290c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16  xf2, xg2, xg5
4300c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xf5, xg2, xg5
4310c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16  xf1, xg1, xg6
4320c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xf6, xg1, xg6
4330c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16  xf0, xg0, xg7
4340c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xf7, xg0, xg7
4350c1bc742181ded4930842b46e9507372f0b1b963James Dong
4360c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Transpose, store and loop
4370c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra01, xf0, xf1, LSL #16
4380c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb01, xf1, xf0, ASR #16
4390c1bc742181ded4930842b46e9507372f0b1b963James Dong
4400c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra23, xf2, xf3, LSL #16
4410c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb23, xf3, xf2, ASR #16
4420c1bc742181ded4930842b46e9507372f0b1b963James Dong
4430c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra45, xf4, xf5, LSL #16
4440c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb45, xf5, xf4, ASR #16
4450c1bc742181ded4930842b46e9507372f0b1b963James Dong
4460c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra67, xf6, xf7, LSL #16
4470c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pDest!, {ra01, ra23, ra45, ra67}
4480c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb67, xf7, xf6, ASR #16
4490c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pDest!, {rb01, rb23, rb45, rb67}
4500c1bc742181ded4930842b46e9507372f0b1b963James Dong        BCC     v6_idct_col$_F
4510c1bc742181ded4930842b46e9507372f0b1b963James Dong
4520c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUB     pSrc, pDest, #(64*2)
4530c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR   pDest, ppDest
4540c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$stride"="s"
4550c1bc742181ded4930842b46e9507372f0b1b963James Dong            M_LDR   pScale, pStride
4560c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
4570c1bc742181ded4930842b46e9507372f0b1b963James Dong        B       v6_idct_row$_F
4580c1bc742181ded4930842b46e9507372f0b1b963James Dong
4590c1bc742181ded4930842b46e9507372f0b1b963James Dongv6OddZero$_F
        ;// Shortened column-pass iteration. Compared with the general column
        ;// code it performs no rows-4to7 (odd input) arithmetic and forms the
        ;// IStage 1 outputs as plain copies of xg0..xg3, which is what the
        ;// SADD16/SSUB16 butterflies give when xg4..xg7 are zero.
        ;// Presumably entered when j1, j3, j5 and j7 are all zero; the test
        ;// and the branch to this label are outside this section -- confirm.
        ;// The even inputs are taken from registers rather than memory:
        ;// xi6/xi7 are used as the j2/j6 pair and xi4/xi5 take the place of
        ;// the j0/j4 pair that the general path loads from pDest.
        ;// Each register carries two 16-bit lanes, so two rows are produced
        ;// per iteration. LoopRR2: bottom halfword = multiplier used by
        ;// SMULBB/SMULTB, top bits = loop counter (see ADDS below).
4600c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi2, xi6, xi7           ;// (j2-j6)
4610c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi3, xi6, xi7           ;// (j2+j6)/2
4620c1bc742181ded4930842b46e9507372f0b1b963James Dong
4630c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  xi0, xi2, LoopRR2       ;// bottom lane * LoopRR2[15:0], 32-bit product
4640c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  xi2, xi2, LoopRR2       ;// top lane    * LoopRR2[15:0], 32-bit product
4650c1bc742181ded4930842b46e9507372f0b1b963James Dong
4660c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     xi2, xi2, LSL #1        ;// top halfword now holds product >> 15
4670c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
4680c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xh2, xh2, xi3           ;// subtract (j2+j6)/2 per lane
4690c1bc742181ded4930842b46e9507372f0b1b963James Dong
4700c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// xi0, xi1 now free
4710c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3 rows 0to1 x 1/2
4720c1bc742181ded4930842b46e9507372f0b1b963James Dong
4730c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xh0, xi4, xi5           ;// (xi4+xi5)/2 per lane
4740c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xh1, xi4, xi5           ;// (xi4-xi5)/2 per lane
4750c1bc742181ded4930842b46e9507372f0b1b963James Dong
4760c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2 rows 0to3 x 1/2
        ;// NOTE(review): xh3 is not written under that name in this section;
        ;// presumably it aliases xi3 computed above -- confirm against the
        ;// register definitions earlier in the file.
4770c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg2, xh1, xh2
4780c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg1, xh1, xh2
4790c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg3, xh0, xh3
4800c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg0, xh0, xh3
4810c1bc742181ded4930842b46e9507372f0b1b963James Dong
4820c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 1 all rows
        ;// No add/sub here: xf[k] and xf[7-k] both receive xg[k], k = 0..3.
4830c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf3, xg3
4840c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf4, xg3
4850c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf2, xg2
4860c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf5, xg2
4870c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf1, xg1
4880c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf6, xg1
4890c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf0, xg0
4900c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf7, xg0
4910c1bc742181ded4930842b46e9507372f0b1b963James Dong
4920c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Transpose
        ;// raXY = { xfY bottom halfword : xfX bottom halfword }  (lane a)
        ;// rbXY = { xfY top halfword    : xfX top halfword    }  (lane b)
4930c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra01, xf0, xf1, LSL #16
4940c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb01, xf1, xf0, ASR #16
4950c1bc742181ded4930842b46e9507372f0b1b963James Dong
4960c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra23, xf2, xf3, LSL #16
4970c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb23, xf3, xf2, ASR #16
4980c1bc742181ded4930842b46e9507372f0b1b963James Dong
4990c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra45, xf4, xf5, LSL #16
5000c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb45, xf5, xf4, ASR #16
5010c1bc742181ded4930842b46e9507372f0b1b963James Dong
5020c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra67, xf6, xf7, LSL #16
5030c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb67, xf7, xf6, ASR #16
5040c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Two 16-byte stores (lane a, then lane b) with pDest write-back.
        ;// The ADDS between them sets the carry flag tested by BCC below;
        ;// STMIA does not modify the flags.
5050c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pDest!, {ra01, ra23, ra45, ra67}
5060c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
5070c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pDest!, {rb01, rb23, rb45, rb67}
5080c1bc742181ded4930842b46e9507372f0b1b963James Dong
5090c1bc742181ded4930842b46e9507372f0b1b963James Dong        BCC     v6_idct_col$_F          ;// no carry out of the counter yet: next column pair
5100c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUB     pSrc, pDest, #(64*2)    ;// pSrc = pDest - 128 bytes (64 halfwords back)
5110c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR   pDest, ppDest           ;// M_LDR is a macro defined elsewhere; presumably reloads the saved output pointer
5120c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$stride"="s"
5130c1bc742181ded4930842b46e9507372f0b1b963James Dong            M_LDR   pScale, pStride     ;// run-time stride variant: the row pass adds pScale to pDest
5140c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
        ;// No branch here: execution falls through into v6_idct_row$_F.
5150c1bc742181ded4930842b46e9507372f0b1b963James Dong
5160c1bc742181ded4930842b46e9507372f0b1b963James Dong
5170c1bc742181ded4930842b46e9507372f0b1b963James Dongv6_idct_row$_F
        ;// Row-pass loop. Each iteration works on two 16-bit lanes packed in
        ;// every register: input jk is read as one word from [pSrc + k*16],
        ;// pSrc then advances by 4 bytes, and the eight results per lane are
        ;// saturated/packed according to $outsize and stored as two output
        ;// rows (lane a, then lane b), stepping pDest by the stride each time.
        ;// The stride is pScale when $stride is s, otherwise the constant
        ;// $stride. The loop repeats until the ADDS on LoopRR2 carries out.
        ;// Per the load comments, j7 arrives pre-scaled by 4 and j6 by 2 and
        ;// is brought back with rounding SHADD16 steps.
5180c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3, rows4to7 x1/4
5190c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xit, =0x00010001        ;// rounding constant
5200c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi0, [pSrc, #1*16]      ;// j1
5210c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi1, [pSrc, #7*16]      ;// 4*j7
5220c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi2, [pSrc, #5*16]      ;// j5
5230c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi3, [pSrc, #3*16]      ;// j3
5240c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// SHADD16 with xit (1 in each lane) computes (x+1)>>1 per lane.
5250c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi1, xi1, xit           ;// 2*j7
5260c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi1, xi1, xit           ;// j7
5270c1bc742181ded4930842b46e9507372f0b1b963James Dong
5280c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
5290c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi6, xi0, xi1           ;// j1-j7
5300c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
5310c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi4, xi2, xi3           ;// j5-j3
5320c1bc742181ded4930842b46e9507372f0b1b963James Dong
5330c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
5340c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Regroup so each register holds the (i4,i6) pair of a single lane,
        ;// ready for the dual 16x16 multiplies against csPiBy8.
5350c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
5360c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
5370c1bc742181ded4930842b46e9507372f0b1b963James Dong
5380c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
5390c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
5400c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
5410c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
5420c1bc742181ded4930842b46e9507372f0b1b963James Dong
5430c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  xi1, xi3, LoopRR2       ;// bottom lane * LoopRR2[15:0]
5440c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  xi3, xi3, LoopRR2       ;// top lane    * LoopRR2[15:0]
5450c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Keep the high 16 bits of each 32-bit result: row b in the top
        ;// halfword, row a in the bottom halfword.
5460c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
5470c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
5480c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
5490c1bc742181ded4930842b46e9507372f0b1b963James Dong
5500c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     xi3, xi3, LSL #1        ;// top halfword now holds product >> 15
5510c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
5520c1bc742181ded4930842b46e9507372f0b1b963James Dong
5530c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// xi0,xi1,xi2,xi3 now free
5540c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3, rows 2to3 x1/2
5550c1bc742181ded4930842b46e9507372f0b1b963James Dong
5560c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi0, [pSrc, #2*16]      ;// j2
5570c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi1, [pSrc, #6*16]      ;// 2*j6
5580c1bc742181ded4930842b46e9507372f0b1b963James Dong
5590c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2, rows4to7
5600c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg6, xh6, xh7
5610c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg5, xh5, xg6
5620c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg4, xh4, xg5
5630c1bc742181ded4930842b46e9507372f0b1b963James Dong
5640c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi1, xi1, xit           ;// j6
5650c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
5660c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
5670c1bc742181ded4930842b46e9507372f0b1b963James Dong
5680c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  xi0, xi2, LoopRR2       ;// bottom lane * LoopRR2[15:0]
5690c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  xi2, xi2, LoopRR2       ;// top lane    * LoopRR2[15:0]
5700c1bc742181ded4930842b46e9507372f0b1b963James Dong
5710c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     xi2, xi2, LSL #1        ;// top halfword now holds product >> 15
5720c1bc742181ded4930842b46e9507372f0b1b963James Dong
5730c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
5740c1bc742181ded4930842b46e9507372f0b1b963James Dong
5750c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// xi0, xi1 now free
5760c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3 rows 0to1 x 1/2
5770c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi1, [pSrc, #4*16]      ;// j4
5780c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi0, [pSrc], #4         ;// j0
        ;// The j0 load post-increments pSrc by 4 bytes (one word = two
        ;// halfword lanes) for the next iteration.
5790c1bc742181ded4930842b46e9507372f0b1b963James Dong
5800c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xh2, xh2, xi3
5810c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
5820c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// xit shifted left by 2 is 0x00040004, i.e. +4 in each lane.
        ;// NOTE(review): this is a plain 32-bit ADD on packed halfwords, so a
        ;// carry out of the low lane would spill into the high lane --
        ;// confirm that is harmless for the j0 values reaching this point.
5830c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADD     xi0, xi0, xit, LSL #2   ;// ensure correct round
5840c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xh0, xi0, xi1           ;// of DC result
5850c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xh1, xi0, xi1
5860c1bc742181ded4930842b46e9507372f0b1b963James Dong
5870c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2 rows 0to3 x 1/2
5880c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg2, xh1, xh2
5890c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg1, xh1, xh2
5900c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg3, xh0, xh3
5910c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg0, xh0, xh3
5920c1bc742181ded4930842b46e9507372f0b1b963James Dong
5930c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 1 all rows
        ;// Halving butterflies: xf[k] = (xg[k] + xg[7-k])/2 and
        ;// xf[7-k] = (xg[k] - xg[7-k])/2 for k = 0..3, per lane.
5940c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xf3, xg3, xg4
5950c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xf4, xg3, xg4
5960c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xf2, xg2, xg5
5970c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xf5, xg2, xg5
5980c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xf1, xg1, xg6
5990c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xf6, xg1, xg6
6000c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xf0, xg0, xg7
6010c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xf7, xg0, xg7
6020c1bc742181ded4930842b46e9507372f0b1b963James Dong
6030c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Saturate
        ;// u8: clamp each lane to 0..255; s9: clamp each lane to -256..255.
        ;// The s16 variant has no saturation step and is stored as computed.
6040c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="u8")
6050c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf0, #8, xf0
6060c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf1, #8, xf1
6070c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf2, #8, xf2
6080c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf3, #8, xf3
6090c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf4, #8, xf4
6100c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf5, #8, xf5
6110c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf6, #8, xf6
6120c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf7, #8, xf7
6130c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
6140c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="s9")
6150c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf0, #9, xf0
6160c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf1, #9, xf1
6170c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf2, #9, xf2
6180c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf3, #9, xf3
6190c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf4, #9, xf4
6200c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf5, #9, xf5
6210c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf6, #9, xf6
6220c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf7, #9, xf7
6230c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
6240c1bc742181ded4930842b46e9507372f0b1b963James Dong
6250c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Transpose to Row, Pack and store
6260c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="u8")
            ;// After USAT16 each halfword is 0..255, so shifting the odd
            ;// register left by 8 and ORing interleaves bytes without overlap.
6270c1bc742181ded4930842b46e9507372f0b1b963James Dong            ORR     xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
6280c1bc742181ded4930842b46e9507372f0b1b963James Dong            ORR     xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
6290c1bc742181ded4930842b46e9507372f0b1b963James Dong            ORR     xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
6300c1bc742181ded4930842b46e9507372f0b1b963James Dong            ORR     xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
6310c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra01, xf0, xf2, LSL #16 ;// [ a3 a2 a1 a0 ]
6320c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb01, xf2, xf0, ASR #16 ;// [ b3 b2 b1 b0 ]
6330c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra23, xf4, xf6, LSL #16 ;// [ a7 a6 a5 a4 ]
6340c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb23, xf6, xf4, ASR #16 ;// [ b7 b6 b5 b4 ]
6350c1bc742181ded4930842b46e9507372f0b1b963James Dong            STMIA   pDest, {ra01, ra23}     ;// 8 bytes of row a
6360c1bc742181ded4930842b46e9507372f0b1b963James Dong            IF "$stride"="s"
6370c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, pScale
6380c1bc742181ded4930842b46e9507372f0b1b963James Dong                STMIA   pDest, {rb01, rb23} ;// 8 bytes of row b
6390c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, pScale
6400c1bc742181ded4930842b46e9507372f0b1b963James Dong            ELSE
6410c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, #($stride)
6420c1bc742181ded4930842b46e9507372f0b1b963James Dong                STMIA   pDest, {rb01, rb23} ;// 8 bytes of row b
6430c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, #($stride)
6440c1bc742181ded4930842b46e9507372f0b1b963James Dong            ENDIF
6450c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
6460c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="s9"):LOR:("$outsize"="s16")
            ;// 16-bit outputs: raXY takes the bottom halfwords of xfX/xfY
            ;// (row a), rbXY the top halfwords (row b); 16 bytes per row.
6470c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra01, xf0, xf1, LSL #16
6480c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb01, xf1, xf0, ASR #16
6490c1bc742181ded4930842b46e9507372f0b1b963James Dong
6500c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra23, xf2, xf3, LSL #16
6510c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb23, xf3, xf2, ASR #16
6520c1bc742181ded4930842b46e9507372f0b1b963James Dong
6530c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra45, xf4, xf5, LSL #16
6540c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb45, xf5, xf4, ASR #16
6550c1bc742181ded4930842b46e9507372f0b1b963James Dong
6560c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra67, xf6, xf7, LSL #16
6570c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb67, xf7, xf6, ASR #16
6580c1bc742181ded4930842b46e9507372f0b1b963James Dong
6590c1bc742181ded4930842b46e9507372f0b1b963James Dong            STMIA   pDest, {ra01, ra23, ra45, ra67}
6600c1bc742181ded4930842b46e9507372f0b1b963James Dong            IF "$stride"="s"
6610c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, pScale
6620c1bc742181ded4930842b46e9507372f0b1b963James Dong                STMIA   pDest, {rb01, rb23, rb45, rb67}
6630c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, pScale
6640c1bc742181ded4930842b46e9507372f0b1b963James Dong            ELSE
6650c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, #($stride)
6660c1bc742181ded4930842b46e9507372f0b1b963James Dong                STMIA   pDest, {rb01, rb23, rb45, rb67}
6670c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, #($stride)
6680c1bc742181ded4930842b46e9507372f0b1b963James Dong            ENDIF
6690c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
6700c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// The carry tested here comes from the ADDS on LoopRR2 above; no
        ;// instruction in between carries an S suffix or is a compare.
6710c1bc742181ded4930842b46e9507372f0b1b963James Dong        BCC     v6_idct_row$_F
6720c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF ;// ARM1136JS
6730c1bc742181ded4930842b46e9507372f0b1b963James Dong
6740c1bc742181ded4930842b46e9507372f0b1b963James Dong
6750c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF CortexA8
6760c1bc742181ded4930842b46e9507372f0b1b963James Dong
6770c1bc742181ded4930842b46e9507372f0b1b963James DongSrc0            EQU  7          ;// Src0..Src7 and Tmp: register numbers 7..15 used to build the aliases below
6780c1bc742181ded4930842b46e9507372f0b1b963James DongSrc1            EQU  8
6790c1bc742181ded4930842b46e9507372f0b1b963James DongSrc2            EQU  9
6800c1bc742181ded4930842b46e9507372f0b1b963James DongSrc3            EQU  10
6810c1bc742181ded4930842b46e9507372f0b1b963James DongSrc4            EQU  11
6820c1bc742181ded4930842b46e9507372f0b1b963James DongSrc5            EQU  12
6830c1bc742181ded4930842b46e9507372f0b1b963James DongSrc6            EQU  13
6840c1bc742181ded4930842b46e9507372f0b1b963James DongSrc7            EQU  14
6850c1bc742181ded4930842b46e9507372f0b1b963James DongTmp             EQU  15
6860c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// Quadword aliases typed as signed 16-bit lanes: qXj0..qXj7 name Q7..Q14
;// and qXjt names Q15.
6870c1bc742181ded4930842b46e9507372f0b1b963James DongqXj0            QN Src0.S16     ;// Q7
6880c1bc742181ded4930842b46e9507372f0b1b963James DongqXj1            QN Src1.S16     ;// Q8
6890c1bc742181ded4930842b46e9507372f0b1b963James DongqXj2            QN Src2.S16     ;// Q9
6900c1bc742181ded4930842b46e9507372f0b1b963James DongqXj3            QN Src3.S16     ;// Q10
6910c1bc742181ded4930842b46e9507372f0b1b963James DongqXj4            QN Src4.S16     ;// Q11
6920c1bc742181ded4930842b46e9507372f0b1b963James DongqXj5            QN Src5.S16     ;// Q12
6930c1bc742181ded4930842b46e9507372f0b1b963James DongqXj6            QN Src6.S16     ;// Q13
6940c1bc742181ded4930842b46e9507372f0b1b963James DongqXj7            QN Src7.S16     ;// Q14
6950c1bc742181ded4930842b46e9507372f0b1b963James DongqXjt            QN Tmp.S16      ;// Q15
6960c1bc742181ded4930842b46e9507372f0b1b963James Dong
6970c1bc742181ded4930842b46e9507372f0b1b963James DongdXj0lo          DN (Src0*2).S16      ;// D14: D(2n) is the low 64 bits of Qn
6980c1bc742181ded4930842b46e9507372f0b1b963James DongdXj0hi          DN (Src0*2+1).S16    ;// D15: D(2n+1) is the high 64 bits of Qn
6990c1bc742181ded4930842b46e9507372f0b1b963James DongdXj1lo          DN (Src1*2).S16      ;// D16
7000c1bc742181ded4930842b46e9507372f0b1b963James DongdXj1hi          DN (Src1*2+1).S16    ;// D17
7010c1bc742181ded4930842b46e9507372f0b1b963James DongdXj2lo          DN (Src2*2).S16      ;// D18
7020c1bc742181ded4930842b46e9507372f0b1b963James DongdXj2hi          DN (Src2*2+1).S16    ;// D19
7030c1bc742181ded4930842b46e9507372f0b1b963James DongdXj3lo          DN (Src3*2).S16      ;// D20
7040c1bc742181ded4930842b46e9507372f0b1b963James DongdXj3hi          DN (Src3*2+1).S16    ;// D21
7050c1bc742181ded4930842b46e9507372f0b1b963James DongdXj4lo          DN (Src4*2).S16      ;// D22
7060c1bc742181ded4930842b46e9507372f0b1b963James DongdXj4hi          DN (Src4*2+1).S16    ;// D23
7070c1bc742181ded4930842b46e9507372f0b1b963James DongdXj5lo          DN (Src5*2).S16      ;// D24
7080c1bc742181ded4930842b46e9507372f0b1b963James DongdXj5hi          DN (Src5*2+1).S16    ;// D25
7090c1bc742181ded4930842b46e9507372f0b1b963James DongdXj6lo          DN (Src6*2).S16      ;// D26
7100c1bc742181ded4930842b46e9507372f0b1b963James DongdXj6hi          DN (Src6*2+1).S16    ;// D27
7110c1bc742181ded4930842b46e9507372f0b1b963James DongdXj7lo          DN (Src7*2).S16      ;// D28
7120c1bc742181ded4930842b46e9507372f0b1b963James DongdXj7hi          DN (Src7*2+1).S16    ;// D29
7130c1bc742181ded4930842b46e9507372f0b1b963James DongdXjtlo          DN (Tmp*2).S16       ;// D30
7140c1bc742181ded4930842b46e9507372f0b1b963James DongdXjthi          DN (Tmp*2+1).S16     ;// D31
7150c1bc742181ded4930842b46e9507372f0b1b963James Dong
7160c1bc742181ded4930842b46e9507372f0b1b963James DongqXi0            QN qXj0         ;// Q7
;// The qXi* names are a second set of names for the same nine Q registers
;// as qXj0..qXj7/qXjt, in permuted order; no additional registers are used.
7170c1bc742181ded4930842b46e9507372f0b1b963James DongqXi1            QN qXj4         ;// Q11
7180c1bc742181ded4930842b46e9507372f0b1b963James DongqXi2            QN qXj2         ;// Q9
7190c1bc742181ded4930842b46e9507372f0b1b963James DongqXi3            QN qXj7         ;// Q14
7200c1bc742181ded4930842b46e9507372f0b1b963James DongqXi4            QN qXj5         ;// Q12
7210c1bc742181ded4930842b46e9507372f0b1b963James DongqXi5            QN qXjt         ;// Q15
7220c1bc742181ded4930842b46e9507372f0b1b963James DongqXi6            QN qXj1         ;// Q8
7230c1bc742181ded4930842b46e9507372f0b1b963James DongqXi7            QN qXj6         ;// Q13
7240c1bc742181ded4930842b46e9507372f0b1b963James DongqXit            QN qXj3         ;// Q10
7250c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// Doubleword halves of the qXi* names, following the same permutation.
7260c1bc742181ded4930842b46e9507372f0b1b963James DongdXi0lo          DN dXj0lo
7270c1bc742181ded4930842b46e9507372f0b1b963James DongdXi0hi          DN dXj0hi
7280c1bc742181ded4930842b46e9507372f0b1b963James DongdXi1lo          DN dXj4lo
7290c1bc742181ded4930842b46e9507372f0b1b963James DongdXi1hi          DN dXj4hi
7300c1bc742181ded4930842b46e9507372f0b1b963James DongdXi2lo          DN dXj2lo
7310c1bc742181ded4930842b46e9507372f0b1b963James DongdXi2hi          DN dXj2hi
7320c1bc742181ded4930842b46e9507372f0b1b963James DongdXi3lo          DN dXj7lo
7330c1bc742181ded4930842b46e9507372f0b1b963James DongdXi3hi          DN dXj7hi
7340c1bc742181ded4930842b46e9507372f0b1b963James DongdXi4lo          DN dXj5lo
7350c1bc742181ded4930842b46e9507372f0b1b963James DongdXi4hi          DN dXj5hi
7360c1bc742181ded4930842b46e9507372f0b1b963James DongdXi5lo          DN dXjtlo
7370c1bc742181ded4930842b46e9507372f0b1b963James DongdXi5hi          DN dXjthi
7380c1bc742181ded4930842b46e9507372f0b1b963James DongdXi6lo          DN dXj1lo
7390c1bc742181ded4930842b46e9507372f0b1b963James DongdXi6hi          DN dXj1hi
7400c1bc742181ded4930842b46e9507372f0b1b963James DongdXi7lo          DN dXj6lo
7410c1bc742181ded4930842b46e9507372f0b1b963James DongdXi7hi          DN dXj6hi
7420c1bc742181ded4930842b46e9507372f0b1b963James DongdXitlo          DN dXj3lo
7430c1bc742181ded4930842b46e9507372f0b1b963James DongdXithi          DN dXj3hi
7440c1bc742181ded4930842b46e9507372f0b1b963James Dong
7450c1bc742181ded4930842b46e9507372f0b1b963James DongqXh0            QN qXit         ;// Q10
;// The qXh* names are a third set of names for the same nine Q registers,
;// defined in terms of the qXi* aliases; again a permutation, no new registers.
7460c1bc742181ded4930842b46e9507372f0b1b963James DongqXh1            QN qXi0         ;// Q7
7470c1bc742181ded4930842b46e9507372f0b1b963James DongqXh2            QN qXi2         ;// Q9
7480c1bc742181ded4930842b46e9507372f0b1b963James DongqXh3            QN qXi3         ;// Q14
7490c1bc742181ded4930842b46e9507372f0b1b963James DongqXh4            QN qXi7         ;// Q13
7500c1bc742181ded4930842b46e9507372f0b1b963James DongqXh5            QN qXi5         ;// Q15
7510c1bc742181ded4930842b46e9507372f0b1b963James DongqXh6            QN qXi4         ;// Q12
7520c1bc742181ded4930842b46e9507372f0b1b963James DongqXh7            QN qXi1         ;// Q11
7530c1bc742181ded4930842b46e9507372f0b1b963James DongqXht            QN qXi6         ;// Q8
7540c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// Doubleword halves of the qXh* names, following the same permutation.
7550c1bc742181ded4930842b46e9507372f0b1b963James DongdXh0lo          DN dXitlo
7560c1bc742181ded4930842b46e9507372f0b1b963James DongdXh0hi          DN dXithi
7570c1bc742181ded4930842b46e9507372f0b1b963James DongdXh1lo          DN dXi0lo
7580c1bc742181ded4930842b46e9507372f0b1b963James DongdXh1hi          DN dXi0hi
7590c1bc742181ded4930842b46e9507372f0b1b963James DongdXh2lo          DN dXi2lo
7600c1bc742181ded4930842b46e9507372f0b1b963James DongdXh2hi          DN dXi2hi
7610c1bc742181ded4930842b46e9507372f0b1b963James DongdXh3lo          DN dXi3lo
7620c1bc742181ded4930842b46e9507372f0b1b963James DongdXh3hi          DN dXi3hi
7630c1bc742181ded4930842b46e9507372f0b1b963James DongdXh4lo          DN dXi7lo
7640c1bc742181ded4930842b46e9507372f0b1b963James DongdXh4hi          DN dXi7hi
7650c1bc742181ded4930842b46e9507372f0b1b963James DongdXh5lo          DN dXi5lo
7660c1bc742181ded4930842b46e9507372f0b1b963James DongdXh5hi          DN dXi5hi
7670c1bc742181ded4930842b46e9507372f0b1b963James DongdXh6lo          DN dXi4lo
7680c1bc742181ded4930842b46e9507372f0b1b963James DongdXh6hi          DN dXi4hi
7690c1bc742181ded4930842b46e9507372f0b1b963James DongdXh7lo          DN dXi1lo
7700c1bc742181ded4930842b46e9507372f0b1b963James DongdXh7hi          DN dXi1hi
7710c1bc742181ded4930842b46e9507372f0b1b963James DongdXhtlo          DN dXi6lo
7720c1bc742181ded4930842b46e9507372f0b1b963James DongdXhthi          DN dXi6hi
7730c1bc742181ded4930842b46e9507372f0b1b963James Dong
7740c1bc742181ded4930842b46e9507372f0b1b963James DongqXg0            QN qXh2
7750c1bc742181ded4930842b46e9507372f0b1b963James DongqXg1            QN qXht
7760c1bc742181ded4930842b46e9507372f0b1b963James DongqXg2            QN qXh1
7770c1bc742181ded4930842b46e9507372f0b1b963James DongqXg3            QN qXh0
7780c1bc742181ded4930842b46e9507372f0b1b963James DongqXg4            QN qXh4
7790c1bc742181ded4930842b46e9507372f0b1b963James DongqXg5            QN qXh5
7800c1bc742181ded4930842b46e9507372f0b1b963James DongqXg6            QN qXh6
7810c1bc742181ded4930842b46e9507372f0b1b963James DongqXg7            QN qXh7
7820c1bc742181ded4930842b46e9507372f0b1b963James DongqXgt            QN qXh3
7830c1bc742181ded4930842b46e9507372f0b1b963James Dong
7840c1bc742181ded4930842b46e9507372f0b1b963James DongqXf0            QN qXg6
7850c1bc742181ded4930842b46e9507372f0b1b963James DongqXf1            QN qXg5
7860c1bc742181ded4930842b46e9507372f0b1b963James DongqXf2            QN qXg4
7870c1bc742181ded4930842b46e9507372f0b1b963James DongqXf3            QN qXgt
7880c1bc742181ded4930842b46e9507372f0b1b963James DongqXf4            QN qXg3
7890c1bc742181ded4930842b46e9507372f0b1b963James DongqXf5            QN qXg2
7900c1bc742181ded4930842b46e9507372f0b1b963James DongqXf6            QN qXg1
7910c1bc742181ded4930842b46e9507372f0b1b963James DongqXf7            QN qXg0
7920c1bc742181ded4930842b46e9507372f0b1b963James DongqXft            QN qXg7
7930c1bc742181ded4930842b46e9507372f0b1b963James Dong
7940c1bc742181ded4930842b46e9507372f0b1b963James Dong
7950c1bc742181ded4930842b46e9507372f0b1b963James DongqXt0            QN 1.S32
7960c1bc742181ded4930842b46e9507372f0b1b963James DongqXt1            QN 2.S32
7970c1bc742181ded4930842b46e9507372f0b1b963James DongqT0lo           QN 1.S32
7980c1bc742181ded4930842b46e9507372f0b1b963James DongqT0hi           QN 2.S32
7990c1bc742181ded4930842b46e9507372f0b1b963James DongqT1lo           QN 3.S32
8000c1bc742181ded4930842b46e9507372f0b1b963James DongqT1hi           QN 4.S32
8010c1bc742181ded4930842b46e9507372f0b1b963James DongqScalelo        QN 5.S32        ;// used to read post scale values
8020c1bc742181ded4930842b46e9507372f0b1b963James DongqScalehi        QN 6.S32
8030c1bc742181ded4930842b46e9507372f0b1b963James DongqTemp0          QN 5.S32
8040c1bc742181ded4930842b46e9507372f0b1b963James DongqTemp1          QN 6.S32
8050c1bc742181ded4930842b46e9507372f0b1b963James Dong
8060c1bc742181ded4930842b46e9507372f0b1b963James Dong
8070c1bc742181ded4930842b46e9507372f0b1b963James DongScale1          EQU 6
8080c1bc742181ded4930842b46e9507372f0b1b963James DongScale2          EQU 15
8090c1bc742181ded4930842b46e9507372f0b1b963James DongqScale1         QN Scale1.S16
8100c1bc742181ded4930842b46e9507372f0b1b963James DongqScale2         QN Scale2.S16
8110c1bc742181ded4930842b46e9507372f0b1b963James DongdScale1lo       DN (Scale1*2).S16
8120c1bc742181ded4930842b46e9507372f0b1b963James DongdScale1hi       DN (Scale1*2+1).S16
8130c1bc742181ded4930842b46e9507372f0b1b963James DongdScale2lo       DN (Scale2*2).S16
8140c1bc742181ded4930842b46e9507372f0b1b963James DongdScale2hi       DN (Scale2*2+1).S16
8150c1bc742181ded4930842b46e9507372f0b1b963James Dong
8160c1bc742181ded4930842b46e9507372f0b1b963James DongdCoefs          DN 0.S16        ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
8170c1bc742181ded4930842b46e9507372f0b1b963James DongInvSqrt2        DN dCoefs[0]    ;// 1/sqrt(2) in Q15
8180c1bc742181ded4930842b46e9507372f0b1b963James DongS               DN dCoefs[1]    ;// Sin(PI/8) in Q15
8190c1bc742181ded4930842b46e9507372f0b1b963James DongC               DN dCoefs[2]    ;// Cos(PI/8) in Q15
8200c1bc742181ded4930842b46e9507372f0b1b963James Dong
8210c1bc742181ded4930842b46e9507372f0b1b963James DongpTemp           RN 12
8220c1bc742181ded4930842b46e9507372f0b1b963James Dong
8230c1bc742181ded4930842b46e9507372f0b1b963James Dong
8240c1bc742181ded4930842b46e9507372f0b1b963James Dong        IMPORT  armCOMM_IDCTCoef
8250c1bc742181ded4930842b46e9507372f0b1b963James Dong
8260c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        {qXj0,qXj1}, [pSrc @64]!
8270c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        {qXj2,qXj3}, [pSrc @64]!
8280c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        {qXj4,qXj5}, [pSrc @64]!
8290c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        {qXj6,qXj7}, [pSrc @64]!
8300c1bc742181ded4930842b46e9507372f0b1b963James Dong
8310c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Load PreScale and multiply with Src
8320c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4
8330c1bc742181ded4930842b46e9507372f0b1b963James Dong
8340c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s16"                         ;// 16X16 Mul
8350c1bc742181ded4930842b46e9507372f0b1b963James Dong            M_IDCT_PRESCALE16
8360c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
8370c1bc742181ded4930842b46e9507372f0b1b963James Dong
8380c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s32"                         ;// 32X32 Mul
8390c1bc742181ded4930842b46e9507372f0b1b963James Dong            M_IDCT_PRESCALE32
8400c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
8410c1bc742181ded4930842b46e9507372f0b1b963James Dong
8420c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 3
8430c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQDMULH     qXi2, qXi2, InvSqrt2            ;// i2/sqrt(2)
8440c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXh0, qXi0, qXi1                ;// (i0+i1)/2
8450c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qXh1, qXi0, qXi1                ;// (i0-i1)/2
8460c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXh7, qXi5, qXi7                ;// (i5+i7)/4
8470c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXh5, qXi5, qXi7                ;// (i5-i7)/2
8480c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQDMULH     qXh5, qXh5, InvSqrt2            ;// h5/sqrt(2)
8490c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXh2, qXi2, qXi3                ;// h2, h3
8500c1bc742181ded4930842b46e9507372f0b1b963James Dong
8510c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt0, dXi4lo, C                 ;// c*i4 (lo half, widened to S32)
8520c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLAL       qXt0, dXi6lo, S                 ;// c*i4+s*i6
8530c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt1, dXi4hi, C                 ;// same rotation for the hi half
8540c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLAL       qXt1, dXi6hi, S
8550c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dXh4lo, qXt0, #16               ;// h4 = top 16 bits; h4 reuses the register that held i7
8560c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dXh4hi, qXt1, #16
8570c1bc742181ded4930842b46e9507372f0b1b963James Dong
8580c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt0, dXi6lo, C                 ;// c*i6
8590c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLSL       qXt0, dXi4lo, S                 ;// -s*i4 + c*i6
8600c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt1, dXi6hi, C                 ;// same rotation for the hi half
8610c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLSL       qXt1, dXi4hi, S
8620c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dXh6lo, qXt0, #16               ;// h6 reuses i4's register: written only after both i4 halves are read
8630c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dXh6hi, qXt1, #16
8640c1bc742181ded4930842b46e9507372f0b1b963James Dong
8650c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2
8660c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXg6, qXh6, qXh7
8670c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXg5, qXh5, qXg6
8680c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXg4, qXh4, qXg5
8690c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXg1, qXh1, qXh2        ;// (h1+h2)/2
8700c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qXg2, qXh1, qXh2        ;// (h1-h2)/2
8710c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXg0, qXh0, qXh3        ;// (h0+h3)/2
8720c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qXg3, qXh0, qXh3        ;// (h0-h3)/2
8730c1bc742181ded4930842b46e9507372f0b1b963James Dong
8740c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 1 all rows
8750c1bc742181ded4930842b46e9507372f0b1b963James Dong        VADD        qXf3, qXg3, qXg4
8760c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXf4, qXg3, qXg4
8770c1bc742181ded4930842b46e9507372f0b1b963James Dong        VADD        qXf2, qXg2, qXg5
8780c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXf5, qXg2, qXg5
8790c1bc742181ded4930842b46e9507372f0b1b963James Dong        VADD        qXf1, qXg1, qXg6
8800c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXf6, qXg1, qXg6
8810c1bc742181ded4930842b46e9507372f0b1b963James Dong        VADD        qXf0, qXg0, qXg7
8820c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXf7, qXg0, qXg7
8830c1bc742181ded4930842b46e9507372f0b1b963James Dong
8840c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Transpose, store and loop
8850c1bc742181ded4930842b46e9507372f0b1b963James DongXTR0            EQU Src5
8860c1bc742181ded4930842b46e9507372f0b1b963James DongXTR1            EQU Tmp
8870c1bc742181ded4930842b46e9507372f0b1b963James DongXTR2            EQU Src6
8880c1bc742181ded4930842b46e9507372f0b1b963James DongXTR3            EQU Src7
8890c1bc742181ded4930842b46e9507372f0b1b963James DongXTR4            EQU Src3
8900c1bc742181ded4930842b46e9507372f0b1b963James DongXTR5            EQU Src0
8910c1bc742181ded4930842b46e9507372f0b1b963James DongXTR6            EQU Src1
8920c1bc742181ded4930842b46e9507372f0b1b963James DongXTR7            EQU Src2
8930c1bc742181ded4930842b46e9507372f0b1b963James DongXTRt            EQU Src4
8940c1bc742181ded4930842b46e9507372f0b1b963James Dong
8950c1bc742181ded4930842b46e9507372f0b1b963James DongqA0             QN  XTR0.S32  ;// S32 views of the XTR registers, used by the 32-bit VTRN transpose step
8960c1bc742181ded4930842b46e9507372f0b1b963James DongqA1             QN  XTR1.S32
8970c1bc742181ded4930842b46e9507372f0b1b963James DongqA2             QN  XTR2.S32
8980c1bc742181ded4930842b46e9507372f0b1b963James DongqA3             QN  XTR3.S32
8990c1bc742181ded4930842b46e9507372f0b1b963James DongqA4             QN  XTR4.S32
9000c1bc742181ded4930842b46e9507372f0b1b963James DongqA5             QN  XTR5.S32
9010c1bc742181ded4930842b46e9507372f0b1b963James DongqA6             QN  XTR6.S32
9020c1bc742181ded4930842b46e9507372f0b1b963James DongqA7             QN  XTR7.S32
9030c1bc742181ded4930842b46e9507372f0b1b963James Dong
9040c1bc742181ded4930842b46e9507372f0b1b963James DongdB0             DN  XTR0*2+1      ;// for using VSWP
9050c1bc742181ded4930842b46e9507372f0b1b963James DongdB1             DN  XTR1*2+1
9060c1bc742181ded4930842b46e9507372f0b1b963James DongdB2             DN  XTR2*2+1
9070c1bc742181ded4930842b46e9507372f0b1b963James DongdB3             DN  XTR3*2+1
9080c1bc742181ded4930842b46e9507372f0b1b963James DongdB4             DN  XTR4*2
9090c1bc742181ded4930842b46e9507372f0b1b963James DongdB5             DN  XTR5*2
9100c1bc742181ded4930842b46e9507372f0b1b963James DongdB6             DN  XTR6*2
9110c1bc742181ded4930842b46e9507372f0b1b963James DongdB7             DN  XTR7*2
9120c1bc742181ded4930842b46e9507372f0b1b963James Dong
9130c1bc742181ded4930842b46e9507372f0b1b963James Dong
9140c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qXf0, qXf1
9150c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qXf2, qXf3
9160c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qXf4, qXf5
9170c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qXf6, qXf7
9180c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qA0, qA2
9190c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qA1, qA3
9200c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qA4, qA6
9210c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qA5, qA7
9220c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dB0, dB4
9230c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dB1, dB5
9240c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dB2, dB6
9250c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dB3, dB7
9260c1bc742181ded4930842b46e9507372f0b1b963James Dong
9270c1bc742181ded4930842b46e9507372f0b1b963James Dong
9280c1bc742181ded4930842b46e9507372f0b1b963James DongqYj0            QN qXf0
9290c1bc742181ded4930842b46e9507372f0b1b963James DongqYj1            QN qXf1
9300c1bc742181ded4930842b46e9507372f0b1b963James DongqYj2            QN qXf2
9310c1bc742181ded4930842b46e9507372f0b1b963James DongqYj3            QN qXf3
9320c1bc742181ded4930842b46e9507372f0b1b963James DongqYj4            QN qXf4
9330c1bc742181ded4930842b46e9507372f0b1b963James DongqYj5            QN qXf5
9340c1bc742181ded4930842b46e9507372f0b1b963James DongqYj6            QN qXf6
9350c1bc742181ded4930842b46e9507372f0b1b963James DongqYj7            QN qXf7
9360c1bc742181ded4930842b46e9507372f0b1b963James DongqYjt            QN qXft
9370c1bc742181ded4930842b46e9507372f0b1b963James Dong
9380c1bc742181ded4930842b46e9507372f0b1b963James DongdYj0lo          DN (XTR0*2).S16
9390c1bc742181ded4930842b46e9507372f0b1b963James DongdYj0hi          DN (XTR0*2+1).S16
9400c1bc742181ded4930842b46e9507372f0b1b963James DongdYj1lo          DN (XTR1*2).S16
9410c1bc742181ded4930842b46e9507372f0b1b963James DongdYj1hi          DN (XTR1*2+1).S16
9420c1bc742181ded4930842b46e9507372f0b1b963James DongdYj2lo          DN (XTR2*2).S16
9430c1bc742181ded4930842b46e9507372f0b1b963James DongdYj2hi          DN (XTR2*2+1).S16
9440c1bc742181ded4930842b46e9507372f0b1b963James DongdYj3lo          DN (XTR3*2).S16
9450c1bc742181ded4930842b46e9507372f0b1b963James DongdYj3hi          DN (XTR3*2+1).S16
9460c1bc742181ded4930842b46e9507372f0b1b963James DongdYj4lo          DN (XTR4*2).S16
9470c1bc742181ded4930842b46e9507372f0b1b963James DongdYj4hi          DN (XTR4*2+1).S16
9480c1bc742181ded4930842b46e9507372f0b1b963James DongdYj5lo          DN (XTR5*2).S16
9490c1bc742181ded4930842b46e9507372f0b1b963James DongdYj5hi          DN (XTR5*2+1).S16
9500c1bc742181ded4930842b46e9507372f0b1b963James DongdYj6lo          DN (XTR6*2).S16
9510c1bc742181ded4930842b46e9507372f0b1b963James DongdYj6hi          DN (XTR6*2+1).S16
9520c1bc742181ded4930842b46e9507372f0b1b963James DongdYj7lo          DN (XTR7*2).S16
9530c1bc742181ded4930842b46e9507372f0b1b963James DongdYj7hi          DN (XTR7*2+1).S16
9540c1bc742181ded4930842b46e9507372f0b1b963James DongdYjtlo          DN (XTRt*2).S16
9550c1bc742181ded4930842b46e9507372f0b1b963James DongdYjthi          DN (XTRt*2+1).S16
9560c1bc742181ded4930842b46e9507372f0b1b963James Dong
9570c1bc742181ded4930842b46e9507372f0b1b963James DongqYi0            QN qYj0
9580c1bc742181ded4930842b46e9507372f0b1b963James DongqYi1            QN qYj4
9590c1bc742181ded4930842b46e9507372f0b1b963James DongqYi2            QN qYj2
9600c1bc742181ded4930842b46e9507372f0b1b963James DongqYi3            QN qYj7
9610c1bc742181ded4930842b46e9507372f0b1b963James DongqYi4            QN qYj5
9620c1bc742181ded4930842b46e9507372f0b1b963James DongqYi5            QN qYjt
9630c1bc742181ded4930842b46e9507372f0b1b963James DongqYi6            QN qYj1
9640c1bc742181ded4930842b46e9507372f0b1b963James DongqYi7            QN qYj6
9650c1bc742181ded4930842b46e9507372f0b1b963James DongqYit            QN qYj3
9660c1bc742181ded4930842b46e9507372f0b1b963James Dong
9670c1bc742181ded4930842b46e9507372f0b1b963James DongdYi0lo          DN dYj0lo
9680c1bc742181ded4930842b46e9507372f0b1b963James DongdYi0hi          DN dYj0hi
9690c1bc742181ded4930842b46e9507372f0b1b963James DongdYi1lo          DN dYj4lo
9700c1bc742181ded4930842b46e9507372f0b1b963James DongdYi1hi          DN dYj4hi
9710c1bc742181ded4930842b46e9507372f0b1b963James DongdYi2lo          DN dYj2lo
9720c1bc742181ded4930842b46e9507372f0b1b963James DongdYi2hi          DN dYj2hi
9730c1bc742181ded4930842b46e9507372f0b1b963James DongdYi3lo          DN dYj7lo
9740c1bc742181ded4930842b46e9507372f0b1b963James DongdYi3hi          DN dYj7hi
9750c1bc742181ded4930842b46e9507372f0b1b963James DongdYi4lo          DN dYj5lo
9760c1bc742181ded4930842b46e9507372f0b1b963James DongdYi4hi          DN dYj5hi
9770c1bc742181ded4930842b46e9507372f0b1b963James DongdYi5lo          DN dYjtlo
9780c1bc742181ded4930842b46e9507372f0b1b963James DongdYi5hi          DN dYjthi
9790c1bc742181ded4930842b46e9507372f0b1b963James DongdYi6lo          DN dYj1lo
9800c1bc742181ded4930842b46e9507372f0b1b963James DongdYi6hi          DN dYj1hi
9810c1bc742181ded4930842b46e9507372f0b1b963James DongdYi7lo          DN dYj6lo
9820c1bc742181ded4930842b46e9507372f0b1b963James DongdYi7hi          DN dYj6hi
9830c1bc742181ded4930842b46e9507372f0b1b963James DongdYitlo          DN dYj3lo
9840c1bc742181ded4930842b46e9507372f0b1b963James DongdYithi          DN dYj3hi
9850c1bc742181ded4930842b46e9507372f0b1b963James Dong
9860c1bc742181ded4930842b46e9507372f0b1b963James DongqYh0            QN qYit
9870c1bc742181ded4930842b46e9507372f0b1b963James DongqYh1            QN qYi0
9880c1bc742181ded4930842b46e9507372f0b1b963James DongqYh2            QN qYi2
9890c1bc742181ded4930842b46e9507372f0b1b963James DongqYh3            QN qYi3
9900c1bc742181ded4930842b46e9507372f0b1b963James DongqYh4            QN qYi7
9910c1bc742181ded4930842b46e9507372f0b1b963James DongqYh5            QN qYi5
9920c1bc742181ded4930842b46e9507372f0b1b963James DongqYh6            QN qYi4
9930c1bc742181ded4930842b46e9507372f0b1b963James DongqYh7            QN qYi1
9940c1bc742181ded4930842b46e9507372f0b1b963James DongqYht            QN qYi6
9950c1bc742181ded4930842b46e9507372f0b1b963James Dong
9960c1bc742181ded4930842b46e9507372f0b1b963James DongdYh0lo          DN dYitlo
9970c1bc742181ded4930842b46e9507372f0b1b963James DongdYh0hi          DN dYithi
9980c1bc742181ded4930842b46e9507372f0b1b963James DongdYh1lo          DN dYi0lo
9990c1bc742181ded4930842b46e9507372f0b1b963James DongdYh1hi          DN dYi0hi
10000c1bc742181ded4930842b46e9507372f0b1b963James DongdYh2lo          DN dYi2lo
10010c1bc742181ded4930842b46e9507372f0b1b963James DongdYh2hi          DN dYi2hi
10020c1bc742181ded4930842b46e9507372f0b1b963James DongdYh3lo          DN dYi3lo
10030c1bc742181ded4930842b46e9507372f0b1b963James DongdYh3hi          DN dYi3hi
10040c1bc742181ded4930842b46e9507372f0b1b963James DongdYh4lo          DN dYi7lo
10050c1bc742181ded4930842b46e9507372f0b1b963James DongdYh4hi          DN dYi7hi
10060c1bc742181ded4930842b46e9507372f0b1b963James DongdYh5lo          DN dYi5lo
10070c1bc742181ded4930842b46e9507372f0b1b963James DongdYh5hi          DN dYi5hi
10080c1bc742181ded4930842b46e9507372f0b1b963James DongdYh6lo          DN dYi4lo
10090c1bc742181ded4930842b46e9507372f0b1b963James DongdYh6hi          DN dYi4hi
10100c1bc742181ded4930842b46e9507372f0b1b963James DongdYh7lo          DN dYi1lo
10110c1bc742181ded4930842b46e9507372f0b1b963James DongdYh7hi          DN dYi1hi
10120c1bc742181ded4930842b46e9507372f0b1b963James DongdYhtlo          DN dYi6lo
10130c1bc742181ded4930842b46e9507372f0b1b963James DongdYhthi          DN dYi6hi
10140c1bc742181ded4930842b46e9507372f0b1b963James Dong
10150c1bc742181ded4930842b46e9507372f0b1b963James DongqYg0            QN qYh2
10160c1bc742181ded4930842b46e9507372f0b1b963James DongqYg1            QN qYht
10170c1bc742181ded4930842b46e9507372f0b1b963James DongqYg2            QN qYh1
10180c1bc742181ded4930842b46e9507372f0b1b963James DongqYg3            QN qYh0
10190c1bc742181ded4930842b46e9507372f0b1b963James DongqYg4            QN qYh4
10200c1bc742181ded4930842b46e9507372f0b1b963James DongqYg5            QN qYh5
10210c1bc742181ded4930842b46e9507372f0b1b963James DongqYg6            QN qYh6
10220c1bc742181ded4930842b46e9507372f0b1b963James DongqYg7            QN qYh7
10230c1bc742181ded4930842b46e9507372f0b1b963James DongqYgt            QN qYh3
10240c1bc742181ded4930842b46e9507372f0b1b963James Dong
10250c1bc742181ded4930842b46e9507372f0b1b963James DongqYf0            QN qYg6
10260c1bc742181ded4930842b46e9507372f0b1b963James DongqYf1            QN qYg5
10270c1bc742181ded4930842b46e9507372f0b1b963James DongqYf2            QN qYg4
10280c1bc742181ded4930842b46e9507372f0b1b963James DongqYf3            QN qYgt
10290c1bc742181ded4930842b46e9507372f0b1b963James DongqYf4            QN qYg3
10300c1bc742181ded4930842b46e9507372f0b1b963James DongqYf5            QN qYg2
10310c1bc742181ded4930842b46e9507372f0b1b963James DongqYf6            QN qYg1
10320c1bc742181ded4930842b46e9507372f0b1b963James DongqYf7            QN qYg0
10330c1bc742181ded4930842b46e9507372f0b1b963James DongqYft            QN qYg7
10340c1bc742181ded4930842b46e9507372f0b1b963James Dong
10350c1bc742181ded4930842b46e9507372f0b1b963James Dong        VRSHR       qYj7, qYj7, #2
10360c1bc742181ded4930842b46e9507372f0b1b963James Dong        VRSHR       qYj6, qYj6, #1
10370c1bc742181ded4930842b46e9507372f0b1b963James Dong
10380c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYi5, qYj1, qYj7        ;// i5 = (j1+j7)/2
10390c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYi6, qYj1, qYj7        ;// i6 = j1-j7
10400c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYi3, qYj2, qYj6        ;// i3 = (j2+j6)/2
10410c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYi2, qYj2, qYj6        ;// i2 = j2-j6
10420c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYi7, qYj5, qYj3        ;// i7 = (j5+j3)/2
10430c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYi4, qYj5, qYj3        ;// i4 = j5-j3
10440c1bc742181ded4930842b46e9507372f0b1b963James Dong
10450c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQDMULH     qYi2, qYi2, InvSqrt2    ;// i2/sqrt(2)
10460c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3 rows 0to1 x 1/2
10470c1bc742181ded4930842b46e9507372f0b1b963James Dong
10480c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV         pTemp, #0x4             ;// ensure correct round
10490c1bc742181ded4930842b46e9507372f0b1b963James Dong        VDUP        qScale1, pTemp           ;// of DC result
10500c1bc742181ded4930842b46e9507372f0b1b963James Dong        VADD        qYi0, qYi0, qScale1
10510c1bc742181ded4930842b46e9507372f0b1b963James Dong
10520c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYh0, qYi0, qYi1        ;// (i0+i1)/2
10530c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qYh1, qYi0, qYi1        ;// (i0-i1)/2
10540c1bc742181ded4930842b46e9507372f0b1b963James Dong
10550c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYh7, qYi5, qYi7        ;// (i5+i7)/4
10560c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYh5, qYi5, qYi7        ;// (i5-i7)/2
10570c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYh2, qYi2, qYi3        ;// h2, h3
10580c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQDMULH     qYh5, qYh5, InvSqrt2    ;// h5/sqrt(2)
10590c1bc742181ded4930842b46e9507372f0b1b963James Dong
10600c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt0, dYi4lo, C         ;// c*i4 (lo half, widened to S32)
10610c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLAL       qXt0, dYi6lo, S         ;// c*i4+s*i6
10620c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt1, dYi4hi, C         ;// same rotation for the hi half
10630c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLAL       qXt1, dYi6hi, S
10640c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dYh4lo, qXt0, #16       ;// h4 = top 16 bits; h4 reuses the register that held i7
10650c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dYh4hi, qXt1, #16
10660c1bc742181ded4930842b46e9507372f0b1b963James Dong
10670c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt0, dYi6lo, C         ;// c*i6
10680c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLSL       qXt0, dYi4lo, S         ;// -s*i4 + c*i6
10690c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt1, dYi6hi, C         ;// same rotation for the hi half
10700c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLSL       qXt1, dYi4hi, S
10710c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dYh6lo, qXt0, #16       ;// h6 reuses i4's register: written only after both i4 halves are read
10720c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dYh6hi, qXt1, #16
10730c1bc742181ded4930842b46e9507372f0b1b963James Dong
10740c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYg6, qYh6, qYh7
10750c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYg5, qYh5, qYg6
10760c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYg4, qYh4, qYg5
10770c1bc742181ded4930842b46e9507372f0b1b963James Dong
10780c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2 rows 0to3 x 1/2
10790c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYg1, qYh1, qYh2        ;// (h1+h2)/2
10800c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qYg2, qYh1, qYh2        ;// (h1-h2)/2
10810c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYg0, qYh0, qYh3        ;// (h0+h3)/2
10820c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qYg3, qYh0, qYh3        ;// (h0-h3)/2
10830c1bc742181ded4930842b46e9507372f0b1b963James Dong
10840c1bc742181ded4930842b46e9507372f0b1b963James Dong
10850c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 1 all rows
10860c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD        qYf3, qYg3, qYg4
10870c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB        qYf4, qYg3, qYg4
10880c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD        qYf2, qYg2, qYg5
10890c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB        qYf5, qYg2, qYg5
10900c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD        qYf1, qYg1, qYg6
10910c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB        qYf6, qYg1, qYg6
10920c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD        qYf0, qYg0, qYg7
10930c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB        qYf7, qYg0, qYg7
10940c1bc742181ded4930842b46e9507372f0b1b963James Dong
10950c1bc742181ded4930842b46e9507372f0b1b963James DongYTR0            EQU Src0
10960c1bc742181ded4930842b46e9507372f0b1b963James DongYTR1            EQU Src4
10970c1bc742181ded4930842b46e9507372f0b1b963James DongYTR2            EQU Src1
10980c1bc742181ded4930842b46e9507372f0b1b963James DongYTR3            EQU Src2
10990c1bc742181ded4930842b46e9507372f0b1b963James DongYTR4            EQU Src7
11000c1bc742181ded4930842b46e9507372f0b1b963James DongYTR5            EQU Src5
11010c1bc742181ded4930842b46e9507372f0b1b963James DongYTR6            EQU Tmp
11020c1bc742181ded4930842b46e9507372f0b1b963James DongYTR7            EQU Src6
11030c1bc742181ded4930842b46e9507372f0b1b963James DongYTRt            EQU Src3
11040c1bc742181ded4930842b46e9507372f0b1b963James Dong
11050c1bc742181ded4930842b46e9507372f0b1b963James DongqC0             QN  YTR0.S32                ;// S32 views of the YTR registers, used by the 32-bit VTRN transpose step
11060c1bc742181ded4930842b46e9507372f0b1b963James DongqC1             QN  YTR1.S32
11070c1bc742181ded4930842b46e9507372f0b1b963James DongqC2             QN  YTR2.S32
11080c1bc742181ded4930842b46e9507372f0b1b963James DongqC3             QN  YTR3.S32
11090c1bc742181ded4930842b46e9507372f0b1b963James DongqC4             QN  YTR4.S32
11100c1bc742181ded4930842b46e9507372f0b1b963James DongqC5             QN  YTR5.S32
11110c1bc742181ded4930842b46e9507372f0b1b963James DongqC6             QN  YTR6.S32
11120c1bc742181ded4930842b46e9507372f0b1b963James DongqC7             QN  YTR7.S32
11130c1bc742181ded4930842b46e9507372f0b1b963James Dong
11140c1bc742181ded4930842b46e9507372f0b1b963James DongdD0             DN  YTR0*2+1                ;// for using VSWP
11150c1bc742181ded4930842b46e9507372f0b1b963James DongdD1             DN  YTR1*2+1
11160c1bc742181ded4930842b46e9507372f0b1b963James DongdD2             DN  YTR2*2+1
11170c1bc742181ded4930842b46e9507372f0b1b963James DongdD3             DN  YTR3*2+1
11180c1bc742181ded4930842b46e9507372f0b1b963James DongdD4             DN  YTR4*2
11190c1bc742181ded4930842b46e9507372f0b1b963James DongdD5             DN  YTR5*2
11200c1bc742181ded4930842b46e9507372f0b1b963James DongdD6             DN  YTR6*2
11210c1bc742181ded4930842b46e9507372f0b1b963James DongdD7             DN  YTR7*2
11220c1bc742181ded4930842b46e9507372f0b1b963James Dong
11230c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qYf0, qYf1
11240c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qYf2, qYf3
11250c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qYf4, qYf5
11260c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qYf6, qYf7
11270c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qC0, qC2
11280c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qC1, qC3
11290c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qC4, qC6
11300c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qC5, qC7
11310c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dD0, dD4
11320c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dD1, dD5
11330c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dD2, dD6
11340c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dD3, dD7
11350c1bc742181ded4930842b46e9507372f0b1b963James Dong
11360c1bc742181ded4930842b46e9507372f0b1b963James Dong
11370c1bc742181ded4930842b46e9507372f0b1b963James DongdYf0U8          DN YTR0*2.U8
11380c1bc742181ded4930842b46e9507372f0b1b963James DongdYf1U8          DN YTR1*2.U8
11390c1bc742181ded4930842b46e9507372f0b1b963James DongdYf2U8          DN YTR2*2.U8
11400c1bc742181ded4930842b46e9507372f0b1b963James DongdYf3U8          DN YTR3*2.U8
11410c1bc742181ded4930842b46e9507372f0b1b963James DongdYf4U8          DN YTR4*2.U8
11420c1bc742181ded4930842b46e9507372f0b1b963James DongdYf5U8          DN YTR5*2.U8
11430c1bc742181ded4930842b46e9507372f0b1b963James DongdYf6U8          DN YTR6*2.U8
11440c1bc742181ded4930842b46e9507372f0b1b963James DongdYf7U8          DN YTR7*2.U8
11450c1bc742181ded4930842b46e9507372f0b1b963James Dong
11460c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
11470c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Do saturation if outsize is other than S16
11480c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
11490c1bc742181ded4930842b46e9507372f0b1b963James Dong
11500c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="u8")
11510c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Output range [0-255]
11520c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf0U8, qYf0
11530c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf1U8, qYf1
11540c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf2U8, qYf2
11550c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf3U8, qYf3
11560c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf4U8, qYf4
11570c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf5U8, qYf5
11580c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf6U8, qYf6
11590c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf7U8, qYf7
11600c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
11610c1bc742181ded4930842b46e9507372f0b1b963James Dong
11620c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="s9")
11630c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Output range [-256 to +255]
11640c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf0, qYf0, #16-9
11650c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf1, qYf1, #16-9
11660c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf2, qYf2, #16-9
11670c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf3, qYf3, #16-9
11680c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf4, qYf4, #16-9
11690c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf5, qYf5, #16-9
11700c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf6, qYf6, #16-9
11710c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf7, qYf7, #16-9
11720c1bc742181ded4930842b46e9507372f0b1b963James Dong
11730c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf0, qYf0, #16-9
11740c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf1, qYf1, #16-9
11750c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf2, qYf2, #16-9
11760c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf3, qYf3, #16-9
11770c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf4, qYf4, #16-9
11780c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf5, qYf5, #16-9
11790c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf6, qYf6, #16-9
11800c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf7, qYf7, #16-9
11810c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
11820c1bc742181ded4930842b46e9507372f0b1b963James Dong
11830c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Store output depending on the Stride size
11840c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$stride"="s"
11850c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf0, [pDest @64], Stride
11860c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf1, [pDest @64], Stride
11870c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf2, [pDest @64], Stride
11880c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf3, [pDest @64], Stride
11890c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf4, [pDest @64], Stride
11900c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf5, [pDest @64], Stride
11910c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf6, [pDest @64], Stride
11920c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf7, [pDest @64]
11930c1bc742181ded4930842b46e9507372f0b1b963James Dong        ELSE
11940c1bc742181ded4930842b46e9507372f0b1b963James Dong            IF ("$outsize"="u8")
11950c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf0U8, [pDest @64], #8
11960c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf1U8, [pDest @64], #8
11970c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf2U8, [pDest @64], #8
11980c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf3U8, [pDest @64], #8
11990c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf4U8, [pDest @64], #8
12000c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf5U8, [pDest @64], #8
12010c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf6U8, [pDest @64], #8
12020c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf7U8, [pDest @64]
12030c1bc742181ded4930842b46e9507372f0b1b963James Dong            ELSE
12040c1bc742181ded4930842b46e9507372f0b1b963James Dong                ;// ("$outsize"="s9") or ("$outsize"="s16")
12050c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf0, [pDest @64], #16
12060c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf1, [pDest @64], #16
12070c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf2, [pDest @64], #16
12080c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf3, [pDest @64], #16
12090c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf4, [pDest @64], #16
12100c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf5, [pDest @64], #16
12110c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf6, [pDest @64], #16
12120c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf7, [pDest @64]
12130c1bc742181ded4930842b46e9507372f0b1b963James Dong            ENDIF
12140c1bc742181ded4930842b46e9507372f0b1b963James Dong
12150c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
12160c1bc742181ded4930842b46e9507372f0b1b963James Dong
12170c1bc742181ded4930842b46e9507372f0b1b963James Dong
12180c1bc742181ded4930842b46e9507372f0b1b963James Dong
12190c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF ;// CortexA8
12200c1bc742181ded4930842b46e9507372f0b1b963James Dong
12210c1bc742181ded4930842b46e9507372f0b1b963James Dong
12220c1bc742181ded4930842b46e9507372f0b1b963James Dong
12230c1bc742181ded4930842b46e9507372f0b1b963James Dong        MEND
12240c1bc742181ded4930842b46e9507372f0b1b963James Dong
12250c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Scale TWO input rows with TWO rows of 16 bit scale values
12260c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12270c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
12280c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// input (eight S16 values each) with two rows of scale values. It also
12290c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// loads the next two rows of scale values from pScale when $LastRow is 0.
12300c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12310c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Input Registers:
12320c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12330c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dAlo           - Input D register with first four S16 values of row n
12340c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dAhi           - Input D register with next four S16 values of row n
12350c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dBlo           - Input D register with first four S16 values of row n+1
12360c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dBhi           - Input D register with next four S16 values of row n+1
12370c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pScale          - Pointer to the scale values of row n+2 (read only when $LastRow is 0)
12380c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qT0lo           - Temporary scratch register (widened product of $dAlo)
12390c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qT0hi           - Temporary scratch register (widened product of $dAhi)
12400c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qT1lo           - Temporary scratch register (widened product of $dBlo)
12410c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qT1hi           - Temporary scratch register (widened product of $dBhi)
12420c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dScale1lo       - Scale values of row n, multiplied with $dAlo
12430c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dScale1hi       - Scale values of row n, multiplied with $dAhi
12440c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dScale2lo       - Scale values of row n+1, multiplied with $dBlo
12450c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dScale2hi       - Scale values of row n+1, multiplied with $dBhi
12460c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12470c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Input Flag
12480c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12490c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $LastRow        - "0": more rows follow, so qScale1/qScale2 are reloaded from pScale.
        ;//                   Any other value (callers pass 1 for the last row pair) skips the reload.
12500c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12510c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Output Registers:
12520c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12530c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dAlo           - Scaled output values (first four S16 of row n)
12540c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dAhi           - Scaled output values (next four S16 of row n)
12550c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dBlo           - Scaled output values (first four S16 of row n+1)
12560c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dBhi           - Scaled output values (next four S16 of row n+1)
12570c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qScale1         - Scale values for row n+2 (updated only when $LastRow is 0)
12580c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qScale2         - Scale values for row n+3 (updated only when $LastRow is 0)
12590c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pScale          - Advanced by 32 bytes when $LastRow is 0, otherwise unchanged
12600c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12610c1bc742181ded4930842b46e9507372f0b1b963James Dong        MACRO
12620c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
12630c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qT0lo, $dAlo, dScale1lo     ;// Row n,   low half:  widening multiply by scale
12640c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qT0hi, $dAhi, dScale1hi     ;// Row n,   high half: widening multiply by scale
12650c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qT1lo, $dBlo, dScale2lo     ;// Row n+1, low half:  widening multiply by scale
12660c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qT1hi, $dBhi, dScale2hi     ;// Row n+1, high half: widening multiply by scale
12670c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$LastRow"="0"                       ;// Reload only while further row pairs remain
12680c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        qScale1, [pScale], #16  ;// Load scale for row n+2
12690c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        qScale2, [pScale], #16  ;// Load scale for row n+3
12700c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
        ;// Narrow the products back into the input registers: rounding shift
        ;// right by 12 with saturation. The element types come from the register
        ;// aliases defined outside this macro.
12710c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRSHRN       $dAlo, qT0lo, #12
12720c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRSHRN       $dAhi, qT0hi, #12
12730c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRSHRN       $dBlo, qT1lo, #12
12740c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRSHRN       $dBhi, qT1hi, #12
12750c1bc742181ded4930842b46e9507372f0b1b963James Dong        MEND
12760c1bc742181ded4930842b46e9507372f0b1b963James Dong
12770c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Scale 8x8 block input values with 16 bit scale values
12780c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12790c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// This macro is used to pre-scale a block of 8x8 input values.
12800c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// It also performs the first stage transformations of the IDCT.
12810c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12820c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Input Registers:
12830c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12840c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXjnlo          - n th input D register with first four S16 values
12850c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXjnhi          - n th input D register with next four S16 values
12860c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qXjn            - n th input Q register with eight S16 values
12870c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pScale          - Pointer to scale values (eight rows of 16 bytes are read)
12880c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12890c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Output Registers:
12900c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12910c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qXin            - n th output Q register with eight S16 output values of 1st stage
        ;//                   (qXi2..qXi7 are written here; NOTE(review): qXi0/qXi1 are not
        ;//                   written by this macro and presumably alias the scaled
        ;//                   qXj0/qXj4 - confirm against the register definitions)
        ;// qXjn            - Overwritten in place with the scaled input rows
        ;// pScale          - Advanced by 128 bytes (eight 16-byte post-incremented loads)
        ;// pSrc            - Address of armCOMM_IDCTCoef
        ;// dCoefs          - DCT inverse AAN constants loaded from [pSrc]
12920c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12930c1bc742181ded4930842b46e9507372f0b1b963James Dong        MACRO
12940c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_PRESCALE16
12950c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        qScale1, [pScale], #16      ;// Load Pre scale for row 0
12960c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        qScale2, [pScale], #16      ;// Load Pre scale for row 1
12970c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0        ;// Pre scale row 0 & 1, load scale for row 2 & 3
12980c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0        ;// Pre scale row 2 & 3, load scale for row 4 & 5
12990c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0        ;// Pre scale row 4 & 5, load scale for row 6 & 7
13000c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1        ;// Pre scale row 6 & 7, last pair: no further load
        ;// First stage: half-sum and difference of the row pairs (1,7), (2,6)
        ;// and (5,3). The constant table load is interleaved with the arithmetic.
13010c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXi5, qXj1, qXj7            ;// (j1+j7)/2
13020c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXi6, qXj1, qXj7            ;// j1-j7
13030c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR         pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants
13040c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXi3, qXj2, qXj6            ;// (j2+j6)/2
13050c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXi2, qXj2, qXj6            ;// j2-j6
13060c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
13070c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXi7, qXj5, qXj3            ;// (j5+j3)/2
13080c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXi4, qXj5, qXj3            ;// j5-j3
13090c1bc742181ded4930842b46e9507372f0b1b963James Dong        MEND
13100c1bc742181ded4930842b46e9507372f0b1b963James Dong
13110c1bc742181ded4930842b46e9507372f0b1b963James Dong
13120c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Scale 8x8 block input values with 32 bit scale values
13130c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
13140c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// This macro is used to pre-scale a block of 8x8 input values.
13150c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// It also performs the first stage transformations of the IDCT.
13160c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
13170c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Input Registers:
13180c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
13190c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXjnlo          - n th input D register with first four S16 values
13200c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXjnhi          - n th input D register with next four S16 values
13210c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qXjn            - n th input Q register with eight S16 values
13220c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pScale          - Pointer to 32bit scale values in Q23 format
        ;// pSrc            - Source pointer; row 4 is reloaded from [pSrc - 64]
        ;//                   (NOTE(review): assumes the caller left pSrc 64 bytes
        ;//                   past input row 4 - confirm against the block load)
13230c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
13240c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Output Registers:
13250c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
13260c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXinlo          - n th output D register with first four S16 output values of 1st stage
13270c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXinhi          - n th output D register with next four S16 output values of 1st stage
        ;// pScale          - Advanced by four rows (128 bytes); points at scale row 4
        ;// pTemp           - Scratch pointer used to read scale rows 7, 6 and 5
        ;// pSrc            - Address of armCOMM_IDCTCoef
        ;// dCoefs          - DCT inverse AAN constants loaded from [pSrc]
13280c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
13290c1bc742181ded4930842b46e9507372f0b1b963James Dong        MACRO
13300c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_PRESCALE32
        ;// Scale registers: only Q0-Q3 are used. Rows 2-4 reuse the row 1 pair
        ;// and rows 5-7 reuse the row 0 pair, so each scale row must be consumed
        ;// before the next row that shares its registers is loaded.
13310c1bc742181ded4930842b46e9507372f0b1b963James DongqScale0lo       QN 0.S32
13320c1bc742181ded4930842b46e9507372f0b1b963James DongqScale0hi       QN 1.S32
13330c1bc742181ded4930842b46e9507372f0b1b963James DongqScale1lo       QN 2.S32
13340c1bc742181ded4930842b46e9507372f0b1b963James DongqScale1hi       QN 3.S32
13350c1bc742181ded4930842b46e9507372f0b1b963James DongqScale2lo       QN qScale1lo
13360c1bc742181ded4930842b46e9507372f0b1b963James DongqScale2hi       QN qScale1hi
13370c1bc742181ded4930842b46e9507372f0b1b963James DongqScale3lo       QN qScale1lo
13380c1bc742181ded4930842b46e9507372f0b1b963James DongqScale3hi       QN qScale1hi
13390c1bc742181ded4930842b46e9507372f0b1b963James DongqScale4lo       QN qScale1lo
13400c1bc742181ded4930842b46e9507372f0b1b963James DongqScale4hi       QN qScale1hi
13410c1bc742181ded4930842b46e9507372f0b1b963James DongqScale5lo       QN qScale0lo
13420c1bc742181ded4930842b46e9507372f0b1b963James DongqScale5hi       QN qScale0hi
13430c1bc742181ded4930842b46e9507372f0b1b963James DongqScale6lo       QN qScale0lo
13440c1bc742181ded4930842b46e9507372f0b1b963James DongqScale6hi       QN qScale0hi
13450c1bc742181ded4930842b46e9507372f0b1b963James DongqScale7lo       QN qScale0lo
13460c1bc742181ded4930842b46e9507372f0b1b963James DongqScale7hi       QN qScale0hi
13470c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Widened (S32) source rows: two register pairs shared by all rows.
        ;// Rows 0, 2, 3, 4 and 7 use the qSrc0 pair; rows 1, 5 and 6 use the qSrc1 pair.
13480c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc0lo         QN 4.S32
13490c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc0hi         QN 5.S32
13500c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc1lo         QN 6.S32
13510c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc1hi         QN Src4.S32     ;// Src4 is defined outside this macro. NOTE(review): presumably the Q register of input row 4, hence the reload of qXj4 below - confirm
13520c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc2lo         QN qSrc0lo
13530c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc2hi         QN qSrc0hi
13540c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc3lo         QN qSrc0lo
13550c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc3hi         QN qSrc0hi
13560c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc4lo         QN qSrc0lo
13570c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc4hi         QN qSrc0hi
13580c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc5lo         QN qSrc1lo
13590c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc5hi         QN qSrc1hi
13600c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc6lo         QN qSrc1lo
13610c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc6hi         QN qSrc1hi
13620c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc7lo         QN qSrc0lo
13630c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc7hi         QN qSrc0hi
13640c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// First stage results share the qScale0 pair; each is produced only
        ;// after the scale row held in that pair has been multiplied in.
13650c1bc742181ded4930842b46e9507372f0b1b963James DongqRes17lo        QN qScale0lo
13660c1bc742181ded4930842b46e9507372f0b1b963James DongqRes17hi        QN qScale0hi
13670c1bc742181ded4930842b46e9507372f0b1b963James DongqRes26lo        QN qScale0lo
13680c1bc742181ded4930842b46e9507372f0b1b963James DongqRes26hi        QN qScale0hi
13690c1bc742181ded4930842b46e9507372f0b1b963James DongqRes53lo        QN qScale0lo
13700c1bc742181ded4930842b46e9507372f0b1b963James DongqRes53hi        QN qScale0hi
13710c1bc742181ded4930842b46e9507372f0b1b963James Dong
13720c1bc742181ded4930842b46e9507372f0b1b963James Dong            ADD         pTemp, pScale, #4*8*7           ;// pTemp = address of scale row 7 (8 values x 4 bytes per row)
13730c1bc742181ded4930842b46e9507372f0b1b963James Dong
13740c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Row 0 (scaling of rows 1 and 7 is started in the same sequence)
13750c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale0lo, qScale0hi}, [pScale]!       ;// Scale row 0; pScale advances one row
13760c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc0lo, dXj0lo, #(12-1)                ;// Widen row 0 to 32 bit with a left shift of 11
13770c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc0hi, dXj0hi, #(12-1)
13780c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale1lo, qScale1hi}, [pScale]!       ;// Scale row 1
13790c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo             ;// Saturating rounding doubling multiply, high half
13800c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
13810c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale7lo, qScale7hi}, [pTemp]!        ;// Scale row 7 replaces row 0 scale (already consumed)
13820c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc1lo, dXj1lo, #(12-1)
13830c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc1hi, dXj1hi, #(12-1)
13840c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi0lo, qSrc0lo                 ;// Output i0
13850c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi0hi, qSrc0hi
13860c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc7lo, dXj7lo, #(12-1)                ;// Reuses the qSrc0 pair now that i0 has been written
13870c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc7hi, dXj7hi, #(12-1)
13880c1bc742181ded4930842b46e9507372f0b1b963James Dong            SUB         pTemp, pTemp, #((16*2)+(4*8*1))         ;// Undo the 32-byte writeback and step back one row: pTemp -> scale row 6
13890c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo
13900c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
13910c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo
13920c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
13930c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale2lo, qScale2hi}, [pScale]!       ;// Scale row 2 replaces row 1 scale (already consumed)
13940c1bc742181ded4930842b46e9507372f0b1b963James Dong
13950c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Row 1 & 7 (scaling of rows 2 and 6 is started in the same sequence)
13960c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes17lo, qSrc1lo, qSrc7lo      ;// (j1+j7)/2
13970c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes17hi, qSrc1hi, qSrc7hi      ;// (j1+j7)/2
13980c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi5lo, qRes17lo                ;// Output i5
13990c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi5hi, qRes17hi
14000c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes17lo, qSrc1lo, qSrc7lo      ;// j1-j7
14010c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes17hi, qSrc1hi, qSrc7hi      ;// j1-j7
14020c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi6lo, qRes17lo                ;// Output i6
14030c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi6hi, qRes17hi
14040c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc2lo, dXj2lo, #(12-1)
14050c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc2hi, dXj2hi, #(12-1)
14060c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale6lo, qScale6hi}, [pTemp]!        ;// Scale row 6 (overwrites qRes17, already narrowed to i6)
14070c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc6lo, dXj6lo, #(12-1)
14080c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc6hi, dXj6hi, #(12-1)
14090c1bc742181ded4930842b46e9507372f0b1b963James Dong            SUB         pTemp, pTemp, #((16*2)+(4*8*1))         ;// pTemp -> scale row 5
14100c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo
14110c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
14120c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo
14130c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
14140c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale3lo, qScale3hi}, [pScale]!       ;// Scale row 3; pScale now points at scale row 4
14150c1bc742181ded4930842b46e9507372f0b1b963James Dong
14160c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Row 2 & 6 (scaling of rows 3 and 5 is started in the same sequence)
14170c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes26lo, qSrc2lo, qSrc6lo      ;// (j2+j6)/2
14180c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes26hi, qSrc2hi, qSrc6hi      ;// (j2+j6)/2
14190c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi3lo, qRes26lo                ;// Output i3
14200c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi3hi, qRes26hi
14210c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes26lo, qSrc2lo, qSrc6lo      ;// j2-j6
14220c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes26hi, qSrc2hi, qSrc6hi      ;// j2-j6
14230c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi2lo, qRes26lo                ;// Output i2
14240c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi2hi, qRes26hi
14250c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc3lo, dXj3lo, #(12-1)
14260c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc3hi, dXj3hi, #(12-1)
14270c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale5lo, qScale5hi}, [pTemp]!        ;// Scale row 5 (overwrites qRes26, already narrowed to i2)
14280c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc5lo, dXj5lo, #(12-1)
14290c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc5hi, dXj5hi, #(12-1)
14300c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo
14310c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
14320c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo
14330c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi
14340c1bc742181ded4930842b46e9507372f0b1b963James Dong
14350c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Row 3 & 5 (reload and scaling of row 4 is interleaved)
14360c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes53lo, qSrc5lo, qSrc3lo      ;// (j5+j3)/2
14370c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes53hi, qSrc5hi, qSrc3hi      ;// (j5+j3)/2
14380c1bc742181ded4930842b46e9507372f0b1b963James Dong            SUB         pSrc, pSrc, #16*2*2                     ;// Step pSrc back 64 bytes for the row 4 reload below
14390c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi7lo, qRes53lo                ;// Output i7
14400c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi7hi, qRes53hi
14410c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes53lo, qSrc5lo, qSrc3lo      ;// j5-j3
14420c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes53hi, qSrc5hi, qSrc3hi      ;// j5-j3
14430c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        qXj4, [pSrc @64]                        ;// Reload input row 4 from memory (64-bit aligned address)
14440c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi4lo, qRes53lo                ;// Output i4
14450c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi4hi, qRes53hi
14460c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc4lo, dXj4lo, #(12-1)
14470c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc4hi, dXj4hi, #(12-1)
14480c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale4lo, qScale4hi}, [pScale]        ;// Scale row 4; no writeback, pScale stays at row 4
14490c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants
14500c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo
14510c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
14520c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLDR        dCoefs, [pSrc]                  ;// Load DCT inverse AAN constants
14530c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Row 4: the scaled row is output directly as i1
14540c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi1lo, qSrc4lo                 ;// Output i1
14550c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi1hi, qSrc4hi
14560c1bc742181ded4930842b46e9507372f0b1b963James Dong
14570c1bc742181ded4930842b46e9507372f0b1b963James Dong        MEND
14580c1bc742181ded4930842b46e9507372f0b1b963James Dong
14590c1bc742181ded4930842b46e9507372f0b1b963James Dong        END
1460