armCOMM_IDCT_s.h revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2004 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// IDCT_s.s
19;//
20;// Inverse DCT module
21;//
22;//
23;// ALGORITHM DESCRIPTION
24;//
25;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
26;// column and then a 1D IDCT for each row.
27;//
28;// The 8-point 1D IDCT is defined by
29;//   f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
30;//
31;//   C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
32;//   c(u,x) = cos( (2x+1)*u*pi/16 )
33;//
34;// We compute the 8-point 1D IDCT using the reverse of
35;// the Arai-Agui-Nakajima flow graph which we split into
36;// 5 stages named in reverse order to identify with the
37;// forward DCT. Direct inversion of the forward formulae
38;// in file FDCT_s.s gives:
39;//
40;// IStage 5:   j(u) = T(u)*A(u)  [ A(u)=4*C(u)*c(u,0) ]
41;//             [ A(0) = 2*sqrt(2)
42;//               A(u) = 4*cos(u*pi/16)  for (u!=0) ]
43;//
44;// IStage 4:   i0 = j0             i1 = j4
45;//             i3 = (j2+j6)/2      i2 = (j2-j6)/2
46;//             i7 = (j5+j3)/2      i4 = (j5-j3)/2
47;//             i5 = (j1+j7)/2      i6 = (j1-j7)/2
48;//
49;// IStage 3:   h0 = (i0+i1)/2      h1 = (i0-i1)/2
50;//             h2 = (i2*sqrt2)-i3  h3 = i3
51;//             h4 =  cos(pi/8)*i4 + sin(pi/8)*i6
52;//             h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
53;//             [ The above two lines rotate by -(pi/8) ]
54;//             h5 = (i5-i7)/sqrt2  h7 = (i5+i7)/2
55;//
56;// IStage 2:   g0 = (h0+h3)/2      g3 = (h0-h3)/2
57;//             g1 = (h1+h2)/2      g2 = (h1-h2)/2
58;//             g7 = h7             g6 = h6 - h7
59;//             g5 = h5 - g6        g4 = h4 - g5
60;//
61;// IStage 1:   f0 = (g0+g7)/2      f7 = (g0-g7)/2
62;//             f1 = (g1+g6)/2      f6 = (g1-g6)/2
63;//             f2 = (g2+g5)/2      f5 = (g2-g5)/2
64;//             f3 = (g3+g4)/2      f4 = (g3-g4)/2
65;//
66;// Note that most coefficients are halved 3 times during the
67;// above calculation. We can rescale the algorithm by dividing
68;// the input by 8 to remove the halvings.
69;//
70;// IStage 5:   j(u) = T(u)*A(u)/8
71;//
72;// IStage 4:   i0 = j0             i1 = j4
73;//             i3 = j2 + j6        i2 = j2 - j6
74;//             i7 = j5 + j3        i4 = j5 - j3
75;//             i5 = j1 + j7        i6 = j1 - j7
76;//
77;// IStage 3:   h0 = i0 + i1        h1 = i0 - i1
78;//             h2 = (i2*sqrt2)-i3  h3 = i3
79;//             h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
80;//             h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
81;//             h5 = (i5-i7)*sqrt2  h7 = i5 + i7
82;//
83;// IStage 2:   g0 = h0 + h3        g3 = h0 - h3
84;//             g1 = h1 + h2        g2 = h1 - h2
85;//             g7 = h7             g6 = h6 - h7
86;//             g5 = h5 - g6        g4 = h4 - g5
87;//
88;// IStage 1:   f0 = g0 + g7        f7 = g0 - g7
89;//             f1 = g1 + g6        f6 = g1 - g6
90;//             f2 = g2 + g5        f5 = g2 - g5
91;//             f3 = g3 + g4        f4 = g3 - g4
92;//
93;// Note:
94;// 1. The scaling by A(u)/8 can often be combined with inverse
95;//    quantization. The column and row scalings can be combined.
96;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
97;//    to the above code but is otherwise identical.
98;// 3. The rotation by -pi/8 can be performed using three multiplies
99;//    Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
100;//       -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
101;// 4. If |T(u)|<=1 then from the IDCT definition,
102;//    |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
103;//            = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
104;//            = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
105;//            = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
106;//            = (approx)2.64
107;//    So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
108;//    The table below shows input patterns generating the maximum
109;//    value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1
110;//    InputPattern      Max |f(x)|
111;//      PPPPPPPP        |f0| =  2.64
112;//      PPPMMMMM        |f1| =  2.64
113;//      PPMMMPPP        |f2| =  2.64
114;//      PPMMPPMM        |f3| =  2.64
115;//      PMMPPMMP        |f4| =  2.64
116;//      PMMPMMPM        |f5| =  2.64
117;//      PMPPMPMP        |f6| =  2.64
118;//      PMPMPMPM        |f7| =  2.64
119;//   Note that each of these input patterns is the transpose of the
120;//   corresponding max input pattern for the FDCT.
121
122;// Arguments
123
124pSrc    RN 0    ;// r0: pointer to the source (input data) buffer
125Stride  RN 1    ;// r1: destination stride in bytes; the ARM1136JS path saves it only when $stride = "s", then reuses r1 as ra01
126pDest   RN 2    ;// r2: pointer to the destination (output data) buffer
127pScale  RN 3    ;// r3: pointer to the aan-scale table, in the format selected by $inscale
128
129
130        ;// DCT Inverse Macro
131        ;// The DCT code should be parametrized according
132        ;// to the following inputs:
133        ;// $outsize = "u8"  :  8-bit unsigned data saturated (0 to +255)
134        ;//            "s9"  : 16-bit signed data saturated to 9-bit (-256 to +255)
135        ;//            "s16" : 16-bit signed data not saturated (max size ~+/-14273)
136        ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
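137        ;//            "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
        ;// $stride  = "s"   : destination stride in bytes is taken from the Stride register (r1)
        ;//            other : destination stride in bytes is the assembly-time constant $stride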
138        ;//
139        ;// Inputs:
140        ;// pSrc   = r0 = Pointer to input data
141        ;//               Range is -256 to +255 (9-bit)
142        ;// Stride = r1 = Stride between output lines in bytes (read only when $stride = "s")
143        ;// pDest  = r2 = Pointer to output data
144        ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
145
146
147
148        MACRO
149        M_IDCT  $outsize, $inscale, $stride
150        LCLA    SHIFT
151
152
153        IF ARM1136JS
154
155;// REGISTER ALLOCATION
156;// This is hard since we have 8 values, 9 free registers and each
157;// butterfly requires a temporary register. We also want to
158;// maintain register order so we can use LDM/STM. The table below
159;// summarises the register allocation that meets all these criteria.
160;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
161;//
162;// r1  a01     g0  h0
163;// r4  b01 f0  g1  h1  i0
164;// r5  a23 f1  g2      i1
165;// r6  b23 f2  g3  h2  i2
166;// r7  a45 f3      h3  i3
167;// r8  b45 f4  g4  h4  i4
168;// r9  a67 f5  g5  h5  i5
169;// r10 b67 f6  g6  h6  i6
170;// r11     f7  g7  h7  i7
171;//
172ra01    RN 1
173rb01    RN 4
174ra23    RN 5
175rb23    RN 6
176ra45    RN 7
177rb45    RN 8
178ra67    RN 9
179rb67    RN 10
180rtmp    RN 11
181csPiBy8 RN 12   ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
182LoopRR2 RN 14   ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
183;// Transpose allocation
184xft     RN ra01
185xf0     RN rb01
186xf1     RN ra23
187xf2     RN rb23
188xf3     RN ra45
189xf4     RN rb45
190xf5     RN ra67
191xf6     RN rb67
192xf7     RN rtmp
193;// IStage 1 allocation
194xg0     RN xft
195xg1     RN xf0
196xg2     RN xf1
197xg3     RN xf2
198xgt     RN xf3
199xg4     RN xf4
200xg5     RN xf5
201xg6     RN xf6
202xg7     RN xf7
203;// IStage 2 allocation
204xh0     RN xg0
205xh1     RN xg1
206xht     RN xg2
207xh2     RN xg3
208xh3     RN xgt
209xh4     RN xg4
210xh5     RN xg5
211xh6     RN xg6
212xh7     RN xg7
213;// IStage 3,4 allocation
214xit     RN xh0
215xi0     RN xh1
216xi1     RN xht
217xi2     RN xh2
218xi3     RN xh3
219xi4     RN xh4
220xi5     RN xh5
221xi6     RN xh6
222xi7     RN xh7
223
224        M_STR   pDest,  ppDest
225        IF "$stride"="s"
226            M_STR   Stride, pStride
227        ENDIF
228        M_ADR   pDest,  pBlk
229        LDR     csPiBy8, =0x30fc7642
230        LDR     LoopRR2, =0x00005a82
231
232v6_idct_col$_F
233        ;// Load even values
234        LDR     xi4, [pSrc], #4  ;// j0
235        LDR     xi5, [pSrc, #4*16-4]  ;// j4
236        LDR     xi6, [pSrc, #2*16-4]  ;// j2
237        LDR     xi7, [pSrc, #6*16-4]  ;// j6
238
239        ;// Scale Even Values
240        IF "$inscale"="s16" ;// 16x16 mul
241SHIFT       SETA    12
242            LDR     xi0, [pScale], #4
243            LDR     xi1, [pScale, #4*16-4]
244            LDR     xi2, [pScale, #2*16-4]
245            MOV     xit, #1<<(SHIFT-1)
246            SMLABB  xi3, xi0, xi4, xit
247            SMLATT  xi4, xi0, xi4, xit
248            SMLABB  xi0, xi1, xi5, xit
249            SMLATT  xi5, xi1, xi5, xit
250            MOV     xi3, xi3, ASR #SHIFT
251            PKHBT   xi4, xi3, xi4, LSL #(16-SHIFT)
252            LDR     xi3, [pScale, #6*16-4]
253            SMLABB  xi1, xi2, xi6, xit
254            SMLATT  xi6, xi2, xi6, xit
255            MOV     xi0, xi0, ASR #SHIFT
256            PKHBT   xi5, xi0, xi5, LSL #(16-SHIFT)
257            SMLABB  xi2, xi3, xi7, xit
258            SMLATT  xi7, xi3, xi7, xit
259            MOV     xi1, xi1, ASR #SHIFT
260            PKHBT   xi6, xi1, xi6, LSL #(16-SHIFT)
261            MOV     xi2, xi2, ASR #SHIFT
262            PKHBT   xi7, xi2, xi7, LSL #(16-SHIFT)
263        ENDIF
264        IF "$inscale"="s32" ;// 32x16 mul
265SHIFT       SETA    (12+8-16)
266            MOV     xit, #1<<(SHIFT-1)
267            LDR     xi0, [pScale], #8
268            LDR     xi1, [pScale, #0*32+4-8]
269            LDR     xi2, [pScale, #4*32-8]
270            LDR     xi3, [pScale, #4*32+4-8]
271            SMLAWB  xi0, xi0, xi4, xit
272            SMLAWT  xi1, xi1, xi4, xit
273            SMLAWB  xi2, xi2, xi5, xit
274            SMLAWT  xi3, xi3, xi5, xit
275            MOV     xi0, xi0, ASR #SHIFT
276            PKHBT   xi4, xi0, xi1, LSL #(16-SHIFT)
277            MOV     xi2, xi2, ASR #SHIFT
278            PKHBT   xi5, xi2, xi3, LSL #(16-SHIFT)
279            LDR     xi0, [pScale, #2*32-8]
280            LDR     xi1, [pScale, #2*32+4-8]
281            LDR     xi2, [pScale, #6*32-8]
282            LDR     xi3, [pScale, #6*32+4-8]
283            SMLAWB  xi0, xi0, xi6, xit
284            SMLAWT  xi1, xi1, xi6, xit
285            SMLAWB  xi2, xi2, xi7, xit
286            SMLAWT  xi3, xi3, xi7, xit
287            MOV     xi0, xi0, ASR #SHIFT
288            PKHBT   xi6, xi0, xi1, LSL #(16-SHIFT)
289            MOV     xi2, xi2, ASR #SHIFT
290            PKHBT   xi7, xi2, xi3, LSL #(16-SHIFT)
291        ENDIF
292
293        ;// Load odd values
294        LDR     xi0, [pSrc, #1*16-4]      ;// j1
295        LDR     xi1, [pSrc, #7*16-4]      ;// j7
296        LDR     xi2, [pSrc, #5*16-4]      ;// j5
297        LDR     xi3, [pSrc, #3*16-4]      ;// j3
298
299        IF  {TRUE}
300            ;// shortcut if odd values 0
301            TEQ     xi0, #0
302            TEQEQ   xi1, #0
303            TEQEQ   xi2, #0
304            TEQEQ   xi3, #0
305            BEQ     v6OddZero$_F
306        ENDIF
307
308        ;// Store scaled even values
309        STMIA   pDest, {xi4, xi5, xi6, xi7}
310
311        ;// Scale odd values
312        IF "$inscale"="s16"
313            ;// Perform AAN Scale
314            LDR     xi4, [pScale, #1*16-4]
315            LDR     xi5, [pScale, #7*16-4]
316            LDR     xi6, [pScale, #5*16-4]
317            SMLABB  xi7, xi0, xi4, xit
318            SMLATT  xi0, xi0, xi4, xit
319            SMLABB  xi4, xi1, xi5, xit
320            SMLATT  xi1, xi1, xi5, xit
321            MOV     xi7, xi7, ASR #SHIFT
322            PKHBT   xi0, xi7, xi0, LSL #(16-SHIFT)
323            LDR     xi7, [pScale, #3*16-4]
324            SMLABB  xi5, xi2, xi6, xit
325            SMLATT  xi2, xi2, xi6, xit
326            MOV     xi4, xi4, ASR #SHIFT
327            PKHBT   xi1, xi4, xi1, LSL #(16-SHIFT)
328            SMLABB  xi6, xi3, xi7, xit
329            SMLATT  xi3, xi3, xi7, xit
330            MOV     xi5, xi5, ASR #SHIFT
331            PKHBT   xi2, xi5, xi2, LSL #(16-SHIFT)
332            MOV     xi6, xi6, ASR #SHIFT
333            PKHBT   xi3, xi6, xi3, LSL #(16-SHIFT)
334        ENDIF
335        IF "$inscale"="s32" ;// 32x16 mul
336            LDR     xi4, [pScale, #1*32-8]
337            LDR     xi5, [pScale, #1*32+4-8]
338            LDR     xi6, [pScale, #7*32-8]
339            LDR     xi7, [pScale, #7*32+4-8]
340            SMLAWB  xi4, xi4, xi0, xit
341            SMLAWT  xi5, xi5, xi0, xit
342            SMLAWB  xi6, xi6, xi1, xit
343            SMLAWT  xi7, xi7, xi1, xit
344            MOV     xi4, xi4, ASR #SHIFT
345            PKHBT   xi0, xi4, xi5, LSL #(16-SHIFT)
346            MOV     xi6, xi6, ASR #SHIFT
347            PKHBT   xi1, xi6, xi7, LSL #(16-SHIFT)
348            LDR     xi4, [pScale, #5*32-8]
349            LDR     xi5, [pScale, #5*32+4-8]
350            LDR     xi6, [pScale, #3*32-8]
351            LDR     xi7, [pScale, #3*32+4-8]
352            SMLAWB  xi4, xi4, xi2, xit
353            SMLAWT  xi5, xi5, xi2, xit
354            SMLAWB  xi6, xi6, xi3, xit
355            SMLAWT  xi7, xi7, xi3, xit
356            MOV     xi4, xi4, ASR #SHIFT
357            PKHBT   xi2, xi4, xi5, LSL #(16-SHIFT)
358            MOV     xi6, xi6, ASR #SHIFT
359            PKHBT   xi3, xi6, xi7, LSL #(16-SHIFT)
360        ENDIF
361
362        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
363        SSUB16  xi6, xi0, xi1           ;// j1-j7
364        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
365        SSUB16  xi4, xi2, xi3           ;// j5-j3
366
367        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
368
369        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
370        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
371
372        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
373        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
374        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
375        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
376
377        SMULBB  xi1, xi3, LoopRR2
378        SMULTB  xi3, xi3, LoopRR2
379
380        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
381        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
382        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
383
384        ;// xi0,xi1,xi2,xi3 now free
385        ;// IStage 4,3, rows 2to3 x1/2
386
387        MOV     xi3, xi3, LSL #1
388        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
389        LDRD    xi0, [pDest, #8]        ;// j2,j6 scaled
390
391        ;// IStage 2, rows4to7
392        SSUB16  xg6, xh6, xh7
393        SSUB16  xg5, xh5, xg6
394        SSUB16  xg4, xh4, xg5
395
396        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
397        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
398
399        SMULBB  xi0, xi2, LoopRR2
400        SMULTB  xi2, xi2, LoopRR2
401
402        MOV     xi2, xi2, LSL #1
403        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
404
405        ;// xi0, xi1 now free
406        ;// IStage 4,3 rows 0to1 x 1/2
407        LDRD    xi0, [pDest]            ;// j0, j4 scaled
408        SSUB16  xh2, xh2, xi3
409        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
410
411        SHADD16 xh0, xi0, xi1
412        SHSUB16 xh1, xi0, xi1
413
414        ;// IStage 2 rows 0to3 x 1/2
415        SHSUB16 xg2, xh1, xh2
416        SHADD16 xg1, xh1, xh2
417        SHSUB16 xg3, xh0, xh3
418        SHADD16 xg0, xh0, xh3
419
420        ;// IStage 1 all rows
421        SADD16  xf3, xg3, xg4
422        SSUB16  xf4, xg3, xg4
423        SADD16  xf2, xg2, xg5
424        SSUB16  xf5, xg2, xg5
425        SADD16  xf1, xg1, xg6
426        SSUB16  xf6, xg1, xg6
427        SADD16  xf0, xg0, xg7
428        SSUB16  xf7, xg0, xg7
429
430        ;// Transpose, store and loop
431        PKHBT   ra01, xf0, xf1, LSL #16
432        PKHTB   rb01, xf1, xf0, ASR #16
433
434        PKHBT   ra23, xf2, xf3, LSL #16
435        PKHTB   rb23, xf3, xf2, ASR #16
436
437        PKHBT   ra45, xf4, xf5, LSL #16
438        PKHTB   rb45, xf5, xf4, ASR #16
439
440        PKHBT   ra67, xf6, xf7, LSL #16
441        STMIA   pDest!, {ra01, ra23, ra45, ra67}
442        PKHTB   rb67, xf7, xf6, ASR #16
443        STMIA   pDest!, {rb01, rb23, rb45, rb67}
444        BCC     v6_idct_col$_F
445
446        SUB     pSrc, pDest, #(64*2)
447        M_LDR   pDest, ppDest
448        IF "$stride"="s"
449            M_LDR   pScale, pStride
450        ENDIF
451        B       v6_idct_row$_F
452
453v6OddZero$_F
454        SSUB16  xi2, xi6, xi7           ;// (j2-j6)
455        SHADD16 xi3, xi6, xi7           ;// (j2+j6)/2
456
457        SMULBB  xi0, xi2, LoopRR2
458        SMULTB  xi2, xi2, LoopRR2
459
460        MOV     xi2, xi2, LSL #1
461        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
462        SSUB16  xh2, xh2, xi3
463
464        ;// xi0, xi1 now free
465        ;// IStage 4,3 rows 0to1 x 1/2
466
467        SHADD16 xh0, xi4, xi5
468        SHSUB16 xh1, xi4, xi5
469
470        ;// IStage 2 rows 0to3 x 1/2
471        SHSUB16 xg2, xh1, xh2
472        SHADD16 xg1, xh1, xh2
473        SHSUB16 xg3, xh0, xh3
474        SHADD16 xg0, xh0, xh3
475
476        ;// IStage 1 all rows
477        MOV  xf3, xg3
478        MOV  xf4, xg3
479        MOV  xf2, xg2
480        MOV  xf5, xg2
481        MOV  xf1, xg1
482        MOV  xf6, xg1
483        MOV  xf0, xg0
484        MOV  xf7, xg0
485
486        ;// Transpose
487        PKHBT   ra01, xf0, xf1, LSL #16
488        PKHTB   rb01, xf1, xf0, ASR #16
489
490        PKHBT   ra23, xf2, xf3, LSL #16
491        PKHTB   rb23, xf3, xf2, ASR #16
492
493        PKHBT   ra45, xf4, xf5, LSL #16
494        PKHTB   rb45, xf5, xf4, ASR #16
495
496        PKHBT   ra67, xf6, xf7, LSL #16
497        PKHTB   rb67, xf7, xf6, ASR #16
498
499        STMIA   pDest!, {ra01, ra23, ra45, ra67}
500        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
501        STMIA   pDest!, {rb01, rb23, rb45, rb67}
502
503        BCC     v6_idct_col$_F
504        SUB     pSrc, pDest, #(64*2)
505        M_LDR   pDest, ppDest
506        IF "$stride"="s"
507            M_LDR   pScale, pStride
508        ENDIF
509
510
511v6_idct_row$_F
512        ;// IStage 4,3, rows4to7 x1/4
513        LDR     xit, =0x00010001        ;// rounding constant
514        LDR     xi0, [pSrc, #1*16]      ;// j1
515        LDR     xi1, [pSrc, #7*16]      ;// 4*j7
516        LDR     xi2, [pSrc, #5*16]      ;// j5
517        LDR     xi3, [pSrc, #3*16]      ;// j3
518
519        SHADD16 xi1, xi1, xit           ;// 2*j7
520        SHADD16 xi1, xi1, xit           ;// j7
521
522        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
523        SSUB16  xi6, xi0, xi1           ;// j1-j7
524        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
525        SSUB16  xi4, xi2, xi3           ;// j5-j3
526
527        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
528
529        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
530        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
531
532        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
533        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
534        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
535        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
536
537        SMULBB  xi1, xi3, LoopRR2
538        SMULTB  xi3, xi3, LoopRR2
539
540        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
541        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
542        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
543
544        MOV     xi3, xi3, LSL #1
545        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
546
547        ;// xi0,xi1,xi2,xi3 now free
548        ;// IStage 4,3, rows 2to3 x1/2
549
550        LDR     xi0, [pSrc, #2*16]      ;// j2
551        LDR     xi1, [pSrc, #6*16]      ;// 2*j6
552
553        ;// IStage 2, rows4to7
554        SSUB16  xg6, xh6, xh7
555        SSUB16  xg5, xh5, xg6
556        SSUB16  xg4, xh4, xg5
557
558        SHADD16 xi1, xi1, xit           ;// j6
559        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
560        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
561
562        SMULBB  xi0, xi2, LoopRR2
563        SMULTB  xi2, xi2, LoopRR2
564
565        MOV     xi2, xi2, LSL #1
566
567        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
568
569        ;// xi0, xi1 now free
570        ;// IStage 4,3 rows 0to1 x 1/2
571        LDR     xi1, [pSrc, #4*16]      ;// j4
572        LDR     xi0, [pSrc], #4         ;// j0
573
574        SSUB16  xh2, xh2, xi3
575        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
576
577        ADD     xi0, xi0, xit, LSL #2   ;// ensure correct round
578        SHADD16 xh0, xi0, xi1           ;// of DC result
579        SHSUB16 xh1, xi0, xi1
580
581        ;// IStage 2 rows 0to3 x 1/2
582        SHSUB16 xg2, xh1, xh2
583        SHADD16 xg1, xh1, xh2
584        SHSUB16 xg3, xh0, xh3
585        SHADD16 xg0, xh0, xh3
586
587        ;// IStage 1 all rows
588        SHADD16 xf3, xg3, xg4
589        SHSUB16 xf4, xg3, xg4
590        SHADD16 xf2, xg2, xg5
591        SHSUB16 xf5, xg2, xg5
592        SHADD16 xf1, xg1, xg6
593        SHSUB16 xf6, xg1, xg6
594        SHADD16 xf0, xg0, xg7
595        SHSUB16 xf7, xg0, xg7
596
597        ;// Saturate
598        IF ("$outsize"="u8")
599            USAT16  xf0, #8, xf0
600            USAT16  xf1, #8, xf1
601            USAT16  xf2, #8, xf2
602            USAT16  xf3, #8, xf3
603            USAT16  xf4, #8, xf4
604            USAT16  xf5, #8, xf5
605            USAT16  xf6, #8, xf6
606            USAT16  xf7, #8, xf7
607        ENDIF
608        IF ("$outsize"="s9")
609            SSAT16  xf0, #9, xf0
610            SSAT16  xf1, #9, xf1
611            SSAT16  xf2, #9, xf2
612            SSAT16  xf3, #9, xf3
613            SSAT16  xf4, #9, xf4
614            SSAT16  xf5, #9, xf5
615            SSAT16  xf6, #9, xf6
616            SSAT16  xf7, #9, xf7
617        ENDIF
618
619        ;// Transpose to Row, Pack and store
620        IF ("$outsize"="u8")
621            ORR     xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
622            ORR     xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
623            ORR     xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
624            ORR     xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
625            PKHBT   ra01, xf0, xf2, LSL #16
626            PKHTB   rb01, xf2, xf0, ASR #16
627            PKHBT   ra23, xf4, xf6, LSL #16
628            PKHTB   rb23, xf6, xf4, ASR #16
629            STMIA   pDest, {ra01, ra23}
630            IF "$stride"="s"
631                ADD     pDest, pDest, pScale
632                STMIA   pDest, {rb01, rb23}
633                ADD     pDest, pDest, pScale
634            ELSE
635                ADD     pDest, pDest, #($stride)
636                STMIA   pDest, {rb01, rb23}
637                ADD     pDest, pDest, #($stride)
638            ENDIF
639        ENDIF
640        IF ("$outsize"="s9"):LOR:("$outsize"="s16")
641            PKHBT   ra01, xf0, xf1, LSL #16
642            PKHTB   rb01, xf1, xf0, ASR #16
643
644            PKHBT   ra23, xf2, xf3, LSL #16
645            PKHTB   rb23, xf3, xf2, ASR #16
646
647            PKHBT   ra45, xf4, xf5, LSL #16
648            PKHTB   rb45, xf5, xf4, ASR #16
649
650            PKHBT   ra67, xf6, xf7, LSL #16
651            PKHTB   rb67, xf7, xf6, ASR #16
652
653            STMIA   pDest, {ra01, ra23, ra45, ra67}
654            IF "$stride"="s"
655                ADD     pDest, pDest, pScale
656                STMIA   pDest, {rb01, rb23, rb45, rb67}
657                ADD     pDest, pDest, pScale
658            ELSE
659                ADD     pDest, pDest, #($stride)
660                STMIA   pDest, {rb01, rb23, rb45, rb67}
661                ADD     pDest, pDest, #($stride)
662            ENDIF
663        ENDIF
664
665        BCC     v6_idct_row$_F
666        ENDIF ;// ARM1136JS
667
668
669        IF CortexA8
670
671Src0            EQU  7
672Src1            EQU  8
673Src2            EQU  9
674Src3            EQU  10
675Src4            EQU  11
676Src5            EQU  12
677Src6            EQU  13
678Src7            EQU  14
679Tmp             EQU  15
680
681qXj0            QN Src0.S16
682qXj1            QN Src1.S16
683qXj2            QN Src2.S16
684qXj3            QN Src3.S16
685qXj4            QN Src4.S16
686qXj5            QN Src5.S16
687qXj6            QN Src6.S16
688qXj7            QN Src7.S16
689qXjt            QN Tmp.S16
690
691dXj0lo          DN (Src0*2).S16
692dXj0hi          DN (Src0*2+1).S16
693dXj1lo          DN (Src1*2).S16
694dXj1hi          DN (Src1*2+1).S16
695dXj2lo          DN (Src2*2).S16
696dXj2hi          DN (Src2*2+1).S16
697dXj3lo          DN (Src3*2).S16
698dXj3hi          DN (Src3*2+1).S16
699dXj4lo          DN (Src4*2).S16
700dXj4hi          DN (Src4*2+1).S16
701dXj5lo          DN (Src5*2).S16
702dXj5hi          DN (Src5*2+1).S16
703dXj6lo          DN (Src6*2).S16
704dXj6hi          DN (Src6*2+1).S16
705dXj7lo          DN (Src7*2).S16
706dXj7hi          DN (Src7*2+1).S16
707dXjtlo          DN (Tmp*2).S16
708dXjthi          DN (Tmp*2+1).S16
709
710qXi0            QN qXj0
711qXi1            QN qXj4
712qXi2            QN qXj2
713qXi3            QN qXj7
714qXi4            QN qXj5
715qXi5            QN qXjt
716qXi6            QN qXj1
717qXi7            QN qXj6
718qXit            QN qXj3
719
720dXi0lo          DN dXj0lo
721dXi0hi          DN dXj0hi
722dXi1lo          DN dXj4lo
723dXi1hi          DN dXj4hi
724dXi2lo          DN dXj2lo
725dXi2hi          DN dXj2hi
726dXi3lo          DN dXj7lo
727dXi3hi          DN dXj7hi
728dXi4lo          DN dXj5lo
729dXi4hi          DN dXj5hi
730dXi5lo          DN dXjtlo
731dXi5hi          DN dXjthi
732dXi6lo          DN dXj1lo
733dXi6hi          DN dXj1hi
734dXi7lo          DN dXj6lo
735dXi7hi          DN dXj6hi
736dXitlo          DN dXj3lo
737dXithi          DN dXj3hi
738
739qXh0            QN qXit
740qXh1            QN qXi0
741qXh2            QN qXi2
742qXh3            QN qXi3
743qXh4            QN qXi7
744qXh5            QN qXi5
745qXh6            QN qXi4
746qXh7            QN qXi1
747qXht            QN qXi6
748
749dXh0lo          DN dXitlo
750dXh0hi          DN dXithi
751dXh1lo          DN dXi0lo
752dXh1hi          DN dXi0hi
753dXh2lo          DN dXi2lo
754dXh2hi          DN dXi2hi
755dXh3lo          DN dXi3lo
756dXh3hi          DN dXi3hi
757dXh4lo          DN dXi7lo
758dXh4hi          DN dXi7hi
759dXh5lo          DN dXi5lo
760dXh5hi          DN dXi5hi
761dXh6lo          DN dXi4lo
762dXh6hi          DN dXi4hi
763dXh7lo          DN dXi1lo
764dXh7hi          DN dXi1hi
765dXhtlo          DN dXi6lo
766dXhthi          DN dXi6hi
767
768qXg0            QN qXh2
769qXg1            QN qXht
770qXg2            QN qXh1
771qXg3            QN qXh0
772qXg4            QN qXh4
773qXg5            QN qXh5
774qXg6            QN qXh6
775qXg7            QN qXh7
776qXgt            QN qXh3
777
778qXf0            QN qXg6
779qXf1            QN qXg5
780qXf2            QN qXg4
781qXf3            QN qXgt
782qXf4            QN qXg3
783qXf5            QN qXg2
784qXf6            QN qXg1
785qXf7            QN qXg0
786qXft            QN qXg7
787
788
789qXt0            QN 1.S32
790qXt1            QN 2.S32
791qT0lo           QN 1.S32
792qT0hi           QN 2.S32
793qT1lo           QN 3.S32
794qT1hi           QN 4.S32
795qScalelo        QN 5.S32        ;// used to read post scale values
796qScalehi        QN 6.S32
797qTemp0          QN 5.S32
798qTemp1          QN 6.S32
799
800
801Scale1          EQU 6
802Scale2          EQU 15
803qScale1         QN Scale1.S16
804qScale2         QN Scale2.S16
805dScale1lo       DN (Scale1*2).S16
806dScale1hi       DN (Scale1*2+1).S16
807dScale2lo       DN (Scale2*2).S16
808dScale2hi       DN (Scale2*2+1).S16
809
810dCoefs          DN 0.S16        ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
811InvSqrt2        DN dCoefs[0]    ;// 1/sqrt(2) in Q15
812S               DN dCoefs[1]    ;// Sin(PI/8) in Q15
813C               DN dCoefs[2]    ;// Cos(PI/8) in Q15
814
815pTemp           RN 12
816
817
818        IMPORT  armCOMM_IDCTCoef
819
820        VLD1        {qXj0,qXj1}, [pSrc @64]!
821        VLD1        {qXj2,qXj3}, [pSrc @64]!
822        VLD1        {qXj4,qXj5}, [pSrc @64]!
823        VLD1        {qXj6,qXj7}, [pSrc @64]!
824
825        ;// Load PreScale and multiply with Src
826        ;// IStage 4
827
828        IF "$inscale"="s16"                         ;// 16X16 Mul
829            M_IDCT_PRESCALE16
830        ENDIF
831
832        IF "$inscale"="s32"                         ;// 32X32 ,ul
833            M_IDCT_PRESCALE32
834        ENDIF
835
836        ;// IStage 3
837        VQRDMULH     qXi2, qXi2, InvSqrt2            ;// i2/sqrt(2)
838        VHADD       qXh0, qXi0, qXi1                ;// (i0+i1)/2
839        VHSUB       qXh1, qXi0, qXi1                ;// (i0-i1)/2
840        VHADD       qXh7, qXi5, qXi7                ;// (i5+i7)/4
841        VSUB        qXh5, qXi5, qXi7                ;// (i5-i7)/2
842        VQRDMULH     qXh5, qXh5, InvSqrt2            ;// h5/sqrt(2)
843        VSUB        qXh2, qXi2, qXi3                ;// h2, h3
844
845        VMULL       qXt0, dXi4lo, C                 ;// c*i4
846        VMLAL       qXt0, dXi6lo, S                 ;// c*i4+s*i6
847        VMULL       qXt1, dXi4hi, C
848        VMLAL       qXt1, dXi6hi, S
849        VSHRN       dXh4lo, qXt0, #16               ;// h4
850        VSHRN       dXh4hi, qXt1, #16
851
852        VMULL       qXt0, dXi6lo, C                 ;// c*i6
853        VMLSL       qXt0, dXi4lo, S                 ;// -s*i4 + c*h6
854        VMULL       qXt1, dXi6hi, C
855        VMLSL       qXt1, dXi4hi, S
856        VSHRN       dXh6lo, qXt0, #16               ;// h6
857        VSHRN       dXh6hi, qXt1, #16
858
859        ;// IStage 2
860        VSUB        qXg6, qXh6, qXh7
861        VSUB        qXg5, qXh5, qXg6
862        VSUB        qXg4, qXh4, qXg5
863        VHADD       qXg1, qXh1, qXh2        ;// (h1+h2)/2
864        VHSUB       qXg2, qXh1, qXh2        ;// (h1-h2)/2
865        VHADD       qXg0, qXh0, qXh3        ;// (h0+h3)/2
866        VHSUB       qXg3, qXh0, qXh3        ;// (h0-h3)/2
867
868        ;// IStage 1 all rows
869        VADD        qXf3, qXg3, qXg4
870        VSUB        qXf4, qXg3, qXg4
871        VADD        qXf2, qXg2, qXg5
872        VSUB        qXf5, qXg2, qXg5
873        VADD        qXf1, qXg1, qXg6
874        VSUB        qXf6, qXg1, qXg6
875        VADD        qXf0, qXg0, qXg7
876        VSUB        qXf7, qXg0, qXg7
877
878        ;// Transpose, store and loop
879XTR0            EQU Src5
880XTR1            EQU Tmp
881XTR2            EQU Src6
882XTR3            EQU Src7
883XTR4            EQU Src3
884XTR5            EQU Src0
885XTR6            EQU Src1
886XTR7            EQU Src2
887XTRt            EQU Src4
888
889qA0             QN  XTR0.S32  ;// for XTRpose
890qA1             QN  XTR1.S32
891qA2             QN  XTR2.S32
892qA3             QN  XTR3.S32
893qA4             QN  XTR4.S32
894qA5             QN  XTR5.S32
895qA6             QN  XTR6.S32
896qA7             QN  XTR7.S32
897
898dB0             DN  XTR0*2+1      ;// for using VSWP
899dB1             DN  XTR1*2+1
900dB2             DN  XTR2*2+1
901dB3             DN  XTR3*2+1
902dB4             DN  XTR4*2
903dB5             DN  XTR5*2
904dB6             DN  XTR6*2
905dB7             DN  XTR7*2
906
907
908        VTRN        qXf0, qXf1
909        VTRN        qXf2, qXf3
910        VTRN        qXf4, qXf5
911        VTRN        qXf6, qXf7
912        VTRN        qA0, qA2
913        VTRN        qA1, qA3
914        VTRN        qA4, qA6
915        VTRN        qA5, qA7
916        VSWP        dB0, dB4
917        VSWP        dB1, dB5
918        VSWP        dB2, dB6
919        VSWP        dB3, dB7
920
921
922qYj0            QN qXf0
923qYj1            QN qXf1
924qYj2            QN qXf2
925qYj3            QN qXf3
926qYj4            QN qXf4
927qYj5            QN qXf5
928qYj6            QN qXf6
929qYj7            QN qXf7
930qYjt            QN qXft
931
932dYj0lo          DN (XTR0*2).S16
933dYj0hi          DN (XTR0*2+1).S16
934dYj1lo          DN (XTR1*2).S16
935dYj1hi          DN (XTR1*2+1).S16
936dYj2lo          DN (XTR2*2).S16
937dYj2hi          DN (XTR2*2+1).S16
938dYj3lo          DN (XTR3*2).S16
939dYj3hi          DN (XTR3*2+1).S16
940dYj4lo          DN (XTR4*2).S16
941dYj4hi          DN (XTR4*2+1).S16
942dYj5lo          DN (XTR5*2).S16
943dYj5hi          DN (XTR5*2+1).S16
944dYj6lo          DN (XTR6*2).S16
945dYj6hi          DN (XTR6*2+1).S16
946dYj7lo          DN (XTR7*2).S16
947dYj7hi          DN (XTR7*2+1).S16
948dYjtlo          DN (XTRt*2).S16
949dYjthi          DN (XTRt*2+1).S16
950
951qYi0            QN qYj0
952qYi1            QN qYj4
953qYi2            QN qYj2
954qYi3            QN qYj7
955qYi4            QN qYj5
956qYi5            QN qYjt
957qYi6            QN qYj1
958qYi7            QN qYj6
959qYit            QN qYj3
960
961dYi0lo          DN dYj0lo
962dYi0hi          DN dYj0hi
963dYi1lo          DN dYj4lo
964dYi1hi          DN dYj4hi
965dYi2lo          DN dYj2lo
966dYi2hi          DN dYj2hi
967dYi3lo          DN dYj7lo
968dYi3hi          DN dYj7hi
969dYi4lo          DN dYj5lo
970dYi4hi          DN dYj5hi
971dYi5lo          DN dYjtlo
972dYi5hi          DN dYjthi
973dYi6lo          DN dYj1lo
974dYi6hi          DN dYj1hi
975dYi7lo          DN dYj6lo
976dYi7hi          DN dYj6hi
977dYitlo          DN dYj3lo
978dYithi          DN dYj3hi
979
980qYh0            QN qYit
981qYh1            QN qYi0
982qYh2            QN qYi2
983qYh3            QN qYi3
984qYh4            QN qYi7
985qYh5            QN qYi5
986qYh6            QN qYi4
987qYh7            QN qYi1
988qYht            QN qYi6
989
990dYh0lo          DN dYitlo
991dYh0hi          DN dYithi
992dYh1lo          DN dYi0lo
993dYh1hi          DN dYi0hi
994dYh2lo          DN dYi2lo
995dYh2hi          DN dYi2hi
996dYh3lo          DN dYi3lo
997dYh3hi          DN dYi3hi
998dYh4lo          DN dYi7lo
999dYh4hi          DN dYi7hi
1000dYh5lo          DN dYi5lo
1001dYh5hi          DN dYi5hi
1002dYh6lo          DN dYi4lo
1003dYh6hi          DN dYi4hi
1004dYh7lo          DN dYi1lo
1005dYh7hi          DN dYi1hi
1006dYhtlo          DN dYi6lo
1007dYhthi          DN dYi6hi
1008
1009qYg0            QN qYh2
1010qYg1            QN qYht
1011qYg2            QN qYh1
1012qYg3            QN qYh0
1013qYg4            QN qYh4
1014qYg5            QN qYh5
1015qYg6            QN qYh6
1016qYg7            QN qYh7
1017qYgt            QN qYh3
1018
1019qYf0            QN qYg6
1020qYf1            QN qYg5
1021qYf2            QN qYg4
1022qYf3            QN qYgt
1023qYf4            QN qYg3
1024qYf5            QN qYg2
1025qYf6            QN qYg1
1026qYf7            QN qYg0
1027qYft            QN qYg7
1028
1029        VRSHR       qYj7, qYj7, #2
1030        VRSHR       qYj6, qYj6, #1
1031
1032        VHADD       qYi5, qYj1, qYj7        ;// i5 = (j1+j7)/2
1033        VSUB        qYi6, qYj1, qYj7        ;// i6 = j1-j7
1034        VHADD       qYi3, qYj2, qYj6        ;// i3 = (j2+j6)/2
1035        VSUB        qYi2, qYj2, qYj6        ;// i2 = j2-j6
1036        VHADD       qYi7, qYj5, qYj3        ;// i7 = (j5+j3)/2
1037        VSUB        qYi4, qYj5, qYj3        ;// i4 = j5-j3
1038
1039        VQRDMULH     qYi2, qYi2, InvSqrt2    ;// i2/sqrt(2)
1040        ;// IStage 4,3 rows 0to1 x 1/2
1041
1042        MOV         pTemp, #0x4             ;// ensure correct round
1043        VDUP        qScale1, pTemp           ;// of DC result
1044        VADD        qYi0, qYi0, qScale1
1045
1046        VHADD       qYh0, qYi0, qYi1        ;// (i0+i1)/2
1047        VHSUB       qYh1, qYi0, qYi1        ;// (i0-i1)/2
1048
1049        VHADD       qYh7, qYi5, qYi7        ;// (i5+i7)/4
1050        VSUB        qYh5, qYi5, qYi7        ;// (i5-i7)/2
1051        VSUB        qYh2, qYi2, qYi3        ;// h2, h3
1052        VQRDMULH     qYh5, qYh5, InvSqrt2    ;// h5/sqrt(2)
1053
1054        VMULL       qXt0, dYi4lo, C         ;// c*i4
1055        VMLAL       qXt0, dYi6lo, S         ;// c*i4+s*i6
1056        VMULL       qXt1, dYi4hi, C
1057        VMLAL       qXt1, dYi6hi, S
1058        VSHRN       dYh4lo, qXt0, #16       ;// h4
1059        VSHRN       dYh4hi, qXt1, #16
1060
1061        VMULL       qXt0, dYi6lo, C         ;// c*i6
1062        VMLSL       qXt0, dYi4lo, S         ;// -s*i4 + c*h6
1063        VMULL       qXt1, dYi6hi, C
1064        VMLSL       qXt1, dYi4hi, S
1065        VSHRN       dYh6lo, qXt0, #16       ;// h6
1066        VSHRN       dYh6hi, qXt1, #16
1067
1068        VSUB        qYg6, qYh6, qYh7
1069        VSUB        qYg5, qYh5, qYg6
1070        VSUB        qYg4, qYh4, qYg5
1071
1072        ;// IStage 2 rows 0to3 x 1/2
1073        VHADD       qYg1, qYh1, qYh2        ;// (h1+h2)/2
1074        VHSUB       qYg2, qYh1, qYh2        ;// (h1-h2)/2
1075        VHADD       qYg0, qYh0, qYh3        ;// (h0+h3)/2
1076        VHSUB       qYg3, qYh0, qYh3        ;// (h0-h3)/2
1077
1078
1079        ;// IStage 1 all rows
1080        VHADD        qYf3, qYg3, qYg4
1081        VHSUB        qYf4, qYg3, qYg4
1082        VHADD        qYf2, qYg2, qYg5
1083        VHSUB        qYf5, qYg2, qYg5
1084        VHADD        qYf1, qYg1, qYg6
1085        VHSUB        qYf6, qYg1, qYg6
1086        VHADD        qYf0, qYg0, qYg7
1087        VHSUB        qYf7, qYg0, qYg7
1088
1089YTR0            EQU Src0
1090YTR1            EQU Src4
1091YTR2            EQU Src1
1092YTR3            EQU Src2
1093YTR4            EQU Src7
1094YTR5            EQU Src5
1095YTR6            EQU Tmp
1096YTR7            EQU Src6
1097YTRt            EQU Src3
1098
1099qC0             QN  YTR0.S32                ;// for YTRpose
1100qC1             QN  YTR1.S32
1101qC2             QN  YTR2.S32
1102qC3             QN  YTR3.S32
1103qC4             QN  YTR4.S32
1104qC5             QN  YTR5.S32
1105qC6             QN  YTR6.S32
1106qC7             QN  YTR7.S32
1107
1108dD0             DN  YTR0*2+1                ;// for using VSWP
1109dD1             DN  YTR1*2+1
1110dD2             DN  YTR2*2+1
1111dD3             DN  YTR3*2+1
1112dD4             DN  YTR4*2
1113dD5             DN  YTR5*2
1114dD6             DN  YTR6*2
1115dD7             DN  YTR7*2
1116
1117        VTRN        qYf0, qYf1
1118        VTRN        qYf2, qYf3
1119        VTRN        qYf4, qYf5
1120        VTRN        qYf6, qYf7
1121        VTRN        qC0, qC2
1122        VTRN        qC1, qC3
1123        VTRN        qC4, qC6
1124        VTRN        qC5, qC7
1125        VSWP        dD0, dD4
1126        VSWP        dD1, dD5
1127        VSWP        dD2, dD6
1128        VSWP        dD3, dD7
1129
1130
1131dYf0U8          DN YTR0*2.U8
1132dYf1U8          DN YTR1*2.U8
1133dYf2U8          DN YTR2*2.U8
1134dYf3U8          DN YTR3*2.U8
1135dYf4U8          DN YTR4*2.U8
1136dYf5U8          DN YTR5*2.U8
1137dYf6U8          DN YTR6*2.U8
1138dYf7U8          DN YTR7*2.U8
1139
1140        ;//
1141        ;// Do saturation if outsize is other than S16
1142        ;//
1143
1144        IF ("$outsize"="u8")
1145            ;// Output range [0-255]
1146            VQMOVN            dYf0U8, qYf0
1147            VQMOVN            dYf1U8, qYf1
1148            VQMOVN            dYf2U8, qYf2
1149            VQMOVN            dYf3U8, qYf3
1150            VQMOVN            dYf4U8, qYf4
1151            VQMOVN            dYf5U8, qYf5
1152            VQMOVN            dYf6U8, qYf6
1153            VQMOVN            dYf7U8, qYf7
1154        ENDIF
1155
1156        IF ("$outsize"="s9")
1157            ;// Output range [-256 to +255]
1158            VQSHL            qYf0, qYf0, #16-9
1159            VQSHL            qYf1, qYf1, #16-9
1160            VQSHL            qYf2, qYf2, #16-9
1161            VQSHL            qYf3, qYf3, #16-9
1162            VQSHL            qYf4, qYf4, #16-9
1163            VQSHL            qYf5, qYf5, #16-9
1164            VQSHL            qYf6, qYf6, #16-9
1165            VQSHL            qYf7, qYf7, #16-9
1166
1167            VSHR             qYf0, qYf0, #16-9
1168            VSHR             qYf1, qYf1, #16-9
1169            VSHR             qYf2, qYf2, #16-9
1170            VSHR             qYf3, qYf3, #16-9
1171            VSHR             qYf4, qYf4, #16-9
1172            VSHR             qYf5, qYf5, #16-9
1173            VSHR             qYf6, qYf6, #16-9
1174            VSHR             qYf7, qYf7, #16-9
1175        ENDIF
1176
1177        ;// Store output depending on the Stride size
1178        IF "$stride"="s"
1179            VST1        qYf0, [pDest @64], Stride
1180            VST1        qYf1, [pDest @64], Stride
1181            VST1        qYf2, [pDest @64], Stride
1182            VST1        qYf3, [pDest @64], Stride
1183            VST1        qYf4, [pDest @64], Stride
1184            VST1        qYf5, [pDest @64], Stride
1185            VST1        qYf6, [pDest @64], Stride
1186            VST1        qYf7, [pDest @64]
1187        ELSE
1188            IF ("$outsize"="u8")
1189                VST1        dYf0U8, [pDest @64], #8
1190                VST1        dYf1U8, [pDest @64], #8
1191                VST1        dYf2U8, [pDest @64], #8
1192                VST1        dYf3U8, [pDest @64], #8
1193                VST1        dYf4U8, [pDest @64], #8
1194                VST1        dYf5U8, [pDest @64], #8
1195                VST1        dYf6U8, [pDest @64], #8
1196                VST1        dYf7U8, [pDest @64]
1197            ELSE
1198                ;// ("$outsize"="s9") or ("$outsize"="s16")
1199                VST1        qYf0, [pDest @64], #16
1200                VST1        qYf1, [pDest @64], #16
1201                VST1        qYf2, [pDest @64], #16
1202                VST1        qYf3, [pDest @64], #16
1203                VST1        qYf4, [pDest @64], #16
1204                VST1        qYf5, [pDest @64], #16
1205                VST1        qYf6, [pDest @64], #16
1206                VST1        qYf7, [pDest @64]
1207            ENDIF
1208
1209        ENDIF
1210
1211
1212
1213        ENDIF ;// CortexA8
1214
1215
1216
1217        MEND
1218
1219        ;// Scale TWO input rows with TWO rows of 16 bit scale values
1220        ;//
1221        ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
1222        ;// input (eight values per row) with two rows of scale values. It also
1223        ;// loads the next two rows of scale values from pScale when $LastRow is 0.
1224        ;//
1225        ;// Input Registers:
1226        ;//
1227        ;// $dAlo           - Input D register with first four S16 values of row n
1228        ;// $dAhi           - Input D register with next four S16 values of row n
1229        ;// $dBlo           - Input D register with first four S16 values of row n+1
1230        ;// $dBhi           - Input D register with next four S16 values of row n+1
1231        ;// pScale          - Pointer to next row of scale values
1232        ;// qT0lo           - Temporary scratch register (clobbered: products of $dAlo)
1233        ;// qT0hi           - Temporary scratch register (clobbered: products of $dAhi)
1234        ;// qT1lo           - Temporary scratch register (clobbered: products of $dBlo)
1235        ;// qT1hi           - Temporary scratch register (clobbered: products of $dBhi)
1236        ;// dScale1lo       - Scale values of row n   (multiplied with $dAlo)
1237        ;// dScale1hi       - Scale values of row n   (multiplied with $dAhi)
1238        ;// dScale2lo       - Scale values of row n+1 (multiplied with $dBlo)
1239        ;// dScale2hi       - Scale values of row n+1 (multiplied with $dBhi)
1240        ;//
1241        ;// Input Flag
1242        ;//
1243        ;// $LastRow        - 0: load the next two scale rows; any other value: skip the load
1244        ;//
1245        ;// Output Registers:
1246        ;//
1247        ;// $dAlo           - Scaled output values (first four S16 of row n)
1248        ;// $dAhi           - Scaled output values (next four S16 of row n)
1249        ;// $dBlo           - Scaled output values (first four S16 of row n+1)
1250        ;// $dBhi           - Scaled output values (next four S16 of row n+1)
1251        ;// qScale1         - Scale values for row n+2 (reloaded only when $LastRow is 0)
1252        ;// qScale2         - Scale values for row n+3 (reloaded only when $LastRow is 0)
1253        ;// pScale          - Advanced by 2*16 bytes when $LastRow is 0, otherwise unchanged
1254        ;//
1255        MACRO
1256        M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
1257        VMULL       qT0lo, $dAlo, dScale1lo         ;// long multiply: row n,   first four values
1258        VMULL       qT0hi, $dAhi, dScale1hi         ;// long multiply: row n,   next four values
1259        VMULL       qT1lo, $dBlo, dScale2lo         ;// long multiply: row n+1, first four values
1260        VMULL       qT1hi, $dBhi, dScale2hi         ;// long multiply: row n+1, next four values
            ;// The current scale values have been consumed by the multiplies above,
            ;// so the scale registers can be refilled for the next row pair here.
1261        IF "$LastRow"="0"
1262            VLD1        qScale1, [pScale], #16  ;// Load scale for row n+2
1263            VLD1        qScale2, [pScale], #16  ;// Load scale for row n+3
1264        ENDIF
            ;// Narrow each product back into the input register:
            ;// saturating, rounding shift right by 12.
1265        VQRSHRN       $dAlo, qT0lo, #12
1266        VQRSHRN       $dAhi, qT0hi, #12
1267        VQRSHRN       $dBlo, qT1lo, #12
1268        VQRSHRN       $dBhi, qT1hi, #12
1269        MEND
1270
1271        ;// Scale 8x8 block input values with 16 bit scale values
1272        ;//
1273        ;// This macro pre-scales a block of 8x8 input values, two rows at a time.
1274        ;// It also performs the first butterfly stage of the IDCT on rows 1-3 and 5-7.
            ;// The sums are halved (VHADD); the differences are NOT halved (VSUB).
1275        ;//
1276        ;// Input Registers:
1277        ;//
1278        ;// dXjnlo          - n th input D register with first four S16 values
1279        ;// dXjnhi          - n th input D register with next four S16 values
1280        ;// qXjn            - n th input Q register with eight S16 values
1281        ;// pScale          - Pointer to scale values
1282        ;//
1283        ;// Output Registers:
1284        ;//
1285        ;// qXin            - n th output Q register with eight S16 output values of 1st stage
            ;//                   (written here for n = 2..7; NOTE(review): i0/i1 are presumably
            ;//                   register aliases of the scaled j0/j4 - confirm at the QN definitions)
            ;// dCoefs          - Loaded from armCOMM_IDCTCoef
            ;// pSrc            - Overwritten with the address of armCOMM_IDCTCoef
            ;// pScale          - Advanced by 8 rows * 16 bytes = 128 bytes
            ;// qScale1/qScale2 and the M_IDCT_SCALE16 scratch registers are clobbered
1286        ;//
1287        MACRO
1288        M_IDCT_PRESCALE16
1289        VLD1        qScale1, [pScale], #16      ;// Load Pre scale for row 0
1290        VLD1        qScale2, [pScale], #16      ;// Load Pre scale for row 1
1291        M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0        ;// Pre scale row 0 & 1
1292        M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0        ;// Pre scale row 2 & 3
1293        M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0        ;// Pre scale row 4 & 5
1294        M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1        ;// Pre scale row 6 & 7, no further scale load
            ;// First butterfly stage; the constant-table load is interleaved with it.
1295        VHADD       qXi5, qXj1, qXj7            ;// (j1+j7)/2
1296        VSUB        qXi6, qXj1, qXj7            ;// j1-j7
1297        LDR         pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants (pSrc is overwritten)
1298        VHADD       qXi3, qXj2, qXj6            ;// (j2+j6)/2
1299        VSUB        qXi2, qXj2, qXj6            ;// j2-j6
1300        VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
1301        VHADD       qXi7, qXj5, qXj3            ;// (j5+j3)/2
1302        VSUB        qXi4, qXj5, qXj3            ;// j5-j3
1303        MEND
1304
1305
1306        ;// Scale 8x8 block input values with 32 bit scale values
1307        ;//
1308        ;// This macro pre-scales a block of 8x8 input values with 32 bit scale values.
1309        ;// It also performs the first butterfly stage of the IDCT on the scaled rows.
            ;//
            ;// Each input row is widened to 32 bit with a left shift of (12-1),
            ;// multiplied by its scale row with VQRDMULH (saturating, rounding,
            ;// doubling, high half) and the result is narrowed back to 16 bit with VMOVN.
            ;// Rows are handled in the order 0, (1,7), (2,6), (3,5), 4 so that each
            ;// butterfly pair is available together.
1310        ;//
1311        ;// Input Registers:
1312        ;//
1313        ;// dXjnlo          - n th input D register with first four S16 values
1314        ;// dXjnhi          - n th input D register with next four S16 values
1315        ;// qXjn            - n th input Q register with eight S16 values
1316        ;// pScale          - Pointer to 32bit scale values in Q23 format
            ;// pSrc            - Used to reload input row 4 from [pSrc - 64]
            ;//                   NOTE(review): presumably points just past the 8x8 S16 input
            ;//                   block on entry - confirm against the caller
1317        ;//
1318        ;// Output Registers:
1319        ;//
1320        ;// dXinlo          - n th output D register with first four S16 output values of 1st stage
1321        ;// dXinhi          - n th output D register with next four S16 output values of 1st stage
            ;// dCoefs          - Loaded from armCOMM_IDCTCoef
            ;// pSrc            - Overwritten with the address of armCOMM_IDCTCoef
            ;// pScale          - Left pointing at scale row 4 (advanced by 4 rows * 32 bytes)
            ;// pTemp           - Clobbered (walks scale rows 7, 6, 5)
            ;// Q0-Q6 and the Q register named by Src4 are used as scratch
1322        ;//
1323        MACRO
1324        M_IDCT_PRESCALE32
            ;// Scale registers: only two pairs exist (Q0/Q1 and Q2/Q3).
            ;// Rows 1-4 share Q2/Q3 and rows 0,5,6,7 share Q0/Q1, so each
            ;// scale row must be consumed before the next one is loaded over it.
1325qScale0lo       QN 0.S32
1326qScale0hi       QN 1.S32
1327qScale1lo       QN 2.S32
1328qScale1hi       QN 3.S32
1329qScale2lo       QN qScale1lo
1330qScale2hi       QN qScale1hi
1331qScale3lo       QN qScale1lo
1332qScale3hi       QN qScale1hi
1333qScale4lo       QN qScale1lo
1334qScale4hi       QN qScale1hi
1335qScale5lo       QN qScale0lo
1336qScale5hi       QN qScale0hi
1337qScale6lo       QN qScale0lo
1338qScale6hi       QN qScale0hi
1339qScale7lo       QN qScale0lo
1340qScale7hi       QN qScale0hi
1341
            ;// Widened (32 bit) source rows: rows 0,2,3,4,7 share Q4/Q5 and
            ;// rows 1,5,6 share Q6 plus the register named by Src4 (defined
            ;// outside this macro).
1342qSrc0lo         QN 4.S32
1343qSrc0hi         QN 5.S32
1344qSrc1lo         QN 6.S32
1345qSrc1hi         QN Src4.S32
1346qSrc2lo         QN qSrc0lo
1347qSrc2hi         QN qSrc0hi
1348qSrc3lo         QN qSrc0lo
1349qSrc3hi         QN qSrc0hi
1350qSrc4lo         QN qSrc0lo
1351qSrc4hi         QN qSrc0hi
1352qSrc5lo         QN qSrc1lo
1353qSrc5hi         QN qSrc1hi
1354qSrc6lo         QN qSrc1lo
1355qSrc6hi         QN qSrc1hi
1356qSrc7lo         QN qSrc0lo
1357qSrc7hi         QN qSrc0hi
1358
            ;// Butterfly results reuse Q0/Q1, i.e. the scale registers of the
            ;// row (7, 6 or 5) that has just been consumed.
1359qRes17lo        QN qScale0lo
1360qRes17hi        QN qScale0hi
1361qRes26lo        QN qScale0lo
1362qRes26hi        QN qScale0hi
1363qRes53lo        QN qScale0lo
1364qRes53hi        QN qScale0hi
1365
1366            ADD         pTemp, pScale, #4*8*7           ;// pTemp = address of scale row 7 (8 x 4 bytes per row)
1367
1368            ;// Row 0
1369            VLD1        {qScale0lo, qScale0hi}, [pScale]!   ;// Scale row 0, pScale -> row 1
1370            VSHLL       qSrc0lo, dXj0lo, #(12-1)            ;// Widen row 0 to 32 bit, << 11
1371            VSHLL       qSrc0hi, dXj0hi, #(12-1)
1372            VLD1        {qScale1lo, qScale1hi}, [pScale]!   ;// Scale row 1, pScale -> row 2
1373            VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo         ;// Row 0 * scale 0
1374            VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
1375            VLD1        {qScale7lo, qScale7hi}, [pTemp]!    ;// Scale row 7 into Q0/Q1 (scale row 0 is consumed)
1376            VSHLL       qSrc1lo, dXj1lo, #(12-1)            ;// Widen row 1
1377            VSHLL       qSrc1hi, dXj1hi, #(12-1)
1378            VMOVN       dXi0lo, qSrc0lo                 ;// Output i0
1379            VMOVN       dXi0hi, qSrc0hi
1380            VSHLL       qSrc7lo, dXj7lo, #(12-1)            ;// Widen row 7 into Q4/Q5 (row 0 already narrowed out)
1381            VSHLL       qSrc7hi, dXj7hi, #(12-1)
                ;// Undo the 2*16 byte writeback and step back one 4*8 byte row: pTemp -> scale row 6
1382            SUB         pTemp, pTemp, #((16*2)+(4*8*1))
1383            VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo         ;// Row 1 * scale 1
1384            VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
1385            VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo         ;// Row 7 * scale 7
1386            VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
1387            VLD1        {qScale2lo, qScale2hi}, [pScale]!   ;// Scale row 2 into Q2/Q3 (scale row 1 is consumed)
1388
1389            ;// Row 1 & 7
1390            VHADD       qRes17lo, qSrc1lo, qSrc7lo      ;// (j1+j7)/2
1391            VHADD       qRes17hi, qSrc1hi, qSrc7hi      ;// (j1+j7)/2
1392            VMOVN       dXi5lo, qRes17lo                ;// Output i5
1393            VMOVN       dXi5hi, qRes17hi
1394            VSUB        qRes17lo, qSrc1lo, qSrc7lo      ;// j1-j7
1395            VSUB        qRes17hi, qSrc1hi, qSrc7hi      ;// j1-j7
1396            VMOVN       dXi6lo, qRes17lo                ;// Output i6
1397            VMOVN       dXi6hi, qRes17hi
1398            VSHLL       qSrc2lo, dXj2lo, #(12-1)            ;// Widen row 2
1399            VSHLL       qSrc2hi, dXj2hi, #(12-1)
1400            VLD1        {qScale6lo, qScale6hi}, [pTemp]!    ;// Scale row 6 into Q0/Q1 (qRes17 already narrowed out)
1401            VSHLL       qSrc6lo, dXj6lo, #(12-1)            ;// Widen row 6
1402            VSHLL       qSrc6hi, dXj6hi, #(12-1)
                ;// Same step back as above: pTemp -> scale row 5
1403            SUB         pTemp, pTemp, #((16*2)+(4*8*1))
1404            VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo         ;// Row 2 * scale 2
1405            VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
1406            VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo         ;// Row 6 * scale 6
1407            VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
1408            VLD1        {qScale3lo, qScale3hi}, [pScale]!   ;// Scale row 3 into Q2/Q3, pScale -> row 4
1409
1410            ;// Row 2 & 6
1411            VHADD       qRes26lo, qSrc2lo, qSrc6lo      ;// (j2+j6)/2
1412            VHADD       qRes26hi, qSrc2hi, qSrc6hi      ;// (j2+j6)/2
1413            VMOVN       dXi3lo, qRes26lo                ;// Output i3
1414            VMOVN       dXi3hi, qRes26hi
1415            VSUB        qRes26lo, qSrc2lo, qSrc6lo      ;// j2-j6
1416            VSUB        qRes26hi, qSrc2hi, qSrc6hi      ;// j2-j6
1417            VMOVN       dXi2lo, qRes26lo                ;// Output i2
1418            VMOVN       dXi2hi, qRes26hi
1419            VSHLL       qSrc3lo, dXj3lo, #(12-1)            ;// Widen row 3
1420            VSHLL       qSrc3hi, dXj3hi, #(12-1)
1421            VLD1        {qScale5lo, qScale5hi}, [pTemp]!    ;// Scale row 5 into Q0/Q1 (qRes26 already narrowed out)
1422            VSHLL       qSrc5lo, dXj5lo, #(12-1)            ;// Widen row 5
1423            VSHLL       qSrc5hi, dXj5hi, #(12-1)
1424            VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo         ;// Row 3 * scale 3
1425            VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
1426            VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo         ;// Row 5 * scale 5
1427            VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi
1428
1429            ;// Row 3 & 5
1430            VHADD       qRes53lo, qSrc5lo, qSrc3lo      ;// (j5+j3)/2
1431            VHADD       qRes53hi, qSrc5hi, qSrc3hi      ;// (j5+j3)/2
                ;// Step pSrc back by four 16-byte rows.
                ;// NOTE(review): presumably this lands on input row 4 - confirm against the caller.
1432            SUB         pSrc, pSrc, #16*2*2
1433            VMOVN       dXi7lo, qRes53lo                ;// Output i7
1434            VMOVN       dXi7hi, qRes53hi
1435            VSUB        qRes53lo, qSrc5lo, qSrc3lo      ;// j5-j3
1436            VSUB        qRes53hi, qSrc5hi, qSrc3hi      ;// j5-j3
                ;// Reload row 4 from memory.
                ;// NOTE(review): presumably qXj4 lives in a register that was used as scratch
                ;// above (qSrc1hi is declared on Src4) - confirm at the qXj4 definition.
1437            VLD1        qXj4, [pSrc @64]
1438            VMOVN       dXi4lo, qRes53lo                ;// Output i4
1439            VMOVN       dXi4hi, qRes53hi
1440            VSHLL       qSrc4lo, dXj4lo, #(12-1)            ;// Widen row 4 into Q4/Q5
1441            VSHLL       qSrc4hi, dXj4hi, #(12-1)
1442            VLD1        {qScale4lo, qScale4hi}, [pScale]    ;// Scale row 4 into Q2/Q3 (no writeback)
1443            LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants
1444            VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo         ;// Row 4 * scale 4
1445            VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
1446            VLDR        dCoefs, [pSrc]                  ;// Load DCT inverse AAN constants
1447            ;// Row 4
1448            VMOVN       dXi1lo, qSrc4lo                 ;// Output i1
1449            VMOVN       dXi1hi, qSrc4hi
1450
1451        MEND
1452
1453        END
1454