armCOMM_IDCT_s.h revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
;//
;// Copyright (C) 2004 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// IDCT_s.s
;//
;// Inverse DCT module
;//
;//
;// ALGORITHM DESCRIPTION
;//
;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
;// column and then a 1D IDCT for each row.
;//
;// The 8-point 1D IDCT is defined by
;//   f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
;//
;//   C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
;//   c(u,x) = cos( (2x+1)*u*pi/16 )
;//
;// We compute the 8-point 1D IDCT using the reverse of
;// the Arai-Agui-Nakajima flow graph which we split into
;// 5 stages named in reverse order to identify with the
;// forward DCT. Direct inversion of the forward formulae
;// in file FDCT_s.s gives:
;//
;// IStage 5:   j(u) = T(u)*A(u)  [ A(u)=4*C(u)*c(u,0) ]
;//             [ A(0) = 2*sqrt(2)
;//               A(u) = 4*cos(u*pi/16)  for (u!=0) ]
;//
;// IStage 4:   i0 = j0             i1 = j4
;//             i3 = (j2+j6)/2      i2 = (j2-j6)/2
;//             i7 = (j5+j3)/2      i4 = (j5-j3)/2
;//             i5 = (j1+j7)/2      i6 = (j1-j7)/2
;//
;// IStage 3:   h0 = (i0+i1)/2      h1 = (i0-i1)/2
;//             h2 = (i2*sqrt2)-i3  h3 = i3
;//             h4 =  cos(pi/8)*i4 + sin(pi/8)*i6
;//             h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
;//             [ The above two lines rotate by -(pi/8) ]
;//             h5 = (i5-i7)/sqrt2  h7 = (i5+i7)/2
;//
;// IStage 2:   g0 = (h0+h3)/2      g3 = (h0-h3)/2
;//             g1 = (h1+h2)/2      g2 = (h1-h2)/2
;//             g7 = h7             g6 = h6 - h7
;//             g5 = h5 - g6        g4 = h4 - g5
;//
;// IStage 1:   f0 = (g0+g7)/2      f7 = (g0-g7)/2
;//             f1 = (g1+g6)/2      f6 = (g1-g6)/2
;//             f2 = (g2+g5)/2      f5 = (g2-g5)/2
;//             f3 = (g3+g4)/2      f4 = (g3-g4)/2
;//
;// Note that most coefficients are halved 3 times during the
;// above calculation. We can rescale the algorithm dividing
;// the input by 8 to remove the halvings.
;//
;// IStage 5:   j(u) = T(u)*A(u)/8
;//
;// IStage 4:   i0 = j0             i1 = j4
;//             i3 = j2 + j6        i2 = j2 - j6
;//             i7 = j5 + j3        i4 = j5 - j3
;//             i5 = j1 + j7        i6 = j1 - j7
;//
;// IStage 3:   h0 = i0 + i1        h1 = i0 - i1
;//             h2 = (i2*sqrt2)-i3  h3 = i3
;//             h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
;//             h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
;//             h5 = (i5-i7)*sqrt2  h7 = i5 + i7
;//
;// IStage 2:   g0 = h0 + h3        g3 = h0 - h3
;//             g1 = h1 + h2        g2 = h1 - h2
;//             g7 = h7             g6 = h6 - h7
;//             g5 = h5 - g6        g4 = h4 - g5
;//
;// IStage 1:   f0 = g0 + g7        f7 = g0 - g7
;//             f1 = g1 + g6        f6 = g1 - g6
;//             f2 = g2 + g5        f5 = g2 - g5
;//             f3 = g3 + g4        f4 = g3 - g4
;//
;// Note:
;// 1. The scaling by A(u)/8 can often be combined with inverse
;//    quantization. The column and row scalings can be combined.
;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
;//    to the above code but is otherwise identical.
;// 3. The rotation by -pi/8 can be performed using three multiplies
;//    Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
;//       -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
;// 4. If |T(u)|<=1 then from the IDCT definition,
;//    |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
;//            = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
;//            = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
;//            = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
;//            = (approx)2.64
;//    So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
;//    The table below shows input patterns generating the maximum
;//    value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1
;//    InputPattern      Max |f(x)|
;//      PPPPPPPP        |f0| =  2.64
;//      PPPMMMMM        |f1| =  2.64
;//      PPMMMPPP        |f2| =  2.64
;//      PPMMPPMM        |f3| =  2.64
;//      PMMPPMMP        |f4| =  2.64
;//      PMMPMMPM        |f5| =  2.64
;//      PMPPMPMP        |f6| =  2.64
;//      PMPMPMPM        |f7| =  2.64
;//   Note that this input pattern is the transpose of the
;//   corresponding max input pattern for the FDCT.
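;//
;// For reference, the rescaled flow graph above corresponds to the
;// following straight-line C sketch of one 8-point IDCT (illustrative
;// only: it works in floating point and ignores the fixed-point
;// scaling, rounding and packing done by the code below; it assumes
;// <math.h> for cos/sin/sqrt):
;//
;//   void idct8_aan(const float j[8], float f[8])  /* j[u] = T(u)*A(u)/8 */
;//   {
;//       float c = cos(M_PI/8), s = sin(M_PI/8), r2 = sqrt(2.0);
;//       /* IStage 4 */
;//       float i0 = j[0],         i1 = j[4];
;//       float i3 = j[2] + j[6],  i2 = j[2] - j[6];
;//       float i7 = j[5] + j[3],  i4 = j[5] - j[3];
;//       float i5 = j[1] + j[7],  i6 = j[1] - j[7];
;//       /* IStage 3 */
;//       float h0 = i0 + i1,      h1 = i0 - i1;
;//       float h2 = i2*r2 - i3,   h3 = i3;
;//       float h4 = 2*( c*i4 + s*i6);
;//       float h6 = 2*(-s*i4 + c*i6);
;//       float h5 = (i5 - i7)*r2, h7 = i5 + i7;
;//       /* IStage 2 */
;//       float g0 = h0 + h3,      g3 = h0 - h3;
;//       float g1 = h1 + h2,      g2 = h1 - h2;
;//       float g7 = h7,           g6 = h6 - h7;
;//       float g5 = h5 - g6,      g4 = h4 - g5;
;//       /* IStage 1 */
;//       f[0] = g0 + g7;  f[7] = g0 - g7;
;//       f[1] = g1 + g6;  f[6] = g1 - g6;
;//       f[2] = g2 + g5;  f[5] = g2 - g5;
;//       f[3] = g3 + g4;  f[4] = g3 - g4;
;//   }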

;// Arguments

pSrc    RN 0    ;// source data buffer
Stride  RN 1    ;// destination stride in bytes
pDest   RN 2    ;// destination data buffer
pScale  RN 3    ;// pointer to scaling table


        ;// DCT Inverse Macro
        ;// The DCT code should be parametrized according
        ;// to the following inputs:
        ;// $outsize = "u8"  :  8-bit unsigned data saturated (0 to +255)
        ;//            "s9"  : 16-bit signed data saturated to 9-bit (-256 to +255)
        ;//            "s16" : 16-bit signed data not saturated (max size ~+/-14273)
        ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
        ;//            "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
        ;//
        ;// Inputs:
        ;// pSrc   = r0 = Pointer to input data
        ;//               Range is -256 to +255 (9-bit)
        ;// Stride = r1 = Stride between input lines
        ;// pDest  = r2 = Pointer to output data
        ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
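        ;//
        ;// A hypothetical instantiation (for illustration only; the real
        ;// wrapper sources choose arguments to match their own output
        ;// format and destination stride) might look like:
        ;//
        ;//     M_IDCT  s9, s16, 32     ;// 9-bit saturated output, Q15 scale
        ;//                             ;// table, fixed 32-byte output stride
        ;//     M_IDCT  u8, s32, s      ;// 8-bit output, Q23 scale table,
        ;//                             ;// stride passed at run time in r1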



        MACRO
        M_IDCT  $outsize, $inscale, $stride
        LCLA    SHIFT


        IF ARM1136JS

;// REGISTER ALLOCATION
;// This is hard since we have 8 values, 9 free registers and each
;// butterfly requires a temporary register. We also want to
;// maintain register order so we can use LDM/STM. The table below
;// summarises the register allocation that meets all these criteria.
;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
;//
;// r1  a01     g0  h0
;// r4  b01 f0  g1  h1  i0
;// r5  a23 f1  g2      i1
;// r6  b23 f2  g3  h2  i2
;// r7  a45 f3      h3  i3
;// r8  b45 f4  g4  h4  i4
;// r9  a67 f5  g5  h5  i5
;// r10 b67 f6  g6  h6  i6
;// r11     f7  g7  h7  i7
;//
ra01    RN 1
rb01    RN 4
ra23    RN 5
rb23    RN 6
ra45    RN 7
rb45    RN 8
ra67    RN 9
rb67    RN 10
rtmp    RN 11
csPiBy8 RN 12   ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
LoopRR2 RN 14   ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
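;// The literal pool values loaded into these two registers below encode
;// the constants in Q15:
;//   0x30fc7642 = [0x30fc | 0x7642] = [round(32768*sin(pi/8)) | round(32768*cos(pi/8))]
;//              = [12540 | 30274]      (sin(pi/8)=0.3827, cos(pi/8)=0.9239)
;//   0x00005a82 = 23170 = round(32768/sqrt(2)); the loop count is kept in
;//   the top bits of the same register and advanced by the ADDS #2<<29
;//   "done two rows" instructions further down.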
;// Transpose allocation
xft     RN ra01
xf0     RN rb01
xf1     RN ra23
xf2     RN rb23
xf3     RN ra45
xf4     RN rb45
xf5     RN ra67
xf6     RN rb67
xf7     RN rtmp
;// IStage 1 allocation
xg0     RN xft
xg1     RN xf0
xg2     RN xf1
xg3     RN xf2
xgt     RN xf3
xg4     RN xf4
xg5     RN xf5
xg6     RN xf6
xg7     RN xf7
;// IStage 2 allocation
xh0     RN xg0
xh1     RN xg1
xht     RN xg2
xh2     RN xg3
xh3     RN xgt
xh4     RN xg4
xh5     RN xg5
xh6     RN xg6
xh7     RN xg7
;// IStage 3,4 allocation
xit     RN xh0
xi0     RN xh1
xi1     RN xht
xi2     RN xh2
xi3     RN xh3
xi4     RN xh4
xi5     RN xh5
xi6     RN xh6
xi7     RN xh7

        M_STR   pDest,  ppDest
        IF "$stride"="s"
            M_STR   Stride, pStride
        ENDIF
        M_ADR   pDest,  pBlk
        LDR     csPiBy8, =0x30fc7642
        LDR     LoopRR2, =0x00005a82

v6_idct_col$_F
        ;// Load even values
        LDR     xi4, [pSrc], #4  ;// j0
        LDR     xi5, [pSrc, #4*16-4]  ;// j4
        LDR     xi6, [pSrc, #2*16-4]  ;// j2
        LDR     xi7, [pSrc, #6*16-4]  ;// j6

        ;// Scale Even Values
        IF "$inscale"="s16" ;// 16x16 mul
SHIFT       SETA    12
            LDR     xi0, [pScale], #4
            LDR     xi1, [pScale, #4*16-4]
            LDR     xi2, [pScale, #2*16-4]
            MOV     xit, #1<<(SHIFT-1)
            SMLABB  xi3, xi0, xi4, xit
            SMLATT  xi4, xi0, xi4, xit
            SMLABB  xi0, xi1, xi5, xit
            SMLATT  xi5, xi1, xi5, xit
            MOV     xi3, xi3, ASR #SHIFT
            PKHBT   xi4, xi3, xi4, LSL #(16-SHIFT)
            LDR     xi3, [pScale, #6*16-4]
            SMLABB  xi1, xi2, xi6, xit
            SMLATT  xi6, xi2, xi6, xit
            MOV     xi0, xi0, ASR #SHIFT
            PKHBT   xi5, xi0, xi5, LSL #(16-SHIFT)
            SMLABB  xi2, xi3, xi7, xit
            SMLATT  xi7, xi3, xi7, xit
            MOV     xi1, xi1, ASR #SHIFT
            PKHBT   xi6, xi1, xi6, LSL #(16-SHIFT)
            MOV     xi2, xi2, ASR #SHIFT
            PKHBT   xi7, xi2, xi7, LSL #(16-SHIFT)
        ENDIF
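        ;// (In the block above each 32-bit word holds the column-a value in
        ;// its bottom halfword and the column-b value in its top halfword.
        ;// SMLABB/SMLATT multiply each half by the matching 16-bit scale
        ;// entry and add the rounding bias xit; one product is renormalised
        ;// with ASR #SHIFT and the other by the LSL #(16-SHIFT) inside
        ;// PKHBT, which repacks the two results as packed halfwords.)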
        IF "$inscale"="s32" ;// 32x16 mul
SHIFT       SETA    (12+8-16)
            MOV     xit, #1<<(SHIFT-1)
            LDR     xi0, [pScale], #8
            LDR     xi1, [pScale, #0*32+4-8]
            LDR     xi2, [pScale, #4*32-8]
            LDR     xi3, [pScale, #4*32+4-8]
            SMLAWB  xi0, xi0, xi4, xit
            SMLAWT  xi1, xi1, xi4, xit
            SMLAWB  xi2, xi2, xi5, xit
            SMLAWT  xi3, xi3, xi5, xit
            MOV     xi0, xi0, ASR #SHIFT
            PKHBT   xi4, xi0, xi1, LSL #(16-SHIFT)
            MOV     xi2, xi2, ASR #SHIFT
            PKHBT   xi5, xi2, xi3, LSL #(16-SHIFT)
            LDR     xi0, [pScale, #2*32-8]
            LDR     xi1, [pScale, #2*32+4-8]
            LDR     xi2, [pScale, #6*32-8]
            LDR     xi3, [pScale, #6*32+4-8]
            SMLAWB  xi0, xi0, xi6, xit
            SMLAWT  xi1, xi1, xi6, xit
            SMLAWB  xi2, xi2, xi7, xit
            SMLAWT  xi3, xi3, xi7, xit
            MOV     xi0, xi0, ASR #SHIFT
            PKHBT   xi6, xi0, xi1, LSL #(16-SHIFT)
            MOV     xi2, xi2, ASR #SHIFT
            PKHBT   xi7, xi2, xi3, LSL #(16-SHIFT)
        ENDIF

        ;// Load odd values
        LDR     xi0, [pSrc, #1*16-4]      ;// j1
        LDR     xi1, [pSrc, #7*16-4]      ;// j7
        LDR     xi2, [pSrc, #5*16-4]      ;// j5
        LDR     xi3, [pSrc, #3*16-4]      ;// j3

        IF  {TRUE}
            ;// shortcut if odd values 0
            TEQ     xi0, #0
            TEQEQ   xi1, #0
            TEQEQ   xi2, #0
            TEQEQ   xi3, #0
            BEQ     v6OddZero$_F
        ENDIF

        ;// Store scaled even values
        STMIA   pDest, {xi4, xi5, xi6, xi7}

        ;// Scale odd values
        IF "$inscale"="s16"
            ;// Perform AAN Scale
            LDR     xi4, [pScale, #1*16-4]
            LDR     xi5, [pScale, #7*16-4]
            LDR     xi6, [pScale, #5*16-4]
            SMLABB  xi7, xi0, xi4, xit
            SMLATT  xi0, xi0, xi4, xit
            SMLABB  xi4, xi1, xi5, xit
            SMLATT  xi1, xi1, xi5, xit
            MOV     xi7, xi7, ASR #SHIFT
            PKHBT   xi0, xi7, xi0, LSL #(16-SHIFT)
            LDR     xi7, [pScale, #3*16-4]
            SMLABB  xi5, xi2, xi6, xit
            SMLATT  xi2, xi2, xi6, xit
            MOV     xi4, xi4, ASR #SHIFT
            PKHBT   xi1, xi4, xi1, LSL #(16-SHIFT)
            SMLABB  xi6, xi3, xi7, xit
            SMLATT  xi3, xi3, xi7, xit
            MOV     xi5, xi5, ASR #SHIFT
            PKHBT   xi2, xi5, xi2, LSL #(16-SHIFT)
            MOV     xi6, xi6, ASR #SHIFT
            PKHBT   xi3, xi6, xi3, LSL #(16-SHIFT)
        ENDIF
        IF "$inscale"="s32" ;// 32x16 mul
            LDR     xi4, [pScale, #1*32-8]
            LDR     xi5, [pScale, #1*32+4-8]
            LDR     xi6, [pScale, #7*32-8]
            LDR     xi7, [pScale, #7*32+4-8]
            SMLAWB  xi4, xi4, xi0, xit
            SMLAWT  xi5, xi5, xi0, xit
            SMLAWB  xi6, xi6, xi1, xit
            SMLAWT  xi7, xi7, xi1, xit
            MOV     xi4, xi4, ASR #SHIFT
            PKHBT   xi0, xi4, xi5, LSL #(16-SHIFT)
            MOV     xi6, xi6, ASR #SHIFT
            PKHBT   xi1, xi6, xi7, LSL #(16-SHIFT)
            LDR     xi4, [pScale, #5*32-8]
            LDR     xi5, [pScale, #5*32+4-8]
            LDR     xi6, [pScale, #3*32-8]
            LDR     xi7, [pScale, #3*32+4-8]
            SMLAWB  xi4, xi4, xi2, xit
            SMLAWT  xi5, xi5, xi2, xit
            SMLAWB  xi6, xi6, xi3, xit
            SMLAWT  xi7, xi7, xi3, xit
            MOV     xi4, xi4, ASR #SHIFT
            PKHBT   xi2, xi4, xi5, LSL #(16-SHIFT)
            MOV     xi6, xi6, ASR #SHIFT
            PKHBT   xi3, xi6, xi7, LSL #(16-SHIFT)
        ENDIF

        LDR     xit, =0x00010001        ;// rounding constant
        SADD16 xi5, xi0, xi1           ;// (j1+j7)/2
        SHADD16 xi5, xi5, xit

        SSUB16  xi6, xi0, xi1           ;// j1-j7
        SADD16 xi7, xi2, xi3           ;// (j5+j3)/2
        SHADD16 xi7, xi7, xit

        SSUB16  xi4, xi2, xi3           ;// j5-j3

        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2

        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b

        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]

        SMULBB  xi1, xi3, LoopRR2
        SMULTB  xi3, xi3, LoopRR2

        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4

        ;// xi0,xi1,xi2,xi3 now free
        ;// IStage 4,3, rows 2to3 x1/2

        MOV     xi3, xi3, LSL #1
        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
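        ;// (SMULBB/SMULTB above multiply each signed halfword of (i5-i7)/2
        ;// by the Q15 constant 1/sqrt(2) kept in the bottom of LoopRR2; the
        ;// LSL #1 plus PKHTB ..., ASR #15 pair then extracts bits [30:15] of
        ;// each 32-bit product, i.e. the products scaled back from Q15, and
        ;// repacks them as two halfwords, giving h5/4 for both columns.)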
        LDRD    xi0, [pDest, #8]        ;// j2,j6 scaled

        ;// IStage 2, rows4to7
        SSUB16  xg6, xh6, xh7
        SSUB16  xg5, xh5, xg6
        SSUB16  xg4, xh4, xg5

        SSUB16  xi2, xi0, xi1           ;// (j2-j6)

        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2

        SMULBB  xi0, xi2, LoopRR2
        SMULTB  xi2, xi2, LoopRR2

        MOV     xi2, xi2, LSL #1
        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4

        ;// xi0, xi1 now free
        ;// IStage 4,3 rows 0to1 x 1/2
        LDRD    xi0, [pDest]            ;// j0, j4 scaled
        SSUB16  xh2, xh2, xi3
        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows

        SHADD16 xh0, xi0, xi1
        SHSUB16 xh1, xi0, xi1

        ;// IStage 2 rows 0to3 x 1/2
        SHSUB16 xg2, xh1, xh2
        SHADD16 xg1, xh1, xh2
        SHSUB16 xg3, xh0, xh3
        SHADD16 xg0, xh0, xh3

        ;// IStage 1 all rows
        SADD16  xf3, xg3, xg4
        SSUB16  xf4, xg3, xg4
        SADD16  xf2, xg2, xg5
        SSUB16  xf5, xg2, xg5
        SADD16  xf1, xg1, xg6
        SSUB16  xf6, xg1, xg6
        SADD16  xf0, xg0, xg7
        SSUB16  xf7, xg0, xg7

        ;// Transpose, store and loop
        PKHBT   ra01, xf0, xf1, LSL #16
        PKHTB   rb01, xf1, xf0, ASR #16

        PKHBT   ra23, xf2, xf3, LSL #16
        PKHTB   rb23, xf3, xf2, ASR #16

        PKHBT   ra45, xf4, xf5, LSL #16
        PKHTB   rb45, xf5, xf4, ASR #16

        PKHBT   ra67, xf6, xf7, LSL #16
        STMIA   pDest!, {ra01, ra23, ra45, ra67}
        PKHTB   rb67, xf7, xf6, ASR #16
        STMIA   pDest!, {rb01, rb23, rb45, rb67}
        BCC     v6_idct_col$_F

        SUB     pSrc, pDest, #(64*2)
        M_LDR   pDest, ppDest
        IF "$stride"="s"
            M_LDR   pScale, pStride
        ENDIF
        B       v6_idct_row$_F
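        ;// (When the column loop above completes, all eight columns have
        ;// been processed two at a time into the temporary 8x8 buffer pBlk;
        ;// pSrc is rewound to the start of that buffer and control passes
        ;// to the row pass, v6_idct_row, which writes the final output to
        ;// pDest.)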

v6OddZero$_F
        SSUB16  xi2, xi6, xi7           ;// (j2-j6)
        SHADD16 xi3, xi6, xi7           ;// (j2+j6)/2

        SMULBB  xi0, xi2, LoopRR2
        SMULTB  xi2, xi2, LoopRR2

        MOV     xi2, xi2, LSL #1
        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
        SSUB16  xh2, xh2, xi3

        ;// xi0, xi1 now free
        ;// IStage 4,3 rows 0to1 x 1/2

        SHADD16 xh0, xi4, xi5
        SHSUB16 xh1, xi4, xi5

        ;// IStage 2 rows 0to3 x 1/2
        SHSUB16 xg2, xh1, xh2
        SHADD16 xg1, xh1, xh2
        SHSUB16 xg3, xh0, xh3
        SHADD16 xg0, xh0, xh3

        ;// IStage 1 all rows
        MOV  xf3, xg3
        MOV  xf4, xg3
        MOV  xf2, xg2
        MOV  xf5, xg2
        MOV  xf1, xg1
        MOV  xf6, xg1
        MOV  xf0, xg0
        MOV  xf7, xg0

        ;// Transpose
        PKHBT   ra01, xf0, xf1, LSL #16
        PKHTB   rb01, xf1, xf0, ASR #16

        PKHBT   ra23, xf2, xf3, LSL #16
        PKHTB   rb23, xf3, xf2, ASR #16

        PKHBT   ra45, xf4, xf5, LSL #16
        PKHTB   rb45, xf5, xf4, ASR #16

        PKHBT   ra67, xf6, xf7, LSL #16
        PKHTB   rb67, xf7, xf6, ASR #16

        STMIA   pDest!, {ra01, ra23, ra45, ra67}
        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
        STMIA   pDest!, {rb01, rb23, rb45, rb67}

        BCC     v6_idct_col$_F
        SUB     pSrc, pDest, #(64*2)
        M_LDR   pDest, ppDest
        IF "$stride"="s"
            M_LDR   pScale, pStride
        ENDIF


v6_idct_row$_F
        ;// IStage 4,3, rows4to7 x1/4
        LDR     xit, =0x00010001        ;// rounding constant
        LDR     xi0, [pSrc, #1*16]      ;// j1
        LDR     xi1, [pSrc, #7*16]      ;// 4*j7
        LDR     xi2, [pSrc, #5*16]      ;// j5
        LDR     xi3, [pSrc, #3*16]      ;// j3

        SHADD16 xi1, xi1, xit           ;// 2*j7
        SHADD16 xi1, xi1, xit           ;// j7

        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
        SSUB16  xi6, xi0, xi1           ;// j1-j7
        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
        SSUB16  xi4, xi2, xi3           ;// j5-j3

        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2

        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b

        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]

        SMULBB  xi1, xi3, LoopRR2
        SMULTB  xi3, xi3, LoopRR2

        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4

        MOV     xi3, xi3, LSL #1
        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4

        ;// xi0,xi1,xi2,xi3 now free
        ;// IStage 4,3, rows 2to3 x1/2

        LDR     xi0, [pSrc, #2*16]      ;// j2
        LDR     xi1, [pSrc, #6*16]      ;// 2*j6

        ;// IStage 2, rows4to7
        SSUB16  xg6, xh6, xh7
        SSUB16  xg5, xh5, xg6
        SSUB16  xg4, xh4, xg5

        SHADD16 xi1, xi1, xit           ;// j6
        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2

        SMULBB  xi0, xi2, LoopRR2
        SMULTB  xi2, xi2, LoopRR2

        MOV     xi2, xi2, LSL #1

        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4

        ;// xi0, xi1 now free
        ;// IStage 4,3 rows 0to1 x 1/2
        LDR     xi1, [pSrc, #4*16]      ;// j4
        LDR     xi0, [pSrc], #4         ;// j0

        SSUB16  xh2, xh2, xi3
        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows

        ADD     xi0, xi0, xit, LSL #2   ;// ensure correct round
        SHADD16 xh0, xi0, xi1           ;// of DC result
        SHSUB16 xh1, xi0, xi1

        ;// IStage 2 rows 0to3 x 1/2
        SHSUB16 xg2, xh1, xh2
        SHADD16 xg1, xh1, xh2
        SHSUB16 xg3, xh0, xh3
        SHADD16 xg0, xh0, xh3

        ;// IStage 1 all rows
        SHADD16 xf3, xg3, xg4
        SHSUB16 xf4, xg3, xg4
        SHADD16 xf2, xg2, xg5
        SHSUB16 xf5, xg2, xg5
        SHADD16 xf1, xg1, xg6
        SHSUB16 xf6, xg1, xg6
        SHADD16 xf0, xg0, xg7
        SHSUB16 xf7, xg0, xg7

        ;// Saturate
        IF ("$outsize"="u8")
            USAT16  xf0, #8, xf0
            USAT16  xf1, #8, xf1
            USAT16  xf2, #8, xf2
            USAT16  xf3, #8, xf3
            USAT16  xf4, #8, xf4
            USAT16  xf5, #8, xf5
            USAT16  xf6, #8, xf6
            USAT16  xf7, #8, xf7
        ENDIF
        IF ("$outsize"="s9")
            SSAT16  xf0, #9, xf0
            SSAT16  xf1, #9, xf1
            SSAT16  xf2, #9, xf2
            SSAT16  xf3, #9, xf3
            SSAT16  xf4, #9, xf4
            SSAT16  xf5, #9, xf5
            SSAT16  xf6, #9, xf6
            SSAT16  xf7, #9, xf7
        ENDIF
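        ;// (USAT16/SSAT16 saturate each halfword of the packed result words
        ;// independently: the "u8" case clamps to [0,+255] and the "s9" case
        ;// to [-256,+255]; the "s16" case is stored unsaturated.)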

        ;// Transpose to Row, Pack and store
        IF ("$outsize"="u8")
            ORR     xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
            ORR     xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
            ORR     xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
            ORR     xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
            PKHBT   ra01, xf0, xf2, LSL #16
            PKHTB   rb01, xf2, xf0, ASR #16
            PKHBT   ra23, xf4, xf6, LSL #16
            PKHTB   rb23, xf6, xf4, ASR #16
            STMIA   pDest, {ra01, ra23}
            IF "$stride"="s"
                ADD     pDest, pDest, pScale
                STMIA   pDest, {rb01, rb23}
                ADD     pDest, pDest, pScale
            ELSE
                ADD     pDest, pDest, #($stride)
                STMIA   pDest, {rb01, rb23}
                ADD     pDest, pDest, #($stride)
            ENDIF
        ENDIF
        IF ("$outsize"="s9"):LOR:("$outsize"="s16")
            PKHBT   ra01, xf0, xf1, LSL #16
            PKHTB   rb01, xf1, xf0, ASR #16

            PKHBT   ra23, xf2, xf3, LSL #16
            PKHTB   rb23, xf3, xf2, ASR #16

            PKHBT   ra45, xf4, xf5, LSL #16
            PKHTB   rb45, xf5, xf4, ASR #16

            PKHBT   ra67, xf6, xf7, LSL #16
            PKHTB   rb67, xf7, xf6, ASR #16

            STMIA   pDest, {ra01, ra23, ra45, ra67}
            IF "$stride"="s"
                ADD     pDest, pDest, pScale
                STMIA   pDest, {rb01, rb23, rb45, rb67}
                ADD     pDest, pDest, pScale
            ELSE
                ADD     pDest, pDest, #($stride)
                STMIA   pDest, {rb01, rb23, rb45, rb67}
                ADD     pDest, pDest, #($stride)
            ENDIF
        ENDIF

        BCC     v6_idct_row$_F
        ENDIF ;// ARM1136JS


        IF CortexA8

Src0            EQU  7
Src1            EQU  8
Src2            EQU  9
Src3            EQU  10
Src4            EQU  11
Src5            EQU  12
Src6            EQU  13
Src7            EQU  14
Tmp             EQU  15

qXj0            QN Src0.S16
qXj1            QN Src1.S16
qXj2            QN Src2.S16
qXj3            QN Src3.S16
qXj4            QN Src4.S16
qXj5            QN Src5.S16
qXj6            QN Src6.S16
qXj7            QN Src7.S16
qXjt            QN Tmp.S16

dXj0lo          DN (Src0*2).S16
dXj0hi          DN (Src0*2+1).S16
dXj1lo          DN (Src1*2).S16
dXj1hi          DN (Src1*2+1).S16
dXj2lo          DN (Src2*2).S16
dXj2hi          DN (Src2*2+1).S16
dXj3lo          DN (Src3*2).S16
dXj3hi          DN (Src3*2+1).S16
dXj4lo          DN (Src4*2).S16
dXj4hi          DN (Src4*2+1).S16
dXj5lo          DN (Src5*2).S16
dXj5hi          DN (Src5*2+1).S16
dXj6lo          DN (Src6*2).S16
dXj6hi          DN (Src6*2+1).S16
dXj7lo          DN (Src7*2).S16
dXj7hi          DN (Src7*2+1).S16
dXjtlo          DN (Tmp*2).S16
dXjthi          DN (Tmp*2+1).S16
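;// Note: a NEON quad register Qn aliases the doubleword registers D(2n)
;// and D(2n+1), so the (SrcN*2) and (SrcN*2+1) expressions in the DN
;// definitions here simply name the low and high halves of each Q register
;// declared above.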

qXi0            QN qXj0
qXi1            QN qXj4
qXi2            QN qXj2
qXi3            QN qXj7
qXi4            QN qXj5
qXi5            QN qXjt
qXi6            QN qXj1
qXi7            QN qXj6
qXit            QN qXj3

dXi0lo          DN dXj0lo
dXi0hi          DN dXj0hi
dXi1lo          DN dXj4lo
dXi1hi          DN dXj4hi
dXi2lo          DN dXj2lo
dXi2hi          DN dXj2hi
dXi3lo          DN dXj7lo
dXi3hi          DN dXj7hi
dXi4lo          DN dXj5lo
dXi4hi          DN dXj5hi
dXi5lo          DN dXjtlo
dXi5hi          DN dXjthi
dXi6lo          DN dXj1lo
dXi6hi          DN dXj1hi
dXi7lo          DN dXj6lo
dXi7hi          DN dXj6hi
dXitlo          DN dXj3lo
dXithi          DN dXj3hi

qXh0            QN qXit
qXh1            QN qXi0
qXh2            QN qXi2
qXh3            QN qXi3
qXh4            QN qXi7
qXh5            QN qXi5
qXh6            QN qXi4
qXh7            QN qXi1
qXht            QN qXi6

dXh0lo          DN dXitlo
dXh0hi          DN dXithi
dXh1lo          DN dXi0lo
dXh1hi          DN dXi0hi
dXh2lo          DN dXi2lo
dXh2hi          DN dXi2hi
dXh3lo          DN dXi3lo
dXh3hi          DN dXi3hi
dXh4lo          DN dXi7lo
dXh4hi          DN dXi7hi
dXh5lo          DN dXi5lo
dXh5hi          DN dXi5hi
dXh6lo          DN dXi4lo
dXh6hi          DN dXi4hi
dXh7lo          DN dXi1lo
dXh7hi          DN dXi1hi
dXhtlo          DN dXi6lo
dXhthi          DN dXi6hi

qXg0            QN qXh2
qXg1            QN qXht
qXg2            QN qXh1
qXg3            QN qXh0
qXg4            QN qXh4
qXg5            QN qXh5
qXg6            QN qXh6
qXg7            QN qXh7
qXgt            QN qXh3

qXf0            QN qXg6
qXf1            QN qXg5
qXf2            QN qXg4
qXf3            QN qXgt
qXf4            QN qXg3
qXf5            QN qXg2
qXf6            QN qXg1
qXf7            QN qXg0
qXft            QN qXg7


qXt0            QN 1.S32
qXt1            QN 2.S32
qT0lo           QN 1.S32
qT0hi           QN 2.S32
qT1lo           QN 3.S32
qT1hi           QN 4.S32
qScalelo        QN 5.S32        ;// used to read post scale values
qScalehi        QN 6.S32
qTemp0          QN 5.S32
qTemp1          QN 6.S32


Scale1          EQU 6
Scale2          EQU 15
qScale1         QN Scale1.S16
qScale2         QN Scale2.S16
dScale1lo       DN (Scale1*2).S16
dScale1hi       DN (Scale1*2+1).S16
dScale2lo       DN (Scale2*2).S16
dScale2hi       DN (Scale2*2+1).S16

dCoefs          DN 0.S16        ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
InvSqrt2        DN dCoefs[0]    ;// 1/sqrt(2) in Q15
S               DN dCoefs[1]    ;// Sin(PI/8) in Q15
C               DN dCoefs[2]    ;// Cos(PI/8) in Q15

pTemp           RN 12


        IMPORT  armCOMM_IDCTCoef

        VLD1        {qXj0,qXj1}, [pSrc @64]!
        VLD1        {qXj2,qXj3}, [pSrc @64]!
        VLD1        {qXj4,qXj5}, [pSrc @64]!
        VLD1        {qXj6,qXj7}, [pSrc @64]!

        ;// Load PreScale and multiply with Src
        ;// IStage 4

        IF "$inscale"="s16"                         ;// 16X16 Mul
            M_IDCT_PRESCALE16
        ENDIF

        IF "$inscale"="s32"                         ;// 32X32 Mul
            M_IDCT_PRESCALE32
        ENDIF

        ;// IStage 3
        VQDMULH     qXi2, qXi2, InvSqrt2            ;// i2/sqrt(2)
        VHADD       qXh0, qXi0, qXi1                ;// (i0+i1)/2
        VHSUB       qXh1, qXi0, qXi1                ;// (i0-i1)/2
        VHADD       qXh7, qXi5, qXi7                ;// (i5+i7)/4
        VSUB        qXh5, qXi5, qXi7                ;// (i5-i7)/2
        VQDMULH     qXh5, qXh5, InvSqrt2            ;// h5/sqrt(2)
        VSUB        qXh2, qXi2, qXi3                ;// h2, h3

        VMULL       qXt0, dXi4lo, C                 ;// c*i4
        VMLAL       qXt0, dXi6lo, S                 ;// c*i4+s*i6
        VMULL       qXt1, dXi4hi, C
        VMLAL       qXt1, dXi6hi, S
        VSHRN       dXh4lo, qXt0, #16               ;// h4
        VSHRN       dXh4hi, qXt1, #16

        VMULL       qXt0, dXi6lo, C                 ;// c*i6
        VMLSL       qXt0, dXi4lo, S                 ;// -s*i4 + c*i6
        VMULL       qXt1, dXi6hi, C
        VMLSL       qXt1, dXi4hi, S
        VSHRN       dXh6lo, qXt0, #16               ;// h6
        VSHRN       dXh6hi, qXt1, #16

        ;// IStage 2
        VSUB        qXg6, qXh6, qXh7
        VSUB        qXg5, qXh5, qXg6
        VSUB        qXg4, qXh4, qXg5
        VHADD       qXg1, qXh1, qXh2        ;// (h1+h2)/2
        VHSUB       qXg2, qXh1, qXh2        ;// (h1-h2)/2
        VHADD       qXg0, qXh0, qXh3        ;// (h0+h3)/2
        VHSUB       qXg3, qXh0, qXh3        ;// (h0-h3)/2

        ;// IStage 1 all rows
        VADD        qXf3, qXg3, qXg4
        VSUB        qXf4, qXg3, qXg4
        VADD        qXf2, qXg2, qXg5
        VSUB        qXf5, qXg2, qXg5
        VADD        qXf1, qXg1, qXg6
        VSUB        qXf6, qXg1, qXg6
        VADD        qXf0, qXg0, qXg7
        VSUB        qXf7, qXg0, qXg7

        ;// Transpose, store and loop
XTR0            EQU Src5
XTR1            EQU Tmp
XTR2            EQU Src6
XTR3            EQU Src7
XTR4            EQU Src3
XTR5            EQU Src0
XTR6            EQU Src1
XTR7            EQU Src2
XTRt            EQU Src4

qA0             QN  XTR0.S32  ;// for XTRpose
qA1             QN  XTR1.S32
qA2             QN  XTR2.S32
qA3             QN  XTR3.S32
qA4             QN  XTR4.S32
qA5             QN  XTR5.S32
qA6             QN  XTR6.S32
qA7             QN  XTR7.S32

dB0             DN  XTR0*2+1      ;// for using VSWP
dB1             DN  XTR1*2+1
dB2             DN  XTR2*2+1
dB3             DN  XTR3*2+1
dB4             DN  XTR4*2
dB5             DN  XTR5*2
dB6             DN  XTR6*2
dB7             DN  XTR7*2


        VTRN        qXf0, qXf1
        VTRN        qXf2, qXf3
        VTRN        qXf4, qXf5
        VTRN        qXf6, qXf7
        VTRN        qA0, qA2
        VTRN        qA1, qA3
        VTRN        qA4, qA6
        VTRN        qA5, qA7
        VSWP        dB0, dB4
        VSWP        dB1, dB5
        VSWP        dB2, dB6
        VSWP        dB3, dB7
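        ;// (The VTRN/VSWP sequence above is the usual NEON idiom for an
        ;// in-register 8x8 transpose of 16-bit data: the .S16 VTRNs swap
        ;// 16-bit lanes between row pairs, the .S32 VTRNs swap 32-bit lanes
        ;// between row quads, and the VSWPs exchange the remaining
        ;// D-register halves.)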


qYj0            QN qXf0
qYj1            QN qXf1
qYj2            QN qXf2
qYj3            QN qXf3
qYj4            QN qXf4
qYj5            QN qXf5
qYj6            QN qXf6
qYj7            QN qXf7
qYjt            QN qXft

dYj0lo          DN (XTR0*2).S16
dYj0hi          DN (XTR0*2+1).S16
dYj1lo          DN (XTR1*2).S16
dYj1hi          DN (XTR1*2+1).S16
dYj2lo          DN (XTR2*2).S16
dYj2hi          DN (XTR2*2+1).S16
dYj3lo          DN (XTR3*2).S16
dYj3hi          DN (XTR3*2+1).S16
dYj4lo          DN (XTR4*2).S16
dYj4hi          DN (XTR4*2+1).S16
dYj5lo          DN (XTR5*2).S16
dYj5hi          DN (XTR5*2+1).S16
dYj6lo          DN (XTR6*2).S16
dYj6hi          DN (XTR6*2+1).S16
dYj7lo          DN (XTR7*2).S16
dYj7hi          DN (XTR7*2+1).S16
dYjtlo          DN (XTRt*2).S16
dYjthi          DN (XTRt*2+1).S16

qYi0            QN qYj0
qYi1            QN qYj4
qYi2            QN qYj2
qYi3            QN qYj7
qYi4            QN qYj5
qYi5            QN qYjt
qYi6            QN qYj1
qYi7            QN qYj6
qYit            QN qYj3

dYi0lo          DN dYj0lo
dYi0hi          DN dYj0hi
dYi1lo          DN dYj4lo
dYi1hi          DN dYj4hi
dYi2lo          DN dYj2lo
dYi2hi          DN dYj2hi
dYi3lo          DN dYj7lo
dYi3hi          DN dYj7hi
dYi4lo          DN dYj5lo
dYi4hi          DN dYj5hi
dYi5lo          DN dYjtlo
dYi5hi          DN dYjthi
dYi6lo          DN dYj1lo
dYi6hi          DN dYj1hi
dYi7lo          DN dYj6lo
dYi7hi          DN dYj6hi
dYitlo          DN dYj3lo
dYithi          DN dYj3hi

qYh0            QN qYit
qYh1            QN qYi0
qYh2            QN qYi2
qYh3            QN qYi3
qYh4            QN qYi7
qYh5            QN qYi5
qYh6            QN qYi4
qYh7            QN qYi1
qYht            QN qYi6

dYh0lo          DN dYitlo
dYh0hi          DN dYithi
dYh1lo          DN dYi0lo
dYh1hi          DN dYi0hi
dYh2lo          DN dYi2lo
dYh2hi          DN dYi2hi
dYh3lo          DN dYi3lo
dYh3hi          DN dYi3hi
dYh4lo          DN dYi7lo
dYh4hi          DN dYi7hi
dYh5lo          DN dYi5lo
dYh5hi          DN dYi5hi
dYh6lo          DN dYi4lo
dYh6hi          DN dYi4hi
dYh7lo          DN dYi1lo
dYh7hi          DN dYi1hi
dYhtlo          DN dYi6lo
dYhthi          DN dYi6hi

qYg0            QN qYh2
qYg1            QN qYht
qYg2            QN qYh1
qYg3            QN qYh0
qYg4            QN qYh4
qYg5            QN qYh5
qYg6            QN qYh6
qYg7            QN qYh7
qYgt            QN qYh3

qYf0            QN qYg6
qYf1            QN qYg5
qYf2            QN qYg4
qYf3            QN qYgt
qYf4            QN qYg3
qYf5            QN qYg2
qYf6            QN qYg1
qYf7            QN qYg0
qYft            QN qYg7

        VRSHR       qYj7, qYj7, #2
        VRSHR       qYj6, qYj6, #1

        VHADD       qYi5, qYj1, qYj7        ;// i5 = (j1+j7)/2
        VSUB        qYi6, qYj1, qYj7        ;// i6 = j1-j7
        VHADD       qYi3, qYj2, qYj6        ;// i3 = (j2+j6)/2
        VSUB        qYi2, qYj2, qYj6        ;// i2 = j2-j6
        VHADD       qYi7, qYj5, qYj3        ;// i7 = (j5+j3)/2
        VSUB        qYi4, qYj5, qYj3        ;// i4 = j5-j3

        VQDMULH     qYi2, qYi2, InvSqrt2    ;// i2/sqrt(2)
        ;// IStage 4,3 rows 0to1 x 1/2

        MOV         pTemp, #0x4             ;// ensure correct round
        VDUP        qScale1, pTemp           ;// of DC result
        VADD        qYi0, qYi0, qScale1
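        ;// (Adding 4 to i0 biases the three halving stages that follow so
        ;// that the DC path rounds to nearest instead of truncating; this
        ;// mirrors the "ensure correct round of DC result" fix-up in the
        ;// ARM1136JS row pass above.)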

        VHADD       qYh0, qYi0, qYi1        ;// (i0+i1)/2
        VHSUB       qYh1, qYi0, qYi1        ;// (i0-i1)/2

        VHADD       qYh7, qYi5, qYi7        ;// (i5+i7)/4
        VSUB        qYh5, qYi5, qYi7        ;// (i5-i7)/2
        VSUB        qYh2, qYi2, qYi3        ;// h2, h3
        VQDMULH     qYh5, qYh5, InvSqrt2    ;// h5/sqrt(2)

        VMULL       qXt0, dYi4lo, C         ;// c*i4
        VMLAL       qXt0, dYi6lo, S         ;// c*i4+s*i6
        VMULL       qXt1, dYi4hi, C
        VMLAL       qXt1, dYi6hi, S
        VSHRN       dYh4lo, qXt0, #16       ;// h4
        VSHRN       dYh4hi, qXt1, #16

        VMULL       qXt0, dYi6lo, C         ;// c*i6
        VMLSL       qXt0, dYi4lo, S         ;// -s*i4 + c*i6
        VMULL       qXt1, dYi6hi, C
        VMLSL       qXt1, dYi4hi, S
        VSHRN       dYh6lo, qXt0, #16       ;// h6
        VSHRN       dYh6hi, qXt1, #16

        VSUB        qYg6, qYh6, qYh7
        VSUB        qYg5, qYh5, qYg6
        VSUB        qYg4, qYh4, qYg5

        ;// IStage 2 rows 0to3 x 1/2
        VHADD       qYg1, qYh1, qYh2        ;// (h1+h2)/2
        VHSUB       qYg2, qYh1, qYh2        ;// (h1-h2)/2
        VHADD       qYg0, qYh0, qYh3        ;// (h0+h3)/2
        VHSUB       qYg3, qYh0, qYh3        ;// (h0-h3)/2


        ;// IStage 1 all rows
        VHADD        qYf3, qYg3, qYg4
        VHSUB        qYf4, qYg3, qYg4
        VHADD        qYf2, qYg2, qYg5
        VHSUB        qYf5, qYg2, qYg5
        VHADD        qYf1, qYg1, qYg6
        VHSUB        qYf6, qYg1, qYg6
        VHADD        qYf0, qYg0, qYg7
        VHSUB        qYf7, qYg0, qYg7

YTR0            EQU Src0
YTR1            EQU Src4
YTR2            EQU Src1
YTR3            EQU Src2
YTR4            EQU Src7
YTR5            EQU Src5
YTR6            EQU Tmp
YTR7            EQU Src6
YTRt            EQU Src3

qC0             QN  YTR0.S32                ;// for YTRpose
qC1             QN  YTR1.S32
qC2             QN  YTR2.S32
qC3             QN  YTR3.S32
qC4             QN  YTR4.S32
qC5             QN  YTR5.S32
qC6             QN  YTR6.S32
qC7             QN  YTR7.S32

dD0             DN  YTR0*2+1                ;// for using VSWP
dD1             DN  YTR1*2+1
dD2             DN  YTR2*2+1
dD3             DN  YTR3*2+1
dD4             DN  YTR4*2
dD5             DN  YTR5*2
dD6             DN  YTR6*2
dD7             DN  YTR7*2

        VTRN        qYf0, qYf1
        VTRN        qYf2, qYf3
        VTRN        qYf4, qYf5
        VTRN        qYf6, qYf7
        VTRN        qC0, qC2
        VTRN        qC1, qC3
        VTRN        qC4, qC6
        VTRN        qC5, qC7
        VSWP        dD0, dD4
        VSWP        dD1, dD5
        VSWP        dD2, dD6
        VSWP        dD3, dD7


dYf0U8          DN YTR0*2.U8
dYf1U8          DN YTR1*2.U8
dYf2U8          DN YTR2*2.U8
dYf3U8          DN YTR3*2.U8
dYf4U8          DN YTR4*2.U8
dYf5U8          DN YTR5*2.U8
dYf6U8          DN YTR6*2.U8
dYf7U8          DN YTR7*2.U8

        ;//
        ;// Do saturation if outsize is other than S16
        ;//

        IF ("$outsize"="u8")
            ;// Output range [0-255]
            VQMOVN            dYf0U8, qYf0
            VQMOVN            dYf1U8, qYf1
            VQMOVN            dYf2U8, qYf2
            VQMOVN            dYf3U8, qYf3
            VQMOVN            dYf4U8, qYf4
            VQMOVN            dYf5U8, qYf5
            VQMOVN            dYf6U8, qYf6
            VQMOVN            dYf7U8, qYf7
        ENDIF

        IF ("$outsize"="s9")
            ;// Output range [-256 to +255]
            VQSHL            qYf0, qYf0, #16-9
            VQSHL            qYf1, qYf1, #16-9
            VQSHL            qYf2, qYf2, #16-9
            VQSHL            qYf3, qYf3, #16-9
            VQSHL            qYf4, qYf4, #16-9
            VQSHL            qYf5, qYf5, #16-9
            VQSHL            qYf6, qYf6, #16-9
            VQSHL            qYf7, qYf7, #16-9

            VSHR             qYf0, qYf0, #16-9
            VSHR             qYf1, qYf1, #16-9
            VSHR             qYf2, qYf2, #16-9
            VSHR             qYf3, qYf3, #16-9
            VSHR             qYf4, qYf4, #16-9
            VSHR             qYf5, qYf5, #16-9
            VSHR             qYf6, qYf6, #16-9
            VSHR             qYf7, qYf7, #16-9
        ENDIF

        ;// Store output depending on the Stride size
        IF "$stride"="s"
            VST1        qYf0, [pDest @64], Stride
            VST1        qYf1, [pDest @64], Stride
            VST1        qYf2, [pDest @64], Stride
            VST1        qYf3, [pDest @64], Stride
            VST1        qYf4, [pDest @64], Stride
            VST1        qYf5, [pDest @64], Stride
            VST1        qYf6, [pDest @64], Stride
            VST1        qYf7, [pDest @64]
        ELSE
            IF ("$outsize"="u8")
                VST1        dYf0U8, [pDest @64], #8
                VST1        dYf1U8, [pDest @64], #8
                VST1        dYf2U8, [pDest @64], #8
                VST1        dYf3U8, [pDest @64], #8
                VST1        dYf4U8, [pDest @64], #8
                VST1        dYf5U8, [pDest @64], #8
                VST1        dYf6U8, [pDest @64], #8
                VST1        dYf7U8, [pDest @64]
            ELSE
                ;// ("$outsize"="s9") or ("$outsize"="s16")
                VST1        qYf0, [pDest @64], #16
                VST1        qYf1, [pDest @64], #16
                VST1        qYf2, [pDest @64], #16
                VST1        qYf3, [pDest @64], #16
                VST1        qYf4, [pDest @64], #16
                VST1        qYf5, [pDest @64], #16
                VST1        qYf6, [pDest @64], #16
                VST1        qYf7, [pDest @64]
            ENDIF

        ENDIF



        ENDIF ;// CortexA8



        MEND

        ;// Scale TWO input rows with TWO rows of 16 bit scale values
        ;//
        ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
        ;// input (eight values per row) with two rows of scale values. It also
        ;// loads the next scale values from pScale if the $LastRow flag is not set.
        ;//
        ;// Input Registers:
        ;//
        ;// $dAlo           - Input D register with first four S16 values of row n
        ;// $dAhi           - Input D register with next four S16 values of row n
        ;// $dBlo           - Input D register with first four S16 values of row n+1
        ;// $dBhi           - Input D register with next four S16 values of row n+1
        ;// pScale          - Pointer to next row of scale values
        ;// qT0lo           - Temporary scratch register
        ;// qT0hi           - Temporary scratch register
        ;// qT1lo           - Temporary scratch register
        ;// qT1hi           - Temporary scratch register
        ;// dScale1lo       - Scale value of row n
        ;// dScale1hi       - Scale value of row n
        ;// dScale2lo       - Scale value of row n+1
        ;// dScale2hi       - Scale value of row n+1
        ;//
        ;// Input Flag
        ;//
        ;// $LastRow        - Flag to indicate whether current row is last row
        ;//
        ;// Output Registers:
        ;//
        ;// $dAlo           - Scaled output values (first four S16 of row n)
        ;// $dAhi           - Scaled output values (next four S16 of row n)
        ;// $dBlo           - Scaled output values (first four S16 of row n+1)
        ;// $dBhi           - Scaled output values (next four S16 of row n+1)
        ;// qScale1         - Scale values for next row
        ;// qScale2         - Scale values for next row+1
        ;// pScale          - Pointer to next row of scale values
        ;//
        MACRO
        M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
        VMULL       qT0lo, $dAlo, dScale1lo
        VMULL       qT0hi, $dAhi, dScale1hi
        VMULL       qT1lo, $dBlo, dScale2lo
        VMULL       qT1hi, $dBhi, dScale2hi
        IF "$LastRow"="0"
            VLD1        qScale1, [pScale], #16  ;// Load scale for row n+1
            VLD1        qScale2, [pScale], #16  ;// Load scale for row n+2
        ENDIF
        VQRSHRN       $dAlo, qT0lo, #12
        VQRSHRN       $dAhi, qT0hi, #12
        VQRSHRN       $dBlo, qT1lo, #12
        VQRSHRN       $dBhi, qT1hi, #12
        MEND
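        ;// Per lane, the arithmetic above is roughly the following C model
        ;// (illustrative sketch only; scale16() is a hypothetical helper):
        ;//
        ;//   int16_t scale16(int16_t data, int16_t scale)
        ;//   {
        ;//       int32_t p = (int32_t)data * scale;      /* VMULL          */
        ;//       p = (p + (1 << 11)) >> 12;              /* VQRSHRN #12:   */
        ;//       if (p >  32767) p =  32767;             /* rounding shift */
        ;//       if (p < -32768) p = -32768;             /* with signed    */
        ;//       return (int16_t)p;                      /* saturation     */
        ;//   }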

        ;// Scale 8x8 block input values with 16 bit scale values
        ;//
        ;// This macro is used to pre-scale a block of 8x8 input.
        ;// It also performs the first-stage transformations of the IDCT.
        ;//
        ;// Input Registers:
        ;//
        ;// dXjnlo          - n th input D register with first four S16 values
        ;// dXjnhi          - n th input D register with next four S16 values
        ;// qXjn            - n th input Q register with eight S16 values
        ;// pScale          - Pointer to scale values
        ;//
        ;// Output Registers:
        ;//
        ;// qXin            - n th output Q register with eight S16 output values of 1st stage
        ;//
        MACRO
        M_IDCT_PRESCALE16
        VLD1        qScale1, [pScale], #16      ;// Load Pre scale for row 0
        VLD1        qScale2, [pScale], #16      ;// Load Pre scale for row 0
        M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0        ;// Pre scale row 0 & 1
        M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0
        M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0
        M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1
        VHADD       qXi5, qXj1, qXj7            ;// (j1+j7)/2
        VSUB        qXi6, qXj1, qXj7            ;// j1-j7
        LDR         pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants
        VHADD       qXi3, qXj2, qXj6            ;// (j2+j6)/2
        VSUB        qXi2, qXj2, qXj6            ;// j2-j6
        VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
        VHADD       qXi7, qXj5, qXj3            ;// (j5+j3)/2
        VSUB        qXi4, qXj5, qXj3            ;// j5-j3
        MEND


        ;// Scale 8x8 block input values with 32 bit scale values
        ;//
        ;// This macro is used to pre-scale a block of 8x8 input.
        ;// It also performs the first-stage transformations of the IDCT.
        ;//
        ;// Input Registers:
        ;//
        ;// dXjnlo          - n th input D register with first four S16 values
        ;// dXjnhi          - n th input D register with next four S16 values
        ;// qXjn            - n th input Q register with eight S16 values
        ;// pScale          - Pointer to 32bit scale values in Q23 format
        ;//
        ;// Output Registers:
        ;//
        ;// dXinlo          - n th output D register with first four S16 output values of 1st stage
        ;// dXinhi          - n th output D register with next four S16 output values of 1st stage
        ;//
        MACRO
        M_IDCT_PRESCALE32
qScale0lo       QN 0.S32
qScale0hi       QN 1.S32
qScale1lo       QN 2.S32
qScale1hi       QN 3.S32
qScale2lo       QN qScale1lo
qScale2hi       QN qScale1hi
qScale3lo       QN qScale1lo
qScale3hi       QN qScale1hi
qScale4lo       QN qScale1lo
qScale4hi       QN qScale1hi
qScale5lo       QN qScale0lo
qScale5hi       QN qScale0hi
qScale6lo       QN qScale0lo
qScale6hi       QN qScale0hi
qScale7lo       QN qScale0lo
qScale7hi       QN qScale0hi

qSrc0lo         QN 4.S32
qSrc0hi         QN 5.S32
qSrc1lo         QN 6.S32
qSrc1hi         QN Src4.S32
qSrc2lo         QN qSrc0lo
qSrc2hi         QN qSrc0hi
qSrc3lo         QN qSrc0lo
qSrc3hi         QN qSrc0hi
qSrc4lo         QN qSrc0lo
qSrc4hi         QN qSrc0hi
qSrc5lo         QN qSrc1lo
qSrc5hi         QN qSrc1hi
qSrc6lo         QN qSrc1lo
qSrc6hi         QN qSrc1hi
qSrc7lo         QN qSrc0lo
qSrc7hi         QN qSrc0hi

qRes17lo        QN qScale0lo
qRes17hi        QN qScale0hi
qRes26lo        QN qScale0lo
qRes26hi        QN qScale0hi
qRes53lo        QN qScale0lo
qRes53hi        QN qScale0hi

            ADD         pTemp, pScale, #4*8*7           ;// Address of  pScale[7]

            ;// Row 0
            VLD1        {qScale0lo, qScale0hi}, [pScale]!
            VSHLL       qSrc0lo, dXj0lo, #(12-1)
            VSHLL       qSrc0hi, dXj0hi, #(12-1)
            VLD1        {qScale1lo, qScale1hi}, [pScale]!
            VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo
            VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
            VLD1        {qScale7lo, qScale7hi}, [pTemp]!
            VSHLL       qSrc1lo, dXj1lo, #(12-1)
            VSHLL       qSrc1hi, dXj1hi, #(12-1)
            VMOVN       dXi0lo, qSrc0lo                 ;// Output i0
            VMOVN       dXi0hi, qSrc0hi
            VSHLL       qSrc7lo, dXj7lo, #(12-1)
            VSHLL       qSrc7hi, dXj7hi, #(12-1)
            SUB         pTemp, pTemp, #((16*2)+(4*8*1))
            VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo
            VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
            VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo
            VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
            VLD1        {qScale2lo, qScale2hi}, [pScale]!

            ;// Row 1 & 7
            VHADD       qRes17lo, qSrc1lo, qSrc7lo      ;// (j1+j7)/2
            VHADD       qRes17hi, qSrc1hi, qSrc7hi      ;// (j1+j7)/2
            VMOVN       dXi5lo, qRes17lo                ;// Output i5
            VMOVN       dXi5hi, qRes17hi
            VSUB        qRes17lo, qSrc1lo, qSrc7lo      ;// j1-j7
            VSUB        qRes17hi, qSrc1hi, qSrc7hi      ;// j1-j7
            VMOVN       dXi6lo, qRes17lo                ;// Output i6
            VMOVN       dXi6hi, qRes17hi
            VSHLL       qSrc2lo, dXj2lo, #(12-1)
            VSHLL       qSrc2hi, dXj2hi, #(12-1)
            VLD1        {qScale6lo, qScale6hi}, [pTemp]!
            VSHLL       qSrc6lo, dXj6lo, #(12-1)
            VSHLL       qSrc6hi, dXj6hi, #(12-1)
            SUB         pTemp, pTemp, #((16*2)+(4*8*1))
            VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo
            VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
            VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo
            VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
            VLD1        {qScale3lo, qScale3hi}, [pScale]!

            ;// Row 2 & 6
            VHADD       qRes26lo, qSrc2lo, qSrc6lo      ;// (j2+j6)/2
            VHADD       qRes26hi, qSrc2hi, qSrc6hi      ;// (j2+j6)/2
            VMOVN       dXi3lo, qRes26lo                ;// Output i3
            VMOVN       dXi3hi, qRes26hi
            VSUB        qRes26lo, qSrc2lo, qSrc6lo      ;// j2-j6
            VSUB        qRes26hi, qSrc2hi, qSrc6hi      ;// j2-j6
            VMOVN       dXi2lo, qRes26lo                ;// Output i2
            VMOVN       dXi2hi, qRes26hi
            VSHLL       qSrc3lo, dXj3lo, #(12-1)
            VSHLL       qSrc3hi, dXj3hi, #(12-1)
            VLD1        {qScale5lo, qScale5hi}, [pTemp]!
            VSHLL       qSrc5lo, dXj5lo, #(12-1)
            VSHLL       qSrc5hi, dXj5hi, #(12-1)
            VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo
            VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
            VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo
            VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi

            ;// Row 3 & 5
            VHADD       qRes53lo, qSrc5lo, qSrc3lo      ;// (j5+j3)/2
            VHADD       qRes53hi, qSrc5hi, qSrc3hi      ;// (j5+j3)/2
            SUB         pSrc, pSrc, #16*2*2
            VMOVN       dXi7lo, qRes53lo                ;// Output i7
            VMOVN       dXi7hi, qRes53hi
            VSUB        qRes53lo, qSrc5lo, qSrc3lo      ;// j5-j3
            VSUB        qRes53hi, qSrc5hi, qSrc3hi      ;// j5-j3
            VLD1        qXj4, [pSrc @64]
            VMOVN       dXi4lo, qRes53lo                ;// Output i4
            VMOVN       dXi4hi, qRes53hi
            VSHLL       qSrc4lo, dXj4lo, #(12-1)
            VSHLL       qSrc4hi, dXj4hi, #(12-1)
            VLD1        {qScale4lo, qScale4hi}, [pScale]
            LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants
            VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo
            VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
            VLDR        dCoefs, [pSrc]                  ;// Load DCT inverse AAN constants
            ;// Row 4
            VMOVN       dXi1lo, qSrc4lo                 ;// Output i1
            VMOVN       dXi1hi, qSrc4hi

        MEND

        END