1;
2; jidctfst.asm - fast integer IDCT (MMX)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on the x86 SIMD extension for IJG JPEG library
7; Copyright (C) 1999-2006, MIYASAKA Masaru.
8; For conditions of distribution and use, see copyright notice in jsimdext.inc
9;
10; This file should be assembled with NASM (Netwide Assembler),
11; can *not* be assembled with Microsoft's MASM or any compatible
12; assembler (including Borland's Turbo Assembler).
13; NASM is available from http://nasm.sourceforge.net/ or
14; http://sourceforge.net/project/showfiles.php?group_id=6208
15;
16; This file contains a fast, not so accurate integer implementation of
17; the inverse DCT (Discrete Cosine Transform). The following code is
18; based directly on the IJG's original jidctfst.c; see the jidctfst.c
19; for more details.
20;
21; [TAB8]
22
23%include "jsimdext.inc"
24%include "jdct.inc"
25
26; --------------------------------------------------------------------------
27
; Fixed-point parameters.
;   CONST_BITS : number of fraction bits carried by the F_x_xxx constants
;                below (e.g. F_1_414 = 362 ~= 1.414213562 * 2^8).
;   PASS1_BITS : must equal IFAST_SCALE_BITS (enforced by the %error check
;                below); it also sets the pass-2 descale shift, which is
;                (PASS1_BITS+3) bits.
28%define CONST_BITS      8       ; 14 is also OK.
29%define PASS1_BITS      2
30
31%if IFAST_SCALE_BITS != PASS1_BITS
32%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
33%endif
34
; FIX(x) denotes x scaled by 2^CONST_BITS and rounded to an integer.
; For the default CONST_BITS == 8 the values are written out directly.
35%if CONST_BITS == 8
36F_1_082 equ     277             ; FIX(1.082392200)
37F_1_414 equ     362             ; FIX(1.414213562)
38F_1_847 equ     473             ; FIX(1.847759065)
39F_2_613 equ     669             ; FIX(2.613125930)
40F_1_613 equ     (F_2_613 - 256) ; FIX(2.613125930) - FIX(1), FIX(1) = 1 << 8
41%else
42; NASM cannot do compile-time arithmetic on floating-point constants.
; The integer literals below are the same constants pre-scaled by 2^30;
; DESCALE(x,n) rounds to nearest and shifts right by n bits, which leaves
; CONST_BITS fraction bits when n = 30-CONST_BITS.
43%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
44F_1_082 equ     DESCALE(1162209775,30-CONST_BITS)       ; FIX(1.082392200)
45F_1_414 equ     DESCALE(1518500249,30-CONST_BITS)       ; FIX(1.414213562)
46F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
47F_2_613 equ     DESCALE(2805822602,30-CONST_BITS)       ; FIX(2.613125930)
48F_1_613 equ     (F_2_613 - (1 << CONST_BITS))   ; FIX(2.613125930) - FIX(1)
49%endif
50
51; --------------------------------------------------------------------------
52        SECTION SEG_CONST
53
54; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
55; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
;
; pmulhw keeps the high 16 bits of each signed 16x16-bit product.  The code
; shifts the data operand left by PRE_MULTIPLY_SCALE_BITS (psllw) and the
; constants here are shifted left by CONST_SHIFT, so that
;   ((x << PRE_MULTIPLY_SCALE_BITS) * (FIX(c) << CONST_SHIFT)) >> 16
;     == (x * FIX(c)) >> CONST_BITS
56
57%define PRE_MULTIPLY_SCALE_BITS   2
58%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
59
60        alignz  16
61        global  EXTN(jconst_idct_ifast_mmx)
62
; Exported constant table.  Each PW_* entry is one multiplier replicated into
; four 16-bit words so it can be used directly as a 64-bit pmulhw operand.
63EXTN(jconst_idct_ifast_mmx):
64
65PW_F1414        times 4 dw  F_1_414 << CONST_SHIFT
66PW_F1847        times 4 dw  F_1_847 << CONST_SHIFT
; The 2.613 multiplier is stored as -(2.613 - 1): with CONST_BITS == 8,
; F_2_613 << CONST_SHIFT would be 42816 and would not fit in a signed word.
; The code subtracts the remaining 1 * z10 separately.
67PW_MF1613       times 4 dw -F_1_613 << CONST_SHIFT
68PW_F1082        times 4 dw  F_1_082 << CONST_SHIFT
; Eight copies of CENTERJSAMPLE (defined outside this file), added with paddb
; to the packed signed-byte results at the end of pass 2.
69PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
70
71        alignz  16
72
73; --------------------------------------------------------------------------
74        SECTION SEG_TEXT
75        BITS    32
76;
77; Perform dequantization and inverse DCT on one block of coefficients.
78;
79; GLOBAL(void)
80; jsimd_idct_ifast_mmx (void *dct_table, JCOEFPTR coef_block,
81;                       JSAMPARRAY output_buf, JDIMENSION output_col)
82;
83
; Stack-argument accessors.  (b) is the frame base, i.e. the value of esp
; right after the prologue's "push ebp": [b+0] = caller's ebp,
; [b+4] = return address, and the four arguments follow from b+8.
84%define dct_table(b)    (b)+8           ; void *dct_table (dequantization multipliers)
85%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
86%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
87%define output_col(b)   (b)+20          ; JDIMENSION output_col
88
; Locals, addressed relative to the aligned ebp set up by the prologue:
;   [ebp+0]    saved frame base (see original_ebp)
;   wk(i)      WK_NUM mmword scratch slots directly below ebp
;   workspace  DCTSIZE2 JCOEFs directly below wk(0); output of pass 1
; %define expands its body at the point of use, so wk(i) may refer to WK_NUM
; even though WK_NUM is defined on a later line.
89%define original_ebp    ebp+0
90%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
91%define WK_NUM          2
92%define workspace       wk(0)-DCTSIZE2*SIZEOF_JCOEF
93                                        ; JCOEF workspace[DCTSIZE2]
94
95        align   16
96        global  EXTN(jsimd_idct_ifast_mmx)
97
; ---------------------------------------------------------------------------
; void jsimd_idct_ifast_mmx (void *dct_table, JCOEFPTR coef_block,
;                            JSAMPARRAY output_buf, JDIMENSION output_col)
;
; ABI:    32-bit, all four arguments on the stack (read via the
;         dct_table()/coef_block()/output_buf()/output_col() macros).
; Saves:  ebx, esi, edi (pushed/popped), ebp and esp (restored in epilogue).
; Clobb:  eax, ecx, edx, mm0-mm7, flags.  emms is executed before returning.
; Stack:  builds an SIZEOF_MMWORD-aligned frame holding wk[WK_NUM] and a
;         DCTSIZE2-entry JCOEF workspace.
;
; Pass 1 dequantizes the input and runs the 1-D IDCT on columns, four
; columns per iteration, writing transposed results into the workspace.
; Pass 2 runs the 1-D IDCT on the workspace, descales, saturates to bytes,
; adds CENTERJSAMPLE, and stores four 8-sample output rows per iteration.
; ---------------------------------------------------------------------------
98EXTN(jsimd_idct_ifast_mmx):
99        push    ebp
100        mov     eax,esp                         ; eax = frame base ([eax]=caller's ebp, args at eax+8..)
101        sub     esp, byte 4
102        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
103        mov     [esp],eax                       ; save frame base at [original_ebp]
104        mov     ebp,esp                         ; ebp = aligned ebp
105        lea     esp, [workspace]                ; reserve wk[] and workspace below ebp
106        push    ebx
107;       push    ecx             ; need not be preserved
108;       push    edx             ; need not be preserved
109        push    esi
110        push    edi
111
112        get_GOT ebx             ; get GOT address
113
114        ; ---- Pass 1: process columns from input, store into work array.
115
        ; eax is still expected to hold the frame base from the prologue, which
        ; is why the reload below is commented out.
        ; NOTE(review): this relies on get_GOT not clobbering eax -- confirm
        ; against the macro definition.
116;       mov     eax, [original_ebp]
117        mov     edx, POINTER [dct_table(eax)]           ; quantptr
118        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
119        lea     edi, [workspace]                        ; JCOEF *wsptr
120        mov     ecx, DCTSIZE/4                          ; ctr (4 columns per iteration)
121        alignx  16,7
122.columnloop:
123%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
        ; Shortcut: if rows 1..7 of these four columns are all zero, the IDCT
        ; of each column is just its dequantized DC term.
        ; Quick reject first: one dword from each of rows 1 and 2.
124        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
125        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
126        jnz     short .columnDCT
127
        ; Full test: OR together rows 1..7 for all four columns.
128        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
129        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
130        por     mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
131        por     mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
132        por     mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
133        por     mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
134        por     mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
135        por     mm1,mm0
136        packsswb mm1,mm1                ; saturating pack: nonzero words stay nonzero bytes
137        movd    eax,mm1                 ; eax = one byte per column
138        test    eax,eax
139        jnz     short .columnDCT
140
141        ; -- AC terms all zero
142
143        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
144        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]   ; dequantize DC terms
145
        ; Broadcast each column's DC value into a full quadword.
146        movq      mm2,mm0               ; mm0=in0=(00 01 02 03)
147        punpcklwd mm0,mm0               ; mm0=(00 00 01 01)
148        punpckhwd mm2,mm2               ; mm2=(02 02 03 03)
149
150        movq      mm1,mm0
151        punpckldq mm0,mm0               ; mm0=(00 00 00 00)
152        punpckhdq mm1,mm1               ; mm1=(01 01 01 01)
153        movq      mm3,mm2
154        punpckldq mm2,mm2               ; mm2=(02 02 02 02)
155        punpckhdq mm3,mm3               ; mm3=(03 03 03 03)
156
        ; Store each broadcast value to both quadwords of its workspace row.
157        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
158        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
159        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
160        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
161        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
162        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
163        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
164        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
165        jmp     near .nextcolumn
166        alignx  16,7
167%endif
168.columnDCT:
169
170        ; -- Even part
171
        ; Load and dequantize input rows 0, 2, 4, 6 (four columns each).
172        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
173        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
174        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
175        pmullw  mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
176        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
177        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
178        pmullw  mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
179        pmullw  mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
180
181        movq    mm4,mm0
182        movq    mm5,mm1
183        psubw   mm0,mm2                 ; mm0=tmp11
184        psubw   mm1,mm3
185        paddw   mm4,mm2                 ; mm4=tmp10
186        paddw   mm5,mm3                 ; mm5=tmp13
187
188        psllw   mm1,PRE_MULTIPLY_SCALE_BITS     ; pre-scale for pmulhw
189        pmulhw  mm1,[GOTOFF(ebx,PW_F1414)]
190        psubw   mm1,mm5                 ; mm1=tmp12
191
192        movq    mm6,mm4
193        movq    mm7,mm0
194        psubw   mm4,mm5                 ; mm4=tmp3
195        psubw   mm0,mm1                 ; mm0=tmp2
196        paddw   mm6,mm5                 ; mm6=tmp0
197        paddw   mm7,mm1                 ; mm7=tmp1
198
        ; Spill tmp2/tmp3: all eight MMX registers are needed by the odd part.
199        movq    MMWORD [wk(1)], mm4     ; wk(1)=tmp3
200        movq    MMWORD [wk(0)], mm0     ; wk(0)=tmp2
201
202        ; -- Odd part
203
        ; Load and dequantize input rows 1, 3, 5, 7.
204        movq    mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
205        movq    mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
206        pmullw  mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
207        pmullw  mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
208        movq    mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
209        movq    mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
210        pmullw  mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
211        pmullw  mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
212
213        movq    mm4,mm2
214        movq    mm0,mm5
215        psubw   mm2,mm1                 ; mm2=z12
216        psubw   mm5,mm3                 ; mm5=z10
217        paddw   mm4,mm1                 ; mm4=z11
218        paddw   mm0,mm3                 ; mm0=z13
219
220        movq    mm1,mm5                 ; mm1=z10(unscaled)
221        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
222        psllw   mm5,PRE_MULTIPLY_SCALE_BITS
223
224        movq    mm3,mm4
225        psubw   mm4,mm0
226        paddw   mm3,mm0                 ; mm3=tmp7
227
228        psllw   mm4,PRE_MULTIPLY_SCALE_BITS
229        pmulhw  mm4,[GOTOFF(ebx,PW_F1414)]      ; mm4=tmp11
230
231        ; To avoid overflow...
232        ;
233        ; (Original)
234        ; tmp12 = -2.613125930 * z10 + z5;
235        ;
236        ; (This implementation)
237        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
238        ;       = -1.613125930 * z10 - z10 + z5;
239
240        movq    mm0,mm5                 ; mm0=z10(scaled)
241        paddw   mm5,mm2                 ; mm5=z10+z12 (both scaled)
242        pmulhw  mm5,[GOTOFF(ebx,PW_F1847)]      ; mm5=z5
243        pmulhw  mm0,[GOTOFF(ebx,PW_MF1613)]     ; mm0=-1.613*z10
244        pmulhw  mm2,[GOTOFF(ebx,PW_F1082)]      ; mm2=1.082*z12
245        psubw   mm0,mm1                 ; mm0=-1.613*z10 - z10
246        psubw   mm2,mm5                 ; mm2=tmp10
247        paddw   mm0,mm5                 ; mm0=tmp12
248
249        ; -- Final output stage
250
251        psubw   mm0,mm3                 ; mm0=tmp6
252        movq    mm1,mm6
253        movq    mm5,mm7
254        paddw   mm6,mm3                 ; mm6=data0=(00 01 02 03)
255        paddw   mm7,mm0                 ; mm7=data1=(10 11 12 13)
256        psubw   mm1,mm3                 ; mm1=data7=(70 71 72 73)
257        psubw   mm5,mm0                 ; mm5=data6=(60 61 62 63)
258        psubw   mm4,mm0                 ; mm4=tmp5
259
260        movq      mm3,mm6               ; transpose coefficients(phase 1)
261        punpcklwd mm6,mm7               ; mm6=(00 10 01 11)
262        punpckhwd mm3,mm7               ; mm3=(02 12 03 13)
263        movq      mm0,mm5               ; transpose coefficients(phase 1)
264        punpcklwd mm5,mm1               ; mm5=(60 70 61 71)
265        punpckhwd mm0,mm1               ; mm0=(62 72 63 73)
266
        ; Swap: reload tmp2/tmp3 and reuse the wk slots for the partially
        ; transposed rows 6/7.
267        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp2
268        movq    mm1, MMWORD [wk(1)]     ; mm1=tmp3
269
270        movq    MMWORD [wk(0)], mm5     ; wk(0)=(60 70 61 71)
271        movq    MMWORD [wk(1)], mm0     ; wk(1)=(62 72 63 73)
272
273        paddw   mm2,mm4                 ; mm2=tmp4
274        movq    mm5,mm7
275        movq    mm0,mm1
276        paddw   mm7,mm4                 ; mm7=data2=(20 21 22 23)
277        paddw   mm1,mm2                 ; mm1=data4=(40 41 42 43)
278        psubw   mm5,mm4                 ; mm5=data5=(50 51 52 53)
279        psubw   mm0,mm2                 ; mm0=data3=(30 31 32 33)
280
281        movq      mm4,mm7               ; transpose coefficients(phase 1)
282        punpcklwd mm7,mm0               ; mm7=(20 30 21 31)
283        punpckhwd mm4,mm0               ; mm4=(22 32 23 33)
284        movq      mm2,mm1               ; transpose coefficients(phase 1)
285        punpcklwd mm1,mm5               ; mm1=(40 50 41 51)
286        punpckhwd mm2,mm5               ; mm2=(42 52 43 53)
287
288        movq      mm0,mm6               ; transpose coefficients(phase 2)
289        punpckldq mm6,mm7               ; mm6=(00 10 20 30)
290        punpckhdq mm0,mm7               ; mm0=(01 11 21 31)
291        movq      mm5,mm3               ; transpose coefficients(phase 2)
292        punpckldq mm3,mm4               ; mm3=(02 12 22 32)
293        punpckhdq mm5,mm4               ; mm5=(03 13 23 33)
294
295        movq    mm7, MMWORD [wk(0)]     ; mm7=(60 70 61 71)
296        movq    mm4, MMWORD [wk(1)]     ; mm4=(62 72 63 73)
297
        ; Store the transposed results: first quadword of workspace rows 0..3.
298        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
299        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
300        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
301        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
302
303        movq      mm6,mm1               ; transpose coefficients(phase 2)
304        punpckldq mm1,mm7               ; mm1=(40 50 60 70)
305        punpckhdq mm6,mm7               ; mm6=(41 51 61 71)
306        movq      mm0,mm2               ; transpose coefficients(phase 2)
307        punpckldq mm2,mm4               ; mm2=(42 52 62 72)
308        punpckhdq mm0,mm4               ; mm0=(43 53 63 73)
309
        ; ... and the second quadword of the same four workspace rows.
310        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
311        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
312        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
313        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
314
315.nextcolumn:
        ; Advance input and multiplier pointers by four columns, and the
        ; workspace pointer by four full rows (the output is transposed).
316        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
317        add     edx, byte 4*SIZEOF_IFAST_MULT_TYPE      ; quantptr
318        add     edi, byte 4*DCTSIZE*SIZEOF_JCOEF        ; wsptr
319        dec     ecx                                     ; ctr
320        jnz     near .columnloop
321
322        ; ---- Pass 2: process rows from work array, store into output array.
323
324        mov     eax, [original_ebp]                     ; eax = frame base saved by the prologue
325        lea     esi, [workspace]                        ; JCOEF *wsptr
326        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
327        mov     eax, JDIMENSION [output_col(eax)]       ; eax = output_col from here on
328        mov     ecx, DCTSIZE/4                          ; ctr (4 output rows per iteration)
329        alignx  16,7
330.rowloop:
331
332        ; -- Even part
333
        ; Same arithmetic as pass 1, but the inputs come from the workspace
        ; and are not multiplied by the dct_table entries again.
334        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
335        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
336        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
337        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
338
339        movq    mm4,mm0
340        movq    mm5,mm1
341        psubw   mm0,mm2                 ; mm0=tmp11
342        psubw   mm1,mm3
343        paddw   mm4,mm2                 ; mm4=tmp10
344        paddw   mm5,mm3                 ; mm5=tmp13
345
346        psllw   mm1,PRE_MULTIPLY_SCALE_BITS
347        pmulhw  mm1,[GOTOFF(ebx,PW_F1414)]
348        psubw   mm1,mm5                 ; mm1=tmp12
349
350        movq    mm6,mm4
351        movq    mm7,mm0
352        psubw   mm4,mm5                 ; mm4=tmp3
353        psubw   mm0,mm1                 ; mm0=tmp2
354        paddw   mm6,mm5                 ; mm6=tmp0
355        paddw   mm7,mm1                 ; mm7=tmp1
356
357        movq    MMWORD [wk(1)], mm4     ; wk(1)=tmp3
358        movq    MMWORD [wk(0)], mm0     ; wk(0)=tmp2
359
360        ; -- Odd part
361
362        movq    mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
363        movq    mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
364        movq    mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
365        movq    mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
366
367        movq    mm4,mm2
368        movq    mm0,mm5
369        psubw   mm2,mm1                 ; mm2=z12
370        psubw   mm5,mm3                 ; mm5=z10
371        paddw   mm4,mm1                 ; mm4=z11
372        paddw   mm0,mm3                 ; mm0=z13
373
374        movq    mm1,mm5                 ; mm1=z10(unscaled)
375        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
376        psllw   mm5,PRE_MULTIPLY_SCALE_BITS
377
378        movq    mm3,mm4
379        psubw   mm4,mm0
380        paddw   mm3,mm0                 ; mm3=tmp7
381
382        psllw   mm4,PRE_MULTIPLY_SCALE_BITS
383        pmulhw  mm4,[GOTOFF(ebx,PW_F1414)]      ; mm4=tmp11
384
385        ; To avoid overflow...
386        ;
387        ; (Original)
388        ; tmp12 = -2.613125930 * z10 + z5;
389        ;
390        ; (This implementation)
391        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
392        ;       = -1.613125930 * z10 - z10 + z5;
393
394        movq    mm0,mm5
395        paddw   mm5,mm2
396        pmulhw  mm5,[GOTOFF(ebx,PW_F1847)]      ; mm5=z5
397        pmulhw  mm0,[GOTOFF(ebx,PW_MF1613)]
398        pmulhw  mm2,[GOTOFF(ebx,PW_F1082)]
399        psubw   mm0,mm1
400        psubw   mm2,mm5                 ; mm2=tmp10
401        paddw   mm0,mm5                 ; mm0=tmp12
402
403        ; -- Final output stage
404
405        psubw   mm0,mm3                 ; mm0=tmp6
406        movq    mm1,mm6
407        movq    mm5,mm7
408        paddw   mm6,mm3                 ; mm6=data0=(00 10 20 30)
409        paddw   mm7,mm0                 ; mm7=data1=(01 11 21 31)
410        psraw   mm6,(PASS1_BITS+3)      ; descale
411        psraw   mm7,(PASS1_BITS+3)      ; descale
412        psubw   mm1,mm3                 ; mm1=data7=(07 17 27 37)
413        psubw   mm5,mm0                 ; mm5=data6=(06 16 26 36)
414        psraw   mm1,(PASS1_BITS+3)      ; descale
415        psraw   mm5,(PASS1_BITS+3)      ; descale
416        psubw   mm4,mm0                 ; mm4=tmp5
417
        ; Saturate the descaled words to signed bytes.
418        packsswb  mm6,mm5               ; mm6=(00 10 20 30 06 16 26 36)
419        packsswb  mm7,mm1               ; mm7=(01 11 21 31 07 17 27 37)
420
421        movq    mm3, MMWORD [wk(0)]     ; mm3=tmp2
422        movq    mm0, MMWORD [wk(1)]     ; mm0=tmp3
423
424        paddw   mm2,mm4                 ; mm2=tmp4
425        movq    mm5,mm3
426        movq    mm1,mm0
427        paddw   mm3,mm4                 ; mm3=data2=(02 12 22 32)
428        paddw   mm0,mm2                 ; mm0=data4=(04 14 24 34)
429        psraw   mm3,(PASS1_BITS+3)      ; descale
430        psraw   mm0,(PASS1_BITS+3)      ; descale
431        psubw   mm5,mm4                 ; mm5=data5=(05 15 25 35)
432        psubw   mm1,mm2                 ; mm1=data3=(03 13 23 33)
433        psraw   mm5,(PASS1_BITS+3)      ; descale
434        psraw   mm1,(PASS1_BITS+3)      ; descale
435
436        movq      mm4,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm4=[PB_CENTERJSAMP]
437
438        packsswb  mm3,mm0               ; mm3=(02 12 22 32 04 14 24 34)
439        packsswb  mm1,mm5               ; mm1=(03 13 23 33 05 15 25 35)
440
        ; Re-center: add CENTERJSAMPLE to every byte (wrapping byte add).
441        paddb     mm6,mm4
442        paddb     mm7,mm4
443        paddb     mm3,mm4
444        paddb     mm1,mm4
445
446        movq      mm2,mm6               ; transpose coefficients(phase 1)
447        punpcklbw mm6,mm7               ; mm6=(00 01 10 11 20 21 30 31)
448        punpckhbw mm2,mm7               ; mm2=(06 07 16 17 26 27 36 37)
449        movq      mm0,mm3               ; transpose coefficients(phase 1)
450        punpcklbw mm3,mm1               ; mm3=(02 03 12 13 22 23 32 33)
451        punpckhbw mm0,mm1               ; mm0=(04 05 14 15 24 25 34 35)
452
453        movq      mm5,mm6               ; transpose coefficients(phase 2)
454        punpcklwd mm6,mm3               ; mm6=(00 01 02 03 10 11 12 13)
455        punpckhwd mm5,mm3               ; mm5=(20 21 22 23 30 31 32 33)
456        movq      mm4,mm0               ; transpose coefficients(phase 2)
457        punpcklwd mm0,mm2               ; mm0=(04 05 06 07 14 15 16 17)
458        punpckhwd mm4,mm2               ; mm4=(24 25 26 27 34 35 36 37)
459
460        movq      mm7,mm6               ; transpose coefficients(phase 3)
461        punpckldq mm6,mm0               ; mm6=(00 01 02 03 04 05 06 07)
462        punpckhdq mm7,mm0               ; mm7=(10 11 12 13 14 15 16 17)
463        movq      mm1,mm5               ; transpose coefficients(phase 3)
464        punpckldq mm5,mm4               ; mm5=(20 21 22 23 24 25 26 27)
465        punpckhdq mm1,mm4               ; mm1=(30 31 32 33 34 35 36 37)
466
        ; ebx is borrowed as a second row pointer for the stores below, so the
        ; GOT address is saved and restored around them.
467        pushpic ebx                     ; save GOT address
468
        ; Store four 8-sample rows at output_buf[row] + output_col.
469        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
470        mov     ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
471        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
472        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
473        mov     edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
474        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
475        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
476        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
477
478        poppic  ebx                     ; restore GOT address
479
480        add     esi, byte 4*SIZEOF_JCOEF        ; wsptr
481        add     edi, byte 4*SIZEOF_JSAMPROW     ; next four output row pointers
482        dec     ecx                             ; ctr
483        jnz     near .rowloop
484
485        emms            ; empty MMX state
486
        ; Epilogue: restore callee-saved registers in reverse push order, then
        ; unwind the aligned frame via the frame base stored at [ebp].
487        pop     edi
488        pop     esi
489;       pop     edx             ; need not be preserved
490;       pop     ecx             ; need not be preserved
491        pop     ebx
492        mov     esp,ebp         ; esp <- aligned ebp
493        pop     esp             ; esp <- original ebp
494        pop     ebp
495        ret
496
497; For some reason, the OS X linker does not honor the request to align the
498; segment unless we do this.
; (Pads the end of the code emitted above out to a 16-byte boundary.)
499        align   16
500