1;
2; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright 2009 D. R. Commander
6;
7; Based on
8; x86 SIMD extension for IJG JPEG library
9; Copyright (C) 1999-2006, MIYASAKA Masaru.
10; For conditions of distribution and use, see copyright notice in jsimdext.inc
11;
12; This file should be assembled with NASM (Netwide Assembler),
13; can *not* be assembled with Microsoft's MASM or any compatible
14; assembler (including Borland's Turbo Assembler).
15; NASM is available from http://nasm.sourceforge.net/ or
16; http://sourceforge.net/project/showfiles.php?group_id=6208
17;
18; This file contains a floating-point implementation of the inverse DCT
19; (Discrete Cosine Transform). The following code is based directly on
20; the IJG's original jidctflt.c; see the jidctflt.c for more details.
21;
22; [TAB8]
23
24%include "jsimdext.inc"
25%include "jdct.inc"
26
27; --------------------------------------------------------------------------
28
; unpcklps2 dst, src
;   dst=(0 1 2 3) / src=(4 5 6 7)  =>  dst=(0 1 4 5)
; Keeps the low two floats of dst and appends the low two floats of src.
; The shufps selector is four 2-bit lane indices, lowest field first:
; 00,01 pick result lanes 0-1 from dst, 00,01 pick result lanes 2-3 from src.
29%macro  unpcklps2 2
30        shufps  %1, %2, 01000100b
31%endmacro
32
; unpckhps2 dst, src
;   dst=(0 1 2 3) / src=(4 5 6 7)  =>  dst=(2 3 6 7)
; Keeps the high two floats of dst and appends the high two floats of src.
; The shufps selector is four 2-bit lane indices, lowest field first:
; 10,11 pick result lanes 0-1 from dst, 10,11 pick result lanes 2-3 from src.
33%macro  unpckhps2 2
34        shufps  %1, %2, 11101110b
35%endmacro
36
37; --------------------------------------------------------------------------
; Read-only constants used by jsimd_idct_float_sse2.  Every PD_* entry is
; one value repeated four times ("times 4 dd"), i.e. a full 16-byte vector
; of single-precision floats, so it can be used directly as a packed memory
; operand (mulps / movaps).  PB_CENTERJSAMP is the same idea with 16 bytes.
; The table base is exported as jconst_idct_float_sse2; keep the order and
; size of the entries unchanged.
; NOTE(review): alignz is defined in jsimdext.inc (not visible here);
; presumably it pads to the requested 16-byte alignment -- confirm.
38        SECTION SEG_CONST
39
40        alignz  16
41        global  EXTN(jconst_idct_float_sse2)
42
43EXTN(jconst_idct_float_sse2):
44
45PD_1_414        times 4 dd  1.414213562373095048801689     ; ~sqrt(2); scales (in2-in6) and (z11-z13)
46PD_1_847        times 4 dd  1.847759065022573512256366     ; z5 = (z10 + z12) * 1.847759065
47PD_1_082        times 4 dd  1.082392200292393968799446     ; tmp10 = z12 * 1.082392200 - z5
48PD_M2_613       times 4 dd -2.613125929752753055713286     ; tmp12 = z10 * -2.613125930 + z5
; 100663296.0 = 1.5 * 2^26.  At this magnitude one float ulp is 8, so adding
; the constant rounds the other addend to a multiple of 8 and leaves
; round(value/8) in the low bits of the float's bit pattern (pass 2 then
; masks/shifts out the low 16 bits of each dword).
49PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3)
50PB_CENTERJSAMP  times 16 db CENTERJSAMPLE     ; added bytewise after packsswb in pass 2
51
52        alignz  16
53
54; --------------------------------------------------------------------------
55        SECTION SEG_TEXT
56        BITS    64
57;
58; Perform dequantization and inverse DCT on one block of coefficients.
59;
60; GLOBAL(void)
61; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
62;                        JSAMPARRAY output_buf, JDIMENSION output_col)
63;
; The routine makes two passes over a FAST_FLOAT workspace[DCTSIZE2] that
; lives on the stack:
;   Pass 1 reads the coefficients four columns at a time, multiplies them
;          by the matching dct_table entries, runs the 1-D transform and
;          stores the results transposed into the workspace.
;   Pass 2 reads the workspace four rows at a time, runs the same 1-D
;          transform, converts to rounded integers (value/8), saturates
;          to signed bytes, adds CENTERJSAMPLE and writes 8 samples to
;          each of four output rows, starting at column output_col.
;
; Registers: xmm0-xmm7 and rax, rcx, rdx, rsi, rdi are overwritten; rbx is
; saved and restored here with push/pop.
; NOTE(review): collect_args / uncollect_args come from jsimdext.inc (not
; visible in this file).  They are expected to place the four arguments in
; r10-r13 as listed below and to save/restore whatever else the target ABI
; requires -- confirm against jsimdext.inc.
;
64
65; r10 = void * dct_table
66; r11 = JCOEFPTR coef_block
67; r12 = JSAMPARRAY output_buf
68; r13 = JDIMENSION output_col
69
; Stack frame, addressed from the 16-byte aligned rbp set up in the prologue:
;   [rbp+0]                 value of rsp right after "push rbp" (used only
;                           by the epilogue to undo the alignment)
;   wk(0) .. wk(WK_NUM-1)   xmmword scratch slots directly below rbp
;   workspace               DCTSIZE2 FAST_FLOATs directly below wk(0)
70%define original_rbp    rbp+0
71%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
72%define WK_NUM          2
73%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
74                                        ; FAST_FLOAT workspace[DCTSIZE2]
75
76        align   16
77        global  EXTN(jsimd_idct_float_sse2)
78
79EXTN(jsimd_idct_float_sse2):
80        push    rbp
81        mov     rax,rsp                         ; rax = rsp after the push (address of the saved rbp)
        ; Step below the saved rbp, then round down to 16 bytes.  Assuming
        ; rsp is at least 8-byte aligned here (as the 64-bit ABIs require),
        ; the aligned rsp ends up 8 or 16 bytes below rax, so the 8-byte
        ; store at [rsp] cannot overlap the saved rbp.
82        sub     rsp, byte 4
83        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
84        mov     [rsp],rax                       ; [original_rbp] = pre-alignment rsp
85        mov     rbp,rsp                         ; rbp = aligned rbp
86        lea     rsp, [workspace]                ; reserve wk[] and workspace[]
87        collect_args
88        push    rbx
89
90        ; ---- Pass 1: process columns from input, store into work array.
91
92        mov     rdx, r10                ; quantptr
93        mov     rsi, r11                ; inptr
94        lea     rdi, [workspace]                        ; FAST_FLOAT * wsptr
95        mov     rcx, DCTSIZE/4                          ; ctr
        ; Each iteration handles four adjacent columns.
96.columnloop:
97%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
        ; Cheap early-out: OR one dword from each of rows 1 and 2 before
        ; doing the full test of rows 1-7 below.
98        mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
99        or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
100        jnz     near .columnDCT
101
102        movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
103        movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
104        movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
105        movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
106        movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
107        movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
108        movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
109        por     xmm1,xmm2
110        por     xmm3,xmm4
111        por     xmm5,xmm6
112        por     xmm1,xmm3
113        por     xmm5,xmm7
114        por     xmm1,xmm5               ; xmm1 = OR of rows 1..7 (4 words)
        ; Fold the four words into the low four bytes; signed saturation
        ; keeps any nonzero word nonzero.
115        packsswb xmm1,xmm1
116        movd    eax,xmm1                ; upper half of rax is cleared
117        test    rax,rax
118        jnz     short .columnDCT
119
120        ; -- AC terms all zero
        ; Only row 0 is nonzero: dequantize it and replicate each column's
        ; value across that column's eight outputs.
121
122        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
123
124        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
125        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
126        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
127
128        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
129
130        movaps  xmm1,xmm0
131        movaps  xmm2,xmm0
132        movaps  xmm3,xmm0
133
134        shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
135        shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
136        shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
137        shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
138
139        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
140        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
141        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
142        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
143        movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
144        movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
145        movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
146        movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
147        jmp     near .nextcolumn
148%endif
149.columnDCT:
150
151        ; -- Even part
        ; Rows 0,2,4,6: sign-extend the 16-bit coefficients to dwords
        ; (punpcklwd + psrad), convert to float and dequantize.
152
153        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
154        movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
155        movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
156        movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
157
158        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
159        punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
160        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
161        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
162        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
163        cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)
164
165        punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
166        punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
167        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
168        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
169        cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
170        cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)
171
172        mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
173        mulps     xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
174        mulps     xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
175        mulps     xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
176
177        movaps  xmm4,xmm0
178        movaps  xmm5,xmm1
179        subps   xmm0,xmm2               ; xmm0=tmp11
180        subps   xmm1,xmm3
181        addps   xmm4,xmm2               ; xmm4=tmp10
182        addps   xmm5,xmm3               ; xmm5=tmp13
183
184        mulps   xmm1,[rel PD_1_414]
185        subps   xmm1,xmm5               ; xmm1=tmp12
186
187        movaps  xmm6,xmm4
188        movaps  xmm7,xmm0
189        subps   xmm4,xmm5               ; xmm4=tmp3
190        subps   xmm0,xmm1               ; xmm0=tmp2
191        addps   xmm6,xmm5               ; xmm6=tmp0
192        addps   xmm7,xmm1               ; xmm7=tmp1
193
        ; Spill tmp2/tmp3: all eight xmm registers are needed for the odd part.
194        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
195        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
196
197        ; -- Odd part
198
199        movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
200        movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
201        movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
202        movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
203
204        punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
205        punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
206        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
207        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
208        cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
209        cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)
210
211        punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
212        punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
213        psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
214        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
215        cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
216        cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)
217
218        mulps     xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
219        mulps     xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
220        mulps     xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
221        mulps     xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
222
223        movaps  xmm4,xmm2
224        movaps  xmm0,xmm5
225        addps   xmm2,xmm1               ; xmm2=z11
226        addps   xmm5,xmm3               ; xmm5=z13
227        subps   xmm4,xmm1               ; xmm4=z12
228        subps   xmm0,xmm3               ; xmm0=z10
229
230        movaps  xmm1,xmm2
231        subps   xmm2,xmm5
232        addps   xmm1,xmm5               ; xmm1=tmp7
233
234        mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
235
236        movaps  xmm3,xmm0
237        addps   xmm0,xmm4
238        mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
239        mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
240        mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
241        addps   xmm3,xmm0               ; xmm3=tmp12
242        subps   xmm4,xmm0               ; xmm4=tmp10
243
244        ; -- Final output stage
        ; Combine even and odd parts, then transpose the 8x4 result so the
        ; workspace holds it in the layout pass 2 reads.
245
246        subps   xmm3,xmm1               ; xmm3=tmp6
247        movaps  xmm5,xmm6
248        movaps  xmm0,xmm7
249        addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
250        addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
251        subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
252        subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
253        subps   xmm2,xmm3               ; xmm2=tmp5
254
255        movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
256        unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
257        unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
258        movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
259        unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
260        unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
261
        ; Swap the spilled tmp2/tmp3 back in and park the half-transposed
        ; rows 6/7 in the same two scratch slots.
262        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
263        movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
264
265        movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
266        movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
267
268        addps   xmm4,xmm2               ; xmm4=tmp4
269        movaps  xmm0,xmm7
270        movaps  xmm3,xmm5
271        addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
272        addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
273        subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
274        subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
275
276        movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
277        unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
278        unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
279        movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
280        unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
281        unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
282
283        movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
284        unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
285        unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
286        movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
287        unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
288        unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
289
290        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
291        movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
292
293        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
294        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
295        movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
296        movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
297
298        movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
299        unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
300        unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
301        movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
302        unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
303        unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
304
305        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
306        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
307        movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
308        movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
309
310.nextcolumn:
311        add     rsi, byte 4*SIZEOF_JCOEF                ; coef_block
312        add     rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
313        add     rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
314        dec     rcx                                     ; ctr
315        jnz     near .columnloop
316
317        ; -- Prefetch the next coefficient block
        ; rsi has advanced by DCTSIZE coefficients during pass 1, so
        ; rsi + (DCTSIZE2-8)*SIZEOF_JCOEF is the first byte past this block.
318
319        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
320        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
321        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
322        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
323
324        ; ---- Pass 2: process rows from work array, store into output array.
        ; Each iteration handles four rows.  rax carries output_col for the
        ; whole loop.
325
327        lea     rsi, [workspace]                        ; FAST_FLOAT * wsptr
328        mov     rdi, r12        ; (JSAMPROW *)
329        mov     eax, r13d       ; rax = output_col (32-bit move zero-extends)
330        mov     rcx, DCTSIZE/4                          ; ctr
331.rowloop:
332
333        ; -- Even part
334
335        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
336        movaps  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
337        movaps  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
338        movaps  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
339
340        movaps  xmm4,xmm0
341        movaps  xmm5,xmm1
342        subps   xmm0,xmm2               ; xmm0=tmp11
343        subps   xmm1,xmm3
344        addps   xmm4,xmm2               ; xmm4=tmp10
345        addps   xmm5,xmm3               ; xmm5=tmp13
346
347        mulps   xmm1,[rel PD_1_414]
348        subps   xmm1,xmm5               ; xmm1=tmp12
349
350        movaps  xmm6,xmm4
351        movaps  xmm7,xmm0
352        subps   xmm4,xmm5               ; xmm4=tmp3
353        subps   xmm0,xmm1               ; xmm0=tmp2
354        addps   xmm6,xmm5               ; xmm6=tmp0
355        addps   xmm7,xmm1               ; xmm7=tmp1
356
357        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
358        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
359
360        ; -- Odd part
361
362        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
363        movaps  xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
364        movaps  xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
365        movaps  xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
366
367        movaps  xmm4,xmm2
368        movaps  xmm0,xmm5
369        addps   xmm2,xmm1               ; xmm2=z11
370        addps   xmm5,xmm3               ; xmm5=z13
371        subps   xmm4,xmm1               ; xmm4=z12
372        subps   xmm0,xmm3               ; xmm0=z10
373
374        movaps  xmm1,xmm2
375        subps   xmm2,xmm5
376        addps   xmm1,xmm5               ; xmm1=tmp7
377
378        mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
379
380        movaps  xmm3,xmm0
381        addps   xmm0,xmm4
382        mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
383        mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
384        mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
385        addps   xmm3,xmm0               ; xmm3=tmp12
386        subps   xmm4,xmm0               ; xmm4=tmp10
387
388        ; -- Final output stage
389
390        subps   xmm3,xmm1               ; xmm3=tmp6
391        movaps  xmm5,xmm6
392        movaps  xmm0,xmm7
393        addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
394        addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
395        subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
396        subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
397        subps   xmm2,xmm3               ; xmm2=tmp5
398
        ; Float -> int16 without a conversion instruction: adding
        ; PD_RNDINT_MAGIC (1.5 * 2^26, where one ulp is 8) leaves
        ; round(data/8) in the low 16 bits of each dword's bit pattern.
        ; The even-numbered result keeps its low word (pand with 0x0000FFFF),
        ; the odd-numbered one is shifted into the high word, and the two
        ; are merged with por.
399        movaps  xmm1,[rel PD_RNDINT_MAGIC]      ; xmm1=[rel PD_RNDINT_MAGIC]
400        pcmpeqd xmm3,xmm3
401        psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
402
403        addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
404        addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
405        addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
406        addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
407
408        pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
409        pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
410        pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
411        pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
412        por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
413        por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)
414
415        movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
416        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
417
418        addps   xmm4,xmm2               ; xmm4=tmp4
419        movaps  xmm7,xmm1
420        movaps  xmm5,xmm3
421        addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
422        addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
423        subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
424        subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)
425
426        movaps  xmm2,[rel PD_RNDINT_MAGIC]      ; xmm2=[rel PD_RNDINT_MAGIC]
427        pcmpeqd xmm4,xmm4
428        psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
429
430        addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
431        addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
432        addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
433        addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
434
435        pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
436        pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
437        pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
438        pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
439        por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
440        por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)
441
        ; Saturate the signed words to signed bytes, then add CENTERJSAMPLE
        ; bytewise (wrapping) to move the values into the sample range.
442        movdqa    xmm2,[rel PB_CENTERJSAMP]     ; xmm2=[rel PB_CENTERJSAMP]
443
444        packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
445        packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
446        paddb     xmm6,xmm2
447        paddb     xmm1,xmm2
448
449        movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
450        punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
451        punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
452
453        movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
454        punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
455        punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
456
        ; Swap the 64-bit halves so every output row is available in the
        ; low quadword of some register for the movq stores below.
457        pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
458        pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
459
        ; rdx/rbx = output row pointers taken from output_buf; each store
        ; writes 8 samples at column rax (= output_col).
460        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
461        mov     rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
462        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
463        movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
464        mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
465        mov     rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
466        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
467        movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
468
469        add     rsi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
470        add     rdi, byte 4*SIZEOF_JSAMPROW
471        dec     rcx                             ; ctr
472        jnz     near .rowloop
473
        ; Epilogue: undo the prologue in reverse order.
474        pop     rbx
475        uncollect_args
476        mov     rsp,rbp         ; rsp <- aligned rbp
477        pop     rsp             ; rsp <- pre-alignment rsp saved at [original_rbp]
478        pop     rbp
479        ret
480
481; For some reason, the OS X linker does not honor the request to align the
482; segment unless we do this.
483        align   16
484