;
; jdsample.asm - upsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
; --------------------------------------------------------------------------
; Project-wide definitions.  The macros and symbols used throughout this
; file but not defined in it (EXTN, SEG_CONST, SEG_TEXT, alignz,
; collect_args, uncollect_args, XMMWORD, JSAMPROW, SIZEOF_*, BYTE_BIT, ...)
; are presumably supplied by this include -- confirm against jsimdext.inc.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_CONST

; Constant table for the "fancy" (triangle filter) upsamplers below.
; Each entry is one xmmword made of eight identical 16-bit words, so it can
; be used directly as a memory operand of pmullw/paddw.
;
;   PW_THREE        weight of the nearer sample (h2v1 and h2v2)
;   PW_ONE, PW_TWO  rounding terms added before the ">> 2" in h2v1
;   PW_SEVEN,
;   PW_EIGHT        rounding terms added before the ">> 4" in h2v2

        alignz  16
        global  EXTN(jconst_fancy_upsample_sse2)

EXTN(jconst_fancy_upsample_sse2):

PW_ONE          times 8 dw  1
PW_TWO          times 8 dw  2
PW_THREE        times 8 dw  3
PW_SEVEN        times 8 dw  7
PW_EIGHT        times 8 dw  8

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
;                                 JDIMENSION downsampled_width,
;                                 JSAMPARRAY input_data,
;                                 JSAMPARRAY *output_data_ptr);
;
; For each of the max_v_samp_factor rows, with s[] the input samples, every
; input sample s[i] yields two output samples:
;
;       out[2*i]   = (3*s[i] + s[i-1] + 1) >> 2
;       out[2*i+1] = (3*s[i] + s[i+1] + 2) >> 2
;
; s[-1] is taken to be s[0], and the sample following the last processed one
; is taken to be equal to it.  Nothing is done when either the width or the
; row count is zero.
;
; The row is processed in blocks of SIZEOF_XMMWORD input samples; the column
; count is rounded up to a whole number of blocks.  As written, the code
; relies on the following (confirm against the buffer allocator/callers):
;   - every row pointer is 16-byte aligned (all row accesses use movdqa);
;   - input rows are readable, and output rows writable, up to the rounded
;     width;
;   - when the width is not a multiple of SIZEOF_XMMWORD, the byte just past
;     the last input sample is writable (a dummy sample is stored there).
;
; Register roles inside the loops:
;   rax = colctr (samples left in the row)   rcx = rowctr
;   rsi = inptr / input_data                 rdi = outptr / output_data
;   xmm0 = all zeros (for byte->word unpacking)
;   xmm7 = sample preceding the current block, in byte lane 0
;   xmm6 = sample following the current block, in byte lane 15
;

; r10 = int max_v_samp_factor
; r11 = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
        push    rbp
        ; NOTE(review): rax is overwritten below without being read here;
        ; presumably the collect_args macro consumes it -- confirm in
        ; jsimdext.inc before touching this line.
        mov     rax,rsp
        mov     rbp,rsp
        collect_args

        mov     eax, r11d  ; colctr (32-bit move zero-extends into rax)
        test    rax,rax
        jz      near .return

        ; max_v_samp_factor is an int: only the low 32 bits of r10 are
        ; defined by the calling convention, so sign-extend them instead of
        ; trusting the upper half of the register.
        movsxd  rcx, r10d       ; rowctr
        test    rcx,rcx
        jz      near .return

        mov     rsi, r12        ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]                   ; output_data
.rowloop:
        push    rax                     ; colctr
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr

        ; If the width is not a whole number of blocks, replicate the last
        ; real sample one position to the right so the final odd output
        ; pixel interpolates against a copy of itself.
        test    rax, SIZEOF_XMMWORD-1
        jz      short .skip
        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
        pxor    xmm0,xmm0               ; xmm0=(all 0's)
        pcmpeqb xmm7,xmm7
        psrldq  xmm7,(SIZEOF_XMMWORD-1) ; mask keeping byte lane 0 only
        pand    xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm7=( 0 -- -- ... --): s[-1]=s[0]

        ; round colctr up to a multiple of SIZEOF_XMMWORD
        add     rax, byte SIZEOF_XMMWORD-1
        and     rax, byte -SIZEOF_XMMWORD
        cmp     rax, byte SIZEOF_XMMWORD
        ja      short .columnloop

.columnloop_last:
        ; last block of the row: the "next" sample is the block's own lane 15
        pcmpeqb xmm6,xmm6
        pslldq  xmm6,(SIZEOF_XMMWORD-1)
        pand    xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        jmp     short .upsample

.columnloop:
        ; not the last block: the "next" sample is lane 0 of the following
        ; block, moved up to lane 15
        movdqa  xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        pslldq  xmm6,(SIZEOF_XMMWORD-1)

.upsample:
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm2,xmm1
        movdqa  xmm3,xmm1               ; xmm1=( 0  1  2 ... 13 14 15)
        pslldq  xmm2,1                  ; xmm2=(--  0  1 ... 12 13 14)
        psrldq  xmm3,1                  ; xmm3=( 1  2  3 ... 14 15 --)

        por     xmm2,xmm7               ; xmm2=(-1  0  1 ... 12 13 14)
        por     xmm3,xmm6               ; xmm3=( 1  2  3 ... 14 15 16)

        ; carry this block's last sample over as the next block's s[-1]
        movdqa  xmm7,xmm1
        psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)

        ; widen bytes to words
        movdqa    xmm4,xmm1
        punpcklbw xmm1,xmm0             ; xmm1=( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4,xmm0             ; xmm4=( 8  9 10 11 12 13 14 15)
        movdqa    xmm5,xmm2
        punpcklbw xmm2,xmm0             ; xmm2=(-1  0  1  2  3  4  5  6)
        punpckhbw xmm5,xmm0             ; xmm5=( 7  8  9 10 11 12 13 14)
        movdqa    xmm6,xmm3
        punpcklbw xmm3,xmm0             ; xmm3=( 1  2  3  4  5  6  7  8)
        punpckhbw xmm6,xmm0             ; xmm6=( 9 10 11 12 13 14 15 16)

        pmullw  xmm1,[rel PW_THREE]     ; 3*s[i]
        pmullw  xmm4,[rel PW_THREE]
        paddw   xmm2,[rel PW_ONE]       ; s[i-1] + 1
        paddw   xmm5,[rel PW_ONE]
        paddw   xmm3,[rel PW_TWO]       ; s[i+1] + 2
        paddw   xmm6,[rel PW_TWO]

        paddw   xmm2,xmm1
        paddw   xmm5,xmm4
        psrlw   xmm2,2                  ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm5,2                  ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
        paddw   xmm3,xmm1
        paddw   xmm6,xmm4
        psrlw   xmm3,2                  ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm6,2                  ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

        ; interleave: even results stay in the low byte of each word, odd
        ; results are moved into the high byte
        psllw   xmm3,BYTE_BIT
        psllw   xmm6,BYTE_BIT
        por     xmm2,xmm3               ; xmm2=OutL=( 0  1  2 ... 13 14 15)
        por     xmm5,xmm6               ; xmm5=OutH=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5

        sub     rax, byte SIZEOF_XMMWORD
        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
        cmp     rax, byte SIZEOF_XMMWORD
        ja      near .columnloop                ; more than one block left
        test    eax,eax
        jnz     near .columnloop_last           ; exactly one block left

        pop     rsi
        pop     rdi
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rcx                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
;                                 JDIMENSION downsampled_width,
;                                 JSAMPARRAY input_data,
;                                 JSAMPARRAY *output_data_ptr);
;
; Each pass of the row loop consumes one input row (plus the rows directly
; above and below it) and produces two output rows; rowctr is therefore
; decremented by 2 per pass.  The work is done in two steps:
;
;   1. vertical:    Int0[i] = 3*row[0][i] + row[-1][i]      (upper output row)
;                   Int1[i] = 3*row[0][i] + row[+1][i]      (lower output row)
;      The 16-bit intermediates are parked in the output rows themselves.
;   2. horizontal:  out[2*i]   = (3*Int[i] + Int[i-1] + 8) >> 4
;                   out[2*i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
;      with Int[-1] taken as Int[0] and the value following the last
;      processed one taken to be equal to it.
;
; Nothing is done when either the width or the row count is zero.
;
; As written, the code relies on the following (confirm against the buffer
; allocator/callers):
;   - input_data[-1] and input_data[+1] are valid row pointers for every row
;     processed;
;   - every row pointer is 16-byte aligned (all row accesses use movdqa);
;   - rows are accessible up to the width rounded up to SIZEOF_XMMWORD
;     samples (twice that many bytes for output rows);
;   - when the width is not a multiple of SIZEOF_XMMWORD, the byte just past
;     the last sample of each of the three input rows is writable.
;
; Register roles inside the loops:
;   rax = colctr            rcx = inptr1(above)   rbx = inptr0
;   rsi = inptr1(below)     rdx = outptr0         rdi = outptr1
;
; Stack work area (16-byte aligned, addressed through rbp):
;   wk(0), wk(1) = Int0/Int1 value preceding the current block, in word 0
;   wk(2), wk(3) = Int0/Int1 value following the current block, in word 7
;

; r10 = int max_v_samp_factor
; r11 = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          4

        align   16
        global  EXTN(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
        push    rbp
        mov     rax,rsp                         ; rax = rsp after the push (points at the saved rbp)
        sub     rsp, byte 4
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp],rax                       ; keep the unaligned rsp for the epilogue
        mov     rbp,rsp                         ; rbp = aligned rbp
        lea     rsp, [wk(0)]                    ; reserve wk[0..WK_NUM-1] below rbp
        collect_args
        push    rbx

        mov     eax, r11d  ; colctr (32-bit move zero-extends into rax)
        test    rax,rax
        jz      near .return

        ; max_v_samp_factor is an int: only the low 32 bits of r10 are
        ; defined by the calling convention, so sign-extend them instead of
        ; trusting the upper half of the register.
        movsxd  rcx, r10d       ; rowctr
        test    rcx,rcx
        jz      near .return

        mov     rsi, r12        ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]                   ; output_data
.rowloop:
        push    rax                                     ; colctr
        push    rcx
        push    rdi
        push    rsi

        mov     rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1

        ; If the width is not a whole number of blocks, replicate the last
        ; real sample of each of the three input rows one position to the
        ; right.  rdx is saved because dl is used as the scratch byte.
        test    rax, SIZEOF_XMMWORD-1
        jz      short .skip
        push    rdx
        mov     dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
        pop     rdx
.skip:
        ; -- process the first column block

        movdqa  xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
        movdqa  xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
        movdqa  xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]

        pxor      xmm3,xmm3             ; xmm3=(all 0's)
        movdqa    xmm4,xmm0
        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
        movdqa    xmm5,xmm1
        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
        movdqa    xmm6,xmm2
        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

        pmullw  xmm0,[rel PW_THREE]
        pmullw  xmm4,[rel PW_THREE]

        pcmpeqb xmm7,xmm7
        psrldq  xmm7,(SIZEOF_XMMWORD-2) ; mask keeping word 0 only

        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6

        ; the first block has no left neighbour: use its own word 0
        pand    xmm1,xmm7               ; xmm1=( 0 -- -- -- -- -- -- --)
        pand    xmm2,xmm7               ; xmm2=( 0 -- -- -- -- -- -- --)

        movdqa  XMMWORD [wk(0)], xmm1
        movdqa  XMMWORD [wk(1)], xmm2

        ; round colctr up to a multiple of SIZEOF_XMMWORD
        add     rax, byte SIZEOF_XMMWORD-1
        and     rax, byte -SIZEOF_XMMWORD
        cmp     rax, byte SIZEOF_XMMWORD
        ja      short .columnloop

.columnloop_last:
        ; -- process the last column block
        ; (no right neighbour: use the block's own word 15, read back from
        ;  the intermediates parked in the output rows)

        pcmpeqb xmm1,xmm1
        pslldq  xmm1,(SIZEOF_XMMWORD-2)
        movdqa  xmm2,xmm1

        pand    xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
        pand    xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]

        movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
        movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)

        jmp     near .upsample

.columnloop:
        ; -- process the next column block
        ; (its vertical intermediates are computed one block ahead so that
        ;  the current block can see its right-hand neighbour)

        movdqa  xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
        movdqa  xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]

        pxor      xmm3,xmm3             ; xmm3=(all 0's)
        movdqa    xmm4,xmm0
        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
        movdqa    xmm5,xmm1
        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
        movdqa    xmm6,xmm2
        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

        pmullw  xmm0,[rel PW_THREE]
        pmullw  xmm4,[rel PW_THREE]

        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

        movdqa  XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
        movdqa  XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6

        pslldq  xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- --  0)
        pslldq  xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- --  0)

        movdqa  XMMWORD [wk(2)], xmm1
        movdqa  XMMWORD [wk(3)], xmm2

.upsample:
        ; -- process the upper row

        movdqa  xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]

        movdqa  xmm0,xmm7               ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
        movdqa  xmm4,xmm3               ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
        psrldq  xmm0,2                  ; xmm0=( 1  2  3  4  5  6  7 --)
        pslldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- --  8)
        movdqa  xmm5,xmm7
        movdqa  xmm6,xmm3
        psrldq  xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
        pslldq  xmm6,2                  ; xmm6=(--  8  9 10 11 12 13 14)

        por     xmm0,xmm4               ; xmm0=( 1  2  3  4  5  6  7  8)
        por     xmm5,xmm6               ; xmm5=( 7  8  9 10 11 12 13 14)

        movdqa  xmm1,xmm7
        movdqa  xmm2,xmm3
        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
        psrldq  xmm2,2                  ; xmm2=( 9 10 11 12 13 14 15 --)
        movdqa  xmm4,xmm3
        psrldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)

        por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
        por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)

        movdqa  XMMWORD [wk(0)], xmm4   ; becomes the next block's "-1" value

        pmullw  xmm7,[rel PW_THREE]
        pmullw  xmm3,[rel PW_THREE]
        paddw   xmm1,[rel PW_EIGHT]
        paddw   xmm5,[rel PW_EIGHT]
        paddw   xmm0,[rel PW_SEVEN]
        paddw   xmm2,[rel PW_SEVEN]

        paddw   xmm1,xmm7
        paddw   xmm5,xmm3
        psrlw   xmm1,4                  ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm5,4                  ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
        paddw   xmm0,xmm7
        paddw   xmm2,xmm3
        psrlw   xmm0,4                  ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm2,4                  ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

        psllw   xmm0,BYTE_BIT
        psllw   xmm2,BYTE_BIT
        por     xmm1,xmm0               ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
        por     xmm5,xmm2               ; xmm5=Out0H=(16 17 18 ... 29 30 31)

        ; final samples overwrite the parked intermediates of this block
        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5

        ; -- process the lower row

        movdqa  xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
        movdqa  xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]

        movdqa  xmm7,xmm6               ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
        movdqa  xmm3,xmm4               ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
        psrldq  xmm7,2                  ; xmm7=( 1  2  3  4  5  6  7 --)
        pslldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- --  8)
        movdqa  xmm0,xmm6
        movdqa  xmm2,xmm4
        psrldq  xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
        pslldq  xmm2,2                  ; xmm2=(--  8  9 10 11 12 13 14)

        por     xmm7,xmm3               ; xmm7=( 1  2  3  4  5  6  7  8)
        por     xmm0,xmm2               ; xmm0=( 7  8  9 10 11 12 13 14)

        movdqa  xmm1,xmm6
        movdqa  xmm5,xmm4
        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
        psrldq  xmm5,2                  ; xmm5=( 9 10 11 12 13 14 15 --)
        movdqa  xmm3,xmm4
        psrldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)

        por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
        por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)

        movdqa  XMMWORD [wk(1)], xmm3   ; becomes the next block's "-1" value

        pmullw  xmm6,[rel PW_THREE]
        pmullw  xmm4,[rel PW_THREE]
        paddw   xmm1,[rel PW_EIGHT]
        paddw   xmm0,[rel PW_EIGHT]
        paddw   xmm7,[rel PW_SEVEN]
        paddw   xmm5,[rel PW_SEVEN]

        paddw   xmm1,xmm6
        paddw   xmm0,xmm4
        psrlw   xmm1,4                  ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm0,4                  ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
        paddw   xmm7,xmm6
        paddw   xmm5,xmm4
        psrlw   xmm7,4                  ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm5,4                  ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

        psllw   xmm7,BYTE_BIT
        psllw   xmm5,BYTE_BIT
        por     xmm1,xmm7               ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
        por     xmm0,xmm5               ; xmm0=Out1H=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0

        sub     rax, byte SIZEOF_XMMWORD
        add     rcx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
        add     rbx, byte 1*SIZEOF_XMMWORD      ; inptr0
        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
        add     rdx, byte 2*SIZEOF_XMMWORD      ; outptr0
        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr1
        cmp     rax, byte SIZEOF_XMMWORD
        ja      near .columnloop                ; more than one block left
        test    rax,rax
        jnz     near .columnloop_last           ; exactly one block left

        pop     rsi
        pop     rdi
        pop     rcx
        pop     rax

        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     rcx, byte 2                     ; rowctr
        jg      near .rowloop

.return:
        pop     rbx
        uncollect_args
        mov     rsp,rbp         ; rsp <- aligned rbp
        pop     rsp             ; rsp <- unaligned rsp saved at [rbp] (points at the saved rbp)
        pop     rbp
        ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
;                           JDIMENSION output_width,
;                           JSAMPARRAY input_data,
;                           JSAMPARRAY *output_data_ptr);
;
; Every input sample is simply written twice: out[2*i] = out[2*i+1] = in[i].
; One input xmmword (SIZEOF_XMMWORD samples) expands to two output xmmwords;
; the column loop is unrolled to handle two input xmmwords per pass.
;
; output_width is rounded up to a multiple of 2*SIZEOF_XMMWORD, so the code
; may read input and write output beyond the nominal width.  It relies on
; the row buffers being large enough for that and on every row pointer
; being 16-byte aligned (movdqa is used) -- confirm against the buffer
; allocator/callers.  Nothing is done when the width or the row count is zero.
;
; Register roles inside the loops:
;   rdx = rounded output width (bytes per output row)
;   rax = colctr (output bytes left in the row)   rcx = rowctr
;   rsi = inptr / input_data                      rdi = outptr / output_data
;

; r10 = int max_v_samp_factor
; r11 = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
        push    rbp
        ; NOTE(review): rax is overwritten below without being read here;
        ; presumably the collect_args macro consumes it -- confirm in
        ; jsimdext.inc before touching this line.
        mov     rax,rsp
        mov     rbp,rsp
        collect_args

        mov     edx, r11d               ; 32-bit move zero-extends into rdx
        add     rdx, byte (2*SIZEOF_XMMWORD)-1
        and     rdx, byte -(2*SIZEOF_XMMWORD)   ; round up; ZF set if width was 0
        jz      near .return

        ; max_v_samp_factor is an int: only the low 32 bits of r10 are
        ; defined by the calling convention, so sign-extend them instead of
        ; trusting the upper half of the register.
        movsxd  rcx, r10d       ; rowctr
        test    rcx,rcx
        jz      short .return

        mov     rsi, r12 ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]                   ; output_data
.rowloop:
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]             ; inptr
        mov     rdi, JSAMPROW [rdi]             ; outptr
        mov     rax,rdx                         ; colctr
.columnloop:

        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        ; unpacking a register with itself duplicates every byte
        movdqa    xmm1,xmm0
        punpcklbw xmm0,xmm0
        punpckhbw xmm1,xmm1

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        ; second (unrolled) half of the pass
        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

        movdqa    xmm3,xmm2
        punpcklbw xmm2,xmm2
        punpckhbw xmm3,xmm3

        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr
        jmp     short .columnloop

.nextrow:
        pop     rsi
        pop     rdi

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rcx                             ; rowctr
        jg      short .rowloop

.return:
        uncollect_args
        pop     rbp
        ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
;                           JDIMENSION output_width,
;                           JSAMPARRAY input_data,
;                           JSAMPARRAY *output_data_ptr);
;
; Every input sample is replicated into a 2x2 block of output samples: each
; input row is doubled horizontally and the result is stored to two
; consecutive output rows.  rowctr is therefore decremented by 2 per input
; row, input_data advances by one row and output_data by two.
; One input xmmword expands to two output xmmwords per output row; the
; column loop is unrolled to handle two input xmmwords per pass.
;
; output_width is rounded up to a multiple of 2*SIZEOF_XMMWORD, so the code
; may read input and write output beyond the nominal width.  It relies on
; the row buffers being large enough for that and on every row pointer
; being 16-byte aligned (movdqa is used) -- confirm against the buffer
; allocator/callers.  Nothing is done when the width or the row count is zero.
;
; Register roles inside the loops:
;   rdx = rounded output width (bytes per output row)
;   rax = colctr (output bytes left in the row)   rcx = rowctr
;   rsi = inptr / input_data
;   rbx = outptr0      rdi = outptr1 / output_data
;

; r10 = int max_v_samp_factor
; r11 = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

        align   16
        global  EXTN(jsimd_h2v2_upsample_sse2)

EXTN(jsimd_h2v2_upsample_sse2):
        push    rbp
        ; NOTE(review): rax is overwritten below without being read here;
        ; presumably the collect_args macro consumes it -- confirm in
        ; jsimdext.inc before touching this line.
        mov     rax,rsp
        mov     rbp,rsp
        collect_args
        push    rbx                     ; rbx is used as outptr0 below

        mov     edx, r11d               ; 32-bit move zero-extends into rdx
        add     rdx, byte (2*SIZEOF_XMMWORD)-1
        and     rdx, byte -(2*SIZEOF_XMMWORD)   ; round up; ZF set if width was 0
        jz      near .return

        ; max_v_samp_factor is an int: only the low 32 bits of r10 are
        ; defined by the calling convention, so sign-extend them instead of
        ; trusting the upper half of the register.
        movsxd  rcx, r10d       ; rowctr
        test    rcx,rcx
        jz      near .return

        mov     rsi, r12        ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]                   ; output_data
.rowloop:
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]                     ; inptr
        mov     rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
        mov     rax,rdx                                 ; colctr
.columnloop:

        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        ; unpacking a register with itself duplicates every byte
        movdqa    xmm1,xmm0
        punpcklbw xmm0,xmm0
        punpckhbw xmm1,xmm1

        ; the same doubled data goes to both output rows
        movdqa  XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        ; second (unrolled) half of the pass
        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

        movdqa    xmm3,xmm2
        punpcklbw xmm2,xmm2
        punpckhbw xmm3,xmm3

        movdqa  XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rbx, byte 4*SIZEOF_XMMWORD      ; outptr0
        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr1
        jmp     short .columnloop

.nextrow:
        pop     rsi
        pop     rdi

        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     rcx, byte 2                     ; rowctr
        jg      near .rowloop

.return:
        pop     rbx
        uncollect_args
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16
671