;
; jdsample.asm - upsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
; Constant table used by the "fancy" (triangle filter) upsamplers in this
; file.  Every entry is four identical 16-bit words, i.e. one 64-bit MMX
; operand, so it can be applied to four unpacked samples at once.
;
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_fancy_upsample_mmx)

EXTN(jconst_fancy_upsample_mmx):

PW_ONE          times 4 dw  1   ; h2v1: rounding bias, even outputs (before >>2)
PW_TWO          times 4 dw  2   ; h2v1: rounding bias, odd outputs  (before >>2)
PW_THREE        times 4 dw  3   ; weight applied to the nearer sample/row
PW_SEVEN        times 4 dw  7   ; h2v2: rounding bias, odd outputs  (before >>4)
PW_EIGHT        times 4 dw  8   ; h2v2: rounding bias, even outputs (before >>4)

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY * output_data_ptr);
;
; For every input sample s[i] of a row, two output samples are produced:
;
;       out[2*i]   = (3*s[i] + s[i-1] + 1) >> 2
;       out[2*i+1] = (3*s[i] + s[i+1] + 2) >> 2
;
; At the left edge s[-1] is taken to be s[0]; at the right edge the missing
; neighbour is the last sample itself (see the "dummy sample" below).
;
; In:    stack arguments, read relative to ebp (see the %defines below).
; Out:   nothing is returned in a register; the rows addressed through
;        *output_data_ptr are overwritten.
; Regs:  eax = colctr (samples left in the row, rounded up to a multiple
;              of SIZEOF_MMWORD inside the row loop)
;        ecx = rowctr
;        esi = input_data  (row pointer array) / inptr  inside a row
;        edi = output_data (row pointer array) / outptr inside a row
;        ebx = GOT address for GOTOFF() constant accesses
;        mm0 = all zeros (used to unpack bytes to words)
;        mm7 = sample preceding the current block, carried between blocks
; Clobb: eax, ecx, edx, mm0-mm7, flags.  ebp, esi, edi are restored; ebx is
;        saved/restored via pushpic/poppic.
;
; NOTE(review): the routine always loads and stores whole MMWORDs and may
; write one extra byte to the input row, so the caller's row buffers
; presumably have to be padded accordingly -- confirm against the allocator.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_fancy_upsample_mmx)

EXTN(jsimd_h2v1_fancy_upsample_mmx):
        push    ebp
        mov     ebp,esp
        pushpic ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx             ; get GOT address

        ; Nothing to do for an empty row or a zero row count.
        mov     eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
        test    eax,eax
        jz      near .return

        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
        test    ecx,ecx
        jz      near .return

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        ; Save the per-row state; eax/esi/edi are reused inside the row.
        push    eax                     ; colctr
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]     ; inptr
        mov     edi, JSAMPROW [edi]     ; outptr

        ; If the width is not a multiple of SIZEOF_MMWORD, duplicate the last
        ; real sample one position past the end of the row so that it acts as
        ; its own right-hand neighbour.
        test    eax, SIZEOF_MMWORD-1
        jz      short .skip
        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
        pxor    mm0,mm0                 ; mm0=(all 0's)
        ; mm7 = first sample of the row in the lowest byte: the left-hand
        ; neighbour of sample 0 is sample 0 itself.
        pcmpeqb mm7,mm7
        psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT
        pand    mm7, MMWORD [esi+0*SIZEOF_MMWORD]

        ; Round colctr up to a whole number of MMWORD blocks.
        add     eax, byte SIZEOF_MMWORD-1
        and     eax, byte -SIZEOF_MMWORD
        cmp     eax, byte SIZEOF_MMWORD
        ja      short .columnloop
        alignx  16,7

.columnloop_last:
        ; Last block of the row: the right-hand neighbour of byte 7 is
        ; byte 7 of this same block (kept in the top byte of mm6).
        pcmpeqb mm6,mm6
        psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
        pand    mm6, MMWORD [esi+0*SIZEOF_MMWORD]
        jmp     short .upsample
        alignx  16,7

.columnloop:
        ; Not the last block: the right-hand neighbour of byte 7 is byte 0
        ; of the next block, moved into the top byte of mm6.
        movq    mm6, MMWORD [esi+1*SIZEOF_MMWORD]
        psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT

.upsample:
        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm2,mm1
        movq    mm3,mm1                 ; mm1=( 0 1 2 3 4 5 6 7)
        psllq   mm2,BYTE_BIT            ; mm2=( - 0 1 2 3 4 5 6)
        psrlq   mm3,BYTE_BIT            ; mm3=( 1 2 3 4 5 6 7 -)

        por     mm2,mm7                 ; mm2=(-1 0 1 2 3 4 5 6)
        por     mm3,mm6                 ; mm3=( 1 2 3 4 5 6 7 8)

        ; Carry byte 7 of this block to the next iteration as its "-1".
        movq    mm7,mm1
        psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)

        ; Unpack the three byte vectors to 16-bit words (low/high halves).
        movq      mm4,mm1
        punpcklbw mm1,mm0               ; mm1=( 0 1 2 3)
        punpckhbw mm4,mm0               ; mm4=( 4 5 6 7)
        movq      mm5,mm2
        punpcklbw mm2,mm0               ; mm2=(-1 0 1 2)
        punpckhbw mm5,mm0               ; mm5=( 3 4 5 6)
        movq      mm6,mm3
        punpcklbw mm3,mm0               ; mm3=( 1 2 3 4)
        punpckhbw mm6,mm0               ; mm6=( 5 6 7 8)

        ; 3*current, plus neighbour, plus rounding bias.
        pmullw  mm1,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
        paddw   mm2,[GOTOFF(ebx,PW_ONE)]
        paddw   mm5,[GOTOFF(ebx,PW_ONE)]
        paddw   mm3,[GOTOFF(ebx,PW_TWO)]
        paddw   mm6,[GOTOFF(ebx,PW_TWO)]

        paddw   mm2,mm1
        paddw   mm5,mm4
        psrlw   mm2,2                   ; mm2=OutLE=( 0  2  4  6)
        psrlw   mm5,2                   ; mm5=OutHE=( 8 10 12 14)
        paddw   mm3,mm1
        paddw   mm6,mm4
        psrlw   mm3,2                   ; mm3=OutLO=( 1  3  5  7)
        psrlw   mm6,2                   ; mm6=OutHO=( 9 11 13 15)

        ; Interleave: even results stay in the low byte of each word, odd
        ; results are shifted into the high byte.
        psllw   mm3,BYTE_BIT
        psllw   mm6,BYTE_BIT
        por     mm2,mm3                 ; mm2=OutL=( 0  1  2  3  4  5  6  7)
        por     mm5,mm6                 ; mm5=OutH=( 8  9 10 11 12 13 14 15)

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm5

        ; One input block consumed, two output blocks produced.
        sub     eax, byte SIZEOF_MMWORD
        add     esi, byte 1*SIZEOF_MMWORD       ; inptr
        add     edi, byte 2*SIZEOF_MMWORD       ; outptr
        cmp     eax, byte SIZEOF_MMWORD
        ja      near .columnloop                ; more than one block left
        test    eax,eax
        jnz     near .columnloop_last           ; exactly one block left

        pop     esi
        pop     edi
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; input_data
        add     edi, byte SIZEOF_JSAMPROW       ; output_data
        dec     ecx                             ; rowctr
        jg      near .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        poppic  ebx
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY * output_data_ptr);
;
; Each input row produces two output rows, in two passes per column block:
;
;   1. Vertical pass (16-bit intermediates, stored temporarily in the
;      output rows themselves):
;         Int0[i] = 3*row[0][i] + row[-1][i]      (upper output row)
;         Int1[i] = 3*row[0][i] + row[+1][i]      (lower output row)
;   2. Horizontal pass on each intermediate row:
;         out[2*i]   = (3*Int[i] + Int[i-1] + 8) >> 4
;         out[2*i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
;
; The routine dereferences input_data[-1] and input_data[+1] relative to
; every processed row, so those row pointers must be valid.
;
; Stack frame: ebp is aligned to SIZEOF_MMWORD; [ebp] holds the unaligned
; frame pointer value, wk(0)..wk(WK_NUM-1) lie directly below ebp and the
; saved GOT pointer lies below wk(0).
;
;   wk(0)/wk(1) = Int0/Int1 word preceding the current block ("-1")
;   wk(2)/wk(3) = Int0/Int1 word following the current block ("8")
;
; Regs (inside a row):
;        eax = colctr, ecx = inptr1(above), ebx = inptr0, esi = inptr1(below),
;        edx = outptr0, edi = outptr1.  ebx is temporarily swapped for the
;        GOT address (pushpic/movpic ... poppic) around constant accesses.
; Clobb: eax, ecx, edx, mm0-mm7, flags.  ebx, esi, edi, ebp, esp restored.
;
; NOTE(review): whole MMWORDs are loaded/stored and one dummy byte may be
; written past the end of each of the three input rows, so the row buffers
; presumably have to be padded -- confirm against the allocator.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM          4
%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr

        align   16
        global  EXTN(jsimd_h2v2_fancy_upsample_mmx)

EXTN(jsimd_h2v2_fancy_upsample_mmx):
        push    ebp
        ; eax = the value a conventional "mov ebp,esp" frame pointer would
        ; have; the arguments are addressed relative to it.
        mov     eax,esp                         ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp],eax                       ; saved at [original_ebp]
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [wk(0)]                    ; reserve wk[WK_NUM]
        pushpic eax             ; make a room for GOT address
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                     ; get GOT address
        movpic  POINTER [gotptr], ebx   ; save GOT address

        ; eax still holds the unaligned frame pointer set up above.
        mov     edx,eax                         ; edx = original ebp
        mov     eax, JDIMENSION [downsamp_width(edx)]  ; colctr
        test    eax,eax
        jz      near .return

        mov     ecx, INT [max_v_samp(edx)]      ; rowctr
        test    ecx,ecx
        jz      near .return

        mov     esi, JSAMPARRAY [input_data(edx)]       ; input_data
        mov     edi, POINTER [output_data_ptr(edx)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        ; Save per-row state; all four registers are reused inside the row.
        push    eax                                     ; colctr
        push    ecx
        push    edi
        push    esi

        mov     ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1

        ; If the width is not a multiple of SIZEOF_MMWORD, duplicate the last
        ; real sample of each of the three input rows one position past the
        ; end.  edx (outptr0) is saved because dl is used as scratch.
        test    eax, SIZEOF_MMWORD-1
        jz      short .skip
        push    edx
        mov     dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
        pop     edx
.skip:
        ; -- process the first column block

        movq    mm0, MMWORD [ebx+0*SIZEOF_MMWORD]       ; mm0=row[ 0][0]
        movq    mm1, MMWORD [ecx+0*SIZEOF_MMWORD]       ; mm1=row[-1][0]
        movq    mm2, MMWORD [esi+0*SIZEOF_MMWORD]       ; mm2=row[+1][0]

        pushpic ebx
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        pxor      mm3,mm3               ; mm3=(all 0's)
        movq      mm4,mm0
        punpcklbw mm0,mm3               ; mm0=row[ 0][0]( 0 1 2 3)
        punpckhbw mm4,mm3               ; mm4=row[ 0][0]( 4 5 6 7)
        movq      mm5,mm1
        punpcklbw mm1,mm3               ; mm1=row[-1][0]( 0 1 2 3)
        punpckhbw mm5,mm3               ; mm5=row[-1][0]( 4 5 6 7)
        movq      mm6,mm2
        punpcklbw mm2,mm3               ; mm2=row[+1][0]( 0 1 2 3)
        punpckhbw mm6,mm3               ; mm6=row[+1][0]( 4 5 6 7)

        pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]

        ; mm7 = mask selecting the lowest 16-bit word.
        pcmpeqb mm7,mm7
        psrlq   mm7,(SIZEOF_MMWORD-2)*BYTE_BIT

        paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
        paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
        paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
        paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)

        movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1       ; temporarily save
        movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5       ; the intermediate data
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm6

        ; Left edge: the "-1" neighbour of word 0 is word 0 itself.
        pand    mm1,mm7                 ; mm1=( 0 - - -)
        pand    mm2,mm7                 ; mm2=( 0 - - -)

        movq    MMWORD [wk(0)], mm1
        movq    MMWORD [wk(1)], mm2

        poppic  ebx

        ; Round colctr up to a whole number of MMWORD blocks.
        add     eax, byte SIZEOF_MMWORD-1
        and     eax, byte -SIZEOF_MMWORD
        cmp     eax, byte SIZEOF_MMWORD
        ja      short .columnloop
        alignx  16,7

.columnloop_last:
        ; -- process the last column block

        pushpic ebx
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        ; Right edge: the "8" neighbour is word 7 of this block itself,
        ; kept in the highest word position.
        pcmpeqb mm1,mm1
        psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
        movq    mm2,mm1

        pand    mm1, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm1=( - - - 7)
        pand    mm2, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm2=( - - - 7)

        movq    MMWORD [wk(2)], mm1
        movq    MMWORD [wk(3)], mm2

        jmp     short .upsample
        alignx  16,7

.columnloop:
        ; -- process the next column block
        ; Run the vertical pass for the following block first, so that its
        ; first intermediate word is available as this block's "8" neighbour.

        movq    mm0, MMWORD [ebx+1*SIZEOF_MMWORD]       ; mm0=row[ 0][1]
        movq    mm1, MMWORD [ecx+1*SIZEOF_MMWORD]       ; mm1=row[-1][1]
        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]       ; mm2=row[+1][1]

        pushpic ebx
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        pxor      mm3,mm3               ; mm3=(all 0's)
        movq      mm4,mm0
        punpcklbw mm0,mm3               ; mm0=row[ 0][1]( 0 1 2 3)
        punpckhbw mm4,mm3               ; mm4=row[ 0][1]( 4 5 6 7)
        movq      mm5,mm1
        punpcklbw mm1,mm3               ; mm1=row[-1][1]( 0 1 2 3)
        punpckhbw mm5,mm3               ; mm5=row[-1][1]( 4 5 6 7)
        movq      mm6,mm2
        punpcklbw mm2,mm3               ; mm2=row[+1][1]( 0 1 2 3)
        punpckhbw mm6,mm3               ; mm6=row[+1][1]( 4 5 6 7)

        pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]

        paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
        paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
        paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
        paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)

        movq    MMWORD [edx+2*SIZEOF_MMWORD], mm1       ; temporarily save
        movq    MMWORD [edx+3*SIZEOF_MMWORD], mm5       ; the intermediate data
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm6

        psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
        psllq   mm2,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)

        movq    MMWORD [wk(2)], mm1
        movq    MMWORD [wk(3)], mm2

.upsample:
        ; Entered with ebx = GOT address (pushed by either path above).
        ; -- process the upper row

        movq    mm7, MMWORD [edx+0*SIZEOF_MMWORD]       ; mm7=Int0L=( 0 1 2 3)
        movq    mm3, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm3=Int0H=( 4 5 6 7)

        movq    mm0,mm7
        movq    mm4,mm3
        psrlq   mm0,2*BYTE_BIT                  ; mm0=( 1 2 3 -)
        psllq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
        movq    mm5,mm7
        movq    mm6,mm3
        psrlq   mm5,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
        psllq   mm6,2*BYTE_BIT                  ; mm6=( - 4 5 6)

        por     mm0,mm4                         ; mm0=( 1 2 3 4)
        por     mm5,mm6                         ; mm5=( 3 4 5 6)

        movq    mm1,mm7
        movq    mm2,mm3
        psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
        psrlq   mm2,2*BYTE_BIT                  ; mm2=( 5 6 7 -)
        movq    mm4,mm3
        psrlq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)

        por     mm1, MMWORD [wk(0)]             ; mm1=(-1 0 1 2)
        por     mm2, MMWORD [wk(2)]             ; mm2=( 5 6 7 8)

        ; Word 7 becomes the "-1" neighbour of the next block.
        movq    MMWORD [wk(0)], mm4

        pmullw  mm7,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm3,[GOTOFF(ebx,PW_THREE)]
        paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm5,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm0,[GOTOFF(ebx,PW_SEVEN)]
        paddw   mm2,[GOTOFF(ebx,PW_SEVEN)]

        paddw   mm1,mm7
        paddw   mm5,mm3
        psrlw   mm1,4                   ; mm1=Out0LE=( 0  2  4  6)
        psrlw   mm5,4                   ; mm5=Out0HE=( 8 10 12 14)
        paddw   mm0,mm7
        paddw   mm2,mm3
        psrlw   mm0,4                   ; mm0=Out0LO=( 1  3  5  7)
        psrlw   mm2,4                   ; mm2=Out0HO=( 9 11 13 15)

        psllw   mm0,BYTE_BIT
        psllw   mm2,BYTE_BIT
        por     mm1,mm0                 ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
        por     mm5,mm2                 ; mm5=Out0H=( 8  9 10 11 12 13 14 15)

        ; Final samples overwrite the intermediates in the upper output row.
        movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1
        movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5

        ; -- process the lower row

        movq    mm6, MMWORD [edi+0*SIZEOF_MMWORD]       ; mm6=Int1L=( 0 1 2 3)
        movq    mm4, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm4=Int1H=( 4 5 6 7)

        movq    mm7,mm6
        movq    mm3,mm4
        psrlq   mm7,2*BYTE_BIT                  ; mm7=( 1 2 3 -)
        psllq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
        movq    mm0,mm6
        movq    mm2,mm4
        psrlq   mm0,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
        psllq   mm2,2*BYTE_BIT                  ; mm2=( - 4 5 6)

        por     mm7,mm3                         ; mm7=( 1 2 3 4)
        por     mm0,mm2                         ; mm0=( 3 4 5 6)

        movq    mm1,mm6
        movq    mm5,mm4
        psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
        psrlq   mm5,2*BYTE_BIT                  ; mm5=( 5 6 7 -)
        movq    mm3,mm4
        psrlq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)

        por     mm1, MMWORD [wk(1)]             ; mm1=(-1 0 1 2)
        por     mm5, MMWORD [wk(3)]             ; mm5=( 5 6 7 8)

        ; Word 7 becomes the "-1" neighbour of the next block.
        movq    MMWORD [wk(1)], mm3

        pmullw  mm6,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
        paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm0,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm7,[GOTOFF(ebx,PW_SEVEN)]
        paddw   mm5,[GOTOFF(ebx,PW_SEVEN)]

        paddw   mm1,mm6
        paddw   mm0,mm4
        psrlw   mm1,4                   ; mm1=Out1LE=( 0  2  4  6)
        psrlw   mm0,4                   ; mm0=Out1HE=( 8 10 12 14)
        paddw   mm7,mm6
        paddw   mm5,mm4
        psrlw   mm7,4                   ; mm7=Out1LO=( 1  3  5  7)
        psrlw   mm5,4                   ; mm5=Out1HO=( 9 11 13 15)

        psllw   mm7,BYTE_BIT
        psllw   mm5,BYTE_BIT
        por     mm1,mm7                 ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
        por     mm0,mm5                 ; mm0=Out1H=( 8  9 10 11 12 13 14 15)

        ; Final samples overwrite the intermediates in the lower output row.
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm1
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm0

        poppic  ebx                     ; ebx = inptr0 again

        ; One input block consumed per row, two output blocks produced.
        sub     eax, byte SIZEOF_MMWORD
        add     ecx, byte 1*SIZEOF_MMWORD       ; inptr1(above)
        add     ebx, byte 1*SIZEOF_MMWORD       ; inptr0
        add     esi, byte 1*SIZEOF_MMWORD       ; inptr1(below)
        add     edx, byte 2*SIZEOF_MMWORD       ; outptr0
        add     edi, byte 2*SIZEOF_MMWORD       ; outptr1
        cmp     eax, byte SIZEOF_MMWORD
        ja      near .columnloop                ; more than one block left
        test    eax,eax
        jnz     near .columnloop_last           ; exactly one block left

        pop     esi
        pop     edi
        pop     ecx
        pop     eax

        ; One input row yields two output rows, so rowctr drops by 2.
        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     ecx, byte 2                     ; rowctr
        jg      near .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        mov     esp,ebp         ; esp <- aligned ebp
        pop     esp             ; esp <- original ebp
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
;                          JDIMENSION output_width,
;                          JSAMPARRAY input_data,
;                          JSAMPARRAY * output_data_ptr);
;
; Every input sample is simply written twice to the output row.  The column
; loop is unrolled twice: each half reads one MMWORD of input and stores two
; MMWORDs of output.
;
; Regs:  edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
;        eax = colctr (output samples left in the row)
;        ecx = rowctr
;        esi = input_data  (row pointer array) / inptr  inside a row
;        edi = output_data (row pointer array) / outptr inside a row
; Clobb: eax, ecx, edx, mm0-mm3, flags.  ebp, esi, edi are restored.
;
; NOTE(review): because the width is rounded up, whole MMWORDs beyond
; output_width may be read and written; the row buffers presumably have to
; be padded accordingly -- confirm against the allocator.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define output_width(b)         (b)+12          ; JDIMENSION output_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_upsample_mmx)

EXTN(jsimd_h2v1_upsample_mmx):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        ; Round the output width up to whole pairs of MMWORDs; a zero
        ; result means there is nothing to do.
        mov     edx, JDIMENSION [output_width(ebp)]
        add     edx, byte (2*SIZEOF_MMWORD)-1
        and     edx, byte -(2*SIZEOF_MMWORD)
        jz      short .return

        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
        test    ecx,ecx
        jz      short .return

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]             ; inptr
        mov     edi, JSAMPROW [edi]             ; outptr
        mov     eax,edx                         ; colctr
        alignx  16,7
.columnloop:

        ; First half: unpacking a register with itself doubles every byte.
        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]

        movq      mm1,mm0
        punpcklbw mm0,mm0
        punpckhbw mm1,mm1

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        ; Second half: same operation on the next input MMWORD.
        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]

        movq      mm3,mm2
        punpcklbw mm2,mm2
        punpckhbw mm3,mm3

        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
        add     edi, byte 4*SIZEOF_MMWORD       ; outptr
        jmp     short .columnloop
        alignx  16,7

.nextrow:
        pop     esi
        pop     edi

        add     esi, byte SIZEOF_JSAMPROW       ; input_data
        add     edi, byte SIZEOF_JSAMPROW       ; output_data
        dec     ecx                             ; rowctr
        jg      short .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
;       pop     ebx             ; unused
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
;                          JDIMENSION output_width,
;                          JSAMPARRAY input_data,
;                          JSAMPARRAY * output_data_ptr);
;
; Every input sample is written twice horizontally, and the resulting row is
; stored to two consecutive output rows.  The column loop is unrolled twice:
; each half reads one MMWORD of input and stores two MMWORDs to each of the
; two output rows.
;
; Regs:  edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
;        eax = colctr (output samples left in the row)
;        ecx = rowctr (decremented by 2 per input row)
;        esi = input_data  (row pointer array) / inptr inside a row
;        edi = output_data (row pointer array) / outptr1 inside a row
;        ebx = outptr0
; Clobb: eax, ecx, edx, mm0-mm3, flags.  ebx, ebp, esi, edi are restored.
;
; NOTE(review): because the width is rounded up, whole MMWORDs beyond
; output_width may be read and written; the row buffers presumably have to
; be padded accordingly -- confirm against the allocator.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define output_width(b)         (b)+12          ; JDIMENSION output_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v2_upsample_mmx)

EXTN(jsimd_h2v2_upsample_mmx):
        push    ebp
        mov     ebp,esp
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        ; Round the output width up to whole pairs of MMWORDs; a zero
        ; result means there is nothing to do.
        mov     edx, JDIMENSION [output_width(ebp)]
        add     edx, byte (2*SIZEOF_MMWORD)-1
        and     edx, byte -(2*SIZEOF_MMWORD)
        jz      near .return

        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
        test    ecx,ecx
        jz      short .return

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]                     ; inptr
        mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
        mov     eax,edx                                 ; colctr
        alignx  16,7
.columnloop:

        ; First half: unpacking a register with itself doubles every byte;
        ; the same result is stored to both output rows.
        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]

        movq      mm1,mm0
        punpcklbw mm0,mm0
        punpckhbw mm1,mm1

        movq    MMWORD [ebx+0*SIZEOF_MMWORD], mm0
        movq    MMWORD [ebx+1*SIZEOF_MMWORD], mm1
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        ; Second half: same operation on the next input MMWORD.
        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]

        movq      mm3,mm2
        punpcklbw mm2,mm2
        punpckhbw mm3,mm3

        movq    MMWORD [ebx+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [ebx+3*SIZEOF_MMWORD], mm3
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
        add     ebx, byte 4*SIZEOF_MMWORD       ; outptr0
        add     edi, byte 4*SIZEOF_MMWORD       ; outptr1
        jmp     short .columnloop
        alignx  16,7

.nextrow:
        pop     esi
        pop     edi

        ; One input row yields two output rows, so rowctr drops by 2.
        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     ecx, byte 2                     ; rowctr
        jg      short .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16
