1
2/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
3 *
4 * For Intel/AMD x86 or x86-64 CPU (Pentium-MMX or later) and GNU C compiler.
5 *
6 * Last changed in libpng 1.2.19 August 18, 2007
7 * For conditions of distribution and use, see copyright notice in png.h
8 * Copyright (c) 1998 Intel Corporation
9 * Copyright (c) 1999-2002,2007 Greg Roelofs
10 * Copyright (c) 1998-2007 Glenn Randers-Pehrson
11 *
12 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
13 * Interface to libpng contributed by Gilles Vollant, 1999.
14 * GNU C port by Greg Roelofs, 1999-2001.
15 *
16 * References:
17 *
18 *     http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
19 *     http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
20 *       [Intel's performance analysis of the MMX vs. non-MMX code;
21 *        moved/deleted as of 2006, but text and some graphs still
22 *        available via WayBack Machine at archive.org]
23 *
24 *     http://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html
25 *     http://sam.zoy.org/blog/2007-04-13-shlib-with-non-pic-code-have-inline-assembly-and-pic-mix-well
26 *     http://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html
27 *     http://gcc.gnu.org/onlinedocs/gcc/Variable-Attributes.html
28 *     http://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html
29 *     AMD64 Architecture Programmer's Manual, volumes 1 and 5
30 *       [http://www.amd.com/us-en/Processors/TechnicalResources/0,,30_182_739_7044,00.html]
31 *     Intel 64 and IA-32 Software Developer's Manuals
32 *       [http://developer.intel.com/products/processor/manuals/]
33 *
34 * png_read_filter_row_mmx_*() were converted in place with intel2gas 1.3.1:
35 *
36 *     intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
37 *
38 * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
39 *
40 * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
41 * is required to assemble the newer asm instructions such as movq.  (Version
42 * 2.5.2l.15 is definitely too old.)  See ftp://ftp.gnu.org/pub/gnu/binutils/ .
43 */
44
45/*
46 * PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47 * ===========================
48 *
49 * 19991006:
50 *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
51 *
52 * 19991007:
53 *  - additional optimizations (possible or definite):
54 *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55 *     - write MMX code for 48-bit case (pixel_bytes == 6)
56 *     - figure out what's up with 24-bit case (pixel_bytes == 3):
57 *        why subtract 8 from width_mmx in the pass 4/5 case?
58 *        (only width_mmx case) (near line 2335)
59 *     x [DONE] replace pixel_bytes within each block with the true
60 *        constant value (or are compilers smart enough to do that?)
61 *     - rewrite all MMX interlacing code so it's aligned with
62 *        the *beginning* of the row buffer, not the end.  This
63 *        would not only allow one to eliminate half of the memory
64 *        writes for odd passes (that is, pass == odd), it may also
65 *        eliminate some unaligned-data-access exceptions (assuming
66 *        there's a penalty for not aligning 64-bit accesses on
67 *        64-bit boundaries).  The only catch is that the "leftover"
68 *        pixel(s) at the end of the row would have to be saved,
69 *        but there are enough unused MMX registers in every case,
70 *        so this is not a problem.  A further benefit is that the
71 *        post-MMX cleanup code (C code) in at least some of the
72 *        cases could be done within the assembler block.
73 *  x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74 *     inconsistent, and don't match the MMX Programmer's Reference
75 *     Manual conventions anyway.  They should be changed to
76 *     "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77 *     was lowest in memory (i.e., corresponding to a left pixel)
78 *     and b7 is the byte that was highest (i.e., a right pixel).
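 *
 *     (Illustrative sketch of this convention:  after "movq (%%esi), %%mm0"
 *     on little-endian x86, the byte at the lowest address -- the leftmost
 *     pixel -- lands in bits 0-7 of mm0, i.e. in b0, and the byte at the
 *     highest address lands in bits 56-63, i.e. in b7.)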
79 *
80 * 19991016:
81 *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
82 *     want globals prefixed by underscores when referencing them--
83 *     i.e., if the variable is const4, then refer to it as const4,
84 *     not _const4.  This seems to be a djgpp-specific requirement.
85 *     Also, such variables apparently *must* be declared outside
86 *     of functions; neither static nor automatic variables work if
87 *     defined within the scope of a single function, but both
88 *     static and truly global (multi-module) variables work fine.
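 *
 *     (Minimal sketch of the pattern that works with gcc/ELF on Linux:
 *     declare the variable at file scope and reference it in the asm string
 *     under exactly its C name --
 *
 *        static int _dif;                           // file scope; static OK
 *        __asm__ __volatile__ ("incl _dif" : : : "memory", "cc");
 *
 *     -- no extra leading underscore is added to the symbol.)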
89 *
90 * 19991017:
91 *  - replaced pixel_bytes in each png_memcpy() call with constant value for
92 *     inlining (png_do_read_interlace() "non-MMX/modified C code" block)
93 *
94 * 19991023:
95 *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
96 *  - switched from string-concatenation-with-macros to cleaner method of
97 *     renaming global variables for djgpp--i.e., always use prefixes in
98 *     inlined assembler code (== strings) and conditionally rename the
99 *     variables, not the other way around.  Hence _const4, _mask8_0, etc.
100 *
101 * 19991024:
102 *  - fixed mmxsupport()/png_do_read_interlace() first-row bug
103 *     This one was severely weird:  even though mmxsupport() doesn't touch
104 *     ebx (where "row" pointer was stored), it nevertheless managed to zero
105 *     the register (even in static/non-fPIC code--see below), which in turn
106 *     caused png_do_read_interlace() to return prematurely on the first row of
107 *     interlaced images (i.e., without expanding the interlaced pixels).
108 *     Inspection of the generated assembly code didn't turn up any clues,
109 *     although it did point at a minor optimization (i.e., get rid of
110 *     mmx_supported_local variable and just use eax).  Possibly the CPUID
111 *     instruction is more destructive than it looks?  (Not yet checked.)
112 *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
113 *     listings...  Apparently register spillage has to do with ebx, since
114 *     it's used to index the global offset table.  Commenting it out of the
115 *     input-reg lists in png_combine_row() eliminated compiler barfage, so
116 *     ifdef'd with __PIC__ macro:  if defined, use a global for unmask
117 *
118 * 19991107:
119 *  - verified CPUID clobberage:  the 12-char vendor string ("GenuineIntel",
120 *     "AuthenticAMD", etc.) lands in ebx, edx, ecx.  Still need to polish.

121 *
122 * 19991120:
123 *  - made "diff" variable (now "_dif") global to simplify conversion of
124 *     filtering routines (running out of regs, sigh).  "diff" is still used
125 *     in interlacing routines, however.
126 *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
127 *     macro determines which is used); original not yet tested.
128 *
129 * 20000213:
130 *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
131 *
132 * 20000319:
133 *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
134 *     pass == 4 or 5, that caused visible corruption of interlaced images
135 *
136 * 20000623:
137 *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
138 *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
139 *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
140 *     Chuck Wilson supplied a patch involving dummy output registers.  See
141 *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
142 *     for the original (anonymous) SourceForge bug report.
143 *
144 * 20000706:
145 *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
146 *       pnggccrd.c: In function `png_combine_row':
147 *       pnggccrd.c:525: more than 10 operands in `asm'
148 *       pnggccrd.c:669: more than 10 operands in `asm'
149 *       pnggccrd.c:828: more than 10 operands in `asm'
150 *       pnggccrd.c:994: more than 10 operands in `asm'
151 *       pnggccrd.c:1177: more than 10 operands in `asm'
152 *     They are all the same problem and can be worked around by using the
153 *     global _unmask variable unconditionally, not just in the -fPIC case.
154 *     Reportedly earlier versions of gcc also have the problem with more than
155 *     10 operands; they just don't report it.  Much strangeness ensues, etc.
156 *
157 * 20000729:
158 *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
159 *     MMX routine); began converting png_read_filter_row_mmx_sub()
160 *  - to finish remaining sections:
161 *     - clean up indentation and comments
162 *     - preload local variables
163 *     - add output and input regs (order of former determines numerical
164 *        mapping of latter)
165 *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
166 *     - remove "$" from addressing of Shift and Mask variables [20000823]
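 *
 *     (Sketch of the numbering rule referred to above:  in
 *
 *        __asm__ ("movl %1, %0" : "=r" (dst) : "r" (src));
 *
 *     the outputs are numbered first (%0 == dst) and the inputs continue the
 *     same sequence (%1 == src), so adding or reordering output operands
 *     renumbers every input operand as well.)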
167 *
168 * 20000731:
169 *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
170 *
171 * 20000822:
172 *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
173 *     shared-library (-fPIC) version!  Code works just fine as part of static
174 *     library.  Should have tested that sooner.
175 *     ebx is getting clobbered again (explicitly this time); need to save it
176 *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
177 *
178 * 20000823:
179 *  - first section was trickiest; all remaining sections have ebx -> edx now.
180 *     (-fPIC works again.)  Also added missing underscores to various Shift*
181 *     and *Mask* globals and got rid of leading "$" signs.
182 *
183 * 20000826:
184 *  - added visual separators to help navigate microscopic printed copies
185 *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
186 *     on png_read_filter_row_mmx_avg()
187 *
188 * 20000828:
189 *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
190 *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
191 *     cleaned up/shortened in either routine, but functionality is complete
192 *     and seems to be working fine.
193 *
194 * 20000829:
195 *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
196 *     as an input reg (with dummy output variables, etc.), then it *cannot*
197 *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
198 *     is simple enough...
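 *
 *     (Sketch of the workaround:  tie a dummy output variable to the register
 *     instead of naming it both as an input and as a clobber, e.g.
 *
 *        int dummy_value_c;    // receives whatever is left in ecx
 *        __asm__ ("decl %%ecx" : "=c" (dummy_value_c)  // ecx written => output
 *                              : "0" (len));           // same reg carries len in
 *
 *     which tells gcc the register is modified without ever listing it in the
 *     clobber section.)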
199 *
200 * 20000914:
201 *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
202 *     correctly (but 48-bit RGB just fine)
203 *
204 * 20000916:
205 *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
206 *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
207 *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
208 *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
209 *
210 * 20010101:
211 *  - added new png_init_mmx_flags() function (here only because it needs to
212 *     call mmxsupport(), which should probably become global png_mmxsupport());
213 *     modified other MMX routines to run conditionally (png_ptr->asm_flags)
214 *
215 * 20010103:
216 *  - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
217 *     and made it public; moved png_init_mmx_flags() to png.c as internal func
218 *
219 * 20010104:
220 *  - removed dependency on png_read_filter_row_c() (C code already duplicated
221 *     within MMX version of png_read_filter_row()) so no longer necessary to
222 *     compile it into pngrutil.o
223 *
224 * 20010310:
225 *  - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
226 *
227 * 20010808:
228 *  - added PNG_THREAD_UNSAFE_OK around code using global variables [GR-P]
229 *
230 * 20011124:
231 *  - fixed missing save of Eflag in png_mmx_support() [Maxim Sobolev]
232 *
233 * 20020304:
234 *  - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
235 *
236 * 20020407:
237 *  - fixed insufficient preservation of ebx register [Sami Farin]
238 *
239 * 20040724:
240 *  - more tinkering with clobber list at lines 4529 and 5033 to get it to
241 *     compile with gcc 3.4 [GR-P]
242 *
243 * 20040809:
244 *  - added "rim" definitions for CONST4 and CONST6 [GR-P]
245 *
246 * 20060303:
247 *  - added "OS2" to list of systems that don't need leading underscores [GR-P]
248 *
249 * 20060320:
250 *  - made PIC-compliant [Christian Aichinger]
251 *
252 * 20070313:
253 *  - finally applied Giuseppe Ghibò's 64-bit patch of 20060803 (completely
254 *     overlooked Dylan Alex Simon's similar patch of 20060414, oops...)
255 *
256 * 20070524:
257 *  - fixed link failure caused by asm-only variables being optimized out
258 *     (identified by Dimitri of Trolltech) with __attribute__((used)), which
259 *     also gets rid of warnings => nuked ugly png_squelch_warnings() hack
260 *  - dropped redundant ifdef
261 *  - moved png_mmx_support() back up where originally intended (as in
262 *     pngvcrd.c), using __attribute__((noinline)) in extra prototype
263 *
264 * 20070527:
265 *  - revised png_combine_row() to reuse mask in lieu of external _unmask
266 *  - moved 32-bit (RGBA) case to top of png_combine_row():  most common
267 *  - just about ready to give up on x86-64 -fPIC mode; can't even access 16
268 *     _mask*_* constants without triggering link error on shared library:
269 *       /usr/bin/ld: pnggccrd.pic.o: relocation R_X86_64_32S against `a local
270 *         symbol' can not be used when making a shared object; recompile with
271 *         -fPIC
272 *       pnggccrd.pic.o: could not read symbols: Bad value
273 *       ("objdump -x pnggccrd.pic.o | grep rodata" to verify)
274 *     [might be able to work around by doing within assembly code whatever
275 *     -fPIC does, but given problems to date, seems like long shot...]
276 *     [relevant ifdefs:  __x86_64__ && __PIC__ => C code only]
277 *  - changed #if 0 to #ifdef PNG_CLOBBER_MMX_REGS_SUPPORTED in case gcc ever
278 *     supports MMX regs (%mm0, etc.) in clobber list (not supported by gcc
279 *     2.7.2.3, 2.91.66 (egcs 1.1.2), 3.x, or 4.1.2)
280 *
281 * 20070603:
282 *  - revised png_combine_row() to use @GOTPCREL(%%rip) addressing on _c64
283 *     struct of _mask*_* constants for x86-64 -fPIC; see sam.zoy.org link
284 *     above for details
285 *  - moved _const4 and _const6 into _c64 struct, renamed to _amask5_3_0 and
286 *     _amask7_1_0, respectively
287 *  - can't figure out how to use _c64._mask*_* vars within asm code, so still
288 *     need single variables for non-x86-64/-fPIC half :-(
289 *  - replaced various __PIC__ ifdefs with *_GOT_ebx macros
290 *  - moved _LBCarryMask and _HBClearMask into _c64 struct
291 *  - conditionally replaced _p*temp variables with %r11d-%r13d (via p*_TEMP
292 *     and CLOBBER_r1*d macros)
293 *
294 * 20070604:
295 *  - replaced all _ActiveMask and _ActiveMaskEnd with new _amask*_*_* consts
296 *     (_amask naming convention:  numbers of 00-bytes, ff-bytes, 00-bytes)
297 *    - _ActiveMask     // (10) // avg/paeth/sub; read-only; consts; movq/pand
298 *       0x0000000000ffffffLL (bpp 3, avg)      _amask5_3_0
299 *       0xffffffffffffffffLL (bpp 4, 6, avg)   _amask0_8_0
300 *       0x000000000000ffffLL (bpp 2, avg)      _amask6_2_0
301 *       0x0000000000ffffffLL (bpp 3, paeth)    _amask5_3_0
302 *       0x00000000ffffffffLL (bpp 6, paeth)    _amask4_4_0
303 *       0x00000000ffffffffLL (bpp 4, paeth)    _amask4_4_0
304 *       0x00000000ffffffffLL (bpp 8, paeth)    _amask4_4_0
305 *       0x0000ffffff000000LL (bpp 3, sub)      _amask2_3_3
306 *       0x00000000ffff0000LL (bpp 2, sub)      _amask4_2_2
307 *    - _ActiveMaskEnd  // (1)  // paeth only; read-only; const; pand
308 *       0xffff000000000000LL (bpp 3, paeth)    _amask0_2_6
309 *  - changed all "#if defined(__x86_64__) // later // && defined(__PIC__)"
310 *     lines to "#ifdef PNG_x86_64_USE_GOTPCREL" for easier/safer testing
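 *
 *    (Worked example of the naming convention above:  _amask2_3_3 denotes
 *     2 bytes of 0x00, then 3 bytes of 0xff, then 3 bytes of 0x00, counted
 *     from the most significant byte down, i.e. 0x0000FFFFFF000000LL --
 *     matching the bpp-3 sub entry in the table.)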
311 *
312 * 20070605:
313 *  - merged PNG_x86_64_USE_GOTPCREL, non-PNG_x86_64_USE_GOTPCREL code via
314 *     *MASK* and LOAD/RESTORE macros
315 *
316 * 20070607:
317 *  - replaced all constant instances of _ShiftBpp, _ShiftRem with immediates
318 *     (still have two shared cases in avg, sub routines)
319 *
320 * 20070609:
321 *  - replaced remaining instances of _ShiftBpp, _ShiftRem with immediates
322 *     (split sub and avg 4/6-bpp cases into separate blocks)
323 *  - fixed paeth bug due to clobbered r11/r12/r13 regs
324 *
325 * 20070610:
326 *  - made global "_dif" variable (avg/paeth/sub routines) local again (now
327 *     "diff"--see 19991120 entry above), using register constraints
328 *  - note that %ebp in clobber list doesn't actually work, at least for 32-bit
329 *     version and gcc 4.1.2; must save and restore manually.  (Seems to work
330 *     OK for 64-bit version and gcc 3.4.3, but gcc may not be using ebp/rbp
331 *     in that case.)
332 *  - started replacing direct _MMXLength accesses with register constraints
333 *
334 * 20070612:
335 *  - continued replacing direct _MMXLength accesses with register constraints
336 *
337 * 20070613:
338 *  - finished replacing direct _MMXLength accesses with register constraints;
339 *     switched to local variable (and renamed back to MMXLength)
340 *
341 * 20070614:
342 *  - fixed sub bpp = 1 bug
343 *  - started replacing direct _FullLength accesses with register constraints
344 *
345 * 20070615:
346 *  - fixed 64-bit paeth bpp 3 crash bug (misplaced LOAD_GOT_rbp)
347 *  - fixed 64-bit paeth bpp 1/2 and cleanup-block crash bugs (misplaced
348 *     RESTORE_r11_r12_r13)
349 *  - slightly optimized avg/paeth cleanup blocks and paeth bpp 1/2 block
350 *     (save/restore ebx only if needed)
351 *  - continued replacing direct _FullLength accesses with register constraints
352 *
353 * 20070616:
354 *  - finished replacing direct _FullLength accesses with register constraints
355 *     (*ugly* conditional clobber-separator macros for avg and paeth, sigh)
356 *
357 * 20070618:
358 *  - fixed misplaced PNG_THREAD_UNSAFE_OK endif (was missing LOAD_GOT_rbp/
359 *     RESTORE_rbp in 32-bit thread-safe case)
360 *  - changed all "ifdef *" to "if defined(*)" [GR-P]
361 *
362 * 20070619:
363 *  - rearranged most bitdepth-related case statements to put most frequent
364 *     cases at top (24-bit, 32-bit, 8-bit, rest)
365 *
366 * 20070623:
367 *  - cleaned up png_debug() warnings/formatting
368 *  - removed PNG_MMX_CODE_SUPPORTED ifdefs and added outer __GNUC__ ifdef
369 *     (module no longer used by non-x86/non-GCC builds as of libpng 1.2.19)
370 *  - removed single libpng-1.2.x PNG_DEBUG dependency on 1.0.x png_struct
371 *     member (row_buf_size)
372 *  - rearranged pass-related if-blocks in png_do_read_interlace() to put most
373 *     frequent cases (4, 5) at top [GR-P suggestion]
374 *
375 * 20070624-29:
376 *  - fixed 64-bit crash bug:  pointers -> rsi/rdi, not esi/edi (switched to
377 *     %0/%1/%2/%3/%4 notation; eliminated size suffixes from relevant add/
378 *     inc/sub/mov instructions; changed dummy vars to pointers)
379 *     - png_combine_row()
380 *     - png_do_read_interlace()
381 *     - png_read_filter_row_mmx_avg()
382 *     - png_read_filter_row_mmx_paeth()
383 *     - png_read_filter_row_mmx_sub()
384 *     - png_read_filter_row_mmx_up()
385 *  - NOTE:  this fix makes use of the fact that modifying a 32-bit reg (e.g.,
386 *     %%ebx) clears the top half of its corresponding 64-bit reg (%%rbx), so
387 *     it's safe to mix 32-bit operations with 64-bit base/index addressing
388 *     (see new PSI/PAX/PBX/PDX/PBP/etc. "pointer-register" macros); applies
389 *     also to clobber lists
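 *
 *     (Sketch of the zero-extension fact relied on above:  on x86-64,
 *
 *        "movl  %%eax, %%ebx            \n\t"  // also zeroes bits 32-63 of rbx
 *        "movq  (%%rbx,%%rcx,1), %%mm0  \n\t"  // rbx safe as 64-bit base reg
 *
 *     so 32-bit arithmetic never leaves stale garbage in the upper half of
 *     the corresponding 64-bit register.)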
390 *
391 * 20070630:
392 *  - cleaned up formatting, macros, minor png_read_filter_row_mmx_sub() 8-bpp
393 *     register-usage inefficiency
394 *  - fixed 32-bit png_do_read_interlace() bug (was using pointer size for
395 *     64-bit dummy values)
396 *
397 * 20070703:
398 *  - added check for (manual) PIC macro to fix OpenBSD crash bug
399 *
400 * 20070717:
401 *  - fixed 48-bit png_combine_row() bug (was acting like 32-bit):  copy 6
402 *     bytes per pixel, not 4, and use stride of 6, not 4, in the second loop
403 *     of interlace processing of 48-bit pixels [GR-P]
404 *
405 * 20070722:
406 *  - fixed 64-bit png_uint_32 bug with MMXLength/FullLength temp vars
407 *
408 * [still broken:  tops of all row-filter blocks (input/output constraints);
409 *  shows up on 64-bit dynamic (-fPIC) version with -O2, especially if debug-
410 *  printfs enabled, but at right edge of odd-width images even if disabled]
411 *
412 *
413 * STILL TO DO:
414 *  - fix final thread-unsafe code using stack vars and pointer? (paeth top,
415 *     default, bottom only:  default, bottom already 5 reg constraints; could
416 *     replace bpp with pointer and group bpp/patemp/pbtemp/pctemp in array)
417 *  - fix ebp/no-reg-constraint inefficiency (avg/paeth/sub top)
418 *  - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
419 *  - write MMX code for 48-bit case (pixel_bytes == 6)
420 *  - figure out what's up with 24-bit case (pixel_bytes == 3):
421 *     why subtract 8 from width_mmx in the pass 4/5 case?  due to
422 *     odd number of bytes? (only width_mmx case) (near line 2335)
423 *  - rewrite all MMX interlacing code so it's aligned with beginning
424 *     of the row buffer, not the end (see 19991007 for details)
425 *  - add error messages to any remaining bogus default cases
426 *  - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
427 *  - try =r, etc., as reg constraints?  (would gcc use 64-bit ones on x86-64?)
428 *  - need full, non-graphical, CRC-based test suite...  maybe autogenerate
429 *     random data of various height/width/depth, compute CRCs, write (C
430 *     funcs), read (asm/MMX), recompute CRCs, and compare?
431 *  - write true x86-64 version using 128-bit "media instructions", %xmm0-15,
432 *     and extra general-purpose registers
433 */
434
435#if defined(__GNUC__)
436
437#define PNG_INTERNAL
438#include "png.h"
439
440
441/* for some inexplicable reason, gcc 3.3.5 on OpenBSD (and elsewhere?) does
442 * *not* define __PIC__ when the -fPIC option is used, so we have to rely on
443 * makefiles and whatnot to define the PIC macro explicitly */
444#if defined(PIC) && !defined(__PIC__)   // (this can/should move to pngconf.h)
445#  define __PIC__
446#endif
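
/* e.g., a makefile might compile the shared-library object with something
 * like "gcc -O2 -fPIC -DPIC -c pnggccrd.c -o pnggccrd.pic.o" so that PIC is
 * defined even where -fPIC alone fails to define __PIC__ (illustrative
 * command line only; the exact flags depend on the build system) */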
447
448#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
449
450/* if you want/need full thread-safety on x86-64 even when linking statically,
451 * comment out the "&& defined(__PIC__)" part here: */
452#if defined(__x86_64__) && defined(__PIC__)
453#  define PNG_x86_64_USE_GOTPCREL            // GOTPCREL => full thread-safety
454#  define PNG_CLOBBER_x86_64_REGS_SUPPORTED  // works as of gcc 3.4.3 ...
455#endif
456
457int PNGAPI png_mmx_support(void);
458
459#if defined(PNG_USE_LOCAL_ARRAYS)
460static PNG_CONST int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
461static PNG_CONST int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
462static PNG_CONST int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
463#endif
464
465/* djgpp, Win32, Cygwin, and OS2 add their own underscores to global variables,
466 * so define them without: */
467#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
468    defined(__OS2__)
469#  define _mmx_supported  mmx_supported
470#  define _mask8_0        mask8_0
471#  define _mask16_1       mask16_1
472#  define _mask16_0       mask16_0
473#  define _mask24_2       mask24_2
474#  define _mask24_1       mask24_1
475#  define _mask24_0       mask24_0
476#  define _mask32_3       mask32_3
477#  define _mask32_2       mask32_2
478#  define _mask32_1       mask32_1
479#  define _mask32_0       mask32_0
480#  define _mask48_5       mask48_5
481#  define _mask48_4       mask48_4
482#  define _mask48_3       mask48_3
483#  define _mask48_2       mask48_2
484#  define _mask48_1       mask48_1
485#  define _mask48_0       mask48_0
486#  define _amask5_3_0     amask5_3_0
487#  define _amask7_1_0     amask7_1_0
488#  define _LBCarryMask    LBCarryMask
489#  define _HBClearMask    HBClearMask
490#  define _amask0_8_0     amask0_8_0
491#  define _amask6_2_0     amask6_2_0
492#  define _amask4_4_0     amask4_4_0
493#  define _amask0_2_6     amask0_2_6
494#  define _amask2_3_3     amask2_3_3
495#  define _amask4_2_2     amask4_2_2
496#  if defined(PNG_THREAD_UNSAFE_OK)
497#    define _patemp       patemp
498#    define _pbtemp       pbtemp
499#    define _pctemp       pctemp
500#  endif
501#endif // djgpp, Win32, Cygwin, OS2
502
503
504/* These constants are used in the inlined MMX assembly code. */
505
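/* Rough sketch of how the _mask*_* constants are used:  each byte of a mask
 * holds a single set bit selecting one bit of the 8-bit pass mask.  The asm
 * replicates the inverted mask ("unmask") into all eight bytes of an MMX
 * register (movd + punpcklbw/punpcklwd/punpckldq), ANDs it with _mask*_*, and
 * compares the result with zero via pcmpeqb; the result is 0xff in every byte
 * belonging to a pixel that is to be combined and 0x00 elsewhere, ready for
 * the pand/pandn merge in the main loops below. */
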
506typedef unsigned long long  ull;
507
508#if defined(PNG_x86_64_USE_GOTPCREL)
509static PNG_CONST struct {
510    //ull _mask_array[26];
511
512    // png_combine_row() constants:
513    ull _mask8_0;
514    ull _mask16_0, _mask16_1;
515    ull _mask24_0, _mask24_1, _mask24_2;
516    ull _mask32_0, _mask32_1, _mask32_2, _mask32_3;
517    ull _mask48_0, _mask48_1, _mask48_2, _mask48_3, _mask48_4, _mask48_5;
518
519    // png_do_read_interlace() constants:
520    ull _amask5_3_0, _amask7_1_0;  // was _const4 and _const6, respectively
521
522    // png_read_filter_row_mmx_avg() constants (also uses _amask5_3_0):
523    ull _LBCarryMask, _HBClearMask;
524    ull _amask0_8_0, _amask6_2_0;  // was ActiveMask for bpp 4/6 and 2 cases
525
526    // png_read_filter_row_mmx_paeth() constants (also uses _amask5_3_0):
527    ull _amask4_4_0, _amask0_2_6;  // was ActiveMask{,End} for bpp 6/4/8 and 3
528
529    // png_read_filter_row_mmx_sub() constants:
530    ull _amask2_3_3, _amask4_2_2;  // was ActiveMask for bpp 3 and 2 cases
531
532} _c64 __attribute__((used, aligned(8))) = {
533
534    // png_combine_row() constants:
535    0x0102040810204080LL, // _mask8_0      offset 0
536
537    0x1010202040408080LL, // _mask16_0     offset 8
538    0x0101020204040808LL, // _mask16_1     offset 16
539
540    0x2020404040808080LL, // _mask24_0     offset 24
541    0x0408080810101020LL, // _mask24_1     offset 32
542    0x0101010202020404LL, // _mask24_2     offset 40
543
544    0x4040404080808080LL, // _mask32_0     offset 48
545    0x1010101020202020LL, // _mask32_1     offset 56
546    0x0404040408080808LL, // _mask32_2     offset 64
547    0x0101010102020202LL, // _mask32_3     offset 72
548
549    0x4040808080808080LL, // _mask48_0     offset 80
550    0x2020202040404040LL, // _mask48_1     offset 88
551    0x1010101010102020LL, // _mask48_2     offset 96
552    0x0404080808080808LL, // _mask48_3     offset 104
553    0x0202020204040404LL, // _mask48_4     offset 112
554    0x0101010101010202LL, // _mask48_5     offset 120
555
556    // png_do_read_interlace() constants:
557    0x0000000000FFFFFFLL, // _amask5_3_0   offset 128  (bpp 3, avg/paeth) const4
558    0x00000000000000FFLL, // _amask7_1_0   offset 136                     const6
559
560    // png_read_filter_row_mmx_avg() constants:
561    0x0101010101010101LL, // _LBCarryMask  offset 144
562    0x7F7F7F7F7F7F7F7FLL, // _HBClearMask  offset 152
563    0xFFFFFFFFFFFFFFFFLL, // _amask0_8_0   offset 160  (bpp 4/6, avg)
564    0x000000000000FFFFLL, // _amask6_2_0   offset 168  (bpp 2,   avg)
565
566    // png_read_filter_row_mmx_paeth() constants:
567    0x00000000FFFFFFFFLL, // _amask4_4_0   offset 176  (bpp 6/4/8, paeth)
568    0xFFFF000000000000LL, // _amask0_2_6   offset 184  (bpp 3, paeth)   A.M.End
569
570    // png_read_filter_row_mmx_sub() constants:
571    0x0000FFFFFF000000LL, // _amask2_3_3   offset 192  (bpp 3, sub)
572    0x00000000FFFF0000LL, // _amask4_2_2   offset 200  (bpp 2, sub)
573
574};
575
576#define MASK8_0        "(%%rbp)"
577#define MASK16_0       "8(%%rbp)"
578#define MASK16_1       "16(%%rbp)"
579#define MASK24_0       "24(%%rbp)"
580#define MASK24_1       "32(%%rbp)"
581#define MASK24_2       "40(%%rbp)"
582#define MASK32_0       "48(%%rbp)"
583#define MASK32_1       "56(%%rbp)"
584#define MASK32_2       "64(%%rbp)"
585#define MASK32_3       "72(%%rbp)"
586#define MASK48_0       "80(%%rbp)"
587#define MASK48_1       "88(%%rbp)"
588#define MASK48_2       "96(%%rbp)"
589#define MASK48_3       "104(%%rbp)"
590#define MASK48_4       "112(%%rbp)"
591#define MASK48_5       "120(%%rbp)"
592#define AMASK5_3_0     "128(%%rbp)"
593#define AMASK7_1_0     "136(%%rbp)"
594#define LB_CARRY_MASK  "144(%%rbp)"
595#define HB_CLEAR_MASK  "152(%%rbp)"
596#define AMASK0_8_0     "160(%%rbp)"
597#define AMASK6_2_0     "168(%%rbp)"
598#define AMASK4_4_0     "176(%%rbp)"
599#define AMASK0_2_6     "184(%%rbp)"
600#define AMASK2_3_3     "192(%%rbp)"
601#define AMASK4_2_2     "200(%%rbp)"
602
603#else // !PNG_x86_64_USE_GOTPCREL
604
605static PNG_CONST ull _mask8_0  __attribute__((used, aligned(8))) = 0x0102040810204080LL;
606
607static PNG_CONST ull _mask16_1 __attribute__((used, aligned(8))) = 0x0101020204040808LL;
608static PNG_CONST ull _mask16_0 __attribute__((used, aligned(8))) = 0x1010202040408080LL;
609
610static PNG_CONST ull _mask24_2 __attribute__((used, aligned(8))) = 0x0101010202020404LL;
611static PNG_CONST ull _mask24_1 __attribute__((used, aligned(8))) = 0x0408080810101020LL;
612static PNG_CONST ull _mask24_0 __attribute__((used, aligned(8))) = 0x2020404040808080LL;
613
614static PNG_CONST ull _mask32_3 __attribute__((used, aligned(8))) = 0x0101010102020202LL;
615static PNG_CONST ull _mask32_2 __attribute__((used, aligned(8))) = 0x0404040408080808LL;
616static PNG_CONST ull _mask32_1 __attribute__((used, aligned(8))) = 0x1010101020202020LL;
617static PNG_CONST ull _mask32_0 __attribute__((used, aligned(8))) = 0x4040404080808080LL;
618
619static PNG_CONST ull _mask48_5 __attribute__((used, aligned(8))) = 0x0101010101010202LL;
620static PNG_CONST ull _mask48_4 __attribute__((used, aligned(8))) = 0x0202020204040404LL;
621static PNG_CONST ull _mask48_3 __attribute__((used, aligned(8))) = 0x0404080808080808LL;
622static PNG_CONST ull _mask48_2 __attribute__((used, aligned(8))) = 0x1010101010102020LL;
623static PNG_CONST ull _mask48_1 __attribute__((used, aligned(8))) = 0x2020202040404040LL;
624static PNG_CONST ull _mask48_0 __attribute__((used, aligned(8))) = 0x4040808080808080LL;
625
626// png_do_read_interlace() constants:
627static PNG_CONST ull _amask5_3_0  __attribute__((used, aligned(8))) = 0x0000000000FFFFFFLL;  // was _const4
628static PNG_CONST ull _amask7_1_0  __attribute__((used, aligned(8))) = 0x00000000000000FFLL;  // was _const6
629
630// png_read_filter_row_mmx_avg() constants:
631static PNG_CONST ull _LBCarryMask __attribute__((used, aligned(8))) = 0x0101010101010101LL;
632static PNG_CONST ull _HBClearMask __attribute__((used, aligned(8))) = 0x7f7f7f7f7f7f7f7fLL;
633static PNG_CONST ull _amask0_8_0  __attribute__((used, aligned(8))) = 0xFFFFFFFFFFFFFFFFLL;
634static PNG_CONST ull _amask6_2_0  __attribute__((used, aligned(8))) = 0x000000000000FFFFLL;
635
636// png_read_filter_row_mmx_paeth() constants:
637static PNG_CONST ull _amask4_4_0  __attribute__((used, aligned(8))) = 0x00000000FFFFFFFFLL;
638static PNG_CONST ull _amask0_2_6  __attribute__((used, aligned(8))) = 0xFFFF000000000000LL;
639
640// png_read_filter_row_mmx_sub() constants:
641static PNG_CONST ull _amask2_3_3  __attribute__((used, aligned(8))) = 0x0000FFFFFF000000LL;
642static PNG_CONST ull _amask4_2_2  __attribute__((used, aligned(8))) = 0x00000000FFFF0000LL;
643
644#define MASK8_0        "_mask8_0"
645#define MASK16_0       "_mask16_0"
646#define MASK16_1       "_mask16_1"
647#define MASK24_0       "_mask24_0"
648#define MASK24_1       "_mask24_1"
649#define MASK24_2       "_mask24_2"
650#define MASK32_0       "_mask32_0"
651#define MASK32_1       "_mask32_1"
652#define MASK32_2       "_mask32_2"
653#define MASK32_3       "_mask32_3"
654#define MASK48_0       "_mask48_0"
655#define MASK48_1       "_mask48_1"
656#define MASK48_2       "_mask48_2"
657#define MASK48_3       "_mask48_3"
658#define MASK48_4       "_mask48_4"
659#define MASK48_5       "_mask48_5"
660#define AMASK5_3_0     "_amask5_3_0"
661#define AMASK7_1_0     "_amask7_1_0"
662#define LB_CARRY_MASK  "_LBCarryMask"
663#define HB_CLEAR_MASK  "_HBClearMask"
664#define AMASK0_8_0     "_amask0_8_0"
665#define AMASK6_2_0     "_amask6_2_0"
666#define AMASK4_4_0     "_amask4_4_0"
667#define AMASK0_2_6     "_amask0_2_6"
668#define AMASK2_3_3     "_amask2_3_3"
669#define AMASK4_2_2     "_amask4_2_2"
670
671#endif // ?PNG_x86_64_USE_GOTPCREL
672
673
674#if defined(PNG_HAVE_MMX_READ_FILTER_ROW) || defined(PNG_HAVE_MMX_COMBINE_ROW)
675
676// this block is specific to png_read_filter_row_mmx_paeth() except for
677// LOAD_GOT_rbp and RESTORE_rbp, which are also used in png_combine_row()
678#if defined(PNG_x86_64_USE_GOTPCREL)
679#  define pa_TEMP                "%%r11d"
680#  define pb_TEMP                "%%r12d"
681#  define pc_TEMP                "%%r13d"
682#  if defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)  // works as of gcc 3.4.3 ...
683#    define SAVE_r11_r12_r13
684#    define RESTORE_r11_r12_r13
685#    define _CLOBBER_r11_r12_r13 ,"%r11", "%r12", "%r13"
686#    define CLOBBER_r11_r12_r13  "%r11", "%r12", "%r13"
687#  else // !PNG_CLOBBER_x86_64_REGS_SUPPORTED
688#    define SAVE_r11_r12_r13     "pushq %%r11  \n\t" \
689                                 "pushq %%r12  \n\t" \
690                                 "pushq %%r13  \n\t"  // "normally 0-extended"
691#    define RESTORE_r11_r12_r13  "popq  %%r13  \n\t" \
692                                 "popq  %%r12  \n\t" \
693                                 "popq  %%r11  \n\t"
694#    define _CLOBBER_r11_r12_r13
695#    define CLOBBER_r11_r12_r13
696#  endif
697#  define LOAD_GOT_rbp           "pushq %%rbp                        \n\t" \
698                                 "movq  _c64@GOTPCREL(%%rip), %%rbp  \n\t"
699#  define RESTORE_rbp            "popq  %%rbp                        \n\t"
700#else // 32-bit and/or non-PIC
701#  if defined(PNG_THREAD_UNSAFE_OK)
702     // These variables are used in png_read_filter_row_mmx_paeth() and would be
703     //   local variables if not for gcc-inline-assembly addressing limitations
704     //   (some apparently related to ELF format, others to CPU type).
705     //
706     // WARNING: Their presence defeats the thread-safety of libpng.
707     static int                     _patemp  __attribute__((used));
708     static int                     _pbtemp  __attribute__((used));
709     static int                     _pctemp  __attribute__((used));
710#    define pa_TEMP                "_patemp"
711#    define pb_TEMP                "_pbtemp"  // temp variables for
712#    define pc_TEMP                "_pctemp"  //  Paeth routine
713#    define SAVE_r11_r12_r13
714#    define RESTORE_r11_r12_r13
715#    define _CLOBBER_r11_r12_r13   // not using regs => not clobbering
716#    define CLOBBER_r11_r12_r13
717#  endif // PNG_THREAD_UNSAFE_OK
718#  define LOAD_GOT_rbp
719#  define RESTORE_rbp
720#endif
721
722#if defined(__x86_64__)
723#  define SAVE_ebp
724#  define RESTORE_ebp
725#  define _CLOBBER_ebp         ,"%ebp"
726#  define CLOBBER_ebp          "%ebp"
727#  define SAVE_FullLength      "movl %%eax, %%r15d  \n\t"
728#  define RESTORE_FullLength   "movl %%r15d, "     // may go into eax or ecx
729#  if defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)   // works as of gcc 3.4.3 ...
730#    define SAVE_r15
731#    define RESTORE_r15
732#    define _CLOBBER_r15       ,"%r15"
733#  else
734#    define SAVE_r15           "pushq %%r15  \n\t"
735#    define RESTORE_r15        "popq  %%r15  \n\t"
736#    define _CLOBBER_r15
737#  endif
738#  define PBP                  "%%rbp"             // regs used for 64-bit
739#  define PAX                  "%%rax"             //  pointers or in
740#  define PBX                  "%%rbx"             //  combination with
741#  define PCX                  "%%rcx"             //  64-bit pointer-regs
742#  define PDX                  "%%rdx"             //  (base/index pairs,
743#  define PSI                  "%%rsi"             //  add/sub/mov pairs)
744#  define CLEAR_BOTTOM_3_BITS  "and  $0xfffffffffffffff8, "
745#else
746#  define SAVE_ebp             "pushl %%ebp \n\t"  // clobber list doesn't work
747#  define RESTORE_ebp          "popl  %%ebp \n\t"  //  for %ebp on 32-bit; not
748#  define _CLOBBER_ebp                             //  clear why not
749#  define CLOBBER_ebp
750#  define SAVE_FullLength      "pushl %%eax \n\t"
751#  define RESTORE_FullLength   "popl "             // eax (avg) or ecx (paeth)
752#  define SAVE_r15
753#  define RESTORE_r15
754#  define _CLOBBER_r15
755#  define PBP                  "%%ebp"             // regs used for or in
756#  define PAX                  "%%eax"             //  combination with
757#  define PBX                  "%%ebx"             //  "normal," 32-bit
758#  define PCX                  "%%ecx"             //  pointers
759#  define PDX                  "%%edx"
760#  define PSI                  "%%esi"
761#  define CLEAR_BOTTOM_3_BITS  "and  $0xfffffff8, "
762#endif
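
// Illustrative note:  these are string fragments, spliced together by
// compile-time concatenation so the same asm source serves both word sizes,
// e.g. (hypothetical lines)
//
//      "add  $24, " PSI "        \n\t"   // "add $24, %%esi" or "..., %%rsi"
//      CLEAR_BOTTOM_3_BITS  PCX "\n\t"   // round pcx down to a multiple of 8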
763
764// CLOB_COMMA_ebx_ebp:  need comma ONLY if both CLOBBER_ebp and CLOBBER_GOT_ebx
765//                      have values, i.e., only if __x86_64__ AND !__PIC__
766#if defined(__x86_64__) && !defined(__PIC__)
767#  define CLOB_COMMA_ebx_ebp    , // clobbering both ebp and ebx => need comma
768#else
769#  define CLOB_COMMA_ebx_ebp
770#endif
771
772// CLOB_COMMA_ebX_r1X:  need comma UNLESS both CLOBBER_ebp and CLOBBER_GOT_ebx
773//                   are empty OR CLOBBER_r11_r12_r13 is empty--i.e., NO comma
774//                   if (!__x86_64__ AND __PIC__) OR !(PNG_x86_64_USE_GOTPCREL
775//                   AND PNG_CLOBBER_x86_64_REGS_SUPPORTED)   (double sigh...)
776#if (!defined(__x86_64__) && defined(__PIC__)) || \
777    !defined(PNG_x86_64_USE_GOTPCREL) || \
778    !defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)
779#  define CLOB_COMMA_ebX_r1X
780#else
781#  define CLOB_COMMA_ebX_r1X    , // clobbering (ebp OR ebx) AND r11_r12_r13
782#endif
783
784// CLOB_COLON_ebx_ebp:  need colon unless CLOBBER_ebp and CLOBBER_GOT_ebx are
785//                      BOTH empty--i.e., NO colon if (!__x86_64__ AND __PIC__)
786// CLOB_COLON_ebx_ebp_r1X:  if, in addition, CLOBBER_r11_r12_r13 is empty, then
787//                          no colon for Paeth blocks, either--i.e., NO colon
788//                          if !(PNG_x86_64_USE_GOTPCREL AND
789//                               PNG_CLOBBER_x86_64_REGS_SUPPORTED)
790#if (!defined(__x86_64__) && defined(__PIC__))
791#  define CLOB_COLON_ebx_ebp
792#  if !(defined(PNG_x86_64_USE_GOTPCREL) && \
793        defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED))
794#    define CLOB_COLON_ebx_ebp_r1X
795#  else
796#    define CLOB_COLON_ebx_ebp_r1X  : // clobbering ebp OR ebx OR r11_r12_r13
797#  endif
798#else
799#  define CLOB_COLON_ebx_ebp        : // clobbering ebp OR ebx
800#  define CLOB_COLON_ebx_ebp_r1X    : // clobbering ebp OR ebx OR r11_r12_r13
801#endif
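
// Rough sketch of the splicing these separators make possible (hypothetical
// clobber section):
//
//      CLOB_COLON_ebx_ebp CLOBBER_ebp CLOB_COMMA_ebx_ebp CLOBBER_GOT_ebx
//
// If CLOBBER_ebp and CLOBBER_GOT_ebx both expand to nothing, the leading
// colon and the comma vanish with them, so the asm statement stays valid.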
802
803#endif // PNG_HAVE_MMX_READ_FILTER_ROW || PNG_HAVE_MMX_COMBINE_ROW
804
805#if defined(__PIC__)  // macros to save, restore index to Global Offset Table
806#  if defined(__x86_64__)
807#    define SAVE_GOT_ebx     "pushq %%rbx \n\t"
808#    define RESTORE_GOT_ebx  "popq  %%rbx \n\t"
809#  else
810#    define SAVE_GOT_ebx     "pushl %%ebx \n\t"
811#    define RESTORE_GOT_ebx  "popl  %%ebx \n\t"
812#  endif
813#  define _CLOBBER_GOT_ebx   // explicitly saved, restored => not clobbered
814#  define CLOBBER_GOT_ebx
815#else
816#  define SAVE_GOT_ebx
817#  define RESTORE_GOT_ebx
818#  define _CLOBBER_GOT_ebx   ,"%ebx"
819#  define CLOBBER_GOT_ebx    "%ebx"
820#endif
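
// (Sketch of the intended usage:  an asm block that needs ebx brackets its
//  use of the register with these macros and reports the clobber only when
//  it did not save the register by hand, e.g.
//
//      SAVE_GOT_ebx
//      "movl %%eax, %%ebx   \n\t"   // ...ebx free for scratch use here...
//      RESTORE_GOT_ebx
//      ...
//      : ... _CLOBBER_GOT_ebx       // adds ",%ebx" only in the non-PIC case
//
//  so PIC builds never declare the GOT pointer clobbered.)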
821
822#if defined(PNG_HAVE_MMX_COMBINE_ROW) || defined(PNG_HAVE_MMX_READ_INTERLACE)
823#  define BPP2  2
824#  define BPP3  3  // bytes per pixel (a.k.a. pixel_bytes)
825#  define BPP4  4  // (defined only to help avoid cut-and-paste errors)
826#  define BPP6  6
827#  define BPP8  8
828#endif
829
830
831
832static int _mmx_supported = 2; // 0: no MMX; 1: MMX supported; 2: not tested
833
834/*===========================================================================*/
835/*                                                                           */
836/*                      P N G _ M M X _ S U P P O R T                        */
837/*                                                                           */
838/*===========================================================================*/
839
840// GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
841//             (2) all instructions compile with gcc 2.7.2.3 and later
842//           x (3) the function is moved down here to prevent gcc from
843//           x      inlining it in multiple places and then barfing be-
844//           x      cause the ".NOT_SUPPORTED" label is multiply defined
845//                  [need to retest with gcc 2.7.2.3]
846
847// GRR 20070524:  This declaration apparently is compatible with but supersedes
848//   the one in png.h; in any case, the generated object file is slightly
849//   smaller.  It is unnecessary with gcc 4.1.2, but gcc 2.x apparently
850//   replicated the ".NOT_SUPPORTED" label in each location the function was
851//   inlined, leading to compilation errors due to the "multiply defined"
852//   label.  Old workaround was to leave the function at the end of this
853//   file; new one (still testing) is to use a gcc-specific function attribute
854//   to prevent local inlining.
855int PNGAPI
856png_mmx_support(void) __attribute__((noinline));
857
858int PNGAPI
859png_mmx_support(void)
860{
861#if defined(PNG_MMX_CODE_SUPPORTED)  // superfluous, but what the heck
862    int result;
863    __asm__ __volatile__ (
864#if defined(__x86_64__)
865        "pushq %%rbx          \n\t"  // rbx gets clobbered by CPUID instruction
866        "pushq %%rcx          \n\t"  // so does rcx...
867        "pushq %%rdx          \n\t"  // ...and rdx (but rcx & rdx safe on Linux)
868        "pushfq               \n\t"  // save Eflag to stack
869        "popq %%rax           \n\t"  // get Eflag from stack into rax
870        "movq %%rax, %%rcx    \n\t"  // make another copy of Eflag in rcx
871        "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
872        "pushq %%rax          \n\t"  // save modified Eflag back to stack
873        "popfq                \n\t"  // restore modified value to Eflag reg
874        "pushfq               \n\t"  // save Eflag to stack
875        "popq %%rax           \n\t"  // get Eflag from stack
876        "pushq %%rcx          \n\t"  // save original Eflag to stack
877        "popfq                \n\t"  // restore original Eflag
878#else
879        "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
880        "pushl %%ecx          \n\t"  // so does ecx...
881        "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
882        "pushfl               \n\t"  // save Eflag to stack
883        "popl %%eax           \n\t"  // get Eflag from stack into eax
884        "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
885        "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
886        "pushl %%eax          \n\t"  // save modified Eflag back to stack
887        "popfl                \n\t"  // restore modified value to Eflag reg
888        "pushfl               \n\t"  // save Eflag to stack
889        "popl %%eax           \n\t"  // get Eflag from stack
890        "pushl %%ecx          \n\t"  // save original Eflag to stack
891        "popfl                \n\t"  // restore original Eflag
892#endif
893        "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
894        "jz 0f                \n\t"  // if same, CPUID instr. is not supported
895
896        "xorl %%eax, %%eax    \n\t"  // set eax to zero
897//      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
898        "cpuid                \n\t"  // get the CPU identification info
899        "cmpl $1, %%eax       \n\t"  // make sure eax returned a non-zero value
900        "jl 0f                \n\t"  // if eax is less than 1, MMX not supported
901
902        "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
903        "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
904                                     // faster than the instruction "mov eax, 1"
905        "cpuid                \n\t"  // get the CPU identification info again
906        "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
907        "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
908        "jz 0f                \n\t"  // if zero (MMX bit clear), MMX not supported
909
910        "movl $1, %%eax       \n\t"  // set return value to 1
911        "jmp  1f              \n\t"  // DONE:  have MMX support
912
913    "0:                       \n\t"  // .NOT_SUPPORTED: target label for jump instructions
914        "movl $0, %%eax       \n\t"  // set return value to 0
915    "1:                       \n\t"  // .RETURN: target label for jump instructions
916#if defined(__x86_64__)
917        "popq %%rdx           \n\t"  // restore rdx
918        "popq %%rcx           \n\t"  // restore rcx
919        "popq %%rbx           \n\t"  // restore rbx
920#else
921        "popl %%edx           \n\t"  // restore edx
922        "popl %%ecx           \n\t"  // restore ecx
923        "popl %%ebx           \n\t"  // restore ebx
924#endif
925
926//      "ret                  \n\t"  // DONE:  no MMX support
927                                     // (fall through to standard C "ret")
928
929        : "=a" (result)              // output list
930
931        :                            // any variables used on input (none)
932
933                                     // no clobber list
934//      , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually
935//      , "memory"   // if write to a variable gcc thought was in a reg
936//      , "cc"       // "condition codes" (flag bits)
937    );
938    _mmx_supported = result;
939#else
940    _mmx_supported = 0;
941#endif /* PNG_MMX_CODE_SUPPORTED */
942
943    return _mmx_supported;
944}
945
946
947/*===========================================================================*/
948/*                                                                           */
949/*                       P N G _ C O M B I N E _ R O W                       */
950/*                                                                           */
951/*===========================================================================*/
952
953#if defined(PNG_HAVE_MMX_COMBINE_ROW)
954
955/* Combines the row recently read in with the previous row.
956   This routine takes care of alpha and transparency if requested.
957   This routine also handles the two methods of progressive display
958   of interlaced images, depending on the mask value.
959   The mask value describes which pixels are to be combined with
960   the row.  The pattern always repeats every 8 pixels, so just 8
961   bits are needed.  A one indicates the pixel is to be combined; a
962   zero indicates the pixel is to be skipped.  This is in addition
963   to any alpha or transparency value associated with the pixel.
964   If you want all pixels to be combined, pass 0xff (255) in mask. */
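
/* For example, a mask of 0xaa (binary 10101010) combines pixels 0, 2, 4, and
   6 of each group of eight, counting from the left; the most significant bit
   of the mask corresponds to the leftmost pixel in the group. */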
965
966/* Use this routine for the x86 platform - it uses a faster MMX routine
967   if the machine supports MMX. */
968
969void /* PRIVATE */
970png_combine_row(png_structp png_ptr, png_bytep row, int mask)
971{
972   int dummy_value_a;    // fix 'forbidden register spilled' error
973   int dummy_value_c;
974   int dummy_value_d;
975   png_bytep dummy_value_S;
976   png_bytep dummy_value_D;
977
978   png_debug(1, "in png_combine_row (pnggccrd.c)\n");
979
980   if (_mmx_supported == 2) {
981#if !defined(PNG_1_0_X)
982       /* this should have happened in png_init_mmx_flags() already */
983       png_warning(png_ptr, "asm_flags may not have been initialized");
984#endif
985       png_mmx_support();
986   }
987
988   if (mask == 0xff)
989   {
990      png_debug(2,"mask == 0xff:  doing single png_memcpy()\n");
991      png_memcpy(row, png_ptr->row_buf + 1,
992       (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
993   }
994   else   /* (png_combine_row() is never called with mask == 0) */
995   {
996      switch (png_ptr->row_info.pixel_depth)
997      {
998         case 24:       /* png_ptr->row_info.pixel_depth */
999         {
1000            png_bytep srcptr;
1001            png_bytep dstptr;
1002
1003#if !defined(PNG_1_0_X)
1004            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1005#else
1006            if (_mmx_supported)
1007#endif
1008            {
1009               png_uint_32 len;
1010               int diff;
1011
1012               srcptr = png_ptr->row_buf + 1;
1013               dstptr = row;
1014               len  = png_ptr->width & ~7;          // reduce to multiple of 8
1015               diff = (int) (png_ptr->width & 7);   // amount lost
1016
1017               __asm__ __volatile__ (
1018                  "not       %%edx            \n\t" // mask => unmask
1019                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
1020                  "not       %%edx            \n\t" // unmask => mask for later
1021                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1022                  "punpcklbw %%mm7, %%mm7     \n\t"
1023                  "punpcklwd %%mm7, %%mm7     \n\t"
1024                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1025
1026                  LOAD_GOT_rbp
1027                  "movq   " MASK24_0 ", %%mm0 \n\t" // _mask24_0 -> mm0
1028                  "movq   " MASK24_1 ", %%mm1 \n\t" // _mask24_1 -> mm1
1029                  "movq   " MASK24_2 ", %%mm2 \n\t" // _mask24_2 -> mm2
1030                  RESTORE_rbp
1031
1032                  "pand      %%mm7, %%mm0     \n\t"
1033                  "pand      %%mm7, %%mm1     \n\t"
1034                  "pand      %%mm7, %%mm2     \n\t"
1035
1036                  "pcmpeqb   %%mm6, %%mm0     \n\t"
1037                  "pcmpeqb   %%mm6, %%mm1     \n\t"
1038                  "pcmpeqb   %%mm6, %%mm2     \n\t"
1039
1040// preload        "movl      len, %%ecx       \n\t" // load length of line
1041// preload        "movl      srcptr, %3       \n\t" // load source
1042// preload        "movl      dstptr, %4       \n\t" // load dest
1043
1044                  "cmpl      $0, %%ecx        \n\t"
1045                  "jz        mainloop24end    \n\t"
1046
1047                "mainloop24:                  \n\t"
1048                  "movq      (%3), %%mm4      \n\t"
1049                  "pand      %%mm0, %%mm4     \n\t"
1050                  "movq      %%mm0, %%mm6     \n\t"
1051                  "movq      (%4), %%mm7      \n\t"
1052                  "pandn     %%mm7, %%mm6     \n\t"
1053                  "por       %%mm6, %%mm4     \n\t"
1054                  "movq      %%mm4, (%4)      \n\t"
1055
1056                  "movq      8(%3), %%mm5     \n\t"
1057                  "pand      %%mm1, %%mm5     \n\t"
1058                  "movq      %%mm1, %%mm7     \n\t"
1059                  "movq      8(%4), %%mm6     \n\t"
1060                  "pandn     %%mm6, %%mm7     \n\t"
1061                  "por       %%mm7, %%mm5     \n\t"
1062                  "movq      %%mm5, 8(%4)     \n\t"
1063
1064                  "movq      16(%3), %%mm6    \n\t"
1065                  "pand      %%mm2, %%mm6     \n\t"
1066                  "movq      %%mm2, %%mm4     \n\t"
1067                  "movq      16(%4), %%mm7    \n\t"
1068                  "pandn     %%mm7, %%mm4     \n\t"
1069                  "por       %%mm4, %%mm6     \n\t"
1070                  "movq      %%mm6, 16(%4)    \n\t"
1071
1072                  "add       $24, %3          \n\t" // inc by 24 bytes processed
1073                  "add       $24, %4          \n\t"
1074                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1075
1076                  "ja        mainloop24       \n\t"
1077
1078                "mainloop24end:               \n\t"
1079// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1080                  "movl      %%eax, %%ecx     \n\t"
1081                  "cmpl      $0, %%ecx        \n\t"
1082                  "jz        end24            \n\t"
1083// preload        "movl      mask, %%edx      \n\t"
1084                  "sall      $24, %%edx       \n\t" // shift 8-bit mask into high byte
1085
1086                "secondloop24:                \n\t"
1087                  "sall      %%edx            \n\t" // move high bit to CF
1088                  "jnc       skip24           \n\t" // if CF = 0
1089                  "movw      (%3), %%ax       \n\t"
1090                  "movw      %%ax, (%4)       \n\t"
1091                  "xorl      %%eax, %%eax     \n\t"
1092                  "movb      2(%3), %%al      \n\t"
1093                  "movb      %%al, 2(%4)      \n\t"
1094
1095                "skip24:                      \n\t"
1096                  "add       $3, %3           \n\t"
1097                  "add       $3, %4           \n\t"
1098                  "decl      %%ecx            \n\t"
1099                  "jnz       secondloop24     \n\t"
1100
1101                "end24:                       \n\t"
1102                  "EMMS                       \n\t" // DONE
1103
1104                  : "=a" (dummy_value_a),           // output regs (dummy)
1105                    "=d" (dummy_value_d),
1106                    "=c" (dummy_value_c),
1107                    "=S" (dummy_value_S),
1108                    "=D" (dummy_value_D)
1109
1110                  : "0" (diff),        // eax       // input regs
1111                    "1" (mask),        // edx
1112                    "2" (len),         // ecx
1113// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1114                    "3" (srcptr),      // esi/rsi
1115                    "4" (dstptr)       // edi/rdi
1116
1117#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
1118                  : "%mm0", "%mm1", "%mm2"          // clobber list
1119                  , "%mm4", "%mm5", "%mm6", "%mm7"
1120#endif
1121               );
1122            }
1123            else /* not _mmx_supported - use modified C routine */
1124            {
1125               register png_uint_32 i;
1126               png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
1127                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1128               register int stride = BPP3 * png_pass_inc[png_ptr->pass];
1129                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1130               register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
1131                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1132               png_uint_32 len = png_ptr->width & ~7;  /* reduce to mult. of 8 */
1133               int diff = (int) (png_ptr->width & 7); /* amount lost */
1134               register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */
1135
1136               srcptr = png_ptr->row_buf + 1 + initial_val;
1137               dstptr = row + initial_val;
1138
1139               for (i = initial_val; i < final_val; i += stride)
1140               {
1141                  png_memcpy(dstptr, srcptr, rep_bytes);
1142                  srcptr += stride;
1143                  dstptr += stride;
1144               }
1145               if (diff)  /* number of leftover pixels:  3 for pngtest */
1146               {
1147                  final_val += diff*BPP3;
1148                  for (; i < final_val; i += stride)
1149                  {
1150                     if (rep_bytes > (int)(final_val-i))
1151                        rep_bytes = (int)(final_val-i);
1152                     png_memcpy(dstptr, srcptr, rep_bytes);
1153                     srcptr += stride;
1154                     dstptr += stride;
1155                  }
1156               }
1157            } /* end of else (_mmx_supported) */
1158
1159            break;
1160         }       /* end 24 bpp */
1161
1162         // formerly claimed to be most common case (combining 32-bit RGBA),
1163         // but almost certainly less common than 24-bit RGB case
1164         case 32:       /* png_ptr->row_info.pixel_depth */
1165         {
1166            png_bytep srcptr;
1167            png_bytep dstptr;
1168
1169#if !defined(PNG_1_0_X)
1170            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1171#else
1172            if (_mmx_supported)
1173#endif
1174            {
1175               png_uint_32 len;
1176               int diff;
1177
1178               srcptr = png_ptr->row_buf + 1;
1179               dstptr = row;
1180               len  = png_ptr->width & ~7;          // reduce to multiple of 8
1181               diff = (int) (png_ptr->width & 7);   // amount lost
1182
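               /* Illustrative (uncompiled) scalar sketch of what the MMX
                * block below computes for the first `len` pixels:  copy a
                * pixel from the new row only where its bit is set in `mask`,
                * otherwise keep the pixel already in the output row:
                *
                *    for (i = 0; i < len; i++)
                *       if (mask & (0x80 >> (i & 7)))
                *          png_memcpy(dstptr + i*4, srcptr + i*4, 4);
                *
                * The MMX code replaces the per-pixel branch with byte masks
                * (built at the top of the asm block) and 64-bit loads/stores.
                */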
1183               __asm__ __volatile__ (
1184                  "not       %%edx            \n\t" // mask => unmask
1185                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
1186                  "not       %%edx            \n\t" // unmask => mask for later
1187                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1188                  "punpcklbw %%mm7, %%mm7     \n\t"
1189                  "punpcklwd %%mm7, %%mm7     \n\t"
1190                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1191
1192                  LOAD_GOT_rbp
1193                  "movq   " MASK32_0 ", %%mm0 \n\t" // _mask32_0
1194                  "movq   " MASK32_1 ", %%mm1 \n\t" // _mask32_1
1195                  "movq   " MASK32_2 ", %%mm2 \n\t" // _mask32_2
1196                  "movq   " MASK32_3 ", %%mm3 \n\t" // _mask32_3
1197                  RESTORE_rbp
1198
1199                  "pand      %%mm7, %%mm0     \n\t"
1200                  "pand      %%mm7, %%mm1     \n\t"
1201                  "pand      %%mm7, %%mm2     \n\t"
1202                  "pand      %%mm7, %%mm3     \n\t"
1203
1204                  "pcmpeqb   %%mm6, %%mm0     \n\t"
1205                  "pcmpeqb   %%mm6, %%mm1     \n\t"
1206                  "pcmpeqb   %%mm6, %%mm2     \n\t"
1207                  "pcmpeqb   %%mm6, %%mm3     \n\t"
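                  // mm0..mm3 now hold one mask byte per destination byte:
                  // 0xFF where the owning pixel's bit is set in `mask` (take
                  // the byte from the new row), 0x00 where it is clear (keep
                  // the existing byte).  The main loop below computes, 32
                  // bytes at a time:  dst = (src & m) | (dst & ~m).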
1208
1209// preload        "movl      len, %%ecx       \n\t" // load length of line
1210// preload        "movl      srcptr, %3       \n\t" // load source
1211// preload        "movl      dstptr, %4       \n\t" // load dest
1212
1213                  "cmpl      $0, %%ecx        \n\t" // len == 0 ?
1214                  "jz        mainloop32end    \n\t"
1215
1216                "mainloop32:                  \n\t"
1217                  "movq      (%3), %%mm4      \n\t"
1218                  "pand      %%mm0, %%mm4     \n\t"
1219                  "movq      %%mm0, %%mm6     \n\t"
1220                  "movq      (%4), %%mm7      \n\t"
1221                  "pandn     %%mm7, %%mm6     \n\t"
1222                  "por       %%mm6, %%mm4     \n\t"
1223                  "movq      %%mm4, (%4)      \n\t"
1224
1225                  "movq      8(%3), %%mm5     \n\t"
1226                  "pand      %%mm1, %%mm5     \n\t"
1227                  "movq      %%mm1, %%mm7     \n\t"
1228                  "movq      8(%4), %%mm6     \n\t"
1229                  "pandn     %%mm6, %%mm7     \n\t"
1230                  "por       %%mm7, %%mm5     \n\t"
1231                  "movq      %%mm5, 8(%4)     \n\t"
1232
1233                  "movq      16(%3), %%mm6    \n\t"
1234                  "pand      %%mm2, %%mm6     \n\t"
1235                  "movq      %%mm2, %%mm4     \n\t"
1236                  "movq      16(%4), %%mm7    \n\t"
1237                  "pandn     %%mm7, %%mm4     \n\t"
1238                  "por       %%mm4, %%mm6     \n\t"
1239                  "movq      %%mm6, 16(%4)    \n\t"
1240
1241                  "movq      24(%3), %%mm7    \n\t"
1242                  "pand      %%mm3, %%mm7     \n\t"
1243                  "movq      %%mm3, %%mm5     \n\t"
1244                  "movq      24(%4), %%mm4    \n\t"
1245                  "pandn     %%mm4, %%mm5     \n\t"
1246                  "por       %%mm5, %%mm7     \n\t"
1247                  "movq      %%mm7, 24(%4)    \n\t"
1248
1249                  "add       $32, %3          \n\t" // inc by 32 bytes processed
1250                  "add       $32, %4          \n\t"
1251                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1252                  "ja        mainloop32       \n\t"
1253
1254                "mainloop32end:               \n\t"
1255// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1256                  "movl      %%eax, %%ecx     \n\t"
1257                  "cmpl      $0, %%ecx        \n\t"
1258                  "jz        end32            \n\t"
1259// preload        "movl      mask, %%edx      \n\t"
1260                  "sall      $24, %%edx       \n\t" // low byte => high byte
1261
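                  // Handle the 0..7 leftover pixels one at a time:  the 8-bit
                  // mask now sits in the top byte of edx; each iteration
                  // shifts it left so the current pixel's bit lands in the
                  // carry flag, and clear bits (CF == 0) skip the copy.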
1262                "secondloop32:                \n\t"
1263                  "sall      %%edx            \n\t" // move high bit to CF
1264                  "jnc       skip32           \n\t" // if CF = 0
1265                  "movl      (%3), %%eax      \n\t"
1266                  "movl      %%eax, (%4)      \n\t"
1267
1268                "skip32:                      \n\t"
1269                  "add       $4, %3           \n\t"
1270                  "add       $4, %4           \n\t"
1271                  "decl      %%ecx            \n\t"
1272                  "jnz       secondloop32     \n\t"
1273
1274                "end32:                       \n\t"
1275                  "EMMS                       \n\t" // DONE
1276
1277                  : "=a" (dummy_value_a),           // output regs (dummy)
1278                    "=d" (dummy_value_d),
1279                    "=c" (dummy_value_c),
1280                    "=S" (dummy_value_S),
1281                    "=D" (dummy_value_D)
1282
1283                  : "0" (diff),        // eax       // input regs
1284                    "1" (mask),        // edx
1285                    "2" (len),         // ecx
1286// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1287                    "3" (srcptr),      // esi/rsi
1288                    "4" (dstptr)       // edi/rdi
1289
1290#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
1291                  : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
1292                  , "%mm4", "%mm5", "%mm6", "%mm7"
1293#endif
1294               );
1295            }
1296            else /* not _mmx_supported - use modified C routine */
1297            {
1298               register png_uint_32 i;
1299               png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
1300                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1301               register int stride = BPP4 * png_pass_inc[png_ptr->pass];
1302                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1303               register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
1304                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1305               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1306               int diff = (int) (png_ptr->width & 7); /* amount lost */
1307               register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */
1308
1309               srcptr = png_ptr->row_buf + 1 + initial_val;
1310               dstptr = row + initial_val;
1311
1312               for (i = initial_val; i < final_val; i += stride)
1313               {
1314                  png_memcpy(dstptr, srcptr, rep_bytes);
1315                  srcptr += stride;
1316                  dstptr += stride;
1317               }
1318               if (diff)  /* number of leftover pixels:  3 for pngtest */
1319               {
1320                  final_val += diff*BPP4;
1321                  for (; i < final_val; i += stride)
1322                  {
1323                     if (rep_bytes > (int)(final_val-i))
1324                        rep_bytes = (int)(final_val-i);
1325                     png_memcpy(dstptr, srcptr, rep_bytes);
1326                     srcptr += stride;
1327                     dstptr += stride;
1328                  }
1329               }
1330            } /* end of else (_mmx_supported) */
1331
1332            break;
1333         }       /* end 32 bpp */
1334
1335         case 8:        /* png_ptr->row_info.pixel_depth */
1336         {
1337            png_bytep srcptr;
1338            png_bytep dstptr;
1339
1340#if !defined(PNG_1_0_X)
1341            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1342#else
1343            if (_mmx_supported)
1344#endif
1345            {
1346               png_uint_32 len;
1347               int diff;
1348
1349               srcptr = png_ptr->row_buf + 1;
1350               dstptr = row;
1351               len  = png_ptr->width & ~7;          // reduce to multiple of 8
1352               diff = (int) (png_ptr->width & 7);   // amount lost
1353
1354               __asm__ __volatile__ (
1355                  "not       %%edx            \n\t" // mask => unmask
1356                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
1357                  "not       %%edx            \n\t" // unmask => mask for later
1358                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1359                  "punpcklbw %%mm7, %%mm7     \n\t"
1360                  "punpcklwd %%mm7, %%mm7     \n\t"
1361                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1362
1363                  LOAD_GOT_rbp
1364                  "movq   " MASK8_0 ", %%mm0  \n\t" // _mask8_0 -> mm0
1365                  RESTORE_rbp
1366
1367                  "pand      %%mm7, %%mm0     \n\t" // nonzero if keep byte
1368                  "pcmpeqb   %%mm6, %%mm0     \n\t" // zero bytes => 0xFF, else 0x00
1369
1370// preload        "movl      len, %%ecx       \n\t" // load length of line
1371// preload        "movl      srcptr, %3       \n\t" // load source
1372// preload        "movl      dstptr, %4       \n\t" // load dest
1373
1374                  "cmpl      $0, %%ecx        \n\t" // len == 0 ?
1375                  "je        mainloop8end     \n\t"
1376
1377                "mainloop8:                   \n\t"
1378                  "movq      (%3), %%mm4      \n\t" // *srcptr
1379                  "pand      %%mm0, %%mm4     \n\t"
1380                  "movq      %%mm0, %%mm6     \n\t"
1381                  "pandn     (%4), %%mm6      \n\t" // *dstptr
1382                  "por       %%mm6, %%mm4     \n\t"
1383                  "movq      %%mm4, (%4)      \n\t"
1384                  "add       $8, %3           \n\t" // inc by 8 bytes processed
1385                  "add       $8, %4           \n\t"
1386                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1387                  "ja        mainloop8        \n\t"
1388
1389                "mainloop8end:                \n\t"
1390// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1391                  "movl      %%eax, %%ecx     \n\t"
1392                  "cmpl      $0, %%ecx        \n\t"
1393                  "jz        end8             \n\t"
1394// preload        "movl      mask, %%edx      \n\t"
1395                  "sall      $24, %%edx       \n\t" // low byte => high byte
1396
1397                "secondloop8:                 \n\t"
1398                  "sall      %%edx            \n\t" // move high bit to CF
1399                  "jnc       skip8            \n\t" // if CF = 0
1400                  "movb      (%3), %%al       \n\t"
1401                  "movb      %%al, (%4)       \n\t"
1402
1403                "skip8:                       \n\t"
1404                  "inc       %3               \n\t"
1405                  "inc       %4               \n\t"
1406                  "decl      %%ecx            \n\t"
1407                  "jnz       secondloop8      \n\t"
1408
1409                "end8:                        \n\t"
1410                  "EMMS                       \n\t" // DONE
1411
1412                  : "=a" (dummy_value_a),           // output regs (dummy)
1413                    "=d" (dummy_value_d),
1414                    "=c" (dummy_value_c),
1415                    "=S" (dummy_value_S),
1416                    "=D" (dummy_value_D)
1417
1418                  : "0" (diff),        // eax       // input regs
1419                    "1" (mask),        // edx
1420                    "2" (len),         // ecx
1421// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1422                    "3" (srcptr),      // esi/rsi
1423                    "4" (dstptr)       // edi/rdi
1424
1425#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
1426                  : "%mm0", "%mm4", "%mm6", "%mm7"  // clobber list
1427#endif
1428               );
1429            }
1430            else /* not _mmx_supported - use modified C routine */
1431            {
1432               register png_uint_32 i;
1433               png_uint_32 initial_val = png_pass_start[png_ptr->pass];
1434                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1435               register int stride = png_pass_inc[png_ptr->pass];
1436                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1437               register int rep_bytes = png_pass_width[png_ptr->pass];
1438                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1439               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1440               int diff = (int) (png_ptr->width & 7); /* amount lost */
1441               register png_uint_32 final_val = len;  /* GRR bugfix */
1442
1443               srcptr = png_ptr->row_buf + 1 + initial_val;
1444               dstptr = row + initial_val;
1445
1446               for (i = initial_val; i < final_val; i += stride)
1447               {
1448                  png_memcpy(dstptr, srcptr, rep_bytes);
1449                  srcptr += stride;
1450                  dstptr += stride;
1451               }
1452               if (diff)  /* number of leftover pixels:  3 for pngtest */
1453               {
1454                  final_val += diff /* *BPP1 */ ;
1455                  for (; i < final_val; i += stride)
1456                  {
1457                     if (rep_bytes > (int)(final_val-i))
1458                        rep_bytes = (int)(final_val-i);
1459                     png_memcpy(dstptr, srcptr, rep_bytes);
1460                     srcptr += stride;
1461                     dstptr += stride;
1462                  }
1463               }
1464
1465            } /* end of else (_mmx_supported) */
1466
1467            break;
1468         }       /* end 8 bpp */
1469
1470         case 1:        /* png_ptr->row_info.pixel_depth */
1471         {
1472            png_bytep sp;
1473            png_bytep dp;
1474            int s_inc, s_start, s_end;
1475            int m;
1476            int shift;
1477            png_uint_32 i;
1478
1479            sp = png_ptr->row_buf + 1;
1480            dp = row;
1481            m = 0x80;
1482#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1483            if (png_ptr->transformations & PNG_PACKSWAP)
1484            {
1485               s_start = 0;
1486               s_end = 7;
1487               s_inc = 1;
1488            }
1489            else
1490#endif
1491            {
1492               s_start = 7;
1493               s_end = 0;
1494               s_inc = -1;
1495            }
1496
1497            shift = s_start;
1498
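            /* For each pixel, `m` cycles 0x80, 0x40, ..., 0x01 across every
             * group of 8 pixels; when the pixel's bit is set in `mask`, its
             * single bit is extracted from *sp at position `shift` and merged
             * into *dp, the AND mask clearing just that destination bit
             * first.
             */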
1499            for (i = 0; i < png_ptr->width; i++)
1500            {
1501               if (m & mask)
1502               {
1503                  int value;
1504
1505                  value = (*sp >> shift) & 0x1;
1506                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
1507                  *dp |= (png_byte)(value << shift);
1508               }
1509
1510               if (shift == s_end)
1511               {
1512                  shift = s_start;
1513                  sp++;
1514                  dp++;
1515               }
1516               else
1517                  shift += s_inc;
1518
1519               if (m == 1)
1520                  m = 0x80;
1521               else
1522                  m >>= 1;
1523            }
1524            break;
1525         }       /* end 1 bpp */
1526
1527         case 2:        /* png_ptr->row_info.pixel_depth */
1528         {
1529            png_bytep sp;
1530            png_bytep dp;
1531            int s_start, s_end, s_inc;
1532            int m;
1533            int shift;
1534            png_uint_32 i;
1535            int value;
1536
1537            sp = png_ptr->row_buf + 1;
1538            dp = row;
1539            m = 0x80;
1540#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1541            if (png_ptr->transformations & PNG_PACKSWAP)
1542            {
1543               s_start = 0;
1544               s_end = 6;
1545               s_inc = 2;
1546            }
1547            else
1548#endif
1549            {
1550               s_start = 6;
1551               s_end = 0;
1552               s_inc = -2;
1553            }
1554
1555            shift = s_start;
1556
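            /* Same scheme as the 1-bpp case above, but two bits per pixel:
             * the AND mask clears the two destination bits at `shift` before
             * the new value is OR-ed in.
             */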
1557            for (i = 0; i < png_ptr->width; i++)
1558            {
1559               if (m & mask)
1560               {
1561                  value = (*sp >> shift) & 0x3;
1562                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
1563                  *dp |= (png_byte)(value << shift);
1564               }
1565
1566               if (shift == s_end)
1567               {
1568                  shift = s_start;
1569                  sp++;
1570                  dp++;
1571               }
1572               else
1573                  shift += s_inc;
1574               if (m == 1)
1575                  m = 0x80;
1576               else
1577                  m >>= 1;
1578            }
1579            break;
1580         }       /* end 2 bpp */
1581
1582         case 4:        /* png_ptr->row_info.pixel_depth */
1583         {
1584            png_bytep sp;
1585            png_bytep dp;
1586            int s_start, s_end, s_inc;
1587            int m;
1588            int shift;
1589            png_uint_32 i;
1590            int value;
1591
1592            sp = png_ptr->row_buf + 1;
1593            dp = row;
1594            m = 0x80;
1595#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1596            if (png_ptr->transformations & PNG_PACKSWAP)
1597            {
1598               s_start = 0;
1599               s_end = 4;
1600               s_inc = 4;
1601            }
1602            else
1603#endif
1604            {
1605               s_start = 4;
1606               s_end = 0;
1607               s_inc = -4;
1608            }
1609
1610            shift = s_start;
1611
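            /* As above, four bits per pixel:  the AND mask clears the high or
             * low nibble of *dp, as selected by `shift`.
             */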
1612            for (i = 0; i < png_ptr->width; i++)
1613            {
1614               if (m & mask)
1615               {
1616                  value = (*sp >> shift) & 0xf;
1617                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
1618                  *dp |= (png_byte)(value << shift);
1619               }
1620
1621               if (shift == s_end)
1622               {
1623                  shift = s_start;
1624                  sp++;
1625                  dp++;
1626               }
1627               else
1628                  shift += s_inc;
1629               if (m == 1)
1630                  m = 0x80;
1631               else
1632                  m >>= 1;
1633            }
1634            break;
1635         }       /* end 4 bpp */
1636
1637         case 16:       /* png_ptr->row_info.pixel_depth */
1638         {
1639            png_bytep srcptr;
1640            png_bytep dstptr;
1641
1642#if !defined(PNG_1_0_X)
1643            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1644#else
1645            if (_mmx_supported)
1646#endif
1647            {
1648               png_uint_32 len;
1649               int diff;
1650
1651               srcptr = png_ptr->row_buf + 1;
1652               dstptr = row;
1653               len  = png_ptr->width & ~7;          // reduce to multiple of 8
1654               diff = (int) (png_ptr->width & 7);   // amount lost
1655
1656               __asm__ __volatile__ (
1657                  "not       %%edx            \n\t" // mask => unmask
1658                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
1659                  "not       %%edx            \n\t" // unmask => mask for later
1660                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1661                  "punpcklbw %%mm7, %%mm7     \n\t"
1662                  "punpcklwd %%mm7, %%mm7     \n\t"
1663                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1664
1665                  LOAD_GOT_rbp
1666                  "movq   " MASK16_0 ", %%mm0 \n\t" // _mask16_0 -> mm0
1667                  "movq   " MASK16_1 ", %%mm1 \n\t" // _mask16_1 -> mm1
1668                  RESTORE_rbp
1669
1670                  "pand      %%mm7, %%mm0     \n\t"
1671                  "pand      %%mm7, %%mm1     \n\t"
1672
1673                  "pcmpeqb   %%mm6, %%mm0     \n\t"
1674                  "pcmpeqb   %%mm6, %%mm1     \n\t"
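                  // mm0/mm1 now hold per-byte copy masks for eight 16-bit
                  // pixels (same construction as in the 32-bpp case above).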
1675
1676// preload        "movl      len, %%ecx       \n\t" // load length of line
1677// preload        "movl      srcptr, %3       \n\t" // load source
1678// preload        "movl      dstptr, %4       \n\t" // load dest
1679
1680                  "cmpl      $0, %%ecx        \n\t"
1681                  "jz        mainloop16end    \n\t"
1682
1683                "mainloop16:                  \n\t"
1684                  "movq      (%3), %%mm4      \n\t"
1685                  "pand      %%mm0, %%mm4     \n\t"
1686                  "movq      %%mm0, %%mm6     \n\t"
1687                  "movq      (%4), %%mm7      \n\t"
1688                  "pandn     %%mm7, %%mm6     \n\t"
1689                  "por       %%mm6, %%mm4     \n\t"
1690                  "movq      %%mm4, (%4)      \n\t"
1691
1692                  "movq      8(%3), %%mm5     \n\t"
1693                  "pand      %%mm1, %%mm5     \n\t"
1694                  "movq      %%mm1, %%mm7     \n\t"
1695                  "movq      8(%4), %%mm6     \n\t"
1696                  "pandn     %%mm6, %%mm7     \n\t"
1697                  "por       %%mm7, %%mm5     \n\t"
1698                  "movq      %%mm5, 8(%4)     \n\t"
1699
1700                  "add       $16, %3          \n\t" // inc by 16 bytes processed
1701                  "add       $16, %4          \n\t"
1702                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1703                  "ja        mainloop16       \n\t"
1704
1705                "mainloop16end:               \n\t"
1706// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1707                  "movl      %%eax, %%ecx     \n\t"
1708                  "cmpl      $0, %%ecx        \n\t"
1709                  "jz        end16            \n\t"
1710// preload        "movl      mask, %%edx      \n\t"
1711                  "sall      $24, %%edx       \n\t" // low byte => high byte
1712
1713                "secondloop16:                \n\t"
1714                  "sall      %%edx            \n\t" // move high bit to CF
1715                  "jnc       skip16           \n\t" // if CF = 0
1716                  "movw      (%3), %%ax       \n\t"
1717                  "movw      %%ax, (%4)       \n\t"
1718
1719                "skip16:                      \n\t"
1720                  "add       $2, %3           \n\t"
1721                  "add       $2, %4           \n\t"
1722                  "decl      %%ecx            \n\t"
1723                  "jnz       secondloop16     \n\t"
1724
1725                "end16:                       \n\t"
1726                  "EMMS                       \n\t" // DONE
1727
1728                  : "=a" (dummy_value_a),           // output regs (dummy)
1729                    "=d" (dummy_value_d),
1730                    "=c" (dummy_value_c),
1731                    "=S" (dummy_value_S),
1732                    "=D" (dummy_value_D)
1733
1734                  : "0" (diff),        // eax       // input regs
1735                    "1" (mask),        // edx
1736                    "2" (len),         // ecx
1737// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1738                    "3" (srcptr),      // esi/rsi
1739                    "4" (dstptr)       // edi/rdi
1740
1741#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
1742                  : "%mm0", "%mm1", "%mm4"          // clobber list
1743                  , "%mm5", "%mm6", "%mm7"
1744#endif
1745               );
1746            }
1747            else /* not _mmx_supported - use modified C routine */
1748            {
1749               register png_uint_32 i;
1750               png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
1751                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1752               register int stride = BPP2 * png_pass_inc[png_ptr->pass];
1753                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1754               register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
1755                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1756               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1757               int diff = (int) (png_ptr->width & 7); /* amount lost */
1758               register png_uint_32 final_val = BPP2 * len;   /* GRR bugfix */
1759
1760               srcptr = png_ptr->row_buf + 1 + initial_val;
1761               dstptr = row + initial_val;
1762
1763               for (i = initial_val; i < final_val; i += stride)
1764               {
1765                  png_memcpy(dstptr, srcptr, rep_bytes);
1766                  srcptr += stride;
1767                  dstptr += stride;
1768               }
1769               if (diff)  /* number of leftover pixels:  3 for pngtest */
1770               {
1771                  final_val += diff*BPP2;
1772                  for (; i < final_val; i += stride)
1773                  {
1774                     if (rep_bytes > (int)(final_val-i))
1775                        rep_bytes = (int)(final_val-i);
1776                     png_memcpy(dstptr, srcptr, rep_bytes);
1777                     srcptr += stride;
1778                     dstptr += stride;
1779                  }
1780               }
1781            } /* end of else (_mmx_supported) */
1782
1783            break;
1784         }       /* end 16 bpp */
1785
1786         case 48:       /* png_ptr->row_info.pixel_depth */
1787         {
1788            png_bytep srcptr;
1789            png_bytep dstptr;
1790
1791#if !defined(PNG_1_0_X)
1792            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1793#else
1794            if (_mmx_supported)
1795#endif
1796            {
1797               png_uint_32 len;
1798               int diff;
1799
1800               srcptr = png_ptr->row_buf + 1;
1801               dstptr = row;
1802               len  = png_ptr->width & ~7;          // reduce to multiple of 8
1803               diff = (int) (png_ptr->width & 7);   // amount lost
1804
1805               __asm__ __volatile__ (
1806                  "not       %%edx            \n\t" // mask => unmask
1807                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
1808                  "not       %%edx            \n\t" // unmask => mask for later
1809                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1810                  "punpcklbw %%mm7, %%mm7     \n\t"
1811                  "punpcklwd %%mm7, %%mm7     \n\t"
1812                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1813
1814                  LOAD_GOT_rbp
1815                  "movq   " MASK48_0 ", %%mm0 \n\t" // _mask48_0 -> mm0
1816                  "movq   " MASK48_1 ", %%mm1 \n\t" // _mask48_1 -> mm1
1817                  "movq   " MASK48_2 ", %%mm2 \n\t" // _mask48_2 -> mm2
1818                  "movq   " MASK48_3 ", %%mm3 \n\t" // _mask48_3 -> mm3
1819                  "movq   " MASK48_4 ", %%mm4 \n\t" // _mask48_4 -> mm4
1820                  "movq   " MASK48_5 ", %%mm5 \n\t" // _mask48_5 -> mm5
1821                  RESTORE_rbp
1822
1823                  "pand      %%mm7, %%mm0     \n\t"
1824                  "pand      %%mm7, %%mm1     \n\t"
1825                  "pand      %%mm7, %%mm2     \n\t"
1826                  "pand      %%mm7, %%mm3     \n\t"
1827                  "pand      %%mm7, %%mm4     \n\t"
1828                  "pand      %%mm7, %%mm5     \n\t"
1829
1830                  "pcmpeqb   %%mm6, %%mm0     \n\t"
1831                  "pcmpeqb   %%mm6, %%mm1     \n\t"
1832                  "pcmpeqb   %%mm6, %%mm2     \n\t"
1833                  "pcmpeqb   %%mm6, %%mm3     \n\t"
1834                  "pcmpeqb   %%mm6, %%mm4     \n\t"
1835                  "pcmpeqb   %%mm6, %%mm5     \n\t"
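                  // mm0..mm5 now hold per-byte copy masks for eight 48-bit
                  // pixels (same construction as in the 32-bpp case above).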
1836
1837// preload        "movl      len, %%ecx       \n\t" // load length of line
1838// preload        "movl      srcptr, %3       \n\t" // load source
1839// preload        "movl      dstptr, %4       \n\t" // load dest
1840
1841                  "cmpl      $0, %%ecx        \n\t"
1842                  "jz        mainloop48end    \n\t"
1843
1844                "mainloop48:                  \n\t"
1845                  "movq      (%3), %%mm7      \n\t"
1846                  "pand      %%mm0, %%mm7     \n\t"
1847                  "movq      %%mm0, %%mm6     \n\t"
1848                  "pandn     (%4), %%mm6      \n\t"
1849                  "por       %%mm6, %%mm7     \n\t"
1850                  "movq      %%mm7, (%4)      \n\t"
1851
1852                  "movq      8(%3), %%mm6     \n\t"
1853                  "pand      %%mm1, %%mm6     \n\t"
1854                  "movq      %%mm1, %%mm7     \n\t"
1855                  "pandn     8(%4), %%mm7     \n\t"
1856                  "por       %%mm7, %%mm6     \n\t"
1857                  "movq      %%mm6, 8(%4)     \n\t"
1858
1859                  "movq      16(%3), %%mm6    \n\t"
1860                  "pand      %%mm2, %%mm6     \n\t"
1861                  "movq      %%mm2, %%mm7     \n\t"
1862                  "pandn     16(%4), %%mm7    \n\t"
1863                  "por       %%mm7, %%mm6     \n\t"
1864                  "movq      %%mm6, 16(%4)    \n\t"
1865
1866                  "movq      24(%3), %%mm7    \n\t"
1867                  "pand      %%mm3, %%mm7     \n\t"
1868                  "movq      %%mm3, %%mm6     \n\t"
1869                  "pandn     24(%4), %%mm6    \n\t"
1870                  "por       %%mm6, %%mm7     \n\t"
1871                  "movq      %%mm7, 24(%4)    \n\t"
1872
1873                  "movq      32(%3), %%mm6    \n\t"
1874                  "pand      %%mm4, %%mm6     \n\t"
1875                  "movq      %%mm4, %%mm7     \n\t"
1876                  "pandn     32(%4), %%mm7    \n\t"
1877                  "por       %%mm7, %%mm6     \n\t"
1878                  "movq      %%mm6, 32(%4)    \n\t"
1879
1880                  "movq      40(%3), %%mm7    \n\t"
1881                  "pand      %%mm5, %%mm7     \n\t"
1882                  "movq      %%mm5, %%mm6     \n\t"
1883                  "pandn     40(%4), %%mm6    \n\t"
1884                  "por       %%mm6, %%mm7     \n\t"
1885                  "movq      %%mm7, 40(%4)    \n\t"
1886
1887                  "add       $48, %3          \n\t" // inc by 48 bytes processed
1888                  "add       $48, %4          \n\t"
1889                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1890
1891                  "ja        mainloop48       \n\t"
1892
1893                "mainloop48end:               \n\t"
1894// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1895                  "movl      %%eax, %%ecx     \n\t"
1896                  "cmpl      $0, %%ecx        \n\t"
1897                  "jz        end48            \n\t"
1898// preload        "movl      mask, %%edx      \n\t"
1899                  "sall      $24, %%edx       \n\t" // low byte => high byte
1900
1901                "secondloop48:                \n\t"
1902                  "sall      %%edx            \n\t" // move high bit to CF
1903                  "jnc       skip48           \n\t" // if CF = 0
1904                  "movl      (%3), %%eax      \n\t"
1905                  "movl      %%eax, (%4)      \n\t"
1906                  "movw      4(%3), %%ax      \n\t" // GR-P bugfix 20070717
1907                  "movw      %%ax, 4(%4)      \n\t" // GR-P bugfix 20070717
1908
1909                "skip48:                      \n\t"
1910                  "add       $6, %3           \n\t" // GR-P bugfix 20070717
1911                  "add       $6, %4           \n\t" // GR-P bugfix 20070717
1912                  "decl      %%ecx            \n\t"
1913                  "jnz       secondloop48     \n\t"
1914
1915                "end48:                       \n\t"
1916                  "EMMS                       \n\t" // DONE
1917
1918                  : "=a" (dummy_value_a),           // output regs (dummy)
1919                    "=d" (dummy_value_d),
1920                    "=c" (dummy_value_c),
1921                    "=S" (dummy_value_S),
1922                    "=D" (dummy_value_D)
1923
1924                  : "0" (diff),        // eax       // input regs
1925                    "1" (mask),        // edx
1926                    "2" (len),         // ecx
1927// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1928                    "3" (srcptr),      // esi/rsi
1929                    "4" (dstptr)       // edi/rdi
1930
1931#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
1932                  : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
1933                  , "%mm4", "%mm5", "%mm6", "%mm7"
1934#endif
1935               );
1936            }
1937            else /* not _mmx_supported - use modified C routine */
1938            {
1939               register png_uint_32 i;
1940               png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
1941                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1942               register int stride = BPP6 * png_pass_inc[png_ptr->pass];
1943                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1944               register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
1945                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1946               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1947               int diff = (int) (png_ptr->width & 7); /* amount lost */
1948               register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */
1949
1950               srcptr = png_ptr->row_buf + 1 + initial_val;
1951               dstptr = row + initial_val;
1952
1953               for (i = initial_val; i < final_val; i += stride)
1954               {
1955                  png_memcpy(dstptr, srcptr, rep_bytes);
1956                  srcptr += stride;
1957                  dstptr += stride;
1958               }
1959               if (diff)  /* number of leftover pixels:  3 for pngtest */
1960               {
1961                  final_val += diff*BPP6;
1962                  for (; i < final_val; i += stride)
1963                  {
1964                     if (rep_bytes > (int)(final_val-i))
1965                        rep_bytes = (int)(final_val-i);
1966                     png_memcpy(dstptr, srcptr, rep_bytes);
1967                     srcptr += stride;
1968                     dstptr += stride;
1969                  }
1970               }
1971            } /* end of else (_mmx_supported) */
1972
1973            break;
1974         }       /* end 48 bpp */
1975
1976         case 64:       /* png_ptr->row_info.pixel_depth */
1977         {
1978            png_bytep srcptr;
1979            png_bytep dstptr;
1980            register png_uint_32 i;
1981            png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
1982              /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1983            register int stride = BPP8 * png_pass_inc[png_ptr->pass];
1984              /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1985            register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
1986              /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1987            png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1988            int diff = (int) (png_ptr->width & 7); /* amount lost */
1989            register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */
1990
1991            srcptr = png_ptr->row_buf + 1 + initial_val;
1992            dstptr = row + initial_val;
1993
1994            for (i = initial_val; i < final_val; i += stride)
1995            {
1996               png_memcpy(dstptr, srcptr, rep_bytes);
1997               srcptr += stride;
1998               dstptr += stride;
1999            }
2000            if (diff)  /* number of leftover pixels:  3 for pngtest */
2001            {
2002               final_val += diff*BPP8;
2003               for (; i < final_val; i += stride)
2004               {
2005                  if (rep_bytes > (int)(final_val-i))
2006                     rep_bytes = (int)(final_val-i);
2007                  png_memcpy(dstptr, srcptr, rep_bytes);
2008                  srcptr += stride;
2009                  dstptr += stride;
2010               }
2011            }
2012
2013            break;
2014         }       /* end 64 bpp */
2015
2016         default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
2017         {
2018            // ERROR:  SHOULD NEVER BE REACHED
2019#if defined(PNG_DEBUG)
2020            png_debug(1, "Internal libpng logic error (GCC "
2021              "png_combine_row() pixel_depth)\n");
2022#endif
2023            break;
2024         }
2025      } /* end switch (png_ptr->row_info.pixel_depth) */
2026
2027   } /* end if (non-trivial mask) */
2028
2029} /* end png_combine_row() */
2030
2031#endif /* PNG_HAVE_MMX_COMBINE_ROW */
2032
2033
2034
2035
2036/*===========================================================================*/
2037/*                                                                           */
2038/*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
2039/*                                                                           */
2040/*===========================================================================*/
2041
2042#if defined(PNG_READ_INTERLACING_SUPPORTED)
2043#if defined(PNG_HAVE_MMX_READ_INTERLACE)
2044
2045/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
2046 * has taken place.  [GRR: what other steps come before and/or after?]
2047 */
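/* The routine expands the sub-sampled pixels of interlace pass `pass` in
 * place in the row buffer:  each pixel is replicated png_pass_inc[pass]
 * times to fill out the final row width.
 */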
2048
2049void /* PRIVATE */
2050png_do_read_interlace(png_structp png_ptr)
2051{
2052   png_row_infop row_info = &(png_ptr->row_info);
2053   png_bytep row = png_ptr->row_buf + 1;
2054   int pass = png_ptr->pass;
2055#if defined(PNG_READ_PACKSWAP_SUPPORTED)
2056   png_uint_32 transformations = png_ptr->transformations;
2057#endif
2058
2059   png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
2060
2061   if (_mmx_supported == 2) {
2062#if !defined(PNG_1_0_X)
2063       /* this should have happened in png_init_mmx_flags() already */
2064       png_warning(png_ptr, "asm_flags may not have been initialized");
2065#endif
2066       png_mmx_support();
2067   }
2068
2069   if (row != NULL && row_info != NULL)
2070   {
2071      png_uint_32 final_width;
2072
2073      final_width = row_info->width * png_pass_inc[pass];
2074
2075      switch (row_info->pixel_depth)
2076      {
2077         case 1:
2078         {
2079            png_bytep sp, dp;
2080            int sshift, dshift;
2081            int s_start, s_end, s_inc;
2082            png_byte v;
2083            png_uint_32 i;
2084            int j;
2085
2086            sp = row + (png_size_t)((row_info->width - 1) >> 3);
2087            dp = row + (png_size_t)((final_width - 1) >> 3);
2088#if defined(PNG_READ_PACKSWAP_SUPPORTED)
2089            if (transformations & PNG_PACKSWAP)
2090            {
2091               sshift = (int)((row_info->width + 7) & 7);
2092               dshift = (int)((final_width + 7) & 7);
2093               s_start = 7;
2094               s_end = 0;
2095               s_inc = -1;
2096            }
2097            else
2098#endif
2099            {
2100               sshift = 7 - (int)((row_info->width + 7) & 7);
2101               dshift = 7 - (int)((final_width + 7) & 7);
2102               s_start = 0;
2103               s_end = 7;
2104               s_inc = 1;
2105            }
2106
2107            for (i = row_info->width; i; i--)
2108            {
2109               v = (png_byte)((*sp >> sshift) & 0x1);
2110               for (j = 0; j < png_pass_inc[pass]; j++)
2111               {
2112                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
2113                  *dp |= (png_byte)(v << dshift);
2114                  if (dshift == s_end)
2115                  {
2116                     dshift = s_start;
2117                     dp--;
2118                  }
2119                  else
2120                     dshift += s_inc;
2121               }
2122               if (sshift == s_end)
2123               {
2124                  sshift = s_start;
2125                  sp--;
2126               }
2127               else
2128                  sshift += s_inc;
2129            }
2130            break;
2131         }
2132
2133         case 2:
2134         {
2135            png_bytep sp, dp;
2136            int sshift, dshift;
2137            int s_start, s_end, s_inc;
2138            png_uint_32 i;
2139
2140            sp = row + (png_size_t)((row_info->width - 1) >> 2);
2141            dp = row + (png_size_t)((final_width - 1) >> 2);
2142#if defined(PNG_READ_PACKSWAP_SUPPORTED)
2143            if (transformations & PNG_PACKSWAP)
2144            {
2145               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
2146               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
2147               s_start = 6;
2148               s_end = 0;
2149               s_inc = -2;
2150            }
2151            else
2152#endif
2153            {
2154               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
2155               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
2156               s_start = 0;
2157               s_end = 6;
2158               s_inc = 2;
2159            }
2160
2161            for (i = row_info->width; i; i--)
2162            {
2163               png_byte v;
2164               int j;
2165
2166               v = (png_byte)((*sp >> sshift) & 0x3);
2167               for (j = 0; j < png_pass_inc[pass]; j++)
2168               {
2169                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
2170                  *dp |= (png_byte)(v << dshift);
2171                  if (dshift == s_end)
2172                  {
2173                     dshift = s_start;
2174                     dp--;
2175                  }
2176                  else
2177                     dshift += s_inc;
2178               }
2179               if (sshift == s_end)
2180               {
2181                  sshift = s_start;
2182                  sp--;
2183               }
2184               else
2185                  sshift += s_inc;
2186            }
2187            break;
2188         }
2189
2190         case 4:
2191         {
2192            png_bytep sp, dp;
2193            int sshift, dshift;
2194            int s_start, s_end, s_inc;
2195            png_uint_32 i;
2196
2197            sp = row + (png_size_t)((row_info->width - 1) >> 1);
2198            dp = row + (png_size_t)((final_width - 1) >> 1);
2199#if defined(PNG_READ_PACKSWAP_SUPPORTED)
2200            if (transformations & PNG_PACKSWAP)
2201            {
2202               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
2203               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
2204               s_start = 4;
2205               s_end = 0;
2206               s_inc = -4;
2207            }
2208            else
2209#endif
2210            {
2211               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
2212               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
2213               s_start = 0;
2214               s_end = 4;
2215               s_inc = 4;
2216            }
2217
2218            for (i = row_info->width; i; i--)
2219            {
2220               png_byte v;
2221               int j;
2222
2223               v = (png_byte)((*sp >> sshift) & 0xf);
2224               for (j = 0; j < png_pass_inc[pass]; j++)
2225               {
2226                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
2227                  *dp |= (png_byte)(v << dshift);
2228                  if (dshift == s_end)
2229                  {
2230                     dshift = s_start;
2231                     dp--;
2232                  }
2233                  else
2234                     dshift += s_inc;
2235               }
2236               if (sshift == s_end)
2237               {
2238                  sshift = s_start;
2239                  sp--;
2240               }
2241               else
2242                  sshift += s_inc;
2243            }
2244            break;
2245         }
2246
2247       /*====================================================================*/
2248
2249         default: /* 8-bit or larger (this is where the routine is modified) */
2250         {
2251            png_bytep sptr, dp;
2252            png_uint_32 i;
2253            png_size_t pixel_bytes;
2254            int width = (int)row_info->width;
2255
2256            pixel_bytes = (row_info->pixel_depth >> 3);
2257
2258            /* point sptr at the last pixel in the pre-expanded row: */
2259            sptr = row + (width - 1) * pixel_bytes;
2260
2261            /* point dp at the last pixel position in the expanded row: */
2262            dp = row + (final_width - 1) * pixel_bytes;
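            /* The expansion runs right to left (both pointers start at the
             * last pixel) so that the replicated output never overwrites
             * source pixels that have not yet been read.
             */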
2263
2264            /* New code by Nirav Chhatrapati - Intel Corporation */
2265
2266#if !defined(PNG_1_0_X)
2267            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
2268#else
2269            if (_mmx_supported)
2270#endif
2271            {
2272               int dummy_value_c;        // fix 'forbidden register spilled'
2273               png_bytep dummy_value_S;
2274               png_bytep dummy_value_D;
2275               png_bytep dummy_value_a;
2276               png_bytep dummy_value_d;
2277
2278               //--------------------------------------------------------------
2279               if (pixel_bytes == BPP3)
2280               {
2281                  if (((pass == 4) || (pass == 5)) && width)
2282                  {
2283                     int width_mmx = ((width >> 1) << 1) - 8;   // GRR:  huh?
2284                     if (width_mmx < 0)
2285                         width_mmx = 0;
2286                     width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
2287                     if (width_mmx)
2288                     {
2289                        // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
2290                        // sptr points at last pixel in pre-expanded row
2291                        // dp points at last pixel position in expanded row
2292                        __asm__ __volatile__ (
2293                           "sub  $3, %1             \n\t"
2294                           "sub  $9, %2             \n\t"
2295                                        // (png_pass_inc[pass] + 1)*pixel_bytes
2296
2297                        ".loop3_pass4:              \n\t"
2298                           "movq (%1), %%mm0        \n\t" // x x 5 4 3 2 1 0
2299                           "movq %%mm0, %%mm1       \n\t" // x x 5 4 3 2 1 0
2300                           "movq %%mm0, %%mm2       \n\t" // x x 5 4 3 2 1 0
2301                           "psllq $24, %%mm0        \n\t" // 4 3 2 1 0 z z z
2302                           "pand (%3), %%mm1        \n\t" // z z z z z 2 1 0
2303                           "psrlq $24, %%mm2        \n\t" // z z z x x 5 4 3
2304                           "por %%mm1, %%mm0        \n\t" // 4 3 2 1 0 2 1 0
2305                           "movq %%mm2, %%mm3       \n\t" // z z z x x 5 4 3
2306                           "psllq $8, %%mm2         \n\t" // z z x x 5 4 3 z
2307                           "movq %%mm0, (%2)        \n\t"
2308                           "psrlq $16, %%mm3        \n\t" // z z z z z x x 5
2309                           "pand (%4), %%mm3        \n\t" // z z z z z z z 5
2310                           "por %%mm3, %%mm2        \n\t" // z z x x 5 4 3 5
2311                           "sub  $6, %1             \n\t"
2312                           "movd %%mm2, 8(%2)       \n\t"
2313                           "sub  $12, %2            \n\t"
2314                           "subl $2, %%ecx          \n\t"
2315                           "jnz .loop3_pass4        \n\t"
2316                           "EMMS                    \n\t" // DONE
2317
2318                           : "=c" (dummy_value_c),        // output regs (dummy)
2319                             "=S" (dummy_value_S),
2320                             "=D" (dummy_value_D),
2321                             "=a" (dummy_value_a),
2322                             "=d" (dummy_value_d)
2323
2324                           : "0" (width_mmx),     // ecx  // input regs
2325                             "1" (sptr),          // esi/rsi
2326                             "2" (dp),            // edi/rdi
2327#if defined(PNG_x86_64_USE_GOTPCREL)     // formerly _const4 and _const6:
2328                             "3" (&_c64._amask5_3_0), // (0x0000000000FFFFFFLL)
2329                             "4" (&_c64._amask7_1_0)  // (0x00000000000000FFLL)
2330#else
2331                             "3" (&_amask5_3_0),  // eax (0x0000000000FFFFFFLL)
2332                             "4" (&_amask7_1_0)   // edx (0x00000000000000FFLL)
2333#endif
2334
2335#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2336                           : "%mm0", "%mm1"               // clobber list
2337                           , "%mm2", "%mm3"
2338#endif
2339                        );
2340                     }
2341
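                     /* The MMX loop above handled the last width_mmx source
                      * pixels (each expanded to two output pixels).  The asm
                      * does not write back the C pointers, so step them back
                      * past that region and let the C loop below replicate
                      * the remaining `width` pixels.
                      */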
2342                     sptr -= width_mmx*BPP3;
2343                     dp -= width_mmx*2*BPP3;
2344                     for (i = width; i; i--)
2345                     {
2346                        png_byte v[8];
2347                        int j;
2348
2349                        png_memcpy(v, sptr, BPP3);
2350                        for (j = 0; j < png_pass_inc[pass]; j++)
2351                        {
2352                           png_memcpy(dp, v, BPP3);
2353                           dp -= BPP3;
2354                        }
2355                        sptr -= BPP3;
2356                     }
2357                  }
2358                  else if (((pass == 2) || (pass == 3)) && width)
2359                  {
2360                     __asm__ __volatile__ (
2361                        "sub  $9, %2             \n\t"
2362                                     // (png_pass_inc[pass] - 1)*pixel_bytes
2363
2364                     ".loop3_pass2:              \n\t"
2365                        "movd (%1), %%mm0        \n\t" // x x x x x 2 1 0
2366                        "pand (%3), %%mm0        \n\t" // z z z z z 2 1 0
2367                        "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
2368                        "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
2369                        "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
2370                        "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
2371                        "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
2372                        "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
2373                        "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
2374                        "movq %%mm0, 4(%2)       \n\t"
2375                        "psrlq $16, %%mm0        \n\t" // z z 2 1 0 2 1 0
2376                        "sub  $3, %1             \n\t"
2377                        "movd %%mm0, (%2)        \n\t"
2378                        "sub  $12, %2            \n\t"
2379                        "decl %%ecx              \n\t"
2380                        "jnz .loop3_pass2        \n\t"
2381                        "EMMS                    \n\t" // DONE
2382
2383                        : "=c" (dummy_value_c),        // output regs (dummy)
2384                          "=S" (dummy_value_S),
2385                          "=D" (dummy_value_D),
2386                          "=a" (dummy_value_a)
2387
2388                        : "0" (width),         // ecx  // input regs
2389                          "1" (sptr),          // esi/rsi
2390                          "2" (dp),            // edi/rdi
2391#if defined(PNG_x86_64_USE_GOTPCREL)           // formerly _const4:
2392                          "3" (&_c64._amask5_3_0)  // (0x0000000000FFFFFFLL)
2393#else
2394                          "3" (&_amask5_3_0)   // eax (0x0000000000FFFFFFLL)
2395#endif
2396
2397#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2398                        : "%mm0", "%mm1", "%mm2"       // clobber list
2399#endif
2400                     );
2401                  }
2402                  else if (width)  // && ((pass == 0) || (pass == 1))
2403                  {
2404                     __asm__ __volatile__ (
2405                        "sub  $21, %2            \n\t"
2406                                     // (png_pass_inc[pass] - 1)*pixel_bytes
2407
2408                     ".loop3_pass0:              \n\t"
2409                        "movd (%1), %%mm0        \n\t" // x x x x x 2 1 0
2410                        "pand (%3), %%mm0        \n\t" // z z z z z 2 1 0
2411                        "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
2412                        "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
2413                        "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
2414                        "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
2415                        "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
2416                        "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
2417                        "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
2418                        "movq %%mm0, %%mm3       \n\t" // 2 1 0 2 1 0 2 1
2419                        "psllq $16, %%mm0        \n\t" // 0 2 1 0 2 1 z z
2420                        "movq %%mm3, %%mm4       \n\t" // 2 1 0 2 1 0 2 1
2421                        "punpckhdq %%mm0, %%mm3  \n\t" // 0 2 1 0 2 1 0 2
2422                        "movq %%mm4, 16(%2)      \n\t"
2423                        "psrlq $32, %%mm0        \n\t" // z z z z 0 2 1 0
2424                        "movq %%mm3, 8(%2)       \n\t"
2425                        "punpckldq %%mm4, %%mm0  \n\t" // 1 0 2 1 0 2 1 0
2426                        "sub  $3, %1             \n\t"
2427                        "movq %%mm0, (%2)        \n\t"
2428                        "sub  $24, %2            \n\t"
2429                        "decl %%ecx              \n\t"
2430                        "jnz .loop3_pass0        \n\t"
2431                        "EMMS                    \n\t" // DONE
2432
2433                        : "=c" (dummy_value_c),        // output regs (dummy)
2434                          "=S" (dummy_value_S),
2435                          "=D" (dummy_value_D),
2436                          "=a" (dummy_value_a)
2437
2438                        : "0" (width),         // ecx  // input regs
2439                          "1" (sptr),          // esi/rsi
2440                          "2" (dp),            // edi/rdi
2441#if defined(PNG_x86_64_USE_GOTPCREL)           // formerly _const4:
2442                          "3" (&_c64._amask5_3_0)  // (0x0000000000FFFFFFLL)
2443#else
2444                          "3" (&_amask5_3_0)   // eax (0x0000000000FFFFFFLL)
2445#endif
2446
2447#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2448                        : "%mm0", "%mm1", "%mm2"       // clobber list
2449                        , "%mm3", "%mm4"
2450#endif
2451                     );
2452                  }
2453               } /* end of pixel_bytes == 3 */
2454
2455               //--------------------------------------------------------------
2456               else if (pixel_bytes == BPP4)
2457               {
2458                  if (((pass == 4) || (pass == 5)) && width)
2459                  {
2460                     int width_mmx = ((width >> 1) << 1);
2461                     width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2462                     if (width_mmx)
2463                     {
2464                        __asm__ __volatile__ (
2465                           "sub  $4, %1             \n\t"
2466                           "sub  $12, %2            \n\t"
2467
2468                        ".loop4_pass4:              \n\t"
2469                           "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
2470                           "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2471                           "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2472                           "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2473                           "movq %%mm0, (%2)        \n\t"
2474                           "sub  $8, %1             \n\t"
2475                           "movq %%mm1, 8(%2)       \n\t"
2476                           "sub  $16, %2            \n\t"
2477                           "subl $2, %%ecx          \n\t"
2478                           "jnz .loop4_pass4        \n\t"
2479                           "EMMS                    \n\t" // DONE
2480
2481                           : "=c" (dummy_value_c),        // output regs (dummy)
2482                             "=S" (dummy_value_S),
2483                             "=D" (dummy_value_D)
2484
2485                           : "0" (width_mmx),     // ecx  // input regs
2486                             "1" (sptr),          // esi/rsi
2487                             "2" (dp)             // edi/rdi
2488
#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2490                           : "%mm0", "%mm1"               // clobber list
2491#endif
2492                        );
2493                     }
2494
2495                     sptr -= (width_mmx*BPP4 - BPP4); // sign fixed
2496                     dp -= (width_mmx*2*BPP4 - BPP4); // sign fixed
2497                     for (i = width; i; i--)
2498                     {
2499                        png_byte v[8];
2500                        int j;
2501                        sptr -= BPP4;
2502                        png_memcpy(v, sptr, BPP4);
2503                        for (j = 0; j < png_pass_inc[pass]; j++)
2504                        {
2505                           dp -= BPP4;
2506                           png_memcpy(dp, v, BPP4);
2507                        }
2508                     }
2509                  }
2510                  else if (((pass == 2) || (pass == 3)) && width)
2511                  {
2512                     int width_mmx = ((width >> 1) << 1);
2513                     width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2514                     if (width_mmx)
2515                     {
2516                        __asm__ __volatile__ (
2517                           "sub  $4, %1             \n\t"
2518                           "sub  $28, %2            \n\t"
2519
2520                        ".loop4_pass2:              \n\t"
2521                           "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
2522                           "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2523                           "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2524                           "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2525                           "movq %%mm0, (%2)        \n\t"
2526                           "movq %%mm0, 8(%2)       \n\t"
2527                           "movq %%mm1, 16(%2)      \n\t"
2528                           "movq %%mm1, 24(%2)      \n\t"
2529                           "sub  $8, %1             \n\t"
2530                           "sub  $32, %2            \n\t"
2531                           "subl $2, %%ecx          \n\t"
2532                           "jnz .loop4_pass2        \n\t"
2533                           "EMMS                    \n\t" // DONE
2534
2535                           : "=c" (dummy_value_c),        // output regs (dummy)
2536                             "=S" (dummy_value_S),
2537                             "=D" (dummy_value_D)
2538
2539                           : "0" (width_mmx),     // ecx  // input regs
2540                             "1" (sptr),          // esi/rsi
2541                             "2" (dp)             // edi/rdi
2542
#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2544                           : "%mm0", "%mm1"               // clobber list
2545#endif
2546                        );
2547                     }
2548
2549                     sptr -= (width_mmx*4 - 4); // sign fixed
2550                     dp -= (width_mmx*16 - 4);  // sign fixed
2551                     for (i = width; i; i--)
2552                     {
2553                        png_byte v[8];
2554                        int j;
2555                        sptr -= 4;
2556                        png_memcpy(v, sptr, 4);
2557                        for (j = 0; j < png_pass_inc[pass]; j++)
2558                        {
2559                           dp -= 4;
2560                           png_memcpy(dp, v, 4);
2561                        }
2562                     }
2563                  }
2564                  else if (width)  // && ((pass == 0) || (pass == 1))
2565                  {
2566                     int width_mmx = ((width >> 1) << 1);
2567                     width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2568                     if (width_mmx)
2569                     {
2570                        __asm__ __volatile__ (
2571                           "sub  $4, %1             \n\t"
2572                           "sub  $60, %2            \n\t"
2573
2574                        ".loop4_pass0:              \n\t"
2575                           "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
2576                           "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2577                           "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2578                           "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2579                           "movq %%mm0, (%2)        \n\t"
2580                           "movq %%mm0, 8(%2)       \n\t"
2581                           "movq %%mm0, 16(%2)      \n\t"
2582                           "movq %%mm0, 24(%2)      \n\t"
2583                           "movq %%mm1, 32(%2)      \n\t"
2584                           "movq %%mm1, 40(%2)      \n\t"
2585                           "movq %%mm1, 48(%2)      \n\t"
2586                           "sub  $8, %1             \n\t"
2587                           "movq %%mm1, 56(%2)      \n\t"
2588                           "sub  $64, %2            \n\t"
2589                           "subl $2, %%ecx          \n\t"
2590                           "jnz .loop4_pass0        \n\t"
2591                           "EMMS                    \n\t" // DONE
2592
2593                           : "=c" (dummy_value_c),        // output regs (dummy)
2594                             "=S" (dummy_value_S),
2595                             "=D" (dummy_value_D)
2596
2597                           : "0" (width_mmx),     // ecx  // input regs
2598                             "1" (sptr),          // esi/rsi
2599                             "2" (dp)             // edi/rdi
2600
2601#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2602                           : "%mm0", "%mm1"               // clobber list
2603#endif
2604                        );
2605                     }
2606
2607                     sptr -= (width_mmx*4 - 4); // sign fixed
2608                     dp -= (width_mmx*32 - 4);  // sign fixed
2609                     for (i = width; i; i--)
2610                     {
2611                        png_byte v[8];
2612                        int j;
2613                        sptr -= 4;
2614                        png_memcpy(v, sptr, 4);
2615                        for (j = 0; j < png_pass_inc[pass]; j++)
2616                        {
2617                           dp -= 4;
2618                           png_memcpy(dp, v, 4);
2619                        }
2620                     }
2621                  }
2622               } /* end of pixel_bytes == 4 */
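               // Note:  the 4-byte loops above duplicate each 32-bit pixel
               // within an MMX register (punpckldq/punpckhdq) and then store
               // that qword once, twice, or four times, giving the 2, 4, or 8
               // copies required by png_pass_inc[pass].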
2623
2624               //--------------------------------------------------------------
2625               else if (pixel_bytes == 1)
2626               {
2627                  if (((pass == 4) || (pass == 5)) && width)
2628                  {
2629                     int width_mmx = ((width >> 3) << 3);
                     width -= width_mmx;        // 0-7 pixels => 0-7 bytes
2631                     if (width_mmx)
2632                     {
2633                        __asm__ __volatile__ (
2634                           "sub  $7, %1             \n\t"
2635                           "sub  $15, %2            \n\t"
2636
2637                        ".loop1_pass4:              \n\t"
2638                           "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
2639                           "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2640                           "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
2641                           "punpckhbw %%mm1, %%mm1  \n\t" // 7 7 6 6 5 5 4 4
2642                           "movq %%mm1, 8(%2)       \n\t"
2643                           "sub  $8, %1             \n\t"
2644                           "movq %%mm0, (%2)        \n\t"
2645                           "sub  $16, %2            \n\t"
2646                           "subl $8, %%ecx          \n\t"
2647                           "jnz .loop1_pass4        \n\t"
2648                           "EMMS                    \n\t" // DONE
2649
2650                           : "=c" (dummy_value_c),        // output regs (dummy)
2651                             "=S" (dummy_value_S),
2652                             "=D" (dummy_value_D)
2653
2654                           : "0" (width_mmx),     // ecx  // input regs
2655                             "1" (sptr),          // esi/rsi
2656                             "2" (dp)             // edi/rdi
2657
2658#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2659                           : "%mm0", "%mm1"               // clobber list
2660#endif
2661                        );
2662                     }
2663
2664                     sptr -= width_mmx;
2665                     dp -= width_mmx*2;
2666                     for (i = width; i; i--)
2667                     {
2668                        int j;
2669
2670                        for (j = 0; j < png_pass_inc[pass]; j++)
2671                        {
2672                           *dp-- = *sptr;
2673                        }
2674                        --sptr;
2675                     }
2676                  }
2677                  else if (((pass == 2) || (pass == 3)) && width)
2678                  {
2679                     int width_mmx = ((width >> 2) << 2);
2680                     width -= width_mmx;        // 0-3 pixels => 0-3 bytes
2681                     if (width_mmx)
2682                     {
2683                        __asm__ __volatile__ (
2684                           "sub  $3, %1             \n\t"
2685                           "sub  $15, %2            \n\t"
2686
2687                        ".loop1_pass2:              \n\t"
2688                           "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
2689                           "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
2690                           "movq %%mm0, %%mm1       \n\t" // 3 3 2 2 1 1 0 0
2691                           "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
2692                           "punpckhwd %%mm1, %%mm1  \n\t" // 3 3 3 3 2 2 2 2
2693                           "movq %%mm0, (%2)        \n\t"
2694                           "sub  $4, %1             \n\t"
2695                           "movq %%mm1, 8(%2)       \n\t"
2696                           "sub  $16, %2            \n\t"
2697                           "subl $4, %%ecx          \n\t"
2698                           "jnz .loop1_pass2        \n\t"
2699                           "EMMS                    \n\t" // DONE
2700
2701                           : "=c" (dummy_value_c),        // output regs (dummy)
2702                             "=S" (dummy_value_S),
2703                             "=D" (dummy_value_D)
2704
2705                           : "0" (width_mmx),     // ecx  // input regs
2706                             "1" (sptr),          // esi/rsi
2707                             "2" (dp)             // edi/rdi
2708
2709#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2710                           : "%mm0", "%mm1"               // clobber list
2711#endif
2712                        );
2713                     }
2714
2715                     sptr -= width_mmx;
2716                     dp -= width_mmx*4;
2717                     for (i = width; i; i--)
2718                     {
2719                        int j;
2720
2721                        for (j = 0; j < png_pass_inc[pass]; j++)
2722                        {
2723                           *dp-- = *sptr;
2724                        }
2725                        --sptr;
2726                     }
2727                  }
2728                  else if (width)  // && ((pass == 0) || (pass == 1))
2729                  {
2730                     int width_mmx = ((width >> 2) << 2);
2731                     width -= width_mmx;        // 0-3 pixels => 0-3 bytes
2732                     if (width_mmx)
2733                     {
2734                        __asm__ __volatile__ (
2735                           "sub  $3, %1             \n\t"
2736                           "sub  $31, %2            \n\t"
2737
2738                        ".loop1_pass0:              \n\t"
2739                           "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
2740                           "movq %%mm0, %%mm1       \n\t" // x x x x 3 2 1 0
2741                           "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
2742                           "movq %%mm0, %%mm2       \n\t" // 3 3 2 2 1 1 0 0
2743                           "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
2744                           "movq %%mm0, %%mm3       \n\t" // 1 1 1 1 0 0 0 0
2745                           "punpckldq %%mm0, %%mm0  \n\t" // 0 0 0 0 0 0 0 0
2746                           "punpckhdq %%mm3, %%mm3  \n\t" // 1 1 1 1 1 1 1 1
2747                           "movq %%mm0, (%2)        \n\t"
2748                           "punpckhwd %%mm2, %%mm2  \n\t" // 3 3 3 3 2 2 2 2
2749                           "movq %%mm3, 8(%2)       \n\t"
2750                           "movq %%mm2, %%mm4       \n\t" // 3 3 3 3 2 2 2 2
2751                           "punpckldq %%mm2, %%mm2  \n\t" // 2 2 2 2 2 2 2 2
2752                           "punpckhdq %%mm4, %%mm4  \n\t" // 3 3 3 3 3 3 3 3
2753                           "movq %%mm2, 16(%2)      \n\t"
2754                           "sub  $4, %1             \n\t"
2755                           "movq %%mm4, 24(%2)      \n\t"
2756                           "sub  $32, %2            \n\t"
2757                           "subl $4, %%ecx          \n\t"
2758                           "jnz .loop1_pass0        \n\t"
2759                           "EMMS                    \n\t" // DONE
2760
2761                           : "=c" (dummy_value_c),        // output regs (dummy)
2762                             "=S" (dummy_value_S),
2763                             "=D" (dummy_value_D)
2764
2765                           : "0" (width_mmx),     // ecx  // input regs
2766                             "1" (sptr),          // esi/rsi
2767                             "2" (dp)             // edi/rdi
2768
2769#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2770                           : "%mm0", "%mm1", "%mm2"       // clobber list
2771                           , "%mm3", "%mm4"
2772#endif
2773                        );
2774                     }
2775
2776                     sptr -= width_mmx;
2777                     dp -= width_mmx*8;
2778                     for (i = width; i; i--)
2779                     {
2780                        int j;
2781
2782                       /* I simplified this part in version 1.0.4e
2783                        * here and in several other instances where
2784                        * pixel_bytes == 1  -- GR-P
2785                        *
2786                        * Original code:
2787                        *
2788                        * png_byte v[8];
2789                        * png_memcpy(v, sptr, pixel_bytes);
2790                        * for (j = 0; j < png_pass_inc[pass]; j++)
2791                        * {
2792                        *    png_memcpy(dp, v, pixel_bytes);
2793                        *    dp -= pixel_bytes;
2794                        * }
2795                        * sptr -= pixel_bytes;
2796                        *
2797                        * Replacement code is in the next three lines:
2798                        */
2799
2800                        for (j = 0; j < png_pass_inc[pass]; j++)
2801                        {
2802                           *dp-- = *sptr;
2803                        }
2804                        --sptr;
2805                     }
2806                  }
2807               } /* end of pixel_bytes == 1 */
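               // Note:  the 1-byte loops above use successive unpacks
               // (punpcklbw, then punpcklwd and punpckldq for the wider
               // passes) to fan each source byte out to the 2, 4, or 8 copies
               // required by png_pass_inc[pass].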
2808
2809               //--------------------------------------------------------------
2810               else if (pixel_bytes == BPP2)
2811               {
2812                  if (((pass == 4) || (pass == 5)) && width)
2813                  {
2814                     int width_mmx = ((width >> 1) << 1) ;
2815                     width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2816                     if (width_mmx)
2817                     {
2818                        __asm__ __volatile__ (
2819                           "sub  $2, %1             \n\t"
2820                           "sub  $6, %2             \n\t"
2821
2822                        ".loop2_pass4:              \n\t"
2823                           "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
2824                           "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2825                           "sub  $4, %1             \n\t"
2826                           "movq %%mm0, (%2)        \n\t"
2827                           "sub  $8, %2             \n\t"
2828                           "subl $2, %%ecx          \n\t"
2829                           "jnz .loop2_pass4        \n\t"
2830                           "EMMS                    \n\t" // DONE
2831
2832                           : "=c" (dummy_value_c),        // output regs (dummy)
2833                             "=S" (dummy_value_S),
2834                             "=D" (dummy_value_D)
2835
2836                           : "0" (width_mmx),     // ecx  // input regs
2837                             "1" (sptr),          // esi/rsi
2838                             "2" (dp)             // edi/rdi
2839
2840#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2841                           : "%mm0"                       // clobber list
2842#endif
2843                        );
2844                     }
2845
2846                     sptr -= (width_mmx*BPP2 - BPP2); // sign fixed
2847                     dp -= (width_mmx*2*BPP2 - BPP2); // sign fixed
2848                     for (i = width; i; i--)
2849                     {
2850                        png_byte v[8];
2851                        int j;
2852                        sptr -= BPP2;
2853                        png_memcpy(v, sptr, BPP2);
2854                        for (j = 0; j < png_pass_inc[pass]; j++)
2855                        {
2856                           dp -= BPP2;
2857                           png_memcpy(dp, v, BPP2);
2858                        }
2859                     }
2860                  }
2861                  else if (((pass == 2) || (pass == 3)) && width)
2862                  {
2863                     int width_mmx = ((width >> 1) << 1) ;
2864                     width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2865                     if (width_mmx)
2866                     {
2867                        __asm__ __volatile__ (
2868                           "sub  $2, %1             \n\t"
2869                           "sub  $14, %2            \n\t"
2870
2871                        ".loop2_pass2:              \n\t"
2872                           "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
2873                           "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2874                           "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
2875                           "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
2876                           "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
2877                           "movq %%mm0, (%2)        \n\t"
2878                           "sub  $4, %1             \n\t"
2879                           "movq %%mm1, 8(%2)       \n\t"
2880                           "sub  $16, %2            \n\t"
2881                           "subl $2, %%ecx          \n\t"
2882                           "jnz .loop2_pass2        \n\t"
2883                           "EMMS                    \n\t" // DONE
2884
2885                           : "=c" (dummy_value_c),        // output regs (dummy)
2886                             "=S" (dummy_value_S),
2887                             "=D" (dummy_value_D)
2888
2889                           : "0" (width_mmx),     // ecx  // input regs
2890                             "1" (sptr),          // esi/rsi
2891                             "2" (dp)             // edi/rdi
2892
#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2894                           : "%mm0", "%mm1"               // clobber list
2895#endif
2896                        );
2897                     }
2898
2899                     sptr -= (width_mmx*2 - 2); // sign fixed
2900                     dp -= (width_mmx*8 - 2);   // sign fixed
2901                     for (i = width; i; i--)
2902                     {
2903                        png_byte v[8];
2904                        int j;
2905                        sptr -= 2;
2906                        png_memcpy(v, sptr, 2);
2907                        for (j = 0; j < png_pass_inc[pass]; j++)
2908                        {
2909                           dp -= 2;
2910                           png_memcpy(dp, v, 2);
2911                        }
2912                     }
2913                  }
2914                  else if (width)  // && ((pass == 0) || (pass == 1))
2915                  {
2916                     int width_mmx = ((width >> 1) << 1);
2917                     width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2918                     if (width_mmx)
2919                     {
2920                        __asm__ __volatile__ (
2921                           "sub  $2, %1             \n\t"
2922                           "sub  $30, %2            \n\t"
2923
2924                        ".loop2_pass0:              \n\t"
2925                           "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
2926                           "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2927                           "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
2928                           "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
2929                           "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
2930                           "movq %%mm0, (%2)        \n\t"
2931                           "movq %%mm0, 8(%2)       \n\t"
2932                           "movq %%mm1, 16(%2)      \n\t"
2933                           "sub  $4, %1             \n\t"
2934                           "movq %%mm1, 24(%2)      \n\t"
2935                           "sub  $32, %2            \n\t"
2936                           "subl $2, %%ecx          \n\t"
2937                           "jnz .loop2_pass0        \n\t"
2938                           "EMMS                    \n\t" // DONE
2939
2940                           : "=c" (dummy_value_c),        // output regs (dummy)
2941                             "=S" (dummy_value_S),
2942                             "=D" (dummy_value_D)
2943
2944                           : "0" (width_mmx),     // ecx  // input regs
2945                             "1" (sptr),          // esi/rsi
2946                             "2" (dp)             // edi/rdi
2947
#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
2949                           : "%mm0", "%mm1"               // clobber list
2950#endif
2951                        );
2952                     }
2953
2954                     sptr -= (width_mmx*2 - 2); // sign fixed
2955                     dp -= (width_mmx*16 - 2);  // sign fixed
2956                     for (i = width; i; i--)
2957                     {
2958                        png_byte v[8];
2959                        int j;
2960                        sptr -= 2;
2961                        png_memcpy(v, sptr, 2);
2962                        for (j = 0; j < png_pass_inc[pass]; j++)
2963                        {
2964                           dp -= 2;
2965                           png_memcpy(dp, v, 2);
2966                        }
2967                     }
2968                  }
2969               } /* end of pixel_bytes == 2 */
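               // Note:  the 2-byte loops above use punpcklwd (plus
               // punpckldq/punpckhdq for the wider passes) to duplicate each
               // 16-bit pixel in the MMX registers, then store the result
               // enough times to reach png_pass_inc[pass] copies.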
2970
2971               //--------------------------------------------------------------
2972               else if (pixel_bytes == BPP8)
2973               {
2974// GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?)
2975                  // GRR NOTE:  no need to combine passes here!
2976                  if (((pass == 4) || (pass == 5)) && width)
2977                  {
2978                     // source is 8-byte RRGGBBAA
2979                     // dest is 16-byte RRGGBBAA RRGGBBAA
2980                     __asm__ __volatile__ (
2981                        "sub  $8, %2             \n\t" // start of last block
2982
2983                     ".loop8_pass4:              \n\t"
2984                        "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
2985                        "movq %%mm0, (%2)        \n\t"
2986                        "sub  $8, %1             \n\t"
2987                        "movq %%mm0, 8(%2)       \n\t"
2988                        "sub  $16, %2            \n\t"
2989                        "decl %%ecx              \n\t"
2990                        "jnz .loop8_pass4        \n\t"
2991                        "EMMS                    \n\t" // DONE
2992
2993                        : "=c" (dummy_value_c),        // output regs (dummy)
2994                          "=S" (dummy_value_S),
2995                          "=D" (dummy_value_D)
2996
2997                        : "0" (width),         // ecx  // input regs
2998                          "1" (sptr),          // esi/rsi
2999                          "2" (dp)             // edi/rdi
3000
#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
3002                        : "%mm0"                       // clobber list
3003#endif
3004                     );
3005                  }
3006                  else if (((pass == 2) || (pass == 3)) && width)
3007                  {
3008                     // source is 8-byte RRGGBBAA
3009                     // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
3010                     // (recall that expansion is _in place_:  sptr and dp
3011                     //  both point at locations within same row buffer)
3012                     __asm__ __volatile__ (
3013                        "sub  $24, %2            \n\t" // start of last block
3014
3015                     ".loop8_pass2:              \n\t"
3016                        "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
3017                        "movq %%mm0, (%2)        \n\t"
3018                        "movq %%mm0, 8(%2)       \n\t"
3019                        "movq %%mm0, 16(%2)      \n\t"
3020                        "sub  $8, %1             \n\t"
3021                        "movq %%mm0, 24(%2)      \n\t"
3022                        "sub  $32, %2            \n\t"
3023                        "decl %%ecx              \n\t"
3024                        "jnz .loop8_pass2        \n\t"
3025                        "EMMS                    \n\t" // DONE
3026
3027                        : "=c" (dummy_value_c),        // output regs (dummy)
3028                          "=S" (dummy_value_S),
3029                          "=D" (dummy_value_D)
3030
3031                        : "0" (width),         // ecx  // input regs
3032                          "1" (sptr),          // esi/rsi
3033                          "2" (dp)             // edi/rdi
3034
3035#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
3036                        : "%mm0"                       // clobber list
3037#endif
3038                     );
3039                  }
3040                  else if (width)  // && ((pass == 0) || (pass == 1))
3041                  {
3042                     // source is 8-byte RRGGBBAA
3043                     // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
3044                     __asm__ __volatile__ (
3045                        "sub  $56, %2            \n\t" // start of last block
3046
3047                     ".loop8_pass0:              \n\t"
3048                        "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
3049                        "movq %%mm0, (%2)        \n\t"
3050                        "movq %%mm0, 8(%2)       \n\t"
3051                        "movq %%mm0, 16(%2)      \n\t"
3052                        "movq %%mm0, 24(%2)      \n\t"
3053                        "movq %%mm0, 32(%2)      \n\t"
3054                        "movq %%mm0, 40(%2)      \n\t"
3055                        "movq %%mm0, 48(%2)      \n\t"
3056                        "sub  $8, %1             \n\t"
3057                        "movq %%mm0, 56(%2)      \n\t"
3058                        "sub  $64, %2            \n\t"
3059                        "decl %%ecx              \n\t"
3060                        "jnz .loop8_pass0        \n\t"
3061                        "EMMS                    \n\t" // DONE
3062
3063                        : "=c" (dummy_value_c),        // output regs (dummy)
3064                          "=S" (dummy_value_S),
3065                          "=D" (dummy_value_D)
3066
3067                        : "0" (width),         // ecx  // input regs
3068                          "1" (sptr),          // esi/rsi
3069                          "2" (dp)             // edi/rdi
3070
3071#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
3072                        : "%mm0"                       // clobber list
3073#endif
3074                     );
3075                  }
3076               } /* end of pixel_bytes == 8 */
3077
3078               //--------------------------------------------------------------
3079               else if (pixel_bytes == BPP6)   // why no MMX for this case?
3080               {
3081                  for (i = width; i; i--)
3082                  {
3083                     png_byte v[8];
3084                     int j;
3085                     png_memcpy(v, sptr, BPP6);
3086                     for (j = 0; j < png_pass_inc[pass]; j++)
3087                     {
3088                        png_memcpy(dp, v, BPP6);
3089                        dp -= BPP6;
3090                     }
3091                     sptr -= BPP6;
3092                  }
3093               } /* end of pixel_bytes == 6 */
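               // (Presumably no MMX here because 6-byte pixels do not pack
               //  evenly into 64-bit registers; an equivalent plain-C loop
               //  also appears in the non-MMX branch below.)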
3094
3095               //--------------------------------------------------------------
3096               else
3097               {
3098                  // ERROR:  SHOULD NEVER BE REACHED
3099#if defined(PNG_DEBUG)
3100                  png_debug(1, "Internal libpng logic error (GCC "
3101                    "png_do_read_interlace() _mmx_supported)\n");
3102#endif
3103               }
3104
3105            } // end of _mmx_supported ========================================
3106
3107            else /* MMX not supported:  use modified C code - takes advantage
3108                  *   of inlining of png_memcpy for a constant */
3109            {
3110               if (pixel_bytes == BPP3)
3111               {
3112                  for (i = width; i; i--)
3113                  {
3114                     png_byte v[8];
3115                     int j;
3116                     png_memcpy(v, sptr, BPP3);
3117                     for (j = 0; j < png_pass_inc[pass]; j++)
3118                     {
3119                        png_memcpy(dp, v, BPP3);
3120                        dp -= BPP3;
3121                     }
3122                     sptr -= BPP3;
3123                  }
3124               }
3125               else if (pixel_bytes == BPP4)
3126               {
3127                  for (i = width; i; i--)
3128                  {
3129                     png_byte v[8];
3130                     int j;
3131                     png_memcpy(v, sptr, BPP4);
3132                     for (j = 0; j < png_pass_inc[pass]; j++)
3133                     {
3134#if defined(PNG_DEBUG) && defined(PNG_1_0_X)  // row_buf_size gone in 1.2.x
3135                        if (dp < row || dp+3 > row+png_ptr->row_buf_size)
3136                        {
3137                           printf("dp out of bounds: row=%10p, dp=%10p, "
3138                             "rp=%10p\n", row, dp, row+png_ptr->row_buf_size);
3139                           printf("row_buf_size=%lu\n", png_ptr->row_buf_size);
3140                        }
3141#endif
3142                        png_memcpy(dp, v, BPP4);
3143                        dp -= BPP4;
3144                     }
3145                     sptr -= BPP4;
3146                  }
3147               }
3148               else if (pixel_bytes == 1)
3149               {
3150                  for (i = width; i; i--)
3151                  {
3152                     int j;
3153                     for (j = 0; j < png_pass_inc[pass]; j++)
3154                     {
3155                        *dp-- = *sptr;
3156                     }
3157                     --sptr;
3158                  }
3159               }
3160               else if (pixel_bytes == BPP2)
3161               {
3162                  for (i = width; i; i--)
3163                  {
3164                     png_byte v[8];
3165                     int j;
3166                     png_memcpy(v, sptr, BPP2);
3167                     for (j = 0; j < png_pass_inc[pass]; j++)
3168                     {
3169                        png_memcpy(dp, v, BPP2);
3170                        dp -= BPP2;
3171                     }
3172                     sptr -= BPP2;
3173                  }
3174               }
3175               else if (pixel_bytes == BPP6)
3176               {
3177                  for (i = width; i; i--)
3178                  {
3179                     png_byte v[8];
3180                     int j;
3181                     png_memcpy(v, sptr, BPP6);
3182                     for (j = 0; j < png_pass_inc[pass]; j++)
3183                     {
3184                        png_memcpy(dp, v, BPP6);
3185                        dp -= BPP6;
3186                     }
3187                     sptr -= BPP6;
3188                  }
3189               }
3190               else if (pixel_bytes == BPP8)
3191               {
3192                  for (i = width; i; i--)
3193                  {
3194                     png_byte v[8];
3195                     int j;
3196                     png_memcpy(v, sptr, BPP8);
3197                     for (j = 0; j < png_pass_inc[pass]; j++)
3198                     {
3199                        png_memcpy(dp, v, BPP8);
3200                        dp -= BPP8;
3201                     }
3202                     sptr -= BPP8;
3203                  }
3204               }
3205               else
3206               {
3207                  // ERROR:  SHOULD NEVER BE REACHED
3208#if defined(PNG_DEBUG)
3209                  png_debug(1, "Internal libpng logic error (GCC "
3210                    "png_do_read_interlace() !_mmx_supported)\n");
3211#endif
3212               }
3213
3214            } /* end if (MMX not supported) */
3215            break;
3216         } /* end default (8-bit or larger) */
3217      } /* end switch (row_info->pixel_depth) */
3218
3219      row_info->width = final_width;
3220
3221      row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
3222   }
3223
3224} /* end png_do_read_interlace() */
3225
3226#endif /* PNG_HAVE_MMX_READ_INTERLACE */
3227#endif /* PNG_READ_INTERLACING_SUPPORTED */
3228
3229
3230
3231#if defined(PNG_HAVE_MMX_READ_FILTER_ROW)
3232#if defined(PNG_MMX_READ_FILTER_AVG_SUPPORTED)
3233
3234//===========================================================================//
3235//                                                                           //
3236//           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           //
3237//                                                                           //
3238//===========================================================================//
3239
3240// Optimized code for PNG Average filter decoder
3241
3242static void /* PRIVATE */
3243png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
3244                            png_bytep prev_row)
3245{
3246   unsigned FullLength, MMXLength;  // png_uint_32 is actually 64-bit on x86-64
3247   int bpp;
3248   int dummy_value_a;
3249   int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
3250   int dummy_value_d;
3251   png_bytep dummy_value_S;
3252   png_bytep dummy_value_D;
3253   int diff; //     __attribute__((used));
3254
3255   bpp = (row_info->pixel_depth + 7) >> 3;  // calc number of bytes per pixel
3256   FullLength = row_info->rowbytes;         // number of bytes to filter
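   /* The assembler below implements the PNG Average defilter,
    *
    *    Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2),
    *
    * with Raw(x-bpp) taken as 0 for the first bpp bytes of the row.  A
    * minimal scalar sketch of the same computation (ignoring the alignment
    * and MMX handling below) would be:
    *
    *    unsigned i;
    *    for (i = 0; i < FullLength; i++)
    *    {
    *       unsigned left = (i < (unsigned)bpp)? 0 : row[i - bpp];
    *       row[i] = (png_byte)(row[i] + ((left + prev_row[i]) >> 1));
    *    }
    */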
3257
3258   __asm__ __volatile__ (
3259   "avg_top:                       \n\t"
3260      SAVE_GOT_ebx
3261      SAVE_r15
3262      SAVE_ebp
3263      // initialize address pointers and offset
3264//pre "movl row, %5                \n\t" // edi/rdi:  ptr to Avg(x)
3265      "xorl %%ebx, %%ebx           \n\t" // ebx:  x
3266//pre "movl prev_row, %4           \n\t" // esi/rsi:  ptr to Prior(x)
3267      "mov  %5, " PDX "            \n\t" // copy of row ptr...
3268//pre "subl bpp, " PDX "           \n\t" // (bpp is preloaded into ecx)
3269      "sub  " PCX "," PDX "        \n\t" // edx/rdx:  ptr to Raw(x-bpp)
3270//pre "movl FullLength, %%eax      \n\t" // bring in via eax...
3271      SAVE_FullLength                    // ...but store for later use
3272      "xorl %%eax, %%eax           \n\t"
3273
3274      // Compute the Raw value for the first bpp bytes
3275      //    Raw(x) = Avg(x) + (Prior(x)/2)
3276   "avg_rlp:                       \n\t"
3277      "movb (%4," PBX ",), %%al    \n\t" // load al with Prior(x)
3278      "incl %%ebx                  \n\t"
3279      "shrb %%al                   \n\t" // divide by 2
3280      "addb -1(%5," PBX ",), %%al  \n\t" // add Avg(x); -1 to offset inc ebx
3281//pre "cmpl bpp, %%ebx             \n\t" // (bpp is preloaded into ecx)
3282      "cmpl %%ecx, %%ebx           \n\t"
3283      "movb %%al, -1(%5," PBX ",)  \n\t" // write Raw(x); -1 to offset inc ebx
3284      "jb avg_rlp                  \n\t" // mov does not affect flags
3285
3286      // get # of bytes to alignment (32-bit mask _would_ be good enough
3287      // [computing delta], but 32-bit ops are zero-extended on 64-bit, argh)
3288      // (if swapped edx and ebp, could do 8-bit or 16-bit mask...FIXME?)
3289      "mov  %5, " PBP "            \n\t" // take start of row
3290      "add  " PBX "," PBP "        \n\t" // add bpp
3291      "add  $0xf, " PBP "          \n\t" // add 7+8 to incr past alignment bdry
3292//    "andl $0xfffffff8, %%ebp     \n\t" // mask to alignment boundary (32-bit!)
3293      CLEAR_BOTTOM_3_BITS  PBP    "\n\t" // mask to alignment boundary
3294      "sub  %5, " PBP "            \n\t" // subtract row ptr again => ebp =
3295      "jz avg_go                   \n\t" //  target value of ebx at alignment
3296
3297      "xorl %%ecx, %%ecx           \n\t"
3298
3299      // fix alignment
3300      // Compute the Raw value for the bytes up to the alignment boundary
3301      //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3302   "avg_lp1:                       \n\t"
3303      "xorl %%eax, %%eax           \n\t"
3304      "movb (%4," PBX ",), %%cl    \n\t" // load cl with Prior(x)
3305      "movb (" PDX "," PBX ",), %%al \n\t" // load al with Raw(x-bpp)
3306      "addw %%cx, %%ax             \n\t"
3307      "incl %%ebx                  \n\t"
3308      "shrw %%ax                   \n\t" // divide by 2
3309      "addb -1(%5," PBX ",), %%al  \n\t" // add Avg(x); -1 to offset inc ebx
3310      "cmpl %%ebp, %%ebx           \n\t" // check if at alignment boundary
3311      "movb %%al, -1(%5," PBX ",)  \n\t" // write Raw(x); -1 to offset inc ebx
3312      "jb avg_lp1                  \n\t" // repeat until at alignment boundary
3313
3314   "avg_go:                        \n\t"
3315      RESTORE_FullLength "%%eax    \n\t" // FullLength -> eax
3316      "movl %%eax, %%ecx           \n\t" // copy -> ecx
3317      "subl %%ebx, %%eax           \n\t" // subtract alignment fix
3318      "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
3319      "subl %%eax, %%ecx           \n\t" // sub over-bytes from original length
3320//out "movl %%ecx, MMXLength       \n\t"
3321      "movl %%ebp, %%eax           \n\t" // ebp = diff, but no reg constraint(?)
3322      RESTORE_ebp                        //  (could swap ebp and edx functions)
3323      RESTORE_r15
3324      RESTORE_GOT_ebx
3325
3326// "There is no way for you to specify that an input operand is modified
3327// without also specifying it as an output operand."  [makes sense]
3328
3329// "Unless an output operand has the `&' constraint modifier, GCC may
3330// allocate it in the same register as an unrelated input operand, on the
3331// assumption the inputs are consumed before the outputs are produced."
3332// [trying to _force_ this]
3333
3334// "`='   Means that this operand is write-only for this instruction:
3335//        the previous value is discarded and replaced by output data."
3336//        [operand == variable name, presumably]
3337
3338      // output regs
3339      // these are operands 0-1 (originally 0-3):
3340      : "=c" (MMXLength),      // %0 -> %0
3341        "=a" (diff)            // %3 -> %1
3342//      "=S" (dummy_value_S),  // %1 -> GONE
3343//      "=D" (dummy_value_D),  // %2 -> GONE
3344
3345      // input regs
3346      // these are operands 2-5 (originally 4-7); two of their constraints say
3347      // they must go in same places as operands 0-1 (originally 0-3) above:
3348      : "0" (bpp),         // %4 -> %2 ecx
3349        "1" (FullLength),  // %7 -> %3 eax
3350        "S" (prev_row),    // %5 -> %4 esi/rsi
3351        "D" (row)          // %6 -> %5 edi/rdi
3352
3353      : "%edx"                           // clobber list
3354        _CLOBBER_r15
3355        _CLOBBER_ebp
3356        _CLOBBER_GOT_ebx
3357   );
3358
3359   // now do the math for the rest of the row
3360   switch (bpp)
3361   {
3362      case 3:
3363      {
3364//       _ShiftBpp = 24;    // == 3 * 8
3365//       _ShiftRem = 40;    // == 64 - 24
3366
3367         __asm__ __volatile__ (
3368            // re-init address pointers and offset
3369            LOAD_GOT_rbp
3370            "movq " AMASK5_3_0 ", %%mm7    \n\t" // _amask5_3_0 -> mm7
3371// preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
3372                                                 //  alignment boundary
3373            "movq " LB_CARRY_MASK ", %%mm5 \n\t" // [interleave for parallel.?]
3374// preload  "movl  row, %1                 \n\t" // edi:  Avg(x)
3375            "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
3376// preload  "movl  prev_row, %0            \n\t" // esi:  Prior(x)
3377            RESTORE_rbp
3378
3379            // prime the pump:  load the first Raw(x-bpp) data set
3380            "movq  -8(%1," PCX ",), %%mm2 \n\t"// load previous aligned 8 bytes
3381                                               // (correct pos. in loop below)
3382         "avg_3lp:                        \n\t"
3383            "movq  (%1," PCX ",), %%mm0   \n\t" // load mm0 with Avg(x)
3384            "movq  %%mm5, %%mm3           \n\t"
3385            "psrlq $40, %%mm2             \n\t" // correct position Raw(x-bpp)
3386                                                // data
3387            "movq  (%0," PCX ",), %%mm1   \n\t" // load mm1 with Prior(x)
3388            "movq  %%mm7, %%mm6           \n\t"
3389            "pand  %%mm1, %%mm3           \n\t" // get lsb for each prevrow byte
3390            "psrlq $1, %%mm1              \n\t" // divide prev_row bytes by 2
3391            "pand  %%mm4, %%mm1           \n\t" // clear invalid bit 7 of each
3392                                                // byte
3393            "paddb %%mm1, %%mm0           \n\t" // add (Prev_row/2) to Avg for
3394                                                // each byte
3395            // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
3396            "movq  %%mm3, %%mm1           \n\t" // now use mm1 for getting
3397                                                // LBCarrys
3398            "pand  %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3399                                                // where both lsb's were == 1
3400                                                // (valid only for active group)
3401            "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
3402            "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
3403                                                // byte
3404            "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to Raw(x-bpp)/2
3405                                                // for each byte
3406            "pand  %%mm6, %%mm2           \n\t" // leave only Active Group 1
3407                                                // bytes to add to Avg
3408            "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
3409                                                // Avg for each Active byte
3410            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3411            "psllq $24, %%mm6             \n\t" // shift the mm6 mask to cover
3412                                                // bytes 3-5
3413            "movq  %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3414            "psllq $24, %%mm2             \n\t" // shift data to pos. correctly
3415            "movq  %%mm3, %%mm1           \n\t" // now use mm1 for getting
3416                                                // LBCarrys
3417            "pand  %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3418                                                // where both lsb's were == 1
3419                                                // (valid only for active group)
3420            "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
3421            "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
3422                                                // byte
3423            "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to Raw(x-bpp)/2
3424                                                // for each byte
3425            "pand  %%mm6, %%mm2           \n\t" // leave only Active Group 2
3426                                                // bytes to add to Avg
3427            "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
3428                                                // Avg for each Active byte
3429
3430            // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3431            "psllq $24, %%mm6             \n\t" // shift mm6 mask to cover last
3432                                                // two bytes
3433            "movq  %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3434            "psllq $24, %%mm2             \n\t" // shift data to pos. correctly
3435                              // Data need be shifted only once here to
3436                              // get the correct x-bpp offset.
3437            "movq  %%mm3, %%mm1           \n\t" // now use mm1 for getting
3438                                                // LBCarrys
3439            "pand  %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3440                                                // where both
3441                              // lsb's were == 1 (only valid for active group)
3442            "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
3443            "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
3444                                                // byte
3445            "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to Raw(x-bpp)/2
3446                                                // for each byte
3447            "pand  %%mm6, %%mm2           \n\t" // leave only Active Group 2
3448                                                // bytes to add to Avg
3449            "addl  $8, %%ecx              \n\t"
3450            "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
3451                                                // Avg for each Active byte
3452            // now ready to write back to memory
3453            "movq  %%mm0, -8(%1," PCX ",) \n\t"
3454            // move updated Raw(x) to use as Raw(x-bpp) for next loop
3455            "cmpl  %%eax, %%ecx           \n\t" // MMXLength
3456            "movq  %%mm0, %%mm2           \n\t" // mov updated Raw(x) to mm2
3457            "jb avg_3lp                   \n\t"
3458
3459            : "=S" (dummy_value_S),            // output regs (dummy)
3460              "=D" (dummy_value_D),
3461              "=c" (dummy_value_c),
3462              "=a" (dummy_value_a)
3463
3464            : "0" (prev_row),    // esi/rsi    // input regs
3465              "1" (row),         // edi/rdi
3466              "2" (diff),        // ecx
3467              "3" (MMXLength)    // eax
3468
#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
3470            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
3471            , "%mm4", "%mm5", "%mm6", "%mm7"
3472#endif
3473         );
3474      }
3475      break;  // end 3 bpp
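      // (The 3-bpp loop above processes 8 bytes -- 2 2/3 pixels -- per
      //  iteration, so the Raw(x-bpp) contribution is folded in as three
      //  successive "active groups" of bytes 0-2, 3-5, and 6-7.)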
3476
3477      case 4:   // formerly shared with 6 bpp case via _ShiftBpp and _ShiftRem,
3478      {         // but loop uses all 8 MMX regs, and psrlq/psllq require 64-bit
3479                // mem (PIC/.so problems), MMX reg (none left), or immediate
3480//       _ShiftBpp = bpp << 3;        // 32 (psllq)
3481//       _ShiftRem = 64 - _ShiftBpp;  // 32 (psrlq)
3482
3483         __asm__ __volatile__ (
3484            LOAD_GOT_rbp
3485            "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
3486            "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
3487            // re-init address pointers and offset
3488// preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
3489                                                 // alignment boundary
3490            "movq " AMASK0_8_0 ", %%mm7    \n\t" // _amask0_8_0 -> mm7
3491            RESTORE_rbp
3492
3493            // ... and clear all bytes except for 1st active group
3494// preload  "movl  row, %1               \n\t" // edi:  Avg(x)
3495            "psrlq $32, %%mm7            \n\t" // was _ShiftRem
3496// preload  "movl  prev_row, %0          \n\t" // esi:  Prior(x)
3497            "movq  %%mm7, %%mm6          \n\t"
3498            "psllq $32, %%mm6            \n\t" // mask for 2nd active group
3499
3500            // prime the pump:  load the first Raw(x-bpp) data set
3501            "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
3502                                             // (we correct pos. in loop below)
3503         "avg_4lp:                       \n\t"
3504            "movq (%1," PCX ",), %%mm0   \n\t"
3505            "psrlq $32, %%mm2            \n\t" // shift data to pos. correctly
3506            "movq (%0," PCX ",), %%mm1   \n\t"
3507            // add (Prev_row/2) to average
3508            "movq %%mm5, %%mm3           \n\t"
3509            "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3510            "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3511            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
3512                                               // byte
3513            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
3514                                               // each byte
3515            // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3516            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3517                                               // LBCarrys
3518            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3519                                               // where both
3520                              // lsb's were == 1 (only valid for active group)
3521            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3522            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3523                                               // byte
3524            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3525                                               // for each byte
3526            "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
3527                                               // bytes to add to Avg
3528            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
3529                                               // for each Active
3530                              // byte
3531            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3532            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3533            "psllq $32, %%mm2            \n\t" // shift data to pos. correctly
3534            "addl $8, %%ecx              \n\t"
3535            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3536                                               // LBCarrys
3537            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3538                                               // where both
3539                              // lsb's were == 1 (only valid for active group)
3540            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3541            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3542                                               // byte
3543            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3544                                               // for each byte
3545            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3546                                               // bytes to add to Avg
3547            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3548                                               // Avg for each Active byte
3549            "cmpl %%eax, %%ecx           \n\t" // MMXLength
3550            // now ready to write back to memory
3551            "movq %%mm0, -8(%1," PCX ",) \n\t"
3552            // prep Raw(x-bpp) for next loop
3553            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3554            "jb avg_4lp                  \n\t"
3555
3556            : "=S" (dummy_value_S),            // output regs (dummy)
3557              "=D" (dummy_value_D),
3558              "=c" (dummy_value_c),
3559              "=a" (dummy_value_a)
3560
3561            : "0" (prev_row),    // esi/rsi    // input regs
3562              "1" (row),         // edi/rdi
3563              "2" (diff),        // ecx
3564              "3" (MMXLength)    // eax
3565
#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
3567            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
3568            , "%mm4", "%mm5", "%mm6", "%mm7"
3569#endif
3570         );
3571      }
3572      break;  // end 4 bpp
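      // (With 4-bpp pixels, 8 bytes is exactly 2 pixels, so only two active
      //  groups -- the low and high dwords -- are needed per iteration.)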
3573
3574      case 1:
3575      {
3576         __asm__ __volatile__ (
3577            // re-init address pointers and offset
3578// preload  "movl diff, %%ecx            \n\t" // ecx: x = offset to align. bdry
3579// preload  "movl row, %1                \n\t" // edi/rdi:  Avg(x)
3580// preload  "movl FullLength, %%eax      \n\t"
3581            "cmpl %%eax, %%ecx           \n\t" // test if offset at end of array
3582            "jnb avg_1end                \n\t"
3583
3584            SAVE_ebp
3585
3586            // do Avg decode for remaining bytes
3587// preload  "movl prev_row, %0           \n\t" // esi/rsi:  Prior(x)
3588            "mov  %1, " PBP "            \n\t" // copy of row pointer...
3589            "dec  " PBP "                \n\t" // ebp/rbp:  Raw(x-bpp)
3590            "xorl %%edx, %%edx           \n\t" // zero edx before using dl & dx
3591                                               //  in loop below
3592            SAVE_GOT_ebx
3593
3594         "avg_1lp:                       \n\t"
3595            // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3596            "xorl %%ebx, %%ebx           \n\t"
3597            "movb (%0," PCX ",), %%dl    \n\t" // load dl with Prior(x)
3598            "movb (" PBP "," PCX ",), %%bl \n\t" // load bl with Raw(x-bpp)
3599            "addw %%dx, %%bx             \n\t"
3600            "incl %%ecx                  \n\t"
3601            "shrw %%bx                   \n\t" // divide by 2
3602            "addb -1(%1," PCX ",), %%bl  \n\t" // add Avg(x); -1 to offset
3603                                               // inc ecx
3604            "cmpl %%eax, %%ecx           \n\t" // check if at end of array
3605            "movb %%bl, -1(%1," PCX ",)  \n\t" // write back Raw(x);
3606                         // mov does not affect flags; -1 to offset inc ecx
3607            "jb avg_1lp                  \n\t"
3608
3609            RESTORE_GOT_ebx
3610            RESTORE_ebp
3611
3612         "avg_1end:                      \n\t"
3613
3614            : "=S" (dummy_value_S),            // output regs (dummy)
3615              "=D" (dummy_value_D),
3616              "=c" (dummy_value_c),
3617              "=a" (dummy_value_a)
3618
3619            : "0" (prev_row),    // esi/rsi    // input regs
3620              "1" (row),         // edi/rdi
3621              "2" (diff),        // ecx
3622              "3" (FullLength)   // eax
3623
3624            : "%edx"                           // clobber list
3625              _CLOBBER_GOT_ebx
3626              _CLOBBER_ebp
3627         );
3628      }
3629      return;  // end 1 bpp (loop above already ran to FullLength)
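      // For reference (not compiled):  a plain-C sketch of what the loop
      // above computes for bpp == 1, assuming this function's row,
      // prev_row, diff and FullLength locals (the asm remains the
      // authoritative implementation):
      //
      //    png_uint_32 x;
      //    for (x = diff; x < FullLength; x++)
      //       row[x] = (png_byte)(row[x] +
      //                  ((row[x-1] + prev_row[x]) >> 1));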
3630
3631      case 2:
3632      {
3633//       _ShiftBpp = 16;   // == 2 * 8
3634//       _ShiftRem = 48;   // == 64 - _ShiftBpp
3635
3636         __asm__ __volatile__ (
3637            LOAD_GOT_rbp
3638            // load (former) _ActiveMask
3639            "movq " AMASK6_2_0 ", %%mm7    \n\t" // _amask6_2_0 -> mm7
3640            // re-init address pointers and offset
3641// preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
3642                                                 // alignment boundary
3643            "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
3644// preload  "movl  row, %1                 \n\t" // edi:  Avg(x)
3645            "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
3646// preload  "movl  prev_row, %0            \n\t" // esi:  Prior(x)
3647            RESTORE_rbp
3648
3649            // prime the pump:  load the first Raw(x-bpp) data set
3650            "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
3651                                             // (we correct pos. in loop below)
3652         "avg_2lp:                       \n\t"
3653            "movq (%1," PCX ",), %%mm0   \n\t"
3654            "psrlq $48, %%mm2            \n\t" // shift data to pos. correctly
3655            "movq (%0," PCX ",), %%mm1   \n\t" //  (GRR BUGFIX:  was psllq)
3656            // add (Prev_row/2) to average
3657            "movq %%mm5, %%mm3           \n\t"
3658            "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3659            "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3660            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
3661                                               // byte
3662            "movq %%mm7, %%mm6           \n\t"
3663            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
3664                                               // each byte
3665
3666            // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3667            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3668                                               // LBCarrys
3669            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3670                                               // where both
3671                                               // lsb's were == 1 (only valid
3672                                               // for active group)
3673            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3674            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3675                                               // byte
3676            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3677                                               // for each byte
3678            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 1
3679                                               // bytes to add to Avg
3680            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
3681                                               // for each Active byte
3682
3683            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3684            "psllq $16, %%mm6            \n\t" // shift the mm6 mask to cover
3685                                               // bytes 2 & 3
3686            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3687            "psllq $16, %%mm2            \n\t" // shift data to pos. correctly
3688            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3689                                               // LBCarrys
3690            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3691                                               // where both
3692                                               // lsb's were == 1 (only valid
3693                                               // for active group)
3694            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3695            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3696                                               // byte
3697            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3698                                               // for each byte
3699            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3700                                               // bytes to add to Avg
3701            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3702                                               // Avg for each Active byte
3703
3704            // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3705            "psllq $16, %%mm6            \n\t" // shift the mm6 mask to cover
3706                                               // bytes 4 & 5
3707            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3708            "psllq $16, %%mm2            \n\t" // shift data to pos. correctly
3709            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3710                                               // LBCarrys
3711            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3712                                               // where both lsb's were == 1
3713                                               // (only valid for active group)
3714            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3715            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3716                                               // byte
3717            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3718                                               // for each byte
3719            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 3
3720                                               // bytes to add to Avg
3721            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3722                                               // Avg for each Active byte
3723
3724            // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
3725            "psllq $16, %%mm6            \n\t" // shift the mm6 mask to cover
3726                                               // bytes 6 & 7
3727            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3728            "psllq $16, %%mm2            \n\t" // shift data to pos. correctly
3729            "addl $8, %%ecx              \n\t"
3730            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3731                                               // LBCarrys
3732            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3733                                               // where both
3734                                               // lsb's were == 1 (only valid
3735                                               // for active group)
3736            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3737            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3738                                               // byte
3739            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3740                                               // for each byte
3741            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 4
3742                                               // bytes to add to Avg
3743            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3744                                               // Avg for each Active byte
3745            "cmpl %%eax, %%ecx           \n\t" // MMXLength
3746            // now ready to write back to memory
3747            "movq %%mm0, -8(%1," PCX ",) \n\t"
3748            // prep Raw(x-bpp) for next loop
3749            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3750            "jb avg_2lp                  \n\t"
3751
3752            : "=S" (dummy_value_S),            // output regs (dummy)
3753              "=D" (dummy_value_D),
3754              "=c" (dummy_value_c),
3755              "=a" (dummy_value_a)
3756
3757            : "0" (prev_row),    // esi/rsi    // input regs
3758              "1" (row),         // edi/rdi
3759              "2" (diff),        // ecx
3760              "3" (MMXLength)    // eax
3761
3762#if defined(CLOBBER_MMX_REGS_SUPPORTED)
3763            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
3764            , "%mm4", "%mm5", "%mm6", "%mm7"
3765#endif
3766         );
3767      }
3768      break;  // end 2 bpp
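      // For reference (not compiled):  over the MMX region the loop above
      // computes, per byte,
      //
      //    for (x = diff; x < MMXLength; x++)
      //       row[x] = (png_byte)(row[x] +
      //                  ((row[x-2] + prev_row[x]) >> 1));
      //
      // The dependence on the freshly decoded row[x-2] is why each 8-byte
      // qword is handled as four 2-byte "active groups" instead of in one
      // shot.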
3769
3770      case 6:   // formerly shared with 4 bpp case (see comments there)
3771      {
3772//       _ShiftBpp = bpp << 3;        // 48 (psllq)
3773//       _ShiftRem = 64 - _ShiftBpp;  // 16 (psrlq)
3774
3775         __asm__ __volatile__ (
3776            LOAD_GOT_rbp
3777            "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
3778            "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
3779            // re-init address pointers and offset
3780// preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
3781                                                 // alignment boundary
3782            "movq " AMASK0_8_0 ", %%mm7    \n\t" // _amask0_8_0 -> mm7
3783            RESTORE_rbp
3784
3785            // ... and clear all bytes except for 1st active group
3786// preload  "movl  row, %1               \n\t" // edi:  Avg(x)
3787            "psrlq $16, %%mm7            \n\t"
3788// preload  "movl  prev_row, %0          \n\t" // esi:  Prior(x)
3789            "movq  %%mm7, %%mm6          \n\t"
3790            "psllq $48, %%mm6            \n\t" // mask for 2nd active group
3791
3792            // prime the pump:  load the first Raw(x-bpp) data set
3793            "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
3794                                             // (we correct pos. in loop below)
3795         "avg_6lp:                       \n\t"
3796            "movq (%1," PCX ",), %%mm0   \n\t"
3797            "psrlq $16, %%mm2            \n\t" // shift data to pos. correctly
3798            "movq (%0," PCX ",), %%mm1   \n\t"
3799            // add (Prev_row/2) to average
3800            "movq %%mm5, %%mm3           \n\t"
3801            "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3802            "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3803            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
3804                                               // byte
3805            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
3806                                               // each byte
3807            // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3808            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3809                                               // LBCarrys
3810            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3811                                               // where both
3812                              // lsb's were == 1 (only valid for active group)
3813            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3814            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3815                                               // byte
3816            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3817                                               // for each byte
3818            "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
3819                                               // bytes to add to Avg
3820            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
3821                                               // for each Active
3822                              // byte
3823            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3824            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3825            "psllq $48, %%mm2            \n\t" // shift data to pos. correctly
3826            "addl $8, %%ecx              \n\t"
3827            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3828                                               // LBCarrys
3829            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3830                                               // where both
3831                              // lsb's were == 1 (only valid for active group)
3832            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3833            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3834                                               // byte
3835            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3836                                               // for each byte
3837            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3838                                               // bytes to add to Avg
3839            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3840                                               // Avg for each Active byte
3841            "cmpl %%eax, %%ecx           \n\t" // MMXLength
3842            // now ready to write back to memory
3843            "movq %%mm0, -8(%1," PCX ",) \n\t"
3844            // prep Raw(x-bpp) for next loop
3845            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3846            "jb avg_6lp                  \n\t"
3847
3848            : "=S" (dummy_value_S),            // output regs (dummy)
3849              "=D" (dummy_value_D),
3850              "=c" (dummy_value_c),
3851              "=a" (dummy_value_a)
3852
3853            : "0" (prev_row),    // esi/rsi    // input regs
3854              "1" (row),         // edi/rdi
3855              "2" (diff),        // ecx
3856              "3" (MMXLength)    // eax
3857
3858#if defined(CLOBBER_MMX_REGS_SUPPORTED)
3859            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
3860            , "%mm4", "%mm5", "%mm6", "%mm7"
3861#endif
3862         );
3863      }
3864      break;  // end 6 bpp
3865
3866      case 8:
3867      {
3868         __asm__ __volatile__ (
3869            // re-init address pointers and offset
3870// preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
3871                                                 // alignment boundary
3872            LOAD_GOT_rbp
3873            "movq " LB_CARRY_MASK ", %%mm5 \n\t" // [interleave for parallel.?]
3874// preload  "movl  row, %1                 \n\t" // edi:  Avg(x)
3875            "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
3876// preload  "movl  prev_row, %0            \n\t" // esi:  Prior(x)
3877            RESTORE_rbp
3878
3879            // prime the pump:  load the first Raw(x-bpp) data set
3880            "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
3881                                      // (NO NEED to correct pos. in loop below)
3882
3883         "avg_8lp:                       \n\t"
3884            "movq (%1," PCX ",), %%mm0   \n\t"
3885            "movq %%mm5, %%mm3           \n\t"
3886            "movq (%0," PCX ",), %%mm1   \n\t"
3887            "addl $8, %%ecx              \n\t"
3888            "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3889            "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3890            "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
3891                                               //  where both lsb's were == 1
3892            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3893            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7, each byte
3894            "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg, each byte
3895            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7, each byte
3896            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg, each
3897            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
3898            "cmpl %%eax, %%ecx           \n\t" // MMXLength
3899            "movq %%mm0, -8(%1," PCX ",) \n\t"
3900            "movq %%mm0, %%mm2           \n\t" // reuse as Raw(x-bpp)
3901            "jb avg_8lp                  \n\t"
3902
3903            : "=S" (dummy_value_S),            // output regs (dummy)
3904              "=D" (dummy_value_D),
3905              "=c" (dummy_value_c),
3906              "=a" (dummy_value_a)
3907
3908            : "0" (prev_row),    // esi/rsi    // input regs
3909              "1" (row),         // edi/rdi
3910              "2" (diff),        // ecx
3911              "3" (MMXLength)    // eax
3912
3913#if defined(CLOBBER_MMX_REGS_SUPPORTED)
3914            : "%mm0", "%mm1", "%mm2"           // clobber list
3915            , "%mm3", "%mm4", "%mm5"
3916#endif
3917         );
3918      }
3919      break;  // end 8 bpp
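      // The 8-bpp loop above is the plainest form of the trick used by all
      // of the MMX Avg cases:  for unsigned bytes a and b,
      //
      //    (a + b) / 2  ==  (a >> 1) + (b >> 1) + (a & b & 1)
      //
      // so _LBCarryMask collects the carry term (both lsb's set),
      // _HBClearMask discards the bits shifted across byte boundaries by
      // psrlq, and the three terms are summed with paddb without ever
      // overflowing a byte.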
3920
3921      default:                // bpp != 1,2,3,4,6,8:  doesn't exist
3922      {
3923         // ERROR:  SHOULD NEVER BE REACHED
3924#if defined(PNG_DEBUG)
3925         png_debug(1, "Internal libpng logic error (GCC "
3926           "png_read_filter_row_mmx_avg())\n");
3927#endif
3928      }
3929      break;
3930
3931   } // end switch (bpp)
3932
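   // The asm below decodes the FullLength - MMXLength tail bytes that the
   // MMX loops above did not cover.  A plain-C sketch (reference only, not
   // compiled; same locals as this function):
   //
   //    for (x = MMXLength; x < FullLength; x++)
   //       row[x] = (png_byte)(row[x] +
   //                  ((row[x-bpp] + prev_row[x]) >> 1));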
3933   __asm__ __volatile__ (
3934      // MMX acceleration complete; now do clean-up
3935      // check if any remaining bytes left to decode
3936//pre "movl FullLength, %%edx      \n\t"
3937//pre "movl MMXLength, %%eax       \n\t" // eax:  x == offset bytes after MMX
3938//pre "movl row, %2                \n\t" // edi:  Avg(x)
3939      "cmpl %%edx, %%eax           \n\t" // test if offset at end of array
3940      "jnb avg_end                 \n\t"
3941
3942      SAVE_ebp
3943
3944      // do Avg decode for remaining bytes
3945//pre "movl prev_row, %1           \n\t" // esi:  Prior(x)
3946      "mov  %2, " PBP "            \n\t" // copy of row pointer...
3947//pre "subl bpp, " PBP "           \n\t" // (bpp is preloaded into ecx)
3948      "sub  " PCX "," PBP "        \n\t" // ebp:  Raw(x-bpp)
3949      "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
3950
3951      SAVE_GOT_ebx
3952
3953   "avg_lp2:                       \n\t"
3954      // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3955      "xorl %%ebx, %%ebx           \n\t"
3956      "movb (%1," PAX ",), %%cl    \n\t" // load cl with Prior(x)
3957      "movb (" PBP "," PAX ",), %%bl \n\t" // load bl with Raw(x-bpp)
3958      "addw %%cx, %%bx             \n\t"
3959      "incl %%eax                  \n\t"
3960      "shrw %%bx                   \n\t" // divide by 2
3961      "addb -1(%2," PAX ",), %%bl  \n\t" // add Avg(x); -1 to offset inc eax
3962      "cmpl %%edx, %%eax           \n\t" // check if at end of array
3963      "movb %%bl, -1(%2," PAX ",)  \n\t" // write back Raw(x) [mov does not
3964      "jb avg_lp2                  \n\t" //  affect flags; -1 to offset inc eax]
3965
3966      RESTORE_GOT_ebx
3967      RESTORE_ebp
3968
3969   "avg_end:                       \n\t"
3970      "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
3971
3972      : "=c" (dummy_value_c),            // output regs (dummy)
3973        "=S" (dummy_value_S),
3974        "=D" (dummy_value_D),
3975        "=a" (dummy_value_a),
3976        "=d" (dummy_value_d)
3977
3978      : "0" (bpp),         // ecx        // input regs
3979        "1" (prev_row),    // esi/rsi
3980        "2" (row),         // edi/rdi
3981        "3" (MMXLength),   // eax
3982        "4" (FullLength)   // edx
3983
3984      CLOB_COLON_ebx_ebp                 // clobber list
3985        CLOBBER_GOT_ebx
3986        CLOB_COMMA_ebx_ebp
3987        CLOBBER_ebp
3988   );
3989
3990} /* end png_read_filter_row_mmx_avg() */
3991
3992#endif /* PNG_MMX_READ_FILTER_AVG_SUPPORTED */
3993
3994
3995
3996#if defined(PNG_MMX_READ_FILTER_PAETH_SUPPORTED)
3997#if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
3998
3999//===========================================================================//
4000//                                                                           //
4001//         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         //
4002//                                                                           //
4003//===========================================================================//
4004
4005// Optimized code for PNG Paeth filter decoder
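// For reference, the Paeth predictor as defined by the PNG specification;
// the MMX code below evaluates it branch-free on several bytes at a time
// (sketch only, not compiled here):
//
//    static int paeth_predictor(int a, int b, int c) // left, above, upper-left
//    {
//       int p  = a + b - c;      // initial estimate
//       int pa = abs(p - a);     // distances to a, b, c
//       int pb = abs(p - b);
//       int pc = abs(p - c);
//       if (pa <= pb && pa <= pc) return a;
//       if (pb <= pc) return b;
//       return c;
//    }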
4006
4007static void /* PRIVATE */
4008png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
4009                              png_bytep prev_row)
4010{
4011   unsigned FullLength, MMXLength;  // png_uint_32 is actually 64-bit on x86-64
4012   int bpp;
4013   int dummy_value_a;
4014   int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
4015   int dummy_value_d;
4016   png_charp dummy_value_S;
4017   png_charp dummy_value_D;
4018   int diff; //     __attribute__((used));
4019
4020   bpp = (row_info->pixel_depth + 7) >> 3;  // calc number of bytes per pixel
4021   FullLength = row_info->rowbytes;         // number of bytes to filter
4022
4023   __asm__ __volatile__ (
4024      SAVE_GOT_ebx
4025      SAVE_r15
4026      SAVE_ebp
4027//pre "movl row, %2                \n\t" // edi/rdi
4028      "xorl %%ebx, %%ebx           \n\t" // ebx:  x offset
4029//pre "movl prev_row, %1           \n\t" // esi/rsi
4030      "xorl %%edx, %%edx           \n\t" // edx:  x-bpp offset
4031//pre "movl FullLength, %%eax      \n\t" // bring in via eax...
4032      SAVE_FullLength                    // ...but store for later use
4033      "xorl %%eax, %%eax           \n\t"
4034
4035      // Compute the Raw value for the first bpp bytes
4036      // Note: the formula always works out to be
4037      //   Raw(x) = Paeth(x) + Prior(x)      where x < bpp
4038   "paeth_rlp:                     \n\t"
4039      "movb (%2," PBX ",), %%al    \n\t"
4040      "addb (%1," PBX ",), %%al    \n\t"
4041      "incl %%ebx                  \n\t"
4042//pre "cmpl bpp, %%ebx             \n\t" (bpp is preloaded into ecx)
4043      "cmpl %%ecx, %%ebx           \n\t"
4044      "movb %%al, -1(%2," PBX ",)  \n\t"
4045      "jb paeth_rlp                \n\t"
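      // (in C, the loop above is simply
      //     for (x = 0; x < bpp; x++)
      //        row[x] = (png_byte)(row[x] + prev_row[x]);
      //  since a and c are zero for the first pixel)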
4046
4047      // get # of bytes to alignment (note:  computing _delta_ of two pointers,
4048      // so hereafter %%ebp is sufficient even on 64-bit)
4049      "mov  %2, " PBP "            \n\t" // take start of row
4050      "add  " PBX "," PBP "        \n\t" // add bpp
4051      "add  $0xf, " PBP "          \n\t" // add 7+8 to incr past alignment bdry
4052//    "andl $0xfffffff8, %%ebp     \n\t" // mask to alignment boundary (32-bit!)
4053      CLEAR_BOTTOM_3_BITS  PBP    "\n\t" // mask to alignment boundary
4054      "sub  %2, " PBP "            \n\t" // subtract row ptr again => ebp =
4055      "jz paeth_go                 \n\t" //  target value of ebx at alignment
4056
4057      "xorl %%ecx, %%ecx           \n\t"
4058
4059      SAVE_r11_r12_r13
4060
4061      // fix alignment
4062   "paeth_lp1:                     \n\t"
4063      "xorl %%eax, %%eax           \n\t"
4064      // pav = p - a = (a + b - c) - a = b - c
4065      "movb (%1," PBX ",), %%al    \n\t" // load Prior(x) into al
4066      "movb (%1," PDX ",), %%cl    \n\t" // load Prior(x-bpp) into cl
4067      "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4068      "movl %%eax, " pa_TEMP "     \n\t" // Save pav for later use
4069      "xorl %%eax, %%eax           \n\t"
4070      // pbv = p - b = (a + b - c) - b = a - c
4071      "movb (%2," PDX ",), %%al    \n\t" // load Raw(x-bpp) into al
4072      "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4073      "movl %%eax, %%ecx           \n\t"
4074      // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
4075      "addl " pa_TEMP ", %%eax     \n\t" // pcv = pav + pbv
4076      // pc = abs(pcv)
4077      "testl $0x80000000, %%eax    \n\t"
4078      "jz paeth_pca                \n\t"
4079      "negl %%eax                  \n\t" // reverse sign of neg values
4080
4081   "paeth_pca:                     \n\t"
4082      "movl %%eax, " pc_TEMP "     \n\t" // save pc for later use
4083      // pb = abs(pbv)
4084      "testl $0x80000000, %%ecx    \n\t"
4085      "jz paeth_pba                \n\t"
4086      "negl %%ecx                  \n\t" // reverse sign of neg values
4087
4088   "paeth_pba:                     \n\t"
4089      "movl %%ecx, " pb_TEMP "     \n\t" // save pb for later use
4090      // pa = abs(pav)
4091      "movl " pa_TEMP ", %%eax     \n\t"
4092      "testl $0x80000000, %%eax    \n\t"
4093      "jz paeth_paa                \n\t"
4094      "negl %%eax                  \n\t" // reverse sign of neg values
4095
4096   "paeth_paa:                     \n\t"
4097      "movl %%eax, " pa_TEMP "     \n\t" // save pa for later use
4098      // test if pa <= pb
4099      "cmpl %%ecx, %%eax           \n\t"
4100      "jna paeth_abb               \n\t"
4101      // pa > pb; now test if pb <= pc
4102      "cmpl " pc_TEMP ", %%ecx     \n\t"
4103      "jna paeth_bbc               \n\t"
4104      // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4105      "movb (%1," PDX ",), %%cl    \n\t" // load Prior(x-bpp) into cl
4106      "jmp paeth_paeth             \n\t"
4107
4108   "paeth_bbc:                     \n\t"
4109      // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4110      "movb (%1," PBX ",), %%cl    \n\t" // load Prior(x) into cl
4111      "jmp paeth_paeth             \n\t"
4112
4113   "paeth_abb:                     \n\t"
4114      // pa <= pb; now test if pa <= pc
4115      "cmpl " pc_TEMP ", %%eax     \n\t"
4116      "jna paeth_abc               \n\t"
4117      // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4118      "movb (%1," PDX ",), %%cl    \n\t" // load Prior(x-bpp) into cl
4119      "jmp paeth_paeth             \n\t"
4120
4121   "paeth_abc:                     \n\t"
4122      // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4123      "movb (%2," PDX ",), %%cl    \n\t" // load Raw(x-bpp) into cl
4124
4125   "paeth_paeth:                   \n\t"
4126      "incl %%ebx                  \n\t"
4127      "incl %%edx                  \n\t"
4128      // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4129      "addb %%cl, -1(%2," PBX ",)  \n\t"
4130      "cmpl %%ebp, %%ebx           \n\t"
4131      "jb paeth_lp1                \n\t"
4132
4133      RESTORE_r11_r12_r13
4134
4135   "paeth_go:                      \n\t"
4136      RESTORE_FullLength "%%ecx    \n\t" // FullLength -> ecx
4137      "movl %%ecx, %%eax           \n\t"
4138      "subl %%ebx, %%eax           \n\t" // subtract alignment fix
4139      "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
4140      "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
4141//out "movl %%ecx, MMXLength       \n\t"
4142      "movl %%ebp, %%eax           \n\t" // ebp = diff, but no reg constraint(?)
4143      RESTORE_ebp                        //  (could swap ebp and edx functions)
4144      RESTORE_r15
4145      RESTORE_GOT_ebx
4146
4147      : "=c" (MMXLength),                // output regs
4148        "=S" (dummy_value_S),
4149        "=D" (dummy_value_D),
4150        "=a" (diff)
4151
4152      : "0" (bpp),         // ecx        // input regs
4153        "1" (prev_row),    // esi/rsi
4154        "2" (row),         // edi/rdi
4155        "3" (FullLength)   // eax
4156
4157      : "%edx"                           // clobber list
4158        _CLOBBER_r11_r12_r13
4159        _CLOBBER_r15
4160        _CLOBBER_ebp
4161        _CLOBBER_GOT_ebx
4162   );
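   // Roughly, in C terms, the asm above has left
   //
   //    diff      = offset of the first 8-byte-aligned position at least
   //                bpp+8 bytes into the row
   //              = (int)((((size_t)row + bpp + 15) & ~7) - (size_t)row);
   //    MMXLength = FullLength - ((FullLength - diff) & 7);
   //
   // and row[0 .. diff-1] have already been Paeth-decoded by the scalar
   // code above.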
4163
4164   // now do the math for the rest of the row
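   // All of the MMX cases below pick the Paeth predictor without branches.
   // Per 16-bit lane the selection is, in effect (reference sketch, not
   // compiled):
   //
   //    mask = (pa > pb) ? ~0 : 0;            // pcmpgtw
   //    pab  = (pb & mask) + (pa & ~mask);    // pand/pandn/paddw merge
   //    ab   = ( b & mask) + ( a & ~mask);
   //    pred = (pab > pc) ? c : ab;           // second compare + merge
   //    Raw(x) = (png_byte)(Raw(x) + pred);
   //
   // and abs() uses the same masking idea:  with mask = (v < 0) ? ~0 : 0,
   // |v| == v - 2*(v & mask).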
4165   switch (bpp)
4166   {
4167      case 3:
4168      {
4169//       _ShiftBpp = 24;    // == bpp * 8
4170//       _ShiftRem = 40;    // == 64 - _ShiftBpp
4171
4172         __asm__ __volatile__ (
4173            LOAD_GOT_rbp
4174// preload  "movl diff, %%ecx            \n\t"
4175// preload  "movl row, %1                \n\t" // edi/rdi
4176// preload  "movl prev_row, %0           \n\t" // esi/rsi
4177            "pxor %%mm0, %%mm0           \n\t"
4178
4179            // prime the pump:  load the first Raw(x-bpp) data set
4180            "movq -8(%1," PCX ",), %%mm1 \n\t"
4181         "paeth_3lp:                     \n\t"
4182            "psrlq $40, %%mm1            \n\t" // shift last 3 bytes to 1st
4183                                               // 3 bytes
4184            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
4185            "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
4186            "movq -8(%0," PCX ",), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
4187            "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4188            "psrlq $40, %%mm3            \n\t" // shift last 3 bytes to 1st
4189                                               // 3 bytes
4190            // pav = p - a = (a + b - c) - a = b - c
4191            "movq %%mm2, %%mm4           \n\t"
4192            "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
4193            // pbv = p - b = (a + b - c) - b = a - c
4194            "movq %%mm1, %%mm5           \n\t"
4195            "psubw %%mm3, %%mm4          \n\t"
4196            "pxor %%mm7, %%mm7           \n\t"
4197            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4198            "movq %%mm4, %%mm6           \n\t"
4199            "psubw %%mm3, %%mm5          \n\t"
4200
4201            // pa = abs(p-a) = abs(pav)
4202            // pb = abs(p-b) = abs(pbv)
4203            // pc = abs(p-c) = abs(pcv)
4204            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4205            "paddw %%mm5, %%mm6          \n\t"
4206            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4207            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4208            "psubw %%mm0, %%mm4          \n\t"
4209            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4210            "psubw %%mm0, %%mm4          \n\t"
4211            "psubw %%mm7, %%mm5          \n\t"
4212            "pxor %%mm0, %%mm0           \n\t"
4213            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4214            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4215            "psubw %%mm7, %%mm5          \n\t"
4216            "psubw %%mm0, %%mm6          \n\t"
4217            //  test pa <= pb
4218            "movq %%mm4, %%mm7           \n\t"
4219            "psubw %%mm0, %%mm6          \n\t"
4220            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4221            "movq %%mm7, %%mm0           \n\t"
4222            // use mm7 mask to merge pa & pb
4223            "pand %%mm7, %%mm5           \n\t"
4224            // use mm0 mask copy to merge a & b
4225            "pand %%mm0, %%mm2           \n\t"
4226            "pandn %%mm4, %%mm7          \n\t"
4227            "pandn %%mm1, %%mm0          \n\t"
4228            "paddw %%mm5, %%mm7          \n\t"
4229            "paddw %%mm2, %%mm0          \n\t"
4230            //  test  ((pa <= pb)? pa:pb) <= pc
4231            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4232            "pxor %%mm1, %%mm1           \n\t"
4233            "pand %%mm7, %%mm3           \n\t"
4234            "pandn %%mm0, %%mm7          \n\t"
4235            "paddw %%mm3, %%mm7          \n\t"
4236            "pxor %%mm0, %%mm0           \n\t"
4237            "packuswb %%mm1, %%mm7       \n\t"
4238            "movq (%0," PCX ",), %%mm3   \n\t" // load c=Prior(x-bpp)
4239            "pand " AMASK5_3_0 ", %%mm7  \n\t" // _amask5_3_0 (was _ActiveMask)
4240            "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
4241            "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
4242            "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
4243            "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
4244            "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as
4245                                               // Raw(x-bpp)
4246            // now do Paeth for 2nd set of bytes (3-5)
4247            "psrlq $24, %%mm2            \n\t" // load b=Prior(x) step 2
4248            "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
4249            "pxor %%mm7, %%mm7           \n\t"
4250            "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4251            // pbv = p - b = (a + b - c) - b = a - c
4252            "movq %%mm1, %%mm5           \n\t"
4253            // pav = p - a = (a + b - c) - a = b - c
4254            "movq %%mm2, %%mm4           \n\t"
4255            "psubw %%mm3, %%mm5          \n\t"
4256            "psubw %%mm3, %%mm4          \n\t"
4257            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
4258            //       pav + pbv = pbv + pav
4259            "movq %%mm5, %%mm6           \n\t"
4260            "paddw %%mm4, %%mm6          \n\t"
4261
4262            // pa = abs(p-a) = abs(pav)
4263            // pb = abs(p-b) = abs(pbv)
4264            // pc = abs(p-c) = abs(pcv)
4265            "pcmpgtw %%mm5, %%mm0        \n\t" // create mask pbv bytes < 0
4266            "pcmpgtw %%mm4, %%mm7        \n\t" // create mask pav bytes < 0
4267            "pand %%mm5, %%mm0           \n\t" // only pbv bytes < 0 in mm0
4268            "pand %%mm4, %%mm7           \n\t" // only pav bytes < 0 in mm7
4269            "psubw %%mm0, %%mm5          \n\t"
4270            "psubw %%mm7, %%mm4          \n\t"
4271            "psubw %%mm0, %%mm5          \n\t"
4272            "psubw %%mm7, %%mm4          \n\t"
4273            "pxor %%mm0, %%mm0           \n\t"
4274            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4275            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4276            "psubw %%mm0, %%mm6          \n\t"
4277            //  test pa <= pb
4278            "movq %%mm4, %%mm7           \n\t"
4279            "psubw %%mm0, %%mm6          \n\t"
4280            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4281            "movq %%mm7, %%mm0           \n\t"
4282            // use mm7 mask to merge pa & pb
4283            "pand %%mm7, %%mm5           \n\t"
4284            // use mm0 mask copy to merge a & b
4285            "pand %%mm0, %%mm2           \n\t"
4286            "pandn %%mm4, %%mm7          \n\t"
4287            "pandn %%mm1, %%mm0          \n\t"
4288            "paddw %%mm5, %%mm7          \n\t"
4289            "paddw %%mm2, %%mm0          \n\t"
4290            //  test  ((pa <= pb)? pa:pb) <= pc
4291            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4292            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
4293            "pand %%mm7, %%mm3           \n\t"
4294            "pandn %%mm0, %%mm7          \n\t"
4295            "pxor %%mm1, %%mm1           \n\t"
4296            "paddw %%mm3, %%mm7          \n\t"
4297            "pxor %%mm0, %%mm0           \n\t"
4298            "packuswb %%mm1, %%mm7       \n\t"
4299            "movq %%mm2, %%mm3           \n\t" // load c=Prior(x-bpp) step 1
4300            "pand " AMASK5_3_0 ", %%mm7  \n\t" // _amask5_3_0 (was _ActiveMask)
4301            "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4302            "psllq $24, %%mm7            \n\t" // shift bytes to 2nd group of
4303                                               // 3 bytes
4304             // pav = p - a = (a + b - c) - a = b - c
4305            "movq %%mm2, %%mm4           \n\t"
4306            "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
4307            "psllq $24, %%mm3            \n\t" // load c=Prior(x-bpp) step 2
4308            "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
4309            "movq %%mm7, %%mm1           \n\t"
4310            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
4311            "psllq $24, %%mm1            \n\t" // shift bytes (was _ShiftBpp)
4312                                    // now mm1 will be used as Raw(x-bpp)
4313            // now do Paeth for 3rd, and final, set of bytes (6-7)
4314            "pxor %%mm7, %%mm7           \n\t"
4315            "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
4316            "psubw %%mm3, %%mm4          \n\t"
4317            // pbv = p - b = (a + b - c) - b = a - c
4318            "movq %%mm1, %%mm5           \n\t"
4319            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4320            "movq %%mm4, %%mm6           \n\t"
4321            "psubw %%mm3, %%mm5          \n\t"
4322            "pxor %%mm0, %%mm0           \n\t"
4323            "paddw %%mm5, %%mm6          \n\t"
4324
4325            // pa = abs(p-a) = abs(pav)
4326            // pb = abs(p-b) = abs(pbv)
4327            // pc = abs(p-c) = abs(pcv)
4328            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4329            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4330            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4331            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4332            "psubw %%mm0, %%mm4          \n\t"
4333            "psubw %%mm7, %%mm5          \n\t"
4334            "psubw %%mm0, %%mm4          \n\t"
4335            "psubw %%mm7, %%mm5          \n\t"
4336            "pxor %%mm0, %%mm0           \n\t"
4337            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4338            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4339            "psubw %%mm0, %%mm6          \n\t"
4340            //  test pa <= pb
4341            "movq %%mm4, %%mm7           \n\t"
4342            "psubw %%mm0, %%mm6          \n\t"
4343            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4344            "movq %%mm7, %%mm0           \n\t"
4345            // use mm0 mask copy to merge a & b
4346            "pand %%mm0, %%mm2           \n\t"
4347            // use mm7 mask to merge pa & pb
4348            "pand %%mm7, %%mm5           \n\t"
4349            "pandn %%mm1, %%mm0          \n\t"
4350            "pandn %%mm4, %%mm7          \n\t"
4351            "paddw %%mm2, %%mm0          \n\t"
4352            "paddw %%mm5, %%mm7          \n\t"
4353            //  test  ((pa <= pb)? pa:pb) <= pc
4354            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4355            "pand %%mm7, %%mm3           \n\t"
4356            "pandn %%mm0, %%mm7          \n\t"
4357            "paddw %%mm3, %%mm7          \n\t"
4358            "pxor %%mm1, %%mm1           \n\t"
4359            "packuswb %%mm7, %%mm1       \n\t"
4360            // step ecx to next set of 8 bytes and repeat loop til done
4361            "addl $8, %%ecx              \n\t"
4362            "pand " AMASK0_2_6 ", %%mm1  \n\t" // _amask0_2_6 (_ActiveMaskEnd)
4363            "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
4364            "cmpl %%eax, %%ecx           \n\t" // MMXLength
4365            "pxor %%mm0, %%mm0           \n\t" // pxor does not affect flags
4366            "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
4367                                 // mm1 will be used as Raw(x-bpp) next loop
4368                           // mm3 ready to be used as Prior(x-bpp) next loop
4369            "jb paeth_3lp                \n\t"
4370            RESTORE_rbp
4371
4372            : "=S" (dummy_value_S),            // output regs (dummy)
4373              "=D" (dummy_value_D),
4374              "=c" (dummy_value_c),
4375              "=a" (dummy_value_a)
4376
4377            : "0" (prev_row),  // esi/rsi      // input regs
4378              "1" (row),       // edi/rdi
4379              "2" (diff),      // ecx
4380              "3" (MMXLength)  // eax
4381
4382#if defined(CLOBBER_MMX_REGS_SUPPORTED)
4383            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
4384            , "%mm4", "%mm5", "%mm6", "%mm7"
4385#endif
4386         );
4387      }
4388      break;  // end 3 bpp
4389
4390      case 4:
4391      {
4392         __asm__ __volatile__ (
4393// preload  "movl diff, %%ecx            \n\t"
4394// preload  "movl row, %1                \n\t" // edi/rdi
4395// preload  "movl prev_row, %0           \n\t" // esi/rsi
4396            "pxor %%mm0, %%mm0           \n\t"
4397            // prime the pump:  load the first Raw(x-bpp) data set
4398            "movq -8(%1," PCX ",), %%mm1 \n\t" // only time we should need
4399                                               //  to read a=Raw(x-bpp) bytes
4400         "paeth_4lp:                     \n\t"
4401            // do first set of 4 bytes
4402            "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4403            "punpckhbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
4404            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
4405            "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4406            // pav = p - a = (a + b - c) - a = b - c
4407            "movq %%mm2, %%mm4           \n\t"
4408            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
4409            // pbv = p - b = (a + b - c) - b = a - c
4410            "movq %%mm1, %%mm5           \n\t"
4411            "psubw %%mm3, %%mm4          \n\t"
4412            "pxor %%mm7, %%mm7           \n\t"
4413            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4414            "movq %%mm4, %%mm6           \n\t"
4415            "psubw %%mm3, %%mm5          \n\t"
4416            // pa = abs(p-a) = abs(pav)
4417            // pb = abs(p-b) = abs(pbv)
4418            // pc = abs(p-c) = abs(pcv)
4419            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4420            "paddw %%mm5, %%mm6          \n\t"
4421            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4422            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4423            "psubw %%mm0, %%mm4          \n\t"
4424            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4425            "psubw %%mm0, %%mm4          \n\t"
4426            "psubw %%mm7, %%mm5          \n\t"
4427            "pxor %%mm0, %%mm0           \n\t"
4428            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4429            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4430            "psubw %%mm7, %%mm5          \n\t"
4431            "psubw %%mm0, %%mm6          \n\t"
4432            //  test pa <= pb
4433            "movq %%mm4, %%mm7           \n\t"
4434            "psubw %%mm0, %%mm6          \n\t"
4435            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4436            "movq %%mm7, %%mm0           \n\t"
4437            // use mm7 mask to merge pa & pb
4438            "pand %%mm7, %%mm5           \n\t"
4439            // use mm0 mask copy to merge a & b
4440            "pand %%mm0, %%mm2           \n\t"
4441            "pandn %%mm4, %%mm7          \n\t"
4442            "pandn %%mm1, %%mm0          \n\t"
4443            "paddw %%mm5, %%mm7          \n\t"
4444            "paddw %%mm2, %%mm0          \n\t"
4445            //  test  ((pa <= pb)? pa:pb) <= pc
4446            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4447            "pxor %%mm1, %%mm1           \n\t"
4448            "pand %%mm7, %%mm3           \n\t"
4449            "pandn %%mm0, %%mm7          \n\t"
4450            "paddw %%mm3, %%mm7          \n\t"
4451            "pxor %%mm0, %%mm0           \n\t"
4452            "packuswb %%mm1, %%mm7       \n\t"
4453            "movq (%0," PCX ",), %%mm3   \n\t" // load c=Prior(x-bpp)
4454            LOAD_GOT_rbp
4455            "pand " AMASK4_4_0 ", %%mm7  \n\t" // _amask4_4_0 (was _ActiveMask)
4456            RESTORE_rbp
4457            "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
4458            "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
4459            "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
4460            "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
4461            "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as
4462                                               // Raw(x-bpp)
4463            // do second set of 4 bytes
4464            "punpckhbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
4465            "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
4466            // pav = p - a = (a + b - c) - a = b - c
4467            "movq %%mm2, %%mm4           \n\t"
4468            // pbv = p - b = (a + b - c) - b = a - c
4469            "movq %%mm1, %%mm5           \n\t"
4470            "psubw %%mm3, %%mm4          \n\t"
4471            "pxor %%mm7, %%mm7           \n\t"
4472            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4473            "movq %%mm4, %%mm6           \n\t"
4474            "psubw %%mm3, %%mm5          \n\t"
4475            // pa = abs(p-a) = abs(pav)
4476            // pb = abs(p-b) = abs(pbv)
4477            // pc = abs(p-c) = abs(pcv)
4478            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4479            "paddw %%mm5, %%mm6          \n\t"
4480            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4481            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4482            "psubw %%mm0, %%mm4          \n\t"
4483            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4484            "psubw %%mm0, %%mm4          \n\t"
4485            "psubw %%mm7, %%mm5          \n\t"
4486            "pxor %%mm0, %%mm0           \n\t"
4487            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4488            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4489            "psubw %%mm7, %%mm5          \n\t"
4490            "psubw %%mm0, %%mm6          \n\t"
4491            //  test pa <= pb
4492            "movq %%mm4, %%mm7           \n\t"
4493            "psubw %%mm0, %%mm6          \n\t"
4494            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4495            "movq %%mm7, %%mm0           \n\t"
4496            // use mm7 mask to merge pa & pb
4497            "pand %%mm7, %%mm5           \n\t"
4498            // use mm0 mask copy to merge a & b
4499            "pand %%mm0, %%mm2           \n\t"
4500            "pandn %%mm4, %%mm7          \n\t"
4501            "pandn %%mm1, %%mm0          \n\t"
4502            "paddw %%mm5, %%mm7          \n\t"
4503            "paddw %%mm2, %%mm0          \n\t"
4504            //  test  ((pa <= pb)? pa:pb) <= pc
4505            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4506            "pxor %%mm1, %%mm1           \n\t"
4507            "pand %%mm7, %%mm3           \n\t"
4508            "pandn %%mm0, %%mm7          \n\t"
4509            "pxor %%mm1, %%mm1           \n\t"
4510            "paddw %%mm3, %%mm7          \n\t"
4511            "pxor %%mm0, %%mm0           \n\t"
4512            // step ecx to next set of 8 bytes and repeat loop til done
4513            "addl $8, %%ecx              \n\t"
4514            "packuswb %%mm7, %%mm1       \n\t"
4515            "paddb -8(%1," PCX ",), %%mm1 \n\t" // add predictor with Raw(x)
4516            "cmpl %%eax, %%ecx           \n\t" // MMXLength
4517            "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
4518                                 // mm1 will be used as Raw(x-bpp) next loop
4519            "jb paeth_4lp                \n\t"
4520
4521            : "=S" (dummy_value_S),            // output regs (dummy)
4522              "=D" (dummy_value_D),
4523              "=c" (dummy_value_c),
4524              "=a" (dummy_value_a)
4525
4526            : "0" (prev_row),  // esi/rsi      // input regs
4527              "1" (row),       // edi/rdi
4528              "2" (diff),      // ecx
4529              "3" (MMXLength)  // eax
4530
4531#if defined(CLOBBER_MMX_REGS_SUPPORTED)
4532            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
4533            , "%mm4", "%mm5", "%mm6", "%mm7"
4534#endif
4535         );
4536      }
4537      break;  // end 4 bpp
4538
4539      case 1:
4540      case 2:
4541      {
4542         __asm__ __volatile__ (
4543// preload  "movl diff, %%eax            \n\t" // eax: x = offset to align. bdry
4544// preload  "movl FullLength, %%edx      \n\t"
4545            "cmpl %%edx, %%eax           \n\t"
4546            "jnb paeth_dend              \n\t"
4547
4548            SAVE_ebp
4549
4550// preload  "movl row, %2                \n\t" // edi/rdi
4551            // do Paeth decode for remaining bytes
4552// preload  "movl prev_row, %1           \n\t" // esi/rsi
4553            "movl %%eax, %%ebp           \n\t"
4554// preload  "subl bpp, %%ebp             \n\t" // (bpp is preloaded into ecx)
4555            "subl %%ecx, %%ebp           \n\t" // ebp = eax - bpp
4556            "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
4557
4558            SAVE_GOT_ebx
4559            SAVE_r11_r12_r13
4560
4561         "paeth_dlp:                     \n\t"
4562            "xorl %%ebx, %%ebx           \n\t"
4563            // pav = p - a = (a + b - c) - a = b - c
4564            "movb (%1," PAX ",), %%bl    \n\t" // load Prior(x) into bl
4565            "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
4566            "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
4567            "movl %%ebx, " pa_TEMP "     \n\t" // Save pav for later use
4568            "xorl %%ebx, %%ebx           \n\t"
4569            // pbv = p - b = (a + b - c) - b = a - c
4570            "movb (%2," PBP ",), %%bl    \n\t" // load Raw(x-bpp) into bl
4571            "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
4572            "movl %%ebx, %%ecx           \n\t"
4573            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4574            "addl " pa_TEMP ", %%ebx     \n\t" // pcv = pav + pbv
4575            // pc = abs(pcv)
4576            "testl $0x80000000, %%ebx    \n\t"
4577            "jz paeth_dpca               \n\t"
4578            "negl %%ebx                  \n\t" // reverse sign of neg values
4579
4580         "paeth_dpca:                    \n\t"
4581            "movl %%ebx, " pc_TEMP "     \n\t" // save pc for later use
4582            // pb = abs(pbv)
4583            "testl $0x80000000, %%ecx    \n\t"
4584            "jz paeth_dpba               \n\t"
4585            "negl %%ecx                  \n\t" // reverse sign of neg values
4586
4587         "paeth_dpba:                    \n\t"
4588            "movl %%ecx, " pb_TEMP "     \n\t" // save pb for later use
4589            // pa = abs(pav)
4590            "movl " pa_TEMP ", %%ebx     \n\t"
4591            "testl $0x80000000, %%ebx    \n\t"
4592            "jz paeth_dpaa               \n\t"
4593            "negl %%ebx                  \n\t" // reverse sign of neg values
4594
4595         "paeth_dpaa:                    \n\t"
4596            "movl %%ebx, " pa_TEMP "     \n\t" // save pa for later use
4597            // test if pa <= pb
4598            "cmpl %%ecx, %%ebx           \n\t"
4599            "jna paeth_dabb              \n\t"
4600            // pa > pb; now test if pb <= pc
4601            "cmpl " pc_TEMP ", %%ecx     \n\t"
4602            "jna paeth_dbbc              \n\t"
4603            // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4604            "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
4605            "jmp paeth_dpaeth            \n\t"
4606
4607         "paeth_dbbc:                    \n\t"
4608            // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4609            "movb (%1," PAX ",), %%cl    \n\t" // load Prior(x) into cl
4610            "jmp paeth_dpaeth            \n\t"
4611
4612         "paeth_dabb:                    \n\t"
4613            // pa <= pb; now test if pa <= pc
4614            "cmpl " pc_TEMP ", %%ebx     \n\t"
4615            "jna paeth_dabc              \n\t"
4616            // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4617            "movb (%1," PBP ",), %%cl   \n\t" // load Prior(x-bpp) into cl
4618            "jmp paeth_dpaeth            \n\t"
4619
4620         "paeth_dabc:                    \n\t"
4621            // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4622            "movb (%2," PBP ",), %%cl    \n\t" // load Raw(x-bpp) into cl
4623
4624         "paeth_dpaeth:                  \n\t"
4625            "incl %%eax                  \n\t"
4626            "incl %%ebp                  \n\t"
4627            // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4628            "addb %%cl, -1(%2," PAX ",)  \n\t"
4629            "cmpl %%edx, %%eax           \n\t" // check against FullLength
4630            "jb paeth_dlp                \n\t"
4631
4632            RESTORE_r11_r12_r13
4633            RESTORE_GOT_ebx
4634            RESTORE_ebp
4635
4636         "paeth_dend:                    \n\t"
4637
4638            : "=c" (dummy_value_c),            // output regs (dummy)
4639              "=S" (dummy_value_S),
4640              "=D" (dummy_value_D),
4641              "=a" (dummy_value_a),
4642              "=d" (dummy_value_d)
4643
4644            : "0" (bpp),         // ecx        // input regs
4645              "1" (prev_row),    // esi/rsi
4646              "2" (row),         // edi/rdi
4647              "3" (diff),        // eax
4648              "4" (FullLength)   // edx
4649
4650            CLOB_COLON_ebx_ebp_r1X             // clobber list
4651              CLOBBER_GOT_ebx
4652              CLOB_COMMA_ebx_ebp
4653              CLOBBER_ebp
4654              CLOB_COMMA_ebX_r1X
4655              CLOBBER_r11_r12_r13
4656         );
4657      }
4658      return; // end 1 or 2 bpp (no need to go further with this one)
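      // For reference (not compiled):  per byte, the scalar loop above is
      //
      //    for (x = diff; x < FullLength; x++)
      //       row[x] = (png_byte)(row[x] +
      //                  paeth_predictor(row[x-bpp], prev_row[x],
      //                                  prev_row[x-bpp]));
      //
      // with paeth_predictor() as sketched above this function.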
4659
4660      case 6:
4661      {
4662//       _ActiveMask2 = 0xffffffff00000000LL;  // NOT USED ("_amask_0_4_4")
4663//       _ShiftBpp = 48;       // bpp << 3 == bpp * 8
4664//       _ShiftRem = 16;       // 64 - _ShiftBpp
4665
4666         __asm__ __volatile__ (
4667// preload  "movl diff, %%ecx            \n\t"
4668// preload  "movl row, %1                \n\t" // edi/rdi
4669// preload  "movl prev_row, %0           \n\t" // esi/rsi
4670            // prime the pump:  load the first Raw(x-bpp) data set
4671            "movq -8(%1," PCX ",), %%mm1 \n\t"
4672            "pxor %%mm0, %%mm0           \n\t"
4673
4674         "paeth_6lp:                     \n\t"
4675            // must shift to position Raw(x-bpp) data
4676            "psrlq $16, %%mm1            \n\t" // was _ShiftRem
4677            // do first set of 4 bytes
4678            "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4679            "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
4680            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
4681            "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
4682            // must shift to position Prior(x-bpp) data
4683            "psrlq $16, %%mm3            \n\t" // was _ShiftRem
4684            // pav = p - a = (a + b - c) - a = b - c
4685            "movq %%mm2, %%mm4           \n\t"
4686            "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
4687            // pbv = p - b = (a + b - c) - b = a - c
4688            "movq %%mm1, %%mm5           \n\t"
4689            "psubw %%mm3, %%mm4          \n\t"
4690            "pxor %%mm7, %%mm7           \n\t"
4691            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4692            "movq %%mm4, %%mm6           \n\t"
4693            "psubw %%mm3, %%mm5          \n\t"
4694            // pa = abs(p-a) = abs(pav)
4695            // pb = abs(p-b) = abs(pbv)
4696            // pc = abs(p-c) = abs(pcv)
4697            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4698            "paddw %%mm5, %%mm6          \n\t"
4699            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4700            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4701            "psubw %%mm0, %%mm4          \n\t"
4702            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4703            "psubw %%mm0, %%mm4          \n\t"
4704            "psubw %%mm7, %%mm5          \n\t"
4705            "pxor %%mm0, %%mm0           \n\t"
4706            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4707            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4708            "psubw %%mm7, %%mm5          \n\t"
4709            "psubw %%mm0, %%mm6          \n\t"
4710            //  test pa <= pb
4711            "movq %%mm4, %%mm7           \n\t"
4712            "psubw %%mm0, %%mm6          \n\t"
4713            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4714            "movq %%mm7, %%mm0           \n\t"
4715            // use mm7 mask to merge pa & pb
4716            "pand %%mm7, %%mm5           \n\t"
4717            // use mm0 mask copy to merge a & b
4718            "pand %%mm0, %%mm2           \n\t"
4719            "pandn %%mm4, %%mm7          \n\t"
4720            "pandn %%mm1, %%mm0          \n\t"
4721            "paddw %%mm5, %%mm7          \n\t"
4722            "paddw %%mm2, %%mm0          \n\t"
4723            //  test  ((pa <= pb)? pa:pb) <= pc
4724            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4725            "pxor %%mm1, %%mm1           \n\t"
4726            "pand %%mm7, %%mm3           \n\t"
4727            "pandn %%mm0, %%mm7          \n\t"
4728            "paddw %%mm3, %%mm7          \n\t"
4729            "pxor %%mm0, %%mm0           \n\t"
4730            "packuswb %%mm1, %%mm7       \n\t"
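            // pack the four 16-bit predictors down to bytes; the upper half
            //  of mm7 is filled from mm1, which was just zeroed above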
4731            "movq -8(%0," PCX ",), %%mm3 \n\t" // load c=Prior(x-bpp)
4732            LOAD_GOT_rbp
4733            "pand " AMASK4_4_0 ", %%mm7  \n\t" // _amask4_4_0 (was _ActiveMask)
4734            RESTORE_rbp
4735            "psrlq $16, %%mm3            \n\t"
4736            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x) step 1
4737            "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
4738            "movq %%mm2, %%mm6           \n\t"
4739            "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
4740            "movq -8(%1," PCX ",), %%mm1 \n\t"
4741            "psllq $48, %%mm6            \n\t" // bpp * 8 = bits per pixel
4742            "movq %%mm7, %%mm5           \n\t"
4743            "psrlq $16, %%mm1            \n\t" // 64 - (bpp * 8) = remainder
4744            "por %%mm6, %%mm3            \n\t"
4745            "psllq $48, %%mm5            \n\t" // was _ShiftBpp
4746            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
4747            "por %%mm5, %%mm1            \n\t"
4748            // do second set of 4 bytes
4749            "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4750            "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
4751            // pav = p - a = (a + b - c) - a = b - c
4752            "movq %%mm2, %%mm4           \n\t"
4753            // pbv = p - b = (a + b - c) - b = a - c
4754            "movq %%mm1, %%mm5           \n\t"
4755            "psubw %%mm3, %%mm4          \n\t"
4756            "pxor %%mm7, %%mm7           \n\t"
4757            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4758            "movq %%mm4, %%mm6           \n\t"
4759            "psubw %%mm3, %%mm5          \n\t"
4760            // pa = abs(p-a) = abs(pav)
4761            // pb = abs(p-b) = abs(pbv)
4762            // pc = abs(p-c) = abs(pcv)
4763            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4764            "paddw %%mm5, %%mm6          \n\t"
4765            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4766            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4767            "psubw %%mm0, %%mm4          \n\t"
4768            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4769            "psubw %%mm0, %%mm4          \n\t"
4770            "psubw %%mm7, %%mm5          \n\t"
4771            "pxor %%mm0, %%mm0           \n\t"
4772            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4773            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4774            "psubw %%mm7, %%mm5          \n\t"
4775            "psubw %%mm0, %%mm6          \n\t"
4776            //  test pa <= pb
4777            "movq %%mm4, %%mm7           \n\t"
4778            "psubw %%mm0, %%mm6          \n\t"
4779            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4780            "movq %%mm7, %%mm0           \n\t"
4781            // use mm7 mask to merge pa & pb
4782            "pand %%mm7, %%mm5           \n\t"
4783            // use mm0 mask copy to merge a & b
4784            "pand %%mm0, %%mm2           \n\t"
4785            "pandn %%mm4, %%mm7          \n\t"
4786            "pandn %%mm1, %%mm0          \n\t"
4787            "paddw %%mm5, %%mm7          \n\t"
4788            "paddw %%mm2, %%mm0          \n\t"
4789            //  test  ((pa <= pb)? pa:pb) <= pc
4790            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4791            "pxor %%mm1, %%mm1           \n\t"
4792            "pand %%mm7, %%mm3           \n\t"
4793            "pandn %%mm0, %%mm7          \n\t"
4794            "pxor %%mm1, %%mm1           \n\t"
4795            "paddw %%mm3, %%mm7          \n\t"
4796            "pxor %%mm0, %%mm0           \n\t"
4797            // step ecx to next set of 8 bytes and repeat loop til done
4798            "addl $8, %%ecx              \n\t"
4799            "packuswb %%mm7, %%mm1       \n\t"
4800            "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
4801            "cmpl %%eax, %%ecx           \n\t" // MMXLength
4802            "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
4803                                 // mm1 will be used as Raw(x-bpp) next loop
4804            "jb paeth_6lp                \n\t"
4805
4806            : "=S" (dummy_value_S),            // output regs (dummy)
4807              "=D" (dummy_value_D),
4808              "=c" (dummy_value_c),
4809              "=a" (dummy_value_a)
4810
4811            : "0" (prev_row),  // esi/rsi      // input regs
4812              "1" (row),       // edi/rdi
4813              "2" (diff),      // ecx
4814              "3" (MMXLength)  // eax
4815
4816#if defined(CLOBBER_MMX_REGS_SUPPORTED)
4817            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
4818            , "%mm4", "%mm5", "%mm6", "%mm7"
4819#endif
4820         );
4821      }
4822      break;  // end 6 bpp
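
      // The pcmpgtw / pand / double-psubw sequences above (and again in the
      // 8-bpp case below) compute abs() on each signed 16-bit lane without
      // branching.  Scalar reference sketch, not compiled; the value -42
      // stands in for any one lane of pav, pbv or pcv:
#if 0
      {
         short v = -42;
         short mask = (short)((0 > v) ? -1 : 0);  // pcmpgtw:  all-ones if v < 0
         v = (short)(v - 2 * (v & mask));         // pand + psubw twice => 42
      }
#endif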
4823
4824      case 8:                          // bpp == 8
4825      {
4826         __asm__ __volatile__ (
4827// preload  "movl diff, %%ecx            \n\t"
4828// preload  "movl row, %1                \n\t" // edi/rdi
4829// preload  "movl prev_row, %0           \n\t" // esi/rsi
4830            "pxor %%mm0, %%mm0           \n\t"
4831            // prime the pump:  load the first Raw(x-bpp) data set
4832            "movq -8(%1," PCX ",), %%mm1 \n\t" // only time we should need to read
4833                                               //  a=Raw(x-bpp) bytes
4834         "paeth_8lp:                     \n\t"
4835            // do first set of 4 bytes
4836            "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4837            "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
4838            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
4839            "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
4840            // pav = p - a = (a + b - c) - a = b - c
4841            "movq %%mm2, %%mm4           \n\t"
4842            "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
4843            // pbv = p - b = (a + b - c) - b = a - c
4844            "movq %%mm1, %%mm5           \n\t"
4845            "psubw %%mm3, %%mm4          \n\t"
4846            "pxor %%mm7, %%mm7           \n\t"
4847            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4848            "movq %%mm4, %%mm6           \n\t"
4849            "psubw %%mm3, %%mm5          \n\t"
4850            // pa = abs(p-a) = abs(pav)
4851            // pb = abs(p-b) = abs(pbv)
4852            // pc = abs(p-c) = abs(pcv)
4853            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4854            "paddw %%mm5, %%mm6          \n\t"
4855            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4856            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4857            "psubw %%mm0, %%mm4          \n\t"
4858            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4859            "psubw %%mm0, %%mm4          \n\t"
4860            "psubw %%mm7, %%mm5          \n\t"
4861            "pxor %%mm0, %%mm0           \n\t"
4862            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4863            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4864            "psubw %%mm7, %%mm5          \n\t"
4865            "psubw %%mm0, %%mm6          \n\t"
4866            //  test pa <= pb
4867            "movq %%mm4, %%mm7           \n\t"
4868            "psubw %%mm0, %%mm6          \n\t"
4869            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4870            "movq %%mm7, %%mm0           \n\t"
4871            // use mm7 mask to merge pa & pb
4872            "pand %%mm7, %%mm5           \n\t"
4873            // use mm0 mask copy to merge a & b
4874            "pand %%mm0, %%mm2           \n\t"
4875            "pandn %%mm4, %%mm7          \n\t"
4876            "pandn %%mm1, %%mm0          \n\t"
4877            "paddw %%mm5, %%mm7          \n\t"
4878            "paddw %%mm2, %%mm0          \n\t"
4879            //  test  ((pa <= pb)? pa:pb) <= pc
4880            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4881            "pxor %%mm1, %%mm1           \n\t"
4882            "pand %%mm7, %%mm3           \n\t"
4883            "pandn %%mm0, %%mm7          \n\t"
4884            "paddw %%mm3, %%mm7          \n\t"
4885            "pxor %%mm0, %%mm0           \n\t"
4886            "packuswb %%mm1, %%mm7       \n\t"
4887            "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4888            LOAD_GOT_rbp
4889            "pand " AMASK4_4_0 ", %%mm7  \n\t" // _amask4_4_0 (was _ActiveMask)
4890            RESTORE_rbp
4891            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
4892            "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
4893            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
4894            "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
4895            "movq -8(%1," PCX ",), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
4896
4897            // do second set of 4 bytes
4898            "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4899            "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
4900            // pav = p - a = (a + b - c) - a = b - c
4901            "movq %%mm2, %%mm4           \n\t"
4902            // pbv = p - b = (a + b - c) - b = a - c
4903            "movq %%mm1, %%mm5           \n\t"
4904            "psubw %%mm3, %%mm4          \n\t"
4905            "pxor %%mm7, %%mm7           \n\t"
4906            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4907            "movq %%mm4, %%mm6           \n\t"
4908            "psubw %%mm3, %%mm5          \n\t"
4909            // pa = abs(p-a) = abs(pav)
4910            // pb = abs(p-b) = abs(pbv)
4911            // pc = abs(p-c) = abs(pcv)
4912            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4913            "paddw %%mm5, %%mm6          \n\t"
4914            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4915            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4916            "psubw %%mm0, %%mm4          \n\t"
4917            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4918            "psubw %%mm0, %%mm4          \n\t"
4919            "psubw %%mm7, %%mm5          \n\t"
4920            "pxor %%mm0, %%mm0           \n\t"
4921            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4922            "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4923            "psubw %%mm7, %%mm5          \n\t"
4924            "psubw %%mm0, %%mm6          \n\t"
4925            //  test pa <= pb
4926            "movq %%mm4, %%mm7           \n\t"
4927            "psubw %%mm0, %%mm6          \n\t"
4928            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4929            "movq %%mm7, %%mm0           \n\t"
4930            // use mm7 mask to merge pa & pb
4931            "pand %%mm7, %%mm5           \n\t"
4932            // use mm0 mask copy to merge a & b
4933            "pand %%mm0, %%mm2           \n\t"
4934            "pandn %%mm4, %%mm7          \n\t"
4935            "pandn %%mm1, %%mm0          \n\t"
4936            "paddw %%mm5, %%mm7          \n\t"
4937            "paddw %%mm2, %%mm0          \n\t"
4938            //  test  ((pa <= pb)? pa:pb) <= pc
4939            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4940            "pxor %%mm1, %%mm1           \n\t"
4941            "pand %%mm7, %%mm3           \n\t"
4942            "pandn %%mm0, %%mm7          \n\t"
4943            "pxor %%mm1, %%mm1           \n\t"
4944            "paddw %%mm3, %%mm7          \n\t"
4945            "pxor %%mm0, %%mm0           \n\t"
4946            // step ecx to next set of 8 bytes and repeat loop til done
4947            "addl $8, %%ecx              \n\t"
4948            "packuswb %%mm7, %%mm1       \n\t"
4949            "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
4950            "cmpl %%eax, %%ecx           \n\t" // MMXLength
4951            "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
4952                                 // mm1 will be used as Raw(x-bpp) next loop
4953            "jb paeth_8lp                \n\t"
4954
4955            : "=S" (dummy_value_S),            // output regs (dummy)
4956              "=D" (dummy_value_D),
4957              "=c" (dummy_value_c),
4958              "=a" (dummy_value_a)
4959
4960            : "0" (prev_row),  // esi/rsi      // input regs
4961              "1" (row),       // edi/rdi
4962              "2" (diff),      // ecx
4963              "3" (MMXLength)  // eax
4964
4965#if defined(CLOBBER_MMX_REGS_SUPPORTED)
4966            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
4967            , "%mm4", "%mm5", "%mm6", "%mm7"
4968#endif
4969         );
4970      }
4971      break;  // end 8 bpp
4972
4973      default:                // bpp != 1,2,3,4,6,8:  doesn't exist
4974      {
4975         // ERROR:  SHOULD NEVER BE REACHED
4976#if defined(PNG_DEBUG)
4977         png_debug(1, "Internal libpng logic error (GCC "
4978           "png_read_filter_row_mmx_paeth())\n");
4979#endif
4980      }
4981      break;
4982
4983   } // end switch (bpp)
4984
4985   __asm__ __volatile__ (
4986      // MMX acceleration complete; now do clean-up
4987      // check if any remaining bytes left to decode
4988//pre "movl FullLength, %%edx      \n\t"
4989//pre "movl MMXLength, %%eax       \n\t"
4990      "cmpl %%edx, %%eax           \n\t"
4991      "jnb paeth_end               \n\t"
4992
4993      SAVE_ebp
4994
4995//pre "movl row, %2                \n\t" // edi/rdi
4996//pre "movl prev_row, %1           \n\t" // esi/rsi
4997      // do Paeth decode for remaining bytes
4998      "movl %%eax, %%ebp           \n\t"
4999//pre "subl bpp, %%ebp             \n\t" // (bpp is preloaded into ecx)
5000      "subl %%ecx, %%ebp           \n\t" // ebp = eax - bpp
5001      "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
5002
5003      SAVE_GOT_ebx
5004      SAVE_r11_r12_r13
5005
5006   "paeth_lp2:                     \n\t"
5007      "xorl %%ebx, %%ebx           \n\t"
5008      // pav = p - a = (a + b - c) - a = b - c
5009      "movb (%1," PAX ",), %%bl    \n\t" // load Prior(x) into bl
5010      "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
5011      "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
5012      "movl %%ebx, " pa_TEMP "     \n\t" // Save pav for later use
5013      "xorl %%ebx, %%ebx           \n\t"
5014      // pbv = p - b = (a + b - c) - b = a - c
5015      "movb (%2," PBP ",), %%bl    \n\t" // load Raw(x-bpp) into bl
5016      "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
5017      "movl %%ebx, %%ecx           \n\t"
5018      // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
5019      "addl " pa_TEMP ", %%ebx     \n\t" // pcv = pav + pbv
5020      // pc = abs(pcv)
5021      "testl $0x80000000, %%ebx    \n\t"
5022      "jz paeth_pca2               \n\t"
5023      "negl %%ebx                  \n\t" // reverse sign of neg values
5024
5025   "paeth_pca2:                    \n\t"
5026      "movl %%ebx, " pc_TEMP "     \n\t" // save pc for later use
5027      // pb = abs(pbv)
5028      "testl $0x80000000, %%ecx    \n\t"
5029      "jz paeth_pba2               \n\t"
5030      "negl %%ecx                  \n\t" // reverse sign of neg values
5031
5032   "paeth_pba2:                    \n\t"
5033      "movl %%ecx, " pb_TEMP "     \n\t" // save pb for later use
5034      // pa = abs(pav)
5035      "movl " pa_TEMP ", %%ebx     \n\t"
5036      "testl $0x80000000, %%ebx    \n\t"
5037      "jz paeth_paa2               \n\t"
5038      "negl %%ebx                  \n\t" // reverse sign of neg values
5039
5040   "paeth_paa2:                    \n\t"
5041      "movl %%ebx, " pa_TEMP "     \n\t" // save pa for later use
5042      // test if pa <= pb
5043      "cmpl %%ecx, %%ebx           \n\t"
5044      "jna paeth_abb2              \n\t"
5045      // pa > pb; now test if pb <= pc
5046      "cmpl " pc_TEMP ", %%ecx     \n\t"
5047      "jna paeth_bbc2              \n\t"
5048      // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
5049      "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
5050      "jmp paeth_paeth2            \n\t"
5051
5052   "paeth_bbc2:                    \n\t"
5053      // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
5054      "movb (%1," PAX ",), %%cl    \n\t" // load Prior(x) into cl
5055      "jmp paeth_paeth2            \n\t"
5056
5057   "paeth_abb2:                    \n\t"
5058      // pa <= pb; now test if pa <= pc
5059      "cmpl " pc_TEMP ", %%ebx     \n\t"
5060      "jna paeth_abc2              \n\t"
5061      // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
5062      "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
5063      "jmp paeth_paeth2            \n\t"
5064
5065   "paeth_abc2:                    \n\t"
5066      // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
5067      "movb (%2," PBP ",), %%cl    \n\t" // load Raw(x-bpp) into cl
5068
5069   "paeth_paeth2:                  \n\t"
5070      "incl %%eax                  \n\t"
5071      "incl %%ebp                  \n\t"
5072      // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
5073      "addb %%cl, -1(%2," PAX ",)  \n\t"
5074      "cmpl %%edx, %%eax           \n\t" // check against FullLength
5075      "jb paeth_lp2                \n\t"
5076
5077      RESTORE_r11_r12_r13
5078      RESTORE_GOT_ebx
5079      RESTORE_ebp
5080
5081   "paeth_end:                     \n\t"
5082      "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
5083
5084      : "=c" (dummy_value_c),            // output regs (dummy)
5085        "=S" (dummy_value_S),
5086        "=D" (dummy_value_D),
5087        "=a" (dummy_value_a),
5088        "=d" (dummy_value_d)
5089
5090      : "0" (bpp),         // ecx        // input regs
5091        "1" (prev_row),    // esi/rsi
5092        "2" (row),         // edi/rdi
5093        "3" (MMXLength),   // eax
5094        "4" (FullLength)   // edx
5095
5096      CLOB_COLON_ebx_ebp_r1X             // clobber list
5097        CLOBBER_GOT_ebx
5098        CLOB_COMMA_ebx_ebp
5099        CLOBBER_ebp
5100        CLOB_COMMA_ebX_r1X
5101        CLOBBER_r11_r12_r13
5102   );
5103
5104} /* end png_read_filter_row_mmx_paeth() */
5105
5106#endif // PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK
5107#endif /* PNG_MMX_READ_FILTER_PAETH_SUPPORTED */
5108
5109
5110
5111
5112#if defined(PNG_MMX_READ_FILTER_SUB_SUPPORTED)
5113
5114//===========================================================================//
5115//                                                                           //
5116//           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           //
5117//                                                                           //
5118//===========================================================================//
5119
5120// Optimized code for PNG Sub filter decoder
5121
5122static void /* PRIVATE */
5123png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
5124{
5125   unsigned FullLength, MMXLength;  // png_uint_32 is actually 64-bit on x86-64
5126   int bpp;
5127   int dummy_value_a;
5128   int dummy_value_c;
5129   int dummy_value_d;
5130   png_bytep dummy_value_D;
5131   int diff; //     __attribute__((used));
5132
5133   bpp = (row_info->pixel_depth + 7) >> 3;  // calc number of bytes per pixel
5134   FullLength = row_info->rowbytes - bpp;   // number of bytes to filter
5135     // (why do we subtract off bpp?  not so in avg or paeth...)
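     //  (most likely because, for Sub, bytes 0..bpp-1 have no Raw(x-bpp) and
     //   are already final, so only rowbytes-bpp bytes need work; avg and
     //   paeth still need Prior() for those first bpp bytes)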
5136
5137   __asm__ __volatile__ (
5138      SAVE_r15
5139      SAVE_ebp
5140//pre "movl row, %1                \n\t" // edi/rdi
5141      "mov  %1, " PSI "            \n\t" // lp = row
5142//pre "movl bpp, %%ecx             \n\t"
5143      "add  " PCX ", %1            \n\t" // rp = row + bpp
5144//pre "movl FullLength, %%eax      \n\t" // bring in via eax...
5145      SAVE_FullLength                    // ...but store for later use
5146
5147      "xorl %%eax, %%eax           \n\t"
5148
5149      // get # of bytes to alignment (note:  computing _delta_ of two pointers,
5150      // so hereafter %%ebp is sufficient even on 64-bit)
5151      "mov  %1, " PBP "            \n\t" // take start of row
5152      "add  $0xf, " PBP "          \n\t" // add 7+8 to incr past alignment bdry
5153//    "andl $0xfffffff8, %%ebp     \n\t" // mask to alignment boundary (32-bit!)
5154      CLEAR_BOTTOM_3_BITS  PBP    "\n\t" // mask to alignment boundary
5155      "sub  %1, " PBP "            \n\t" // subtract row ptr again => ebp =
5156      "jz sub_go                   \n\t" //  target value of eax at alignment
5157
5158   "sub_lp1:                       \n\t" // fix alignment
5159      "movb (" PSI "," PAX ",), %%cl \n\t"
5160      "addb %%cl, (%1," PAX ",)    \n\t"
5161      "incl %%eax                  \n\t"
5162      "cmpl %%ebp, %%eax           \n\t"
5163      "jb sub_lp1                  \n\t"
5164
5165   "sub_go:                        \n\t"
5166      RESTORE_FullLength "%%ecx    \n\t" // FullLength -> ecx
5167      "movl %%ecx, %%edx           \n\t"
5168      "subl %%eax, %%edx           \n\t" // subtract alignment fix
5169      "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
5170      "subl %%edx, %%ecx           \n\t" // drop over bytes from length
5171//out "movl %%ecx, MMXLength       \n\t"
5172      "movl %%ebp, %%eax           \n\t" // ebp = diff, but no reg constraint(?)
5173      RESTORE_ebp                        //  (could swap ebp and ecx functions,
5174      RESTORE_r15                        //  but %%cl issues...)
5175
5176      : "=c" (MMXLength),       // 0     // output regs
5177        "=D" (dummy_value_D),   // 1
5178        "=a" (diff)             // 2
5179
5180      : "0" (bpp),              // ecx   // input regs
5181        "1" (row),              // edi
5182        "2" (FullLength)        // eax
5183
5184      : "%esi", "%edx"                   // clobber list
5185        _CLOBBER_r15
5186        _CLOBBER_ebp
5187   );
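   // What the prologue above computes, as a C reference sketch (not
   // compiled; the temporary rp and the size_t casts are illustrative only):
#if 0
   {
      png_bytep rp = row + bpp;        // first byte that gets filtered
      // fix up at least 8 bytes by hand, stopping on an 8-byte boundary:
      diff = (int)((((size_t)rp + 0xf) & ~(size_t)7) - (size_t)rp);
      // largest count >= diff such that (MMXLength - diff) is a multiple of
      // 8; bytes [MMXLength, FullLength) are left for the scalar epilogue:
      MMXLength = FullLength - ((FullLength - diff) & 7);
   }
#endif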
5188
5189   // now do the math for the rest of the row
5190   switch (bpp)
5191   {
5192      case 3:
5193      {
5194//       _ShiftBpp = 24;       // == 3 * 8
5195//       _ShiftRem  = 40;      // == 64 - 24
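//       Serial model of the sub_3lp loop below:  each MMX iteration finishes
//       one 8-byte group, using one shift of the previous quadword plus two
//       shift/mask/paddb propagation steps instead of a byte-at-a-time add.
//       Reference sketch only (not compiled); rp and x are illustrative:
#if 0
         {
            png_bytep rp = row + bpp;            // first filtered byte
            unsigned  x;
            int i;
            for (x = (unsigned)diff; x < MMXLength; x += 8)
               for (i = 0; i < 8; i++)           // Raw(x) = Sub(x) + Raw(x-bpp)
                  rp[x + i] = (png_byte)(rp[x + i] + rp[x + i - 3]);
         }
#endif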
5196
5197         __asm__ __volatile__ (
5198// preload  "mov  row, %1                 \n\t" // edi/rdi
5199            LOAD_GOT_rbp
5200            // load (former) _ActiveMask for 2nd active byte group
5201            "movq " AMASK2_3_3 ", %%mm7   \n\t" // _amask2_3_3
5202            RESTORE_rbp
5203
5204// notused  "mov  %1, " PSI "             \n\t" // lp = row
5205// preload  "movl bpp, %%ecx              \n\t"
5206            "add  " PCX ", %1             \n\t" // rp = row + bpp
5207            "movq %%mm7, %%mm6            \n\t"
5208// preload  "movl diff, %%edx             \n\t"
5209            "psllq $24, %%mm6             \n\t" // move mask in mm6 to cover
5210                                                //  3rd active byte group
5211            // prime the pump:  load the first Raw(x-bpp) data set
5212            "movq -8(%1," PDX ",), %%mm1  \n\t"
5213
5214         "sub_3lp:                        \n\t" // shift data for adding first
5215            "psrlq $40, %%mm1             \n\t" //  bpp bytes (no need for mask;
5216                                                //  shift clears inactive bytes)
5217            // add 1st active group
5218            "movq (%1," PDX ",), %%mm0    \n\t"
5219            "paddb %%mm1, %%mm0           \n\t"
5220
5221            // add 2nd active group
5222            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
5223            "psllq $24, %%mm1             \n\t" // shift data to pos. correctly
5224            "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
5225            "paddb %%mm1, %%mm0           \n\t"
5226
5227            // add 3rd active group
5228            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
5229            "psllq $24, %%mm1             \n\t" // shift data to pos. correctly
5230            "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
5231            "addl $8, %%edx               \n\t"
5232            "paddb %%mm1, %%mm0           \n\t"
5233
5234            "cmpl %%eax, %%edx            \n\t" // MMXLength
5235            "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
5236            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
5237            "jb sub_3lp                   \n\t"
5238
5239            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
5240              "=D" (dummy_value_D),   // 1
5241              "=d" (dummy_value_d),   // 2
5242              "=a" (dummy_value_a)    // 3
5243
5244            : "0" (bpp),              // ecx    // input regs
5245              "1" (row),              // edi
5246              "2" (diff),             // edx
5247              "3" (MMXLength)         // eax
5248
5249#if defined(CLOBBER_MMX_REGS_SUPPORTED)
5250            : "%mm0", "%mm1", "%mm6", "%mm7"    // clobber list
5251#endif
5252         );
5253      }
5254      break;  // end 3 bpp
5255
5256      case 4:   // formerly shared with 6 bpp case via _ShiftBpp and _ShiftRem,
5257      {         // but 64-bit PIC/.so problems (could still share, moving vars
5258                // into unused MMX regs via ecx/edx, but kludgy)
5259//       _ShiftBpp = bpp << 3;        // 32 (psllq)
5260//       _ShiftRem = 64 - _ShiftBpp;  // 32 (psrlq)
5261
5262         __asm__ __volatile__ (
5263// preload  "mov  row, %1                 \n\t" // edi/rdi
5264// preload  "movl diff, %%edx             \n\t"
5265// notused  "mov  %1, " PSI "             \n\t" // lp = row
5266// preload  "movl bpp, %%ecx              \n\t"
5267            "add  " PCX ", %1             \n\t" // rp = row + bpp
5268
5269            // prime the pump:  load the first Raw(x-bpp) data set
5270            "movq -8(%1," PDX ",), %%mm1  \n\t"
5271
5272         "sub_4lp:                        \n\t" // shift data for adding first
5273            "psrlq $32, %%mm1             \n\t" //  bpp bytes (no need for mask;
5274                                                //  shift clears inactive bytes)
5275            "movq (%1," PDX ",), %%mm0    \n\t"
5276            "paddb %%mm1, %%mm0           \n\t"
5277
5278            // add 2nd active group
5279            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
5280            "psllq $32, %%mm1             \n\t" // shift data to pos. correctly
5281            "addl $8, %%edx               \n\t"
5282            "paddb %%mm1, %%mm0           \n\t"
5283
5284            "cmpl %%eax, %%edx            \n\t" // MMXLength
5285            "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
5286            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
5287            "jb sub_4lp                   \n\t"
5288
5289            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
5290              "=D" (dummy_value_D),   // 1
5291              "=d" (dummy_value_d),   // 2
5292              "=a" (dummy_value_a)    // 3
5293
5294            : "0" (bpp),              // ecx    // input regs
5295              "1" (row),              // edi
5296              "2" (diff),             // edx
5297              "3" (MMXLength)         // eax
5298
5299#if defined(CLOBBER_MMX_REGS_SUPPORTED)
5300            : "%mm0", "%mm1"                    // clobber list
5301#endif
5302         );
5303      }
5304      break;  // end 4 bpp
5305
5306      case 1:
5307      {
5308         __asm__ __volatile__ (
5309// preload  "movl diff, %%edx              \n\t"
5310// preload  "mov  row, %1                  \n\t" // edi/rdi
5311// preload  "cmpl FullLength, %%edx        \n\t"
5312            "cmpl %%eax, %%edx             \n\t"
5313            "jnb sub_1end                  \n\t"
5314            "mov  %1, " PSI "              \n\t" // lp = row
5315// irrel.   "xorl %%ecx, %%ecx             \n\t" // (actually bug with preload)
5316// preload  "movl bpp, %%ecx               \n\t"
5317            "add  " PCX ", %1              \n\t" // rp = row + bpp
5318
5319         "sub_1lp:                         \n\t"
5320            "movb (" PSI "," PDX ",), %%cl \n\t"
5321            "addb %%cl, (%1," PDX ",)      \n\t"
5322            "incl %%edx                    \n\t"
5323            "cmpl %%eax, %%edx             \n\t" // compare with FullLength
5324            "jb sub_1lp                    \n\t"
5325
5326         "sub_1end:                        \n\t"
5327
5328            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
5329              "=D" (dummy_value_D),   // 1
5330              "=d" (dummy_value_d),   // 2
5331              "=a" (dummy_value_a)    // 3
5332
5333            : "0" (bpp),              // ecx    // input regs
5334              "1" (row),              // edi
5335              "2" (diff),             // edx
5336              "3" (FullLength)        // eax
5337
5338            : "%esi"                            // clobber list
5339         );
5340      }
5341      return;  // end 1 bpp (bypassing cleanup block!)
5342
5343      case 2:
5344      {
5345//       _ShiftBpp = 16;       // == 2 * 8
5346//       _ShiftRem = 48;       // == 64 - 16
5347
5348         __asm__ __volatile__ (
5349            LOAD_GOT_rbp
5350            // load (former) _ActiveMask for 2nd active byte group
5351            "movq " AMASK4_2_2 ", %%mm7   \n\t" // _amask4_2_2
5352            RESTORE_rbp
5353// preload  "movl diff, %%edx             \n\t"
5354            "movq %%mm7, %%mm6            \n\t"
5355// preload  "mov  row, %1                 \n\t" // edi/rdi
5356            "psllq $16, %%mm6             \n\t" // move mask in mm6 to cover
5357                                                //  3rd active byte group
5358// notused  "mov  %1, " PSI "             \n\t" // lp = row
5359            "movq %%mm6, %%mm5            \n\t"
5360// preload  "movl bpp, %%ecx              \n\t"
5361            "add  " PCX ", %1             \n\t" // rp = row + bpp
5362            "psllq $16, %%mm5             \n\t" // move mask in mm5 to cover
5363                                                //  4th active byte group
5364            // prime the pump:  load the first Raw(x-bpp) data set
5365            "movq -8(%1," PDX ",), %%mm1  \n\t"
5366
5367         "sub_2lp:                        \n\t" // shift data for adding first
5368            "psrlq $48, %%mm1             \n\t" //  bpp bytes (no need for mask;
5369                                                //  shift clears inactive bytes)
5370            // add 1st active group
5371            "movq (%1," PDX ",), %%mm0    \n\t"
5372            "paddb %%mm1, %%mm0           \n\t"
5373
5374            // add 2nd active group
5375            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
5376            "psllq $16, %%mm1             \n\t" // shift data to pos. correctly
5377            "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
5378            "paddb %%mm1, %%mm0           \n\t"
5379
5380            // add 3rd active group
5381            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
5382            "psllq $16, %%mm1             \n\t" // shift data to pos. correctly
5383            "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
5384            "paddb %%mm1, %%mm0           \n\t"
5385
5386            // add 4th active group
5387            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
5388            "psllq $16, %%mm1             \n\t" // shift data to pos. correctly
5389            "pand %%mm5, %%mm1            \n\t" // mask to use 4th active group
5390            "addl $8, %%edx               \n\t"
5391            "paddb %%mm1, %%mm0           \n\t"
5392            "cmpl %%eax, %%edx            \n\t" // MMXLength
5393            "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
5394            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
5395            "jb sub_2lp                   \n\t"
5396
5397            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
5398              "=D" (dummy_value_D),   // 1
5399              "=d" (dummy_value_d),   // 2
5400              "=a" (dummy_value_a)    // 3
5401
5402            : "0" (bpp),              // ecx    // input regs
5403              "1" (row),              // edi
5404              "2" (diff),             // edx
5405              "3" (MMXLength)         // eax
5406
5407#if defined(CLOBBER_MMX_REGS_SUPPORTED)
5408            : "%mm0", "%mm1", "%mm5", "%mm6"    // clobber list
5409            , "%mm7"
5410#endif
5411         );
5412      }
5413      break;  // end 2 bpp
5414
5415      case 6:   // formerly shared with 4 bpp case (see comments there)
5416      {
5417//       _ShiftBpp = bpp << 3;        // 48 (psllq)
5418//       _ShiftRem = 64 - _ShiftBpp;  // 16 (psrlq)
5419
5420         __asm__ __volatile__ (
5421// preload  "mov  row, %1                 \n\t" // edi/rdi
5422// preload  "movl diff, %%edx             \n\t"
5423// notused  "mov  %1, " PSI "             \n\t" // lp = row
5424// preload  "movl bpp, %%ecx              \n\t"
5425            "add  " PCX ", %1             \n\t" // rp = row + bpp
5426
5427            // prime the pump:  load the first Raw(x-bpp) data set
5428            "movq -8(%1," PDX ",), %%mm1  \n\t"
5429
5430         "sub_6lp:                        \n\t" // shift data for adding first
5431            "psrlq $16, %%mm1             \n\t" //  bpp bytes (no need for mask;
5432                                                //  shift clears inactive bytes)
5433            "movq (%1," PDX ",), %%mm0    \n\t"
5434            "paddb %%mm1, %%mm0           \n\t"
5435
5436            // add 2nd active group
5437            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
5438            "psllq $48, %%mm1             \n\t" // shift data to pos. correctly
5439            "addl $8, %%edx               \n\t"
5440            "paddb %%mm1, %%mm0           \n\t"
5441
5442            "cmpl %%eax, %%edx            \n\t" // MMXLength
5443            "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
5444            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
5445            "jb sub_6lp                   \n\t"
5446
5447            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
5448              "=D" (dummy_value_D),   // 1
5449              "=d" (dummy_value_d),   // 2
5450              "=a" (dummy_value_a)    // 3
5451
5452            : "0" (bpp),              // ecx    // input regs
5453              "1" (row),              // edi
5454              "2" (diff),             // edx
5455              "3" (MMXLength)         // eax
5456
5457#if defined(CLOBBER_MMX_REGS_SUPPORTED)
5458            : "%mm0", "%mm1"                    // clobber list
5459#endif
5460         );
5461      }
5462      break;  // end 6 bpp
5463
5464      case 8:
5465      {
5466         __asm__ __volatile__ (
5467// preload  "mov  row, %1                 \n\t" // edi/rdi
5468// preload  "movl diff, %%edx             \n\t"
5469// notused  "mov  %1, " PSI "             \n\t" // lp = row
5470// preload  "movl bpp, %%ecx              \n\t"
5471            "add  " PCX ", %1             \n\t" // rp = row + bpp
5472// preload  "movl MMXLength, %%eax        \n\t"
5473
5474            // prime the pump:  load the first Raw(x-bpp) data set
5475            "movq -8(%1," PDX ",), %%mm7  \n\t"
5476            "movl %%eax, %%esi            \n\t" // copy of MMXLength -> esi
5477            "andl $0x0000003f, %%esi      \n\t" // calc bytes over mult of 64
5478
5479         "sub_8lp:                        \n\t"
5480            "movq (%1," PDX ",), %%mm0    \n\t" // load Sub(x) for 1st 8 bytes
5481            "paddb %%mm7, %%mm0           \n\t"
5482            "movq 8(%1," PDX ",), %%mm1   \n\t" // load Sub(x) for 2nd 8 bytes
5483            "movq %%mm0, (%1," PDX ",)    \n\t" // write Raw(x) for 1st 8 bytes
5484
5485            // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
5486            // This will be repeated for each group of 8 bytes with the 8th
5487            // group being used as the Raw(x-bpp) for the 1st group of the
5488            // next loop.
5489
5490            "paddb %%mm0, %%mm1           \n\t"
5491            "movq 16(%1," PDX ",), %%mm2  \n\t" // load Sub(x) for 3rd 8 bytes
5492            "movq %%mm1, 8(%1," PDX ",)   \n\t" // write Raw(x) for 2nd 8 bytes
5493            "paddb %%mm1, %%mm2           \n\t"
5494            "movq 24(%1," PDX ",), %%mm3  \n\t" // load Sub(x) for 4th 8 bytes
5495            "movq %%mm2, 16(%1," PDX ",)  \n\t" // write Raw(x) for 3rd 8 bytes
5496            "paddb %%mm2, %%mm3           \n\t"
5497            "movq 32(%1," PDX ",), %%mm4  \n\t" // load Sub(x) for 5th 8 bytes
5498            "movq %%mm3, 24(%1," PDX ",)  \n\t" // write Raw(x) for 4th 8 bytes
5499            "paddb %%mm3, %%mm4           \n\t"
5500            "movq 40(%1," PDX ",), %%mm5  \n\t" // load Sub(x) for 6th 8 bytes
5501            "movq %%mm4, 32(%1," PDX ",)  \n\t" // write Raw(x) for 5th 8 bytes
5502            "paddb %%mm4, %%mm5           \n\t"
5503            "movq 48(%1," PDX ",), %%mm6  \n\t" // load Sub(x) for 7th 8 bytes
5504            "movq %%mm5, 40(%1," PDX ",)  \n\t" // write Raw(x) for 6th 8 bytes
5505            "paddb %%mm5, %%mm6           \n\t"
5506            "movq 56(%1," PDX ",), %%mm7  \n\t" // load Sub(x) for 8th 8 bytes
5507            "movq %%mm6, 48(%1," PDX ",)  \n\t" // write Raw(x) for 7th 8 bytes
5508            "addl $64, %%edx              \n\t"
5509            "paddb %%mm6, %%mm7           \n\t"
5510            "cmpl %%esi, %%edx            \n\t" // cmp to bytes over mult of 64
5511            "movq %%mm7, -8(%1," PDX ",)  \n\t" // write Raw(x) for 8th 8 bytes
5512            "jb sub_8lp                   \n\t"
5513
5514            "cmpl %%eax, %%edx            \n\t" // compare to MMXLength
5515            "jnb sub_8lt8                 \n\t"
5516
5517         "sub_8lpA:                       \n\t"
5518            "movq (%1," PDX ",), %%mm0    \n\t"
5519            "addl $8, %%edx               \n\t"
5520            "paddb %%mm7, %%mm0           \n\t"
5521            "cmpl %%eax, %%edx            \n\t" // compare to MMXLength
5522            "movq %%mm0, -8(%1," PDX ",)  \n\t" // -8 to offset early addl edx
5523            "movq %%mm0, %%mm7            \n\t" // move calculated Raw(x) data
5524            "jb sub_8lpA                  \n\t" //  to mm7 to be new Raw(x-bpp)
5525                                                //  for next loop
5526         "sub_8lt8:                       \n\t"
5527
5528            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
5529              "=D" (dummy_value_D),   // 1
5530              "=d" (dummy_value_d),   // 2
5531              "=a" (dummy_value_a)    // 3
5532
5533            : "0" (bpp),              // ecx    // input regs
5534              "1" (row),              // edi
5535              "2" (diff),             // edx
5536              "3" (MMXLength)         // eax
5537
5538            : "%esi"                            // clobber list
5539#if defined(CLOBBER_MMX_REGS_SUPPORTED)
5540            , "%mm0", "%mm1", "%mm2", "%mm3"
5541            , "%mm4", "%mm5", "%mm6", "%mm7"
5542#endif
5543         );
5544      }
5545      break;  // end 8 bpp
5546
5547      default:                // bpp != 1,2,3,4,6,8:  doesn't exist
5548      {
5549         // ERROR:  SHOULD NEVER BE REACHED
5550#if defined(PNG_DEBUG)
5551         png_debug(1, "Internal libpng logic error (GCC "
5552           "png_read_filter_row_mmx_sub())\n");
5553#endif
5554      }
5555      break;
5556
5557   } // end switch (bpp)
5558
5559   __asm__ __volatile__ (
5560//pre "movl MMXLength, %%eax         \n\t"
5561//pre "mov  row, %1                  \n\t" // edi/rdi
5562//pre "cmpl FullLength, %%eax        \n\t"
5563      "cmpl %%edx, %%eax             \n\t"
5564      "jnb sub_end                   \n\t"
5565
5566      "mov  %1, " PSI "              \n\t" // lp = row
5567//pre "movl bpp, %%ecx               \n\t"
5568      "add  " PCX ", %1              \n\t" // rp = row + bpp
5569      "xorl %%ecx, %%ecx             \n\t"
5570
5571   "sub_lp2:                         \n\t"
5572      "movb (" PSI "," PAX ",), %%cl \n\t"
5573      "addb %%cl, (%1," PAX ",)      \n\t"
5574      "incl %%eax                    \n\t"
5575      "cmpl %%edx, %%eax             \n\t" // FullLength
5576      "jb sub_lp2                    \n\t"
5577
5578   "sub_end:                         \n\t"
5579      "EMMS                          \n\t" // end MMX instructions
5580
5581      : "=c" (dummy_value_c),   // 0      // output regs (dummy)
5582        "=D" (dummy_value_D),   // 1
5583        "=a" (dummy_value_a),   // 2
5584        "=d" (dummy_value_d)    // 3
5585
5586      : "0" (bpp),              // ecx    // input regs
5587        "1" (row),              // edi
5588        "2" (MMXLength),        // eax
5589        "3" (FullLength)        // edx
5590
5591      : "%esi"                            // clobber list
5592   );
5593
5594} // end of png_read_filter_row_mmx_sub()
5595
5596#endif /* PNG_MMX_READ_FILTER_SUB_SUPPORTED */
5597
5598
5599
5600
5601#if defined(PNG_MMX_READ_FILTER_UP_SUPPORTED)
5602
5603//===========================================================================//
5604//                                                                           //
5605//            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            //
5606//                                                                           //
5607//===========================================================================//
5608
5609// Optimized code for PNG Up filter decoder
5610
5611static void /* PRIVATE */
5612png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
5613                           png_bytep prev_row)
5614{
5615   unsigned len;        // png_uint_32 is actually 64-bit on x86-64
5616   int dummy_value_d;   // fix 'forbidden register 3 (dx) was spilled' error
5617   png_bytep dummy_value_S;
5618   png_bytep dummy_value_D;
5619
5620   len = row_info->rowbytes;              // number of bytes to filter
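   // How the assembler below carves up len -- a C reference sketch (not
   // compiled; align/rest/big/medium are illustrative names, and the ranges
   // are the nominal ones for a row long enough for the 64-byte loop):
#if 0
   {
      unsigned align  = (unsigned)((((size_t)row + 7) & ~(size_t)7) -
                                   (size_t)row);  // 0..7 bytes to alignment
      unsigned rest   = (len - align) & 0x3f;     // bytes past the last 64-block
      unsigned big    = len - rest;               // end of 64-byte MMX loop
      unsigned medium = len - (rest & 7);         // end of 8-byte MMX loop
      // [0, align)     -> byte loop     (up_lp1)
      // [align, big)   -> 64-byte loop  (up_loop)
      // [big, medium)  -> 8-byte loop   (up_lpA)
      // [medium, len)  -> byte loop     (up_lp2)
   }
#endif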
5621
5622   __asm__ __volatile__ (
5623      SAVE_GOT_ebx
5624//pre "mov  prev_row, %1           \n\t" // esi/rsi
5625//pre "movl row, %2                \n\t" // edi/rdi
5626
5627      "xorl %%ebx, %%ebx           \n\t"
5628      "xorl %%eax, %%eax           \n\t"
5629
5630      // get # of bytes to alignment (note:  computing _delta_ of two pointers,
5631      // so hereafter %%ecx is sufficient even on 64-bit)
5632      "mov  %2, " PCX "            \n\t" // take start of row
5633      "add  $0x7, " PCX "          \n\t" // add 7 to incr past alignment bdry
5634//    "andl $0xfffffff8, %%ecx     \n\t" // mask to alignment boundary (32-bit!)
5635      CLEAR_BOTTOM_3_BITS  PCX    "\n\t" // mask to alignment boundary
5636      "sub  %2, " PCX "            \n\t" // subtract row ptr again => ecx =
5637      "jz up_go                    \n\t" //  target value of ebx at alignment
5638
5639   "up_lp1:                        \n\t" // fix alignment
5640      "movb (%2," PBX ",), %%al    \n\t"
5641      "addb (%1," PBX ",), %%al    \n\t"
5642      "incl %%ebx                  \n\t"
5643      "cmpl %%ecx, %%ebx           \n\t"
5644      "movb %%al, -1(%2," PBX ",)  \n\t" // mov does not affect flags; -1 to
5645      "jb up_lp1                   \n\t" //  offset incl ebx
5646
5647   "up_go:                         \n\t"
5648//pre "movl len, %%edx             \n\t"
5649      "movl %%edx, %%ecx           \n\t"
5650      "subl %%ebx, %%edx           \n\t" // subtract alignment fix
5651      "andl $0x0000003f, %%edx     \n\t" // calc bytes over mult of 64
5652      "subl %%edx, %%ecx           \n\t" // sub over-bytes from original length
5653
5654      // unrolled loop - use all MMX registers and interleave to reduce
5655      // number of branch instructions (loops) and reduce partial stalls
5656   "up_loop:                       \n\t"
5657      "movq (%1," PBX ",), %%mm1   \n\t"
5658      "movq (%2," PBX ",), %%mm0   \n\t"
5659      "movq 8(%1," PBX ",), %%mm3  \n\t"
5660      "paddb %%mm1, %%mm0          \n\t"
5661      "movq 8(%2," PBX ",), %%mm2  \n\t"
5662      "movq %%mm0, (%2," PBX ",)   \n\t"
5663      "paddb %%mm3, %%mm2          \n\t"
5664      "movq 16(%1," PBX ",), %%mm5 \n\t"
5665      "movq %%mm2, 8(%2," PBX ",)  \n\t"
5666      "movq 16(%2," PBX ",), %%mm4 \n\t"
5667      "movq 24(%1," PBX ",), %%mm7 \n\t"
5668      "paddb %%mm5, %%mm4          \n\t"
5669      "movq 24(%2," PBX ",), %%mm6 \n\t"
5670      "movq %%mm4, 16(%2," PBX ",) \n\t"
5671      "paddb %%mm7, %%mm6          \n\t"
5672      "movq 32(%1," PBX ",), %%mm1 \n\t"
5673      "movq %%mm6, 24(%2," PBX ",) \n\t"
5674      "movq 32(%2," PBX ",), %%mm0 \n\t"
5675      "movq 40(%1," PBX ",), %%mm3 \n\t"
5676      "paddb %%mm1, %%mm0          \n\t"
5677      "movq 40(%2," PBX ",), %%mm2 \n\t"
5678      "movq %%mm0, 32(%2," PBX ",) \n\t"
5679      "paddb %%mm3, %%mm2          \n\t"
5680      "movq 48(%1," PBX ",), %%mm5 \n\t"
5681      "movq %%mm2, 40(%2," PBX ",) \n\t"
5682      "movq 48(%2," PBX ",), %%mm4 \n\t"
5683      "movq 56(%1," PBX ",), %%mm7 \n\t"
5684      "paddb %%mm5, %%mm4          \n\t"
5685      "movq 56(%2," PBX ",), %%mm6 \n\t"
5686      "movq %%mm4, 48(%2," PBX ",) \n\t"
5687      "addl $64, %%ebx             \n\t"
5688      "paddb %%mm7, %%mm6          \n\t"
5689      "cmpl %%ecx, %%ebx           \n\t"
5690      "movq %%mm6, -8(%2," PBX ",) \n\t" // (+56)movq does not affect flags;
5691      "jb up_loop                  \n\t" //  -8 to offset addl ebx
5692
5693      "cmpl $0, %%edx              \n\t" // test for bytes over mult of 64
5694      "jz up_end                   \n\t"
5695
5696      "cmpl $8, %%edx              \n\t" // test for less than 8 bytes
5697      "jb up_lt8                   \n\t" //  [added by lcreeve at netins.net]
5698
5699      "addl %%edx, %%ecx           \n\t"
5700      "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
5701      "subl %%edx, %%ecx           \n\t" // drop over-bytes from length
5702      "jz up_lt8                   \n\t"
5703
5704   "up_lpA:                        \n\t" // use MMX regs to update 8 bytes sim.
5705      "movq (%1," PBX ",), %%mm1   \n\t"
5706      "movq (%2," PBX ",), %%mm0   \n\t"
5707      "addl $8, %%ebx              \n\t"
5708      "paddb %%mm1, %%mm0          \n\t"
5709      "cmpl %%ecx, %%ebx           \n\t"
5710      "movq %%mm0, -8(%2," PBX ",) \n\t" // movq does not affect flags; -8 to
5711      "jb up_lpA                   \n\t" //  offset add ebx
5712      "cmpl $0, %%edx              \n\t" // test for bytes over mult of 8
5713      "jz up_end                   \n\t"
5714
5715   "up_lt8:                        \n\t"
5716      "xorl %%eax, %%eax           \n\t"
5717      "addl %%edx, %%ecx           \n\t" // move over byte count into counter
5718
5719   "up_lp2:                        \n\t" // use x86 regs for remaining bytes
5720      "movb (%2," PBX ",), %%al    \n\t"
5721      "addb (%1," PBX ",), %%al    \n\t"
5722      "incl %%ebx                  \n\t"
5723      "cmpl %%ecx, %%ebx           \n\t"
5724      "movb %%al, -1(%2," PBX ",)  \n\t" // mov does not affect flags; -1 to
5725      "jb up_lp2                   \n\t" //  offset inc ebx
5726
5727   "up_end:                        \n\t"
5728      "EMMS                        \n\t" // conversion of filtered row complete
5729      RESTORE_GOT_ebx
5730
5731      : "=d" (dummy_value_d),   // 0     // output regs (dummy)
5732        "=S" (dummy_value_S),   // 1
5733        "=D" (dummy_value_D)    // 2
5734
5735      : "0" (len),              // edx   // input regs
5736        "1" (prev_row),         // esi
5737        "2" (row)               // edi
5738
5739      : "%eax", "%ecx"                   // clobber list (no input regs!)
5740        _CLOBBER_GOT_ebx
5741#if defined(CLOBBER_MMX_REGS_SUPPORTED)
5742      , "%mm0", "%mm1", "%mm2", "%mm3"
5743      , "%mm4", "%mm5", "%mm6", "%mm7"
5744#endif
5745   );
5746
5747} // end of png_read_filter_row_mmx_up()
5748
5749#endif /* PNG_MMX_READ_FILTER_UP_SUPPORTED */
5750
5751
5752
5753
5754/*===========================================================================*/
5755/*                                                                           */
5756/*                   P N G _ R E A D _ F I L T E R _ R O W                   */
5757/*                                                                           */
5758/*===========================================================================*/
5759
5760/* Optimized png_read_filter_row routines */
5761
5762void /* PRIVATE */
5763png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
5764   row, png_bytep prev_row, int filter)
5765{
5766#if defined(PNG_DEBUG)
5767   char filtname[10];
5768#endif
5769
5770   if (_mmx_supported == 2) {
5771#if !defined(PNG_1_0_X)
5772       /* this should have happened in png_init_mmx_flags() already */
5773       png_warning(png_ptr, "asm_flags may not have been initialized");
5774#endif
5775       png_mmx_support();
5776   }
5777
5778#if defined(PNG_DEBUG)
5779   png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
5780   switch (filter)
5781   {
5782      case 0:
5783         png_snprintf(filtname, 10, "none");
5784         break;
5785
5786      case 1:
5787         png_snprintf(filtname, 10, "sub-%s",
5788#ifdef PNG_MMX_READ_FILTER_SUB_SUPPORTED
5789#if !defined(PNG_1_0_X)
5790           ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
5791            (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5792            (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5793#else
5794           _mmx_supported
5795#endif
5796           ? "MMX" :
5797#endif
5798           "C");
5799         break;
5800
5801      case 2:
5802         png_snprintf(filtname, 10, "up-%s",
5803#ifdef PNG_MMX_READ_FILTER_UP_SUPPORTED
5804#if !defined(PNG_1_0_X)
5805           ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
5806            (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5807            (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5808#else
5809           _mmx_supported
5810#endif
5811           ? "MMX" :
5812#endif
5813           "C");
5814         break;
5815
5816      case 3:
5817         png_snprintf(filtname, 10, "avg-%s",
5818#ifdef PNG_MMX_READ_FILTER_AVG_SUPPORTED
5819#if !defined(PNG_1_0_X)
5820           ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
5821            (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5822            (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5823#else
5824           _mmx_supported
5825#endif
5826           ? "MMX" :
5827#endif
5828           "C");
5829         break;
5830
5831      case 4:
5832         png_snprintf(filtname, 10, "paeth-%s",
5833#ifdef PNG_MMX_READ_FILTER_PAETH_SUPPORTED
5834#if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
5835#if !defined(PNG_1_0_X)
5836           ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
5837            (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5838            (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5839#else
5840           _mmx_supported
5841#endif
5842           ? "MMX" :
5843#endif /* PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK */
5844#endif
5845           "C");
5846         break;
5847
5848      default:
5849         png_snprintf(filtname, 10, "unknown");
5850         break;
5851   }
5852   png_debug2(2, "row_number=%ld, %s, ", png_ptr->row_number, filtname);
5853   //png_debug1(0, "png_ptr=%10p, ", png_ptr);
5854   //png_debug1(0, "asm_flags=0x%08lx, ", png_ptr->asm_flags);
5855   png_debug1(0, "row=%10p, ", row);
5856   png_debug2(0, "pixdepth=%d, bytes=%d, ", (int)row_info->pixel_depth,
5857      (int)((row_info->pixel_depth + 7) >> 3));
5858   png_debug1(0, "rowbytes=%ld\n", row_info->rowbytes);
5859#endif /* PNG_DEBUG */
5860
5861   switch (filter)
5862   {
5863      case PNG_FILTER_VALUE_NONE:
5864         break;
5865
5866      case PNG_FILTER_VALUE_SUB:
5867#ifdef PNG_MMX_READ_FILTER_SUB_SUPPORTED
5868#if !defined(PNG_1_0_X)
5869         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
5870             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5871             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5872#else
5873         if (_mmx_supported)
5874#endif
5875         {
5876            png_read_filter_row_mmx_sub(row_info, row);
5877         }
5878         else
5879#endif
5880         {
5881            png_uint_32 i;
5882            png_uint_32 istop = row_info->rowbytes;
5883            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5884            png_bytep rp = row + bpp;
5885            png_bytep lp = row;
5886
5887            for (i = bpp; i < istop; i++)
5888            {
5889               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
5890               rp++;
5891            }
5892         }  /* end !UseMMX_sub */
5893         break;
5894
5895      case PNG_FILTER_VALUE_UP:
5896#ifdef PNG_MMX_READ_FILTER_UP_SUPPORTED
5897#if !defined(PNG_1_0_X)
5898         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
5899             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5900             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5901#else
5902         if (_mmx_supported)
5903#endif
5904         {
5905            png_read_filter_row_mmx_up(row_info, row, prev_row);
5906         }
         else
5908#endif
5909         {
5910            png_uint_32 i;
5911            png_uint_32 istop = row_info->rowbytes;
5912            png_bytep rp = row;
5913            png_bytep pp = prev_row;
5914
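            /* Up filter (PNG spec): Recon(x) = Filt(x) + Prior(x), i.e.
             * add the corresponding byte of the previous row. */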
5915            for (i = 0; i < istop; ++i)
5916            {
5917               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5918               rp++;
5919            }
5920         }  /* end !UseMMX_up */
5921         break;
5922
5923      case PNG_FILTER_VALUE_AVG:
5924#ifdef PNG_MMX_READ_FILTER_AVG_SUPPORTED
5925#if !defined(PNG_1_0_X)
5926         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
5927             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5928             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5929#else
5930         if (_mmx_supported)
5931#endif
5932         {
5933            png_read_filter_row_mmx_avg(row_info, row, prev_row);
5934         }
5935         else
5936#endif
5937         {
5938            png_uint_32 i;
5939            png_bytep rp = row;
5940            png_bytep pp = prev_row;
5941            png_bytep lp = row;
5942            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5943            png_uint_32 istop = row_info->rowbytes - bpp;
5944
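            /* Average filter (PNG spec):
             * Recon(x) = Filt(x) + floor((Recon(x - bpp) + Prior(x)) / 2).
             * For the first bpp bytes the left neighbor is taken as zero,
             * so only Prior(x) / 2 is added. */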
5945            for (i = 0; i < bpp; i++)
5946            {
5947               *rp = (png_byte)(((int)(*rp) +
5948                  ((int)(*pp++) >> 1)) & 0xff);
5949               rp++;
5950            }
5951
5952            for (i = 0; i < istop; i++)
5953            {
5954               *rp = (png_byte)(((int)(*rp) +
5955                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
5956               rp++;
5957            }
5958         }  /* end !UseMMX_avg */
5959         break;
5960
5961      case PNG_FILTER_VALUE_PAETH:
5962#ifdef PNG_MMX_READ_FILTER_PAETH_SUPPORTED
5963#if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
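/* The MMX Paeth routine is compiled only for the x86-64 GOT-relative
 * variant or when thread-unsafe (static-storage) code is acceptable;
 * in all other configurations only the C fallback below is built. */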
5964#if !defined(PNG_1_0_X)
5965         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
5966             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5967             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5968#else
5969         if (_mmx_supported)
5970#endif
5971         {
5972            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
5973         }
5974         else
5975#endif /* PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK */
5976#endif
5977         {
5978            png_uint_32 i;
5979            png_bytep rp = row;
5980            png_bytep pp = prev_row;
5981            png_bytep lp = row;
5982            png_bytep cp = prev_row;
5983            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5984            png_uint_32 istop = row_info->rowbytes - bpp;
5985
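            /* Paeth filter (PNG spec): each byte adds whichever of the
             * left (a), above (b), or upper-left (c) reconstructed bytes
             * is closest to the predictor p = a + b - c.  For the first
             * bpp bytes, a and c are outside the scanline, so this
             * reduces to the Up filter. */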
5986            for (i = 0; i < bpp; i++)
5987            {
5988               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5989               rp++;
5990            }
5991
5992            for (i = 0; i < istop; i++)   /* use leftover rp,pp */
5993            {
5994               int a, b, c, pa, pb, pc, p;
5995
5996               a = *lp++;
5997               b = *pp++;
5998               c = *cp++;
5999
6000               p = b - c;
6001               pc = a - c;
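
               /* p is now (predictor - a) and pc is (predictor - b);
                * their sum is (predictor - c), so the three distances
                * needed by the Paeth predictor are just |p|, |pc|, and
                * |p + pc|. */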
6002
6003#if defined(PNG_USE_ABS)
6004               pa = abs(p);
6005               pb = abs(pc);
6006               pc = abs(p + pc);
6007#else
6008               pa = p < 0 ? -p : p;
6009               pb = pc < 0 ? -pc : pc;
6010               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
6011#endif
6012
6013               /*
6014                  if (pa <= pb && pa <= pc)
6015                     p = a;
6016                  else if (pb <= pc)
6017                     p = b;
6018                  else
6019                     p = c;
6020                */
6021
6022               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
6023
6024               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
6025               rp++;
6026            }
6027         }  /* end !UseMMX_paeth */
6028         break;
6029
6030      default:
6031         png_warning(png_ptr, "Ignoring bad row-filter type");
         *row = 0;
6033         break;
6034   }
6035}
6036
6037#endif /* PNG_HAVE_MMX_READ_FILTER_ROW */
6038
6039
6040#endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGGCCRD */
6041#endif /* __GNUC__ */
6042