From d20666a228af58e97b755fbb8c7f86f4f49ed482 Mon Sep 17 00:00:00 2001
From: Glenn Randers-Pehrson <glennrp at users.sourceforge.net>
Date: Sat, 1 Sep 2007 15:12:50 -0500
Subject: Imported from libpng-1.2.20rc4.tar

---
 ANNOUNCE                       |   31 +-
 CHANGES                        |    3 +
 INSTALL                        |   27 +-
 KNOWNBUG                       |    2 +-
 LICENSE                        |    4 +-
 Makefile.am                    |    4 +-
 Makefile.in                    |   26 +-
 README                         |   14 +-
 Y2KINFO                        |    4 +-
 configure                      |  121 +-
 configure.ac                   |   15 +-
 libpng-1.2.20rc3.txt           | 3023 --------------------
 libpng-1.2.20rc4.txt           | 2851 +++++++++++++++++++
 libpng.3                       |  198 +-
 libpngpf.3                     |    4 +-
 png.5                          |    2 +-
 png.c                          |   52 +-
 png.h                          |   28 +-
 pngconf.h                      |   79 +-
 pngerror.c                     |    2 +-
 pnggccrd.c                     | 6043 +---------------------------------------
 pngget.c                       |   77 +-
 pngpread.c                     |    7 +-
 pngread.c                      |    6 -
 pngrutil.c                     | 1053 +------
 pngset.c                       |   51 +-
 pngtest.c                      |    2 +-
 pngvcrd.c                      | 3918 +-------------------------
 pngwrite.c                     |   16 -
 scripts/CMakeLists.txt         |   21 +-
 scripts/libpng-config-head.in  |    2 +-
 scripts/libpng.pc-configure.in |    2 +-
 scripts/libpng.pc.in           |    2 +-
 scripts/makefile.32sunu        |    2 +-
 scripts/makefile.64sunu        |    2 +-
 scripts/makefile.aix           |    2 +-
 scripts/makefile.beos          |    2 +-
 scripts/makefile.cygwin        |   21 +-
 scripts/makefile.darwin        |    2 +-
 scripts/makefile.dec           |    2 +-
 scripts/makefile.elf           |    2 +-
 scripts/makefile.gcmmx         |   12 +-
 scripts/makefile.hp64          |    2 +-
 scripts/makefile.hpgcc         |    2 +-
 scripts/makefile.hpux          |    2 +-
 scripts/makefile.intel         |    8 +-
 scripts/makefile.linux         |    2 +-
 scripts/makefile.mingw         |    9 +-
 scripts/makefile.ne12bsd       |    8 +-
 scripts/makefile.netbsd        |    8 +-
 scripts/makefile.nommx         |    2 +-
 scripts/makefile.openbsd       |    4 +-
 scripts/makefile.sco           |    2 +-
 scripts/makefile.sggcc         |    2 +-
 scripts/makefile.sgi           |    2 +-
 scripts/makefile.so9           |    2 +-
 scripts/makefile.solaris       |    2 +-
 scripts/makefile.solaris-x86   |    5 +-
 scripts/makefile.vcawin32      |    6 +-
 scripts/pngos2.def             |    2 +-
 scripts/pngw32.def             |    2 +-
 61 files changed, 3049 insertions(+), 14760 deletions(-)
 delete mode 100644 libpng-1.2.20rc3.txt
 create mode 100644 libpng-1.2.20rc4.txt
diff --git a/ANNOUNCE b/ANNOUNCE
index 2c813538e..0a7187c6c 100644
--- a/ANNOUNCE
+++ b/ANNOUNCE
@@ -1,5 +1,5 @@
 
-Libpng 1.2.20rc3 - August 30, 2007
+Libpng 1.2.20rc4 - September 1, 2007
 
 This is not intended to be a public release.  It will be replaced
 within a few weeks by a public version or by another test version.
@@ -9,32 +9,32 @@ Files available for download:
 Source files with LF line endings (for Unix/Linux) and with a
 "configure" script
 
-   libpng-1.2.20rc3.tar.gz
-   libpng-1.2.20rc3.tar.bz2
+   libpng-1.2.20rc4.tar.gz
+   libpng-1.2.20rc4.tar.bz2
 
 Source files with LF line endings (for Unix/Linux) without the
 "configure" script
 
-   libpng-1.2.20rc3-no-config.tar.gz
-   libpng-1.2.20rc3-no-config.tar.bz2
+   libpng-1.2.20rc4-no-config.tar.gz
+   libpng-1.2.20rc4-no-config.tar.bz2
 
 Source files with CRLF line endings (for Windows), without the
 "configure" script
 
-   lp1220r03.zip
-   lp1220r03.tar.bz2
+   lp1220r04.zip
+   lp1220r04.tar.bz2
 
 Project files
 
-   libpng-1.2.20rc3-project-netware.zip
-   libpng-1.2.20rc3-project-wince.zip
+   libpng-1.2.20rc4-project-netware.zip
+   libpng-1.2.20rc4-project-wince.zip
 
 Other information:
 
-   libpng-1.2.20rc3-README.txt
-   libpng-1.2.20rc3-KNOWNBUGS.txt
-   libpng-1.2.20rc3-LICENSE.txt
-   libpng-1.2.20rc3-Y2K-compliance.txt
+   libpng-1.2.20rc4-README.txt
+   libpng-1.2.20rc4-KNOWNBUGS.txt
+   libpng-1.2.20rc4-LICENSE.txt
+   libpng-1.2.20rc4-Y2K-compliance.txt
 
 Changes since the last public release (1.2.19):
 
@@ -62,13 +62,16 @@ version 1.2.20rc2 [August 27, 2007]
   Revised #ifdefs to ensure one and only one of pnggccrd.c, pngvcrd.c,
     or part of pngrutil.c is selected.
 
-version 1.2.20rc3 [August 30, 2007]
+version 1.2.20rc3 [September 1, 2007]
   Remove a little more code in pngwutil.c when PNG_NO_WRITE_FILTER is selected.
   Added /D _CRT_SECURE_NO_WARNINGS to visual6c and visualc71 projects.
   Compile png_mmx_support() in png.c even when PNG_NO_MMX_CODE is defined.
   Restored a "superfluous" #ifdef that was removed from 1.2.20rc2 pnggccrd.c,
     breaking the png_mmx_support() function.
 
+version 1.2.20rc4 [September 1, 2007]
+  Removed Intel contributions (MMX, Optimized C).
+
 Send comments/corrections/commendations to png-mng-implement at lists.sf.net
 
 (subscription required; visit 
diff --git a/CHANGES b/CHANGES
index e09eeb342..ffb954e24 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1926,6 +1926,9 @@ version 1.2.20rc3 [August 30, 2007]
   Restored a "superfluous" #ifdef that was removed from 1.2.20rc2 pnggccrd.c,
     breaking the png_mmx_support() function.
 
+version 1.2.20rc4 [September 1, 2007]
+  Removed Intel contributions (MMX, Optimized C).
+
 Send comments/corrections/commendations to png-mng-implement at lists.sf.net
 (subscription required; visit
 https://lists.sourceforge.net/lists/listinfo/png-mng-implement
diff --git a/INSTALL b/INSTALL
index ed8b51322..75f58e812 100644
--- a/INSTALL
+++ b/INSTALL
@@ -1,5 +1,5 @@
 
-Installing libpng version 1.2.20rc3 - August 30, 2007
+Installing libpng version 1.2.20rc4 - September 1, 2007
 
 On Unix/Linux and similar systems, you can simply type
 
@@ -44,7 +44,7 @@ to have access to the zlib.h and zconf.h include files that
 correspond to the version of zlib that's installed.
 
 You can rename the directories that you downloaded (they
-might be called "libpng-1.2.20rc3" or "lpng109" and "zlib-1.2.1"
+might be called "libpng-1.2.20rc4" or "lpng109" and "zlib-1.2.1"
 or "zlib121") so that you have directories called "zlib" and "libpng".
 
 Your directory structure should look like this:
@@ -101,15 +101,9 @@ include
  CMakeLists.txt    =>  "cmake" script
  makefile.std      =>  Generic UNIX makefile (cc, creates static libpng.a)
  makefile.elf      =>  Linux/ELF makefile symbol versioning,
-                       gcc, creates libpng12.so.0.1.2.20rc3)
+                       gcc, creates libpng12.so.0.1.2.20rc4)
  makefile.linux    =>  Linux/ELF makefile
-                       (gcc, creates libpng12.so.0.1.2.20rc3)
- makefile.gcmmx    =>  Linux/ELF makefile
-                       (gcc, creates libpng12.so.0.1.2.20rc3,
-                       uses assembler code tuned for Intel MMX platform)
- makefile.nommx    =>  Linux/ELF makefile
-                       (gcc, creates libpng12.so.0.1.2.20rc3
-                       does not use Intel MMX assembler code)
+                       (gcc, creates libpng12.so.0.1.2.20rc4)
  makefile.gcc      =>  Generic makefile (gcc, creates static libpng.a)
  makefile.knr      =>  Archaic UNIX Makefile that converts files with
                        ansi2knr (Requires ansi2knr.c from
@@ -131,14 +125,14 @@ include
  makefile.openbsd  =>  OpenBSD makefile
  makefile.sgi      =>  Silicon Graphics IRIX makefile (cc, creates static lib)
  makefile.sggcc    =>  Silicon Graphics (gcc,
-                       creates libpng12.so.0.1.2.20rc3)
+                       creates libpng12.so.0.1.2.20rc4)
  makefile.sunos    =>  Sun makefile
  makefile.solaris  =>  Solaris 2.X makefile (gcc,
-                       creates libpng12.so.0.1.2.20rc3)
+                       creates libpng12.so.0.1.2.20rc4)
  makefile.solaris-x86 =>  Solaris/intelMMX 2.X makefile (gcc,
-                       creates libpng12.so.0.1.2.20rc3)
+                       creates libpng12.so.0.1.2.20rc4)
  makefile.so9      =>  Solaris 9 makefile (gcc,
-                       creates libpng12.so.0.1.2.20rc3)
+                       creates libpng12.so.0.1.2.20rc4)
  makefile.32sunu   =>  Sun Ultra 32-bit makefile
  makefile.64sunu   =>  Sun Ultra 64-bit makefile
  makefile.sco      =>  For SCO OSr5  ELF and Unixware 7 with Native cc
@@ -154,10 +148,7 @@ include
  makefile.tc3      =>  Turbo C 3.0 makefile
  makefile.dj2      =>  DJGPP 2 makefile
  makefile.msc      =>  Microsoft C makefile
- makefile.vcawin32 =>  makefile for Microsoft Visual C++ 5.0 and later (uses
-                       assembler code tuned for Intel MMX platform)
- makefile.vcwin32  =>  makefile for Microsoft Visual C++ 4.0 and later (does
-                       not use assembler code)
+ makefile.vcwin32  =>  makefile for Microsoft Visual C++ 4.0 and later
  makefile.os2      =>  OS/2 Makefile (gcc and emx, requires pngos2.def)
  pngos2.def        =>  OS/2 module definition file used by makefile.os2
  makefile.watcom   =>  Watcom 10a+ Makefile, 32-bit flat memory model
diff --git a/KNOWNBUG b/KNOWNBUG
index 024512327..b1a946ce0 100644
--- a/KNOWNBUG
+++ b/KNOWNBUG
@@ -1,5 +1,5 @@
 
-Known bugs in libpng version 1.2.20rc3
+Known bugs in libpng version 1.2.20rc4
 
 1. April 22, 2001: pnggccrd.c has been reported to crash on NetBSD when
    reading interlaced PNG files, when assembler code is enabled but running
diff --git a/LICENSE b/LICENSE
index 9f9b12219..15011523d 100644
--- a/LICENSE
+++ b/LICENSE
@@ -8,7 +8,7 @@ COPYRIGHT NOTICE, DISCLAIMER, and LICENSE:
 If you modify libpng you may insert additional notices immediately following
 this sentence.
 
-libpng versions 1.2.6, August 15, 2004, through 1.2.20rc3, August 30, 2007, are
+libpng versions 1.2.6, August 15, 2004, through 1.2.20rc4, September 1, 2007, are
 Copyright (c) 2004, 2006-2007 Glenn Randers-Pehrson, and are
 distributed according to the same disclaimer and license as libpng-1.2.5
 with the following individual added to the list of Contributing Authors
@@ -106,4 +106,4 @@ certification mark of the Open Source Initiative.
 
 Glenn Randers-Pehrson
 glennrp at users.sourceforge.net
-August 30, 2007
+September 1, 2007
diff --git a/Makefile.am b/Makefile.am
index de7dd3115..549c2c7b5 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -33,7 +33,7 @@ lib_LTLIBRARIES=libpng12.la @compatlib@
 EXTRA_LTLIBRARIES= libpng.la
 libpng12_la_SOURCES = png.c pngset.c pngget.c pngrutil.c pngtrans.c pngwutil.c \
 	pngread.c pngrio.c pngwio.c pngwrite.c pngrtran.c \
-	pngwtran.c pngmem.c pngerror.c pngpread.c pnggccrd.c \
+	pngwtran.c pngmem.c pngerror.c pngpread.c \
 	png.h pngconf.h
 libpng_la_SOURCES = $(libpng12_la_SOURCES)
 
@@ -83,7 +83,7 @@ EXTRA_DIST= \
 	${srcdir}/contrib/pngsuite/* \
 	${srcdir}/contrib/visupng/* \
 	$(TESTS) \
-	example.c libpng.txt pngvcrd.c 
+	example.c libpng.txt
 
 CLEANFILES= pngout.png libpng12.pc libpng12-config libpng.vers \
 libpng.sym
diff --git a/Makefile.in b/Makefile.in
index 41913eba3..a56378adc 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -87,8 +87,7 @@ am__objects_1 = libpng_la-png.lo libpng_la-pngset.lo \
 	libpng_la-pngread.lo libpng_la-pngrio.lo libpng_la-pngwio.lo \
 	libpng_la-pngwrite.lo libpng_la-pngrtran.lo \
 	libpng_la-pngwtran.lo libpng_la-pngmem.lo \
-	libpng_la-pngerror.lo libpng_la-pngpread.lo \
-	libpng_la-pnggccrd.lo
+	libpng_la-pngerror.lo libpng_la-pngpread.lo
 am_libpng_la_OBJECTS = $(am__objects_1)
 libpng_la_OBJECTS = $(am_libpng_la_OBJECTS)
 libpng12_la_LIBADD =
@@ -99,7 +98,7 @@ am_libpng12_la_OBJECTS = libpng12_la-png.lo libpng12_la-pngset.lo \
 	libpng12_la-pngwio.lo libpng12_la-pngwrite.lo \
 	libpng12_la-pngrtran.lo libpng12_la-pngwtran.lo \
 	libpng12_la-pngmem.lo libpng12_la-pngerror.lo \
-	libpng12_la-pngpread.lo libpng12_la-pnggccrd.lo
+	libpng12_la-pngpread.lo
 libpng12_la_OBJECTS = $(am_libpng12_la_OBJECTS)
 am_pngtest_OBJECTS = pngtest.$(OBJEXT)
 pngtest_OBJECTS = $(am_pngtest_OBJECTS)
@@ -184,7 +183,6 @@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
 LDFLAGS = @LDFLAGS@
 LIBOBJS = @LIBOBJS@
 LIBPNG_DEFINES = @LIBPNG_DEFINES@
-LIBPNG_NO_MMX = @LIBPNG_NO_MMX@
 LIBS = @LIBS@
 LIBTOOL = @LIBTOOL@
 LN_S = @LN_S@
@@ -286,7 +284,7 @@ lib_LTLIBRARIES = libpng12.la @compatlib@
 EXTRA_LTLIBRARIES = libpng.la
 libpng12_la_SOURCES = png.c pngset.c pngget.c pngrutil.c pngtrans.c pngwutil.c \
 	pngread.c pngrio.c pngwio.c pngwrite.c pngrtran.c \
-	pngwtran.c pngmem.c pngerror.c pngpread.c pnggccrd.c \
+	pngwtran.c pngmem.c pngerror.c pngpread.c \
 	png.h pngconf.h
 
 libpng_la_SOURCES = $(libpng12_la_SOURCES)
@@ -322,7 +320,7 @@ EXTRA_DIST = \
 	${srcdir}/contrib/pngsuite/* \
 	${srcdir}/contrib/visupng/* \
 	$(TESTS) \
-	example.c libpng.txt pngvcrd.c 
+	example.c libpng.txt
 
 CLEANFILES = pngout.png libpng12.pc libpng12-config libpng.vers \
 libpng.sym
@@ -455,7 +453,6 @@ distclean-compile:
 
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng12_la-png.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng12_la-pngerror.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng12_la-pnggccrd.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng12_la-pngget.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng12_la-pngmem.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng12_la-pngpread.Plo@am__quote@
@@ -471,7 +468,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng12_la-pngwutil.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-png.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pngerror.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pnggccrd.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pngget.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pngmem.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pngpread.Plo@am__quote@
@@ -613,13 +609,6 @@ libpng_la-pngpread.lo: pngpread.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(LIBTOOL) --mode=compile --tag=CC $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng_la-pngpread.lo `test -f 'pngpread.c' || echo '$(srcdir)/'`pngpread.c
 
-libpng_la-pnggccrd.lo: pnggccrd.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --mode=compile --tag=CC $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng_la-pnggccrd.lo -MD -MP -MF "$(DEPDIR)/libpng_la-pnggccrd.Tpo" -c -o libpng_la-pnggccrd.lo `test -f 'pnggccrd.c' || echo '$(srcdir)/'`pnggccrd.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpng_la-pnggccrd.Tpo" "$(DEPDIR)/libpng_la-pnggccrd.Plo"; else rm -f "$(DEPDIR)/libpng_la-pnggccrd.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pnggccrd.c' object='libpng_la-pnggccrd.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --mode=compile --tag=CC $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng_la-pnggccrd.lo `test -f 'pnggccrd.c' || echo '$(srcdir)/'`pnggccrd.c
-
 libpng12_la-png.lo: png.c
 @am__fastdepCC_TRUE@	if $(LIBTOOL) --mode=compile --tag=CC $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng12_la-png.lo -MD -MP -MF "$(DEPDIR)/libpng12_la-png.Tpo" -c -o libpng12_la-png.lo `test -f 'png.c' || echo '$(srcdir)/'`png.c; \
 @am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpng12_la-png.Tpo" "$(DEPDIR)/libpng12_la-png.Plo"; else rm -f "$(DEPDIR)/libpng12_la-png.Tpo"; exit 1; fi
@@ -725,13 +714,6 @@ libpng12_la-pngpread.lo: pngpread.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(LIBTOOL) --mode=compile --tag=CC $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng12_la-pngpread.lo `test -f 'pngpread.c' || echo '$(srcdir)/'`pngpread.c
 
-libpng12_la-pnggccrd.lo: pnggccrd.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --mode=compile --tag=CC $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng12_la-pnggccrd.lo -MD -MP -MF "$(DEPDIR)/libpng12_la-pnggccrd.Tpo" -c -o libpng12_la-pnggccrd.lo `test -f 'pnggccrd.c' || echo '$(srcdir)/'`pnggccrd.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpng12_la-pnggccrd.Tpo" "$(DEPDIR)/libpng12_la-pnggccrd.Plo"; else rm -f "$(DEPDIR)/libpng12_la-pnggccrd.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pnggccrd.c' object='libpng12_la-pnggccrd.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --mode=compile --tag=CC $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng12_la-pnggccrd.lo `test -f 'pnggccrd.c' || echo '$(srcdir)/'`pnggccrd.c
-
 mostlyclean-libtool:
 	-rm -f *.lo
 
diff --git a/README b/README
index dec2d45d2..6ae9daf7b 100644
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-README for libpng version 1.2.20rc3 - August 30, 2007 (shared library 12.0)
+README for libpng version 1.2.20rc4 - September 1, 2007 (shared library 12.0)
 See the note about version numbers near the top of png.h
 
 See INSTALL for instructions on how to install libpng.
@@ -190,11 +190,11 @@ Files in this distribution:
        descrip.mms      =>  VMS makefile for MMS or MMK
        makefile.std     =>  Generic UNIX makefile (cc, creates static libpng.a)
        makefile.elf     =>  Linux/ELF makefile symbol versioning,
-                            gcc, creates libpng12.so.0.1.2.20rc3)
+                            gcc, creates libpng12.so.0.1.2.20rc4)
        makefile.linux   =>  Linux/ELF makefile
-                            (gcc, creates libpng12.so.0.1.2.20rc3)
+                            (gcc, creates libpng12.so.0.1.2.20rc4)
        makefile.gcmmx   =>  Linux/ELF makefile
-                            (gcc, creates libpng12.so.0.1.2.20rc3,
+                            (gcc, creates libpng12.so.0.1.2.20rc4,
                             uses assembler code tuned for Intel MMX platform)
        makefile.gcc     =>  Generic makefile (gcc, creates static libpng.a)
        makefile.knr     =>  Archaic UNIX Makefile that converts files with
@@ -216,12 +216,12 @@ Files in this distribution:
        makefile.openbsd =>  OpenBSD makefile
        makefile.sgi     =>  Silicon Graphics IRIX (cc, creates static lib)
        makefile.sggcc   =>  Silicon Graphics
-                            (gcc, creates libpng12.so.0.1.2.20rc3)
+                            (gcc, creates libpng12.so.0.1.2.20rc4)
        makefile.sunos   =>  Sun makefile
        makefile.solaris =>  Solaris 2.X makefile
-                            (gcc, creates libpng12.so.0.1.2.20rc3)
+                            (gcc, creates libpng12.so.0.1.2.20rc4)
        makefile.so9     =>  Solaris 9 makefile
-                            (gcc, creates libpng12.so.0.1.2.20rc3)
+                            (gcc, creates libpng12.so.0.1.2.20rc4)
        makefile.32sunu  =>  Sun Ultra 32-bit makefile
        makefile.64sunu  =>  Sun Ultra 64-bit makefile
        makefile.sco     =>  For SCO OSr5  ELF and Unixware 7 with Native cc
diff --git a/Y2KINFO b/Y2KINFO
index af33e0ed4..52cb27b16 100644
--- a/Y2KINFO
+++ b/Y2KINFO
@@ -1,13 +1,13 @@
    Y2K compliance in libpng:
    =========================
 
-      August 30, 2007
+      September 1, 2007
 
       Since the PNG Development group is an ad-hoc body, we can't make
       an official declaration.
 
       This is your unofficial assurance that libpng from version 0.71 and
-      upward through 1.2.20rc3 are Y2K compliant.  It is my belief that earlier
+      upward through 1.2.20rc4 are Y2K compliant.  It is my belief that earlier
       versions were also Y2K compliant.
 
       Libpng only has three year fields.  One is a 2-byte unsigned integer
diff --git a/configure b/configure
index c60ff4030..bfaf6366f 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.61 for libpng 1.2.20rc3.
+# Generated by GNU Autoconf 2.61 for libpng 1.2.20rc4.
 #
 # Report bugs to <png-mng-implement@lists.sourceforge.net>.
 #
@@ -728,8 +728,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
 # Identity of this package.
 PACKAGE_NAME='libpng'
 PACKAGE_TARNAME='libpng'
-PACKAGE_VERSION='1.2.20rc3'
-PACKAGE_STRING='libpng 1.2.20rc3'
+PACKAGE_VERSION='1.2.20rc4'
+PACKAGE_STRING='libpng 1.2.20rc4'
 PACKAGE_BUGREPORT='png-mng-implement@lists.sourceforge.net'
 
 ac_unique_file="pngget.c"
@@ -876,7 +876,6 @@ LIBTOOL
 POW_LIB
 LIBOBJS
 LIBPNG_DEFINES
-LIBPNG_NO_MMX
 HAVE_LD_VERSION_SCRIPT_TRUE
 HAVE_LD_VERSION_SCRIPT_FALSE
 PNGLIB_VERSION
@@ -1405,7 +1404,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures libpng 1.2.20rc3 to adapt to many kinds of systems.
+\`configure' configures libpng 1.2.20rc4 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1475,7 +1474,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of libpng 1.2.20rc3:";;
+     short | recursive ) echo "Configuration of libpng 1.2.20rc4:";;
    esac
   cat <<\_ACEOF
 
@@ -1585,7 +1584,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-libpng configure 1.2.20rc3
+libpng configure 1.2.20rc4
 generated by GNU Autoconf 2.61
 
 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1599,7 +1598,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by libpng $as_me 1.2.20rc3, which was
+It was created by libpng $as_me 1.2.20rc4, which was
 generated by GNU Autoconf 2.61.  Invocation command line was
 
   $ $0 $@
@@ -2269,7 +2268,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='libpng'
- VERSION='1.2.20rc3'
+ VERSION='1.2.20rc4'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -2440,7 +2439,7 @@ fi
 
 
 
-PNGLIB_VERSION=1.2.20rc3
+PNGLIB_VERSION=1.2.20rc4
 PNGLIB_MAJOR=1
 PNGLIB_MINOR=2
 PNGLIB_RELEASE=20
@@ -4788,7 +4787,7 @@ ia64-*-hpux*)
   ;;
 *-*-irix6*)
   # Find out which ABI we are using.
-  echo '#line 4791 "configure"' > conftest.$ac_ext
+  echo '#line 4790 "configure"' > conftest.$ac_ext
   if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
   (eval $ac_compile) 2>&5
   ac_status=$?
@@ -7301,11 +7300,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:7304: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:7303: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:7308: \$? = $ac_status" >&5
+   echo "$as_me:7307: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -7591,11 +7590,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:7594: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:7593: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:7598: \$? = $ac_status" >&5
+   echo "$as_me:7597: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -7695,11 +7694,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:7698: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:7697: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>out/conftest.err)
    ac_status=$?
    cat out/conftest.err >&5
-   echo "$as_me:7702: \$? = $ac_status" >&5
+   echo "$as_me:7701: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s out/conftest2.$ac_objext
    then
      # The compiler can only warn and ignore the option if not recognized
@@ -10044,7 +10043,7 @@ else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<EOF
-#line 10047 "configure"
+#line 10046 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -10144,7 +10143,7 @@ else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<EOF
-#line 10147 "configure"
+#line 10146 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -12564,11 +12563,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:12567: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:12566: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:12571: \$? = $ac_status" >&5
+   echo "$as_me:12570: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -12668,11 +12667,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:12671: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:12670: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>out/conftest.err)
    ac_status=$?
    cat out/conftest.err >&5
-   echo "$as_me:12675: \$? = $ac_status" >&5
+   echo "$as_me:12674: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s out/conftest2.$ac_objext
    then
      # The compiler can only warn and ignore the option if not recognized
@@ -14230,11 +14229,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:14233: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:14232: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:14237: \$? = $ac_status" >&5
+   echo "$as_me:14236: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -14334,11 +14333,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:14337: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:14336: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>out/conftest.err)
    ac_status=$?
    cat out/conftest.err >&5
-   echo "$as_me:14341: \$? = $ac_status" >&5
+   echo "$as_me:14340: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s out/conftest2.$ac_objext
    then
      # The compiler can only warn and ignore the option if not recognized
@@ -16521,11 +16520,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:16524: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:16523: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:16528: \$? = $ac_status" >&5
+   echo "$as_me:16527: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -16811,11 +16810,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:16814: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:16813: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:16818: \$? = $ac_status" >&5
+   echo "$as_me:16817: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -16915,11 +16914,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:16918: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:16917: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>out/conftest.err)
    ac_status=$?
    cat out/conftest.err >&5
-   echo "$as_me:16922: \$? = $ac_status" >&5
+   echo "$as_me:16921: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s out/conftest2.$ac_objext
    then
      # The compiler can only warn and ignore the option if not recognized
@@ -20713,55 +20712,6 @@ fi
 
 
 LIBPNG_DEFINES=-DPNG_CONFIGURE_LIBPNG
-{ echo "$as_me:$LINENO: checking if assembler code in pnggccrd.c can be compiled without PNG_NO_MMX_CODE" >&5
-echo $ECHO_N "checking if assembler code in pnggccrd.c can be compiled without PNG_NO_MMX_CODE... $ECHO_C" >&6; }
-cat >conftest.$ac_ext <<_ACEOF
-/* confdefs.h.  */
-_ACEOF
-cat confdefs.h >>conftest.$ac_ext
-cat >>conftest.$ac_ext <<_ACEOF
-/* end confdefs.h.  */
-#include "$srcdir/pnggccrd.c"
-int
-main ()
-{
-return 0;
-  ;
-  return 0;
-}
-_ACEOF
-rm -f conftest.$ac_objext
-if { (ac_try="$ac_compile"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
-  (eval "$ac_compile") 2>conftest.er1
-  ac_status=$?
-  grep -v '^ *+' conftest.er1 >conftest.err
-  rm -f conftest.er1
-  cat conftest.err >&5
-  echo "$as_me:$LINENO: \$? = $ac_status" >&5
-  (exit $ac_status); } && {
-	 test -z "$ac_c_werror_flag" ||
-	 test ! -s conftest.err
-       } && test -s conftest.$ac_objext; then
-  { echo "$as_me:$LINENO: result: yes" >&5
-echo "${ECHO_T}yes" >&6; }
-  LIBPNG_NO_MMX=""
-else
-  echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-	{ echo "$as_me:$LINENO: result: no" >&5
-echo "${ECHO_T}no" >&6; }
-  LIBPNG_NO_MMX=-DPNG_NO_MMX_CODE
-fi
-
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-LIBPNG_DEFINES=$LIBPNG_DEFINES\ $LIBPNG_NO_MMX
-
 
 
 { echo "$as_me:$LINENO: checking if libraries can be versioned" >&5
@@ -21282,7 +21232,7 @@ exec 6>&1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by libpng $as_me 1.2.20rc3, which was
+This file was extended by libpng $as_me 1.2.20rc4, which was
 generated by GNU Autoconf 2.61.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -21335,7 +21285,7 @@ Report bugs to <bug-autoconf@gnu.org>."
 _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF
 ac_cs_version="\\
-libpng config.status 1.2.20rc3
+libpng config.status 1.2.20rc4
 configured by $0, generated by GNU Autoconf 2.61,
   with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
 
@@ -21663,7 +21613,6 @@ LIBTOOL!$LIBTOOL$ac_delim
 POW_LIB!$POW_LIB$ac_delim
 LIBOBJS!$LIBOBJS$ac_delim
 LIBPNG_DEFINES!$LIBPNG_DEFINES$ac_delim
-LIBPNG_NO_MMX!$LIBPNG_NO_MMX$ac_delim
 HAVE_LD_VERSION_SCRIPT_TRUE!$HAVE_LD_VERSION_SCRIPT_TRUE$ac_delim
 HAVE_LD_VERSION_SCRIPT_FALSE!$HAVE_LD_VERSION_SCRIPT_FALSE$ac_delim
 PNGLIB_VERSION!$PNGLIB_VERSION$ac_delim
@@ -21676,7 +21625,7 @@ compatlib!$compatlib$ac_delim
 LTLIBOBJS!$LTLIBOBJS$ac_delim
 _ACEOF
 
-  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 21; then
+  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 20; then
     break
   elif $ac_last_try; then
     { { echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
diff --git a/configure.ac b/configure.ac
index c7a98e2c0..244952c57 100644
--- a/configure.ac
+++ b/configure.ac
@@ -18,12 +18,12 @@ AC_PREREQ(2.59)
 
 dnl Version number stuff here:
 
-AC_INIT([libpng], [1.2.20rc3], [png-mng-implement@lists.sourceforge.net])
+AC_INIT([libpng], [1.2.20rc4], [png-mng-implement@lists.sourceforge.net])
 AM_INIT_AUTOMAKE
 dnl stop configure from automagically running automake
 AM_MAINTAINER_MODE
 
-PNGLIB_VERSION=1.2.20rc3
+PNGLIB_VERSION=1.2.20rc4
 PNGLIB_MAJOR=1
 PNGLIB_MINOR=2
 PNGLIB_RELEASE=20
@@ -59,18 +59,7 @@ AC_CHECK_FUNCS([pow], , AC_CHECK_LIB(m, pow, , AC_ERROR([cannot find pow])) )
 AC_CHECK_LIB(z, zlibVersion, , AC_ERROR([zlib not installed]))
 
 LIBPNG_DEFINES=-DPNG_CONFIGURE_LIBPNG
-AC_MSG_CHECKING(
-  [if assembler code in pnggccrd.c can be compiled without PNG_NO_MMX_CODE])
-AC_TRY_COMPILE(
-  [#include "$srcdir/pnggccrd.c"],
-  [return 0;],
-  AC_MSG_RESULT(yes)
-  LIBPNG_NO_MMX="",
-  AC_MSG_RESULT(no)
-  LIBPNG_NO_MMX=-DPNG_NO_MMX_CODE)
-LIBPNG_DEFINES=$LIBPNG_DEFINES\ $LIBPNG_NO_MMX
 AC_SUBST(LIBPNG_DEFINES)
-AC_SUBST(LIBPNG_NO_MMX)
 
 AC_MSG_CHECKING([if libraries can be versioned])
 GLD=`$LD --help < /dev/null 2>/dev/null | grep version-script`
diff --git a/libpng-1.2.20rc3.txt b/libpng-1.2.20rc3.txt
deleted file mode 100644
index ce051241f..000000000
--- a/libpng-1.2.20rc3.txt
+++ /dev/null
@@ -1,3023 +0,0 @@
-libpng.txt - A description on how to use and modify libpng
-
- libpng version 1.2.20rc3 - August 30, 2007
- Updated and distributed by Glenn Randers-Pehrson
- <glennrp at users.sourceforge.net>
- Copyright (c) 1998-2007 Glenn Randers-Pehrson
- For conditions of distribution and use, see copyright
- notice in png.h.
-
- based on:
-
- libpng 1.0 beta 6  version 0.96 May 28, 1997
- Updated and distributed by Andreas Dilger
- Copyright (c) 1996, 1997 Andreas Dilger
-
- libpng 1.0 beta 2 - version 0.88  January 26, 1996
- For conditions of distribution and use, see copyright
- notice in png.h. Copyright (c) 1995, 1996 Guy Eric
- Schalnat, Group 42, Inc.
-
- Updated/rewritten per request in the libpng FAQ
- Copyright (c) 1995, 1996 Frank J. T. Wojcik
- December 18, 1995 & January 20, 1996
-
-I. Introduction
-
-This file describes how to use and modify the PNG reference library
-(known as libpng) for your own use.  There are five sections to this
-file: introduction, structures, reading, writing, and modification and
-configuration notes for various special platforms.  In addition to this
-file, example.c is a good starting point for using the library, as
-it is heavily commented and should include everything most people
-will need.  We assume that libpng is already installed; see the
-INSTALL file for instructions on how to install libpng.
-
-For examples of libpng usage, see the files "example.c", "pngtest.c",
-and the files in the "contrib" directory, all of which are included in the
-libpng distribution.
-
-Libpng was written as a companion to the PNG specification, as a way
-of reducing the amount of time and effort it takes to support the PNG
-file format in application programs.
-
-The PNG specification (second edition), November 2003, is available as
-a W3C Recommendation and as an ISO Standard (ISO/IEC 15948:2003 (E)) at
-<http://www.w3.org/TR/2003/REC-PNG-20031110/
-The W3C and ISO documents have identical technical content.
-
-The PNG-1.2 specification is available at
-<http://www.libpng.org/pub/png/documents/>
-
-The PNG-1.0 specification is available
-as RFC 2083 <http://www.libpng.org/pub/png/documents/> and as a
-W3C Recommendation <http://www.w3.org/TR/REC.png.html>. Some
-additional chunks are described in the special-purpose public chunks
-documents at <http://www.libpng.org/pub/png/documents/>.
-
-Other information
-about PNG, and the latest version of libpng, can be found at the PNG home
-page, <http://www.libpng.org/pub/png/>.
-
-Most users will not have to modify the library significantly; advanced
-users may want to modify it more.  All attempts were made to make it as
-complete as possible, while keeping the code easy to understand.
-Currently, this library only supports C.  Support for other languages
-is being considered.
-
-Libpng has been designed to handle multiple sessions at one time,
-to be easily modifiable, to be portable to the vast majority of
-machines (ANSI, K&R, 16-, 32-, and 64-bit) available, and to be easy
-to use.  The ultimate goal of libpng is to promote the acceptance of
-the PNG file format in whatever way possible.  While there is still
-work to be done (see the TODO file), libpng should cover the
-majority of the needs of its users.
-
-Libpng uses zlib for its compression and decompression of PNG files.
-Further information about zlib, and the latest version of zlib, can
-be found at the zlib home page, <http://www.info-zip.org/pub/infozip/zlib/>.
-The zlib compression utility is a general purpose utility that is
-useful for more than PNG files, and can be used without libpng.
-See the documentation delivered with zlib for more details.
-You can usually find the source files for the zlib utility wherever you
-find the libpng source files.
-
-Libpng is thread safe, provided the threads are using different
-instances of the structures.  Each thread should have its own
-png_struct and png_info instances, and thus its own image.
-Libpng does not protect itself against two threads using the
-same instance of a structure.  Note: thread safety may be defeated
-by use of some of the MMX assembler code in pnggccrd.c, which is only
-compiled when the user defines PNG_THREAD_UNSAFE_OK.
-
-II. Structures
-
-There are two main structures that are important to libpng, png_struct
-and png_info.  The first, png_struct, is an internal structure that
-will not, for the most part, be used by a user except as the first
-variable passed to every libpng function call.
-
-The png_info structure is designed to provide information about the
-PNG file.  At one time, the fields of png_info were intended to be
-directly accessible to the user.  However, this tended to cause problems
-with applications using dynamically loaded libraries, and as a result
-a set of interface functions for png_info (the png_get_*() and png_set_*()
-functions) was developed.  The fields of png_info are still available for
-older applications, but it is suggested that applications use the new
-interfaces if at all possible.
-
-Applications that do make direct access to the members of png_struct (except
-for png_ptr->jmpbuf) must be recompiled whenever the library is updated,
-and applications that make direct access to the members of png_info must
-be recompiled if they were compiled or loaded with libpng version 1.0.6,
-in which the members were in a different order.  In version 1.0.7, the
-members of the png_info structure reverted to the old order, as they were
-in versions 0.97c through 1.0.5.  Starting with version 2.0.0, both
-structures are going to be hidden, and the contents of the structures will
-only be accessible through the png_get/png_set functions.
-
-The png.h header file is an invaluable reference for programming with libpng.
-And while I'm on the topic, make sure you include the libpng header file:
-
-#include <png.h>
-
-III. Reading
-
-We'll now walk you through the possible functions to call when reading
-in a PNG file sequentially, briefly explaining the syntax and purpose
-of each one.  See example.c and png.h for more detail.  While
-progressive reading is covered in the next section, you will still
-need some of the functions discussed in this section to read a PNG
-file.
-
-Setup
-
-You will want to do the I/O initialization(*) before you get into libpng,
-so if it doesn't work, you don't have much to undo.  Of course, you
-will also want to insure that you are, in fact, dealing with a PNG
-file.  Libpng provides a simple check to see if a file is a PNG file.
-To use it, pass in the first 1 to 8 bytes of the file to the function
-png_sig_cmp(), and it will return 0 if the bytes match the corresponding
-bytes of the PNG signature, or nonzero otherwise.  Of course, the more bytes
-you pass in, the greater the accuracy of the prediction.
-
-If you are intending to keep the file pointer open for use in libpng,
-you must ensure you don't read more than 8 bytes from the beginning
-of the file, and you also have to make a call to png_set_sig_bytes_read()
-with the number of bytes you read from the beginning.  Libpng will
-then only check the bytes (if any) that your program didn't read.
-
-(*): If you are not using the standard I/O functions, you will need
-to replace them with custom functions.  See the discussion under
-Customizing libpng.
-
-
-    FILE *fp = fopen(file_name, "rb");
-    if (!fp)
-    {
-        return (ERROR);
-    }
-    fread(header, 1, number, fp);
-    is_png = !png_sig_cmp(header, 0, number);
-    if (!is_png)
-    {
-        return (NOT_PNG);
-    }
-
-
-Next, png_struct and png_info need to be allocated and initialized.  In
-order to ensure that the size of these structures is correct even with a
-dynamically linked libpng, there are functions to initialize and
-allocate the structures.  We also pass the library version, optional
-pointers to error handling functions, and a pointer to a data struct for
-use by the error functions, if necessary (the pointer and functions can
-be NULL if the default error handlers are to be used).  See the section
-on Changes to Libpng below regarding the old initialization functions.
-The structure allocation functions quietly return NULL if they fail to
-create the structure, so your application should check for that.
-
-    png_structp png_ptr = png_create_read_struct
-       (PNG_LIBPNG_VER_STRING, (png_voidp)user_error_ptr,
-        user_error_fn, user_warning_fn);
-    if (!png_ptr)
-        return (ERROR);
-
-    png_infop info_ptr = png_create_info_struct(png_ptr);
-    if (!info_ptr)
-    {
-        png_destroy_read_struct(&png_ptr,
-           (png_infopp)NULL, (png_infopp)NULL);
-        return (ERROR);
-    }
-
-    png_infop end_info = png_create_info_struct(png_ptr);
-    if (!end_info)
-    {
-        png_destroy_read_struct(&png_ptr, &info_ptr,
-          (png_infopp)NULL);
-        return (ERROR);
-    }
-
-If you want to use your own memory allocation routines,
-define PNG_USER_MEM_SUPPORTED and use
-png_create_read_struct_2() instead of png_create_read_struct():
-
-    png_structp png_ptr = png_create_read_struct_2
-       (PNG_LIBPNG_VER_STRING, (png_voidp)user_error_ptr,
-        user_error_fn, user_warning_fn, (png_voidp)
-        user_mem_ptr, user_malloc_fn, user_free_fn);
-
-The error handling routines passed to png_create_read_struct()
-and the memory alloc/free routines passed to png_create_struct_2()
-are only necessary if you are not using the libpng supplied error
-handling and memory alloc/free functions.
-
-When libpng encounters an error, it expects to longjmp back
-to your routine.  Therefore, you will need to call setjmp and pass
-your png_jmpbuf(png_ptr).  If you read the file from different
-routines, you will need to update the jmpbuf field every time you enter
-a new routine that will call a png_*() function.
-
-See your documentation of setjmp/longjmp for your compiler for more
-information on setjmp/longjmp.  See the discussion on libpng error
-handling in the Customizing Libpng section below for more information
-on the libpng error handling.  If an error occurs, and libpng longjmp's
-back to your setjmp, you will want to call png_destroy_read_struct() to
-free any memory.
-
-    if (setjmp(png_jmpbuf(png_ptr)))
-    {
-        png_destroy_read_struct(&png_ptr, &info_ptr,
-           &end_info);
-        fclose(fp);
-        return (ERROR);
-    }
-
-If you would rather avoid the complexity of setjmp/longjmp issues,
-you can compile libpng with PNG_SETJMP_NOT_SUPPORTED, in which case
-errors will result in a call to PNG_ABORT() which defaults to abort().
-
-Now you need to set up the input code.  The default for libpng is to
-use the C function fread().  If you use this, you will need to pass a
-valid FILE * in the function png_init_io().  Be sure that the file is
-opened in binary mode.  If you wish to handle reading data in another
-way, you need not call the png_init_io() function, but you must then
-implement the libpng I/O methods discussed in the Customizing Libpng
-section below.
-
-    png_init_io(png_ptr, fp);
-
-If you had previously opened the file and read any of the signature from
-the beginning in order to see if this was a PNG file, you need to let
-libpng know that there are some bytes missing from the start of the file.
-
-    png_set_sig_bytes(png_ptr, number);
-
-Setting up callback code
-
-You can set up a callback function to handle any unknown chunks in the
-input stream. You must supply the function
-
-    read_chunk_callback(png_ptr ptr,
-         png_unknown_chunkp chunk);
-    {
-       /* The unknown chunk structure contains your
-          chunk data: */
-           png_byte name[5];
-           png_byte *data;
-           png_size_t size;
-       /* Note that libpng has already taken care of
-          the CRC handling */
-
-       /* put your code here.  Return one of the
-          following: */
-
-       return (-n); /* chunk had an error */
-       return (0); /* did not recognize */
-       return (n); /* success */
-    }
-
-(You can give your function another name that you like instead of
-"read_chunk_callback")
-
-To inform libpng about your function, use
-
-    png_set_read_user_chunk_fn(png_ptr, user_chunk_ptr,
-        read_chunk_callback);
-
-This names not only the callback function, but also a user pointer that
-you can retrieve with
-
-    png_get_user_chunk_ptr(png_ptr);
-
-At this point, you can set up a callback function that will be
-called after each row has been read, which you can use to control
-a progress meter or the like.  It's demonstrated in pngtest.c.
-You must supply a function
-
-    void read_row_callback(png_ptr ptr, png_uint_32 row,
-       int pass);
-    {
-      /* put your code here */
-    }
-
-(You can give it another name that you like instead of "read_row_callback")
-
-To inform libpng about your function, use
-
-    png_set_read_status_fn(png_ptr, read_row_callback);
-
-Width and height limits
-
-The PNG specification allows the width and height of an image to be as
-large as 2^31-1 (0x7fffffff), or about 2.147 billion rows and columns.
-Since very few applications really need to process such large images,
-we have imposed an arbitrary 1-million limit on rows and columns.
-Larger images will be rejected immediately with a png_error() call. If
-you wish to override this limit, you can use
-
-   png_set_user_limits(png_ptr, width_max, height_max);
-
-to set your own limits, or use width_max = height_max = 0x7fffffffL
-to allow all valid dimensions (libpng may reject some very large images
-anyway because of potential buffer overflow conditions).
-
-You should put this statement after you create the PNG structure and
-before calling png_read_info(), png_read_png(), or png_process_data().
-If you need to retrieve the limits that are being applied, use
-
-   width_max = png_get_user_width_max(png_ptr);
-   height_max = png_get_user_height_max(png_ptr);
-
-Unknown-chunk handling
-
-Now you get to set the way the library processes unknown chunks in the
-input PNG stream. Both known and unknown chunks will be read.  Normal
-behavior is that known chunks will be parsed into information in
-various info_ptr members; unknown chunks will be discarded. To change
-this, you can call:
-
-    png_set_keep_unknown_chunks(png_ptr, keep,
-        chunk_list, num_chunks);
-    keep       - 0: do not handle as unknown
-                 1: do not keep
-                 2: keep only if safe-to-copy
-                 3: keep even if unsafe-to-copy
-               You can use these definitions:
-                 PNG_HANDLE_CHUNK_AS_DEFAULT   0
-                 PNG_HANDLE_CHUNK_NEVER        1
-                 PNG_HANDLE_CHUNK_IF_SAFE      2
-                 PNG_HANDLE_CHUNK_ALWAYS       3
-    chunk_list - list of chunks affected (a byte string,
-                 five bytes per chunk, NULL or '\0' if
-                 num_chunks is 0)
-    num_chunks - number of chunks affected; if 0, all
-                 unknown chunks are affected.  If nonzero,
-                 only the chunks in the list are affected
-
-Unknown chunks declared in this way will be saved as raw data onto a
-list of png_unknown_chunk structures.  If a chunk that is normally
-known to libpng is named in the list, it will be handled as unknown,
-according to the "keep" directive.  If a chunk is named in successive
-instances of png_set_keep_unknown_chunks(), the final instance will
-take precedence.  The IHDR and IEND chunks should not be named in
-chunk_list; if they are, libpng will process them normally anyway.
-
-The high-level read interface
-
-At this point there are two ways to proceed; through the high-level
-read interface, or through a sequence of low-level read operations.
-You can use the high-level interface if (a) you are willing to read
-the entire image into memory, and (b) the input transformations
-you want to do are limited to the following set:
-
-    PNG_TRANSFORM_IDENTITY      No transformation
-    PNG_TRANSFORM_STRIP_16      Strip 16-bit samples to
-                                8 bits
-    PNG_TRANSFORM_STRIP_ALPHA   Discard the alpha channel
-    PNG_TRANSFORM_PACKING       Expand 1, 2 and 4-bit
-                                samples to bytes
-    PNG_TRANSFORM_PACKSWAP      Change order of packed
-                                pixels to LSB first
-    PNG_TRANSFORM_EXPAND        Perform set_expand()
-    PNG_TRANSFORM_INVERT_MONO   Invert monochrome images
-    PNG_TRANSFORM_SHIFT         Normalize pixels to the
-                                sBIT depth
-    PNG_TRANSFORM_BGR           Flip RGB to BGR, RGBA
-                                to BGRA
-    PNG_TRANSFORM_SWAP_ALPHA    Flip RGBA to ARGB or GA
-                                to AG
-    PNG_TRANSFORM_INVERT_ALPHA  Change alpha from opacity
-                                to transparency
-    PNG_TRANSFORM_SWAP_ENDIAN   Byte-swap 16-bit samples
-
-(This excludes setting a background color, doing gamma transformation,
-dithering, and setting filler.)  If this is the case, simply do this:
-
-    png_read_png(png_ptr, info_ptr, png_transforms, NULL)
-
-where png_transforms is an integer containing the bitwise OR of
-some set of transformation flags.  This call is equivalent to png_read_info(),
-followed the set of transformations indicated by the transform mask,
-then png_read_image(), and finally png_read_end().
-
-(The final parameter of this call is not yet used.  Someday it might point
-to transformation parameters required by some future input transform.)
-
-You must use png_transforms and not call any png_set_transform() functions
-when you use png_read_png().
-
-After you have called png_read_png(), you can retrieve the image data
-with
-
-   row_pointers = png_get_rows(png_ptr, info_ptr);
-
-where row_pointers is an array of pointers to the pixel data for each row:
-
-   png_bytep row_pointers[height];
-
-If you know your image size and pixel size ahead of time, you can allocate
-row_pointers prior to calling png_read_png() with
-
-   if (height > PNG_UINT_32_MAX/png_sizeof(png_byte))
-      png_error (png_ptr,
-         "Image is too tall to process in memory");
-   if (width > PNG_UINT_32_MAX/pixel_size)
-      png_error (png_ptr,
-         "Image is too wide to process in memory");
-   row_pointers = png_malloc(png_ptr,
-      height*png_sizeof(png_bytep));
-   for (int i=0; i<height, i++)
-      row_pointers[i]=png_malloc(png_ptr,
-         width*pixel_size);
-   png_set_rows(png_ptr, info_ptr, &row_pointers);
-
-Alternatively you could allocate your image in one big block and define
-row_pointers[i] to point into the proper places in your block.
-
-If you use png_set_rows(), the application is responsible for freeing
-row_pointers (and row_pointers[i], if they were separately allocated).
-
-If you don't allocate row_pointers ahead of time, png_read_png() will
-do it, and it'll be free'ed when you call png_destroy_*().
-
-The low-level read interface
-
-If you are going the low-level route, you are now ready to read all
-the file information up to the actual image data.  You do this with a
-call to png_read_info().
-
-    png_read_info(png_ptr, info_ptr);
-
-This will process all chunks up to but not including the image data.
-
-Querying the info structure
-
-Functions are used to get the information from the info_ptr once it
-has been read.  Note that these fields may not be completely filled
-in until png_read_end() has read the chunk data following the image.
-
-    png_get_IHDR(png_ptr, info_ptr, &width, &height,
-       &bit_depth, &color_type, &interlace_type,
-       &compression_type, &filter_method);
-
-    width          - holds the width of the image
-                     in pixels (up to 2^31).
-    height         - holds the height of the image
-                     in pixels (up to 2^31).
-    bit_depth      - holds the bit depth of one of the
-                     image channels.  (valid values are
-                     1, 2, 4, 8, 16 and depend also on
-                     the color_type.  See also
-                     significant bits (sBIT) below).
-    color_type     - describes which color/alpha channels
-                         are present.
-                     PNG_COLOR_TYPE_GRAY
-                        (bit depths 1, 2, 4, 8, 16)
-                     PNG_COLOR_TYPE_GRAY_ALPHA
-                        (bit depths 8, 16)
-                     PNG_COLOR_TYPE_PALETTE
-                        (bit depths 1, 2, 4, 8)
-                     PNG_COLOR_TYPE_RGB
-                        (bit_depths 8, 16)
-                     PNG_COLOR_TYPE_RGB_ALPHA
-                        (bit_depths 8, 16)
-
-                     PNG_COLOR_MASK_PALETTE
-                     PNG_COLOR_MASK_COLOR
-                     PNG_COLOR_MASK_ALPHA
-
-    filter_method  - (must be PNG_FILTER_TYPE_BASE
-                     for PNG 1.0, and can also be
-                     PNG_INTRAPIXEL_DIFFERENCING if
-                     the PNG datastream is embedded in
-                     a MNG-1.0 datastream)
-    compression_type - (must be PNG_COMPRESSION_TYPE_BASE
-                     for PNG 1.0)
-    interlace_type - (PNG_INTERLACE_NONE or
-                     PNG_INTERLACE_ADAM7)
-    Any or all of interlace_type, compression_type, of
-    filter_method can be NULL if you are
-    not interested in their values.
-
-    channels = png_get_channels(png_ptr, info_ptr);
-    channels       - number of channels of info for the
-                     color type (valid values are 1 (GRAY,
-                     PALETTE), 2 (GRAY_ALPHA), 3 (RGB),
-                     4 (RGB_ALPHA or RGB + filler byte))
-    rowbytes = png_get_rowbytes(png_ptr, info_ptr);
-    rowbytes       - number of bytes needed to hold a row
-
-    signature = png_get_signature(png_ptr, info_ptr);
-    signature      - holds the signature read from the
-                     file (if any).  The data is kept in
-                     the same offset it would be if the
-                     whole signature were read (i.e. if an
-                     application had already read in 4
-                     bytes of signature before starting
-                     libpng, the remaining 4 bytes would
-                     be in signature[4] through signature[7]
-                     (see png_set_sig_bytes())).
-
-
-    width            = png_get_image_width(png_ptr,
-                         info_ptr);
-    height           = png_get_image_height(png_ptr,
-                         info_ptr);
-    bit_depth        = png_get_bit_depth(png_ptr,
-                         info_ptr);
-    color_type       = png_get_color_type(png_ptr,
-                         info_ptr);
-    filter_method    = png_get_filter_type(png_ptr,
-                         info_ptr);
-    compression_type = png_get_compression_type(png_ptr,
-                         info_ptr);
-    interlace_type   = png_get_interlace_type(png_ptr,
-                         info_ptr);
-
-
-These are also important, but their validity depends on whether the chunk
-has been read.  The png_get_valid(png_ptr, info_ptr, PNG_INFO_<chunk>) and
-png_get_<chunk>(png_ptr, info_ptr, ...) functions return non-zero if the
-data has been read, or zero if it is missing.  The parameters to the
-png_get_<chunk> are set directly if they are simple data types, or a pointer
-into the info_ptr is returned for any complex types.
-
-    png_get_PLTE(png_ptr, info_ptr, &palette,
-                     &num_palette);
-    palette        - the palette for the file
-                     (array of png_color)
-    num_palette    - number of entries in the palette
-
-    png_get_gAMA(png_ptr, info_ptr, &gamma);
-    gamma          - the gamma the file is written
-                     at (PNG_INFO_gAMA)
-
-    png_get_sRGB(png_ptr, info_ptr, &srgb_intent);
-    srgb_intent    - the rendering intent (PNG_INFO_sRGB)
-                     The presence of the sRGB chunk
-                     means that the pixel data is in the
-                     sRGB color space.  This chunk also
-                     implies specific values of gAMA and
-                     cHRM.
-
-    png_get_iCCP(png_ptr, info_ptr, &name,
-       &compression_type, &profile, &proflen);
-    name            - The profile name.
-    compression     - The compression type; always
-                      PNG_COMPRESSION_TYPE_BASE for PNG 1.0.
-                      You may give NULL to this argument to
-                      ignore it.
-    profile         - International Color Consortium color
-                      profile data. May contain NULs.
-    proflen         - length of profile data in bytes.
-
-    png_get_sBIT(png_ptr, info_ptr, &sig_bit);
-    sig_bit        - the number of significant bits for
-                     (PNG_INFO_sBIT) each of the gray,
-                     red, green, and blue channels,
-                     whichever are appropriate for the
-                     given color type (png_color_16)
-
-    png_get_tRNS(png_ptr, info_ptr, &trans, &num_trans,
-                     &trans_values);
-    trans          - array of transparent entries for
-                     palette (PNG_INFO_tRNS)
-    trans_values   - graylevel or color sample values of
-                     the single transparent color for
-                     non-paletted images (PNG_INFO_tRNS)
-    num_trans      - number of transparent entries
-                     (PNG_INFO_tRNS)
-
-    png_get_hIST(png_ptr, info_ptr, &hist);
-                     (PNG_INFO_hIST)
-    hist           - histogram of palette (array of
-                     png_uint_16)
-
-    png_get_tIME(png_ptr, info_ptr, &mod_time);
-    mod_time       - time image was last modified
-                    (PNG_VALID_tIME)
-
-    png_get_bKGD(png_ptr, info_ptr, &background);
-    background     - background color (PNG_VALID_bKGD)
-                     valid 16-bit red, green and blue
-                     values, regardless of color_type
-
-    num_comments   = png_get_text(png_ptr, info_ptr,
-                     &text_ptr, &num_text);
-    num_comments   - number of comments
-    text_ptr       - array of png_text holding image
-                     comments
-    text_ptr[i].compression - type of compression used
-                 on "text" PNG_TEXT_COMPRESSION_NONE
-                           PNG_TEXT_COMPRESSION_zTXt
-                           PNG_ITXT_COMPRESSION_NONE
-                           PNG_ITXT_COMPRESSION_zTXt
-    text_ptr[i].key   - keyword for comment.  Must contain
-                         1-79 characters.
-    text_ptr[i].text  - text comments for current
-                         keyword.  Can be empty.
-    text_ptr[i].text_length - length of text string,
-                 after decompression, 0 for iTXt
-    text_ptr[i].itxt_length - length of itxt string,
-                 after decompression, 0 for tEXt/zTXt
-    text_ptr[i].lang  - language of comment (empty
-                         string for unknown).
-    text_ptr[i].lang_key  - keyword in UTF-8
-                         (empty string for unknown).
-    num_text       - number of comments (same as
-                     num_comments; you can put NULL here
-                     to avoid the duplication)
-    Note while png_set_text() will accept text, language,
-    and translated keywords that can be NULL pointers, the
-    structure returned by png_get_text will always contain
-    regular zero-terminated C strings.  They might be
-    empty strings but they will never be NULL pointers.
-
-    num_spalettes = png_get_sPLT(png_ptr, info_ptr,
-       &palette_ptr);
-    palette_ptr    - array of palette structures holding
-                     contents of one or more sPLT chunks
-                     read.
-    num_spalettes  - number of sPLT chunks read.
-
-    png_get_oFFs(png_ptr, info_ptr, &offset_x, &offset_y,
-       &unit_type);
-    offset_x       - positive offset from the left edge
-                     of the screen
-    offset_y       - positive offset from the top edge
-                     of the screen
-    unit_type      - PNG_OFFSET_PIXEL, PNG_OFFSET_MICROMETER
-
-    png_get_pHYs(png_ptr, info_ptr, &res_x, &res_y,
-       &unit_type);
-    res_x          - pixels/unit physical resolution in
-                     x direction
-    res_y          - pixels/unit physical resolution in
-                     x direction
-    unit_type      - PNG_RESOLUTION_UNKNOWN,
-                     PNG_RESOLUTION_METER
-
-    png_get_sCAL(png_ptr, info_ptr, &unit, &width,
-       &height)
-    unit        - physical scale units (an integer)
-    width       - width of a pixel in physical scale units
-    height      - height of a pixel in physical scale units
-                 (width and height are doubles)
-
-    png_get_sCAL_s(png_ptr, info_ptr, &unit, &width,
-       &height)
-    unit        - physical scale units (an integer)
-    width       - width of a pixel in physical scale units
-    height      - height of a pixel in physical scale units
-                 (width and height are strings like "2.54")
-
-    num_unknown_chunks = png_get_unknown_chunks(png_ptr,
-       info_ptr, &unknowns)
-    unknowns          - array of png_unknown_chunk
-                        structures holding unknown chunks
-    unknowns[i].name  - name of unknown chunk
-    unknowns[i].data  - data of unknown chunk
-    unknowns[i].size  - size of unknown chunk's data
-    unknowns[i].location - position of chunk in file
-
-    The value of "i" corresponds to the order in which the
-    chunks were read from the PNG file or inserted with the
-    png_set_unknown_chunks() function.
-
-The data from the pHYs chunk can be retrieved in several convenient
-forms:
-
-    res_x = png_get_x_pixels_per_meter(png_ptr,
-       info_ptr)
-    res_y = png_get_y_pixels_per_meter(png_ptr,
-       info_ptr)
-    res_x_and_y = png_get_pixels_per_meter(png_ptr,
-       info_ptr)
-    res_x = png_get_x_pixels_per_inch(png_ptr,
-       info_ptr)
-    res_y = png_get_y_pixels_per_inch(png_ptr,
-       info_ptr)
-    res_x_and_y = png_get_pixels_per_inch(png_ptr,
-       info_ptr)
-    aspect_ratio = png_get_pixel_aspect_ratio(png_ptr,
-       info_ptr)
-
-   (Each of these returns 0 [signifying "unknown"] if
-       the data is not present or if res_x is 0;
-       res_x_and_y is 0 if res_x != res_y)
-
-The data from the oFFs chunk can be retrieved in several convenient
-forms:
-
-    x_offset = png_get_x_offset_microns(png_ptr, info_ptr);
-    y_offset = png_get_y_offset_microns(png_ptr, info_ptr);
-    x_offset = png_get_x_offset_inches(png_ptr, info_ptr);
-    y_offset = png_get_y_offset_inches(png_ptr, info_ptr);
-
-   (Each of these returns 0 [signifying "unknown" if both
-       x and y are 0] if the data is not present or if the
-       chunk is present but the unit is the pixel)
-
-For more information, see the png_info definition in png.h and the
-PNG specification for chunk contents.  Be careful with trusting
-rowbytes, as some of the transformations could increase the space
-needed to hold a row (expand, filler, gray_to_rgb, etc.).
-See png_read_update_info(), below.
-
-A quick word about text_ptr and num_text.  PNG stores comments in
-keyword/text pairs, one pair per chunk, with no limit on the number
-of text chunks, and a 2^31 byte limit on their size.  While there are
-suggested keywords, there is no requirement to restrict the use to these
-strings.  It is strongly suggested that keywords and text be sensible
-to humans (that's the point), so don't use abbreviations.  Non-printing
-symbols are not allowed.  See the PNG specification for more details.
-There is also no requirement to have text after the keyword.
-
-Keywords should be limited to 79 Latin-1 characters without leading or
-trailing spaces, but non-consecutive spaces are allowed within the
-keyword.  It is possible to have the same keyword any number of times.
-The text_ptr is an array of png_text structures, each holding a
-pointer to a language string, a pointer to a keyword and a pointer to
-a text string.  The text string, language code, and translated
-keyword may be empty or NULL pointers.  The keyword/text
-pairs are put into the array in the order that they are received.
-However, some or all of the text chunks may be after the image, so, to
-make sure you have read all the text chunks, don't mess with these
-until after you read the stuff after the image.  This will be
-mentioned again below in the discussion that goes with png_read_end().
-
-Input transformations
-
-After you've read the header information, you can set up the library
-to handle any special transformations of the image data.  The various
-ways to transform the data will be described in the order that they
-should occur.  This is important, as some of these change the color
-type and/or bit depth of the data, and some others only work on
-certain color types and bit depths.  Even though each transformation
-checks to see if it has data that it can do something with, you should
-make sure to only enable a transformation if it will be valid for the
-data.  For example, don't swap red and blue on grayscale data.
-
-The colors used for the background and transparency values should be
-supplied in the same format/depth as the current image data.  They
-are stored in the same format/depth as the image data in a bKGD or tRNS
-chunk, so this is what libpng expects for this data.  The colors are
-transformed to keep in sync with the image data when an application
-calls the png_read_update_info() routine (see below).
-
-Data will be decoded into the supplied row buffers packed into bytes
-unless the library has been told to transform it into another format.
-For example, 4 bit/pixel paletted or grayscale data will be returned
-2 pixels/byte with the leftmost pixel in the high-order bits of the
-byte, unless png_set_packing() is called.  8-bit RGB data will be stored
-in RGB RGB RGB format unless png_set_filler() or png_set_add_alpha()
-is called to insert filler bytes, either before or after each RGB triplet.
-16-bit RGB data will be returned RRGGBB RRGGBB, with the most significant
-byte of the color value first, unless png_set_strip_16() is called to
-transform it to regular RGB RGB triplets, or png_set_filler() or
-png_set_add alpha() is called to insert filler bytes, either before or
-after each RRGGBB triplet.  Similarly, 8-bit or 16-bit grayscale data can
-be modified with
-png_set_filler(), png_set_add_alpha(), or png_set_strip_16().
-
-The following code transforms grayscale images of less than 8 to 8 bits,
-changes paletted images to RGB, and adds a full alpha channel if there is
-transparency information in a tRNS chunk.  This is most useful on
-grayscale images with bit depths of 2 or 4 or if there is a multiple-image
-viewing application that wishes to treat all images in the same way.
-
-    if (color_type == PNG_COLOR_TYPE_PALETTE)
-        png_set_palette_to_rgb(png_ptr);
-
-    if (color_type == PNG_COLOR_TYPE_GRAY &&
-        bit_depth < 8) png_set_expand_gray_1_2_4_to_8(png_ptr);
-
-    if (png_get_valid(png_ptr, info_ptr,
-        PNG_INFO_tRNS)) png_set_tRNS_to_alpha(png_ptr);
-
-These three functions are actually aliases for png_set_expand(), added
-in libpng version 1.0.4, with the function names expanded to improve code
-readability.  In some future version they may actually do different
-things.
-
-As of libpng version 1.2.9, png_set_expand_gray_1_2_4_to_8() was
-added.  It expands the sample depth without changing tRNS to alpha.
-At the same time, png_set_gray_1_2_4_to_8() was deprecated, and it
-will be removed from a future version.
-
-PNG can have files with 16 bits per channel.  If you only can handle
-8 bits per channel, this will strip the pixels down to 8 bit.
-
-    if (bit_depth == 16)
-        png_set_strip_16(png_ptr);
-
-If, for some reason, you don't need the alpha channel on an image,
-and you want to remove it rather than combining it with the background
-(but the image author certainly had in mind that you *would* combine
-it with the background, so that's what you should probably do):
-
-    if (color_type & PNG_COLOR_MASK_ALPHA)
-        png_set_strip_alpha(png_ptr);
-
-In PNG files, the alpha channel in an image
-is the level of opacity.  If you need the alpha channel in an image to
-be the level of transparency instead of opacity, you can invert the
-alpha channel (or the tRNS chunk data) after it's read, so that 0 is
-fully opaque and 255 (in 8-bit or paletted images) or 65535 (in 16-bit
-images) is fully transparent, with
-
-    png_set_invert_alpha(png_ptr);
-
-PNG files pack pixels of bit depths 1, 2, and 4 into bytes as small as
-they can, resulting in, for example, 8 pixels per byte for 1 bit
-files.  This code expands to 1 pixel per byte without changing the
-values of the pixels:
-
-    if (bit_depth < 8)
-        png_set_packing(png_ptr);
-
-PNG files have possible bit depths of 1, 2, 4, 8, and 16.  All pixels
-stored in a PNG image have been "scaled" or "shifted" up to the next
-higher possible bit depth (e.g. from 5 bits/sample in the range [0,31] to
-8 bits/sample in the range [0, 255]).  However, it is also possible to
-convert the PNG pixel data back to the original bit depth of the image.
-This call reduces the pixels back down to the original bit depth:
-
-    png_color_8p sig_bit;
-
-    if (png_get_sBIT(png_ptr, info_ptr, &sig_bit))
-        png_set_shift(png_ptr, sig_bit);
-
-PNG files store 3-color pixels in red, green, blue order.  This code
-changes the storage of the pixels to blue, green, red:
-
-    if (color_type == PNG_COLOR_TYPE_RGB ||
-        color_type == PNG_COLOR_TYPE_RGB_ALPHA)
-        png_set_bgr(png_ptr);
-
-PNG files store RGB pixels packed into 3 or 6 bytes. This code expands them
-into 4 or 8 bytes for windowing systems that need them in this format:
-
-    if (color_type == PNG_COLOR_TYPE_RGB)
-        png_set_filler(png_ptr, filler, PNG_FILLER_BEFORE);
-
-where "filler" is the 8 or 16-bit number to fill with, and the location is
-either PNG_FILLER_BEFORE or PNG_FILLER_AFTER, depending upon whether
-you want the filler before the RGB or after.  This transformation
-does not affect images that already have full alpha channels.  To add an
-opaque alpha channel, use filler=0xff or 0xffff and PNG_FILLER_AFTER which
-will generate RGBA pixels.
-
-Note that png_set_filler() does not change the color type.  If you want
-to do that, you can add a true alpha channel with
-
-    if (color_type == PNG_COLOR_TYPE_RGB ||
-           color_type == PNG_COLOR_TYPE_GRAY)
-    png_set_add_alpha(png_ptr, filler, PNG_FILLER_AFTER);
-
-where "filler" contains the alpha value to assign to each pixel.
-This function was added in libpng-1.2.7.
-
-If you are reading an image with an alpha channel, and you need the
-data as ARGB instead of the normal PNG format RGBA:
-
-    if (color_type == PNG_COLOR_TYPE_RGB_ALPHA)
-        png_set_swap_alpha(png_ptr);
-
-For some uses, you may want a grayscale image to be represented as
-RGB.  This code will do that conversion:
-
-    if (color_type == PNG_COLOR_TYPE_GRAY ||
-        color_type == PNG_COLOR_TYPE_GRAY_ALPHA)
-          png_set_gray_to_rgb(png_ptr);
-
-Conversely, you can convert an RGB or RGBA image to grayscale or grayscale
-with alpha.
-
-    if (color_type == PNG_COLOR_TYPE_RGB ||
-        color_type == PNG_COLOR_TYPE_RGB_ALPHA)
-          png_set_rgb_to_gray_fixed(png_ptr, error_action,
-             int red_weight, int green_weight);
-
-    error_action = 1: silently do the conversion
-    error_action = 2: issue a warning if the original
-                      image has any pixel where
-                      red != green or red != blue
-    error_action = 3: issue an error and abort the
-                      conversion if the original
-                      image has any pixel where
-                      red != green or red != blue
-
-    red_weight:       weight of red component times 100000
-    green_weight:     weight of green component times 100000
-                      If either weight is negative, default
-                      weights (21268, 71514) are used.
-
-If you have set error_action = 1 or 2, you can
-later check whether the image really was gray, after processing
-the image rows, with the png_get_rgb_to_gray_status(png_ptr) function.
-It will return a png_byte that is zero if the image was gray or
-1 if there were any non-gray pixels.  bKGD and sBIT data
-will be silently converted to grayscale, using the green channel
-data, regardless of the error_action setting.
-
-With red_weight+green_weight<=100000,
-the normalized graylevel is computed:
-
-    int rw = red_weight * 65536;
-    int gw = green_weight * 65536;
-    int bw = 65536 - (rw + gw);
-    gray = (rw*red + gw*green + bw*blue)/65536;
-
-The default values approximate those recommended in the Charles
-Poynton's Color FAQ, <http://www.inforamp.net/~poynton/>
-Copyright (c) 1998-01-04 Charles Poynton <poynton at inforamp.net>
-
-    Y = 0.212671 * R + 0.715160 * G + 0.072169 * B
-
-Libpng approximates this with
-
-    Y = 0.21268 * R    + 0.7151 * G    + 0.07217 * B
-
-which can be expressed with integers as
-
-    Y = (6969 * R + 23434 * G + 2365 * B)/32768
-
-The calculation is done in a linear colorspace, if the image gamma
-is known.
-
-If you have a grayscale and you are using png_set_expand_depth(),
-png_set_expand(), or png_set_gray_to_rgb to change to truecolor or to
-a higher bit-depth, you must either supply the background color as a gray
-value at the original file bit-depth (need_expand = 1) or else supply the
-background color as an RGB triplet at the final, expanded bit depth
-(need_expand = 0).  Similarly, if you are reading a paletted image, you
-must either supply the background color as a palette index (need_expand = 1)
-or as an RGB triplet that may or may not be in the palette (need_expand = 0).
-
-    png_color_16 my_background;
-    png_color_16p image_background;
-
-    if (png_get_bKGD(png_ptr, info_ptr, &image_background))
-        png_set_background(png_ptr, image_background,
-          PNG_BACKGROUND_GAMMA_FILE, 1, 1.0);
-    else
-        png_set_background(png_ptr, &my_background,
-          PNG_BACKGROUND_GAMMA_SCREEN, 0, 1.0);
-
-The png_set_background() function tells libpng to composite images
-with alpha or simple transparency against the supplied background
-color.  If the PNG file contains a bKGD chunk (PNG_INFO_bKGD valid),
-you may use this color, or supply another color more suitable for
-the current display (e.g., the background color from a web page).  You
-need to tell libpng whether the color is in the gamma space of the
-display (PNG_BACKGROUND_GAMMA_SCREEN for colors you supply), the file
-(PNG_BACKGROUND_GAMMA_FILE for colors from the bKGD chunk), or one
-that is neither of these gammas (PNG_BACKGROUND_GAMMA_UNIQUE - I don't
-know why anyone would use this, but it's here).
-
-To properly display PNG images on any kind of system, the application needs
-to know what the display gamma is.  Ideally, the user will know this, and
-the application will allow them to set it.  One method of allowing the user
-to set the display gamma separately for each system is to check for a
-SCREEN_GAMMA or DISPLAY_GAMMA environment variable, which will hopefully be
-correctly set.
-
-Note that display_gamma is the overall gamma correction required to produce
-pleasing results, which depends on the lighting conditions in the surrounding
-environment.  In a dim or brightly lit room, no compensation other than
-the physical gamma exponent of the monitor is needed, while in a dark room
-a slightly smaller exponent is better.
-
-   double gamma, screen_gamma;
-
-   if (/* We have a user-defined screen
-       gamma value */)
-   {
-      screen_gamma = user_defined_screen_gamma;
-   }
-   /* One way that applications can share the same
-      screen gamma value */
-   else if ((gamma_str = getenv("SCREEN_GAMMA"))
-      != NULL)
-   {
-      screen_gamma = (double)atof(gamma_str);
-   }
-   /* If we don't have another value */
-   else
-   {
-      screen_gamma = 2.2; /* A good guess for a
-           PC monitor in a bright office or a dim room */
-      screen_gamma = 2.0; /* A good guess for a
-           PC monitor in a dark room */
-      screen_gamma = 1.7 or 1.0;  /* A good
-           guess for Mac systems */
-   }
-
-The png_set_gamma() function handles gamma transformations of the data.
-Pass both the file gamma and the current screen_gamma.  If the file does
-not have a gamma value, you can pass one anyway if you have an idea what
-it is (usually 0.45455 is a good guess for GIF images on PCs).  Note
-that file gammas are inverted from screen gammas.  See the discussions
-on gamma in the PNG specification for an excellent description of what
-gamma is, and why all applications should support it.  It is strongly
-recommended that PNG viewers support gamma correction.
-
-   if (png_get_gAMA(png_ptr, info_ptr, &gamma))
-      png_set_gamma(png_ptr, screen_gamma, gamma);
-   else
-      png_set_gamma(png_ptr, screen_gamma, 0.45455);
-
-If you need to reduce an RGB file to a paletted file, or if a paletted
-file has more entries then will fit on your screen, png_set_dither()
-will do that.  Note that this is a simple match dither that merely
-finds the closest color available.  This should work fairly well with
-optimized palettes, and fairly badly with linear color cubes.  If you
-pass a palette that is larger then maximum_colors, the file will
-reduce the number of colors in the palette so it will fit into
-maximum_colors.  If there is a histogram, it will use it to make
-more intelligent choices when reducing the palette.  If there is no
-histogram, it may not do as good a job.
-
-   if (color_type & PNG_COLOR_MASK_COLOR)
-   {
-      if (png_get_valid(png_ptr, info_ptr,
-         PNG_INFO_PLTE))
-      {
-         png_uint_16p histogram = NULL;
-
-         png_get_hIST(png_ptr, info_ptr,
-            &histogram);
-         png_set_dither(png_ptr, palette, num_palette,
-            max_screen_colors, histogram, 1);
-      }
-      else
-      {
-         png_color std_color_cube[MAX_SCREEN_COLORS] =
-            { ... colors ... };
-
-         png_set_dither(png_ptr, std_color_cube,
-            MAX_SCREEN_COLORS, MAX_SCREEN_COLORS,
-            NULL,0);
-      }
-   }
-
-PNG files describe monochrome as black being zero and white being one.
-The following code will reverse this (make black be one and white be
-zero):
-
-   if (bit_depth == 1 && color_type == PNG_COLOR_TYPE_GRAY)
-      png_set_invert_mono(png_ptr);
-
-This function can also be used to invert grayscale and gray-alpha images:
-
-   if (color_type == PNG_COLOR_TYPE_GRAY ||
-        color_type == PNG_COLOR_TYPE_GRAY_ALPHA)
-      png_set_invert_mono(png_ptr);
-
-PNG files store 16 bit pixels in network byte order (big-endian,
-ie. most significant bits first).  This code changes the storage to the
-other way (little-endian, i.e. least significant bits first, the
-way PCs store them):
-
-    if (bit_depth == 16)
-        png_set_swap(png_ptr);
-
-If you are using packed-pixel images (1, 2, or 4 bits/pixel), and you
-need to change the order the pixels are packed into bytes, you can use:
-
-    if (bit_depth < 8)
-       png_set_packswap(png_ptr);
-
-Finally, you can write your own transformation function if none of
-the existing ones meets your needs.  This is done by setting a callback
-with
-
-    png_set_read_user_transform_fn(png_ptr,
-       read_transform_fn);
-
-You must supply the function
-
-    void read_transform_fn(png_ptr ptr, row_info_ptr
-       row_info, png_bytep data)
-
-See pngtest.c for a working example.  Your function will be called
-after all of the other transformations have been processed.
-
-You can also set up a pointer to a user structure for use by your
-callback function, and you can inform libpng that your transform
-function will change the number of channels or bit depth with the
-function
-
-    png_set_user_transform_info(png_ptr, user_ptr,
-       user_depth, user_channels);
-
-The user's application, not libpng, is responsible for allocating and
-freeing any memory required for the user structure.
-
-You can retrieve the pointer via the function
-png_get_user_transform_ptr().  For example:
-
-    voidp read_user_transform_ptr =
-       png_get_user_transform_ptr(png_ptr);
-
-The last thing to handle is interlacing; this is covered in detail below,
-but you must call the function here if you want libpng to handle expansion
-of the interlaced image.
-
-    number_of_passes = png_set_interlace_handling(png_ptr);
-
-After setting the transformations, libpng can update your png_info
-structure to reflect any transformations you've requested with this
-call.  This is most useful to update the info structure's rowbytes
-field so you can use it to allocate your image memory.  This function
-will also update your palette with the correct screen_gamma and
-background if these have been given with the calls above.
-
-    png_read_update_info(png_ptr, info_ptr);
-
-After you call png_read_update_info(), you can allocate any
-memory you need to hold the image.  The row data is simply
-raw byte data for all forms of images.  As the actual allocation
-varies among applications, no example will be given.  If you
-are allocating one large chunk, you will need to build an
-array of pointers to each row, as it will be needed for some
-of the functions below.
-
-Reading image data
-
-After you've allocated memory, you can read the image data.
-The simplest way to do this is in one function call.  If you are
-allocating enough memory to hold the whole image, you can just
-call png_read_image() and libpng will read in all the image data
-and put it in the memory area supplied.  You will need to pass in
-an array of pointers to each row.
-
-This function automatically handles interlacing, so you don't need
-to call png_set_interlace_handling() or call this function multiple
-times, or any of that other stuff necessary with png_read_rows().
-
-   png_read_image(png_ptr, row_pointers);
-
-where row_pointers is:
-
-   png_bytep row_pointers[height];
-
-You can point to void or char or whatever you use for pixels.
-
-If you don't want to read in the whole image at once, you can
-use png_read_rows() instead.  If there is no interlacing (check
-interlace_type == PNG_INTERLACE_NONE), this is simple:
-
-    png_read_rows(png_ptr, row_pointers, NULL,
-       number_of_rows);
-
-where row_pointers is the same as in the png_read_image() call.
-
-If you are doing this just one row at a time, you can do this with
-a single row_pointer instead of an array of row_pointers:
-
-    png_bytep row_pointer = row;
-    png_read_row(png_ptr, row_pointer, NULL);
-
-If the file is interlaced (interlace_type != 0 in the IHDR chunk), things
-get somewhat harder.  The only current (PNG Specification version 1.2)
-interlacing type for PNG is (interlace_type == PNG_INTERLACE_ADAM7)
-is a somewhat complicated 2D interlace scheme, known as Adam7, that
-breaks down an image into seven smaller images of varying size, based
-on an 8x8 grid.
-
-libpng can fill out those images or it can give them to you "as is".
-If you want them filled out, there are two ways to do that.  The one
-mentioned in the PNG specification is to expand each pixel to cover
-those pixels that have not been read yet (the "rectangle" method).
-This results in a blocky image for the first pass, which gradually
-smooths out as more pixels are read.  The other method is the "sparkle"
-method, where pixels are drawn only in their final locations, with the
-rest of the image remaining whatever colors they were initialized to
-before the start of the read.  The first method usually looks better,
-but tends to be slower, as there are more pixels to put in the rows.
-
-If you don't want libpng to handle the interlacing details, just call
-png_read_rows() seven times to read in all seven images.  Each of the
-images is a valid image by itself, or they can all be combined on an
-8x8 grid to form a single image (although if you intend to combine them
-you would be far better off using the libpng interlace handling).
-
-The first pass will return an image 1/8 as wide as the entire image
-(every 8th column starting in column 0) and 1/8 as high as the original
-(every 8th row starting in row 0), the second will be 1/8 as wide
-(starting in column 4) and 1/8 as high (also starting in row 0).  The
-third pass will be 1/4 as wide (every 4th pixel starting in column 0) and
-1/8 as high (every 8th row starting in row 4), and the fourth pass will
-be 1/4 as wide and 1/4 as high (every 4th column starting in column 2,
-and every 4th row starting in row 0).  The fifth pass will return an
-image 1/2 as wide, and 1/4 as high (starting at column 0 and row 2),
-while the sixth pass will be 1/2 as wide and 1/2 as high as the original
-(starting in column 1 and row 0).  The seventh and final pass will be as
-wide as the original, and 1/2 as high, containing all of the odd
-numbered scanlines.  Phew!
-
-If you want libpng to expand the images, call this before calling
-png_start_read_image() or png_read_update_info():
-
-    if (interlace_type == PNG_INTERLACE_ADAM7)
-        number_of_passes
-           = png_set_interlace_handling(png_ptr);
-
-This will return the number of passes needed.  Currently, this
-is seven, but may change if another interlace type is added.
-This function can be called even if the file is not interlaced,
-where it will return one pass.
-
-If you are not going to display the image after each pass, but are
-going to wait until the entire image is read in, use the sparkle
-effect.  This effect is faster and the end result of either method
-is exactly the same.  If you are planning on displaying the image
-after each pass, the "rectangle" effect is generally considered the
-better looking one.
-
-If you only want the "sparkle" effect, just call png_read_rows() as
-normal, with the third parameter NULL.  Make sure you make pass over
-the image number_of_passes times, and you don't change the data in the
-rows between calls.  You can change the locations of the data, just
-not the data.  Each pass only writes the pixels appropriate for that
-pass, and assumes the data from previous passes is still valid.
-
-    png_read_rows(png_ptr, row_pointers, NULL,
-       number_of_rows);
-
-If you only want the first effect (the rectangles), do the same as
-before except pass the row buffer in the third parameter, and leave
-the second parameter NULL.
-
-    png_read_rows(png_ptr, NULL, row_pointers,
-       number_of_rows);
-
-Finishing a sequential read
-
-After you are finished reading the image through the
-low-level interface, you can finish reading the file.  If you are
-interested in comments or time, which may be stored either before or
-after the image data, you should pass the separate png_info struct if
-you want to keep the comments from before and after the image
-separate.  If you are not interested, you can pass NULL.
-
-   png_read_end(png_ptr, end_info);
-
-When you are done, you can free all memory allocated by libpng like this:
-
-   png_destroy_read_struct(&png_ptr, &info_ptr,
-       &end_info);
-
-It is also possible to individually free the info_ptr members that
-point to libpng-allocated storage with the following function:
-
-    png_free_data(png_ptr, info_ptr, mask, seq)
-    mask - identifies data to be freed, a mask
-           containing the bitwise OR of one or
-           more of
-             PNG_FREE_PLTE, PNG_FREE_TRNS,
-             PNG_FREE_HIST, PNG_FREE_ICCP,
-             PNG_FREE_PCAL, PNG_FREE_ROWS,
-             PNG_FREE_SCAL, PNG_FREE_SPLT,
-             PNG_FREE_TEXT, PNG_FREE_UNKN,
-           or simply PNG_FREE_ALL
-    seq  - sequence number of item to be freed
-           (-1 for all items)
-
-This function may be safely called when the relevant storage has
-already been freed, or has not yet been allocated, or was allocated
-by the user and not by libpng,  and will in those
-cases do nothing.  The "seq" parameter is ignored if only one item
-of the selected data type, such as PLTE, is allowed.  If "seq" is not
--1, and multiple items are allowed for the data type identified in
-the mask, such as text or sPLT, only the n'th item in the structure
-is freed, where n is "seq".
-
-The default behavior is only to free data that was allocated internally
-by libpng.  This can be changed, so that libpng will not free the data,
-or so that it will free data that was allocated by the user with png_malloc()
-or png_zalloc() and passed in via a png_set_*() function, with
-
-    png_data_freer(png_ptr, info_ptr, freer, mask)
-    mask   - which data elements are affected
-             same choices as in png_free_data()
-    freer  - one of
-               PNG_DESTROY_WILL_FREE_DATA
-               PNG_SET_WILL_FREE_DATA
-               PNG_USER_WILL_FREE_DATA
-
-This function only affects data that has already been allocated.
-You can call this function after reading the PNG data but before calling
-any png_set_*() functions, to control whether the user or the png_set_*()
-function is responsible for freeing any existing data that might be present,
-and again after the png_set_*() functions to control whether the user
-or png_destroy_*() is supposed to free the data.  When the user assumes
-responsibility for libpng-allocated data, the application must use
-png_free() to free it, and when the user transfers responsibility to libpng
-for data that the user has allocated, the user must have used png_malloc()
-or png_zalloc() to allocate it.
-
-If you allocated your row_pointers in a single block, as suggested above in
-the description of the high level read interface, you must not transfer
-responsibility for freeing it to the png_set_rows or png_read_destroy function,
-because they would also try to free the individual row_pointers[i].
-
-If you allocated text_ptr.text, text_ptr.lang, and text_ptr.translated_keyword
-separately, do not transfer responsibility for freeing text_ptr to libpng,
-because when libpng fills a png_text structure it combines these members with
-the key member, and png_free_data() will free only text_ptr.key.  Similarly,
-if you transfer responsibility for free'ing text_ptr from libpng to your
-application, your application must not separately free those members.
-
-The png_free_data() function will turn off the "valid" flag for anything
-it frees.  If you need to turn the flag off for a chunk that was freed by your
-application instead of by libpng, you can use
-
-    png_set_invalid(png_ptr, info_ptr, mask);
-    mask - identifies the chunks to be made invalid,
-           containing the bitwise OR of one or
-           more of
-             PNG_INFO_gAMA, PNG_INFO_sBIT,
-             PNG_INFO_cHRM, PNG_INFO_PLTE,
-             PNG_INFO_tRNS, PNG_INFO_bKGD,
-             PNG_INFO_hIST, PNG_INFO_pHYs,
-             PNG_INFO_oFFs, PNG_INFO_tIME,
-             PNG_INFO_pCAL, PNG_INFO_sRGB,
-             PNG_INFO_iCCP, PNG_INFO_sPLT,
-             PNG_INFO_sCAL, PNG_INFO_IDAT
-
-For a more compact example of reading a PNG image, see the file example.c.
-
-Reading PNG files progressively
-
-The progressive reader is slightly different then the non-progressive
-reader.  Instead of calling png_read_info(), png_read_rows(), and
-png_read_end(), you make one call to png_process_data(), which calls
-callbacks when it has the info, a row, or the end of the image.  You
-set up these callbacks with png_set_progressive_read_fn().  You don't
-have to worry about the input/output functions of libpng, as you are
-giving the library the data directly in png_process_data().  I will
-assume that you have read the section on reading PNG files above,
-so I will only highlight the differences (although I will show
-all of the code).
-
-png_structp png_ptr;
-png_infop info_ptr;
-
- /*  An example code fragment of how you would
-     initialize the progressive reader in your
-     application. */
- int
- initialize_png_reader()
- {
-    png_ptr = png_create_read_struct
-        (PNG_LIBPNG_VER_STRING, (png_voidp)user_error_ptr,
-         user_error_fn, user_warning_fn);
-    if (!png_ptr)
-        return (ERROR);
-    info_ptr = png_create_info_struct(png_ptr);
-    if (!info_ptr)
-    {
-        png_destroy_read_struct(&png_ptr, (png_infopp)NULL,
-           (png_infopp)NULL);
-        return (ERROR);
-    }
-
-    if (setjmp(png_jmpbuf(png_ptr)))
-    {
-        png_destroy_read_struct(&png_ptr, &info_ptr,
-           (png_infopp)NULL);
-        return (ERROR);
-    }
-
-    /* This one's new.  You can provide functions
-       to be called when the header info is valid,
-       when each row is completed, and when the image
-       is finished.  If you aren't using all functions,
-       you can specify NULL parameters.  Even when all
-       three functions are NULL, you need to call
-       png_set_progressive_read_fn().  You can use
-       any struct as the user_ptr (cast to a void pointer
-       for the function call), and retrieve the pointer
-       from inside the callbacks using the function
-
-          png_get_progressive_ptr(png_ptr);
-
-       which will return a void pointer, which you have
-       to cast appropriately.
-     */
-    png_set_progressive_read_fn(png_ptr, (void *)user_ptr,
-        info_callback, row_callback, end_callback);
-
-    return 0;
- }
-
- /* A code fragment that you call as you receive blocks
-   of data */
- int
- process_data(png_bytep buffer, png_uint_32 length)
- {
-    if (setjmp(png_jmpbuf(png_ptr)))
-    {
-        png_destroy_read_struct(&png_ptr, &info_ptr,
-           (png_infopp)NULL);
-        return (ERROR);
-    }
-
-    /* This one's new also.  Simply give it a chunk
-       of data from the file stream (in order, of
-       course).  On machines with segmented memory
-       models machines, don't give it any more than
-       64K.  The library seems to run fine with sizes
-       of 4K. Although you can give it much less if
-       necessary (I assume you can give it chunks of
-       1 byte, I haven't tried less then 256 bytes
-       yet).  When this function returns, you may
-       want to display any rows that were generated
-       in the row callback if you don't already do
-       so there.
-     */
-    png_process_data(png_ptr, info_ptr, buffer, length);
-    return 0;
- }
-
- /* This function is called (as set by
-    png_set_progressive_read_fn() above) when enough data
-    has been supplied so all of the header has been
-    read.
- */
- void
- info_callback(png_structp png_ptr, png_infop info)
- {
-    /* Do any setup here, including setting any of
-       the transformations mentioned in the Reading
-       PNG files section.  For now, you _must_ call
-       either png_start_read_image() or
-       png_read_update_info() after all the
-       transformations are set (even if you don't set
-       any).  You may start getting rows before
-       png_process_data() returns, so this is your
-       last chance to prepare for that.
-     */
- }
-
- /* This function is called when each row of image
-    data is complete */
- void
- row_callback(png_structp png_ptr, png_bytep new_row,
-    png_uint_32 row_num, int pass)
- {
-    /* If the image is interlaced, and you turned
-       on the interlace handler, this function will
-       be called for every row in every pass.  Some
-       of these rows will not be changed from the
-       previous pass.  When the row is not changed,
-       the new_row variable will be NULL.  The rows
-       and passes are called in order, so you don't
-       really need the row_num and pass, but I'm
-       supplying them because it may make your life
-       easier.
-
-       For the non-NULL rows of interlaced images,
-       you must call png_progressive_combine_row()
-       passing in the row and the old row.  You can
-       call this function for NULL rows (it will just
-       return) and for non-interlaced images (it just
-       does the memcpy for you) if it will make the
-       code easier.  Thus, you can just do this for
-       all cases:
-     */
-
-        png_progressive_combine_row(png_ptr, old_row,
-          new_row);
-
-    /* where old_row is what was displayed for
-       previously for the row.  Note that the first
-       pass (pass == 0, really) will completely cover
-       the old row, so the rows do not have to be
-       initialized.  After the first pass (and only
-       for interlaced images), you will have to pass
-       the current row, and the function will combine
-       the old row and the new row.
-    */
- }
-
- void
- end_callback(png_structp png_ptr, png_infop info)
- {
-    /* This function is called after the whole image
-       has been read, including any chunks after the
-       image (up to and including the IEND).  You
-       will usually have the same info chunk as you
-       had in the header, although some data may have
-       been added to the comments and time fields.
-
-       Most people won't do much here, perhaps setting
-       a flag that marks the image as finished.
-     */
- }
-
-
-
-IV. Writing
-
-Much of this is very similar to reading.  However, everything of
-importance is repeated here, so you won't have to constantly look
-back up in the reading section to understand writing.
-
-Setup
-
-You will want to do the I/O initialization before you get into libpng,
-so if it doesn't work, you don't have anything to undo. If you are not
-using the standard I/O functions, you will need to replace them with
-custom writing functions.  See the discussion under Customizing libpng.
-
-    FILE *fp = fopen(file_name, "wb");
-    if (!fp)
-    {
-       return (ERROR);
-    }
-
-Next, png_struct and png_info need to be allocated and initialized.
-As these can be both relatively large, you may not want to store these
-on the stack, unless you have stack space to spare.  Of course, you
-will want to check if they return NULL.  If you are also reading,
-you won't want to name your read structure and your write structure
-both "png_ptr"; you can call them anything you like, such as
-"read_ptr" and "write_ptr".  Look at pngtest.c, for example.
-
-    png_structp png_ptr = png_create_write_struct
-       (PNG_LIBPNG_VER_STRING, (png_voidp)user_error_ptr,
-        user_error_fn, user_warning_fn);
-    if (!png_ptr)
-       return (ERROR);
-
-    png_infop info_ptr = png_create_info_struct(png_ptr);
-    if (!info_ptr)
-    {
-       png_destroy_write_struct(&png_ptr,
-         (png_infopp)NULL);
-       return (ERROR);
-    }
-
-If you want to use your own memory allocation routines,
-define PNG_USER_MEM_SUPPORTED and use
-png_create_write_struct_2() instead of png_create_write_struct():
-
-    png_structp png_ptr = png_create_write_struct_2
-       (PNG_LIBPNG_VER_STRING, (png_voidp)user_error_ptr,
-        user_error_fn, user_warning_fn, (png_voidp)
-        user_mem_ptr, user_malloc_fn, user_free_fn);
-
-After you have these structures, you will need to set up the
-error handling.  When libpng encounters an error, it expects to
-longjmp() back to your routine.  Therefore, you will need to call
-setjmp() and pass the png_jmpbuf(png_ptr).  If you
-write the file from different routines, you will need to update
-the png_jmpbuf(png_ptr) every time you enter a new routine that will
-call a png_*() function.  See your documentation of setjmp/longjmp
-for your compiler for more information on setjmp/longjmp.  See
-the discussion on libpng error handling in the Customizing Libpng
-section below for more information on the libpng error handling.
-
-    if (setjmp(png_jmpbuf(png_ptr)))
-    {
-       png_destroy_write_struct(&png_ptr, &info_ptr);
-       fclose(fp);
-       return (ERROR);
-    }
-    ...
-    return;
-
-If you would rather avoid the complexity of setjmp/longjmp issues,
-you can compile libpng with PNG_SETJMP_NOT_SUPPORTED, in which case
-errors will result in a call to PNG_ABORT() which defaults to abort().
-
-Now you need to set up the output code.  The default for libpng is to
-use the C function fwrite().  If you use this, you will need to pass a
-valid FILE * in the function png_init_io().  Be sure that the file is
-opened in binary mode.  Again, if you wish to handle writing data in
-another way, see the discussion on libpng I/O handling in the Customizing
-Libpng section below.
-
-    png_init_io(png_ptr, fp);
-
-If you are embedding your PNG into a datastream such as MNG, and don't
-want libpng to write the 8-byte signature, or if you have already
-written the signature in your application, use
-
-    png_set_sig_bytes(png_ptr, 8);
-
-to inform libpng that it should not write a signature.
-
-Write callbacks
-
-At this point, you can set up a callback function that will be
-called after each row has been written, which you can use to control
-a progress meter or the like.  It's demonstrated in pngtest.c.
-You must supply a function
-
-    void write_row_callback(png_ptr, png_uint_32 row,
-       int pass);
-    {
-      /* put your code here */
-    }
-
-(You can give it another name that you like instead of "write_row_callback")
-
-To inform libpng about your function, use
-
-    png_set_write_status_fn(png_ptr, write_row_callback);
-
-You now have the option of modifying how the compression library will
-run.  The following functions are mainly for testing, but may be useful
-in some cases, like if you need to write PNG files extremely fast and
-are willing to give up some compression, or if you want to get the
-maximum possible compression at the expense of slower writing.  If you
-have no special needs in this area, let the library do what it wants by
-not calling this function at all, as it has been tuned to deliver a good
-speed/compression ratio. The second parameter to png_set_filter() is
-the filter method, for which the only valid values are 0 (as of the
-July 1999 PNG specification, version 1.2) or 64 (if you are writing
-a PNG datastream that is to be embedded in a MNG datastream).  The third
-parameter is a flag that indicates which filter type(s) are to be tested
-for each scanline.  See the PNG specification for details on the specific filter
-types.
-
-
-    /* turn on or off filtering, and/or choose
-       specific filters.  You can use either a single
-       PNG_FILTER_VALUE_NAME or the bitwise OR of one
-       or more PNG_FILTER_NAME masks. */
-    png_set_filter(png_ptr, 0,
-       PNG_FILTER_NONE  | PNG_FILTER_VALUE_NONE |
-       PNG_FILTER_SUB   | PNG_FILTER_VALUE_SUB  |
-       PNG_FILTER_UP    | PNG_FILTER_VALUE_UP   |
-       PNG_FILTER_AVE   | PNG_FILTER_VALUE_AVE  |
-       PNG_FILTER_PAETH | PNG_FILTER_VALUE_PAETH|
-       PNG_ALL_FILTERS);
-
-If an application
-wants to start and stop using particular filters during compression,
-it should start out with all of the filters (to ensure that the previous
-row of pixels will be stored in case it's needed later), and then add
-and remove them after the start of compression.
-
-If you are writing a PNG datastream that is to be embedded in a MNG
-datastream, the second parameter can be either 0 or 64.
-
-The png_set_compression_*() functions interface to the zlib compression
-library, and should mostly be ignored unless you really know what you are
-doing.  The only generally useful call is png_set_compression_level()
-which changes how much time zlib spends on trying to compress the image
-data.  See the Compression Library (zlib.h and algorithm.txt, distributed
-with zlib) for details on the compression levels.
-
-    /* set the zlib compression level */
-    png_set_compression_level(png_ptr,
-        Z_BEST_COMPRESSION);
-
-    /* set other zlib parameters */
-    png_set_compression_mem_level(png_ptr, 8);
-    png_set_compression_strategy(png_ptr,
-        Z_DEFAULT_STRATEGY);
-    png_set_compression_window_bits(png_ptr, 15);
-    png_set_compression_method(png_ptr, 8);
-    png_set_compression_buffer_size(png_ptr, 8192)
-
-extern PNG_EXPORT(void,png_set_zbuf_size)
-
-Setting the contents of info for output
-
-You now need to fill in the png_info structure with all the data you
-wish to write before the actual image.  Note that the only thing you
-are allowed to write after the image is the text chunks and the time
-chunk (as of PNG Specification 1.2, anyway).  See png_write_end() and
-the latest PNG specification for more information on that.  If you
-wish to write them before the image, fill them in now, and flag that
-data as being valid.  If you want to wait until after the data, don't
-fill them until png_write_end().  For all the fields in png_info and
-their data types, see png.h.  For explanations of what the fields
-contain, see the PNG specification.
-
-Some of the more important parts of the png_info are:
-
-    png_set_IHDR(png_ptr, info_ptr, width, height,
-       bit_depth, color_type, interlace_type,
-       compression_type, filter_method)
-    width          - holds the width of the image
-                     in pixels (up to 2^31).
-    height         - holds the height of the image
-                     in pixels (up to 2^31).
-    bit_depth      - holds the bit depth of one of the
-                     image channels.
-                     (valid values are 1, 2, 4, 8, 16
-                     and depend also on the
-                     color_type.  See also significant
-                     bits (sBIT) below).
-    color_type     - describes which color/alpha
-                     channels are present.
-                     PNG_COLOR_TYPE_GRAY
-                        (bit depths 1, 2, 4, 8, 16)
-                     PNG_COLOR_TYPE_GRAY_ALPHA
-                        (bit depths 8, 16)
-                     PNG_COLOR_TYPE_PALETTE
-                        (bit depths 1, 2, 4, 8)
-                     PNG_COLOR_TYPE_RGB
-                        (bit_depths 8, 16)
-                     PNG_COLOR_TYPE_RGB_ALPHA
-                        (bit_depths 8, 16)
-
-                     PNG_COLOR_MASK_PALETTE
-                     PNG_COLOR_MASK_COLOR
-                     PNG_COLOR_MASK_ALPHA
-
-    interlace_type - PNG_INTERLACE_NONE or
-                     PNG_INTERLACE_ADAM7
-    compression_type - (must be
-                     PNG_COMPRESSION_TYPE_DEFAULT)
-    filter_method  - (must be PNG_FILTER_TYPE_DEFAULT
-                     or, if you are writing a PNG to
-                     be embedded in a MNG datastream,
-                     can also be
-                     PNG_INTRAPIXEL_DIFFERENCING)
-
-    png_set_PLTE(png_ptr, info_ptr, palette,
-       num_palette);
-    palette        - the palette for the file
-                     (array of png_color)
-    num_palette    - number of entries in the palette
-
-    png_set_gAMA(png_ptr, info_ptr, gamma);
-    gamma          - the gamma the image was created
-                     at (PNG_INFO_gAMA)
-
-    png_set_sRGB(png_ptr, info_ptr, srgb_intent);
-    srgb_intent    - the rendering intent
-                     (PNG_INFO_sRGB) The presence of
-                     the sRGB chunk means that the pixel
-                     data is in the sRGB color space.
-                     This chunk also implies specific
-                     values of gAMA and cHRM.  Rendering
-                     intent is the CSS-1 property that
-                     has been defined by the International
-                     Color Consortium
-                     (http://www.color.org).
-                     It can be one of
-                     PNG_sRGB_INTENT_SATURATION,
-                     PNG_sRGB_INTENT_PERCEPTUAL,
-                     PNG_sRGB_INTENT_ABSOLUTE, or
-                     PNG_sRGB_INTENT_RELATIVE.
-
-
-    png_set_sRGB_gAMA_and_cHRM(png_ptr, info_ptr,
-       srgb_intent);
-    srgb_intent    - the rendering intent
-                     (PNG_INFO_sRGB) The presence of the
-                     sRGB chunk means that the pixel
-                     data is in the sRGB color space.
-                     This function also causes gAMA and
-                     cHRM chunks with the specific values
-                     that are consistent with sRGB to be
-                     written.
-
-    png_set_iCCP(png_ptr, info_ptr, name, compression_type,
-                      profile, proflen);
-    name            - The profile name.
-    compression     - The compression type; always
-                      PNG_COMPRESSION_TYPE_BASE for PNG 1.0.
-                      You may give NULL to this argument to
-                      ignore it.
-    profile         - International Color Consortium color
-                      profile data. May contain NULs.
-    proflen         - length of profile data in bytes.
-
-    png_set_sBIT(png_ptr, info_ptr, sig_bit);
-    sig_bit        - the number of significant bits for
-                     (PNG_INFO_sBIT) each of the gray, red,
-                     green, and blue channels, whichever are
-                     appropriate for the given color type
-                     (png_color_16)
-
-    png_set_tRNS(png_ptr, info_ptr, trans, num_trans,
-       trans_values);
-    trans          - array of transparent entries for
-                     palette (PNG_INFO_tRNS)
-    trans_values   - graylevel or color sample values of
-                     the single transparent color for
-                     non-paletted images (PNG_INFO_tRNS)
-    num_trans      - number of transparent entries
-                     (PNG_INFO_tRNS)
-
-    png_set_hIST(png_ptr, info_ptr, hist);
-                    (PNG_INFO_hIST)
-    hist           - histogram of palette (array of
-                     png_uint_16)
-
-    png_set_tIME(png_ptr, info_ptr, mod_time);
-    mod_time       - time image was last modified
-                     (PNG_VALID_tIME)
-
-    png_set_bKGD(png_ptr, info_ptr, background);
-    background     - background color (PNG_VALID_bKGD)
-
-    png_set_text(png_ptr, info_ptr, text_ptr, num_text);
-    text_ptr       - array of png_text holding image
-                     comments
-    text_ptr[i].compression - type of compression used
-                 on "text" PNG_TEXT_COMPRESSION_NONE
-                           PNG_TEXT_COMPRESSION_zTXt
-                           PNG_ITXT_COMPRESSION_NONE
-                           PNG_ITXT_COMPRESSION_zTXt
-    text_ptr[i].key   - keyword for comment.  Must contain
-                 1-79 characters.
-    text_ptr[i].text  - text comments for current
-                         keyword.  Can be NULL or empty.
-    text_ptr[i].text_length - length of text string,
-                 after decompression, 0 for iTXt
-    text_ptr[i].itxt_length - length of itxt string,
-                 after decompression, 0 for tEXt/zTXt
-    text_ptr[i].lang  - language of comment (NULL or
-                         empty for unknown).
-    text_ptr[i].translated_keyword  - keyword in UTF-8 (NULL
-                         or empty for unknown).
-    num_text       - number of comments
-
-    png_set_sPLT(png_ptr, info_ptr, &palette_ptr,
-       num_spalettes);
-    palette_ptr    - array of png_sPLT_struct structures
-                     to be added to the list of palettes
-                     in the info structure.
-    num_spalettes  - number of palette structures to be
-                     added.
-
-    png_set_oFFs(png_ptr, info_ptr, offset_x, offset_y,
-        unit_type);
-    offset_x  - positive offset from the left
-                     edge of the screen
-    offset_y  - positive offset from the top
-                     edge of the screen
-    unit_type - PNG_OFFSET_PIXEL, PNG_OFFSET_MICROMETER
-
-    png_set_pHYs(png_ptr, info_ptr, res_x, res_y,
-        unit_type);
-    res_x       - pixels/unit physical resolution
-                  in x direction
-    res_y       - pixels/unit physical resolution
-                  in y direction
-    unit_type   - PNG_RESOLUTION_UNKNOWN,
-                  PNG_RESOLUTION_METER
-
-    png_set_sCAL(png_ptr, info_ptr, unit, width, height)
-    unit        - physical scale units (an integer)
-    width       - width of a pixel in physical scale units
-    height      - height of a pixel in physical scale units
-                  (width and height are doubles)
-
-    png_set_sCAL_s(png_ptr, info_ptr, unit, width, height)
-    unit        - physical scale units (an integer)
-    width       - width of a pixel in physical scale units
-    height      - height of a pixel in physical scale units
-                 (width and height are strings like "2.54")
-
-    png_set_unknown_chunks(png_ptr, info_ptr, &unknowns,
-       num_unknowns)
-    unknowns          - array of png_unknown_chunk
-                        structures holding unknown chunks
-    unknowns[i].name  - name of unknown chunk
-    unknowns[i].data  - data of unknown chunk
-    unknowns[i].size  - size of unknown chunk's data
-    unknowns[i].location - position to write chunk in file
-                           0: do not write chunk
-                           PNG_HAVE_IHDR: before PLTE
-                           PNG_HAVE_PLTE: before IDAT
-                           PNG_AFTER_IDAT: after IDAT
-
-The "location" member is set automatically according to
-what part of the output file has already been written.
-You can change its value after calling png_set_unknown_chunks()
-as demonstrated in pngtest.c.  Within each of the "locations",
-the chunks are sequenced according to their position in the
-structure (that is, the value of "i", which is the order in which
-the chunk was either read from the input file or defined with
-png_set_unknown_chunks).
-
-A quick word about text and num_text.  text is an array of png_text
-structures.  num_text is the number of valid structures in the array.
-Each png_text structure holds a language code, a keyword, a text value,
-and a compression type.
-
-The compression types have the same valid numbers as the compression
-types of the image data.  Currently, the only valid number is zero.
-However, you can store text either compressed or uncompressed, unlike
-images, which always have to be compressed.  So if you don't want the
-text compressed, set the compression type to PNG_TEXT_COMPRESSION_NONE.
-Because tEXt and zTXt chunks don't have a language field, if you
-specify PNG_TEXT_COMPRESSION_NONE or PNG_TEXT_COMPRESSION_zTXt
-any language code or translated keyword will not be written out.
-
-Until text gets around 1000 bytes, it is not worth compressing it.
-After the text has been written out to the file, the compression type
-is set to PNG_TEXT_COMPRESSION_NONE_WR or PNG_TEXT_COMPRESSION_zTXt_WR,
-so that it isn't written out again at the end (in case you are calling
-png_write_end() with the same struct.
-
-The keywords that are given in the PNG Specification are:
-
-    Title            Short (one line) title or
-                     caption for image
-    Author           Name of image's creator
-    Description      Description of image (possibly long)
-    Copyright        Copyright notice
-    Creation Time    Time of original image creation
-                     (usually RFC 1123 format, see below)
-    Software         Software used to create the image
-    Disclaimer       Legal disclaimer
-    Warning          Warning of nature of content
-    Source           Device used to create the image
-    Comment          Miscellaneous comment; conversion
-                     from other image format
-
-The keyword-text pairs work like this.  Keywords should be short
-simple descriptions of what the comment is about.  Some typical
-keywords are found in the PNG specification, as is some recommendations
-on keywords.  You can repeat keywords in a file.  You can even write
-some text before the image and some after.  For example, you may want
-to put a description of the image before the image, but leave the
-disclaimer until after, so viewers working over modem connections
-don't have to wait for the disclaimer to go over the modem before
-they start seeing the image.  Finally, keywords should be full
-words, not abbreviations.  Keywords and text are in the ISO 8859-1
-(Latin-1) character set (a superset of regular ASCII) and can not
-contain NUL characters, and should not contain control or other
-unprintable characters.  To make the comments widely readable, stick
-with basic ASCII, and avoid machine specific character set extensions
-like the IBM-PC character set.  The keyword must be present, but
-you can leave off the text string on non-compressed pairs.
-Compressed pairs must have a text string, as only the text string
-is compressed anyway, so the compression would be meaningless.
-
-PNG supports modification time via the png_time structure.  Two
-conversion routines are provided, png_convert_from_time_t() for
-time_t and png_convert_from_struct_tm() for struct tm.  The
-time_t routine uses gmtime().  You don't have to use either of
-these, but if you wish to fill in the png_time structure directly,
-you should provide the time in universal time (GMT) if possible
-instead of your local time.  Note that the year number is the full
-year (e.g. 1998, rather than 98 - PNG is year 2000 compliant!), and
-that months start with 1.
-
-If you want to store the time of the original image creation, you should
-use a plain tEXt chunk with the "Creation Time" keyword.  This is
-necessary because the "creation time" of a PNG image is somewhat vague,
-depending on whether you mean the PNG file, the time the image was
-created in a non-PNG format, a still photo from which the image was
-scanned, or possibly the subject matter itself.  In order to facilitate
-machine-readable dates, it is recommended that the "Creation Time"
-tEXt chunk use RFC 1123 format dates (e.g. "22 May 1997 18:07:10 GMT"),
-although this isn't a requirement.  Unlike the tIME chunk, the
-"Creation Time" tEXt chunk is not expected to be automatically changed
-by the software.  To facilitate the use of RFC 1123 dates, a function
-png_convert_to_rfc1123(png_timep) is provided to convert from PNG
-time to an RFC 1123 format string.
-
-Writing unknown chunks
-
-You can use the png_set_unknown_chunks function to queue up chunks
-for writing.  You give it a chunk name, raw data, and a size; that's
-all there is to it.  The chunks will be written by the next following
-png_write_info_before_PLTE, png_write_info, or png_write_end function.
-Any chunks previously read into the info structure's unknown-chunk
-list will also be written out in a sequence that satisfies the PNG
-specification's ordering rules.
-
-The high-level write interface
-
-At this point there are two ways to proceed; through the high-level
-write interface, or through a sequence of low-level write operations.
-You can use the high-level interface if your image data is present
-in the info structure.  All defined output
-transformations are permitted, enabled by the following masks.
-
-    PNG_TRANSFORM_IDENTITY      No transformation
-    PNG_TRANSFORM_PACKING       Pack 1, 2 and 4-bit samples
-    PNG_TRANSFORM_PACKSWAP      Change order of packed
-                                pixels to LSB first
-    PNG_TRANSFORM_INVERT_MONO   Invert monochrome images
-    PNG_TRANSFORM_SHIFT         Normalize pixels to the
-                                sBIT depth
-    PNG_TRANSFORM_BGR           Flip RGB to BGR, RGBA
-                                to BGRA
-    PNG_TRANSFORM_SWAP_ALPHA    Flip RGBA to ARGB or GA
-                                to AG
-    PNG_TRANSFORM_INVERT_ALPHA  Change alpha from opacity
-                                to transparency
-    PNG_TRANSFORM_SWAP_ENDIAN   Byte-swap 16-bit samples
-    PNG_TRANSFORM_STRIP_FILLER  Strip out filler bytes.
-
-If you have valid image data in the info structure (you can use
-png_set_rows() to put image data in the info structure), simply do this:
-
-    png_write_png(png_ptr, info_ptr, png_transforms, NULL)
-
-where png_transforms is an integer containing the bitwise OR of some set of
-transformation flags.  This call is equivalent to png_write_info(),
-followed the set of transformations indicated by the transform mask,
-then png_write_image(), and finally png_write_end().
-
-(The final parameter of this call is not yet used.  Someday it might point
-to transformation parameters required by some future output transform.)
-
-You must use png_transforms and not call any png_set_transform() functions
-when you use png_write_png().
-
-The low-level write interface
-
-If you are going the low-level route instead, you are now ready to
-write all the file information up to the actual image data.  You do
-this with a call to png_write_info().
-
-    png_write_info(png_ptr, info_ptr);
-
-Note that there is one transformation you may need to do before
-png_write_info().  In PNG files, the alpha channel in an image is the
-level of opacity.  If your data is supplied as a level of
-transparency, you can invert the alpha channel before you write it, so
-that 0 is fully transparent and 255 (in 8-bit or paletted images) or
-65535 (in 16-bit images) is fully opaque, with
-
-    png_set_invert_alpha(png_ptr);
-
-This must appear before png_write_info() instead of later with the
-other transformations because in the case of paletted images the tRNS
-chunk data has to be inverted before the tRNS chunk is written.  If
-your image is not a paletted image, the tRNS data (which in such cases
-represents a single color to be rendered as transparent) won't need to
-be changed, and you can safely do this transformation after your
-png_write_info() call.
-
-If you need to write a private chunk that you want to appear before
-the PLTE chunk when PLTE is present, you can write the PNG info in
-two steps, and insert code to write your own chunk between them:
-
-    png_write_info_before_PLTE(png_ptr, info_ptr);
-    png_set_unknown_chunks(png_ptr, info_ptr, ...);
-    png_write_info(png_ptr, info_ptr);
-
-After you've written the file information, you can set up the library
-to handle any special transformations of the image data.  The various
-ways to transform the data will be described in the order that they
-should occur.  This is important, as some of these change the color
-type and/or bit depth of the data, and some others only work on
-certain color types and bit depths.  Even though each transformation
-checks to see if it has data that it can do something with, you should
-make sure to only enable a transformation if it will be valid for the
-data.  For example, don't swap red and blue on grayscale data.
-
-PNG files store RGB pixels packed into 3 or 6 bytes.  This code tells
-the library to strip input data that has 4 or 8 bytes per pixel down
-to 3 or 6 bytes (or strip 2 or 4-byte grayscale+filler data to 1 or 2
-bytes per pixel).
-
-    png_set_filler(png_ptr, 0, PNG_FILLER_BEFORE);
-
-where the 0 is unused, and the location is either PNG_FILLER_BEFORE or
-PNG_FILLER_AFTER, depending upon whether the filler byte in the pixel
-is stored XRGB or RGBX.
-
-PNG files pack pixels of bit depths 1, 2, and 4 into bytes as small as
-they can, resulting in, for example, 8 pixels per byte for 1 bit files.
-If the data is supplied at 1 pixel per byte, use this code, which will
-correctly pack the pixels into a single byte:
-
-    png_set_packing(png_ptr);
-
-PNG files reduce possible bit depths to 1, 2, 4, 8, and 16.  If your
-data is of another bit depth, you can write an sBIT chunk into the
-file so that decoders can recover the original data if desired.
-
-    /* Set the true bit depth of the image data */
-    if (color_type & PNG_COLOR_MASK_COLOR)
-    {
-        sig_bit.red = true_bit_depth;
-        sig_bit.green = true_bit_depth;
-        sig_bit.blue = true_bit_depth;
-    }
-    else
-    {
-        sig_bit.gray = true_bit_depth;
-    }
-    if (color_type & PNG_COLOR_MASK_ALPHA)
-    {
-        sig_bit.alpha = true_bit_depth;
-    }
-
-    png_set_sBIT(png_ptr, info_ptr, &sig_bit);
-
-If the data is stored in the row buffer in a bit depth other than
-one supported by PNG (e.g. 3 bit data in the range 0-7 for a 4-bit PNG),
-this will scale the values to appear to be the correct bit depth as
-is required by PNG.
-
-    png_set_shift(png_ptr, &sig_bit);
-
-PNG files store 16 bit pixels in network byte order (big-endian,
-ie. most significant bits first).  This code would be used if they are
-supplied the other way (little-endian, i.e. least significant bits
-first, the way PCs store them):
-
-    if (bit_depth > 8)
-       png_set_swap(png_ptr);
-
-If you are using packed-pixel images (1, 2, or 4 bits/pixel), and you
-need to change the order the pixels are packed into bytes, you can use:
-
-    if (bit_depth < 8)
-       png_set_packswap(png_ptr);
-
-PNG files store 3 color pixels in red, green, blue order.  This code
-would be used if they are supplied as blue, green, red:
-
-    png_set_bgr(png_ptr);
-
-PNG files describe monochrome as black being zero and white being
-one. This code would be used if the pixels are supplied with this reversed
-(black being one and white being zero):
-
-    png_set_invert_mono(png_ptr);
-
-Finally, you can write your own transformation function if none of
-the existing ones meets your needs.  This is done by setting a callback
-with
-
-    png_set_write_user_transform_fn(png_ptr,
-       write_transform_fn);
-
-You must supply the function
-
-    void write_transform_fn(png_ptr ptr, row_info_ptr
-       row_info, png_bytep data)
-
-See pngtest.c for a working example.  Your function will be called
-before any of the other transformations are processed.
-
-You can also set up a pointer to a user structure for use by your
-callback function.
-
-    png_set_user_transform_info(png_ptr, user_ptr, 0, 0);
-
-The user_channels and user_depth parameters of this function are ignored
-when writing; you can set them to zero as shown.
-
-You can retrieve the pointer via the function png_get_user_transform_ptr().
-For example:
-
-    voidp write_user_transform_ptr =
-       png_get_user_transform_ptr(png_ptr);
-
-It is possible to have libpng flush any pending output, either manually,
-or automatically after a certain number of lines have been written.  To
-flush the output stream a single time call:
-
-    png_write_flush(png_ptr);
-
-and to have libpng flush the output stream periodically after a certain
-number of scanlines have been written, call:
-
-    png_set_flush(png_ptr, nrows);
-
-Note that the distance between rows is from the last time png_write_flush()
-was called, or the first row of the image if it has never been called.
-So if you write 50 lines, and then png_set_flush 25, it will flush the
-output on the next scanline, and every 25 lines thereafter, unless
-png_write_flush() is called before 25 more lines have been written.
-If nrows is too small (less than about 10 lines for a 640 pixel wide
-RGB image) the image compression may decrease noticeably (although this
-may be acceptable for real-time applications).  Infrequent flushing will
-only degrade the compression performance by a few percent over images
-that do not use flushing.
-
-Writing the image data
-
-That's it for the transformations.  Now you can write the image data.
-The simplest way to do this is in one function call.  If you have the
-whole image in memory, you can just call png_write_image() and libpng
-will write the image.  You will need to pass in an array of pointers to
-each row.  This function automatically handles interlacing, so you don't
-need to call png_set_interlace_handling() or call this function multiple
-times, or any of that other stuff necessary with png_write_rows().
-
-    png_write_image(png_ptr, row_pointers);
-
-where row_pointers is:
-
-    png_byte *row_pointers[height];
-
-You can point to void or char or whatever you use for pixels.
-
-If you don't want to write the whole image at once, you can
-use png_write_rows() instead.  If the file is not interlaced,
-this is simple:
-
-    png_write_rows(png_ptr, row_pointers,
-       number_of_rows);
-
-row_pointers is the same as in the png_write_image() call.
-
-If you are just writing one row at a time, you can do this with
-a single row_pointer instead of an array of row_pointers:
-
-    png_bytep row_pointer = row;
-
-    png_write_row(png_ptr, row_pointer);
-
-When the file is interlaced, things can get a good deal more
-complicated.  The only currently (as of the PNG Specification
-version 1.2, dated July 1999) defined interlacing scheme for PNG files
-is the "Adam7" interlace scheme, that breaks down an
-image into seven smaller images of varying size.  libpng will build
-these images for you, or you can do them yourself.  If you want to
-build them yourself, see the PNG specification for details of which
-pixels to write when.
-
-If you don't want libpng to handle the interlacing details, just
-use png_set_interlace_handling() and call png_write_rows() the
-correct number of times to write all seven sub-images.
-
-If you want libpng to build the sub-images, call this before you start
-writing any rows:
-
-    number_of_passes =
-       png_set_interlace_handling(png_ptr);
-
-This will return the number of passes needed.  Currently, this
-is seven, but may change if another interlace type is added.
-
-Then write the complete image number_of_passes times.
-
-    png_write_rows(png_ptr, row_pointers,
-       number_of_rows);
-
-As some of these rows are not used, and thus return immediately,
-you may want to read about interlacing in the PNG specification,
-and only update the rows that are actually used.
-
-Finishing a sequential write
-
-After you are finished writing the image, you should finish writing
-the file.  If you are interested in writing comments or time, you should
-pass an appropriately filled png_info pointer.  If you are not interested,
-you can pass NULL.
-
-    png_write_end(png_ptr, info_ptr);
-
-When you are done, you can free all memory used by libpng like this:
-
-    png_destroy_write_struct(&png_ptr, &info_ptr);
-
-It is also possible to individually free the info_ptr members that
-point to libpng-allocated storage with the following function:
-
-    png_free_data(png_ptr, info_ptr, mask, seq)
-    mask  - identifies data to be freed, a mask
-            containing the bitwise OR of one or
-            more of
-              PNG_FREE_PLTE, PNG_FREE_TRNS,
-              PNG_FREE_HIST, PNG_FREE_ICCP,
-              PNG_FREE_PCAL, PNG_FREE_ROWS,
-              PNG_FREE_SCAL, PNG_FREE_SPLT,
-              PNG_FREE_TEXT, PNG_FREE_UNKN,
-            or simply PNG_FREE_ALL
-    seq   - sequence number of item to be freed
-            (-1 for all items)
-
-This function may be safely called when the relevant storage has
-already been freed, or has not yet been allocated, or was allocated
-by the user  and not by libpng,  and will in those
-cases do nothing.  The "seq" parameter is ignored if only one item
-of the selected data type, such as PLTE, is allowed.  If "seq" is not
--1, and multiple items are allowed for the data type identified in
-the mask, such as text or sPLT, only the n'th item in the structure
-is freed, where n is "seq".
-
-If you allocated data such as a palette that you passed
-in to libpng with png_set_*, you must not free it until just before the call to
-png_destroy_write_struct().
-
-The default behavior is only to free data that was allocated internally
-by libpng.  This can be changed, so that libpng will not free the data,
-or so that it will free data that was allocated by the user with png_malloc()
-or png_zalloc() and passed in via a png_set_*() function, with
-
-    png_data_freer(png_ptr, info_ptr, freer, mask)
-    mask   - which data elements are affected
-             same choices as in png_free_data()
-    freer  - one of
-               PNG_DESTROY_WILL_FREE_DATA
-               PNG_SET_WILL_FREE_DATA
-               PNG_USER_WILL_FREE_DATA
-
-For example, to transfer responsibility for some data from a read structure
-to a write structure, you could use
-
-    png_data_freer(read_ptr, read_info_ptr,
-       PNG_USER_WILL_FREE_DATA,
-       PNG_FREE_PLTE|PNG_FREE_tRNS|PNG_FREE_hIST)
-    png_data_freer(write_ptr, write_info_ptr,
-       PNG_DESTROY_WILL_FREE_DATA,
-       PNG_FREE_PLTE|PNG_FREE_tRNS|PNG_FREE_hIST)
-
-thereby briefly reassigning responsibility for freeing to the user but
-immediately afterwards reassigning it once more to the write_destroy
-function.  Having done this, it would then be safe to destroy the read
-structure and continue to use the PLTE, tRNS, and hIST data in the write
-structure.
-
-This function only affects data that has already been allocated.
-You can call this function before calling after the png_set_*() functions
-to control whether the user or png_destroy_*() is supposed to free the data.
-When the user assumes responsibility for libpng-allocated data, the
-application must use
-png_free() to free it, and when the user transfers responsibility to libpng
-for data that the user has allocated, the user must have used png_malloc()
-or png_zalloc() to allocate it.
-
-If you allocated text_ptr.text, text_ptr.lang, and text_ptr.translated_keyword
-separately, do not transfer responsibility for freeing text_ptr to libpng,
-because when libpng fills a png_text structure it combines these members with
-the key member, and png_free_data() will free only text_ptr.key.  Similarly,
-if you transfer responsibility for free'ing text_ptr from libpng to your
-application, your application must not separately free those members.
-For a more compact example of writing a PNG image, see the file example.c.
-
-V. Modifying/Customizing libpng:
-
-There are three issues here.  The first is changing how libpng does
-standard things like memory allocation, input/output, and error handling.
-The second deals with more complicated things like adding new chunks,
-adding new transformations, and generally changing how libpng works.
-Both of those are compile-time issues; that is, they are generally
-determined at the time the code is written, and there is rarely a need
-to provide the user with a means of changing them.  The third is a
-run-time issue:  choosing between and/or tuning one or more alternate
-versions of computationally intensive routines; specifically, optimized
-assembly-language (and therefore compiler- and platform-dependent)
-versions.
-
-Memory allocation, input/output, and error handling
-
-All of the memory allocation, input/output, and error handling in libpng
-goes through callbacks that are user-settable.  The default routines are
-in pngmem.c, pngrio.c, pngwio.c, and pngerror.c, respectively.  To change
-these functions, call the appropriate png_set_*_fn() function.
-
-Memory allocation is done through the functions png_malloc()
-and png_free().  These currently just call the standard C functions.  If
-your pointers can't access more then 64K at a time, you will want to set
-MAXSEG_64K in zlib.h.  Since it is unlikely that the method of handling
-memory allocation on a platform will change between applications, these
-functions must be modified in the library at compile time.  If you prefer
-to use a different method of allocating and freeing data, you can use
-png_create_read_struct_2() or png_create_write_struct_2() to register
-your own functions as described above.
-These functions also provide a void pointer that can be retrieved via
-
-    mem_ptr=png_get_mem_ptr(png_ptr);
-
-Your replacement memory functions must have prototypes as follows:
-
-    png_voidp malloc_fn(png_structp png_ptr,
-       png_size_t size);
-    void free_fn(png_structp png_ptr, png_voidp ptr);
-
-Your malloc_fn() must return NULL in case of failure.  The png_malloc()
-function will normally call png_error() if it receives a NULL from the
-system memory allocator or from your replacement malloc_fn().
-
-Input/Output in libpng is done through png_read() and png_write(),
-which currently just call fread() and fwrite().  The FILE * is stored in
-png_struct and is initialized via png_init_io().  If you wish to change
-the method of I/O, the library supplies callbacks that you can set
-through the function png_set_read_fn() and png_set_write_fn() at run
-time, instead of calling the png_init_io() function.  These functions
-also provide a void pointer that can be retrieved via the function
-png_get_io_ptr().  For example:
-
-    png_set_read_fn(png_structp read_ptr,
-        voidp read_io_ptr, png_rw_ptr read_data_fn)
-
-    png_set_write_fn(png_structp write_ptr,
-        voidp write_io_ptr, png_rw_ptr write_data_fn,
-        png_flush_ptr output_flush_fn);
-
-    voidp read_io_ptr = png_get_io_ptr(read_ptr);
-    voidp write_io_ptr = png_get_io_ptr(write_ptr);
-
-The replacement I/O functions must have prototypes as follows:
-
-    void user_read_data(png_structp png_ptr,
-        png_bytep data, png_size_t length);
-    void user_write_data(png_structp png_ptr,
-        png_bytep data, png_size_t length);
-    void user_flush_data(png_structp png_ptr);
-
-Supplying NULL for the read, write, or flush functions sets them back
-to using the default C stream functions.  It is an error to read from
-a write stream, and vice versa.
-
-Error handling in libpng is done through png_error() and png_warning().
-Errors handled through png_error() are fatal, meaning that png_error()
-should never return to its caller.  Currently, this is handled via
-setjmp() and longjmp() (unless you have compiled libpng with
-PNG_SETJMP_NOT_SUPPORTED, in which case it is handled via PNG_ABORT()),
-but you could change this to do things like exit() if you should wish.
-
-On non-fatal errors, png_warning() is called
-to print a warning message, and then control returns to the calling code.
-By default png_error() and png_warning() print a message on stderr via
-fprintf() unless the library is compiled with PNG_NO_CONSOLE_IO defined
-(because you don't want the messages) or PNG_NO_STDIO defined (because
-fprintf() isn't available).  If you wish to change the behavior of the error
-functions, you will need to set up your own message callbacks.  These
-functions are normally supplied at the time that the png_struct is created.
-It is also possible to redirect errors and warnings to your own replacement
-functions after png_create_*_struct() has been called by calling:
-
-    png_set_error_fn(png_structp png_ptr,
-        png_voidp error_ptr, png_error_ptr error_fn,
-        png_error_ptr warning_fn);
-
-    png_voidp error_ptr = png_get_error_ptr(png_ptr);
-
-If NULL is supplied for either error_fn or warning_fn, then the libpng
-default function will be used, calling fprintf() and/or longjmp() if a
-problem is encountered.  The replacement error functions should have
-parameters as follows:
-
-    void user_error_fn(png_structp png_ptr,
-        png_const_charp error_msg);
-    void user_warning_fn(png_structp png_ptr,
-        png_const_charp warning_msg);
-
-The motivation behind using setjmp() and longjmp() is the C++ throw and
-catch exception handling methods.  This makes the code much easier to write,
-as there is no need to check every return code of every function call.
-However, there are some uncertainties about the status of local variables
-after a longjmp, so the user may want to be careful about doing anything after
-setjmp returns non-zero besides returning itself.  Consult your compiler
-documentation for more details.  For an alternative approach, you may wish
-to use the "cexcept" facility (see http://cexcept.sourceforge.net).
-
-Custom chunks
-
-If you need to read or write custom chunks, you may need to get deeper
-into the libpng code.  The library now has mechanisms for storing
-and writing chunks of unknown type; you can even declare callbacks
-for custom chunks.  However, this may not be good enough if the
-library code itself needs to know about interactions between your
-chunk and existing `intrinsic' chunks.
-
-If you need to write a new intrinsic chunk, first read the PNG
-specification. Acquire a first level of
-understanding of how it works.  Pay particular attention to the
-sections that describe chunk names, and look at how other chunks were
-designed, so you can do things similarly.  Second, check out the
-sections of libpng that read and write chunks.  Try to find a chunk
-that is similar to yours and use it as a template.  More details can
-be found in the comments inside the code.  It is best to handle unknown
-chunks in a generic method, via callback functions, instead of by
-modifying libpng functions.
-
-If you wish to write your own transformation for the data, look through
-the part of the code that does the transformations, and check out some of
-the simpler ones to get an idea of how they work.  Try to find a similar
-transformation to the one you want to add and copy off of it.  More details
-can be found in the comments inside the code itself.
-
-Configuring for 16 bit platforms
-
-You will want to look into zconf.h to tell zlib (and thus libpng) that
-it cannot allocate more then 64K at a time.  Even if you can, the memory
-won't be accessible.  So limit zlib and libpng to 64K by defining MAXSEG_64K.
-
-Configuring for DOS
-
-For DOS users who only have access to the lower 640K, you will
-have to limit zlib's memory usage via a png_set_compression_mem_level()
-call.  See zlib.h or zconf.h in the zlib library for more information.
-
-Configuring for Medium Model
-
-Libpng's support for medium model has been tested on most of the popular
-compilers.  Make sure MAXSEG_64K gets defined, USE_FAR_KEYWORD gets
-defined, and FAR gets defined to far in pngconf.h, and you should be
-all set.  Everything in the library (except for zlib's structure) is
-expecting far data.  You must use the typedefs with the p or pp on
-the end for pointers (or at least look at them and be careful).  Make
-note that the rows of data are defined as png_bytepp, which is an
-unsigned char far * far *.
-
-Configuring for gui/windowing platforms:
-
-You will need to write new error and warning functions that use the GUI
-interface, as described previously, and set them to be the error and
-warning functions at the time that png_create_*_struct() is called,
-in order to have them available during the structure initialization.
-They can be changed later via png_set_error_fn().  On some compilers,
-you may also have to change the memory allocators (png_malloc, etc.).
-
-Configuring for compiler xxx:
-
-All includes for libpng are in pngconf.h.  If you need to add/change/delete
-an include, this is the place to do it.  The includes that are not
-needed outside libpng are protected by the PNG_INTERNAL definition,
-which is only defined for those routines inside libpng itself.  The
-files in libpng proper only include png.h, which includes pngconf.h.
-
-Configuring zlib:
-
-There are special functions to configure the compression.  Perhaps the
-most useful one changes the compression level, which currently uses
-input compression values in the range 0 - 9.  The library normally
-uses the default compression level (Z_DEFAULT_COMPRESSION = 6).  Tests
-have shown that for a large majority of images, compression values in
-the range 3-6 compress nearly as well as higher levels, and do so much
-faster.  For online applications it may be desirable to have maximum speed
-(Z_BEST_SPEED = 1).  With versions of zlib after v0.99, you can also
-specify no compression (Z_NO_COMPRESSION = 0), but this would create
-files larger than just storing the raw bitmap.  You can specify the
-compression level by calling:
-
-    png_set_compression_level(png_ptr, level);
-
-Another useful one is to reduce the memory level used by the library.
-The memory level defaults to 8, but it can be lowered if you are
-short on memory (running DOS, for example, where you only have 640K).
-Note that the memory level does have an effect on compression; among
-other things, lower levels will result in sections of incompressible
-data being emitted in smaller stored blocks, with a correspondingly
-larger relative overhead of up to 15% in the worst case.
-
-    png_set_compression_mem_level(png_ptr, level);
-
-The other functions are for configuring zlib.  They are not recommended
-for normal use and may result in writing an invalid PNG file.  See
-zlib.h for more information on what these mean.
-
-    png_set_compression_strategy(png_ptr,
-        strategy);
-    png_set_compression_window_bits(png_ptr,
-        window_bits);
-    png_set_compression_method(png_ptr, method);
-    png_set_compression_buffer_size(png_ptr, size);
-
-Controlling row filtering
-
-If you want to control whether libpng uses filtering or not, which
-filters are used, and how it goes about picking row filters, you
-can call one of these functions.  The selection and configuration
-of row filters can have a significant impact on the size and
-encoding speed and a somewhat lesser impact on the decoding speed
-of an image.  Filtering is enabled by default for RGB and grayscale
-images (with and without alpha), but not for paletted images nor
-for any images with bit depths less than 8 bits/pixel.
-
-The 'method' parameter sets the main filtering method, which is
-currently only '0' in the PNG 1.2 specification.  The 'filters'
-parameter sets which filter(s), if any, should be used for each
-scanline.  Possible values are PNG_ALL_FILTERS and PNG_NO_FILTERS
-to turn filtering on and off, respectively.
-
-Individual filter types are PNG_FILTER_NONE, PNG_FILTER_SUB,
-PNG_FILTER_UP, PNG_FILTER_AVG, PNG_FILTER_PAETH, which can be bitwise
-ORed together with '|' to specify one or more filters to use.
-These filters are described in more detail in the PNG specification.
-If you intend to change the filter type during the course of writing
-the image, you should start with flags set for all of the filters
-you intend to use so that libpng can initialize its internal
-structures appropriately for all of the filter types.  (Note that this
-means the first row must always be adaptively filtered, because libpng
-currently does not allocate the filter buffers until png_write_row()
-is called for the first time.)
-
-    filters = PNG_FILTER_NONE | PNG_FILTER_SUB
-              PNG_FILTER_UP | PNG_FILTER_AVE |
-              PNG_FILTER_PAETH | PNG_ALL_FILTERS;
-
-    png_set_filter(png_ptr, PNG_FILTER_TYPE_BASE,
-       filters);
-              The second parameter can also be
-              PNG_INTRAPIXEL_DIFFERENCING if you are
-              writing a PNG to be embedded in a MNG
-              datastream.  This parameter must be the
-              same as the value of filter_method used
-              in png_set_IHDR().
-
-It is also possible to influence how libpng chooses from among the
-available filters.  This is done in one or both of two ways - by
-telling it how important it is to keep the same filter for successive
-rows, and by telling it the relative computational costs of the filters.
-
-    double weights[3] = {1.5, 1.3, 1.1},
-       costs[PNG_FILTER_VALUE_LAST] =
-       {1.0, 1.3, 1.3, 1.5, 1.7};
-
-    png_set_filter_heuristics(png_ptr,
-       PNG_FILTER_HEURISTIC_WEIGHTED, 3,
-       weights, costs);
-
-The weights are multiplying factors that indicate to libpng that the
-row filter should be the same for successive rows unless another row filter
-is that many times better than the previous filter.  In the above example,
-if the previous 3 filters were SUB, SUB, NONE, the SUB filter could have a
-"sum of absolute differences" 1.5 x 1.3 times higher than other filters
-and still be chosen, while the NONE filter could have a sum 1.1 times
-higher than other filters and still be chosen.  Unspecified weights are
-taken to be 1.0, and the specified weights should probably be declining
-like those above in order to emphasize recent filters over older filters.
-
-The filter costs specify for each filter type a relative decoding cost
-to be considered when selecting row filters.  This means that filters
-with higher costs are less likely to be chosen over filters with lower
-costs, unless their "sum of absolute differences" is that much smaller.
-The costs do not necessarily reflect the exact computational speeds of
-the various filters, since this would unduly influence the final image
-size.
-
-Note that the numbers above were invented purely for this example and
-are given only to help explain the function usage.  Little testing has
-been done to find optimum values for either the costs or the weights.
-
-Removing unwanted object code
-
-There are a bunch of #define's in pngconf.h that control what parts of
-libpng are compiled.  All the defines end in _SUPPORTED.  If you are
-never going to use a capability, you can change the #define to #undef
-before recompiling libpng and save yourself code and data space, or
-you can turn off individual capabilities with defines that begin with
-PNG_NO_.
-
-You can also turn all of the transforms and ancillary chunk capabilities
-off en masse with compiler directives that define
-PNG_NO_READ[or WRITE]_TRANSFORMS, or PNG_NO_READ[or WRITE]_ANCILLARY_CHUNKS,
-or all four,
-along with directives to turn on any of the capabilities that you do
-want.  The PNG_NO_READ[or WRITE]_TRANSFORMS directives disable
-the extra transformations but still leave the library fully capable of reading
-and writing PNG files with all known public chunks
-Use of the PNG_NO_READ[or WRITE]_ANCILLARY_CHUNKS directive
-produces a library that is incapable of reading or writing ancillary chunks.
-If you are not using the progressive reading capability, you can
-turn that off with PNG_NO_PROGRESSIVE_READ (don't confuse
-this with the INTERLACING capability, which you'll still have).
-
-All the reading and writing specific code are in separate files, so the
-linker should only grab the files it needs.  However, if you want to
-make sure, or if you are building a stand alone library, all the
-reading files start with pngr and all the writing files start with
-pngw.  The files that don't match either (like png.c, pngtrans.c, etc.)
-are used for both reading and writing, and always need to be included.
-The progressive reader is in pngpread.c
-
-If you are creating or distributing a dynamically linked library (a .so
-or DLL file), you should not remove or disable any parts of the library,
-as this will cause applications linked with different versions of the
-library to fail if they call functions not available in your library.
-The size of the library itself should not be an issue, because only
-those sections that are actually used will be loaded into memory.
-
-Requesting debug printout
-
-The macro definition PNG_DEBUG can be used to request debugging
-printout.  Set it to an integer value in the range 0 to 3.  Higher
-numbers result in increasing amounts of debugging information.  The
-information is printed to the "stderr" file, unless another file
-name is specified in the PNG_DEBUG_FILE macro definition.
-
-When PNG_DEBUG > 0, the following functions (macros) become available:
-
-   png_debug(level, message)
-   png_debug1(level, message, p1)
-   png_debug2(level, message, p1, p2)
-
-in which "level" is compared to PNG_DEBUG to decide whether to print
-the message, "message" is the formatted string to be printed,
-and p1 and p2 are parameters that are to be embedded in the string
-according to printf-style formatting directives.  For example,
-
-   png_debug1(2, "foo=%d\n", foo);
-
-is expanded to
-
-   if(PNG_DEBUG > 2)
-     fprintf(PNG_DEBUG_FILE, "foo=%d\n", foo);
-
-When PNG_DEBUG is defined but is zero, the macros aren't defined, but you
-can still use PNG_DEBUG to control your own debugging:
-
-   #ifdef PNG_DEBUG
-       fprintf(stderr, ...
-   #endif
-
-When PNG_DEBUG = 1, the macros are defined, but only png_debug statements
-having level = 0 will be printed.  There aren't any such statements in
-this version of libpng, but if you insert some they will be printed.
-
-VI.  Runtime optimization
-
-A new feature in libpng 1.2.0 is the ability to dynamically switch between
-standard and optimized versions of some routines.  Currently these are
-limited to three computationally intensive tasks when reading PNG files:
-decoding row filters, expanding interlacing, and combining interlaced or
-transparent row data with previous row data.  Currently the optimized
-versions are available only for x86 (Intel, AMD, etc.) platforms with
-MMX support, though this may change in future versions.  (For example,
-the non-MMX assembler optimizations for zlib might become similarly
-runtime-selectable in future releases, in which case libpng could be
-extended to support them.  Alternatively, the compile-time choice of
-floating-point versus integer routines for gamma correction might become
-runtime-selectable.)
-
-Because such optimizations tend to be very platform- and compiler-dependent,
-both in how they are written and in how they perform, the new runtime code
-in libpng has been written to allow programs to query, enable, and disable
-either specific optimizations or all such optimizations.  For example, to
-enable all possible optimizations (bearing in mind that some "optimizations"
-may actually run more slowly in rare cases):
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
-       png_uint_32 mask, flags;
-
-       flags = png_get_asm_flags(png_ptr);
-       mask = png_get_asm_flagmask(PNG_SELECT_READ | PNG_SELECT_WRITE);
-       png_set_asm_flags(png_ptr, flags | mask);
-    #endif
-
-To enable only optimizations relevant to reading PNGs, use PNG_SELECT_READ
-by itself when calling png_get_asm_flagmask(); similarly for optimizing
-only writing.  To disable all optimizations:
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
-       flags = png_get_asm_flags(png_ptr);
-       mask = png_get_asm_flagmask(PNG_SELECT_READ | PNG_SELECT_WRITE);
-       png_set_asm_flags(png_ptr, flags & ~mask);
-    #endif
-
-To enable or disable only MMX-related features, use png_get_mmx_flagmask()
-in place of png_get_asm_flagmask().  The mmx version takes one additional
-parameter:
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
-       int selection = PNG_SELECT_READ | PNG_SELECT_WRITE;
-       int compilerID;
-
-       mask = png_get_mmx_flagmask(selection, &compilerID);
-    #endif
-
-On return, compilerID will indicate which version of the MMX assembler
-optimizations was compiled.  Currently two flavors exist:  Microsoft
-Visual C++ (compilerID == 1) and GNU C (a.k.a. gcc/gas, compilerID == 2).
-On non-x86 platforms or on systems compiled without MMX optimizations, a
-value of -1 is used.
-
-Note that both png_get_asm_flagmask() and png_get_mmx_flagmask() return
-all valid, settable optimization bits for the version of the library that's
-currently in use.  In the case of shared (dynamically linked) libraries,
-this may include optimizations that did not exist at the time the code was
-written and compiled.  It is also possible, of course, to enable only known,
-specific optimizations; for example:
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200) && \
-       defined(PNG_USE_PNGGCCRD) && defined(PNG_ASSEMBLER_CODE_SUPPORTED)
-       flags = PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  \
-             | PNG_ASM_FLAG_MMX_READ_INTERLACE    \
-             | PNG_ASM_FLAG_MMX_READ_FILTER_SUB   \
-             | PNG_ASM_FLAG_MMX_READ_FILTER_UP    \
-             | PNG_ASM_FLAG_MMX_READ_FILTER_AVG   \
-             | PNG_ASM_FLAG_MMX_READ_FILTER_PAETH ;
-       png_set_asm_flags(png_ptr, flags);
-    #endif
-
-This method would enable only the MMX read-optimizations available at the
-time of libpng 1.2.0's release, regardless of whether a later version of
-the DLL were actually being used.  (Also note that these functions did not
-exist in versions older than 1.2.0, so any attempt to run a dynamically
-linked app on such an older version would fail.)
-
-To determine whether the processor supports MMX instructions at all, use
-the png_mmx_support() function:
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
-       mmxsupport = png_mmx_support();
-    #endif
-
-It returns -1 if MMX support is not compiled into libpng, 0 if MMX code
-is compiled but MMX is not supported by the processor, or 1 if MMX support
-is fully available.  Note that png_mmx_support(), png_get_mmx_flagmask(),
-and png_get_asm_flagmask() all may be called without allocating and ini-
-tializing any PNG structures (for example, as part of a usage screen or
-"about" box).
-
-The following code can be used to prevent an application from using the
-thread_unsafe features, even if libpng was built with PNG_THREAD_UNSAFE_OK
-defined:
-
-#if defined(PNG_USE_PNGGCCRD) && defined(PNG_ASSEMBLER_CODE_SUPPORTED) \
-  && defined(PNG_THREAD_UNSAFE_OK)
-    /* Disable thread-unsafe features of pnggccrd */
-    if (png_access_version_number() >= 10200)
-    {
-      png_uint_32 mmx_disable_mask = 0;
-      png_uint_32 asm_flags;
-
-      mmx_disable_mask |= ( PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  \
-                          | PNG_ASM_FLAG_MMX_READ_FILTER_SUB   \
-                          | PNG_ASM_FLAG_MMX_READ_FILTER_AVG   \
-                          | PNG_ASM_FLAG_MMX_READ_FILTER_PAETH );
-      asm_flags = png_get_asm_flags(png_ptr);
-      png_set_asm_flags(png_ptr, asm_flags & ~mmx_disable_mask);
-    }
-#endif
-
-For more extensive examples of runtime querying, enabling and disabling
-of optimized features, see contrib/gregbook/readpng2.c in the libpng
-source-code distribution.
-
-It is also possible to disable or enable specific optimization features
-at compile time.  To disable them entirely, which may result in slower
-but smaller and less complex and troublesome code, define
-
-     PNG_NO_ASSEMBLER_CODE
-
-If you do this, then the run-time code for setting and querying flags
-described above, and the assembler code in pnggccrd.c or pngvcrd.c will
-not be built.
-
-If you have disabled the assembler code, you can also disable the
-optimized C code, to obtain even slower but smaller code, by defining
-
-     PNG_NO_OPTIMZED_CODE
-
-To disable only the MMX assembler code, define
-
-     PNG_NO_MMX_CODE
-
-There are two versions of the MMX code: one for gcc compilers and one for
-MSVC compilers.  Pngconf.h should automatically detect which you are
-using, and it will set either PNG_USE_PNGGCCRD or PNG_USE_PNGVCRD for you.
-
-Define one or more of the following to disable specific parts of the
-assembler code:
-
-     PNG_NO_MMX_COMBINE_ROW
-     PNG_NO_MMX_READ_INTERLACE
-     PNG_NO_MMX_READ_FILTER_ROW
-
-If the latter is not disabled, you can disable one or more of the
-individual filter types by defining
-
-     PNG_NO_MMX_FILTER_SUB
-     PNG_NO_MMX_FILTER_UP
-     PNG_NO_MMX_FILTER_AVG
-     PNG_NO_MMX_FILTER_PAETH
-
-By default, libpng only enables the "sub" filter when gcc-3.x is used
-because experiments show that gcc-3.x produces bad code for the others.
-
-When you disable various MMX features, libpng will still be able to decode
-files with those features.  It will fall back upon the optimized C code
-or the unoptimized C code to decode them.
-
-
-VII.  MNG support
-
-The MNG specification (available at http://www.libpng.org/pub/mng) allows
-certain extensions to PNG for PNG images that are embedded in MNG datastreams.
-Libpng can support some of these extensions.  To enable them, use the
-png_permit_mng_features() function:
-
-   feature_set = png_permit_mng_features(png_ptr, mask)
-   mask is a png_uint_32 containing the bitwise OR of the
-        features you want to enable.  These include
-        PNG_FLAG_MNG_EMPTY_PLTE
-        PNG_FLAG_MNG_FILTER_64
-        PNG_ALL_MNG_FEATURES
-   feature_set is a png_uint_32 that is the bitwise AND of
-      your mask with the set of MNG features that is
-      supported by the version of libpng that you are using.
-
-It is an error to use this function when reading or writing a standalone
-PNG file with the PNG 8-byte signature.  The PNG datastream must be wrapped
-in a MNG datastream.  As a minimum, it must have the MNG 8-byte signature
-and the MHDR and MEND chunks.  Libpng does not provide support for these
-or any other MNG chunks; your application must provide its own support for
-them.  You may wish to consider using libmng (available at
-http://www.libmng.com) instead.
-
-VIII.  Changes to Libpng from version 0.88
-
-It should be noted that versions of libpng later than 0.96 are not
-distributed by the original libpng author, Guy Schalnat, nor by
-Andreas Dilger, who had taken over from Guy during 1996 and 1997, and
-distributed versions 0.89 through 0.96, but rather by another member
-of the original PNG Group, Glenn Randers-Pehrson.  Guy and Andreas are
-still alive and well, but they have moved on to other things.
-
-The old libpng functions png_read_init(), png_write_init(),
-png_info_init(), png_read_destroy(), and png_write_destroy() have been
-moved to PNG_INTERNAL in version 0.95 to discourage their use.  These
-functions will be removed from libpng version 2.0.0.
-
-The preferred method of creating and initializing the libpng structures is
-via the png_create_read_struct(), png_create_write_struct(), and
-png_create_info_struct() because they isolate the size of the structures
-from the application, allow version error checking, and also allow the
-use of custom error handling routines during the initialization, which
-the old functions do not.  The functions png_read_destroy() and
-png_write_destroy() do not actually free the memory that libpng
-allocated for these structs, but just reset the data structures, so they
-can be used instead of png_destroy_read_struct() and
-png_destroy_write_struct() if you feel there is too much system overhead
-allocating and freeing the png_struct for each image read.
-
-Setting the error callbacks via png_set_message_fn() before
-png_read_init() as was suggested in libpng-0.88 is no longer supported
-because this caused applications that do not use custom error functions
-to fail if the png_ptr was not initialized to zero.  It is still possible
-to set the error callbacks AFTER png_read_init(), or to change them with
-png_set_error_fn(), which is essentially the same function, but with a new
-name to force compilation errors with applications that try to use the old
-method.
-
-Starting with version 1.0.7, you can find out which version of the library
-you are using at run-time:
-
-   png_uint_32 libpng_vn = png_access_version_number();
-
-The number libpng_vn is constructed from the major version, minor
-version with leading zero, and release number with leading zero,
-(e.g., libpng_vn for version 1.0.7 is 10007).
-
-You can also check which version of png.h you used when compiling your
-application:
-
-   png_uint_32 application_vn = PNG_LIBPNG_VER;
-
-IX. Y2K Compliance in libpng
-
-August 30, 2007
-
-Since the PNG Development group is an ad-hoc body, we can't make
-an official declaration.
-
-This is your unofficial assurance that libpng from version 0.71 and
-upward through 1.2.20rc3 are Y2K compliant.  It is my belief that earlier
-versions were also Y2K compliant.
-
-Libpng only has three year fields.  One is a 2-byte unsigned integer that
-will hold years up to 65535.  The other two hold the date in text
-format, and will hold years up to 9999.
-
-The integer is
-    "png_uint_16 year" in png_time_struct.
-
-The strings are
-    "png_charp time_buffer" in png_struct and
-    "near_time_buffer", which is a local character string in png.c.
-
-There are seven time-related functions:
-
-    png_convert_to_rfc_1123() in png.c
-      (formerly png_convert_to_rfc_1152() in error)
-    png_convert_from_struct_tm() in pngwrite.c, called
-      in pngwrite.c
-    png_convert_from_time_t() in pngwrite.c
-    png_get_tIME() in pngget.c
-    png_handle_tIME() in pngrutil.c, called in pngread.c
-    png_set_tIME() in pngset.c
-    png_write_tIME() in pngwutil.c, called in pngwrite.c
-
-All appear to handle dates properly in a Y2K environment.  The
-png_convert_from_time_t() function calls gmtime() to convert from system
-clock time, which returns (year - 1900), which we properly convert to
-the full 4-digit year.  There is a possibility that applications using
-libpng are not passing 4-digit years into the png_convert_to_rfc_1123()
-function, or that they are incorrectly passing only a 2-digit year
-instead of "year - 1900" into the png_convert_from_struct_tm() function,
-but this is not under our control.  The libpng documentation has always
-stated that it works with 4-digit years, and the APIs have been
-documented as such.
-
-The tIME chunk itself is also Y2K compliant.  It uses a 2-byte unsigned
-integer to hold the year, and can hold years as large as 65535.
-
-zlib, upon which libpng depends, is also Y2K compliant.  It contains
-no date-related code.
-
-
-   Glenn Randers-Pehrson
-   libpng maintainer
-   PNG Development Group
diff --git a/libpng-1.2.20rc4.txt b/libpng-1.2.20rc4.txt
new file mode 100644
index 000000000..ba4d9ab7d
--- /dev/null
+++ b/libpng-1.2.20rc4.txt
@@ -0,0 +1,2851 @@
+libpng.txt - A description on how to use and modify libpng
+
+ libpng version 1.2.20rc4 - September 1, 2007
+ Updated and distributed by Glenn Randers-Pehrson
+ <glennrp at users.sourceforge.net>
+ Copyright (c) 1998-2007 Glenn Randers-Pehrson
+ For conditions of distribution and use, see copyright
+ notice in png.h.
+
+ based on:
+
+ libpng 1.0 beta 6  version 0.96 May 28, 1997
+ Updated and distributed by Andreas Dilger
+ Copyright (c) 1996, 1997 Andreas Dilger
+
+ libpng 1.0 beta 2 - version 0.88  January 26, 1996
+ For conditions of distribution and use, see copyright
+ notice in png.h. Copyright (c) 1995, 1996 Guy Eric
+ Schalnat, Group 42, Inc.
+
+ Updated/rewritten per request in the libpng FAQ
+ Copyright (c) 1995, 1996 Frank J. T. Wojcik
+ December 18, 1995 & January 20, 1996
+
+I. Introduction
+
+This file describes how to use and modify the PNG reference library
+(known as libpng) for your own use.  There are five sections to this
+file: introduction, structures, reading, writing, and modification and
+configuration notes for various special platforms.  In addition to this
+file, example.c is a good starting point for using the library, as
+it is heavily commented and should include everything most people
+will need.  We assume that libpng is already installed; see the
+INSTALL file for instructions on how to install libpng.
+
+For examples of libpng usage, see the files "example.c", "pngtest.c",
+and the files in the "contrib" directory, all of which are included in the
+libpng distribution.
+
+Libpng was written as a companion to the PNG specification, as a way
+of reducing the amount of time and effort it takes to support the PNG
+file format in application programs.
+
+The PNG specification (second edition), November 2003, is available as
+a W3C Recommendation and as an ISO Standard (ISO/IEC 15948:2003 (E)) at
+<http://www.w3.org/TR/2003/REC-PNG-20031110/
+The W3C and ISO documents have identical technical content.
+
+The PNG-1.2 specification is available at
+<http://www.libpng.org/pub/png/documents/>
+
+The PNG-1.0 specification is available
+as RFC 2083 <http://www.libpng.org/pub/png/documents/> and as a
+W3C Recommendation <http://www.w3.org/TR/REC.png.html>. Some
+additional chunks are described in the special-purpose public chunks
+documents at <http://www.libpng.org/pub/png/documents/>.
+
+Other information
+about PNG, and the latest version of libpng, can be found at the PNG home
+page, <http://www.libpng.org/pub/png/>.
+
+Most users will not have to modify the library significantly; advanced
+users may want to modify it more.  All attempts were made to make it as
+complete as possible, while keeping the code easy to understand.
+Currently, this library only supports C.  Support for other languages
+is being considered.
+
+Libpng has been designed to handle multiple sessions at one time,
+to be easily modifiable, to be portable to the vast majority of
+machines (ANSI, K&R, 16-, 32-, and 64-bit) available, and to be easy
+to use.  The ultimate goal of libpng is to promote the acceptance of
+the PNG file format in whatever way possible.  While there is still
+work to be done (see the TODO file), libpng should cover the
+majority of the needs of its users.
+
+Libpng uses zlib for its compression and decompression of PNG files.
+Further information about zlib, and the latest version of zlib, can
+be found at the zlib home page, <http://www.info-zip.org/pub/infozip/zlib/>.
+The zlib compression utility is a general purpose utility that is
+useful for more than PNG files, and can be used without libpng.
+See the documentation delivered with zlib for more details.
+You can usually find the source files for the zlib utility wherever you
+find the libpng source files.
+
+Libpng is thread safe, provided the threads are using different
+instances of the structures.  Each thread should have its own
+png_struct and png_info instances, and thus its own image.
+Libpng does not protect itself against two threads using the
+same instance of a structure.
+
+II. Structures
+
+There are two main structures that are important to libpng, png_struct
+and png_info.  The first, png_struct, is an internal structure that
+will not, for the most part, be used by a user except as the first
+variable passed to every libpng function call.
+
+The png_info structure is designed to provide information about the
+PNG file.  At one time, the fields of png_info were intended to be
+directly accessible to the user.  However, this tended to cause problems
+with applications using dynamically loaded libraries, and as a result
+a set of interface functions for png_info (the png_get_*() and png_set_*()
+functions) was developed.  The fields of png_info are still available for
+older applications, but it is suggested that applications use the new
+interfaces if at all possible.
+
+Applications that do make direct access to the members of png_struct (except
+for png_ptr->jmpbuf) must be recompiled whenever the library is updated,
+and applications that make direct access to the members of png_info must
+be recompiled if they were compiled or loaded with libpng version 1.0.6,
+in which the members were in a different order.  In version 1.0.7, the
+members of the png_info structure reverted to the old order, as they were
+in versions 0.97c through 1.0.5.  Starting with version 2.0.0, both
+structures are going to be hidden, and the contents of the structures will
+only be accessible through the png_get/png_set functions.
+
+The png.h header file is an invaluable reference for programming with libpng.
+And while I'm on the topic, make sure you include the libpng header file:
+
+#include <png.h>
+
+III. Reading
+
+We'll now walk you through the possible functions to call when reading
+in a PNG file sequentially, briefly explaining the syntax and purpose
+of each one.  See example.c and png.h for more detail.  While
+progressive reading is covered in the next section, you will still
+need some of the functions discussed in this section to read a PNG
+file.
+
+Setup
+
+You will want to do the I/O initialization(*) before you get into libpng,
+so if it doesn't work, you don't have much to undo.  Of course, you
+will also want to insure that you are, in fact, dealing with a PNG
+file.  Libpng provides a simple check to see if a file is a PNG file.
+To use it, pass in the first 1 to 8 bytes of the file to the function
+png_sig_cmp(), and it will return 0 if the bytes match the corresponding
+bytes of the PNG signature, or nonzero otherwise.  Of course, the more bytes
+you pass in, the greater the accuracy of the prediction.
+
+If you are intending to keep the file pointer open for use in libpng,
+you must ensure you don't read more than 8 bytes from the beginning
+of the file, and you also have to make a call to png_set_sig_bytes_read()
+with the number of bytes you read from the beginning.  Libpng will
+then only check the bytes (if any) that your program didn't read.
+
+(*): If you are not using the standard I/O functions, you will need
+to replace them with custom functions.  See the discussion under
+Customizing libpng.
+
+
+    FILE *fp = fopen(file_name, "rb");
+    if (!fp)
+    {
+        return (ERROR);
+    }
+    fread(header, 1, number, fp);
+    is_png = !png_sig_cmp(header, 0, number);
+    if (!is_png)
+    {
+        return (NOT_PNG);
+    }
+
+
+Next, png_struct and png_info need to be allocated and initialized.  In
+order to ensure that the size of these structures is correct even with a
+dynamically linked libpng, there are functions to initialize and
+allocate the structures.  We also pass the library version, optional
+pointers to error handling functions, and a pointer to a data struct for
+use by the error functions, if necessary (the pointer and functions can
+be NULL if the default error handlers are to be used).  See the section
+on Changes to Libpng below regarding the old initialization functions.
+The structure allocation functions quietly return NULL if they fail to
+create the structure, so your application should check for that.
+
+    png_structp png_ptr = png_create_read_struct
+       (PNG_LIBPNG_VER_STRING, (png_voidp)user_error_ptr,
+        user_error_fn, user_warning_fn);
+    if (!png_ptr)
+        return (ERROR);
+
+    png_infop info_ptr = png_create_info_struct(png_ptr);
+    if (!info_ptr)
+    {
+        png_destroy_read_struct(&png_ptr,
+           (png_infopp)NULL, (png_infopp)NULL);
+        return (ERROR);
+    }
+
+    png_infop end_info = png_create_info_struct(png_ptr);
+    if (!end_info)
+    {
+        png_destroy_read_struct(&png_ptr, &info_ptr,
+          (png_infopp)NULL);
+        return (ERROR);
+    }
+
+If you want to use your own memory allocation routines,
+define PNG_USER_MEM_SUPPORTED and use
+png_create_read_struct_2() instead of png_create_read_struct():
+
+    png_structp png_ptr = png_create_read_struct_2
+       (PNG_LIBPNG_VER_STRING, (png_voidp)user_error_ptr,
+        user_error_fn, user_warning_fn, (png_voidp)
+        user_mem_ptr, user_malloc_fn, user_free_fn);
+
+The error handling routines passed to png_create_read_struct()
+and the memory alloc/free routines passed to png_create_struct_2()
+are only necessary if you are not using the libpng supplied error
+handling and memory alloc/free functions.
+
+When libpng encounters an error, it expects to longjmp back
+to your routine.  Therefore, you will need to call setjmp and pass
+your png_jmpbuf(png_ptr).  If you read the file from different
+routines, you will need to update the jmpbuf field every time you enter
+a new routine that will call a png_*() function.
+
+See your documentation of setjmp/longjmp for your compiler for more
+information on setjmp/longjmp.  See the discussion on libpng error
+handling in the Customizing Libpng section below for more information
+on the libpng error handling.  If an error occurs, and libpng longjmp's
+back to your setjmp, you will want to call png_destroy_read_struct() to
+free any memory.
+
+    if (setjmp(png_jmpbuf(png_ptr)))
+    {
+        png_destroy_read_struct(&png_ptr, &info_ptr,
+           &end_info);
+        fclose(fp);
+        return (ERROR);
+    }
+
+If you would rather avoid the complexity of setjmp/longjmp issues,
+you can compile libpng with PNG_SETJMP_NOT_SUPPORTED, in which case
+errors will result in a call to PNG_ABORT() which defaults to abort().
+
+Now you need to set up the input code.  The default for libpng is to
+use the C function fread().  If you use this, you will need to pass a
+valid FILE * in the function png_init_io().  Be sure that the file is
+opened in binary mode.  If you wish to handle reading data in another
+way, you need not call the png_init_io() function, but you must then
+implement the libpng I/O methods discussed in the Customizing Libpng
+section below.
+
+    png_init_io(png_ptr, fp);
+
+If you had previously opened the file and read any of the signature from
+the beginning in order to see if this was a PNG file, you need to let
+libpng know that there are some bytes missing from the start of the file.
+
+    png_set_sig_bytes(png_ptr, number);
+
+Setting up callback code
+
+You can set up a callback function to handle any unknown chunks in the
+input stream. You must supply the function
+
+    read_chunk_callback(png_ptr ptr,
+         png_unknown_chunkp chunk);
+    {
+       /* The unknown chunk structure contains your
+          chunk data: */
+           png_byte name[5];
+           png_byte *data;
+           png_size_t size;
+       /* Note that libpng has already taken care of
+          the CRC handling */
+
+       /* put your code here.  Return one of the
+          following: */
+
+       return (-n); /* chunk had an error */
+       return (0); /* did not recognize */
+       return (n); /* success */
+    }
+
+(You can give your function another name that you like instead of
+"read_chunk_callback")
+
+To inform libpng about your function, use
+
+    png_set_read_user_chunk_fn(png_ptr, user_chunk_ptr,
+        read_chunk_callback);
+
+This names not only the callback function, but also a user pointer that
+you can retrieve with
+
+    png_get_user_chunk_ptr(png_ptr);
+
+At this point, you can set up a callback function that will be
+called after each row has been read, which you can use to control
+a progress meter or the like.  It's demonstrated in pngtest.c.
+You must supply a function
+
+    void read_row_callback(png_ptr ptr, png_uint_32 row,
+       int pass);
+    {
+      /* put your code here */
+    }
+
+(You can give it another name that you like instead of "read_row_callback")
+
+To inform libpng about your function, use
+
+    png_set_read_status_fn(png_ptr, read_row_callback);
+
+Width and height limits
+
+The PNG specification allows the width and height of an image to be as
+large as 2^31-1 (0x7fffffff), or about 2.147 billion rows and columns.
+Since very few applications really need to process such large images,
+we have imposed an arbitrary 1-million limit on rows and columns.
+Larger images will be rejected immediately with a png_error() call. If
+you wish to override this limit, you can use
+
+   png_set_user_limits(png_ptr, width_max, height_max);
+
+to set your own limits, or use width_max = height_max = 0x7fffffffL
+to allow all valid dimensions (libpng may reject some very large images
+anyway because of potential buffer overflow conditions).
+
+You should put this statement after you create the PNG structure and
+before calling png_read_info(), png_read_png(), or png_process_data().
+If you need to retrieve the limits that are being applied, use
+
+   width_max = png_get_user_width_max(png_ptr);
+   height_max = png_get_user_height_max(png_ptr);
+
+Unknown-chunk handling
+
+Now you get to set the way the library processes unknown chunks in the
+input PNG stream. Both known and unknown chunks will be read.  Normal
+behavior is that known chunks will be parsed into information in
+various info_ptr members; unknown chunks will be discarded. To change
+this, you can call:
+
+    png_set_keep_unknown_chunks(png_ptr, keep,
+        chunk_list, num_chunks);
+    keep       - 0: do not handle as unknown
+                 1: do not keep
+                 2: keep only if safe-to-copy
+                 3: keep even if unsafe-to-copy
+               You can use these definitions:
+                 PNG_HANDLE_CHUNK_AS_DEFAULT   0
+                 PNG_HANDLE_CHUNK_NEVER        1
+                 PNG_HANDLE_CHUNK_IF_SAFE      2
+                 PNG_HANDLE_CHUNK_ALWAYS       3
+    chunk_list - list of chunks affected (a byte string,
+                 five bytes per chunk, NULL or '\0' if
+                 num_chunks is 0)
+    num_chunks - number of chunks affected; if 0, all
+                 unknown chunks are affected.  If nonzero,
+                 only the chunks in the list are affected
+
+Unknown chunks declared in this way will be saved as raw data onto a
+list of png_unknown_chunk structures.  If a chunk that is normally
+known to libpng is named in the list, it will be handled as unknown,
+according to the "keep" directive.  If a chunk is named in successive
+instances of png_set_keep_unknown_chunks(), the final instance will
+take precedence.  The IHDR and IEND chunks should not be named in
+chunk_list; if they are, libpng will process them normally anyway.
+
+The high-level read interface
+
+At this point there are two ways to proceed; through the high-level
+read interface, or through a sequence of low-level read operations.
+You can use the high-level interface if (a) you are willing to read
+the entire image into memory, and (b) the input transformations
+you want to do are limited to the following set:
+
+    PNG_TRANSFORM_IDENTITY      No transformation
+    PNG_TRANSFORM_STRIP_16      Strip 16-bit samples to
+                                8 bits
+    PNG_TRANSFORM_STRIP_ALPHA   Discard the alpha channel
+    PNG_TRANSFORM_PACKING       Expand 1, 2 and 4-bit
+                                samples to bytes
+    PNG_TRANSFORM_PACKSWAP      Change order of packed
+                                pixels to LSB first
+    PNG_TRANSFORM_EXPAND        Perform set_expand()
+    PNG_TRANSFORM_INVERT_MONO   Invert monochrome images
+    PNG_TRANSFORM_SHIFT         Normalize pixels to the
+                                sBIT depth
+    PNG_TRANSFORM_BGR           Flip RGB to BGR, RGBA
+                                to BGRA
+    PNG_TRANSFORM_SWAP_ALPHA    Flip RGBA to ARGB or GA
+                                to AG
+    PNG_TRANSFORM_INVERT_ALPHA  Change alpha from opacity
+                                to transparency
+    PNG_TRANSFORM_SWAP_ENDIAN   Byte-swap 16-bit samples
+
+(This excludes setting a background color, doing gamma transformation,
+dithering, and setting filler.)  If this is the case, simply do this:
+
+    png_read_png(png_ptr, info_ptr, png_transforms, NULL)
+
+where png_transforms is an integer containing the bitwise OR of
+some set of transformation flags.  This call is equivalent to png_read_info(),
+followed the set of transformations indicated by the transform mask,
+then png_read_image(), and finally png_read_end().
+
+(The final parameter of this call is not yet used.  Someday it might point
+to transformation parameters required by some future input transform.)
+
+You must use png_transforms and not call any png_set_transform() functions
+when you use png_read_png().
+
+After you have called png_read_png(), you can retrieve the image data
+with
+
+   row_pointers = png_get_rows(png_ptr, info_ptr);
+
+where row_pointers is an array of pointers to the pixel data for each row:
+
+   png_bytep row_pointers[height];
+
+If you know your image size and pixel size ahead of time, you can allocate
+row_pointers prior to calling png_read_png() with
+
+   if (height > PNG_UINT_32_MAX/png_sizeof(png_byte))
+      png_error (png_ptr,
+         "Image is too tall to process in memory");
+   if (width > PNG_UINT_32_MAX/pixel_size)
+      png_error (png_ptr,
+         "Image is too wide to process in memory");
+   row_pointers = png_malloc(png_ptr,
+      height*png_sizeof(png_bytep));
+   for (int i=0; i<height, i++)
+      row_pointers[i]=png_malloc(png_ptr,
+         width*pixel_size);
+   png_set_rows(png_ptr, info_ptr, &row_pointers);
+
+Alternatively you could allocate your image in one big block and define
+row_pointers[i] to point into the proper places in your block.
+
+If you use png_set_rows(), the application is responsible for freeing
+row_pointers (and row_pointers[i], if they were separately allocated).
+
+If you don't allocate row_pointers ahead of time, png_read_png() will
+do it, and it'll be free'ed when you call png_destroy_*().
+
+The low-level read interface
+
+If you are going the low-level route, you are now ready to read all
+the file information up to the actual image data.  You do this with a
+call to png_read_info().
+
+    png_read_info(png_ptr, info_ptr);
+
+This will process all chunks up to but not including the image data.
+
+Querying the info structure
+
+Functions are used to get the information from the info_ptr once it
+has been read.  Note that these fields may not be completely filled
+in until png_read_end() has read the chunk data following the image.
+
+    png_get_IHDR(png_ptr, info_ptr, &width, &height,
+       &bit_depth, &color_type, &interlace_type,
+       &compression_type, &filter_method);
+
+    width          - holds the width of the image
+                     in pixels (up to 2^31).
+    height         - holds the height of the image
+                     in pixels (up to 2^31).
+    bit_depth      - holds the bit depth of one of the
+                     image channels.  (valid values are
+                     1, 2, 4, 8, 16 and depend also on
+                     the color_type.  See also
+                     significant bits (sBIT) below).
+    color_type     - describes which color/alpha channels
+                         are present.
+                     PNG_COLOR_TYPE_GRAY
+                        (bit depths 1, 2, 4, 8, 16)
+                     PNG_COLOR_TYPE_GRAY_ALPHA
+                        (bit depths 8, 16)
+                     PNG_COLOR_TYPE_PALETTE
+                        (bit depths 1, 2, 4, 8)
+                     PNG_COLOR_TYPE_RGB
+                        (bit_depths 8, 16)
+                     PNG_COLOR_TYPE_RGB_ALPHA
+                        (bit_depths 8, 16)
+
+                     PNG_COLOR_MASK_PALETTE
+                     PNG_COLOR_MASK_COLOR
+                     PNG_COLOR_MASK_ALPHA
+
+    filter_method  - (must be PNG_FILTER_TYPE_BASE
+                     for PNG 1.0, and can also be
+                     PNG_INTRAPIXEL_DIFFERENCING if
+                     the PNG datastream is embedded in
+                     a MNG-1.0 datastream)
+    compression_type - (must be PNG_COMPRESSION_TYPE_BASE
+                     for PNG 1.0)
+    interlace_type - (PNG_INTERLACE_NONE or
+                     PNG_INTERLACE_ADAM7)
+    Any or all of interlace_type, compression_type, of
+    filter_method can be NULL if you are
+    not interested in their values.
+
+    channels = png_get_channels(png_ptr, info_ptr);
+    channels       - number of channels of info for the
+                     color type (valid values are 1 (GRAY,
+                     PALETTE), 2 (GRAY_ALPHA), 3 (RGB),
+                     4 (RGB_ALPHA or RGB + filler byte))
+    rowbytes = png_get_rowbytes(png_ptr, info_ptr);
+    rowbytes       - number of bytes needed to hold a row
+
+    signature = png_get_signature(png_ptr, info_ptr);
+    signature      - holds the signature read from the
+                     file (if any).  The data is kept in
+                     the same offset it would be if the
+                     whole signature were read (i.e. if an
+                     application had already read in 4
+                     bytes of signature before starting
+                     libpng, the remaining 4 bytes would
+                     be in signature[4] through signature[7]
+                     (see png_set_sig_bytes())).
+
+
+    width            = png_get_image_width(png_ptr,
+                         info_ptr);
+    height           = png_get_image_height(png_ptr,
+                         info_ptr);
+    bit_depth        = png_get_bit_depth(png_ptr,
+                         info_ptr);
+    color_type       = png_get_color_type(png_ptr,
+                         info_ptr);
+    filter_method    = png_get_filter_type(png_ptr,
+                         info_ptr);
+    compression_type = png_get_compression_type(png_ptr,
+                         info_ptr);
+    interlace_type   = png_get_interlace_type(png_ptr,
+                         info_ptr);
+
+
+These are also important, but their validity depends on whether the chunk
+has been read.  The png_get_valid(png_ptr, info_ptr, PNG_INFO_<chunk>) and
+png_get_<chunk>(png_ptr, info_ptr, ...) functions return non-zero if the
+data has been read, or zero if it is missing.  The parameters to the
+png_get_<chunk> are set directly if they are simple data types, or a pointer
+into the info_ptr is returned for any complex types.
+
+    png_get_PLTE(png_ptr, info_ptr, &palette,
+                     &num_palette);
+    palette        - the palette for the file
+                     (array of png_color)
+    num_palette    - number of entries in the palette
+
+    png_get_gAMA(png_ptr, info_ptr, &gamma);
+    gamma          - the gamma the file is written
+                     at (PNG_INFO_gAMA)
+
+    png_get_sRGB(png_ptr, info_ptr, &srgb_intent);
+    srgb_intent    - the rendering intent (PNG_INFO_sRGB)
+                     The presence of the sRGB chunk
+                     means that the pixel data is in the
+                     sRGB color space.  This chunk also
+                     implies specific values of gAMA and
+                     cHRM.
+
+    png_get_iCCP(png_ptr, info_ptr, &name,
+       &compression_type, &profile, &proflen);
+    name            - The profile name.
+    compression     - The compression type; always
+                      PNG_COMPRESSION_TYPE_BASE for PNG 1.0.
+                      You may give NULL to this argument to
+                      ignore it.
+    profile         - International Color Consortium color
+                      profile data. May contain NULs.
+    proflen         - length of profile data in bytes.
+
+    png_get_sBIT(png_ptr, info_ptr, &sig_bit);
+    sig_bit        - the number of significant bits for
+                     (PNG_INFO_sBIT) each of the gray,
+                     red, green, and blue channels,
+                     whichever are appropriate for the
+                     given color type (png_color_16)
+
+    png_get_tRNS(png_ptr, info_ptr, &trans, &num_trans,
+                     &trans_values);
+    trans          - array of transparent entries for
+                     palette (PNG_INFO_tRNS)
+    trans_values   - graylevel or color sample values of
+                     the single transparent color for
+                     non-paletted images (PNG_INFO_tRNS)
+    num_trans      - number of transparent entries
+                     (PNG_INFO_tRNS)
+
+    png_get_hIST(png_ptr, info_ptr, &hist);
+                     (PNG_INFO_hIST)
+    hist           - histogram of palette (array of
+                     png_uint_16)
+
+    png_get_tIME(png_ptr, info_ptr, &mod_time);
+    mod_time       - time image was last modified
+                    (PNG_VALID_tIME)
+
+    png_get_bKGD(png_ptr, info_ptr, &background);
+    background     - background color (PNG_VALID_bKGD)
+                     valid 16-bit red, green and blue
+                     values, regardless of color_type
+
+    num_comments   = png_get_text(png_ptr, info_ptr,
+                     &text_ptr, &num_text);
+    num_comments   - number of comments
+    text_ptr       - array of png_text holding image
+                     comments
+    text_ptr[i].compression - type of compression used
+                 on "text" PNG_TEXT_COMPRESSION_NONE
+                           PNG_TEXT_COMPRESSION_zTXt
+                           PNG_ITXT_COMPRESSION_NONE
+                           PNG_ITXT_COMPRESSION_zTXt
+    text_ptr[i].key   - keyword for comment.  Must contain
+                         1-79 characters.
+    text_ptr[i].text  - text comments for current
+                         keyword.  Can be empty.
+    text_ptr[i].text_length - length of text string,
+                 after decompression, 0 for iTXt
+    text_ptr[i].itxt_length - length of itxt string,
+                 after decompression, 0 for tEXt/zTXt
+    text_ptr[i].lang  - language of comment (empty
+                         string for unknown).
+    text_ptr[i].lang_key  - keyword in UTF-8
+                         (empty string for unknown).
+    num_text       - number of comments (same as
+                     num_comments; you can put NULL here
+                     to avoid the duplication)
+    Note while png_set_text() will accept text, language,
+    and translated keywords that can be NULL pointers, the
+    structure returned by png_get_text will always contain
+    regular zero-terminated C strings.  They might be
+    empty strings but they will never be NULL pointers.
+
+    num_spalettes = png_get_sPLT(png_ptr, info_ptr,
+       &palette_ptr);
+    palette_ptr    - array of palette structures holding
+                     contents of one or more sPLT chunks
+                     read.
+    num_spalettes  - number of sPLT chunks read.
+
+    png_get_oFFs(png_ptr, info_ptr, &offset_x, &offset_y,
+       &unit_type);
+    offset_x       - positive offset from the left edge
+                     of the screen
+    offset_y       - positive offset from the top edge
+                     of the screen
+    unit_type      - PNG_OFFSET_PIXEL, PNG_OFFSET_MICROMETER
+
+    png_get_pHYs(png_ptr, info_ptr, &res_x, &res_y,
+       &unit_type);
+    res_x          - pixels/unit physical resolution in
+                     x direction
+    res_y          - pixels/unit physical resolution in
+                     x direction
+    unit_type      - PNG_RESOLUTION_UNKNOWN,
+                     PNG_RESOLUTION_METER
+
+    png_get_sCAL(png_ptr, info_ptr, &unit, &width,
+       &height)
+    unit        - physical scale units (an integer)
+    width       - width of a pixel in physical scale units
+    height      - height of a pixel in physical scale units
+                 (width and height are doubles)
+
+    png_get_sCAL_s(png_ptr, info_ptr, &unit, &width,
+       &height)
+    unit        - physical scale units (an integer)
+    width       - width of a pixel in physical scale units
+    height      - height of a pixel in physical scale units
+                 (width and height are strings like "2.54")
+
+    num_unknown_chunks = png_get_unknown_chunks(png_ptr,
+       info_ptr, &unknowns)
+    unknowns          - array of png_unknown_chunk
+                        structures holding unknown chunks
+    unknowns[i].name  - name of unknown chunk
+    unknowns[i].data  - data of unknown chunk
+    unknowns[i].size  - size of unknown chunk's data
+    unknowns[i].location - position of chunk in file
+
+    The value of "i" corresponds to the order in which the
+    chunks were read from the PNG file or inserted with the
+    png_set_unknown_chunks() function.
+
+The data from the pHYs chunk can be retrieved in several convenient
+forms:
+
+    res_x = png_get_x_pixels_per_meter(png_ptr,
+       info_ptr)
+    res_y = png_get_y_pixels_per_meter(png_ptr,
+       info_ptr)
+    res_x_and_y = png_get_pixels_per_meter(png_ptr,
+       info_ptr)
+    res_x = png_get_x_pixels_per_inch(png_ptr,
+       info_ptr)
+    res_y = png_get_y_pixels_per_inch(png_ptr,
+       info_ptr)
+    res_x_and_y = png_get_pixels_per_inch(png_ptr,
+       info_ptr)
+    aspect_ratio = png_get_pixel_aspect_ratio(png_ptr,
+       info_ptr)
+
+   (Each of these returns 0 [signifying "unknown"] if
+       the data is not present or if res_x is 0;
+       res_x_and_y is 0 if res_x != res_y)
+
+The data from the oFFs chunk can be retrieved in several convenient
+forms:
+
+    x_offset = png_get_x_offset_microns(png_ptr, info_ptr);
+    y_offset = png_get_y_offset_microns(png_ptr, info_ptr);
+    x_offset = png_get_x_offset_inches(png_ptr, info_ptr);
+    y_offset = png_get_y_offset_inches(png_ptr, info_ptr);
+
+   (Each of these returns 0 [signifying "unknown" if both
+       x and y are 0] if the data is not present or if the
+       chunk is present but the unit is the pixel)
+
+For more information, see the png_info definition in png.h and the
+PNG specification for chunk contents.  Be careful with trusting
+rowbytes, as some of the transformations could increase the space
+needed to hold a row (expand, filler, gray_to_rgb, etc.).
+See png_read_update_info(), below.
+
+A quick word about text_ptr and num_text.  PNG stores comments in
+keyword/text pairs, one pair per chunk, with no limit on the number
+of text chunks, and a 2^31 byte limit on their size.  While there are
+suggested keywords, there is no requirement to restrict the use to these
+strings.  It is strongly suggested that keywords and text be sensible
+to humans (that's the point), so don't use abbreviations.  Non-printing
+symbols are not allowed.  See the PNG specification for more details.
+There is also no requirement to have text after the keyword.
+
+Keywords should be limited to 79 Latin-1 characters without leading or
+trailing spaces, but non-consecutive spaces are allowed within the
+keyword.  It is possible to have the same keyword any number of times.
+The text_ptr is an array of png_text structures, each holding a
+pointer to a language string, a pointer to a keyword and a pointer to
+a text string.  The text string, language code, and translated
+keyword may be empty or NULL pointers.  The keyword/text
+pairs are put into the array in the order that they are received.
+However, some or all of the text chunks may be after the image, so, to
+make sure you have read all the text chunks, don't mess with these
+until after you read the stuff after the image.  This will be
+mentioned again below in the discussion that goes with png_read_end().
+
+Input transformations
+
+After you've read the header information, you can set up the library
+to handle any special transformations of the image data.  The various
+ways to transform the data will be described in the order that they
+should occur.  This is important, as some of these change the color
+type and/or bit depth of the data, and some others only work on
+certain color types and bit depths.  Even though each transformation
+checks to see if it has data that it can do something with, you should
+make sure to only enable a transformation if it will be valid for the
+data.  For example, don't swap red and blue on grayscale data.
+
+The colors used for the background and transparency values should be
+supplied in the same format/depth as the current image data.  They
+are stored in the same format/depth as the image data in a bKGD or tRNS
+chunk, so this is what libpng expects for this data.  The colors are
+transformed to keep in sync with the image data when an application
+calls the png_read_update_info() routine (see below).
+
+Data will be decoded into the supplied row buffers packed into bytes
+unless the library has been told to transform it into another format.
+For example, 4 bit/pixel paletted or grayscale data will be returned
+2 pixels/byte with the leftmost pixel in the high-order bits of the
+byte, unless png_set_packing() is called.  8-bit RGB data will be stored
+in RGB RGB RGB format unless png_set_filler() or png_set_add_alpha()
+is called to insert filler bytes, either before or after each RGB triplet.
+16-bit RGB data will be returned RRGGBB RRGGBB, with the most significant
+byte of the color value first, unless png_set_strip_16() is called to
+transform it to regular RGB RGB triplets, or png_set_filler() or
+png_set_add alpha() is called to insert filler bytes, either before or
+after each RRGGBB triplet.  Similarly, 8-bit or 16-bit grayscale data can
+be modified with
+png_set_filler(), png_set_add_alpha(), or png_set_strip_16().
+
+The following code transforms grayscale images of less than 8 to 8 bits,
+changes paletted images to RGB, and adds a full alpha channel if there is
+transparency information in a tRNS chunk.  This is most useful on
+grayscale images with bit depths of 2 or 4 or if there is a multiple-image
+viewing application that wishes to treat all images in the same way.
+
+    if (color_type == PNG_COLOR_TYPE_PALETTE)
+        png_set_palette_to_rgb(png_ptr);
+
+    if (color_type == PNG_COLOR_TYPE_GRAY &&
+        bit_depth < 8) png_set_expand_gray_1_2_4_to_8(png_ptr);
+
+    if (png_get_valid(png_ptr, info_ptr,
+        PNG_INFO_tRNS)) png_set_tRNS_to_alpha(png_ptr);
+
+These three functions are actually aliases for png_set_expand(), added
+in libpng version 1.0.4, with the function names expanded to improve code
+readability.  In some future version they may actually do different
+things.
+
+As of libpng version 1.2.9, png_set_expand_gray_1_2_4_to_8() was
+added.  It expands the sample depth without changing tRNS to alpha.
+At the same time, png_set_gray_1_2_4_to_8() was deprecated, and it
+will be removed from a future version.
+
+PNG can have files with 16 bits per channel.  If you only can handle
+8 bits per channel, this will strip the pixels down to 8 bit.
+
+    if (bit_depth == 16)
+        png_set_strip_16(png_ptr);
+
+If, for some reason, you don't need the alpha channel on an image,
+and you want to remove it rather than combining it with the background
+(but the image author certainly had in mind that you *would* combine
+it with the background, so that's what you should probably do):
+
+    if (color_type & PNG_COLOR_MASK_ALPHA)
+        png_set_strip_alpha(png_ptr);
+
+In PNG files, the alpha channel in an image
+is the level of opacity.  If you need the alpha channel in an image to
+be the level of transparency instead of opacity, you can invert the
+alpha channel (or the tRNS chunk data) after it's read, so that 0 is
+fully opaque and 255 (in 8-bit or paletted images) or 65535 (in 16-bit
+images) is fully transparent, with
+
+    png_set_invert_alpha(png_ptr);
+
+PNG files pack pixels of bit depths 1, 2, and 4 into bytes as small as
+they can, resulting in, for example, 8 pixels per byte for 1 bit
+files.  This code expands to 1 pixel per byte without changing the
+values of the pixels:
+
+    if (bit_depth < 8)
+        png_set_packing(png_ptr);
+
+PNG files have possible bit depths of 1, 2, 4, 8, and 16.  All pixels
+stored in a PNG image have been "scaled" or "shifted" up to the next
+higher possible bit depth (e.g. from 5 bits/sample in the range [0,31] to
+8 bits/sample in the range [0, 255]).  However, it is also possible to
+convert the PNG pixel data back to the original bit depth of the image.
+This call reduces the pixels back down to the original bit depth:
+
+    png_color_8p sig_bit;
+
+    if (png_get_sBIT(png_ptr, info_ptr, &sig_bit))
+        png_set_shift(png_ptr, sig_bit);
+
+PNG files store 3-color pixels in red, green, blue order.  This code
+changes the storage of the pixels to blue, green, red:
+
+    if (color_type == PNG_COLOR_TYPE_RGB ||
+        color_type == PNG_COLOR_TYPE_RGB_ALPHA)
+        png_set_bgr(png_ptr);
+
+PNG files store RGB pixels packed into 3 or 6 bytes. This code expands them
+into 4 or 8 bytes for windowing systems that need them in this format:
+
+    if (color_type == PNG_COLOR_TYPE_RGB)
+        png_set_filler(png_ptr, filler, PNG_FILLER_BEFORE);
+
+where "filler" is the 8 or 16-bit number to fill with, and the location is
+either PNG_FILLER_BEFORE or PNG_FILLER_AFTER, depending upon whether
+you want the filler before the RGB or after.  This transformation
+does not affect images that already have full alpha channels.  To add an
+opaque alpha channel, use filler=0xff or 0xffff and PNG_FILLER_AFTER which
+will generate RGBA pixels.
+
+Note that png_set_filler() does not change the color type.  If you want
+to do that, you can add a true alpha channel with
+
+    if (color_type == PNG_COLOR_TYPE_RGB ||
+           color_type == PNG_COLOR_TYPE_GRAY)
+    png_set_add_alpha(png_ptr, filler, PNG_FILLER_AFTER);
+
+where "filler" contains the alpha value to assign to each pixel.
+This function was added in libpng-1.2.7.
+
+If you are reading an image with an alpha channel, and you need the
+data as ARGB instead of the normal PNG format RGBA:
+
+    if (color_type == PNG_COLOR_TYPE_RGB_ALPHA)
+        png_set_swap_alpha(png_ptr);
+
+For some uses, you may want a grayscale image to be represented as
+RGB.  This code will do that conversion:
+
+    if (color_type == PNG_COLOR_TYPE_GRAY ||
+        color_type == PNG_COLOR_TYPE_GRAY_ALPHA)
+          png_set_gray_to_rgb(png_ptr);
+
+Conversely, you can convert an RGB or RGBA image to grayscale or grayscale
+with alpha.
+
+    if (color_type == PNG_COLOR_TYPE_RGB ||
+        color_type == PNG_COLOR_TYPE_RGB_ALPHA)
+          png_set_rgb_to_gray_fixed(png_ptr, error_action,
+             int red_weight, int green_weight);
+
+    error_action = 1: silently do the conversion
+    error_action = 2: issue a warning if the original
+                      image has any pixel where
+                      red != green or red != blue
+    error_action = 3: issue an error and abort the
+                      conversion if the original
+                      image has any pixel where
+                      red != green or red != blue
+
+    red_weight:       weight of red component times 100000
+    green_weight:     weight of green component times 100000
+                      If either weight is negative, default
+                      weights (21268, 71514) are used.
+
+If you have set error_action = 1 or 2, you can
+later check whether the image really was gray, after processing
+the image rows, with the png_get_rgb_to_gray_status(png_ptr) function.
+It will return a png_byte that is zero if the image was gray or
+1 if there were any non-gray pixels.  bKGD and sBIT data
+will be silently converted to grayscale, using the green channel
+data, regardless of the error_action setting.
+
+With red_weight+green_weight<=100000,
+the normalized graylevel is computed:
+
+    int rw = red_weight * 65536;
+    int gw = green_weight * 65536;
+    int bw = 65536 - (rw + gw);
+    gray = (rw*red + gw*green + bw*blue)/65536;
+
+The default values approximate those recommended in the Charles
+Poynton's Color FAQ, <http://www.inforamp.net/~poynton/>
+Copyright (c) 1998-01-04 Charles Poynton <poynton at inforamp.net>
+
+    Y = 0.212671 * R + 0.715160 * G + 0.072169 * B
+
+Libpng approximates this with
+
+    Y = 0.21268 * R    + 0.7151 * G    + 0.07217 * B
+
+which can be expressed with integers as
+
+    Y = (6969 * R + 23434 * G + 2365 * B)/32768
+
+The calculation is done in a linear colorspace, if the image gamma
+is known.
+
+If you have a grayscale and you are using png_set_expand_depth(),
+png_set_expand(), or png_set_gray_to_rgb to change to truecolor or to
+a higher bit-depth, you must either supply the background color as a gray
+value at the original file bit-depth (need_expand = 1) or else supply the
+background color as an RGB triplet at the final, expanded bit depth
+(need_expand = 0).  Similarly, if you are reading a paletted image, you
+must either supply the background color as a palette index (need_expand = 1)
+or as an RGB triplet that may or may not be in the palette (need_expand = 0).
+
+    png_color_16 my_background;
+    png_color_16p image_background;
+
+    if (png_get_bKGD(png_ptr, info_ptr, &image_background))
+        png_set_background(png_ptr, image_background,
+          PNG_BACKGROUND_GAMMA_FILE, 1, 1.0);
+    else
+        png_set_background(png_ptr, &my_background,
+          PNG_BACKGROUND_GAMMA_SCREEN, 0, 1.0);
+
+The png_set_background() function tells libpng to composite images
+with alpha or simple transparency against the supplied background
+color.  If the PNG file contains a bKGD chunk (PNG_INFO_bKGD valid),
+you may use this color, or supply another color more suitable for
+the current display (e.g., the background color from a web page).  You
+need to tell libpng whether the color is in the gamma space of the
+display (PNG_BACKGROUND_GAMMA_SCREEN for colors you supply), the file
+(PNG_BACKGROUND_GAMMA_FILE for colors from the bKGD chunk), or one
+that is neither of these gammas (PNG_BACKGROUND_GAMMA_UNIQUE - I don't
+know why anyone would use this, but it's here).
+
+To properly display PNG images on any kind of system, the application needs
+to know what the display gamma is.  Ideally, the user will know this, and
+the application will allow them to set it.  One method of allowing the user
+to set the display gamma separately for each system is to check for a
+SCREEN_GAMMA or DISPLAY_GAMMA environment variable, which will hopefully be
+correctly set.
+
+Note that display_gamma is the overall gamma correction required to produce
+pleasing results, which depends on the lighting conditions in the surrounding
+environment.  In a dim or brightly lit room, no compensation other than
+the physical gamma exponent of the monitor is needed, while in a dark room
+a slightly smaller exponent is better.
+
+   double gamma, screen_gamma;
+
+   if (/* We have a user-defined screen
+       gamma value */)
+   {
+      screen_gamma = user_defined_screen_gamma;
+   }
+   /* One way that applications can share the same
+      screen gamma value */
+   else if ((gamma_str = getenv("SCREEN_GAMMA"))
+      != NULL)
+   {
+      screen_gamma = (double)atof(gamma_str);
+   }
+   /* If we don't have another value */
+   else
+   {
+      screen_gamma = 2.2; /* A good guess for a
+           PC monitor in a bright office or a dim room */
+      screen_gamma = 2.0; /* A good guess for a
+           PC monitor in a dark room */
+      screen_gamma = 1.7 or 1.0;  /* A good
+           guess for Mac systems */
+   }
+
+The png_set_gamma() function handles gamma transformations of the data.
+Pass both the file gamma and the current screen_gamma.  If the file does
+not have a gamma value, you can pass one anyway if you have an idea what
+it is (usually 0.45455 is a good guess for GIF images on PCs).  Note
+that file gammas are inverted from screen gammas.  See the discussions
+on gamma in the PNG specification for an excellent description of what
+gamma is, and why all applications should support it.  It is strongly
+recommended that PNG viewers support gamma correction.
+
+   if (png_get_gAMA(png_ptr, info_ptr, &gamma))
+      png_set_gamma(png_ptr, screen_gamma, gamma);
+   else
+      png_set_gamma(png_ptr, screen_gamma, 0.45455);
+
+If you need to reduce an RGB file to a paletted file, or if a paletted
+file has more entries then will fit on your screen, png_set_dither()
+will do that.  Note that this is a simple match dither that merely
+finds the closest color available.  This should work fairly well with
+optimized palettes, and fairly badly with linear color cubes.  If you
+pass a palette that is larger then maximum_colors, the file will
+reduce the number of colors in the palette so it will fit into
+maximum_colors.  If there is a histogram, it will use it to make
+more intelligent choices when reducing the palette.  If there is no
+histogram, it may not do as good a job.
+
+   if (color_type & PNG_COLOR_MASK_COLOR)
+   {
+      if (png_get_valid(png_ptr, info_ptr,
+         PNG_INFO_PLTE))
+      {
+         png_uint_16p histogram = NULL;
+
+         png_get_hIST(png_ptr, info_ptr,
+            &histogram);
+         png_set_dither(png_ptr, palette, num_palette,
+            max_screen_colors, histogram, 1);
+      }
+      else
+      {
+         png_color std_color_cube[MAX_SCREEN_COLORS] =
+            { ... colors ... };
+
+         png_set_dither(png_ptr, std_color_cube,
+            MAX_SCREEN_COLORS, MAX_SCREEN_COLORS,
+            NULL,0);
+      }
+   }
+
+PNG files describe monochrome as black being zero and white being one.
+The following code will reverse this (make black be one and white be
+zero):
+
+   if (bit_depth == 1 && color_type == PNG_COLOR_TYPE_GRAY)
+      png_set_invert_mono(png_ptr);
+
+This function can also be used to invert grayscale and gray-alpha images:
+
+   if (color_type == PNG_COLOR_TYPE_GRAY ||
+        color_type == PNG_COLOR_TYPE_GRAY_ALPHA)
+      png_set_invert_mono(png_ptr);
+
+PNG files store 16 bit pixels in network byte order (big-endian,
+ie. most significant bits first).  This code changes the storage to the
+other way (little-endian, i.e. least significant bits first, the
+way PCs store them):
+
+    if (bit_depth == 16)
+        png_set_swap(png_ptr);
+
+If you are using packed-pixel images (1, 2, or 4 bits/pixel), and you
+need to change the order the pixels are packed into bytes, you can use:
+
+    if (bit_depth < 8)
+       png_set_packswap(png_ptr);
+
+Finally, you can write your own transformation function if none of
+the existing ones meets your needs.  This is done by setting a callback
+with
+
+    png_set_read_user_transform_fn(png_ptr,
+       read_transform_fn);
+
+You must supply the function
+
+    void read_transform_fn(png_ptr ptr, row_info_ptr
+       row_info, png_bytep data)
+
+See pngtest.c for a working example.  Your function will be called
+after all of the other transformations have been processed.
+
+You can also set up a pointer to a user structure for use by your
+callback function, and you can inform libpng that your transform
+function will change the number of channels or bit depth with the
+function
+
+    png_set_user_transform_info(png_ptr, user_ptr,
+       user_depth, user_channels);
+
+The user's application, not libpng, is responsible for allocating and
+freeing any memory required for the user structure.
+
+You can retrieve the pointer via the function
+png_get_user_transform_ptr().  For example:
+
+    voidp read_user_transform_ptr =
+       png_get_user_transform_ptr(png_ptr);
+
+The last thing to handle is interlacing; this is covered in detail below,
+but you must call the function here if you want libpng to handle expansion
+of the interlaced image.
+
+    number_of_passes = png_set_interlace_handling(png_ptr);
+
+After setting the transformations, libpng can update your png_info
+structure to reflect any transformations you've requested with this
+call.  This is most useful to update the info structure's rowbytes
+field so you can use it to allocate your image memory.  This function
+will also update your palette with the correct screen_gamma and
+background if these have been given with the calls above.
+
+    png_read_update_info(png_ptr, info_ptr);
+
+After you call png_read_update_info(), you can allocate any
+memory you need to hold the image.  The row data is simply
+raw byte data for all forms of images.  As the actual allocation
+varies among applications, no example will be given.  If you
+are allocating one large chunk, you will need to build an
+array of pointers to each row, as it will be needed for some
+of the functions below.
+
+Reading image data
+
+After you've allocated memory, you can read the image data.
+The simplest way to do this is in one function call.  If you are
+allocating enough memory to hold the whole image, you can just
+call png_read_image() and libpng will read in all the image data
+and put it in the memory area supplied.  You will need to pass in
+an array of pointers to each row.
+
+This function automatically handles interlacing, so you don't need
+to call png_set_interlace_handling() or call this function multiple
+times, or any of that other stuff necessary with png_read_rows().
+
+   png_read_image(png_ptr, row_pointers);
+
+where row_pointers is:
+
+   png_bytep row_pointers[height];
+
+You can point to void or char or whatever you use for pixels.
+
+If you don't want to read in the whole image at once, you can
+use png_read_rows() instead.  If there is no interlacing (check
+interlace_type == PNG_INTERLACE_NONE), this is simple:
+
+    png_read_rows(png_ptr, row_pointers, NULL,
+       number_of_rows);
+
+where row_pointers is the same as in the png_read_image() call.
+
+If you are doing this just one row at a time, you can do this with
+a single row_pointer instead of an array of row_pointers:
+
+    png_bytep row_pointer = row;
+    png_read_row(png_ptr, row_pointer, NULL);
+
+If the file is interlaced (interlace_type != 0 in the IHDR chunk), things
+get somewhat harder.  The only current (PNG Specification version 1.2)
+interlacing type for PNG is (interlace_type == PNG_INTERLACE_ADAM7)
+is a somewhat complicated 2D interlace scheme, known as Adam7, that
+breaks down an image into seven smaller images of varying size, based
+on an 8x8 grid.
+
+libpng can fill out those images or it can give them to you "as is".
+If you want them filled out, there are two ways to do that.  The one
+mentioned in the PNG specification is to expand each pixel to cover
+those pixels that have not been read yet (the "rectangle" method).
+This results in a blocky image for the first pass, which gradually
+smooths out as more pixels are read.  The other method is the "sparkle"
+method, where pixels are drawn only in their final locations, with the
+rest of the image remaining whatever colors they were initialized to
+before the start of the read.  The first method usually looks better,
+but tends to be slower, as there are more pixels to put in the rows.
+
+If you don't want libpng to handle the interlacing details, just call
+png_read_rows() seven times to read in all seven images.  Each of the
+images is a valid image by itself, or they can all be combined on an
+8x8 grid to form a single image (although if you intend to combine them
+you would be far better off using the libpng interlace handling).
+
+The first pass will return an image 1/8 as wide as the entire image
+(every 8th column starting in column 0) and 1/8 as high as the original
+(every 8th row starting in row 0), the second will be 1/8 as wide
+(starting in column 4) and 1/8 as high (also starting in row 0).  The
+third pass will be 1/4 as wide (every 4th pixel starting in column 0) and
+1/8 as high (every 8th row starting in row 4), and the fourth pass will
+be 1/4 as wide and 1/4 as high (every 4th column starting in column 2,
+and every 4th row starting in row 0).  The fifth pass will return an
+image 1/2 as wide, and 1/4 as high (starting at column 0 and row 2),
+while the sixth pass will be 1/2 as wide and 1/2 as high as the original
+(starting in column 1 and row 0).  The seventh and final pass will be as
+wide as the original, and 1/2 as high, containing all of the odd
+numbered scanlines.  Phew!
+
+If you want libpng to expand the images, call this before calling
+png_start_read_image() or png_read_update_info():
+
+    if (interlace_type == PNG_INTERLACE_ADAM7)
+        number_of_passes
+           = png_set_interlace_handling(png_ptr);
+
+This will return the number of passes needed.  Currently, this
+is seven, but may change if another interlace type is added.
+This function can be called even if the file is not interlaced,
+where it will return one pass.
+
+If you are not going to display the image after each pass, but are
+going to wait until the entire image is read in, use the sparkle
+effect.  This effect is faster and the end result of either method
+is exactly the same.  If you are planning on displaying the image
+after each pass, the "rectangle" effect is generally considered the
+better looking one.
+
+If you only want the "sparkle" effect, just call png_read_rows() as
+normal, with the third parameter NULL.  Make sure you make pass over
+the image number_of_passes times, and you don't change the data in the
+rows between calls.  You can change the locations of the data, just
+not the data.  Each pass only writes the pixels appropriate for that
+pass, and assumes the data from previous passes is still valid.
+
+    png_read_rows(png_ptr, row_pointers, NULL,
+       number_of_rows);
+
+If you only want the first effect (the rectangles), do the same as
+before except pass the row buffer in the third parameter, and leave
+the second parameter NULL.
+
+    png_read_rows(png_ptr, NULL, row_pointers,
+       number_of_rows);
+
+Finishing a sequential read
+
+After you are finished reading the image through the
+low-level interface, you can finish reading the file.  If you are
+interested in comments or time, which may be stored either before or
+after the image data, you should pass the separate png_info struct if
+you want to keep the comments from before and after the image
+separate.  If you are not interested, you can pass NULL.
+
+   png_read_end(png_ptr, end_info);
+
+When you are done, you can free all memory allocated by libpng like this:
+
+   png_destroy_read_struct(&png_ptr, &info_ptr,
+       &end_info);
+
+It is also possible to individually free the info_ptr members that
+point to libpng-allocated storage with the following function:
+
+    png_free_data(png_ptr, info_ptr, mask, seq)
+    mask - identifies data to be freed, a mask
+           containing the bitwise OR of one or
+           more of
+             PNG_FREE_PLTE, PNG_FREE_TRNS,
+             PNG_FREE_HIST, PNG_FREE_ICCP,
+             PNG_FREE_PCAL, PNG_FREE_ROWS,
+             PNG_FREE_SCAL, PNG_FREE_SPLT,
+             PNG_FREE_TEXT, PNG_FREE_UNKN,
+           or simply PNG_FREE_ALL
+    seq  - sequence number of item to be freed
+           (-1 for all items)
+
+This function may be safely called when the relevant storage has
+already been freed, or has not yet been allocated, or was allocated
+by the user and not by libpng,  and will in those
+cases do nothing.  The "seq" parameter is ignored if only one item
+of the selected data type, such as PLTE, is allowed.  If "seq" is not
+-1, and multiple items are allowed for the data type identified in
+the mask, such as text or sPLT, only the n'th item in the structure
+is freed, where n is "seq".
+
+The default behavior is only to free data that was allocated internally
+by libpng.  This can be changed, so that libpng will not free the data,
+or so that it will free data that was allocated by the user with png_malloc()
+or png_zalloc() and passed in via a png_set_*() function, with
+
+    png_data_freer(png_ptr, info_ptr, freer, mask)
+    mask   - which data elements are affected
+             same choices as in png_free_data()
+    freer  - one of
+               PNG_DESTROY_WILL_FREE_DATA
+               PNG_SET_WILL_FREE_DATA
+               PNG_USER_WILL_FREE_DATA
+
+This function only affects data that has already been allocated.
+You can call this function after reading the PNG data but before calling
+any png_set_*() functions, to control whether the user or the png_set_*()
+function is responsible for freeing any existing data that might be present,
+and again after the png_set_*() functions to control whether the user
+or png_destroy_*() is supposed to free the data.  When the user assumes
+responsibility for libpng-allocated data, the application must use
+png_free() to free it, and when the user transfers responsibility to libpng
+for data that the user has allocated, the user must have used png_malloc()
+or png_zalloc() to allocate it.
+
+If you allocated your row_pointers in a single block, as suggested above in
+the description of the high level read interface, you must not transfer
+responsibility for freeing it to the png_set_rows or png_read_destroy function,
+because they would also try to free the individual row_pointers[i].
+
+If you allocated text_ptr.text, text_ptr.lang, and text_ptr.translated_keyword
+separately, do not transfer responsibility for freeing text_ptr to libpng,
+because when libpng fills a png_text structure it combines these members with
+the key member, and png_free_data() will free only text_ptr.key.  Similarly,
+if you transfer responsibility for free'ing text_ptr from libpng to your
+application, your application must not separately free those members.
+
+The png_free_data() function will turn off the "valid" flag for anything
+it frees.  If you need to turn the flag off for a chunk that was freed by your
+application instead of by libpng, you can use
+
+    png_set_invalid(png_ptr, info_ptr, mask);
+    mask - identifies the chunks to be made invalid,
+           containing the bitwise OR of one or
+           more of
+             PNG_INFO_gAMA, PNG_INFO_sBIT,
+             PNG_INFO_cHRM, PNG_INFO_PLTE,
+             PNG_INFO_tRNS, PNG_INFO_bKGD,
+             PNG_INFO_hIST, PNG_INFO_pHYs,
+             PNG_INFO_oFFs, PNG_INFO_tIME,
+             PNG_INFO_pCAL, PNG_INFO_sRGB,
+             PNG_INFO_iCCP, PNG_INFO_sPLT,
+             PNG_INFO_sCAL, PNG_INFO_IDAT
+
+For a more compact example of reading a PNG image, see the file example.c.
+
+Reading PNG files progressively
+
+The progressive reader is slightly different then the non-progressive
+reader.  Instead of calling png_read_info(), png_read_rows(), and
+png_read_end(), you make one call to png_process_data(), which calls
+callbacks when it has the info, a row, or the end of the image.  You
+set up these callbacks with png_set_progressive_read_fn().  You don't
+have to worry about the input/output functions of libpng, as you are
+giving the library the data directly in png_process_data().  I will
+assume that you have read the section on reading PNG files above,
+so I will only highlight the differences (although I will show
+all of the code).
+
+png_structp png_ptr;
+png_infop info_ptr;
+
+ /*  An example code fragment of how you would
+     initialize the progressive reader in your
+     application. */
+ int
+ initialize_png_reader()
+ {
+    png_ptr = png_create_read_struct
+        (PNG_LIBPNG_VER_STRING, (png_voidp)user_error_ptr,
+         user_error_fn, user_warning_fn);
+    if (!png_ptr)
+        return (ERROR);
+    info_ptr = png_create_info_struct(png_ptr);
+    if (!info_ptr)
+    {
+        png_destroy_read_struct(&png_ptr, (png_infopp)NULL,
+           (png_infopp)NULL);
+        return (ERROR);
+    }
+
+    if (setjmp(png_jmpbuf(png_ptr)))
+    {
+        png_destroy_read_struct(&png_ptr, &info_ptr,
+           (png_infopp)NULL);
+        return (ERROR);
+    }
+
+    /* This one's new.  You can provide functions
+       to be called when the header info is valid,
+       when each row is completed, and when the image
+       is finished.  If you aren't using all functions,
+       you can specify NULL parameters.  Even when all
+       three functions are NULL, you need to call
+       png_set_progressive_read_fn().  You can use
+       any struct as the user_ptr (cast to a void pointer
+       for the function call), and retrieve the pointer
+       from inside the callbacks using the function
+
+          png_get_progressive_ptr(png_ptr);
+
+       which will return a void pointer, which you have
+       to cast appropriately.
+     */
+    png_set_progressive_read_fn(png_ptr, (void *)user_ptr,
+        info_callback, row_callback, end_callback);
+
+    return 0;
+ }
+
+ /* A code fragment that you call as you receive blocks
+   of data */
+ int
+ process_data(png_bytep buffer, png_uint_32 length)
+ {
+    if (setjmp(png_jmpbuf(png_ptr)))
+    {
+        png_destroy_read_struct(&png_ptr, &info_ptr,
+           (png_infopp)NULL);
+        return (ERROR);
+    }
+
+    /* This one's new also.  Simply give it a chunk
+       of data from the file stream (in order, of
+       course).  On machines with segmented memory
+       models machines, don't give it any more than
+       64K.  The library seems to run fine with sizes
+       of 4K. Although you can give it much less if
+       necessary (I assume you can give it chunks of
+       1 byte, I haven't tried less then 256 bytes
+       yet).  When this function returns, you may
+       want to display any rows that were generated
+       in the row callback if you don't already do
+       so there.
+     */
+    png_process_data(png_ptr, info_ptr, buffer, length);
+    return 0;
+ }
+
+ /* This function is called (as set by
+    png_set_progressive_read_fn() above) when enough data
+    has been supplied so all of the header has been
+    read.
+ */
+ void
+ info_callback(png_structp png_ptr, png_infop info)
+ {
+    /* Do any setup here, including setting any of
+       the transformations mentioned in the Reading
+       PNG files section.  For now, you _must_ call
+       either png_start_read_image() or
+       png_read_update_info() after all the
+       transformations are set (even if you don't set
+       any).  You may start getting rows before
+       png_process_data() returns, so this is your
+       last chance to prepare for that.
+     */
+ }
+
+ /* This function is called when each row of image
+    data is complete */
+ void
+ row_callback(png_structp png_ptr, png_bytep new_row,
+    png_uint_32 row_num, int pass)
+ {
+    /* If the image is interlaced, and you turned
+       on the interlace handler, this function will
+       be called for every row in every pass.  Some
+       of these rows will not be changed from the
+       previous pass.  When the row is not changed,
+       the new_row variable will be NULL.  The rows
+       and passes are called in order, so you don't
+       really need the row_num and pass, but I'm
+       supplying them because it may make your life
+       easier.
+
+       For the non-NULL rows of interlaced images,
+       you must call png_progressive_combine_row()
+       passing in the row and the old row.  You can
+       call this function for NULL rows (it will just
+       return) and for non-interlaced images (it just
+       does the memcpy for you) if it will make the
+       code easier.  Thus, you can just do this for
+       all cases:
+     */
+
+        png_progressive_combine_row(png_ptr, old_row,
+          new_row);
+
+    /* where old_row is what was displayed for
+       previously for the row.  Note that the first
+       pass (pass == 0, really) will completely cover
+       the old row, so the rows do not have to be
+       initialized.  After the first pass (and only
+       for interlaced images), you will have to pass
+       the current row, and the function will combine
+       the old row and the new row.
+    */
+ }
+
+ void
+ end_callback(png_structp png_ptr, png_infop info)
+ {
+    /* This function is called after the whole image
+       has been read, including any chunks after the
+       image (up to and including the IEND).  You
+       will usually have the same info chunk as you
+       had in the header, although some data may have
+       been added to the comments and time fields.
+
+       Most people won't do much here, perhaps setting
+       a flag that marks the image as finished.
+     */
+ }
+
+
+
+IV. Writing
+
+Much of this is very similar to reading.  However, everything of
+importance is repeated here, so you won't have to constantly look
+back up in the reading section to understand writing.
+
+Setup
+
+You will want to do the I/O initialization before you get into libpng,
+so if it doesn't work, you don't have anything to undo. If you are not
+using the standard I/O functions, you will need to replace them with
+custom writing functions.  See the discussion under Customizing libpng.
+
+    FILE *fp = fopen(file_name, "wb");
+    if (!fp)
+    {
+       return (ERROR);
+    }
+
+Next, png_struct and png_info need to be allocated and initialized.
+As these can be both relatively large, you may not want to store these
+on the stack, unless you have stack space to spare.  Of course, you
+will want to check if they return NULL.  If you are also reading,
+you won't want to name your read structure and your write structure
+both "png_ptr"; you can call them anything you like, such as
+"read_ptr" and "write_ptr".  Look at pngtest.c, for example.
+
+    png_structp png_ptr = png_create_write_struct
+       (PNG_LIBPNG_VER_STRING, (png_voidp)user_error_ptr,
+        user_error_fn, user_warning_fn);
+    if (!png_ptr)
+       return (ERROR);
+
+    png_infop info_ptr = png_create_info_struct(png_ptr);
+    if (!info_ptr)
+    {
+       png_destroy_write_struct(&png_ptr,
+         (png_infopp)NULL);
+       return (ERROR);
+    }
+
+If you want to use your own memory allocation routines,
+define PNG_USER_MEM_SUPPORTED and use
+png_create_write_struct_2() instead of png_create_write_struct():
+
+    png_structp png_ptr = png_create_write_struct_2
+       (PNG_LIBPNG_VER_STRING, (png_voidp)user_error_ptr,
+        user_error_fn, user_warning_fn, (png_voidp)
+        user_mem_ptr, user_malloc_fn, user_free_fn);
+
+After you have these structures, you will need to set up the
+error handling.  When libpng encounters an error, it expects to
+longjmp() back to your routine.  Therefore, you will need to call
+setjmp() and pass the png_jmpbuf(png_ptr).  If you
+write the file from different routines, you will need to update
+the png_jmpbuf(png_ptr) every time you enter a new routine that will
+call a png_*() function.  See your documentation of setjmp/longjmp
+for your compiler for more information on setjmp/longjmp.  See
+the discussion on libpng error handling in the Customizing Libpng
+section below for more information on the libpng error handling.
+
+    if (setjmp(png_jmpbuf(png_ptr)))
+    {
+       png_destroy_write_struct(&png_ptr, &info_ptr);
+       fclose(fp);
+       return (ERROR);
+    }
+    ...
+    return;
+
+If you would rather avoid the complexity of setjmp/longjmp issues,
+you can compile libpng with PNG_SETJMP_NOT_SUPPORTED, in which case
+errors will result in a call to PNG_ABORT() which defaults to abort().
+
+Now you need to set up the output code.  The default for libpng is to
+use the C function fwrite().  If you use this, you will need to pass a
+valid FILE * in the function png_init_io().  Be sure that the file is
+opened in binary mode.  Again, if you wish to handle writing data in
+another way, see the discussion on libpng I/O handling in the Customizing
+Libpng section below.
+
+    png_init_io(png_ptr, fp);
+
+If you are embedding your PNG into a datastream such as MNG, and don't
+want libpng to write the 8-byte signature, or if you have already
+written the signature in your application, use
+
+    png_set_sig_bytes(png_ptr, 8);
+
+to inform libpng that it should not write a signature.
+
+Write callbacks
+
+At this point, you can set up a callback function that will be
+called after each row has been written, which you can use to control
+a progress meter or the like.  It's demonstrated in pngtest.c.
+You must supply a function
+
+    void write_row_callback(png_ptr, png_uint_32 row,
+       int pass);
+    {
+      /* put your code here */
+    }
+
+(You can give it another name that you like instead of "write_row_callback")
+
+To inform libpng about your function, use
+
+    png_set_write_status_fn(png_ptr, write_row_callback);
+
+You now have the option of modifying how the compression library will
+run.  The following functions are mainly for testing, but may be useful
+in some cases, like if you need to write PNG files extremely fast and
+are willing to give up some compression, or if you want to get the
+maximum possible compression at the expense of slower writing.  If you
+have no special needs in this area, let the library do what it wants by
+not calling this function at all, as it has been tuned to deliver a good
+speed/compression ratio. The second parameter to png_set_filter() is
+the filter method, for which the only valid values are 0 (as of the
+July 1999 PNG specification, version 1.2) or 64 (if you are writing
+a PNG datastream that is to be embedded in a MNG datastream).  The third
+parameter is a flag that indicates which filter type(s) are to be tested
+for each scanline.  See the PNG specification for details on the specific filter
+types.
+
+
+    /* turn on or off filtering, and/or choose
+       specific filters.  You can use either a single
+       PNG_FILTER_VALUE_NAME or the bitwise OR of one
+       or more PNG_FILTER_NAME masks. */
+    png_set_filter(png_ptr, 0,
+       PNG_FILTER_NONE  | PNG_FILTER_VALUE_NONE |
+       PNG_FILTER_SUB   | PNG_FILTER_VALUE_SUB  |
+       PNG_FILTER_UP    | PNG_FILTER_VALUE_UP   |
+       PNG_FILTER_AVE   | PNG_FILTER_VALUE_AVE  |
+       PNG_FILTER_PAETH | PNG_FILTER_VALUE_PAETH|
+       PNG_ALL_FILTERS);
+
+If an application
+wants to start and stop using particular filters during compression,
+it should start out with all of the filters (to ensure that the previous
+row of pixels will be stored in case it's needed later), and then add
+and remove them after the start of compression.
+
+If you are writing a PNG datastream that is to be embedded in a MNG
+datastream, the second parameter can be either 0 or 64.
+
+The png_set_compression_*() functions interface to the zlib compression
+library, and should mostly be ignored unless you really know what you are
+doing.  The only generally useful call is png_set_compression_level()
+which changes how much time zlib spends on trying to compress the image
+data.  See the Compression Library (zlib.h and algorithm.txt, distributed
+with zlib) for details on the compression levels.
+
+    /* set the zlib compression level */
+    png_set_compression_level(png_ptr,
+        Z_BEST_COMPRESSION);
+
+    /* set other zlib parameters */
+    png_set_compression_mem_level(png_ptr, 8);
+    png_set_compression_strategy(png_ptr,
+        Z_DEFAULT_STRATEGY);
+    png_set_compression_window_bits(png_ptr, 15);
+    png_set_compression_method(png_ptr, 8);
+    png_set_compression_buffer_size(png_ptr, 8192)
+
+extern PNG_EXPORT(void,png_set_zbuf_size)
+
+Setting the contents of info for output
+
+You now need to fill in the png_info structure with all the data you
+wish to write before the actual image.  Note that the only thing you
+are allowed to write after the image is the text chunks and the time
+chunk (as of PNG Specification 1.2, anyway).  See png_write_end() and
+the latest PNG specification for more information on that.  If you
+wish to write them before the image, fill them in now, and flag that
+data as being valid.  If you want to wait until after the data, don't
+fill them until png_write_end().  For all the fields in png_info and
+their data types, see png.h.  For explanations of what the fields
+contain, see the PNG specification.
+
+Some of the more important parts of the png_info are:
+
+    png_set_IHDR(png_ptr, info_ptr, width, height,
+       bit_depth, color_type, interlace_type,
+       compression_type, filter_method)
+    width          - holds the width of the image
+                     in pixels (up to 2^31).
+    height         - holds the height of the image
+                     in pixels (up to 2^31).
+    bit_depth      - holds the bit depth of one of the
+                     image channels.
+                     (valid values are 1, 2, 4, 8, 16
+                     and depend also on the
+                     color_type.  See also significant
+                     bits (sBIT) below).
+    color_type     - describes which color/alpha
+                     channels are present.
+                     PNG_COLOR_TYPE_GRAY
+                        (bit depths 1, 2, 4, 8, 16)
+                     PNG_COLOR_TYPE_GRAY_ALPHA
+                        (bit depths 8, 16)
+                     PNG_COLOR_TYPE_PALETTE
+                        (bit depths 1, 2, 4, 8)
+                     PNG_COLOR_TYPE_RGB
+                        (bit_depths 8, 16)
+                     PNG_COLOR_TYPE_RGB_ALPHA
+                        (bit_depths 8, 16)
+
+                     PNG_COLOR_MASK_PALETTE
+                     PNG_COLOR_MASK_COLOR
+                     PNG_COLOR_MASK_ALPHA
+
+    interlace_type - PNG_INTERLACE_NONE or
+                     PNG_INTERLACE_ADAM7
+    compression_type - (must be
+                     PNG_COMPRESSION_TYPE_DEFAULT)
+    filter_method  - (must be PNG_FILTER_TYPE_DEFAULT
+                     or, if you are writing a PNG to
+                     be embedded in a MNG datastream,
+                     can also be
+                     PNG_INTRAPIXEL_DIFFERENCING)
+
+    png_set_PLTE(png_ptr, info_ptr, palette,
+       num_palette);
+    palette        - the palette for the file
+                     (array of png_color)
+    num_palette    - number of entries in the palette
+
+    png_set_gAMA(png_ptr, info_ptr, gamma);
+    gamma          - the gamma the image was created
+                     at (PNG_INFO_gAMA)
+
+    png_set_sRGB(png_ptr, info_ptr, srgb_intent);
+    srgb_intent    - the rendering intent
+                     (PNG_INFO_sRGB) The presence of
+                     the sRGB chunk means that the pixel
+                     data is in the sRGB color space.
+                     This chunk also implies specific
+                     values of gAMA and cHRM.  Rendering
+                     intent is the CSS-1 property that
+                     has been defined by the International
+                     Color Consortium
+                     (http://www.color.org).
+                     It can be one of
+                     PNG_sRGB_INTENT_SATURATION,
+                     PNG_sRGB_INTENT_PERCEPTUAL,
+                     PNG_sRGB_INTENT_ABSOLUTE, or
+                     PNG_sRGB_INTENT_RELATIVE.
+
+
+    png_set_sRGB_gAMA_and_cHRM(png_ptr, info_ptr,
+       srgb_intent);
+    srgb_intent    - the rendering intent
+                     (PNG_INFO_sRGB) The presence of the
+                     sRGB chunk means that the pixel
+                     data is in the sRGB color space.
+                     This function also causes gAMA and
+                     cHRM chunks with the specific values
+                     that are consistent with sRGB to be
+                     written.
+
+    png_set_iCCP(png_ptr, info_ptr, name, compression_type,
+                      profile, proflen);
+    name            - The profile name.
+    compression     - The compression type; always
+                      PNG_COMPRESSION_TYPE_BASE for PNG 1.0.
+                      You may give NULL to this argument to
+                      ignore it.
+    profile         - International Color Consortium color
+                      profile data. May contain NULs.
+    proflen         - length of profile data in bytes.
+
+    png_set_sBIT(png_ptr, info_ptr, sig_bit);
+    sig_bit        - the number of significant bits for
+                     (PNG_INFO_sBIT) each of the gray, red,
+                     green, and blue channels, whichever are
+                     appropriate for the given color type
+                     (png_color_16)
+
+    png_set_tRNS(png_ptr, info_ptr, trans, num_trans,
+       trans_values);
+    trans          - array of transparent entries for
+                     palette (PNG_INFO_tRNS)
+    trans_values   - graylevel or color sample values of
+                     the single transparent color for
+                     non-paletted images (PNG_INFO_tRNS)
+    num_trans      - number of transparent entries
+                     (PNG_INFO_tRNS)
+
+    png_set_hIST(png_ptr, info_ptr, hist);
+                    (PNG_INFO_hIST)
+    hist           - histogram of palette (array of
+                     png_uint_16)
+
+    png_set_tIME(png_ptr, info_ptr, mod_time);
+    mod_time       - time image was last modified
+                     (PNG_VALID_tIME)
+
+    png_set_bKGD(png_ptr, info_ptr, background);
+    background     - background color (PNG_VALID_bKGD)
+
+    png_set_text(png_ptr, info_ptr, text_ptr, num_text);
+    text_ptr       - array of png_text holding image
+                     comments
+    text_ptr[i].compression - type of compression used
+                 on "text" PNG_TEXT_COMPRESSION_NONE
+                           PNG_TEXT_COMPRESSION_zTXt
+                           PNG_ITXT_COMPRESSION_NONE
+                           PNG_ITXT_COMPRESSION_zTXt
+    text_ptr[i].key   - keyword for comment.  Must contain
+                 1-79 characters.
+    text_ptr[i].text  - text comments for current
+                         keyword.  Can be NULL or empty.
+    text_ptr[i].text_length - length of text string,
+                 after decompression, 0 for iTXt
+    text_ptr[i].itxt_length - length of itxt string,
+                 after decompression, 0 for tEXt/zTXt
+    text_ptr[i].lang  - language of comment (NULL or
+                         empty for unknown).
+    text_ptr[i].translated_keyword  - keyword in UTF-8 (NULL
+                         or empty for unknown).
+    num_text       - number of comments
+
+    png_set_sPLT(png_ptr, info_ptr, &palette_ptr,
+       num_spalettes);
+    palette_ptr    - array of png_sPLT_struct structures
+                     to be added to the list of palettes
+                     in the info structure.
+    num_spalettes  - number of palette structures to be
+                     added.
+
+    png_set_oFFs(png_ptr, info_ptr, offset_x, offset_y,
+        unit_type);
+    offset_x  - positive offset from the left
+                     edge of the screen
+    offset_y  - positive offset from the top
+                     edge of the screen
+    unit_type - PNG_OFFSET_PIXEL, PNG_OFFSET_MICROMETER
+
+    png_set_pHYs(png_ptr, info_ptr, res_x, res_y,
+        unit_type);
+    res_x       - pixels/unit physical resolution
+                  in x direction
+    res_y       - pixels/unit physical resolution
+                  in y direction
+    unit_type   - PNG_RESOLUTION_UNKNOWN,
+                  PNG_RESOLUTION_METER
+
+    png_set_sCAL(png_ptr, info_ptr, unit, width, height)
+    unit        - physical scale units (an integer)
+    width       - width of a pixel in physical scale units
+    height      - height of a pixel in physical scale units
+                  (width and height are doubles)
+
+    png_set_sCAL_s(png_ptr, info_ptr, unit, width, height)
+    unit        - physical scale units (an integer)
+    width       - width of a pixel in physical scale units
+    height      - height of a pixel in physical scale units
+                 (width and height are strings like "2.54")
+
+    png_set_unknown_chunks(png_ptr, info_ptr, &unknowns,
+       num_unknowns)
+    unknowns          - array of png_unknown_chunk
+                        structures holding unknown chunks
+    unknowns[i].name  - name of unknown chunk
+    unknowns[i].data  - data of unknown chunk
+    unknowns[i].size  - size of unknown chunk's data
+    unknowns[i].location - position to write chunk in file
+                           0: do not write chunk
+                           PNG_HAVE_IHDR: before PLTE
+                           PNG_HAVE_PLTE: before IDAT
+                           PNG_AFTER_IDAT: after IDAT
+
+The "location" member is set automatically according to
+what part of the output file has already been written.
+You can change its value after calling png_set_unknown_chunks()
+as demonstrated in pngtest.c.  Within each of the "locations",
+the chunks are sequenced according to their position in the
+structure (that is, the value of "i", which is the order in which
+the chunk was either read from the input file or defined with
+png_set_unknown_chunks).
+
+A quick word about text and num_text.  text is an array of png_text
+structures.  num_text is the number of valid structures in the array.
+Each png_text structure holds a language code, a keyword, a text value,
+and a compression type.
+
+The compression types have the same valid numbers as the compression
+types of the image data.  Currently, the only valid number is zero.
+However, you can store text either compressed or uncompressed, unlike
+images, which always have to be compressed.  So if you don't want the
+text compressed, set the compression type to PNG_TEXT_COMPRESSION_NONE.
+Because tEXt and zTXt chunks don't have a language field, if you
+specify PNG_TEXT_COMPRESSION_NONE or PNG_TEXT_COMPRESSION_zTXt
+any language code or translated keyword will not be written out.
+
+Until text gets around 1000 bytes, it is not worth compressing it.
+After the text has been written out to the file, the compression type
+is set to PNG_TEXT_COMPRESSION_NONE_WR or PNG_TEXT_COMPRESSION_zTXt_WR,
+so that it isn't written out again at the end (in case you are calling
+png_write_end() with the same struct.
+
+The keywords that are given in the PNG Specification are:
+
+    Title            Short (one line) title or
+                     caption for image
+    Author           Name of image's creator
+    Description      Description of image (possibly long)
+    Copyright        Copyright notice
+    Creation Time    Time of original image creation
+                     (usually RFC 1123 format, see below)
+    Software         Software used to create the image
+    Disclaimer       Legal disclaimer
+    Warning          Warning of nature of content
+    Source           Device used to create the image
+    Comment          Miscellaneous comment; conversion
+                     from other image format
+
+The keyword-text pairs work like this.  Keywords should be short
+simple descriptions of what the comment is about.  Some typical
+keywords are found in the PNG specification, as is some recommendations
+on keywords.  You can repeat keywords in a file.  You can even write
+some text before the image and some after.  For example, you may want
+to put a description of the image before the image, but leave the
+disclaimer until after, so viewers working over modem connections
+don't have to wait for the disclaimer to go over the modem before
+they start seeing the image.  Finally, keywords should be full
+words, not abbreviations.  Keywords and text are in the ISO 8859-1
+(Latin-1) character set (a superset of regular ASCII) and can not
+contain NUL characters, and should not contain control or other
+unprintable characters.  To make the comments widely readable, stick
+with basic ASCII, and avoid machine specific character set extensions
+like the IBM-PC character set.  The keyword must be present, but
+you can leave off the text string on non-compressed pairs.
+Compressed pairs must have a text string, as only the text string
+is compressed anyway, so the compression would be meaningless.
+
+PNG supports modification time via the png_time structure.  Two
+conversion routines are provided, png_convert_from_time_t() for
+time_t and png_convert_from_struct_tm() for struct tm.  The
+time_t routine uses gmtime().  You don't have to use either of
+these, but if you wish to fill in the png_time structure directly,
+you should provide the time in universal time (GMT) if possible
+instead of your local time.  Note that the year number is the full
+year (e.g. 1998, rather than 98 - PNG is year 2000 compliant!), and
+that months start with 1.
+
+If you want to store the time of the original image creation, you should
+use a plain tEXt chunk with the "Creation Time" keyword.  This is
+necessary because the "creation time" of a PNG image is somewhat vague,
+depending on whether you mean the PNG file, the time the image was
+created in a non-PNG format, a still photo from which the image was
+scanned, or possibly the subject matter itself.  In order to facilitate
+machine-readable dates, it is recommended that the "Creation Time"
+tEXt chunk use RFC 1123 format dates (e.g. "22 May 1997 18:07:10 GMT"),
+although this isn't a requirement.  Unlike the tIME chunk, the
+"Creation Time" tEXt chunk is not expected to be automatically changed
+by the software.  To facilitate the use of RFC 1123 dates, a function
+png_convert_to_rfc1123(png_timep) is provided to convert from PNG
+time to an RFC 1123 format string.
+
+Writing unknown chunks
+
+You can use the png_set_unknown_chunks function to queue up chunks
+for writing.  You give it a chunk name, raw data, and a size; that's
+all there is to it.  The chunks will be written by the next following
+png_write_info_before_PLTE, png_write_info, or png_write_end function.
+Any chunks previously read into the info structure's unknown-chunk
+list will also be written out in a sequence that satisfies the PNG
+specification's ordering rules.
+
+The high-level write interface
+
+At this point there are two ways to proceed; through the high-level
+write interface, or through a sequence of low-level write operations.
+You can use the high-level interface if your image data is present
+in the info structure.  All defined output
+transformations are permitted, enabled by the following masks.
+
+    PNG_TRANSFORM_IDENTITY      No transformation
+    PNG_TRANSFORM_PACKING       Pack 1, 2 and 4-bit samples
+    PNG_TRANSFORM_PACKSWAP      Change order of packed
+                                pixels to LSB first
+    PNG_TRANSFORM_INVERT_MONO   Invert monochrome images
+    PNG_TRANSFORM_SHIFT         Normalize pixels to the
+                                sBIT depth
+    PNG_TRANSFORM_BGR           Flip RGB to BGR, RGBA
+                                to BGRA
+    PNG_TRANSFORM_SWAP_ALPHA    Flip RGBA to ARGB or GA
+                                to AG
+    PNG_TRANSFORM_INVERT_ALPHA  Change alpha from opacity
+                                to transparency
+    PNG_TRANSFORM_SWAP_ENDIAN   Byte-swap 16-bit samples
+    PNG_TRANSFORM_STRIP_FILLER  Strip out filler bytes.
+
+If you have valid image data in the info structure (you can use
+png_set_rows() to put image data in the info structure), simply do this:
+
+    png_write_png(png_ptr, info_ptr, png_transforms, NULL)
+
+where png_transforms is an integer containing the bitwise OR of some set of
+transformation flags.  This call is equivalent to png_write_info(),
+followed the set of transformations indicated by the transform mask,
+then png_write_image(), and finally png_write_end().
+
+(The final parameter of this call is not yet used.  Someday it might point
+to transformation parameters required by some future output transform.)
+
+You must use png_transforms and not call any png_set_transform() functions
+when you use png_write_png().
+
+The low-level write interface
+
+If you are going the low-level route instead, you are now ready to
+write all the file information up to the actual image data.  You do
+this with a call to png_write_info().
+
+    png_write_info(png_ptr, info_ptr);
+
+Note that there is one transformation you may need to do before
+png_write_info().  In PNG files, the alpha channel in an image is the
+level of opacity.  If your data is supplied as a level of
+transparency, you can invert the alpha channel before you write it, so
+that 0 is fully transparent and 255 (in 8-bit or paletted images) or
+65535 (in 16-bit images) is fully opaque, with
+
+    png_set_invert_alpha(png_ptr);
+
+This must appear before png_write_info() instead of later with the
+other transformations because in the case of paletted images the tRNS
+chunk data has to be inverted before the tRNS chunk is written.  If
+your image is not a paletted image, the tRNS data (which in such cases
+represents a single color to be rendered as transparent) won't need to
+be changed, and you can safely do this transformation after your
+png_write_info() call.
+
+If you need to write a private chunk that you want to appear before
+the PLTE chunk when PLTE is present, you can write the PNG info in
+two steps, and insert code to write your own chunk between them:
+
+    png_write_info_before_PLTE(png_ptr, info_ptr);
+    png_set_unknown_chunks(png_ptr, info_ptr, ...);
+    png_write_info(png_ptr, info_ptr);
+
+After you've written the file information, you can set up the library
+to handle any special transformations of the image data.  The various
+ways to transform the data will be described in the order that they
+should occur.  This is important, as some of these change the color
+type and/or bit depth of the data, and some others only work on
+certain color types and bit depths.  Even though each transformation
+checks to see if it has data that it can do something with, you should
+make sure to only enable a transformation if it will be valid for the
+data.  For example, don't swap red and blue on grayscale data.
+
+PNG files store RGB pixels packed into 3 or 6 bytes.  This code tells
+the library to strip input data that has 4 or 8 bytes per pixel down
+to 3 or 6 bytes (or strip 2 or 4-byte grayscale+filler data to 1 or 2
+bytes per pixel).
+
+    png_set_filler(png_ptr, 0, PNG_FILLER_BEFORE);
+
+where the 0 is unused, and the location is either PNG_FILLER_BEFORE or
+PNG_FILLER_AFTER, depending upon whether the filler byte in the pixel
+is stored XRGB or RGBX.
+
+PNG files pack pixels of bit depths 1, 2, and 4 into bytes as small as
+they can, resulting in, for example, 8 pixels per byte for 1 bit files.
+If the data is supplied at 1 pixel per byte, use this code, which will
+correctly pack the pixels into a single byte:
+
+    png_set_packing(png_ptr);
+
+PNG files reduce possible bit depths to 1, 2, 4, 8, and 16.  If your
+data is of another bit depth, you can write an sBIT chunk into the
+file so that decoders can recover the original data if desired.
+
+    /* Set the true bit depth of the image data */
+    if (color_type & PNG_COLOR_MASK_COLOR)
+    {
+        sig_bit.red = true_bit_depth;
+        sig_bit.green = true_bit_depth;
+        sig_bit.blue = true_bit_depth;
+    }
+    else
+    {
+        sig_bit.gray = true_bit_depth;
+    }
+    if (color_type & PNG_COLOR_MASK_ALPHA)
+    {
+        sig_bit.alpha = true_bit_depth;
+    }
+
+    png_set_sBIT(png_ptr, info_ptr, &sig_bit);
+
+If the data is stored in the row buffer in a bit depth other than
+one supported by PNG (e.g. 3 bit data in the range 0-7 for a 4-bit PNG),
+this will scale the values to appear to be the correct bit depth as
+is required by PNG.
+
+    png_set_shift(png_ptr, &sig_bit);
+
+PNG files store 16 bit pixels in network byte order (big-endian,
+ie. most significant bits first).  This code would be used if they are
+supplied the other way (little-endian, i.e. least significant bits
+first, the way PCs store them):
+
+    if (bit_depth > 8)
+       png_set_swap(png_ptr);
+
+If you are using packed-pixel images (1, 2, or 4 bits/pixel), and you
+need to change the order the pixels are packed into bytes, you can use:
+
+    if (bit_depth < 8)
+       png_set_packswap(png_ptr);
+
+PNG files store 3 color pixels in red, green, blue order.  This code
+would be used if they are supplied as blue, green, red:
+
+    png_set_bgr(png_ptr);
+
+PNG files describe monochrome as black being zero and white being
+one. This code would be used if the pixels are supplied with this reversed
+(black being one and white being zero):
+
+    png_set_invert_mono(png_ptr);
+
+Finally, you can write your own transformation function if none of
+the existing ones meets your needs.  This is done by setting a callback
+with
+
+    png_set_write_user_transform_fn(png_ptr,
+       write_transform_fn);
+
+You must supply the function
+
+    void write_transform_fn(png_ptr ptr, row_info_ptr
+       row_info, png_bytep data)
+
+See pngtest.c for a working example.  Your function will be called
+before any of the other transformations are processed.
+
+You can also set up a pointer to a user structure for use by your
+callback function.
+
+    png_set_user_transform_info(png_ptr, user_ptr, 0, 0);
+
+The user_channels and user_depth parameters of this function are ignored
+when writing; you can set them to zero as shown.
+
+You can retrieve the pointer via the function png_get_user_transform_ptr().
+For example:
+
+    voidp write_user_transform_ptr =
+       png_get_user_transform_ptr(png_ptr);
+
+It is possible to have libpng flush any pending output, either manually,
+or automatically after a certain number of lines have been written.  To
+flush the output stream a single time call:
+
+    png_write_flush(png_ptr);
+
+and to have libpng flush the output stream periodically after a certain
+number of scanlines have been written, call:
+
+    png_set_flush(png_ptr, nrows);
+
+Note that the distance between rows is from the last time png_write_flush()
+was called, or the first row of the image if it has never been called.
+So if you write 50 lines, and then png_set_flush 25, it will flush the
+output on the next scanline, and every 25 lines thereafter, unless
+png_write_flush() is called before 25 more lines have been written.
+If nrows is too small (less than about 10 lines for a 640 pixel wide
+RGB image) the image compression may decrease noticeably (although this
+may be acceptable for real-time applications).  Infrequent flushing will
+only degrade the compression performance by a few percent over images
+that do not use flushing.
+
+Writing the image data
+
+That's it for the transformations.  Now you can write the image data.
+The simplest way to do this is in one function call.  If you have the
+whole image in memory, you can just call png_write_image() and libpng
+will write the image.  You will need to pass in an array of pointers to
+each row.  This function automatically handles interlacing, so you don't
+need to call png_set_interlace_handling() or call this function multiple
+times, or any of that other stuff necessary with png_write_rows().
+
+    png_write_image(png_ptr, row_pointers);
+
+where row_pointers is:
+
+    png_byte *row_pointers[height];
+
+You can point to void or char or whatever you use for pixels.
+
+If you don't want to write the whole image at once, you can
+use png_write_rows() instead.  If the file is not interlaced,
+this is simple:
+
+    png_write_rows(png_ptr, row_pointers,
+       number_of_rows);
+
+row_pointers is the same as in the png_write_image() call.
+
+If you are just writing one row at a time, you can do this with
+a single row_pointer instead of an array of row_pointers:
+
+    png_bytep row_pointer = row;
+
+    png_write_row(png_ptr, row_pointer);
+
+When the file is interlaced, things can get a good deal more
+complicated.  The only currently (as of the PNG Specification
+version 1.2, dated July 1999) defined interlacing scheme for PNG files
+is the "Adam7" interlace scheme, that breaks down an
+image into seven smaller images of varying size.  libpng will build
+these images for you, or you can do them yourself.  If you want to
+build them yourself, see the PNG specification for details of which
+pixels to write when.
+
+If you don't want libpng to handle the interlacing details, just
+use png_set_interlace_handling() and call png_write_rows() the
+correct number of times to write all seven sub-images.
+
+If you want libpng to build the sub-images, call this before you start
+writing any rows:
+
+    number_of_passes =
+       png_set_interlace_handling(png_ptr);
+
+This will return the number of passes needed.  Currently, this
+is seven, but may change if another interlace type is added.
+
+Then write the complete image number_of_passes times.
+
+    png_write_rows(png_ptr, row_pointers,
+       number_of_rows);
+
+As some of these rows are not used, and thus return immediately,
+you may want to read about interlacing in the PNG specification,
+and only update the rows that are actually used.
+
+Finishing a sequential write
+
+After you are finished writing the image, you should finish writing
+the file.  If you are interested in writing comments or time, you should
+pass an appropriately filled png_info pointer.  If you are not interested,
+you can pass NULL.
+
+    png_write_end(png_ptr, info_ptr);
+
+When you are done, you can free all memory used by libpng like this:
+
+    png_destroy_write_struct(&png_ptr, &info_ptr);
+
+It is also possible to individually free the info_ptr members that
+point to libpng-allocated storage with the following function:
+
+    png_free_data(png_ptr, info_ptr, mask, seq)
+    mask  - identifies data to be freed, a mask
+            containing the bitwise OR of one or
+            more of
+              PNG_FREE_PLTE, PNG_FREE_TRNS,
+              PNG_FREE_HIST, PNG_FREE_ICCP,
+              PNG_FREE_PCAL, PNG_FREE_ROWS,
+              PNG_FREE_SCAL, PNG_FREE_SPLT,
+              PNG_FREE_TEXT, PNG_FREE_UNKN,
+            or simply PNG_FREE_ALL
+    seq   - sequence number of item to be freed
+            (-1 for all items)
+
+This function may be safely called when the relevant storage has
+already been freed, or has not yet been allocated, or was allocated
+by the user  and not by libpng,  and will in those
+cases do nothing.  The "seq" parameter is ignored if only one item
+of the selected data type, such as PLTE, is allowed.  If "seq" is not
+-1, and multiple items are allowed for the data type identified in
+the mask, such as text or sPLT, only the n'th item in the structure
+is freed, where n is "seq".
+
+If you allocated data such as a palette that you passed
+in to libpng with png_set_*, you must not free it until just before the call to
+png_destroy_write_struct().
+
+The default behavior is only to free data that was allocated internally
+by libpng.  This can be changed, so that libpng will not free the data,
+or so that it will free data that was allocated by the user with png_malloc()
+or png_zalloc() and passed in via a png_set_*() function, with
+
+    png_data_freer(png_ptr, info_ptr, freer, mask)
+    mask   - which data elements are affected
+             same choices as in png_free_data()
+    freer  - one of
+               PNG_DESTROY_WILL_FREE_DATA
+               PNG_SET_WILL_FREE_DATA
+               PNG_USER_WILL_FREE_DATA
+
+For example, to transfer responsibility for some data from a read structure
+to a write structure, you could use
+
+    png_data_freer(read_ptr, read_info_ptr,
+       PNG_USER_WILL_FREE_DATA,
+       PNG_FREE_PLTE|PNG_FREE_tRNS|PNG_FREE_hIST)
+    png_data_freer(write_ptr, write_info_ptr,
+       PNG_DESTROY_WILL_FREE_DATA,
+       PNG_FREE_PLTE|PNG_FREE_tRNS|PNG_FREE_hIST)
+
+thereby briefly reassigning responsibility for freeing to the user but
+immediately afterwards reassigning it once more to the write_destroy
+function.  Having done this, it would then be safe to destroy the read
+structure and continue to use the PLTE, tRNS, and hIST data in the write
+structure.
+
+This function only affects data that has already been allocated.
+You can call this function before calling after the png_set_*() functions
+to control whether the user or png_destroy_*() is supposed to free the data.
+When the user assumes responsibility for libpng-allocated data, the
+application must use
+png_free() to free it, and when the user transfers responsibility to libpng
+for data that the user has allocated, the user must have used png_malloc()
+or png_zalloc() to allocate it.
+
+If you allocated text_ptr.text, text_ptr.lang, and text_ptr.translated_keyword
+separately, do not transfer responsibility for freeing text_ptr to libpng,
+because when libpng fills a png_text structure it combines these members with
+the key member, and png_free_data() will free only text_ptr.key.  Similarly,
+if you transfer responsibility for free'ing text_ptr from libpng to your
+application, your application must not separately free those members.
+For a more compact example of writing a PNG image, see the file example.c.
+
+V. Modifying/Customizing libpng:
+
+There are two issues here.  The first is changing how libpng does
+standard things like memory allocation, input/output, and error handling.
+The second deals with more complicated things like adding new chunks,
+adding new transformations, and generally changing how libpng works.
+Both of those are compile-time issues; that is, they are generally
+determined at the time the code is written, and there is rarely a need
+to provide the user with a means of changing them.
+
+Memory allocation, input/output, and error handling
+
+All of the memory allocation, input/output, and error handling in libpng
+goes through callbacks that are user-settable.  The default routines are
+in pngmem.c, pngrio.c, pngwio.c, and pngerror.c, respectively.  To change
+these functions, call the appropriate png_set_*_fn() function.
+
+Memory allocation is done through the functions png_malloc()
+and png_free().  These currently just call the standard C functions.  If
+your pointers can't access more then 64K at a time, you will want to set
+MAXSEG_64K in zlib.h.  Since it is unlikely that the method of handling
+memory allocation on a platform will change between applications, these
+functions must be modified in the library at compile time.  If you prefer
+to use a different method of allocating and freeing data, you can use
+png_create_read_struct_2() or png_create_write_struct_2() to register
+your own functions as described above.
+These functions also provide a void pointer that can be retrieved via
+
+    mem_ptr=png_get_mem_ptr(png_ptr);
+
+Your replacement memory functions must have prototypes as follows:
+
+    png_voidp malloc_fn(png_structp png_ptr,
+       png_size_t size);
+    void free_fn(png_structp png_ptr, png_voidp ptr);
+
+Your malloc_fn() must return NULL in case of failure.  The png_malloc()
+function will normally call png_error() if it receives a NULL from the
+system memory allocator or from your replacement malloc_fn().
+
+Input/Output in libpng is done through png_read() and png_write(),
+which currently just call fread() and fwrite().  The FILE * is stored in
+png_struct and is initialized via png_init_io().  If you wish to change
+the method of I/O, the library supplies callbacks that you can set
+through the function png_set_read_fn() and png_set_write_fn() at run
+time, instead of calling the png_init_io() function.  These functions
+also provide a void pointer that can be retrieved via the function
+png_get_io_ptr().  For example:
+
+    png_set_read_fn(png_structp read_ptr,
+        voidp read_io_ptr, png_rw_ptr read_data_fn)
+
+    png_set_write_fn(png_structp write_ptr,
+        voidp write_io_ptr, png_rw_ptr write_data_fn,
+        png_flush_ptr output_flush_fn);
+
+    voidp read_io_ptr = png_get_io_ptr(read_ptr);
+    voidp write_io_ptr = png_get_io_ptr(write_ptr);
+
+The replacement I/O functions must have prototypes as follows:
+
+    void user_read_data(png_structp png_ptr,
+        png_bytep data, png_size_t length);
+    void user_write_data(png_structp png_ptr,
+        png_bytep data, png_size_t length);
+    void user_flush_data(png_structp png_ptr);
+
+Supplying NULL for the read, write, or flush functions sets them back
+to using the default C stream functions.  It is an error to read from
+a write stream, and vice versa.
+
+Error handling in libpng is done through png_error() and png_warning().
+Errors handled through png_error() are fatal, meaning that png_error()
+should never return to its caller.  Currently, this is handled via
+setjmp() and longjmp() (unless you have compiled libpng with
+PNG_SETJMP_NOT_SUPPORTED, in which case it is handled via PNG_ABORT()),
+but you could change this to do things like exit() if you should wish.
+
+On non-fatal errors, png_warning() is called
+to print a warning message, and then control returns to the calling code.
+By default png_error() and png_warning() print a message on stderr via
+fprintf() unless the library is compiled with PNG_NO_CONSOLE_IO defined
+(because you don't want the messages) or PNG_NO_STDIO defined (because
+fprintf() isn't available).  If you wish to change the behavior of the error
+functions, you will need to set up your own message callbacks.  These
+functions are normally supplied at the time that the png_struct is created.
+It is also possible to redirect errors and warnings to your own replacement
+functions after png_create_*_struct() has been called by calling:
+
+    png_set_error_fn(png_structp png_ptr,
+        png_voidp error_ptr, png_error_ptr error_fn,
+        png_error_ptr warning_fn);
+
+    png_voidp error_ptr = png_get_error_ptr(png_ptr);
+
+If NULL is supplied for either error_fn or warning_fn, then the libpng
+default function will be used, calling fprintf() and/or longjmp() if a
+problem is encountered.  The replacement error functions should have
+parameters as follows:
+
+    void user_error_fn(png_structp png_ptr,
+        png_const_charp error_msg);
+    void user_warning_fn(png_structp png_ptr,
+        png_const_charp warning_msg);
+
+The motivation behind using setjmp() and longjmp() is the C++ throw and
+catch exception handling methods.  This makes the code much easier to write,
+as there is no need to check every return code of every function call.
+However, there are some uncertainties about the status of local variables
+after a longjmp, so the user may want to be careful about doing anything after
+setjmp returns non-zero besides returning itself.  Consult your compiler
+documentation for more details.  For an alternative approach, you may wish
+to use the "cexcept" facility (see http://cexcept.sourceforge.net).
+
+Custom chunks
+
+If you need to read or write custom chunks, you may need to get deeper
+into the libpng code.  The library now has mechanisms for storing
+and writing chunks of unknown type; you can even declare callbacks
+for custom chunks.  However, this may not be good enough if the
+library code itself needs to know about interactions between your
+chunk and existing `intrinsic' chunks.
+
+If you need to write a new intrinsic chunk, first read the PNG
+specification. Acquire a first level of
+understanding of how it works.  Pay particular attention to the
+sections that describe chunk names, and look at how other chunks were
+designed, so you can do things similarly.  Second, check out the
+sections of libpng that read and write chunks.  Try to find a chunk
+that is similar to yours and use it as a template.  More details can
+be found in the comments inside the code.  It is best to handle unknown
+chunks in a generic method, via callback functions, instead of by
+modifying libpng functions.
+
+If you wish to write your own transformation for the data, look through
+the part of the code that does the transformations, and check out some of
+the simpler ones to get an idea of how they work.  Try to find a similar
+transformation to the one you want to add and copy off of it.  More details
+can be found in the comments inside the code itself.
+
+Configuring for 16 bit platforms
+
+You will want to look into zconf.h to tell zlib (and thus libpng) that
+it cannot allocate more then 64K at a time.  Even if you can, the memory
+won't be accessible.  So limit zlib and libpng to 64K by defining MAXSEG_64K.
+
+Configuring for DOS
+
+For DOS users who only have access to the lower 640K, you will
+have to limit zlib's memory usage via a png_set_compression_mem_level()
+call.  See zlib.h or zconf.h in the zlib library for more information.
+
+Configuring for Medium Model
+
+Libpng's support for medium model has been tested on most of the popular
+compilers.  Make sure MAXSEG_64K gets defined, USE_FAR_KEYWORD gets
+defined, and FAR gets defined to far in pngconf.h, and you should be
+all set.  Everything in the library (except for zlib's structure) is
+expecting far data.  You must use the typedefs with the p or pp on
+the end for pointers (or at least look at them and be careful).  Make
+note that the rows of data are defined as png_bytepp, which is an
+unsigned char far * far *.
+
+Configuring for gui/windowing platforms:
+
+You will need to write new error and warning functions that use the GUI
+interface, as described previously, and set them to be the error and
+warning functions at the time that png_create_*_struct() is called,
+in order to have them available during the structure initialization.
+They can be changed later via png_set_error_fn().  On some compilers,
+you may also have to change the memory allocators (png_malloc, etc.).
+
+Configuring for compiler xxx:
+
+All includes for libpng are in pngconf.h.  If you need to add/change/delete
+an include, this is the place to do it.  The includes that are not
+needed outside libpng are protected by the PNG_INTERNAL definition,
+which is only defined for those routines inside libpng itself.  The
+files in libpng proper only include png.h, which includes pngconf.h.
+
+Configuring zlib:
+
+There are special functions to configure the compression.  Perhaps the
+most useful one changes the compression level, which currently uses
+input compression values in the range 0 - 9.  The library normally
+uses the default compression level (Z_DEFAULT_COMPRESSION = 6).  Tests
+have shown that for a large majority of images, compression values in
+the range 3-6 compress nearly as well as higher levels, and do so much
+faster.  For online applications it may be desirable to have maximum speed
+(Z_BEST_SPEED = 1).  With versions of zlib after v0.99, you can also
+specify no compression (Z_NO_COMPRESSION = 0), but this would create
+files larger than just storing the raw bitmap.  You can specify the
+compression level by calling:
+
+    png_set_compression_level(png_ptr, level);
+
+Another useful one is to reduce the memory level used by the library.
+The memory level defaults to 8, but it can be lowered if you are
+short on memory (running DOS, for example, where you only have 640K).
+Note that the memory level does have an effect on compression; among
+other things, lower levels will result in sections of incompressible
+data being emitted in smaller stored blocks, with a correspondingly
+larger relative overhead of up to 15% in the worst case.
+
+    png_set_compression_mem_level(png_ptr, level);
+
+The other functions are for configuring zlib.  They are not recommended
+for normal use and may result in writing an invalid PNG file.  See
+zlib.h for more information on what these mean.
+
+    png_set_compression_strategy(png_ptr,
+        strategy);
+    png_set_compression_window_bits(png_ptr,
+        window_bits);
+    png_set_compression_method(png_ptr, method);
+    png_set_compression_buffer_size(png_ptr, size);
+
+Controlling row filtering
+
+If you want to control whether libpng uses filtering or not, which
+filters are used, and how it goes about picking row filters, you
+can call one of these functions.  The selection and configuration
+of row filters can have a significant impact on the size and
+encoding speed and a somewhat lesser impact on the decoding speed
+of an image.  Filtering is enabled by default for RGB and grayscale
+images (with and without alpha), but not for paletted images nor
+for any images with bit depths less than 8 bits/pixel.
+
+The 'method' parameter sets the main filtering method, which is
+currently only '0' in the PNG 1.2 specification.  The 'filters'
+parameter sets which filter(s), if any, should be used for each
+scanline.  Possible values are PNG_ALL_FILTERS and PNG_NO_FILTERS
+to turn filtering on and off, respectively.
+
+Individual filter types are PNG_FILTER_NONE, PNG_FILTER_SUB,
+PNG_FILTER_UP, PNG_FILTER_AVG, PNG_FILTER_PAETH, which can be bitwise
+ORed together with '|' to specify one or more filters to use.
+These filters are described in more detail in the PNG specification.
+If you intend to change the filter type during the course of writing
+the image, you should start with flags set for all of the filters
+you intend to use so that libpng can initialize its internal
+structures appropriately for all of the filter types.  (Note that this
+means the first row must always be adaptively filtered, because libpng
+currently does not allocate the filter buffers until png_write_row()
+is called for the first time.)
+
+    filters = PNG_FILTER_NONE | PNG_FILTER_SUB
+              PNG_FILTER_UP | PNG_FILTER_AVE |
+              PNG_FILTER_PAETH | PNG_ALL_FILTERS;
+
+    png_set_filter(png_ptr, PNG_FILTER_TYPE_BASE,
+       filters);
+              The second parameter can also be
+              PNG_INTRAPIXEL_DIFFERENCING if you are
+              writing a PNG to be embedded in a MNG
+              datastream.  This parameter must be the
+              same as the value of filter_method used
+              in png_set_IHDR().
+
+It is also possible to influence how libpng chooses from among the
+available filters.  This is done in one or both of two ways - by
+telling it how important it is to keep the same filter for successive
+rows, and by telling it the relative computational costs of the filters.
+
+    double weights[3] = {1.5, 1.3, 1.1},
+       costs[PNG_FILTER_VALUE_LAST] =
+       {1.0, 1.3, 1.3, 1.5, 1.7};
+
+    png_set_filter_heuristics(png_ptr,
+       PNG_FILTER_HEURISTIC_WEIGHTED, 3,
+       weights, costs);
+
+The weights are multiplying factors that indicate to libpng that the
+row filter should be the same for successive rows unless another row filter
+is that many times better than the previous filter.  In the above example,
+if the previous 3 filters were SUB, SUB, NONE, the SUB filter could have a
+"sum of absolute differences" 1.5 x 1.3 times higher than other filters
+and still be chosen, while the NONE filter could have a sum 1.1 times
+higher than other filters and still be chosen.  Unspecified weights are
+taken to be 1.0, and the specified weights should probably be declining
+like those above in order to emphasize recent filters over older filters.
+
+The filter costs specify for each filter type a relative decoding cost
+to be considered when selecting row filters.  This means that filters
+with higher costs are less likely to be chosen over filters with lower
+costs, unless their "sum of absolute differences" is that much smaller.
+The costs do not necessarily reflect the exact computational speeds of
+the various filters, since this would unduly influence the final image
+size.
+
+Note that the numbers above were invented purely for this example and
+are given only to help explain the function usage.  Little testing has
+been done to find optimum values for either the costs or the weights.
+
+Removing unwanted object code
+
+There are a bunch of #define's in pngconf.h that control what parts of
+libpng are compiled.  All the defines end in _SUPPORTED.  If you are
+never going to use a capability, you can change the #define to #undef
+before recompiling libpng and save yourself code and data space, or
+you can turn off individual capabilities with defines that begin with
+PNG_NO_.
+
+You can also turn all of the transforms and ancillary chunk capabilities
+off en masse with compiler directives that define
+PNG_NO_READ[or WRITE]_TRANSFORMS, or PNG_NO_READ[or WRITE]_ANCILLARY_CHUNKS,
+or all four,
+along with directives to turn on any of the capabilities that you do
+want.  The PNG_NO_READ[or WRITE]_TRANSFORMS directives disable
+the extra transformations but still leave the library fully capable of reading
+and writing PNG files with all known public chunks
+Use of the PNG_NO_READ[or WRITE]_ANCILLARY_CHUNKS directive
+produces a library that is incapable of reading or writing ancillary chunks.
+If you are not using the progressive reading capability, you can
+turn that off with PNG_NO_PROGRESSIVE_READ (don't confuse
+this with the INTERLACING capability, which you'll still have).
+
+All the reading and writing specific code are in separate files, so the
+linker should only grab the files it needs.  However, if you want to
+make sure, or if you are building a stand alone library, all the
+reading files start with pngr and all the writing files start with
+pngw.  The files that don't match either (like png.c, pngtrans.c, etc.)
+are used for both reading and writing, and always need to be included.
+The progressive reader is in pngpread.c
+
+If you are creating or distributing a dynamically linked library (a .so
+or DLL file), you should not remove or disable any parts of the library,
+as this will cause applications linked with different versions of the
+library to fail if they call functions not available in your library.
+The size of the library itself should not be an issue, because only
+those sections that are actually used will be loaded into memory.
+
+Requesting debug printout
+
+The macro definition PNG_DEBUG can be used to request debugging
+printout.  Set it to an integer value in the range 0 to 3.  Higher
+numbers result in increasing amounts of debugging information.  The
+information is printed to the "stderr" file, unless another file
+name is specified in the PNG_DEBUG_FILE macro definition.
+
+When PNG_DEBUG > 0, the following functions (macros) become available:
+
+   png_debug(level, message)
+   png_debug1(level, message, p1)
+   png_debug2(level, message, p1, p2)
+
+in which "level" is compared to PNG_DEBUG to decide whether to print
+the message, "message" is the formatted string to be printed,
+and p1 and p2 are parameters that are to be embedded in the string
+according to printf-style formatting directives.  For example,
+
+   png_debug1(2, "foo=%d\n", foo);
+
+is expanded to
+
+   if(PNG_DEBUG > 2)
+     fprintf(PNG_DEBUG_FILE, "foo=%d\n", foo);
+
+When PNG_DEBUG is defined but is zero, the macros aren't defined, but you
+can still use PNG_DEBUG to control your own debugging:
+
+   #ifdef PNG_DEBUG
+       fprintf(stderr, ...
+   #endif
+
+When PNG_DEBUG = 1, the macros are defined, but only png_debug statements
+having level = 0 will be printed.  There aren't any such statements in
+this version of libpng, but if you insert some they will be printed.
+
+VII.  MNG support
+
+The MNG specification (available at http://www.libpng.org/pub/mng) allows
+certain extensions to PNG for PNG images that are embedded in MNG datastreams.
+Libpng can support some of these extensions.  To enable them, use the
+png_permit_mng_features() function:
+
+   feature_set = png_permit_mng_features(png_ptr, mask)
+   mask is a png_uint_32 containing the bitwise OR of the
+        features you want to enable.  These include
+        PNG_FLAG_MNG_EMPTY_PLTE
+        PNG_FLAG_MNG_FILTER_64
+        PNG_ALL_MNG_FEATURES
+   feature_set is a png_uint_32 that is the bitwise AND of
+      your mask with the set of MNG features that is
+      supported by the version of libpng that you are using.
+
+It is an error to use this function when reading or writing a standalone
+PNG file with the PNG 8-byte signature.  The PNG datastream must be wrapped
+in a MNG datastream.  As a minimum, it must have the MNG 8-byte signature
+and the MHDR and MEND chunks.  Libpng does not provide support for these
+or any other MNG chunks; your application must provide its own support for
+them.  You may wish to consider using libmng (available at
+http://www.libmng.com) instead.
+
+VIII.  Changes to Libpng from version 0.88
+
+It should be noted that versions of libpng later than 0.96 are not
+distributed by the original libpng author, Guy Schalnat, nor by
+Andreas Dilger, who had taken over from Guy during 1996 and 1997, and
+distributed versions 0.89 through 0.96, but rather by another member
+of the original PNG Group, Glenn Randers-Pehrson.  Guy and Andreas are
+still alive and well, but they have moved on to other things.
+
+The old libpng functions png_read_init(), png_write_init(),
+png_info_init(), png_read_destroy(), and png_write_destroy() have been
+moved to PNG_INTERNAL in version 0.95 to discourage their use.  These
+functions will be removed from libpng version 2.0.0.
+
+The preferred method of creating and initializing the libpng structures is
+via the png_create_read_struct(), png_create_write_struct(), and
+png_create_info_struct() because they isolate the size of the structures
+from the application, allow version error checking, and also allow the
+use of custom error handling routines during the initialization, which
+the old functions do not.  The functions png_read_destroy() and
+png_write_destroy() do not actually free the memory that libpng
+allocated for these structs, but just reset the data structures, so they
+can be used instead of png_destroy_read_struct() and
+png_destroy_write_struct() if you feel there is too much system overhead
+allocating and freeing the png_struct for each image read.
+
+Setting the error callbacks via png_set_message_fn() before
+png_read_init() as was suggested in libpng-0.88 is no longer supported
+because this caused applications that do not use custom error functions
+to fail if the png_ptr was not initialized to zero.  It is still possible
+to set the error callbacks AFTER png_read_init(), or to change them with
+png_set_error_fn(), which is essentially the same function, but with a new
+name to force compilation errors with applications that try to use the old
+method.
+
+Starting with version 1.0.7, you can find out which version of the library
+you are using at run-time:
+
+   png_uint_32 libpng_vn = png_access_version_number();
+
+The number libpng_vn is constructed from the major version, minor
+version with leading zero, and release number with leading zero,
+(e.g., libpng_vn for version 1.0.7 is 10007).
+
+You can also check which version of png.h you used when compiling your
+application:
+
+   png_uint_32 application_vn = PNG_LIBPNG_VER;
+
+IX. Y2K Compliance in libpng
+
+September 1, 2007
+
+Since the PNG Development group is an ad-hoc body, we can't make
+an official declaration.
+
+This is your unofficial assurance that libpng from version 0.71 and
+upward through 1.2.20rc4 are Y2K compliant.  It is my belief that earlier
+versions were also Y2K compliant.
+
+Libpng only has three year fields.  One is a 2-byte unsigned integer that
+will hold years up to 65535.  The other two hold the date in text
+format, and will hold years up to 9999.
+
+The integer is
+    "png_uint_16 year" in png_time_struct.
+
+The strings are
+    "png_charp time_buffer" in png_struct and
+    "near_time_buffer", which is a local character string in png.c.
+
+There are seven time-related functions:
+
+    png_convert_to_rfc_1123() in png.c
+      (formerly png_convert_to_rfc_1152() in error)
+    png_convert_from_struct_tm() in pngwrite.c, called
+      in pngwrite.c
+    png_convert_from_time_t() in pngwrite.c
+    png_get_tIME() in pngget.c
+    png_handle_tIME() in pngrutil.c, called in pngread.c
+    png_set_tIME() in pngset.c
+    png_write_tIME() in pngwutil.c, called in pngwrite.c
+
+All appear to handle dates properly in a Y2K environment.  The
+png_convert_from_time_t() function calls gmtime() to convert from system
+clock time, which returns (year - 1900), which we properly convert to
+the full 4-digit year.  There is a possibility that applications using
+libpng are not passing 4-digit years into the png_convert_to_rfc_1123()
+function, or that they are incorrectly passing only a 2-digit year
+instead of "year - 1900" into the png_convert_from_struct_tm() function,
+but this is not under our control.  The libpng documentation has always
+stated that it works with 4-digit years, and the APIs have been
+documented as such.
+
+The tIME chunk itself is also Y2K compliant.  It uses a 2-byte unsigned
+integer to hold the year, and can hold years as large as 65535.
+
+zlib, upon which libpng depends, is also Y2K compliant.  It contains
+no date-related code.
+
+
+   Glenn Randers-Pehrson
+   libpng maintainer
+   PNG Development Group
diff --git a/libpng.3 b/libpng.3
index d70734131..d80f90bf8 100644
--- a/libpng.3
+++ b/libpng.3
@@ -1,6 +1,6 @@
-.TH LIBPNG 3 "August 30, 2007"
+.TH LIBPNG 3 "September 1, 2007"
 .SH NAME
-libpng \- Portable Network Graphics (PNG) Reference Library 1.2.20rc3
+libpng \- Portable Network Graphics (PNG) Reference Library 1.2.20rc4
 .SH SYNOPSIS
 \fB
 #include <png.h>\fP
@@ -410,7 +410,7 @@ Following is a copy of the libpng.txt file that accompanies libpng.
 .SH LIBPNG.TXT
 libpng.txt - A description on how to use and modify libpng
 
- libpng version 1.2.20rc3 - August 30, 2007
+ libpng version 1.2.20rc4 - September 1, 2007
  Updated and distributed by Glenn Randers-Pehrson
  <glennrp at users.sourceforge.net>
  Copyright (c) 1998-2007 Glenn Randers-Pehrson
@@ -496,9 +496,7 @@ Libpng is thread safe, provided the threads are using different
 instances of the structures.  Each thread should have its own
 png_struct and png_info instances, and thus its own image.
 Libpng does not protect itself against two threads using the
-same instance of a structure.  Note: thread safety may be defeated
-by use of some of the MMX assembler code in pnggccrd.c, which is only
-compiled when the user defines PNG_THREAD_UNSAFE_OK.
+same instance of a structure.
 
 .SH II. Structures
 
@@ -2765,17 +2763,13 @@ For a more compact example of writing a PNG image, see the file example.c.
 
 .SH V. Modifying/Customizing libpng:
 
-There are three issues here.  The first is changing how libpng does
+There are two issues here.  The first is changing how libpng does
 standard things like memory allocation, input/output, and error handling.
 The second deals with more complicated things like adding new chunks,
 adding new transformations, and generally changing how libpng works.
 Both of those are compile-time issues; that is, they are generally
 determined at the time the code is written, and there is rarely a need
-to provide the user with a means of changing them.  The third is a
-run-time issue:  choosing between and/or tuning one or more alternate
-versions of computationally intensive routines; specifically, optimized
-assembly-language (and therefore compiler- and platform-dependent)
-versions.
+to provide the user with a means of changing them.
 
 Memory allocation, input/output, and error handling
 
@@ -3136,172 +3130,6 @@ When PNG_DEBUG = 1, the macros are defined, but only png_debug statements
 having level = 0 will be printed.  There aren't any such statements in
 this version of libpng, but if you insert some they will be printed.
 
-.SH VI.  Runtime optimization
-
-A new feature in libpng 1.2.0 is the ability to dynamically switch between
-standard and optimized versions of some routines.  Currently these are
-limited to three computationally intensive tasks when reading PNG files:
-decoding row filters, expanding interlacing, and combining interlaced or
-transparent row data with previous row data.  Currently the optimized
-versions are available only for x86 (Intel, AMD, etc.) platforms with
-MMX support, though this may change in future versions.  (For example,
-the non-MMX assembler optimizations for zlib might become similarly
-runtime-selectable in future releases, in which case libpng could be
-extended to support them.  Alternatively, the compile-time choice of
-floating-point versus integer routines for gamma correction might become
-runtime-selectable.)
-
-Because such optimizations tend to be very platform- and compiler-dependent,
-both in how they are written and in how they perform, the new runtime code
-in libpng has been written to allow programs to query, enable, and disable
-either specific optimizations or all such optimizations.  For example, to
-enable all possible optimizations (bearing in mind that some "optimizations"
-may actually run more slowly in rare cases):
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
-       png_uint_32 mask, flags;
-
-       flags = png_get_asm_flags(png_ptr);
-       mask = png_get_asm_flagmask(PNG_SELECT_READ | PNG_SELECT_WRITE);
-       png_set_asm_flags(png_ptr, flags | mask);
-    #endif
-
-To enable only optimizations relevant to reading PNGs, use PNG_SELECT_READ
-by itself when calling png_get_asm_flagmask(); similarly for optimizing
-only writing.  To disable all optimizations:
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
-       flags = png_get_asm_flags(png_ptr);
-       mask = png_get_asm_flagmask(PNG_SELECT_READ | PNG_SELECT_WRITE);
-       png_set_asm_flags(png_ptr, flags & ~mask);
-    #endif
-
-To enable or disable only MMX-related features, use png_get_mmx_flagmask()
-in place of png_get_asm_flagmask().  The mmx version takes one additional
-parameter:
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
-       int selection = PNG_SELECT_READ | PNG_SELECT_WRITE;
-       int compilerID;
-
-       mask = png_get_mmx_flagmask(selection, &compilerID);
-    #endif
-
-On return, compilerID will indicate which version of the MMX assembler
-optimizations was compiled.  Currently two flavors exist:  Microsoft
-Visual C++ (compilerID == 1) and GNU C (a.k.a. gcc/gas, compilerID == 2).
-On non-x86 platforms or on systems compiled without MMX optimizations, a
-value of -1 is used.
-
-Note that both png_get_asm_flagmask() and png_get_mmx_flagmask() return
-all valid, settable optimization bits for the version of the library that's
-currently in use.  In the case of shared (dynamically linked) libraries,
-this may include optimizations that did not exist at the time the code was
-written and compiled.  It is also possible, of course, to enable only known,
-specific optimizations; for example:
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200) && \
-       defined(PNG_USE_PNGGCCRD) && defined(PNG_ASSEMBLER_CODE_SUPPORTED)
-       flags = PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  \
-             | PNG_ASM_FLAG_MMX_READ_INTERLACE    \
-             | PNG_ASM_FLAG_MMX_READ_FILTER_SUB   \
-             | PNG_ASM_FLAG_MMX_READ_FILTER_UP    \
-             | PNG_ASM_FLAG_MMX_READ_FILTER_AVG   \
-             | PNG_ASM_FLAG_MMX_READ_FILTER_PAETH ;
-       png_set_asm_flags(png_ptr, flags);
-    #endif
-
-This method would enable only the MMX read-optimizations available at the
-time of libpng 1.2.0's release, regardless of whether a later version of
-the DLL were actually being used.  (Also note that these functions did not
-exist in versions older than 1.2.0, so any attempt to run a dynamically
-linked app on such an older version would fail.)
-
-To determine whether the processor supports MMX instructions at all, use
-the png_mmx_support() function:
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
-       mmxsupport = png_mmx_support();
-    #endif
-
-It returns -1 if MMX support is not compiled into libpng, 0 if MMX code
-is compiled but MMX is not supported by the processor, or 1 if MMX support
-is fully available.  Note that png_mmx_support(), png_get_mmx_flagmask(),
-and png_get_asm_flagmask() all may be called without allocating and ini-
-tializing any PNG structures (for example, as part of a usage screen or
-"about" box).
-
-The following code can be used to prevent an application from using the
-thread_unsafe features, even if libpng was built with PNG_THREAD_UNSAFE_OK
-defined:
-
-#if defined(PNG_USE_PNGGCCRD) && defined(PNG_ASSEMBLER_CODE_SUPPORTED) \
-  && defined(PNG_THREAD_UNSAFE_OK)
-    /* Disable thread-unsafe features of pnggccrd */
-    if (png_access_version_number() >= 10200)
-    {
-      png_uint_32 mmx_disable_mask = 0;
-      png_uint_32 asm_flags;
-
-      mmx_disable_mask |= ( PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  \
-                          | PNG_ASM_FLAG_MMX_READ_FILTER_SUB   \
-                          | PNG_ASM_FLAG_MMX_READ_FILTER_AVG   \
-                          | PNG_ASM_FLAG_MMX_READ_FILTER_PAETH );
-      asm_flags = png_get_asm_flags(png_ptr);
-      png_set_asm_flags(png_ptr, asm_flags & ~mmx_disable_mask);
-    }
-#endif
-
-For more extensive examples of runtime querying, enabling and disabling
-of optimized features, see contrib/gregbook/readpng2.c in the libpng
-source-code distribution.
-
-It is also possible to disable or enable specific optimization features
-at compile time.  To disable them entirely, which may result in slower
-but smaller and less complex and troublesome code, define
-
-     PNG_NO_ASSEMBLER_CODE
-
-If you do this, then the run-time code for setting and querying flags
-described above, and the assembler code in pnggccrd.c or pngvcrd.c will
-not be built.
-
-If you have disabled the assembler code, you can also disable the
-optimized C code, to obtain even slower but smaller code, by defining
-
-     PNG_NO_OPTIMZED_CODE
-
-To disable only the MMX assembler code, define
-
-     PNG_NO_MMX_CODE
-
-There are two versions of the MMX code: one for gcc compilers and one for
-MSVC compilers.  Pngconf.h should automatically detect which you are
-using, and it will set either PNG_USE_PNGGCCRD or PNG_USE_PNGVCRD for you.
-
-Define one or more of the following to disable specific parts of the
-assembler code:
-
-     PNG_NO_MMX_COMBINE_ROW
-     PNG_NO_MMX_READ_INTERLACE
-     PNG_NO_MMX_READ_FILTER_ROW
-
-If the latter is not disabled, you can disable one or more of the
-individual filter types by defining
-
-     PNG_NO_MMX_FILTER_SUB
-     PNG_NO_MMX_FILTER_UP
-     PNG_NO_MMX_FILTER_AVG
-     PNG_NO_MMX_FILTER_PAETH
-
-By default, libpng only enables the "sub" filter when gcc-3.x is used
-because experiments show that gcc-3.x produces bad code for the others.
-
-When you disable various MMX features, libpng will still be able to decode
-files with those features.  It will fall back upon the optimized C code
-or the unoptimized C code to decode them.
-
-
 .SH VII.  MNG support
 
 The MNG specification (available at http://www.libpng.org/pub/mng) allows
@@ -3378,13 +3206,13 @@ application:
 
 .SH IX. Y2K Compliance in libpng
 
-August 30, 2007
+September 1, 2007
 
 Since the PNG Development group is an ad-hoc body, we can't make
 an official declaration.
 
 This is your unofficial assurance that libpng from version 0.71 and
-upward through 1.2.20rc3 are Y2K compliant.  It is my belief that earlier
+upward through 1.2.20rc4 are Y2K compliant.  It is my belief that earlier
 versions were also Y2K compliant.
 
 Libpng only has three year fields.  One is a 2-byte unsigned integer that
@@ -3579,8 +3407,8 @@ the first widely used release:
  1.0.27              10    10027  10.so.0.27[.0]
  1.2.19              13    10219  12.so.0.19[.0]
  1.2.20beta01-04     13    10220  12.so.0.20[.0]
- 1.0.28rc1-2         10    10028  10.so.0.28[.0]
- 1.2.20rc1-2         13    10220  12.so.0.20[.0]
+ 1.0.28rc1-4         10    10028  10.so.0.28[.0]
+ 1.2.20rc1-4         13    10220  12.so.0.20[.0]
  1.0.28              10    10028  10.so.0.28[.0]
  1.2.20              13    10220  12.so.0.20[.0]
 
@@ -3638,7 +3466,7 @@ possible without all of you.
 
 Thanks to Frank J. T. Wojcik for helping with the documentation.
 
-Libpng version 1.2.20rc3 - August 30, 2007:
+Libpng version 1.2.20rc4 - September 1, 2007:
 Initially created in 1995 by Guy Eric Schalnat, then of Group 42, Inc.
 Currently maintained by Glenn Randers-Pehrson (glennrp at users.sourceforge.net).
 
@@ -3659,7 +3487,7 @@ included in the libpng distribution, the latter shall prevail.)
 If you modify libpng you may insert additional notices immediately following
 this sentence.
 
-libpng versions 1.2.6, August 15, 2004, through 1.2.20rc3, August 30, 2007, are
+libpng versions 1.2.6, August 15, 2004, through 1.2.20rc4, September 1, 2007, are
 Copyright (c) 2004,2006-2007 Glenn Randers-Pehrson, and are
 distributed according to the same disclaimer and license as libpng-1.2.5
 with the following individual added to the list of Contributing Authors
@@ -3758,7 +3586,7 @@ certification mark of the Open Source Initiative.
 
 Glenn Randers-Pehrson
 glennrp at users.sourceforge.net
-August 30, 2007
+September 1, 2007
 
 .\" end of man page
 
diff --git a/libpngpf.3 b/libpngpf.3
index 1aea5a8d8..06fddfa36 100644
--- a/libpngpf.3
+++ b/libpngpf.3
@@ -1,6 +1,6 @@
-.TH LIBPNGPF 3 "August 30, 2007"
+.TH LIBPNGPF 3 "September 1, 2007"
 .SH NAME
-libpng \- Portable Network Graphics (PNG) Reference Library 1.2.20rc3
+libpng \- Portable Network Graphics (PNG) Reference Library 1.2.20rc4
 (private functions)
 .SH SYNOPSIS
 \fB#include <png.h>\fP
diff --git a/png.5 b/png.5
index fadb858d5..317b7aeb3 100644
--- a/png.5
+++ b/png.5
@@ -1,4 +1,4 @@
-.TH PNG 5 "August 30, 2007"
+.TH PNG 5 "September 1, 2007"
 .SH NAME
 png \- Portable Network Graphics (PNG) format
 .SH DESCRIPTION
diff --git a/png.c b/png.c
index 54d6f7ab1..e6ffcd4dc 100644
--- a/png.c
+++ b/png.c
@@ -13,7 +13,7 @@
 #include "png.h"
 
 /* Generate a compiler error if there is an old png.h in the search path. */
-typedef version_1_2_20rc3 Your_png_h_is_not_version_1_2_20rc3;
+typedef version_1_2_20rc4 Your_png_h_is_not_version_1_2_20rc4;
 
 /* Version information for C files.  This had better match the version
  * string defined in png.h.  */
@@ -67,11 +67,6 @@ PNG_CONST int FARDATA png_pass_ystart[] = {0, 0, 4, 0, 2, 0, 1};
 /* offset to next interlace block in the y direction */
 PNG_CONST int FARDATA png_pass_yinc[] = {8, 8, 8, 4, 4, 2, 2};
 
-/* width of interlace block (used in assembler routines only) */
-#if defined(PNG_HAVE_MMX_COMBINE_ROW) || defined(PNG_OPTIMIZED_CODE_SUPPORTED)
-PNG_CONST int FARDATA png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
-#endif
-
 /* Height of interlace block.  This is not currently used - if you need
  * it, uncomment it here and in png.h
 PNG_CONST int FARDATA png_pass_height[] = {8, 8, 4, 4, 2, 2, 1};
@@ -698,7 +693,7 @@ png_charp PNGAPI
 png_get_copyright(png_structp png_ptr)
 {
    png_ptr = png_ptr;  /* silence compiler warning about unused png_ptr */
-   return ((png_charp) "\n libpng version 1.2.20rc3 - August 30, 2007\n\
+   return ((png_charp) "\n libpng version 1.2.20rc4 - September 1, 2007\n\
    Copyright (c) 1998-2007 Glenn Randers-Pehrson\n\
    Copyright (c) 1996-1997 Andreas Dilger\n\
    Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.\n");
@@ -825,57 +820,18 @@ png_access_version_number(void)
 void /* PRIVATE */
 png_init_mmx_flags (png_structp png_ptr)
 {
-    if(png_ptr == NULL) return;
-    png_ptr->mmx_rowbytes_threshold = 0;
-    png_ptr->mmx_bitdepth_threshold = 0;
-
-#  if (defined(PNG_USE_PNGVCRD) || defined(PNG_USE_PNGGCCRD))
-
-    png_ptr->asm_flags |= PNG_ASM_FLAG_MMX_SUPPORT_COMPILED;
-
-    if (png_mmx_support() > 0) {
-        png_ptr->asm_flags |= PNG_ASM_FLAG_MMX_SUPPORT_IN_CPU
-#    ifdef PNG_HAVE_MMX_COMBINE_ROW
-                              | PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
-#    endif
-#    ifdef PNG_HAVE_MMX_READ_INTERLACE
-                              | PNG_ASM_FLAG_MMX_READ_INTERLACE
-#    endif
-#    ifndef PNG_HAVE_MMX_READ_FILTER_ROW
-                              ;
-#    else
-                              | PNG_ASM_FLAG_MMX_READ_FILTER_SUB
-                              | PNG_ASM_FLAG_MMX_READ_FILTER_UP
-                              | PNG_ASM_FLAG_MMX_READ_FILTER_AVG
-                              | PNG_ASM_FLAG_MMX_READ_FILTER_PAETH ;
-
-        png_ptr->mmx_rowbytes_threshold = PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT;
-        png_ptr->mmx_bitdepth_threshold = PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT;
-#    endif
-    } else {
-        png_ptr->asm_flags &= ~( PNG_ASM_FLAG_MMX_SUPPORT_IN_CPU
-                               | PNG_MMX_READ_FLAGS
-                               | PNG_MMX_WRITE_FLAGS );
-    }
-
-#  else /* !(PNGVCRD || PNGGCCRD) */
-
-    /* clear all MMX flags; no support is compiled in */
-    png_ptr->asm_flags &= ~( PNG_MMX_FLAGS );
-
-#  endif /* ?(PNGVCRD || PNGGCCRD) */
+   /* obsolete, to be removed from libpng-1.4.0 */
 }
 
 #endif /* !(PNG_MMX_CODE_SUPPORTED) */
 
 /* this function was added to libpng 1.2.0 */
-#if !defined(PNG_USE_PNGGCCRD) && !defined(PNG_USE_PNGVCRD)
 int PNGAPI
 png_mmx_support(void)
 {
+   /* obsolete, to be removed from libpng-1.4.0 */
     return -1;
 }
-#endif
 #endif /* PNG_1_0_X */
 #endif /* PNG_READ_SUPPORTED && PNG_ASSEMBLER_CODE_SUPPORTED */
 
diff --git a/png.h b/png.h
index a26a725cd..a4aa8d0af 100644
--- a/png.h
+++ b/png.h
@@ -1,7 +1,7 @@
 
 /* png.h - header file for PNG reference library
  *
- * libpng version 1.2.20rc3 - August 30, 2007
+ * libpng version 1.2.20rc4 - September 1, 2007
  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
  * (Version 0.88 Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.)
@@ -9,7 +9,7 @@
  * Authors and maintainers:
  *  libpng versions 0.71, May 1995, through 0.88, January 1996: Guy Schalnat
  *  libpng versions 0.89c, June 1996, through 0.96, May 1997: Andreas Dilger
- *  libpng versions 0.97, January 1998, through 1.2.20rc3 - August 30, 2007: Glenn
+ *  libpng versions 0.97, January 1998, through 1.2.20rc4 - September 1, 2007: Glenn
  *  See also "Contributing Authors", below.
  *
  * Note about libpng version numbers:
@@ -156,8 +156,8 @@
  *    1.0.27                  10    10027  10.so.0.27[.0]
  *    1.2.19                  13    10219  12.so.0.19[.0]
  *    1.2.20beta01-04         13    10220  12.so.0.20[.0]
- *    1.0.28rc1-2             10    10028  10.so.0.28[.0]
- *    1.2.20rc1-2             13    10220  12.so.0.20[.0]
+ *    1.0.28rc1-4             10    10028  10.so.0.28[.0]
+ *    1.2.20rc1-4             13    10220  12.so.0.20[.0]
  *    1.0.28                  10    10028  10.so.0.28[.0]
  *    1.2.20                  13    10220  12.so.0.20[.0]
  *
@@ -189,7 +189,7 @@
  * If you modify libpng you may insert additional notices immediately following
  * this sentence.
  *
- * libpng versions 1.2.6, August 15, 2004, through 1.2.20rc3, August 30, 2007, are
+ * libpng versions 1.2.6, August 15, 2004, through 1.2.20rc4, September 1, 2007, are
  * Copyright (c) 2004, 2006-2007 Glenn Randers-Pehrson, and are
  * distributed according to the same disclaimer and license as libpng-1.2.5
  * with the following individual added to the list of Contributing Authors:
@@ -301,13 +301,13 @@
  * Y2K compliance in libpng:
  * =========================
  *
- *    August 30, 2007
+ *    September 1, 2007
  *
  *    Since the PNG Development group is an ad-hoc body, we can't make
  *    an official declaration.
  *
  *    This is your unofficial assurance that libpng from version 0.71 and
- *    upward through 1.2.20rc3 are Y2K compliant.  It is my belief that earlier
+ *    upward through 1.2.20rc4 are Y2K compliant.  It is my belief that earlier
  *    versions were also Y2K compliant.
  *
  *    Libpng only has three year fields.  One is a 2-byte unsigned integer
@@ -363,9 +363,9 @@
  */
 
 /* Version information for png.h - this should match the version in png.c */
-#define PNG_LIBPNG_VER_STRING "1.2.20rc3"
+#define PNG_LIBPNG_VER_STRING "1.2.20rc4"
 #define PNG_HEADER_VERSION_STRING \
-   " libpng version 1.2.20rc3 - August 30, 2007\n"
+   " libpng version 1.2.20rc4 - September 1, 2007\n"
 
 #define PNG_LIBPNG_VER_SONUM   0
 #define PNG_LIBPNG_VER_DLLNUM  13
@@ -377,7 +377,7 @@
 /* This should match the numeric part of the final component of
  * PNG_LIBPNG_VER_STRING, omitting any leading zero: */
 
-#define PNG_LIBPNG_VER_BUILD  3
+#define PNG_LIBPNG_VER_BUILD  4
 
 /* Release Status */
 #define PNG_LIBPNG_BUILD_ALPHA    1
@@ -504,9 +504,6 @@ PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_ystart[7];
 PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_yinc[7];
 PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_mask[7];
 PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_dsp_mask[7];
-#if defined(PNG_HAVE_MMX_COMBINE_ROW) || defined(PNG_OPTIMIZED_CODE_SUPPORTED)
-PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_width[7];
-#endif
 /* This isn't currently used.  If you need it, see png.c for more details.
 PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_height[7];
 */
@@ -1279,8 +1276,7 @@ struct png_struct_def
      png_size_t current_text_left;   /* how much text left to read in input */
      png_charp current_text;         /* current text chunk buffer */
      png_charp current_text_ptr;     /* current location in current_text */
-#  endif /* PNG_PROGRESSIVE_READ_SUPPORTED && PNG_TEXT_SUPPORTED */
-
+#  endif /* PNG_TEXT_SUPPORTED */
 #endif /* PNG_PROGRESSIVE_READ_SUPPORTED */
 
 #if defined(__TURBOC__) && !defined(_Windows) && !defined(__FLAT__)
@@ -1416,7 +1412,7 @@ struct png_struct_def
 /* This triggers a compiler error in png.c, if png.c and png.h
  * do not agree upon the version number.
  */
-typedef png_structp version_1_2_20rc3;
+typedef png_structp version_1_2_20rc4;
 
 typedef png_struct FAR * FAR * png_structpp;
 
diff --git a/pngconf.h b/pngconf.h
index d5ef5e3e7..01d757891 100644
--- a/pngconf.h
+++ b/pngconf.h
@@ -1,7 +1,7 @@
 
 /* pngconf.h - machine configurable file for libpng
  *
- * libpng version 1.2.20rc3 - August 30, 2007
+ * libpng version 1.2.20rc4 - September 1, 2007
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
@@ -600,10 +600,10 @@
 #endif /* PNG_READ_TRANSFORMS_SUPPORTED */
 
 #if !defined(PNG_NO_PROGRESSIVE_READ) && \
- !defined(PNG_PROGRESSIVE_READ_NOT_SUPPORTED) /* if you don't do progressive */
-#  define PNG_PROGRESSIVE_READ_SUPPORTED    /* reading.  This is not talking */
-#endif                              /* about interlacing capability!  You'll */
-             /* still have interlacing unless you change the following line: */
+ !defined(PNG_PROGRESSIVE_READ_SUPPORTED) /* if you don't do progressive   */
+#  define PNG_PROGRESSIVE_READ_SUPPORTED  /* reading.  This is not talking */
+#endif                            /* about interlacing capability!  You'll */
+           /* still have interlacing unless you change the following line: */
 
 #define PNG_READ_INTERLACING_SUPPORTED /* required in PNG-compliant decoders */
 
@@ -727,15 +727,8 @@
 #endif
 
 /* PNG_ASSEMBLER_CODE was enabled by default in version 1.2.0 
- * even when PNG_USE_PNGVCRD or PNG_USE_PNGGCCRD is not defined.
- *
- * PNG_NO_ASSEMBLER_CODE disables use of all assembler code,
- * and removes several functions from the API.
- *
- * PNG_NO_MMX_CODE disables the use of MMX code without changing the API.
- * When MMX code is off, then optimized C replacement functions are used,
- * if PNG_NO_OPTIMIZED_CODE is not enabled.  This was added in version
- * 1.2.19.
+ * and removed from version 1.2.20.  The following will be removed
+ * from libpng-1.4.0
 */
 
 #if defined(PNG_READ_SUPPORTED) && !defined(PNG_NO_OPTIMIZED_CODE)
@@ -762,6 +755,12 @@
 #    endif
 #  endif
 
+#  if (defined(__MWERKS__) && ((__MWERKS__ < 0x0900) || macintosh))
+#    if !defined(PNG_MMX_CODE_SUPPORTED) && !defined(PNG_NO_MMX_CODE)
+#      define PNG_NO_MMX_CODE
+#    endif
+#  endif
+
 #  if !defined(PNG_MMX_CODE_SUPPORTED) && !defined(PNG_NO_MMX_CODE)
 #    define PNG_MMX_CODE_SUPPORTED
 #  endif
@@ -774,14 +773,11 @@
 #  if !defined(PNG_USE_PNGGCCRD) && defined(PNG_MMX_CODE_SUPPORTED) && \
      !defined(PNG_USE_PNGVCRD)
 #    define PNG_USE_PNGGCCRD
-     /* If you are sure that you don't need thread safety and you are compiling
-        with PNG_USE_PNGCCRD for an MMX application, you can define this for
-        faster execution.  See pnggccrd.c.
 #    define PNG_THREAD_UNSAFE_OK
-     */
 #  endif
 
 #endif
+/* end of obsolete code to be removed from libpng-1.4.0 */
 
 #if !defined(PNG_1_0_X)
 #if !defined(PNG_NO_USER_MEM) && !defined(PNG_USER_MEM_SUPPORTED)
@@ -1489,53 +1485,6 @@ typedef z_stream FAR *  png_zstreamp;
 #  define PNG_ZBUF_SIZE 65536L
 #endif
 
-#ifdef PNG_READ_SUPPORTED
-/* Prior to libpng-1.0.9, this block was in pngasmrd.h */
-#if defined(PNG_INTERNAL)
-
-#if defined(PNG_USE_PNGGCCRD) || defined(PNG_USE_PNGVCRD)
-  /* Platform must be Pentium.  Makefile must assemble and load
-   * pnggccrd.c or  pngvcrd.c. MMX will be detected at run time and
-   * used if present.
-   */
-#  ifndef PNG_NO_MMX_COMBINE_ROW
-#    define PNG_HAVE_MMX_COMBINE_ROW
-#  endif
-#  ifndef PNG_NO_MMX_READ_INTERLACE
-#    define PNG_HAVE_MMX_READ_INTERLACE
-#  endif
-#  ifndef PNG_NO_MMX_READ_FILTER_ROW
-#    define PNG_HAVE_MMX_READ_FILTER_ROW
-#    ifndef PNG_NO_MMX_FILTER_SUB
-#      define PNG_MMX_READ_FILTER_SUB_SUPPORTED
-#    endif
-#    ifndef PNG_NO_MMX_FILTER_UP
-#      define PNG_MMX_READ_FILTER_UP_SUPPORTED
-#    endif
-#    ifndef PNG_NO_MMX_FILTER_AVG
-#      define PNG_MMX_READ_FILTER_AVG_SUPPORTED
-#    endif
-#    ifndef PNG_NO_MMX_FILTER_PAETH
-#      define PNG_MMX_READ_FILTER_PAETH_SUPPORTED
-#    endif
-#  endif
-  /* These are the default thresholds before the MMX code kicks in; if either
-   * rowbytes or bitdepth is below the threshold, plain C code is used.  These
-   * can be overridden at runtime via the png_set_mmx_thresholds() call in
-   * libpng 1.2.0 and later.  The values below were chosen by Intel.
-   */
-#  ifndef PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT
-#    define PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT  128  /*  >=  */
-#  endif
-#  ifndef PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT
-#    define PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT  9    /*  >=  */   
-#  endif
-#endif /* PNG_USE_PNGGCCRD || PNG_USE_PNGVCRD */
-/* - see pngvcrd.c or pnggccrd.c for info about what is currently enabled */
-
-#endif /* PNG_INTERNAL */
-#endif /* PNG_READ_SUPPORTED */
-
 /* Added at libpng-1.2.8 */
 #endif /* PNG_VERSION_INFO_ONLY */
 
diff --git a/pngerror.c b/pngerror.c
index 93ce47b61..2d03c56f1 100644
--- a/pngerror.c
+++ b/pngerror.c
@@ -1,7 +1,7 @@
 
 /* pngerror.c - stub functions for i/o and memory allocation
  *
- * Last changed in libpng 1.2.20 August 30, 2007
+ * Last changed in libpng 1.2.20 September 1, 2007
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
diff --git a/pnggccrd.c b/pnggccrd.c
index 37f3f33cd..7007ad9f0 100644
--- a/pnggccrd.c
+++ b/pnggccrd.c
@@ -1,6042 +1 @@
-
-/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
- *
- * For Intel/AMD x86 or x86-64 CPU (Pentium-MMX or later) and GNU C compiler.
- *
- * Last changed in libpng 1.2.19 August 19, 2007
- * For conditions of distribution and use, see copyright notice in png.h
- * Copyright (c) 1998 Intel Corporation
- * Copyright (c) 1999-2002,2007 Greg Roelofs
- * Copyright (c) 1998-2007 Glenn Randers-Pehrson
- *
- * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
- * Interface to libpng contributed by Gilles Vollant, 1999.
- * GNU C port by Greg Roelofs, 1999-2001.
- *
- * References:
- *
- *     http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
- *     http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
- *       [Intel's performance analysis of the MMX vs. non-MMX code;
- *        moved/deleted as of 2006, but text and some graphs still
- *        available via WayBack Machine at archive.org]
- *
- *     http://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html
- *     http://sam.zoy.org/blog/2007-04-13-shlib-with-non-pic-code-have-inline-assembly-and-pic-mix-well
- *     http://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html
- *     http://gcc.gnu.org/onlinedocs/gcc/Variable-Attributes.html
- *     http://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html
- *     AMD64 Architecture Programmer's Manual, volumes 1 and 5
- *       [http://www.amd.com/us-en/Processors/TechnicalResources/0,,30_182_739_7044,00.html]
- *     Intel 64 and IA-32 Software Developer's Manuals
- *       [http://developer.intel.com/products/processor/manuals/]
- *
- * png_read_filter_row_mmx_*() were converted in place with intel2gas 1.3.1:
- *
- *     intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
- *
- * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
- *
- * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
- * is required to assemble the newer asm instructions such as movq.  (Version
- * 2.5.2l.15 is definitely too old.)  See ftp://ftp.gnu.org/pub/gnu/binutils/ .
- */
-
-/*
- * PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
- * ===========================
- *
- * 19991006:
- *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
- *
- * 19991007:
- *  - additional optimizations (possible or definite):
- *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
- *     - write MMX code for 48-bit case (pixel_bytes == 6)
- *     - figure out what's up with 24-bit case (pixel_bytes == 3):
- *        why subtract 8 from width_mmx in the pass 4/5 case?
- *        (only width_mmx case) (near line 2335)
- *     x [DONE] replace pixel_bytes within each block with the true
- *        constant value (or are compilers smart enough to do that?)
- *     - rewrite all MMX interlacing code so it's aligned with
- *        the *beginning* of the row buffer, not the end.  This
- *        would not only allow one to eliminate half of the memory
- *        writes for odd passes (that is, pass == odd), it may also
- *        eliminate some unaligned-data-access exceptions (assuming
- *        there's a penalty for not aligning 64-bit accesses on
- *        64-bit boundaries).  The only catch is that the "leftover"
- *        pixel(s) at the end of the row would have to be saved,
- *        but there are enough unused MMX registers in every case,
- *        so this is not a problem.  A further benefit is that the
- *        post-MMX cleanup code (C code) in at least some of the
- *        cases could be done within the assembler block.
- *  x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
- *     inconsistent, and don't match the MMX Programmer's Reference
- *     Manual conventions anyway.  They should be changed to
- *     "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
- *     was lowest in memory (i.e., corresponding to a left pixel)
- *     and b7 is the byte that was highest (i.e., a right pixel).
- *
- * 19991016:
- *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
- *     want globals prefixed by underscores when referencing them--
- *     i.e., if the variable is const4, then refer to it as const4,
- *     not _const4.  This seems to be a djgpp-specific requirement.
- *     Also, such variables apparently *must* be declared outside
- *     of functions; neither static nor automatic variables work if
- *     defined within the scope of a single function, but both
- *     static and truly global (multi-module) variables work fine.
- *
- * 19991017:
- *  - replaced pixel_bytes in each png_memcpy() call with constant value for
- *     inlining (png_do_read_interlace() "non-MMX/modified C code" block)
- *
- * 19991023:
- *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
- *  - switched from string-concatenation-with-macros to cleaner method of
- *     renaming global variables for djgpp--i.e., always use prefixes in
- *     inlined assembler code (== strings) and conditionally rename the
- *     variables, not the other way around.  Hence _const4, _mask8_0, etc.
- *
- * 19991024:
- *  - fixed mmxsupport()/png_do_read_interlace() first-row bug
- *     This one was severely weird:  even though mmxsupport() doesn't touch
- *     ebx (where "row" pointer was stored), it nevertheless managed to zero
- *     the register (even in static/non-fPIC code--see below), which in turn
- *     caused png_do_read_interlace() to return prematurely on the first row of
- *     interlaced images (i.e., without expanding the interlaced pixels).
- *     Inspection of the generated assembly code didn't turn up any clues,
- *     although it did point at a minor optimization (i.e., get rid of
- *     mmx_supported_local variable and just use eax).  Possibly the CPUID
- *     instruction is more destructive than it looks?  (Not yet checked.)
- *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
- *     listings...  Apparently register spillage has to do with ebx, since
- *     it's used to index the global offset table.  Commenting it out of the
- *     input-reg lists in png_combine_row() eliminated compiler barfage, so
- *     ifdef'd with __PIC__ macro:  if defined, use a global for unmask
- *
- * 19991107:
- *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
- *     "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
- *
- * 19991120:
- *  - made "diff" variable (now "_dif") global to simplify conversion of
- *     filtering routines (running out of regs, sigh).  "diff" is still used
- *     in interlacing routines, however.
- *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
- *     macro determines which is used); original not yet tested.
- *
- * 20000213:
- *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
- *
- * 20000319:
- *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
- *     pass == 4 or 5, that caused visible corruption of interlaced images
- *
- * 20000623:
- *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
- *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
- *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
- *     Chuck Wilson supplied a patch involving dummy output registers.  See
- *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
- *     for the original (anonymous) SourceForge bug report.
- *
- * 20000706:
- *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
- *       pnggccrd.c: In function `png_combine_row':
- *       pnggccrd.c:525: more than 10 operands in `asm'
- *       pnggccrd.c:669: more than 10 operands in `asm'
- *       pnggccrd.c:828: more than 10 operands in `asm'
- *       pnggccrd.c:994: more than 10 operands in `asm'
- *       pnggccrd.c:1177: more than 10 operands in `asm'
- *     They are all the same problem and can be worked around by using the
- *     global _unmask variable unconditionally, not just in the -fPIC case.
- *     Reportedly earlier versions of gcc also have the problem with more than
- *     10 operands; they just don't report it.  Much strangeness ensues, etc.
- *
- * 20000729:
- *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
- *     MMX routine); began converting png_read_filter_row_mmx_sub()
- *  - to finish remaining sections:
- *     - clean up indentation and comments
- *     - preload local variables
- *     - add output and input regs (order of former determines numerical
- *        mapping of latter)
- *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
- *     - remove "$" from addressing of Shift and Mask variables [20000823]
- *
- * 20000731:
- *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
- *
- * 20000822:
- *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
- *     shared-library (-fPIC) version!  Code works just fine as part of static
- *     library.  Should have tested that sooner.
- *     ebx is getting clobbered again (explicitly this time); need to save it
- *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
- *
- * 20000823:
- *  - first section was trickiest; all remaining sections have ebx -> edx now.
- *     (-fPIC works again.)  Also added missing underscores to various Shift*
- *     and *Mask* globals and got rid of leading "$" signs.
- *
- * 20000826:
- *  - added visual separators to help navigate microscopic printed copies
- *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
- *     on png_read_filter_row_mmx_avg()
- *
- * 20000828:
- *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
- *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
- *     cleaned up/shortened in either routine, but functionality is complete
- *     and seems to be working fine.
- *
- * 20000829:
- *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
- *     as an input reg (with dummy output variables, etc.), then it *cannot*
- *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
- *     is simple enough...
- *
- * 20000914:
- *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
- *     correctly (but 48-bit RGB just fine)
- *
- * 20000916:
- *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
- *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
- *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
- *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
- *
- * 20010101:
- *  - added new png_init_mmx_flags() function (here only because it needs to
- *     call mmxsupport(), which should probably become global png_mmxsupport());
- *     modified other MMX routines to run conditionally (png_ptr->asm_flags)
- *
- * 20010103:
- *  - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
- *     and made it public; moved png_init_mmx_flags() to png.c as internal func
- *
- * 20010104:
- *  - removed dependency on png_read_filter_row_c() (C code already duplicated
- *     within MMX version of png_read_filter_row()) so no longer necessary to
- *     compile it into pngrutil.o
- *
- * 20010310:
- *  - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
- *
- * 20010808:
- *  - added PNG_THREAD_UNSAFE_OK around code using global variables [GR-P]
- *
- * 20011124:
- *  - fixed missing save of Eflag in png_mmx_support() [Maxim Sobolev]
- *
- * 20020304:
- *  - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
- *
- * 20020407:
- *  - fixed insufficient preservation of ebx register [Sami Farin]
- *
- * 20040724:
- *  - more tinkering with clobber list at lines 4529 and 5033 to get it to
- *     compile with gcc 3.4 [GR-P]
- *
- * 20040809:
- *  - added "rim" definitions for CONST4 and CONST6 [GR-P]
- *
- * 20060303:
- *  - added "OS2" to list of systems that don't need leading underscores [GR-P]
- *
- * 20060320:
- *  - made PIC-compliant [Christian Aichinger]
- *
- * 20070313:
- *  - finally applied Giuseppe Ghib�'s 64-bit patch of 20060803 (completely
- *     overlooked Dylan Alex Simon's similar patch of 20060414, oops...)
- *
- * 20070524:
- *  - fixed link failure caused by asm-only variables being optimized out
- *     (identified by Dimitri of Trolltech) with __attribute__((used)), which
- *     also gets rid of warnings => nuked ugly png_squelch_warnings() hack
- *  - dropped redundant ifdef
- *  - moved png_mmx_support() back up where originally intended (as in
- *     pngvcrd.c), using __attribute__((noinline)) in extra prototype
- *
- * 20070527:
- *  - revised png_combine_row() to reuse mask in lieu of external _unmask
- *  - moved 32-bit (RGBA) case to top of png_combine_row():  most common
- *  - just about ready to give up on x86-64 -fPIC mode; can't even access 16
- *     _mask*_* constants without triggering link error on shared library:
- *       /usr/bin/ld: pnggccrd.pic.o: relocation R_X86_64_32S against `a local
- *         symbol' can not be used when making a shared object; recompile with
- *         -fPIC
- *       pnggccrd.pic.o: could not read symbols: Bad value
- *       ("objdump -x pnggccrd.pic.o | grep rodata" to verify)
- *     [might be able to work around by doing within assembly code whatever
- *     -fPIC does, but given problems to date, seems like long shot...]
- *     [relevant ifdefs:  __x86_64__ && __PIC__ => C code only]
- *  - changed #if 0 to #ifdef PNG_CLOBBER_MMX_REGS_SUPPORTED in case gcc ever
- *     supports MMX regs (%mm0, etc.) in clobber list (not supported by gcc
- *     2.7.2.3, 2.91.66 (egcs 1.1.2), 3.x, or 4.1.2)
- *
- * 20070603:
- *  - revised png_combine_row() to use @GOTPCREL(%%rip) addressing on _c64
- *     struct of _mask*_* constants for x86-64 -fPIC; see sam.zoy.org link
- *     above for details
- *  - moved _const4 and _const6 into _c64 struct, renamed to _amask5_3_0 and
- *     _amask7_1_0, respectively
- *  - can't figure out how to use _c64._mask*_* vars within asm code, so still
- *     need single variables for non-x86-64/-fPIC half :-(
- *  - replaced various __PIC__ ifdefs with *_GOT_ebx macros
- *  - moved _LBCarryMask and _HBClearMask into _c64 struct
- *  - conditionally replaced _p*temp variables with %r11d-%r13d (via p*_TEMP
- *     and CLOBBER_r1*d macros)
- *
- * 20070604:
- *  - replaced all _ActiveMask and _ActiveMaskEnd with new _amask*_*_* consts
- *     (_amask naming convention:  numbers of 00-bytes, ff-bytes, 00-bytes)
- *    - _ActiveMask     // (10) // avg/paeth/sub; read-only; consts; movq/pand
- *       0x0000000000ffffffLL (bpp 3, avg)      _amask5_3_0
- *       0xffffffffffffffffLL (bpp 4, 6, avg)   _amask0_8_0
- *       0x000000000000ffffLL (bpp 2, avg)      _amask6_2_0
- *       0x0000000000ffffffLL (bpp 3, paeth)    _amask5_3_0
- *       0x00000000ffffffffLL (bpp 6, paeth)    _amask4_4_0
- *       0x00000000ffffffffLL (bpp 4, paeth)    _amask4_4_0
- *       0x00000000ffffffffLL (bpp 8, paeth)    _amask4_4_0
- *       0x0000ffffff000000LL (bpp 3, sub)      _amask2_3_3
- *       0x00000000ffff0000LL (bpp 2, sub)      _amask4_2_2
- *    - _ActiveMaskEnd  // (1)  // paeth only; read-only; const; pand
- *       0xffff000000000000LL (bpp 3, paeth)    _amask0_2_6
- *  - changed all "#if defined(__x86_64__) // later // && defined(__PIC__)"
- *     lines to "#ifdef PNG_x86_64_USE_GOTPCREL" for easier/safer testing
- *
- * 20070605:
- *  - merged PNG_x86_64_USE_GOTPCREL, non-PNG_x86_64_USE_GOTPCREL code via
- *     *MASK* and LOAD/RESTORE macros
- *
- * 20070607:
- *  - replaced all constant instances of _ShiftBpp, _ShiftRem with immediates
- *     (still have two shared cases in avg, sub routines)
- *
- * 20070609:
- *  - replaced remaining instances of _ShiftBpp, _ShiftRem with immediates
- *     (split sub and avg 4/6-bpp cases into separate blocks)
- *  - fixed paeth bug due to clobbered r11/r12/r13 regs
- *
- * 20070610:
- *  - made global "_dif" variable (avg/paeth/sub routines) local again (now
- *     "diff"--see 19991120 entry above), using register constraints
- *  - note that %ebp in clobber list doesn't actually work, at least for 32-bit
- *     version and gcc 4.1.2; must save and restore manually.  (Seems to work
- *     OK for 64-bit version and gcc 3.4.3, but gcc may not be using ebp/rbp
- *     in that case.)
- *  - started replacing direct _MMXLength accesses with register constraints
- *
- * 20070612:
- *  - continued replacing direct _MMXLength accesses with register constraints
- *
- * 20070613:
- *  - finished replacing direct _MMXLength accesses with register constraints;
- *     switched to local variable (and renamed back to MMXLength)
- *
- * 20070614:
- *  - fixed sub bpp = 1 bug
- *  - started replacing direct _FullLength accesses with register constraints
- *
- * 20070615:
- *  - fixed 64-bit paeth bpp 3 crash bug (misplaced LOAD_GOT_rbp)
- *  - fixed 64-bit paeth bpp 1/2 and cleanup-block crash bugs (misplaced
- *     RESTORE_r11_r12_r13)
- *  - slightly optimized avg/paeth cleanup blocks and paeth bpp 1/2 block
- *     (save/restore ebx only if needed)
- *  - continued replacing direct _FullLength accesses with register constraints
- *
- * 20070616:
- *  - finished replacing direct _FullLength accesses with register constraints
- *     (*ugly* conditional clobber-separator macros for avg and paeth, sigh)
- *
- * 20070618:
- *  - fixed misplaced PNG_THREAD_UNSAFE_OK endif (was missing LOAD_GOT_rbp/
- *     RESTORE_rbp in 32-bit thread-safe case)
- *  - changed all "ifdef *" to "if defined(*)" [GR-P]
- *
- * 20070619:
- *  - rearranged most bitdepth-related case statements to put most frequent
- *     cases at top (24-bit, 32-bit, 8-bit, rest)
- *
- * 20070623:
- *  - cleaned up png_debug() warnings/formatting
- *  - removed PNG_MMX_CODE_SUPPORTED ifdefs and added outer __GNUC__ ifdef
- *     (module no longer used by non-x86/non-GCC builds as of libpng 1.2.19)
- *  - removed single libpng-1.2.x PNG_DEBUG dependency on 1.0.x png_struct
- *     member (row_buf_size)
- *  - rearranged pass-related if-blocks in png_do_read_interlace() to put most
- *     frequent cases (4, 5) at top [GR-P suggestion]
- *
- * 20070624-29:
- *  - fixed 64-bit crash bug:  pointers -> rsi/rdi, not esi/edi (switched to
- *     %0/%1/%2/%3/%4 notation; eliminated size suffixes from relevant add/
- *     inc/sub/mov instructions; changed dummy vars to pointers)
- *     - png_combine_row()
- *     - png_do_read_interlace()
- *     - png_read_filter_row_mmx_avg()
- *     - png_read_filter_row_mmx_paeth()
- *     - png_read_filter_row_mmx_sub()
- *     - png_read_filter_row_mmx_up()
- *  - NOTE:  this fix makes use of the fact that modifying a 32-bit reg (e.g.,
- *     %%ebx) clears the top half of its corresponding 64-bit reg (%%rbx), so
- *     it's safe to mix 32-bit operations with 64-bit base/index addressing
- *     (see new PSI/PAX/PBX/PDX/PBP/etc. "pointer-register" macros); applies
- *     also to clobber lists
- *
- * 20070630:
- *  - cleaned up formatting, macros, minor png_read_filter_row_mmx_sub() 8-bpp
- *     register-usage inefficiency
- *  - fixed 32-bit png_do_read_interlace() bug (was using pointer size for
- *     64-bit dummy values)
- *
- * 20070703:
- *  - added check for (manual) PIC macro to fix OpenBSD crash bug
- *
- * 20070717:
- *  - fixed 48-bit png_combine_row() bug (was acting like 32-bit):  copy 6
- *     bytes per pixel, not 4, and use stride of 6, not 4, in the second loop
- *     of interlace processing of 48-bit pixels [GR-P]
- *
- * 20070722:
- *  - fixed 64-bit png_uint_32 bug with MMXLength/FullLength temp vars
- *
- * [still broken:  tops of all row-filter blocks (input/output constraints);
- *  shows up on 64-bit dynamic (-fPIC) version with -O2, especially if debug-
- *  printfs enabled, but at right edge of odd-width images even if disabled]
- *
- *
- * STILL TO DO:
- *  - fix final thread-unsafe code using stack vars and pointer? (paeth top,
- *     default, bottom only:  default, bottom already 5 reg constraints; could
- *     replace bpp with pointer and group bpp/patemp/pbtemp/pctemp in array)
- *  - fix ebp/no-reg-constraint inefficiency (avg/paeth/sub top)
- *  - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
- *  - write MMX code for 48-bit case (pixel_bytes == 6)
- *  - figure out what's up with 24-bit case (pixel_bytes == 3):
- *     why subtract 8 from width_mmx in the pass 4/5 case?  due to
- *     odd number of bytes? (only width_mmx case) (near line 2335)
- *  - rewrite all MMX interlacing code so it's aligned with beginning
- *     of the row buffer, not the end (see 19991007 for details)
- *  - add error messages to any remaining bogus default cases
- *  - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
- *  - try =r, etc., as reg constraints?  (would gcc use 64-bit ones on x86-64?)
- *  - need full, non-graphical, CRC-based test suite...  maybe autogenerate
- *     random data of various height/width/depth, compute CRCs, write (C
- *     funcs), read (asm/MMX), recompute CRCs, and compare?
- *  - write true x86-64 version using 128-bit "media instructions", %xmm0-15,
- *     and extra general-purpose registers
- */
-
-#if defined(__GNUC__)
-
-#define PNG_INTERNAL
-#include "png.h"
-
-
-/* for some inexplicable reason, gcc 3.3.5 on OpenBSD (and elsewhere?) does
- * *not* define __PIC__ when the -fPIC option is used, so we have to rely on
- * makefiles and whatnot to define the PIC macro explicitly */
-#if defined(PIC) && !defined(__PIC__)   // (this can/should move to pngconf.h)
-#  define __PIC__
-#endif
-
-#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && \
-    defined(PNG_USE_PNGGCCRD) && defined(PNG_MMX_CODE_SUPPORTED)
-
-/* if you want/need full thread-safety on x86-64 even when linking statically,
- * comment out the "&& defined(__PIC__)" part here: */
-#if defined(__x86_64__) && defined(__PIC__)
-#  define PNG_x86_64_USE_GOTPCREL            // GOTPCREL => full thread-safety
-#  define PNG_CLOBBER_x86_64_REGS_SUPPORTED  // works as of gcc 3.4.3 ...
-#endif
-
-int PNGAPI png_mmx_support(void);
-
-#if defined(PNG_USE_LOCAL_ARRAYS)
-static PNG_CONST int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
-static PNG_CONST int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
-static PNG_CONST int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
-#endif
-
-/* djgpp, Win32, Cygwin, and OS2 add their own underscores to global variables,
- * so define them without: */
-#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
-    defined(__OS2__) || defined(__APPLE__)
-#  define _mmx_supported  mmx_supported
-#  define _mask8_0        mask8_0
-#  define _mask16_1       mask16_1
-#  define _mask16_0       mask16_0
-#  define _mask24_2       mask24_2
-#  define _mask24_1       mask24_1
-#  define _mask24_0       mask24_0
-#  define _mask32_3       mask32_3
-#  define _mask32_2       mask32_2
-#  define _mask32_1       mask32_1
-#  define _mask32_0       mask32_0
-#  define _mask48_5       mask48_5
-#  define _mask48_4       mask48_4
-#  define _mask48_3       mask48_3
-#  define _mask48_2       mask48_2
-#  define _mask48_1       mask48_1
-#  define _mask48_0       mask48_0
-#  define _amask5_3_0     amask5_3_0
-#  define _amask7_1_0     amask7_1_0
-#  define _LBCarryMask    LBCarryMask
-#  define _HBClearMask    HBClearMask
-#  define _amask0_8_0     amask0_8_0
-#  define _amask6_2_0     amask6_2_0
-#  define _amask4_4_0     amask4_4_0
-#  define _amask0_2_6     amask0_2_6
-#  define _amask2_3_3     amask2_3_3
-#  define _amask4_2_2     amask4_2_2
-#  if defined(PNG_THREAD_UNSAFE_OK)
-#    define _patemp       patemp
-#    define _pbtemp       pbtemp
-#    define _pctemp       pctemp
-#  endif
-#endif // djgpp, Win32, Cygwin, OS2
-
-
-/* These constants are used in the inlined MMX assembly code. */
-
-typedef unsigned long long  ull;
-
-#if defined(PNG_x86_64_USE_GOTPCREL)
-static PNG_CONST struct {
-    //ull _mask_array[26];
-
-    // png_combine_row() constants:
-    ull _mask8_0;
-    ull _mask16_0, _mask16_1;
-    ull _mask24_0, _mask24_1, _mask24_2;
-    ull _mask32_0, _mask32_1, _mask32_2, _mask32_3;
-    ull _mask48_0, _mask48_1, _mask48_2, _mask48_3, _mask48_4, _mask48_5;
-
-    // png_do_read_interlace() constants:
-    ull _amask5_3_0, _amask7_1_0;  // was _const4 and _const6, respectively
-
-    // png_read_filter_row_mmx_avg() constants (also uses _amask5_3_0):
-    ull _LBCarryMask, _HBClearMask;
-    ull _amask0_8_0, _amask6_2_0;  // was ActiveMask for bpp 4/6 and 2 cases
-
-    // png_read_filter_row_mmx_paeth() constants (also uses _amask5_3_0):
-    ull _amask4_4_0, _amask0_2_6;  // was ActiveMask{,End} for bpp 6/4/8 and 3
-
-    // png_read_filter_row_mmx_sub() constants:
-    ull _amask2_3_3, _amask4_2_2;  // was ActiveMask for bpp 3 and 2 cases
-
-} _c64 __attribute__((used, aligned(8))) = {
-
-    // png_combine_row() constants:
-    0x0102040810204080LL, // _mask8_0      offset 0
-
-    0x1010202040408080LL, // _mask16_0     offset 8
-    0x0101020204040808LL, // _mask16_1     offset 16
-
-    0x2020404040808080LL, // _mask24_0     offset 24
-    0x0408080810101020LL, // _mask24_1     offset 32
-    0x0101010202020404LL, // _mask24_2     offset 40
-
-    0x4040404080808080LL, // _mask32_0     offset 48
-    0x1010101020202020LL, // _mask32_1     offset 56
-    0x0404040408080808LL, // _mask32_2     offset 64
-    0x0101010102020202LL, // _mask32_3     offset 72
-
-    0x4040808080808080LL, // _mask48_0     offset 80
-    0x2020202040404040LL, // _mask48_1     offset 88
-    0x1010101010102020LL, // _mask48_2     offset 96
-    0x0404080808080808LL, // _mask48_3     offset 104
-    0x0202020204040404LL, // _mask48_4     offset 112
-    0x0101010101010202LL, // _mask48_5     offset 120
-
-    // png_do_read_interlace() constants:
-    0x0000000000FFFFFFLL, // _amask5_3_0   offset 128  (bpp 3, avg/paeth) const4
-    0x00000000000000FFLL, // _amask7_1_0   offset 136                     const6
-
-    // png_read_filter_row_mmx_avg() constants:
-    0x0101010101010101LL, // _LBCarryMask  offset 144
-    0x7F7F7F7F7F7F7F7FLL, // _HBClearMask  offset 152
-    0xFFFFFFFFFFFFFFFFLL, // _amask0_8_0   offset 160  (bpp 4/6, avg)
-    0x000000000000FFFFLL, // _amask6_2_0   offset 168  (bpp 2,   avg)
-
-    // png_read_filter_row_mmx_paeth() constants:
-    0x00000000FFFFFFFFLL, // _amask4_4_0   offset 176  (bpp 6/4/8, paeth)
-    0xFFFF000000000000LL, // _amask0_2_6   offset 184  (bpp 3, paeth)   A.M.End
-
-    // png_read_filter_row_mmx_sub() constants:
-    0x0000FFFFFF000000LL, // _amask2_3_3   offset 192  (bpp 3, sub)
-    0x00000000FFFF0000LL, // _amask4_2_2   offset 200  (bpp 2, sub)
-
-};
-
-#define MASK8_0        "(%%rbp)"
-#define MASK16_0       "8(%%rbp)"
-#define MASK16_1       "16(%%rbp)"
-#define MASK24_0       "24(%%rbp)"
-#define MASK24_1       "32(%%rbp)"
-#define MASK24_2       "40(%%rbp)"
-#define MASK32_0       "48(%%rbp)"
-#define MASK32_1       "56(%%rbp)"
-#define MASK32_2       "64(%%rbp)"
-#define MASK32_3       "72(%%rbp)"
-#define MASK48_0       "80(%%rbp)"
-#define MASK48_1       "88(%%rbp)"
-#define MASK48_2       "96(%%rbp)"
-#define MASK48_3       "104(%%rbp)"
-#define MASK48_4       "112(%%rbp)"
-#define MASK48_5       "120(%%rbp)"
-#define AMASK5_3_0     "128(%%rbp)"
-#define AMASK7_1_0     "136(%%rbp)"
-#define LB_CARRY_MASK  "144(%%rbp)"
-#define HB_CLEAR_MASK  "152(%%rbp)"
-#define AMASK0_8_0     "160(%%rbp)"
-#define AMASK6_2_0     "168(%%rbp)"
-#define AMASK4_4_0     "176(%%rbp)"
-#define AMASK0_2_6     "184(%%rbp)"
-#define AMASK2_3_3     "192(%%rbp)"
-#define AMASK4_2_2     "200(%%rbp)"
-
-#else // !PNG_x86_64_USE_GOTPCREL
-
-static PNG_CONST ull _mask8_0  __attribute__((used, aligned(8))) = 0x0102040810204080LL;
-
-static PNG_CONST ull _mask16_1 __attribute__((used, aligned(8))) = 0x0101020204040808LL;
-static PNG_CONST ull _mask16_0 __attribute__((used, aligned(8))) = 0x1010202040408080LL;
-
-static PNG_CONST ull _mask24_2 __attribute__((used, aligned(8))) = 0x0101010202020404LL;
-static PNG_CONST ull _mask24_1 __attribute__((used, aligned(8))) = 0x0408080810101020LL;
-static PNG_CONST ull _mask24_0 __attribute__((used, aligned(8))) = 0x2020404040808080LL;
-
-static PNG_CONST ull _mask32_3 __attribute__((used, aligned(8))) = 0x0101010102020202LL;
-static PNG_CONST ull _mask32_2 __attribute__((used, aligned(8))) = 0x0404040408080808LL;
-static PNG_CONST ull _mask32_1 __attribute__((used, aligned(8))) = 0x1010101020202020LL;
-static PNG_CONST ull _mask32_0 __attribute__((used, aligned(8))) = 0x4040404080808080LL;
-
-static PNG_CONST ull _mask48_5 __attribute__((used, aligned(8))) = 0x0101010101010202LL;
-static PNG_CONST ull _mask48_4 __attribute__((used, aligned(8))) = 0x0202020204040404LL;
-static PNG_CONST ull _mask48_3 __attribute__((used, aligned(8))) = 0x0404080808080808LL;
-static PNG_CONST ull _mask48_2 __attribute__((used, aligned(8))) = 0x1010101010102020LL;
-static PNG_CONST ull _mask48_1 __attribute__((used, aligned(8))) = 0x2020202040404040LL;
-static PNG_CONST ull _mask48_0 __attribute__((used, aligned(8))) = 0x4040808080808080LL;
-
-// png_do_read_interlace() constants:
-static PNG_CONST ull _amask5_3_0  __attribute__((aligned(8))) = 0x0000000000FFFFFFLL;  // was _const4
-static PNG_CONST ull _amask7_1_0  __attribute__((aligned(8))) = 0x00000000000000FFLL;  // was _const6
-
-// png_read_filter_row_mmx_avg() constants:
-static PNG_CONST ull _LBCarryMask __attribute__((used, aligned(8))) = 0x0101010101010101LL;
-static PNG_CONST ull _HBClearMask __attribute__((used, aligned(8))) = 0x7f7f7f7f7f7f7f7fLL;
-static PNG_CONST ull _amask0_8_0  __attribute__((used, aligned(8))) = 0xFFFFFFFFFFFFFFFFLL;
-static PNG_CONST ull _amask6_2_0  __attribute__((used, aligned(8))) = 0x000000000000FFFFLL;
-
-// png_read_filter_row_mmx_paeth() constants:
-static PNG_CONST ull _amask4_4_0  __attribute__((used, aligned(8))) = 0x00000000FFFFFFFFLL;
-static PNG_CONST ull _amask0_2_6  __attribute__((used, aligned(8))) = 0xFFFF000000000000LL;
-
-// png_read_filter_row_mmx_sub() constants:
-static PNG_CONST ull _amask2_3_3  __attribute__((used, aligned(8))) = 0x0000FFFFFF000000LL;
-static PNG_CONST ull _amask4_2_2  __attribute__((used, aligned(8))) = 0x00000000FFFF0000LL;
-
-#define MASK8_0        "_mask8_0"
-#define MASK16_0       "_mask16_0"
-#define MASK16_1       "_mask16_1"
-#define MASK24_0       "_mask24_0"
-#define MASK24_1       "_mask24_1"
-#define MASK24_2       "_mask24_2"
-#define MASK32_0       "_mask32_0"
-#define MASK32_1       "_mask32_1"
-#define MASK32_2       "_mask32_2"
-#define MASK32_3       "_mask32_3"
-#define MASK48_0       "_mask48_0"
-#define MASK48_1       "_mask48_1"
-#define MASK48_2       "_mask48_2"
-#define MASK48_3       "_mask48_3"
-#define MASK48_4       "_mask48_4"
-#define MASK48_5       "_mask48_5"
-#define AMASK5_3_0     "_amask5_3_0"
-#define AMASK7_1_0     "_amask7_1_0"
-#define LB_CARRY_MASK  "_LBCarryMask"
-#define HB_CLEAR_MASK  "_HBClearMask"
-#define AMASK0_8_0     "_amask0_8_0"
-#define AMASK6_2_0     "_amask6_2_0"
-#define AMASK4_4_0     "_amask4_4_0"
-#define AMASK0_2_6     "_amask0_2_6"
-#define AMASK2_3_3     "_amask2_3_3"
-#define AMASK4_2_2     "_amask4_2_2"
-
-#endif // ?PNG_x86_64_USE_GOTPCREL
-
-
-#if defined(PNG_HAVE_MMX_READ_FILTER_ROW) || defined(PNG_HAVE_MMX_COMBINE_ROW)
-
-// this block is specific to png_read_filter_row_mmx_paeth() except for
-// LOAD_GOT_rbp and RESTORE_rbp, which are also used in png_combine_row()
-#if defined(PNG_x86_64_USE_GOTPCREL)
-#  define pa_TEMP                "%%r11d"
-#  define pb_TEMP                "%%r12d"
-#  define pc_TEMP                "%%r13d"
-#  if defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)  // works as of gcc 3.4.3 ...
-#    define SAVE_r11_r12_r13
-#    define RESTORE_r11_r12_r13
-#    define _CLOBBER_r11_r12_r13 ,"%r11", "%r12", "%r13"
-#    define CLOBBER_r11_r12_r13  "%r11", "%r12", "%r13"
-#  else // !PNG_CLOBBER_x86_64_REGS_SUPPORTED
-#    define SAVE_r11_r12_r13     "pushq %%r11  \n\t" \
-                                 "pushq %%r12  \n\t" \
-                                 "pushq %%r13  \n\t"  // "normally 0-extended"
-#    define RESTORE_r11_r12_r13  "popq  %%r13  \n\t" \
-                                 "popq  %%r12  \n\t" \
-                                 "popq  %%r11  \n\t"
-#    define _CLOBBER_r11_r12_r13
-#    define CLOBBER_r11_r12_r13
-#  endif
-#  define LOAD_GOT_rbp           "pushq %%rbp                        \n\t" \
-                                 "movq  _c64@GOTPCREL(%%rip), %%rbp  \n\t"
-#  define RESTORE_rbp            "popq  %%rbp                        \n\t"
-#else // 32-bit and/or non-PIC
-#  if defined(PNG_THREAD_UNSAFE_OK)
-     // These variables are used in png_read_filter_row_mmx_paeth() and would be
-     //   local variables if not for gcc-inline-assembly addressing limitations
-     //   (some apparently related to ELF format, others to CPU type).
-     //
-     // WARNING: Their presence defeats the thread-safety of libpng.
-     static int                     _patemp  __attribute__((used));
-     static int                     _pbtemp  __attribute__((used));
-     static int                     _pctemp  __attribute__((used));
-#    define pa_TEMP                "_patemp"
-#    define pb_TEMP                "_pbtemp"  // temp variables for
-#    define pc_TEMP                "_pctemp"  //  Paeth routine
-#    define SAVE_r11_r12_r13
-#    define RESTORE_r11_r12_r13
-#    define _CLOBBER_r11_r12_r13   // not using regs => not clobbering
-#    define CLOBBER_r11_r12_r13
-#  endif // PNG_THREAD_UNSAFE_OK
-#  define LOAD_GOT_rbp
-#  define RESTORE_rbp
-#endif
-
-#if defined(__x86_64__)
-#  define SAVE_ebp
-#  define RESTORE_ebp
-#  define _CLOBBER_ebp         ,"%ebp"
-#  define CLOBBER_ebp          "%ebp"
-#  define SAVE_FullLength      "movl %%eax, %%r15d  \n\t"
-#  define RESTORE_FullLength   "movl %%r15d, "     // may go into eax or ecx
-#  if defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)   // works as of gcc 3.4.3 ...
-#    define SAVE_r15
-#    define RESTORE_r15
-#    define _CLOBBER_r15       ,"%r15"
-#  else
-#    define SAVE_r15           "pushq %%r15  \n\t"
-#    define RESTORE_r15        "popq  %%r15  \n\t"
-#    define _CLOBBER_r15
-#  endif
-#  define PBP                  "%%rbp"             // regs used for 64-bit
-#  define PAX                  "%%rax"             //  pointers or in
-#  define PBX                  "%%rbx"             //  combination with
-#  define PCX                  "%%rcx"             //  64-bit pointer-regs
-#  define PDX                  "%%rdx"             //  (base/index pairs,
-#  define PSI                  "%%rsi"             //  add/sub/mov pairs)
-#  define CLEAR_BOTTOM_3_BITS  "and  $0xfffffffffffffff8, "
-#else
-#  define SAVE_ebp             "pushl %%ebp \n\t"  // clobber list doesn't work
-#  define RESTORE_ebp          "popl  %%ebp \n\t"  //  for %ebp on 32-bit; not
-#  define _CLOBBER_ebp                             //  clear why not
-#  define CLOBBER_ebp
-#  define SAVE_FullLength      "pushl %%eax \n\t"
-#  define RESTORE_FullLength   "popl "             // eax (avg) or ecx (paeth)
-#  define SAVE_r15
-#  define RESTORE_r15
-#  define _CLOBBER_r15
-#  define PBP                  "%%ebp"             // regs used for or in
-#  define PAX                  "%%eax"             //  combination with
-#  define PBX                  "%%ebx"             //  "normal," 32-bit
-#  define PCX                  "%%ecx"             //  pointers
-#  define PDX                  "%%edx"
-#  define PSI                  "%%esi"
-#  define CLEAR_BOTTOM_3_BITS  "and  $0xfffffff8, "
-#endif
-
-// CLOB_COMMA_ebx_ebp:  need comma ONLY if both CLOBBER_ebp and CLOBBER_GOT_ebx
-//                      have values, i.e., only if __x86_64__ AND !__PIC__
-#if defined(__x86_64__) && !defined(__PIC__)
-#  define CLOB_COMMA_ebx_ebp    , // clobbering both ebp and ebx => need comma
-#else
-#  define CLOB_COMMA_ebx_ebp
-#endif
-
-// CLOB_COMMA_ebX_r1X:  need comma UNLESS both CLOBBER_ebp and CLOBBER_GOT_ebx
-//                   are empty OR CLOBBER_r11_r12_r13 is empty--i.e., NO comma
-//                   if (!__x86_64__ AND __PIC__) OR !(PNG_x86_64_USE_GOTPCREL
-//                   AND PNG_CLOBBER_x86_64_REGS_SUPPORTED)   (double sigh...)
-#if (!defined(__x86_64__) && defined(__PIC__)) || \
-    !defined(PNG_x86_64_USE_GOTPCREL) || \
-    !defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)
-#  define CLOB_COMMA_ebX_r1X
-#else
-#  define CLOB_COMMA_ebX_r1X    , // clobbering (ebp OR ebx) AND r11_r12_r13
-#endif
-
-// CLOB_COLON_ebx_ebp:  need colon unless CLOBBER_ebp and CLOBBER_GOT_ebx are
-//                      BOTH empty--i.e., NO colon if (!__x86_64__ AND __PIC__)
-// CLOB_COLON_ebx_ebp_r1X:  if, in addition, CLOBBER_r11_r12_r13 is empty, then
-//                          no colon for Paeth blocks, either--i.e., NO colon
-//                          if !(PNG_x86_64_USE_GOTPCREL AND
-//                               PNG_CLOBBER_x86_64_REGS_SUPPORTED)
-#if (!defined(__x86_64__) && defined(__PIC__))
-#  define CLOB_COLON_ebx_ebp
-#  if !(defined(PNG_x86_64_USE_GOTPCREL) && \
-        defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED))
-#    define CLOB_COLON_ebx_ebp_r1X
-#  else
-#    define CLOB_COLON_ebx_ebp_r1X  : // clobbering ebp OR ebx OR r11_r12_r13
-#  endif
-#else
-#  define CLOB_COLON_ebx_ebp        : // clobbering ebp OR ebx
-#  define CLOB_COLON_ebx_ebp_r1X    : // clobbering ebp OR ebx OR r11_r12_r13
-#endif
-
-#endif // PNG_HAVE_MMX_READ_FILTER_ROW
-
-#if defined(__PIC__)  // macros to save, restore index to Global Offset Table
-#  if defined(__x86_64__)
-#    define SAVE_GOT_ebx     "pushq %%rbx \n\t"
-#    define RESTORE_GOT_ebx  "popq  %%rbx \n\t"
-#  else
-#    define SAVE_GOT_ebx     "pushl %%ebx \n\t"
-#    define RESTORE_GOT_ebx  "popl  %%ebx \n\t"
-#  endif
-#  define _CLOBBER_GOT_ebx   // explicitly saved, restored => not clobbered
-#  define CLOBBER_GOT_ebx
-#else
-#  define SAVE_GOT_ebx
-#  define RESTORE_GOT_ebx
-#  define _CLOBBER_GOT_ebx   ,"%ebx"
-#  define CLOBBER_GOT_ebx    "%ebx"
-#endif
-
-#if defined(PNG_HAVE_MMX_COMBINE_ROW) || defined(PNG_HAVE_MMX_READ_INTERLACE)
-#  define BPP2  2
-#  define BPP3  3  // bytes per pixel (a.k.a. pixel_bytes)
-#  define BPP4  4  // (defined only to help avoid cut-and-paste errors)
-#  define BPP6  6
-#  define BPP8  8
-#endif
-
-
-
-static int _mmx_supported = 2; // 0: no MMX; 1: MMX supported; 2: not tested
-
-/*===========================================================================*/
-/*                                                                           */
-/*                      P N G _ M M X _ S U P P O R T                        */
-/*                                                                           */
-/*===========================================================================*/
-
-// GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
-//             (2) all instructions compile with gcc 2.7.2.3 and later
-//           x (3) the function is moved down here to prevent gcc from
-//           x      inlining it in multiple places and then barfing be-
-//           x      cause the ".NOT_SUPPORTED" label is multiply defined
-//                  [need to retest with gcc 2.7.2.3]
-
-// GRR 20070524:  This declaration apparently is compatible with but supersedes
-//   the one in png.h; in any case, the generated object file is slightly
-//   smaller.  It is unnecessary with gcc 4.1.2, but gcc 2.x apparently
-//   replicated the ".NOT_SUPPORTED" label in each location the function was
-//   inlined, leading to compilation errors due to the "multiply defined"
-//   label.  Old workaround was to leave the function at the end of this
-//   file; new one (still testing) is to use a gcc-specific function attribute
-//   to prevent local inlining.
-int PNGAPI
-png_mmx_support(void) __attribute__((noinline));
-
-int PNGAPI
-png_mmx_support(void)
-{
-    int result;
-#if defined(PNG_MMX_CODE_SUPPORTED)  // superfluous, but what the heck
-    __asm__ __volatile__ (
-#if defined(__x86_64__)
-        "pushq %%rbx          \n\t"  // rbx gets clobbered by CPUID instruction
-        "pushq %%rcx          \n\t"  // so does rcx...
-        "pushq %%rdx          \n\t"  // ...and rdx (but rcx & rdx safe on Linux)
-        "pushfq               \n\t"  // save Eflag to stack
-        "popq %%rax           \n\t"  // get Eflag from stack into rax
-        "movq %%rax, %%rcx    \n\t"  // make another copy of Eflag in rcx
-        "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
-        "pushq %%rax          \n\t"  // save modified Eflag back to stack
-        "popfq                \n\t"  // restore modified value to Eflag reg
-        "pushfq               \n\t"  // save Eflag to stack
-        "popq %%rax           \n\t"  // get Eflag from stack
-        "pushq %%rcx          \n\t"  // save original Eflag to stack
-        "popfq                \n\t"  // restore original Eflag
-#else
-        "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
-        "pushl %%ecx          \n\t"  // so does ecx...
-        "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
-        "pushfl               \n\t"  // save Eflag to stack
-        "popl %%eax           \n\t"  // get Eflag from stack into eax
-        "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
-        "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
-        "pushl %%eax          \n\t"  // save modified Eflag back to stack
-        "popfl                \n\t"  // restore modified value to Eflag reg
-        "pushfl               \n\t"  // save Eflag to stack
-        "popl %%eax           \n\t"  // get Eflag from stack
-        "pushl %%ecx          \n\t"  // save original Eflag to stack
-        "popfl                \n\t"  // restore original Eflag
-#endif
-        "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
-        "jz 0f                \n\t"  // if same, CPUID instr. is not supported
-
-        "xorl %%eax, %%eax    \n\t"  // set eax to zero
-//      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
-        "cpuid                \n\t"  // get the CPU identification info
-        "cmpl $1, %%eax       \n\t"  // make sure eax return non-zero value
-        "jl 0f                \n\t"  // if eax is zero, MMX is not supported
-
-        "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
-        "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
-                                     // faster than the instruction "mov eax, 1"
-        "cpuid                \n\t"  // get the CPU identification info again
-        "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
-        "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
-        "jz 0f                \n\t"  // non-zero = yes, MMX IS supported
-
-        "movl $1, %%eax       \n\t"  // set return value to 1
-        "jmp  1f              \n\t"  // DONE:  have MMX support
-
-    "0:                       \n\t"  // .NOT_SUPPORTED: target label for jump instructions
-        "movl $0, %%eax       \n\t"  // set return value to 0
-    "1:                       \n\t"  // .RETURN: target label for jump instructions
-#if defined(__x86_64__)
-        "popq %%rdx           \n\t"  // restore rdx
-        "popq %%rcx           \n\t"  // restore rcx
-        "popq %%rbx           \n\t"  // restore rbx
-#else
-        "popl %%edx           \n\t"  // restore edx
-        "popl %%ecx           \n\t"  // restore ecx
-        "popl %%ebx           \n\t"  // restore ebx
-#endif
-
-//      "ret                  \n\t"  // DONE:  no MMX support
-                                     // (fall through to standard C "ret")
-
-        : "=a" (result)              // output list
-
-        :                            // any variables used on input (none)
-
-                                     // no clobber list
-//      , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually
-//      , "memory"   // if write to a variable gcc thought was in a reg
-//      , "cc"       // "condition codes" (flag bits)
-    );
-    _mmx_supported = result;
-#else
-    _mmx_supported = 0;
-#endif /* PNG_MMX_CODE_SUPPORTED */
-
-    return _mmx_supported;
-}
-
-
-/*===========================================================================*/
-/*                                                                           */
-/*                       P N G _ C O M B I N E _ R O W                       */
-/*                                                                           */
-/*===========================================================================*/
-
-#if defined(PNG_HAVE_MMX_COMBINE_ROW)
-
-/* Combines the row recently read in with the previous row.
-   This routine takes care of alpha and transparency if requested.
-   This routine also handles the two methods of progressive display
-   of interlaced images, depending on the mask value.
-   The mask value describes which pixels are to be combined with
-   the row.  The pattern always repeats every 8 pixels, so just 8
-   bits are needed.  A one indicates the pixel is to be combined; a
-   zero indicates the pixel is to be skipped.  This is in addition
-   to any alpha or transparency value associated with the pixel.
-   If you want all pixels to be combined, pass 0xff (255) in mask. */
-
-/* Use this routine for the x86 platform - it uses a faster MMX routine
-   if the machine supports MMX. */
-
-void /* PRIVATE */
-png_combine_row(png_structp png_ptr, png_bytep row, int mask)
-{
-   int dummy_value_a;    // fix 'forbidden register spilled' error
-   int dummy_value_c;
-   int dummy_value_d;
-   png_bytep dummy_value_S;
-   png_bytep dummy_value_D;
-
-   png_debug(1, "in png_combine_row (pnggccrd.c)\n");
-
-   if (_mmx_supported == 2) {
-#if !defined(PNG_1_0_X)
-       /* this should have happened in png_init_mmx_flags() already */
-       png_warning(png_ptr, "asm_flags may not have been initialized");
-#endif
-       png_mmx_support();
-   }
-
-   if (mask == 0xff)
-   {
-      png_debug(2,"mask == 0xff:  doing single png_memcpy()\n");
-      png_memcpy(row, png_ptr->row_buf + 1,
-       (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
-   }
-   else   /* (png_combine_row() is never called with mask == 0) */
-   {
-      switch (png_ptr->row_info.pixel_depth)
-      {
-         case 24:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-
-#if !defined(PNG_1_0_X)
-            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-#else
-            if (_mmx_supported)
-#endif
-            {
-               png_uint_32 len;
-               int diff;
-
-               srcptr = png_ptr->row_buf + 1;
-               dstptr = row;
-               len  = png_ptr->width & ~7;          // reduce to multiple of 8
-               diff = (int) (png_ptr->width & 7);   // amount lost
-
-               __asm__ __volatile__ (
-                  "not       %%edx            \n\t" // mask => unmask
-                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
-                  "not       %%edx            \n\t" // unmask => mask for later
-                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
-                  "punpcklbw %%mm7, %%mm7     \n\t"
-                  "punpcklwd %%mm7, %%mm7     \n\t"
-                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
-
-                  LOAD_GOT_rbp
-                  "movq   " MASK24_0 ", %%mm0 \n\t" // _mask24_0 -> mm0
-                  "movq   " MASK24_1 ", %%mm1 \n\t" // _mask24_1 -> mm1
-                  "movq   " MASK24_2 ", %%mm2 \n\t" // _mask24_2 -> mm2
-                  RESTORE_rbp
-
-                  "pand      %%mm7, %%mm0     \n\t"
-                  "pand      %%mm7, %%mm1     \n\t"
-                  "pand      %%mm7, %%mm2     \n\t"
-
-                  "pcmpeqb   %%mm6, %%mm0     \n\t"
-                  "pcmpeqb   %%mm6, %%mm1     \n\t"
-                  "pcmpeqb   %%mm6, %%mm2     \n\t"
-
-// preload        "movl      len, %%ecx       \n\t" // load length of line
-// preload        "movl      srcptr, %3       \n\t" // load source
-// preload        "movl      dstptr, %4       \n\t" // load dest
-
-                  "cmpl      $0, %%ecx        \n\t"
-                  "jz        mainloop24end    \n\t"
-
-                "mainloop24:                  \n\t"
-                  "movq      (%3), %%mm4      \n\t"
-                  "pand      %%mm0, %%mm4     \n\t"
-                  "movq      %%mm0, %%mm6     \n\t"
-                  "movq      (%4), %%mm7      \n\t"
-                  "pandn     %%mm7, %%mm6     \n\t"
-                  "por       %%mm6, %%mm4     \n\t"
-                  "movq      %%mm4, (%4)      \n\t"
-
-                  "movq      8(%3), %%mm5     \n\t"
-                  "pand      %%mm1, %%mm5     \n\t"
-                  "movq      %%mm1, %%mm7     \n\t"
-                  "movq      8(%4), %%mm6     \n\t"
-                  "pandn     %%mm6, %%mm7     \n\t"
-                  "por       %%mm7, %%mm5     \n\t"
-                  "movq      %%mm5, 8(%4)     \n\t"
-
-                  "movq      16(%3), %%mm6    \n\t"
-                  "pand      %%mm2, %%mm6     \n\t"
-                  "movq      %%mm2, %%mm4     \n\t"
-                  "movq      16(%4), %%mm7    \n\t"
-                  "pandn     %%mm7, %%mm4     \n\t"
-                  "por       %%mm4, %%mm6     \n\t"
-                  "movq      %%mm6, 16(%4)    \n\t"
-
-                  "add       $24, %3          \n\t" // inc by 24 bytes processed
-                  "add       $24, %4          \n\t"
-                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
-
-                  "ja        mainloop24       \n\t"
-
-                "mainloop24end:               \n\t"
-// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
-                  "movl      %%eax, %%ecx     \n\t"
-                  "cmpl      $0, %%ecx        \n\t"
-                  "jz        end24            \n\t"
-// preload        "movl      mask, %%edx      \n\t"
-                  "sall      $24, %%edx       \n\t" // make low byte, high byte
-
-                "secondloop24:                \n\t"
-                  "sall      %%edx            \n\t" // move high bit to CF
-                  "jnc       skip24           \n\t" // if CF = 0
-                  "movw      (%3), %%ax       \n\t"
-                  "movw      %%ax, (%4)       \n\t"
-                  "xorl      %%eax, %%eax     \n\t"
-                  "movb      2(%3), %%al      \n\t"
-                  "movb      %%al, 2(%4)      \n\t"
-
-                "skip24:                      \n\t"
-                  "add       $3, %3           \n\t"
-                  "add       $3, %4           \n\t"
-                  "decl      %%ecx            \n\t"
-                  "jnz       secondloop24     \n\t"
-
-                "end24:                       \n\t"
-                  "EMMS                       \n\t" // DONE
-
-                  : "=a" (dummy_value_a),           // output regs (dummy)
-                    "=d" (dummy_value_d),
-                    "=c" (dummy_value_c),
-                    "=S" (dummy_value_S),
-                    "=D" (dummy_value_D)
-
-                  : "0" (diff),        // eax       // input regs
-                    "1" (mask),        // edx
-                    "2" (len),         // ecx
-// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
-                    "3" (srcptr),      // esi/rsi
-                    "4" (dstptr)       // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                  : "%mm0", "%mm1", "%mm2"          // clobber list
-                  , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-               );
-            }
-            else /* not _mmx_supported - use modified C routine */
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = BPP3 * png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff*BPP3;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            } /* end of else (_mmx_supported) */
-
-            break;
-         }       /* end 24 bpp */
-
-         // formerly claimed to be most common case (combining 32-bit RGBA),
-         // but almost certainly less common than 24-bit RGB case
-         case 32:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-
-#if !defined(PNG_1_0_X)
-            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-#else
-            if (_mmx_supported)
-#endif
-            {
-               png_uint_32 len;
-               int diff;
-
-               srcptr = png_ptr->row_buf + 1;
-               dstptr = row;
-               len  = png_ptr->width & ~7;          // reduce to multiple of 8
-               diff = (int) (png_ptr->width & 7);   // amount lost
-
-               __asm__ __volatile__ (
-                  "not       %%edx            \n\t" // mask => unmask
-                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
-                  "not       %%edx            \n\t" // unmask => mask for later
-                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
-                  "punpcklbw %%mm7, %%mm7     \n\t"
-                  "punpcklwd %%mm7, %%mm7     \n\t"
-                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
-
-                  LOAD_GOT_rbp
-                  "movq   " MASK32_0 ", %%mm0 \n\t" // _mask32_0
-                  "movq   " MASK32_1 ", %%mm1 \n\t" // _mask32_1
-                  "movq   " MASK32_2 ", %%mm2 \n\t" // _mask32_2
-                  "movq   " MASK32_3 ", %%mm3 \n\t" // _mask32_3
-                  RESTORE_rbp
-
-                  "pand      %%mm7, %%mm0     \n\t"
-                  "pand      %%mm7, %%mm1     \n\t"
-                  "pand      %%mm7, %%mm2     \n\t"
-                  "pand      %%mm7, %%mm3     \n\t"
-
-                  "pcmpeqb   %%mm6, %%mm0     \n\t"
-                  "pcmpeqb   %%mm6, %%mm1     \n\t"
-                  "pcmpeqb   %%mm6, %%mm2     \n\t"
-                  "pcmpeqb   %%mm6, %%mm3     \n\t"
-
-// preload        "movl      len, %%ecx       \n\t" // load length of line
-// preload        "movl      srcptr, %3       \n\t" // load source
-// preload        "movl      dstptr, %4       \n\t" // load dest
-
-                  "cmpl      $0, %%ecx        \n\t" // lcr
-                  "jz        mainloop32end    \n\t"
-
-                "mainloop32:                  \n\t"
-                  "movq      (%3), %%mm4      \n\t"
-                  "pand      %%mm0, %%mm4     \n\t"
-                  "movq      %%mm0, %%mm6     \n\t"
-                  "movq      (%4), %%mm7      \n\t"
-                  "pandn     %%mm7, %%mm6     \n\t"
-                  "por       %%mm6, %%mm4     \n\t"
-                  "movq      %%mm4, (%4)      \n\t"
-
-                  "movq      8(%3), %%mm5     \n\t"
-                  "pand      %%mm1, %%mm5     \n\t"
-                  "movq      %%mm1, %%mm7     \n\t"
-                  "movq      8(%4), %%mm6     \n\t"
-                  "pandn     %%mm6, %%mm7     \n\t"
-                  "por       %%mm7, %%mm5     \n\t"
-                  "movq      %%mm5, 8(%4)     \n\t"
-
-                  "movq      16(%3), %%mm6    \n\t"
-                  "pand      %%mm2, %%mm6     \n\t"
-                  "movq      %%mm2, %%mm4     \n\t"
-                  "movq      16(%4), %%mm7    \n\t"
-                  "pandn     %%mm7, %%mm4     \n\t"
-                  "por       %%mm4, %%mm6     \n\t"
-                  "movq      %%mm6, 16(%4)    \n\t"
-
-                  "movq      24(%3), %%mm7    \n\t"
-                  "pand      %%mm3, %%mm7     \n\t"
-                  "movq      %%mm3, %%mm5     \n\t"
-                  "movq      24(%4), %%mm4    \n\t"
-                  "pandn     %%mm4, %%mm5     \n\t"
-                  "por       %%mm5, %%mm7     \n\t"
-                  "movq      %%mm7, 24(%4)    \n\t"
-
-                  "add       $32, %3          \n\t" // inc by 32 bytes processed
-                  "add       $32, %4          \n\t"
-                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
-                  "ja        mainloop32       \n\t"
-
-                "mainloop32end:               \n\t"
-// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
-                  "movl      %%eax, %%ecx     \n\t"
-                  "cmpl      $0, %%ecx        \n\t"
-                  "jz        end32            \n\t"
-// preload        "movl      mask, %%edx      \n\t"
-                  "sall      $24, %%edx       \n\t" // low byte => high byte
-
-                "secondloop32:                \n\t"
-                  "sall      %%edx            \n\t" // move high bit to CF
-                  "jnc       skip32           \n\t" // if CF = 0
-                  "movl      (%3), %%eax      \n\t"
-                  "movl      %%eax, (%4)      \n\t"
-
-                "skip32:                      \n\t"
-                  "add       $4, %3           \n\t"
-                  "add       $4, %4           \n\t"
-                  "decl      %%ecx            \n\t"
-                  "jnz       secondloop32     \n\t"
-
-                "end32:                       \n\t"
-                  "EMMS                       \n\t" // DONE
-
-                  : "=a" (dummy_value_a),           // output regs (dummy)
-                    "=d" (dummy_value_d),
-                    "=c" (dummy_value_c),
-                    "=S" (dummy_value_S),
-                    "=D" (dummy_value_D)
-
-                  : "0" (diff),        // eax       // input regs
-                    "1" (mask),        // edx
-                    "2" (len),         // ecx
-// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
-                    "3" (srcptr),      // esi/rsi
-                    "4" (dstptr)       // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                  : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
-                  , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-               );
-            }
-            else /* not _mmx_supported - use modified C routine */
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = BPP4 * png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff*BPP4;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            } /* end of else (_mmx_supported) */
-
-            break;
-         }       /* end 32 bpp */
-
-         case 8:        /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-
-#if !defined(PNG_1_0_X)
-            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-#else
-            if (_mmx_supported)
-#endif
-            {
-               png_uint_32 len;
-               int diff;
-
-               srcptr = png_ptr->row_buf + 1;
-               dstptr = row;
-               len  = png_ptr->width & ~7;          // reduce to multiple of 8
-               diff = (int) (png_ptr->width & 7);   // amount lost
-
-               __asm__ __volatile__ (
-                  "not       %%edx            \n\t" // mask => unmask
-                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
-                  "not       %%edx            \n\t" // unmask => mask for later
-                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
-                  "punpcklbw %%mm7, %%mm7     \n\t"
-                  "punpcklwd %%mm7, %%mm7     \n\t"
-                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
-
-                  LOAD_GOT_rbp
-                  "movq   " MASK8_0 ", %%mm0  \n\t" // _mask8_0 -> mm0
-                  RESTORE_rbp
-
-                  "pand      %%mm7, %%mm0     \n\t" // nonzero if keep byte
-                  "pcmpeqb   %%mm6, %%mm0     \n\t" // zeros->1s, v versa
-
-// preload        "movl      len, %%ecx       \n\t" // load length of line
-// preload        "movl      srcptr, %3       \n\t" // load source
-// preload        "movl      dstptr, %4       \n\t" // load dest
-
-                  "cmpl      $0, %%ecx        \n\t" // len == 0 ?
-                  "je        mainloop8end     \n\t"
-
-                "mainloop8:                   \n\t"
-                  "movq      (%3), %%mm4      \n\t" // *srcptr
-                  "pand      %%mm0, %%mm4     \n\t"
-                  "movq      %%mm0, %%mm6     \n\t"
-                  "pandn     (%4), %%mm6      \n\t" // *dstptr
-                  "por       %%mm6, %%mm4     \n\t"
-                  "movq      %%mm4, (%4)      \n\t"
-                  "add       $8, %3           \n\t" // inc by 8 bytes processed
-                  "add       $8, %4           \n\t"
-                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
-                  "ja        mainloop8        \n\t"
-
-                "mainloop8end:                \n\t"
-// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
-                  "movl      %%eax, %%ecx     \n\t"
-                  "cmpl      $0, %%ecx        \n\t"
-                  "jz        end8             \n\t"
-// preload        "movl      mask, %%edx      \n\t"
-                  "sall      $24, %%edx       \n\t" // make low byte, high byte
-
-                "secondloop8:                 \n\t"
-                  "sall      %%edx            \n\t" // move high bit to CF
-                  "jnc       skip8            \n\t" // if CF = 0
-                  "movb      (%3), %%al       \n\t"
-                  "movb      %%al, (%4)       \n\t"
-
-                "skip8:                       \n\t"
-                  "inc       %3               \n\t"
-                  "inc       %4               \n\t"
-                  "decl      %%ecx            \n\t"
-                  "jnz       secondloop8      \n\t"
-
-                "end8:                        \n\t"
-                  "EMMS                       \n\t" // DONE
-
-                  : "=a" (dummy_value_a),           // output regs (dummy)
-                    "=d" (dummy_value_d),
-                    "=c" (dummy_value_c),
-                    "=S" (dummy_value_S),
-                    "=D" (dummy_value_D)
-
-                  : "0" (diff),        // eax       // input regs
-                    "1" (mask),        // edx
-                    "2" (len),         // ecx
-// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
-                    "3" (srcptr),      // esi/rsi
-                    "4" (dstptr)       // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                  : "%mm0", "%mm4", "%mm6", "%mm7"  // clobber list
-#endif
-               );
-            }
-            else /* not _mmx_supported - use modified C routine */
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = len;  /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff /* *BPP1 */ ;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-
-            } /* end of else (_mmx_supported) */
-
-            break;
-         }       /* end 8 bpp */
-
-         case 1:        /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_inc, s_start, s_end;
-            int m;
-            int shift;
-            png_uint_32 i;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-               s_start = 0;
-               s_end = 7;
-               s_inc = 1;
-            }
-            else
-#endif
-            {
-               s_start = 7;
-               s_end = 0;
-               s_inc = -1;
-            }
-
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  int value;
-
-                  value = (*sp >> shift) & 0x1;
-                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }       /* end 1 bpp */
-
-         case 2:        /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_start, s_end, s_inc;
-            int m;
-            int shift;
-            png_uint_32 i;
-            int value;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-               s_start = 0;
-               s_end = 6;
-               s_inc = 2;
-            }
-            else
-#endif
-            {
-               s_start = 6;
-               s_end = 0;
-               s_inc = -2;
-            }
-
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  value = (*sp >> shift) & 0x3;
-                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }       /* end 2 bpp */
-
-         case 4:        /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_start, s_end, s_inc;
-            int m;
-            int shift;
-            png_uint_32 i;
-            int value;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-               s_start = 0;
-               s_end = 4;
-               s_inc = 4;
-            }
-            else
-#endif
-            {
-               s_start = 4;
-               s_end = 0;
-               s_inc = -4;
-            }
-
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  value = (*sp >> shift) & 0xf;
-                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }       /* end 4 bpp */
-
-         case 16:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-
-#if !defined(PNG_1_0_X)
-            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-#else
-            if (_mmx_supported)
-#endif
-            {
-               png_uint_32 len;
-               int diff;
-
-               srcptr = png_ptr->row_buf + 1;
-               dstptr = row;
-               len  = png_ptr->width & ~7;          // reduce to multiple of 8
-               diff = (int) (png_ptr->width & 7);   // amount lost
-
-               __asm__ __volatile__ (
-                  "not       %%edx            \n\t" // mask => unmask
-                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
-                  "not       %%edx            \n\t" // unmask => mask for later
-                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
-                  "punpcklbw %%mm7, %%mm7     \n\t"
-                  "punpcklwd %%mm7, %%mm7     \n\t"
-                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
-
-                  LOAD_GOT_rbp
-                  "movq   " MASK16_0 ", %%mm0 \n\t" // _mask16_0 -> mm0
-                  "movq   " MASK16_1 ", %%mm1 \n\t" // _mask16_1 -> mm1
-                  RESTORE_rbp
-
-                  "pand      %%mm7, %%mm0     \n\t"
-                  "pand      %%mm7, %%mm1     \n\t"
-
-                  "pcmpeqb   %%mm6, %%mm0     \n\t"
-                  "pcmpeqb   %%mm6, %%mm1     \n\t"
-
-// preload        "movl      len, %%ecx       \n\t" // load length of line
-// preload        "movl      srcptr, %3       \n\t" // load source
-// preload        "movl      dstptr, %4       \n\t" // load dest
-
-                  "cmpl      $0, %%ecx        \n\t"
-                  "jz        mainloop16end    \n\t"
-
-                "mainloop16:                  \n\t"
-                  "movq      (%3), %%mm4      \n\t"
-                  "pand      %%mm0, %%mm4     \n\t"
-                  "movq      %%mm0, %%mm6     \n\t"
-                  "movq      (%4), %%mm7      \n\t"
-                  "pandn     %%mm7, %%mm6     \n\t"
-                  "por       %%mm6, %%mm4     \n\t"
-                  "movq      %%mm4, (%4)      \n\t"
-
-                  "movq      8(%3), %%mm5     \n\t"
-                  "pand      %%mm1, %%mm5     \n\t"
-                  "movq      %%mm1, %%mm7     \n\t"
-                  "movq      8(%4), %%mm6     \n\t"
-                  "pandn     %%mm6, %%mm7     \n\t"
-                  "por       %%mm7, %%mm5     \n\t"
-                  "movq      %%mm5, 8(%4)     \n\t"
-
-                  "add       $16, %3          \n\t" // inc by 16 bytes processed
-                  "add       $16, %4          \n\t"
-                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
-                  "ja        mainloop16       \n\t"
-
-                "mainloop16end:               \n\t"
-// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
-                  "movl      %%eax, %%ecx     \n\t"
-                  "cmpl      $0, %%ecx        \n\t"
-                  "jz        end16            \n\t"
-// preload        "movl      mask, %%edx      \n\t"
-                  "sall      $24, %%edx       \n\t" // make low byte, high byte
-
-                "secondloop16:                \n\t"
-                  "sall      %%edx            \n\t" // move high bit to CF
-                  "jnc       skip16           \n\t" // if CF = 0
-                  "movw      (%3), %%ax       \n\t"
-                  "movw      %%ax, (%4)       \n\t"
-
-                "skip16:                      \n\t"
-                  "add       $2, %3           \n\t"
-                  "add       $2, %4           \n\t"
-                  "decl      %%ecx            \n\t"
-                  "jnz       secondloop16     \n\t"
-
-                "end16:                       \n\t"
-                  "EMMS                       \n\t" // DONE
-
-                  : "=a" (dummy_value_a),           // output regs (dummy)
-                    "=d" (dummy_value_d),
-                    "=c" (dummy_value_c),
-                    "=S" (dummy_value_S),
-                    "=D" (dummy_value_D)
-
-                  : "0" (diff),        // eax       // input regs
-                    "1" (mask),        // edx
-                    "2" (len),         // ecx
-// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
-                    "3" (srcptr),      // esi/rsi
-                    "4" (dstptr)       // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                  : "%mm0", "%mm1", "%mm4"          // clobber list
-                  , "%mm5", "%mm6", "%mm7"
-#endif
-               );
-            }
-            else /* not _mmx_supported - use modified C routine */
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = BPP2 * png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = BPP2 * len;   /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff*BPP2;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            } /* end of else (_mmx_supported) */
-
-            break;
-         }       /* end 16 bpp */
-
-         case 48:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-
-#if !defined(PNG_1_0_X)
-            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-#else
-            if (_mmx_supported)
-#endif
-            {
-               png_uint_32 len;
-               int diff;
-
-               srcptr = png_ptr->row_buf + 1;
-               dstptr = row;
-               len  = png_ptr->width & ~7;          // reduce to multiple of 8
-               diff = (int) (png_ptr->width & 7);   // amount lost
-
-               __asm__ __volatile__ (
-                  "not       %%edx            \n\t" // mask => unmask
-                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
-                  "not       %%edx            \n\t" // unmask => mask for later
-                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
-                  "punpcklbw %%mm7, %%mm7     \n\t"
-                  "punpcklwd %%mm7, %%mm7     \n\t"
-                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
-
-                  LOAD_GOT_rbp
-                  "movq   " MASK48_0 ", %%mm0 \n\t" // _mask48_0 -> mm0
-                  "movq   " MASK48_1 ", %%mm1 \n\t" // _mask48_1 -> mm1
-                  "movq   " MASK48_2 ", %%mm2 \n\t" // _mask48_2 -> mm2
-                  "movq   " MASK48_3 ", %%mm3 \n\t" // _mask48_3 -> mm3
-                  "movq   " MASK48_4 ", %%mm4 \n\t" // _mask48_4 -> mm4
-                  "movq   " MASK48_5 ", %%mm5 \n\t" // _mask48_5 -> mm5
-                  RESTORE_rbp
-
-                  "pand      %%mm7, %%mm0     \n\t"
-                  "pand      %%mm7, %%mm1     \n\t"
-                  "pand      %%mm7, %%mm2     \n\t"
-                  "pand      %%mm7, %%mm3     \n\t"
-                  "pand      %%mm7, %%mm4     \n\t"
-                  "pand      %%mm7, %%mm5     \n\t"
-
-                  "pcmpeqb   %%mm6, %%mm0     \n\t"
-                  "pcmpeqb   %%mm6, %%mm1     \n\t"
-                  "pcmpeqb   %%mm6, %%mm2     \n\t"
-                  "pcmpeqb   %%mm6, %%mm3     \n\t"
-                  "pcmpeqb   %%mm6, %%mm4     \n\t"
-                  "pcmpeqb   %%mm6, %%mm5     \n\t"
-
-// preload        "movl      len, %%ecx       \n\t" // load length of line
-// preload        "movl      srcptr, %3       \n\t" // load source
-// preload        "movl      dstptr, %4       \n\t" // load dest
-
-                  "cmpl      $0, %%ecx        \n\t"
-                  "jz        mainloop48end    \n\t"
-
-                "mainloop48:                  \n\t"
-                  "movq      (%3), %%mm7      \n\t"
-                  "pand      %%mm0, %%mm7     \n\t"
-                  "movq      %%mm0, %%mm6     \n\t"
-                  "pandn     (%4), %%mm6      \n\t"
-                  "por       %%mm6, %%mm7     \n\t"
-                  "movq      %%mm7, (%4)      \n\t"
-
-                  "movq      8(%3), %%mm6     \n\t"
-                  "pand      %%mm1, %%mm6     \n\t"
-                  "movq      %%mm1, %%mm7     \n\t"
-                  "pandn     8(%4), %%mm7     \n\t"
-                  "por       %%mm7, %%mm6     \n\t"
-                  "movq      %%mm6, 8(%4)     \n\t"
-
-                  "movq      16(%3), %%mm6    \n\t"
-                  "pand      %%mm2, %%mm6     \n\t"
-                  "movq      %%mm2, %%mm7     \n\t"
-                  "pandn     16(%4), %%mm7    \n\t"
-                  "por       %%mm7, %%mm6     \n\t"
-                  "movq      %%mm6, 16(%4)    \n\t"
-
-                  "movq      24(%3), %%mm7    \n\t"
-                  "pand      %%mm3, %%mm7     \n\t"
-                  "movq      %%mm3, %%mm6     \n\t"
-                  "pandn     24(%4), %%mm6    \n\t"
-                  "por       %%mm6, %%mm7     \n\t"
-                  "movq      %%mm7, 24(%4)    \n\t"
-
-                  "movq      32(%3), %%mm6    \n\t"
-                  "pand      %%mm4, %%mm6     \n\t"
-                  "movq      %%mm4, %%mm7     \n\t"
-                  "pandn     32(%4), %%mm7    \n\t"
-                  "por       %%mm7, %%mm6     \n\t"
-                  "movq      %%mm6, 32(%4)    \n\t"
-
-                  "movq      40(%3), %%mm7    \n\t"
-                  "pand      %%mm5, %%mm7     \n\t"
-                  "movq      %%mm5, %%mm6     \n\t"
-                  "pandn     40(%4), %%mm6    \n\t"
-                  "por       %%mm6, %%mm7     \n\t"
-                  "movq      %%mm7, 40(%4)    \n\t"
-
-                  "add       $48, %3          \n\t" // inc by 48 bytes processed
-                  "add       $48, %4          \n\t"
-                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
-
-                  "ja        mainloop48       \n\t"
-
-                "mainloop48end:               \n\t"
-// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
-                  "movl      %%eax, %%ecx     \n\t"
-                  "cmpl      $0, %%ecx        \n\t"
-                  "jz        end48            \n\t"
-// preload        "movl      mask, %%edx      \n\t"
-                  "sall      $24, %%edx       \n\t" // make low byte, high byte
-
-                "secondloop48:                \n\t"
-                  "sall      %%edx            \n\t" // move high bit to CF
-                  "jnc       skip48           \n\t" // if CF = 0
-                  "movl      (%3), %%eax      \n\t"
-                  "movl      %%eax, (%4)      \n\t"
-                  "movw      4(%3), %%ax      \n\t" // GR-P bugfix 20070717
-                  "movw      %%ax, 4(%4)      \n\t" // GR-P bugfix 20070717
-
-                "skip48:                      \n\t"
-                  "add       $6, %3           \n\t" // GR-P bugfix 20070717
-                  "add       $6, %4           \n\t" // GR-P bugfix 20070717
-                  "decl      %%ecx            \n\t"
-                  "jnz       secondloop48     \n\t"
-
-                "end48:                       \n\t"
-                  "EMMS                       \n\t" // DONE
-
-                  : "=a" (dummy_value_a),           // output regs (dummy)
-                    "=d" (dummy_value_d),
-                    "=c" (dummy_value_c),
-                    "=S" (dummy_value_S),
-                    "=D" (dummy_value_D)
-
-                  : "0" (diff),        // eax       // input regs
-                    "1" (mask),        // edx
-                    "2" (len),         // ecx
-// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
-                    "3" (srcptr),      // esi/rsi
-                    "4" (dstptr)       // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                  : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
-                  , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-               );
-            }
-            else /* not _mmx_supported - use modified C routine */
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = BPP6 * png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff*BPP6;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            } /* end of else (_mmx_supported) */
-
-            break;
-         }       /* end 48 bpp */
-
-         case 64:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            register png_uint_32 i;
-            png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
-              /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-            register int stride = BPP8 * png_pass_inc[png_ptr->pass];
-              /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-            register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
-              /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-            png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
-            int diff = (int) (png_ptr->width & 7); /* amount lost */
-            register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */
-
-            srcptr = png_ptr->row_buf + 1 + initial_val;
-            dstptr = row + initial_val;
-
-            for (i = initial_val; i < final_val; i += stride)
-            {
-               png_memcpy(dstptr, srcptr, rep_bytes);
-               srcptr += stride;
-               dstptr += stride;
-            }
-            if (diff)  /* number of leftover pixels:  3 for pngtest */
-            {
-               final_val += diff*BPP8;
-               for (; i < final_val; i += stride)
-               {
-                  if (rep_bytes > (int)(final_val-i))
-                     rep_bytes = (int)(final_val-i);
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-            }
-
-            break;
-         }       /* end 64 bpp */
-
-         default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
-         {
-            // ERROR:  SHOULD NEVER BE REACHED
-#if defined(PNG_DEBUG)
-            png_debug(1, "Internal libpng logic error (GCC "
-              "png_combine_row() pixel_depth)\n");
-#endif
-            break;
-         }
-      } /* end switch (png_ptr->row_info.pixel_depth) */
-
-   } /* end if (non-trivial mask) */
-
-} /* end png_combine_row() */
-
-#endif /* PNG_HAVE_MMX_COMBINE_ROW */
-
-
-
-
-/*===========================================================================*/
-/*                                                                           */
-/*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
-/*                                                                           */
-/*===========================================================================*/
-
-#if defined(PNG_READ_INTERLACING_SUPPORTED)
-#if defined(PNG_HAVE_MMX_READ_INTERLACE)
-
-/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
- * has taken place.  [GRR: what other steps come before and/or after?]
- */
-
-void /* PRIVATE */
-png_do_read_interlace(png_structp png_ptr)
-{
-   png_row_infop row_info = &(png_ptr->row_info);
-   png_bytep row = png_ptr->row_buf + 1;
-   int pass = png_ptr->pass;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-   png_uint_32 transformations = png_ptr->transformations;
-#endif
-
-   png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
-
-   if (_mmx_supported == 2) {
-#if !defined(PNG_1_0_X)
-       /* this should have happened in png_init_mmx_flags() already */
-       png_warning(png_ptr, "asm_flags may not have been initialized");
-#endif
-       png_mmx_support();
-   }
-
-   if (row != NULL && row_info != NULL)
-   {
-      png_uint_32 final_width;
-
-      final_width = row_info->width * png_pass_inc[pass];
-
-      switch (row_info->pixel_depth)
-      {
-         case 1:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_byte v;
-            png_uint_32 i;
-            int j;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 3);
-            dp = row + (png_size_t)((final_width - 1) >> 3);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (int)((row_info->width + 7) & 7);
-               dshift = (int)((final_width + 7) & 7);
-               s_start = 7;
-               s_end = 0;
-               s_inc = -1;
-            }
-            else
-#endif
-            {
-               sshift = 7 - (int)((row_info->width + 7) & 7);
-               dshift = 7 - (int)((final_width + 7) & 7);
-               s_start = 0;
-               s_end = 7;
-               s_inc = 1;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               v = (png_byte)((*sp >> sshift) & 0x1);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-         case 2:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_uint_32 i;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 2);
-            dp = row + (png_size_t)((final_width - 1) >> 2);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
-               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
-               s_start = 6;
-               s_end = 0;
-               s_inc = -2;
-            }
-            else
-#endif
-            {
-               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
-               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
-               s_start = 0;
-               s_end = 6;
-               s_inc = 2;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               png_byte v;
-               int j;
-
-               v = (png_byte)((*sp >> sshift) & 0x3);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-         case 4:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_uint_32 i;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 1);
-            dp = row + (png_size_t)((final_width - 1) >> 1);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
-               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
-               s_start = 4;
-               s_end = 0;
-               s_inc = -4;
-            }
-            else
-#endif
-            {
-               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
-               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
-               s_start = 0;
-               s_end = 4;
-               s_inc = 4;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               png_byte v;
-               int j;
-
-               v = (png_byte)((*sp >> sshift) & 0xf);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-       /*====================================================================*/
-
-         default: /* 8-bit or larger (this is where the routine is modified) */
-         {
-            png_bytep sptr, dp;
-            png_uint_32 i;
-            png_size_t pixel_bytes;
-            int width = (int)row_info->width;
-
-            pixel_bytes = (row_info->pixel_depth >> 3);
-
-            /* point sptr at the last pixel in the pre-expanded row: */
-            sptr = row + (width - 1) * pixel_bytes;
-
-            /* point dp at the last pixel position in the expanded row: */
-            dp = row + (final_width - 1) * pixel_bytes;
-
-            /* New code by Nirav Chhatrapati - Intel Corporation */
-
-#if !defined(PNG_1_0_X)
-            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
-#else
-            if (_mmx_supported)
-#endif
-            {
-               int dummy_value_c;        // fix 'forbidden register spilled'
-               png_bytep dummy_value_S;
-               png_bytep dummy_value_D;
-               png_bytep dummy_value_a;
-               png_bytep dummy_value_d;
-
-               //--------------------------------------------------------------
-               if (pixel_bytes == BPP3)
-               {
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) - 8;   // GRR:  huh?
-                     if (width_mmx < 0)
-                         width_mmx = 0;
-                     width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
-                     if (width_mmx)
-                     {
-                        // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
-                        // sptr points at last pixel in pre-expanded row
-                        // dp points at last pixel position in expanded row
-                        __asm__ __volatile__ (
-                           "sub  $3, %1             \n\t"
-                           "sub  $9, %2             \n\t"
-                                        // (png_pass_inc[pass] + 1)*pixel_bytes
-
-                        ".loop3_pass4:              \n\t"
-                           "movq (%1), %%mm0        \n\t" // x x 5 4 3 2 1 0
-                           "movq %%mm0, %%mm1       \n\t" // x x 5 4 3 2 1 0
-                           "movq %%mm0, %%mm2       \n\t" // x x 5 4 3 2 1 0
-                           "psllq $24, %%mm0        \n\t" // 4 3 2 1 0 z z z
-                           "pand (%3), %%mm1        \n\t" // z z z z z 2 1 0
-                           "psrlq $24, %%mm2        \n\t" // z z z x x 5 4 3
-                           "por %%mm1, %%mm0        \n\t" // 4 3 2 1 0 2 1 0
-                           "movq %%mm2, %%mm3       \n\t" // z z z x x 5 4 3
-                           "psllq $8, %%mm2         \n\t" // z z x x 5 4 3 z
-                           "movq %%mm0, (%2)        \n\t"
-                           "psrlq $16, %%mm3        \n\t" // z z z z z x x 5
-                           "pand (%4), %%mm3        \n\t" // z z z z z z z 5
-                           "por %%mm3, %%mm2        \n\t" // z z x x 5 4 3 5
-                           "sub  $6, %1             \n\t"
-                           "movd %%mm2, 8(%2)       \n\t"
-                           "sub  $12, %2            \n\t"
-                           "subl $2, %%ecx          \n\t"
-                           "jnz .loop3_pass4        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D),
-                             "=a" (dummy_value_a),
-                             "=d" (dummy_value_d)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp),            // edi/rdi
-#if defined(PNG_x86_64_USE_GOTPCREL)     // formerly _const4 and _const6:
-                             "3" (&_c64._amask5_3_0), // (0x0000000000FFFFFFLL)
-                             "4" (&_c64._amask7_1_0)  // (0x00000000000000FFLL)
-#else
-                             "3" (&_amask5_3_0),  // eax (0x0000000000FFFFFFLL)
-                             "4" (&_amask7_1_0)   // edx (0x00000000000000FFLL)
-#endif
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1"               // clobber list
-                           , "%mm2", "%mm3"
-#endif
-                        );
-                     }
-
-                     sptr -= width_mmx*BPP3;
-                     dp -= width_mmx*2*BPP3;
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-
-                        png_memcpy(v, sptr, BPP3);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           png_memcpy(dp, v, BPP3);
-                           dp -= BPP3;
-                        }
-                        sptr -= BPP3;
-                     }
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     __asm__ __volatile__ (
-                        "sub  $9, %2             \n\t"
-                                     // (png_pass_inc[pass] - 1)*pixel_bytes
-
-                     ".loop3_pass2:              \n\t"
-                        "movd (%1), %%mm0        \n\t" // x x x x x 2 1 0
-                        "pand (%3), %%mm0        \n\t" // z z z z z 2 1 0
-                        "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
-                        "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
-                        "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
-                        "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
-                        "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
-                        "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
-                        "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
-                        "movq %%mm0, 4(%2)       \n\t"
-                        "psrlq $16, %%mm0        \n\t" // z z 2 1 0 2 1 0
-                        "sub  $3, %1             \n\t"
-                        "movd %%mm0, (%2)        \n\t"
-                        "sub  $12, %2            \n\t"
-                        "decl %%ecx              \n\t"
-                        "jnz .loop3_pass2        \n\t"
-                        "EMMS                    \n\t" // DONE
-
-                        : "=c" (dummy_value_c),        // output regs (dummy)
-                          "=S" (dummy_value_S),
-                          "=D" (dummy_value_D),
-                          "=a" (dummy_value_a)
-
-                        : "0" (width),         // ecx  // input regs
-                          "1" (sptr),          // esi/rsi
-                          "2" (dp),            // edi/rdi
-#if defined(PNG_x86_64_USE_GOTPCREL)           // formerly _const4:
-                          "3" (&_c64._amask5_3_0)  // (0x0000000000FFFFFFLL)
-#else
-                          "3" (&_amask5_3_0)   // eax (0x0000000000FFFFFFLL)
-#endif
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-                        : "%mm0", "%mm1", "%mm2"       // clobber list
-#endif
-                     );
-                  }
-                  else if (width)  // && ((pass == 0) || (pass == 1))
-                  {
-                     __asm__ __volatile__ (
-                        "sub  $21, %2            \n\t"
-                                     // (png_pass_inc[pass] - 1)*pixel_bytes
-
-                     ".loop3_pass0:              \n\t"
-                        "movd (%1), %%mm0        \n\t" // x x x x x 2 1 0
-                        "pand (%3), %%mm0        \n\t" // z z z z z 2 1 0
-                        "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
-                        "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
-                        "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
-                        "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
-                        "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
-                        "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
-                        "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
-                        "movq %%mm0, %%mm3       \n\t" // 2 1 0 2 1 0 2 1
-                        "psllq $16, %%mm0        \n\t" // 0 2 1 0 2 1 z z
-                        "movq %%mm3, %%mm4       \n\t" // 2 1 0 2 1 0 2 1
-                        "punpckhdq %%mm0, %%mm3  \n\t" // 0 2 1 0 2 1 0 2
-                        "movq %%mm4, 16(%2)      \n\t"
-                        "psrlq $32, %%mm0        \n\t" // z z z z 0 2 1 0
-                        "movq %%mm3, 8(%2)       \n\t"
-                        "punpckldq %%mm4, %%mm0  \n\t" // 1 0 2 1 0 2 1 0
-                        "sub  $3, %1             \n\t"
-                        "movq %%mm0, (%2)        \n\t"
-                        "sub  $24, %2            \n\t"
-                        "decl %%ecx              \n\t"
-                        "jnz .loop3_pass0        \n\t"
-                        "EMMS                    \n\t" // DONE
-
-                        : "=c" (dummy_value_c),        // output regs (dummy)
-                          "=S" (dummy_value_S),
-                          "=D" (dummy_value_D),
-                          "=a" (dummy_value_a)
-
-                        : "0" (width),         // ecx  // input regs
-                          "1" (sptr),          // esi/rsi
-                          "2" (dp),            // edi/rdi
-#if defined(PNG_x86_64_USE_GOTPCREL)           // formerly _const4:
-                          "3" (&_c64._amask5_3_0)  // (0x0000000000FFFFFFLL)
-#else
-                          "3" (&_amask5_3_0)   // eax (0x0000000000FFFFFFLL)
-#endif
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                        : "%mm0", "%mm1", "%mm2"       // clobber list
-                        , "%mm3", "%mm4"
-#endif
-                     );
-                  }
-               } /* end of pixel_bytes == 3 */
-
-               //--------------------------------------------------------------
-               else if (pixel_bytes == BPP4)
-               {
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) ;
-                     width -= width_mmx;        // 0,1 pixels => 0,4 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $4, %1             \n\t"
-                           "sub  $12, %2            \n\t"
-
-                        ".loop4_pass4:              \n\t"
-                           "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
-                           "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
-                           "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
-                           "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
-                           "movq %%mm0, (%2)        \n\t"
-                           "sub  $8, %1             \n\t"
-                           "movq %%mm1, 8(%2)       \n\t"
-                           "sub  $16, %2            \n\t"
-                           "subl $2, %%ecx          \n\t"
-                           "jnz .loop4_pass4        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1"               // clobber list
-#endif
-                        );
-                     }
-
-                     sptr -= (width_mmx*BPP4 - BPP4); // sign fixed
-                     dp -= (width_mmx*2*BPP4 - BPP4); // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= BPP4;
-                        png_memcpy(v, sptr, BPP4);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= BPP4;
-                           png_memcpy(dp, v, BPP4);
-                        }
-                     }
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1);
-                     width -= width_mmx;        // 0,1 pixels => 0,4 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $4, %1             \n\t"
-                           "sub  $28, %2            \n\t"
-
-                        ".loop4_pass2:              \n\t"
-                           "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
-                           "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
-                           "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
-                           "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
-                           "movq %%mm0, (%2)        \n\t"
-                           "movq %%mm0, 8(%2)       \n\t"
-                           "movq %%mm1, 16(%2)      \n\t"
-                           "movq %%mm1, 24(%2)      \n\t"
-                           "sub  $8, %1             \n\t"
-                           "sub  $32, %2            \n\t"
-                           "subl $2, %%ecx          \n\t"
-                           "jnz .loop4_pass2        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1"               // clobber list
-#endif
-                        );
-                     }
-
-                     sptr -= (width_mmx*4 - 4); // sign fixed
-                     dp -= (width_mmx*16 - 4);  // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 4;
-                        png_memcpy(v, sptr, 4);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 4;
-                           png_memcpy(dp, v, 4);
-                        }
-                     }
-                  }
-                  else if (width)  // && ((pass == 0) || (pass == 1))
-                  {
-                     int width_mmx = ((width >> 1) << 1);
-                     width -= width_mmx;        // 0,1 pixels => 0,4 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $4, %1             \n\t"
-                           "sub  $60, %2            \n\t"
-
-                        ".loop4_pass0:              \n\t"
-                           "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
-                           "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
-                           "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
-                           "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
-                           "movq %%mm0, (%2)        \n\t"
-                           "movq %%mm0, 8(%2)       \n\t"
-                           "movq %%mm0, 16(%2)      \n\t"
-                           "movq %%mm0, 24(%2)      \n\t"
-                           "movq %%mm1, 32(%2)      \n\t"
-                           "movq %%mm1, 40(%2)      \n\t"
-                           "movq %%mm1, 48(%2)      \n\t"
-                           "sub  $8, %1             \n\t"
-                           "movq %%mm1, 56(%2)      \n\t"
-                           "sub  $64, %2            \n\t"
-                           "subl $2, %%ecx          \n\t"
-                           "jnz .loop4_pass0        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1"               // clobber list
-#endif
-                        );
-                     }
-
-                     sptr -= (width_mmx*4 - 4); // sign fixed
-                     dp -= (width_mmx*32 - 4);  // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 4;
-                        png_memcpy(v, sptr, 4);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 4;
-                           png_memcpy(dp, v, 4);
-                        }
-                     }
-                  }
-               } /* end of pixel_bytes == 4 */
-
-               //--------------------------------------------------------------
-               else if (pixel_bytes == 1)
-               {
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     int width_mmx = ((width >> 3) << 3);
-                     width -= width_mmx;        // 0-3 pixels => 0-3 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $7, %1             \n\t"
-                           "sub  $15, %2            \n\t"
-
-                        ".loop1_pass4:              \n\t"
-                           "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
-                           "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
-                           "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
-                           "punpckhbw %%mm1, %%mm1  \n\t" // 7 7 6 6 5 5 4 4
-                           "movq %%mm1, 8(%2)       \n\t"
-                           "sub  $8, %1             \n\t"
-                           "movq %%mm0, (%2)        \n\t"
-                           "sub  $16, %2            \n\t"
-                           "subl $8, %%ecx          \n\t"
-                           "jnz .loop1_pass4        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1"               // clobber list
-#endif
-                        );
-                     }
-
-                     sptr -= width_mmx;
-                     dp -= width_mmx*2;
-                     for (i = width; i; i--)
-                     {
-                        int j;
-
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           *dp-- = *sptr;
-                        }
-                        --sptr;
-                     }
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     int width_mmx = ((width >> 2) << 2);
-                     width -= width_mmx;        // 0-3 pixels => 0-3 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $3, %1             \n\t"
-                           "sub  $15, %2            \n\t"
-
-                        ".loop1_pass2:              \n\t"
-                           "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
-                           "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
-                           "movq %%mm0, %%mm1       \n\t" // 3 3 2 2 1 1 0 0
-                           "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
-                           "punpckhwd %%mm1, %%mm1  \n\t" // 3 3 3 3 2 2 2 2
-                           "movq %%mm0, (%2)        \n\t"
-                           "sub  $4, %1             \n\t"
-                           "movq %%mm1, 8(%2)       \n\t"
-                           "sub  $16, %2            \n\t"
-                           "subl $4, %%ecx          \n\t"
-                           "jnz .loop1_pass2        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1"               // clobber list
-#endif
-                        );
-                     }
-
-                     sptr -= width_mmx;
-                     dp -= width_mmx*4;
-                     for (i = width; i; i--)
-                     {
-                        int j;
-
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           *dp-- = *sptr;
-                        }
-                        --sptr;
-                     }
-                  }
-                  else if (width)  // && ((pass == 0) || (pass == 1))
-                  {
-                     int width_mmx = ((width >> 2) << 2);
-                     width -= width_mmx;        // 0-3 pixels => 0-3 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $3, %1             \n\t"
-                           "sub  $31, %2            \n\t"
-
-                        ".loop1_pass0:              \n\t"
-                           "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
-                           "movq %%mm0, %%mm1       \n\t" // x x x x 3 2 1 0
-                           "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
-                           "movq %%mm0, %%mm2       \n\t" // 3 3 2 2 1 1 0 0
-                           "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
-                           "movq %%mm0, %%mm3       \n\t" // 1 1 1 1 0 0 0 0
-                           "punpckldq %%mm0, %%mm0  \n\t" // 0 0 0 0 0 0 0 0
-                           "punpckhdq %%mm3, %%mm3  \n\t" // 1 1 1 1 1 1 1 1
-                           "movq %%mm0, (%2)        \n\t"
-                           "punpckhwd %%mm2, %%mm2  \n\t" // 3 3 3 3 2 2 2 2
-                           "movq %%mm3, 8(%2)       \n\t"
-                           "movq %%mm2, %%mm4       \n\t" // 3 3 3 3 2 2 2 2
-                           "punpckldq %%mm2, %%mm2  \n\t" // 2 2 2 2 2 2 2 2
-                           "punpckhdq %%mm4, %%mm4  \n\t" // 3 3 3 3 3 3 3 3
-                           "movq %%mm2, 16(%2)      \n\t"
-                           "sub  $4, %1             \n\t"
-                           "movq %%mm4, 24(%2)      \n\t"
-                           "sub  $32, %2            \n\t"
-                           "subl $4, %%ecx          \n\t"
-                           "jnz .loop1_pass0        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1", "%mm2"       // clobber list
-                           , "%mm3", "%mm4"
-#endif
-                        );
-                     }
-
-                     sptr -= width_mmx;
-                     dp -= width_mmx*8;
-                     for (i = width; i; i--)
-                     {
-                        int j;
-
-                       /* I simplified this part in version 1.0.4e
-                        * here and in several other instances where
-                        * pixel_bytes == 1  -- GR-P
-                        *
-                        * Original code:
-                        *
-                        * png_byte v[8];
-                        * png_memcpy(v, sptr, pixel_bytes);
-                        * for (j = 0; j < png_pass_inc[pass]; j++)
-                        * {
-                        *    png_memcpy(dp, v, pixel_bytes);
-                        *    dp -= pixel_bytes;
-                        * }
-                        * sptr -= pixel_bytes;
-                        *
-                        * Replacement code is in the next three lines:
-                        */
-
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           *dp-- = *sptr;
-                        }
-                        --sptr;
-                     }
-                  }
-               } /* end of pixel_bytes == 1 */
-
-               //--------------------------------------------------------------
-               else if (pixel_bytes == BPP2)
-               {
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) ;
-                     width -= width_mmx;        // 0,1 pixels => 0,2 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $2, %1             \n\t"
-                           "sub  $6, %2             \n\t"
-
-                        ".loop2_pass4:              \n\t"
-                           "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
-                           "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
-                           "sub  $4, %1             \n\t"
-                           "movq %%mm0, (%2)        \n\t"
-                           "sub  $8, %2             \n\t"
-                           "subl $2, %%ecx          \n\t"
-                           "jnz .loop2_pass4        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0"                       // clobber list
-#endif
-                        );
-                     }
-
-                     sptr -= (width_mmx*BPP2 - BPP2); // sign fixed
-                     dp -= (width_mmx*2*BPP2 - BPP2); // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= BPP2;
-                        png_memcpy(v, sptr, BPP2);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= BPP2;
-                           png_memcpy(dp, v, BPP2);
-                        }
-                     }
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) ;
-                     width -= width_mmx;        // 0,1 pixels => 0,2 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $2, %1             \n\t"
-                           "sub  $14, %2            \n\t"
-
-                        ".loop2_pass2:              \n\t"
-                           "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
-                           "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
-                           "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
-                           "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
-                           "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
-                           "movq %%mm0, (%2)        \n\t"
-                           "sub  $4, %1             \n\t"
-                           "movq %%mm1, 8(%2)       \n\t"
-                           "sub  $16, %2            \n\t"
-                           "subl $2, %%ecx          \n\t"
-                           "jnz .loop2_pass2        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1"               // clobber list
-#endif
-                        );
-                     }
-
-                     sptr -= (width_mmx*2 - 2); // sign fixed
-                     dp -= (width_mmx*8 - 2);   // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 2;
-                        png_memcpy(v, sptr, 2);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 2;
-                           png_memcpy(dp, v, 2);
-                        }
-                     }
-                  }
-                  else if (width)  // && ((pass == 0) || (pass == 1))
-                  {
-                     int width_mmx = ((width >> 1) << 1);
-                     width -= width_mmx;        // 0,1 pixels => 0,2 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $2, %1             \n\t"
-                           "sub  $30, %2            \n\t"
-
-                        ".loop2_pass0:              \n\t"
-                           "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
-                           "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
-                           "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
-                           "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
-                           "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
-                           "movq %%mm0, (%2)        \n\t"
-                           "movq %%mm0, 8(%2)       \n\t"
-                           "movq %%mm1, 16(%2)      \n\t"
-                           "sub  $4, %1             \n\t"
-                           "movq %%mm1, 24(%2)      \n\t"
-                           "sub  $32, %2            \n\t"
-                           "subl $2, %%ecx          \n\t"
-                           "jnz .loop2_pass0        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1"               // clobber list
-#endif
-                        );
-                     }
-
-                     sptr -= (width_mmx*2 - 2); // sign fixed
-                     dp -= (width_mmx*16 - 2);  // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 2;
-                        png_memcpy(v, sptr, 2);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 2;
-                           png_memcpy(dp, v, 2);
-                        }
-                     }
-                  }
-               } /* end of pixel_bytes == 2 */
-
-               //--------------------------------------------------------------
-               else if (pixel_bytes == BPP8)
-               {
-// GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?)
-                  // GRR NOTE:  no need to combine passes here!
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     // source is 8-byte RRGGBBAA
-                     // dest is 16-byte RRGGBBAA RRGGBBAA
-                     __asm__ __volatile__ (
-                        "sub  $8, %2             \n\t" // start of last block
-
-                     ".loop8_pass4:              \n\t"
-                        "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
-                        "movq %%mm0, (%2)        \n\t"
-                        "sub  $8, %1             \n\t"
-                        "movq %%mm0, 8(%2)       \n\t"
-                        "sub  $16, %2            \n\t"
-                        "decl %%ecx              \n\t"
-                        "jnz .loop8_pass4        \n\t"
-                        "EMMS                    \n\t" // DONE
-
-                        : "=c" (dummy_value_c),        // output regs (dummy)
-                          "=S" (dummy_value_S),
-                          "=D" (dummy_value_D)
-
-                        : "0" (width),         // ecx  // input regs
-                          "1" (sptr),          // esi/rsi
-                          "2" (dp)             // edi/rdi
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-                        : "%mm0"                       // clobber list
-#endif
-                     );
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     // source is 8-byte RRGGBBAA
-                     // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
-                     // (recall that expansion is _in place_:  sptr and dp
-                     //  both point at locations within same row buffer)
-                     __asm__ __volatile__ (
-                        "sub  $24, %2            \n\t" // start of last block
-
-                     ".loop8_pass2:              \n\t"
-                        "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
-                        "movq %%mm0, (%2)        \n\t"
-                        "movq %%mm0, 8(%2)       \n\t"
-                        "movq %%mm0, 16(%2)      \n\t"
-                        "sub  $8, %1             \n\t"
-                        "movq %%mm0, 24(%2)      \n\t"
-                        "sub  $32, %2            \n\t"
-                        "decl %%ecx              \n\t"
-                        "jnz .loop8_pass2        \n\t"
-                        "EMMS                    \n\t" // DONE
-
-                        : "=c" (dummy_value_c),        // output regs (dummy)
-                          "=S" (dummy_value_S),
-                          "=D" (dummy_value_D)
-
-                        : "0" (width),         // ecx  // input regs
-                          "1" (sptr),          // esi/rsi
-                          "2" (dp)             // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                        : "%mm0"                       // clobber list
-#endif
-                     );
-                  }
-                  else if (width)  // && ((pass == 0) || (pass == 1))
-                  {
-                     // source is 8-byte RRGGBBAA
-                     // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
-                     __asm__ __volatile__ (
-                        "sub  $56, %2            \n\t" // start of last block
-
-                     ".loop8_pass0:              \n\t"
-                        "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
-                        "movq %%mm0, (%2)        \n\t"
-                        "movq %%mm0, 8(%2)       \n\t"
-                        "movq %%mm0, 16(%2)      \n\t"
-                        "movq %%mm0, 24(%2)      \n\t"
-                        "movq %%mm0, 32(%2)      \n\t"
-                        "movq %%mm0, 40(%2)      \n\t"
-                        "movq %%mm0, 48(%2)      \n\t"
-                        "sub  $8, %1             \n\t"
-                        "movq %%mm0, 56(%2)      \n\t"
-                        "sub  $64, %2            \n\t"
-                        "decl %%ecx              \n\t"
-                        "jnz .loop8_pass0        \n\t"
-                        "EMMS                    \n\t" // DONE
-
-                        : "=c" (dummy_value_c),        // output regs (dummy)
-                          "=S" (dummy_value_S),
-                          "=D" (dummy_value_D)
-
-                        : "0" (width),         // ecx  // input regs
-                          "1" (sptr),          // esi/rsi
-                          "2" (dp)             // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                        : "%mm0"                       // clobber list
-#endif
-                     );
-                  }
-               } /* end of pixel_bytes == 8 */
-
-               //--------------------------------------------------------------
-               else if (pixel_bytes == BPP6)   // why no MMX for this case?
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, BPP6);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, BPP6);
-                        dp -= BPP6;
-                     }
-                     sptr -= BPP6;
-                  }
-               } /* end of pixel_bytes == 6 */
-
-               //--------------------------------------------------------------
-               else
-               {
-                  // ERROR:  SHOULD NEVER BE REACHED
-#if defined(PNG_DEBUG)
-                  png_debug(1, "Internal libpng logic error (GCC "
-                    "png_do_read_interlace() _mmx_supported)\n");
-#endif
-               }
-
-            } // end of _mmx_supported ========================================
-
-            else /* MMX not supported:  use modified C code - takes advantage
-                  *   of inlining of png_memcpy for a constant */
-            {
-               if (pixel_bytes == BPP3)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, BPP3);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, BPP3);
-                        dp -= BPP3;
-                     }
-                     sptr -= BPP3;
-                  }
-               }
-               else if (pixel_bytes == BPP4)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, BPP4);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-#if defined(PNG_DEBUG) && defined(PNG_1_0_X)  // row_buf_size gone in 1.2.x
-                        if (dp < row || dp+3 > row+png_ptr->row_buf_size)
-                        {
-                           printf("dp out of bounds: row=%10p, dp=%10p, "
-                             "rp=%10p\n", row, dp, row+png_ptr->row_buf_size);
-                           printf("row_buf_size=%lu\n", png_ptr->row_buf_size);
-                        }
-#endif
-                        png_memcpy(dp, v, BPP4);
-                        dp -= BPP4;
-                     }
-                     sptr -= BPP4;
-                  }
-               }
-               else if (pixel_bytes == 1)
-               {
-                  for (i = width; i; i--)
-                  {
-                     int j;
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        *dp-- = *sptr;
-                     }
-                     --sptr;
-                  }
-               }
-               else if (pixel_bytes == BPP2)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, BPP2);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, BPP2);
-                        dp -= BPP2;
-                     }
-                     sptr -= BPP2;
-                  }
-               }
-               else if (pixel_bytes == BPP6)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, BPP6);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, BPP6);
-                        dp -= BPP6;
-                     }
-                     sptr -= BPP6;
-                  }
-               }
-               else if (pixel_bytes == BPP8)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, BPP8);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, BPP8);
-                        dp -= BPP8;
-                     }
-                     sptr -= BPP8;
-                  }
-               }
-               else
-               {
-                  // ERROR:  SHOULD NEVER BE REACHED
-#if defined(PNG_DEBUG)
-                  png_debug(1, "Internal libpng logic error (GCC "
-                    "png_do_read_interlace() !_mmx_supported)\n");
-#endif
-               }
-
-            } /* end if (MMX not supported) */
-            break;
-         } /* end default (8-bit or larger) */
-      } /* end switch (row_info->pixel_depth) */
-
-      row_info->width = final_width;
-
-      row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
-   }
-
-} /* end png_do_read_interlace() */
-
-#endif /* PNG_HAVE_MMX_READ_INTERLACE */
-#endif /* PNG_READ_INTERLACING_SUPPORTED */
-
-
-
-#if defined(PNG_HAVE_MMX_READ_FILTER_ROW)
-#if defined(PNG_MMX_READ_FILTER_AVG_SUPPORTED)
-
-//===========================================================================//
-//                                                                           //
-//           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           //
-//                                                                           //
-//===========================================================================//
-
-// Optimized code for PNG Average filter decoder
-
-static void /* PRIVATE */
-png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
-                            png_bytep prev_row)
-{
-   unsigned FullLength, MMXLength;  // png_uint_32 is actually 64-bit on x86-64
-   int bpp;
-   int dummy_value_a;
-   int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
-   int dummy_value_d;
-   png_bytep dummy_value_S;
-   png_bytep dummy_value_D;
-   int diff; //     __attribute__((used));
-
-   bpp = (row_info->pixel_depth + 7) >> 3;  // calc number of bytes per pixel
-   FullLength = row_info->rowbytes;         // number of bytes to filter
-
-   __asm__ __volatile__ (
-   "avg_top:                       \n\t"
-      SAVE_GOT_ebx
-      SAVE_r15
-      SAVE_ebp
-      // initialize address pointers and offset
-//pre "movl row, %5                \n\t" // edi/rdi:  ptr to Avg(x)
-      "xorl %%ebx, %%ebx           \n\t" // ebx:  x
-//pre "movl prev_row, %4           \n\t" // esi/rsi:  ptr to Prior(x)
-      "mov  %5, " PDX "            \n\t" // copy of row ptr...
-//pre "subl bpp, " PDX "           \n\t" // (bpp is preloaded into ecx)
-      "sub  " PCX "," PDX "        \n\t" // edx/rdx:  ptr to Raw(x-bpp)
-//pre "movl FullLength, %%eax      \n\t" // bring in via eax...
-      SAVE_FullLength                    // ...but store for later use
-      "xorl %%eax, %%eax           \n\t"
-
-      // Compute the Raw value for the first bpp bytes
-      //    Raw(x) = Avg(x) + (Prior(x)/2)
-   "avg_rlp:                       \n\t"
-      "movb (%4," PBX ",), %%al    \n\t" // load al with Prior(x)
-      "incl %%ebx                  \n\t"
-      "shrb %%al                   \n\t" // divide by 2
-      "addb -1(%5," PBX ",), %%al  \n\t" // add Avg(x); -1 to offset inc ebx
-//pre "cmpl bpp, %%ebx             \n\t" // (bpp is preloaded into ecx)
-      "cmpl %%ecx, %%ebx           \n\t"
-      "movb %%al, -1(%5," PBX ",)  \n\t" // write Raw(x); -1 to offset inc ebx
-      "jb avg_rlp                  \n\t" // mov does not affect flags
-
-      // get # of bytes to alignment (32-bit mask _would_ be good enough
-      // [computing delta], but 32-bit ops are zero-extended on 64-bit, argh)
-      // (if swapped edx and ebp, could do 8-bit or 16-bit mask...FIXME?)
-      "mov  %5, " PBP "            \n\t" // take start of row
-      "add  " PBX "," PBP "        \n\t" // add bpp
-      "add  $0xf, " PBP "          \n\t" // add 7+8 to incr past alignment bdry
-//    "andl $0xfffffff8, %%ebp     \n\t" // mask to alignment boundary (32-bit!)
-      CLEAR_BOTTOM_3_BITS  PBP    "\n\t" // mask to alignment boundary
-      "sub  %5, " PBP "            \n\t" // subtract row ptr again => ebp =
-      "jz avg_go                   \n\t" //  target value of ebx at alignment
-
-      "xorl %%ecx, %%ecx           \n\t"
-
-      // fix alignment
-      // Compute the Raw value for the bytes up to the alignment boundary
-      //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
-   "avg_lp1:                       \n\t"
-      "xorl %%eax, %%eax           \n\t"
-      "movb (%4," PBX ",), %%cl    \n\t" // load cl with Prior(x)
-      "movb (" PDX "," PBX ",), %%al \n\t" // load al with Raw(x-bpp)
-      "addw %%cx, %%ax             \n\t"
-      "incl %%ebx                  \n\t"
-      "shrw %%ax                   \n\t" // divide by 2
-      "addb -1(%5," PBX ",), %%al  \n\t" // add Avg(x); -1 to offset inc ebx
-      "cmpl %%ebp, %%ebx           \n\t" // check if at alignment boundary
-      "movb %%al, -1(%5," PBX ",)  \n\t" // write Raw(x); -1 to offset inc ebx
-      "jb avg_lp1                  \n\t" // repeat until at alignment boundary
-
-   "avg_go:                        \n\t"
-      RESTORE_FullLength "%%eax    \n\t" // FullLength -> eax
-      "movl %%eax, %%ecx           \n\t" // copy -> ecx
-      "subl %%ebx, %%eax           \n\t" // subtract alignment fix
-      "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
-      "subl %%eax, %%ecx           \n\t" // sub over-bytes from original length
-//out "movl %%ecx, MMXLength       \n\t"
-      "movl %%ebp, %%eax           \n\t" // ebp = diff, but no reg constraint(?)
-      RESTORE_ebp                        //  (could swap ebp and edx functions)
-      RESTORE_r15
-      RESTORE_GOT_ebx
-
-// "There is no way for you to specify that an input operand is modified
-// without also specifying it as an output operand."  [makes sense]
-
-// "Unless an output operand has the `&' constraint modifier, GCC may
-// allocate it in the same register as an unrelated input operand, on the
-// assumption the inputs are consumed before the outputs are produced."
-// [trying to _force_ this]
-
-// "`='   Means that this operand is write-only for this instruction:
-//        the previous value is discarded and replaced by output data."
-//        [operand == variable name, presumably]
-
-      // output regs
-      // these are operands 0-1 (originally 0-3):
-      : "=c" (MMXLength),      // %0 -> %0
-        "=a" (diff)            // %3 -> %1
-//      "=S" (dummy_value_S),  // %1 -> GONE
-//      "=D" (dummy_value_D),  // %2 -> GONE
-
-      // input regs
-      // these are operands 2-5 (originally 4-7); two of their constraints say
-      // they must go in same places as operands 0-1 (originally 0-3) above:
-      : "0" (bpp),         // %4 -> %2 ecx
-        "1" (FullLength),  // %7 -> %3 eax
-        "S" (prev_row),    // %5 -> %4 esi/rsi
-        "D" (row)          // %6 -> %5 edi/rdi
-
-      : "%edx"                           // clobber list
-        _CLOBBER_r15
-        _CLOBBER_ebp
-        _CLOBBER_GOT_ebx
-   );
-
-   // now do the math for the rest of the row
-   switch (bpp)
-   {
-      case 3:
-      {
-//       _ShiftBpp = 24;    // == 3 * 8
-//       _ShiftRem = 40;    // == 64 - 24
-
-         __asm__ __volatile__ (
-            // re-init address pointers and offset
-            LOAD_GOT_rbp
-            "movq " AMASK5_3_0 ", %%mm7    \n\t" // _amask5_3_0 -> mm7
-// preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
-                                                 //  alignment boundary
-            "movq " LB_CARRY_MASK ", %%mm5 \n\t" // [interleave for parallel.?]
-// preload  "movl  row, %1                 \n\t" // edi:  Avg(x)
-            "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
-// preload  "movl  prev_row, %0            \n\t" // esi:  Prior(x)
-            RESTORE_rbp
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq  -8(%1," PCX ",), %%mm2 \n\t"// load previous aligned 8 bytes
-                                               // (correct pos. in loop below)
-         "avg_3lp:                        \n\t"
-            "movq  (%1," PCX ",), %%mm0   \n\t" // load mm0 with Avg(x)
-            "movq  %%mm5, %%mm3           \n\t"
-            "psrlq $40, %%mm2             \n\t" // correct position Raw(x-bpp)
-                                                // data
-            "movq  (%0," PCX ",), %%mm1   \n\t" // load mm1 with Prior(x)
-            "movq  %%mm7, %%mm6           \n\t"
-            "pand  %%mm1, %%mm3           \n\t" // get lsb for each prevrow byte
-            "psrlq $1, %%mm1              \n\t" // divide prev_row bytes by 2
-            "pand  %%mm4, %%mm1           \n\t" // clear invalid bit 7 of each
-                                                // byte
-            "paddb %%mm1, %%mm0           \n\t" // add (Prev_row/2) to Avg for
-                                                // each byte
-            // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
-            "movq  %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                                // LBCarrys
-            "pand  %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                                // where both lsb's were == 1
-                                                // (valid only for active group)
-            "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
-                                                // byte
-            "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to Raw(x-bpp)/2
-                                                // for each byte
-            "pand  %%mm6, %%mm2           \n\t" // leave only Active Group 1
-                                                // bytes to add to Avg
-            "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
-                                                // Avg for each Active byte
-            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
-            "psllq $24, %%mm6             \n\t" // shift the mm6 mask to cover
-                                                // bytes 3-5
-            "movq  %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $24, %%mm2             \n\t" // shift data to pos. correctly
-            "movq  %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                                // LBCarrys
-            "pand  %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                                // where both lsb's were == 1
-                                                // (valid only for active group)
-            "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
-                                                // byte
-            "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to Raw(x-bpp)/2
-                                                // for each byte
-            "pand  %%mm6, %%mm2           \n\t" // leave only Active Group 2
-                                                // bytes to add to Avg
-            "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
-                                                // Avg for each Active byte
-
-            // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
-            "psllq $24, %%mm6             \n\t" // shift mm6 mask to cover last
-                                                // two bytes
-            "movq  %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $24, %%mm2             \n\t" // shift data to pos. correctly
-                              // Data need be shifted only once here to
-                              // get the correct x-bpp offset.
-            "movq  %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                                // LBCarrys
-            "pand  %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                                // where both
-                              // lsb's were == 1 (only valid for active group)
-            "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
-                                                // byte
-            "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to Raw(x-bpp)/2
-                                                // for each byte
-            "pand  %%mm6, %%mm2           \n\t" // leave only Active Group 2
-                                                // bytes to add to Avg
-            "addl  $8, %%ecx              \n\t"
-            "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
-                                                // Avg for each Active byte
-            // now ready to write back to memory
-            "movq  %%mm0, -8(%1," PCX ",) \n\t"
-            // move updated Raw(x) to use as Raw(x-bpp) for next loop
-            "cmpl  %%eax, %%ecx           \n\t" // MMXLength
-            "movq  %%mm0, %%mm2           \n\t" // mov updated Raw(x) to mm2
-            "jb avg_3lp                   \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),    // esi/rsi    // input regs
-              "1" (row),         // edi/rdi
-              "2" (diff),        // ecx
-              "3" (MMXLength)    // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 3 bpp
-
-      case 4:   // formerly shared with 6 bpp case via _ShiftBpp and _ShiftRem,
-      {         // but loop uses all 8 MMX regs, and psrlq/psllq require 64-bit
-                // mem (PIC/.so problems), MMX reg (none left), or immediate
-//       _ShiftBpp = bpp << 3;        // 32 (psllq)
-//       _ShiftRem = 64 - _ShiftBpp;  // 32 (psrlq)
-
-         __asm__ __volatile__ (
-            LOAD_GOT_rbp
-            "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
-            "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
-            // re-init address pointers and offset
-// preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
-                                                 // alignment boundary
-            "movq " AMASK0_8_0 ", %%mm7    \n\t" // _amask0_8_0 -> mm7
-            RESTORE_rbp
-
-            // ... and clear all bytes except for 1st active group
-// preload  "movl  row, %1               \n\t" // edi:  Avg(x)
-            "psrlq $32, %%mm7            \n\t" // was _ShiftRem
-// preload  "movl  prev_row, %0          \n\t" // esi:  Prior(x)
-            "movq  %%mm7, %%mm6          \n\t"
-            "psllq $32, %%mm6            \n\t" // mask for 2nd active group
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
-                                             // (we correct pos. in loop below)
-         "avg_4lp:                       \n\t"
-            "movq (%1," PCX ",), %%mm0   \n\t"
-            "psrlq $32, %%mm2            \n\t" // shift data to pos. correctly
-            "movq (%0," PCX ",), %%mm1   \n\t"
-            // add (Prev_row/2) to average
-            "movq %%mm5, %%mm3           \n\t"
-            "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
-            "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
-            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
-                                               // each byte
-            // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                               // LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                               // where both
-                              // lsb's were == 1 (only valid for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
-                                               // for each byte
-            "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
-                                               // bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
-                                               // for each Active
-                              // byte
-            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $32, %%mm2            \n\t" // shift data to pos. correctly
-            "addl $8, %%ecx              \n\t"
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                               // LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                               // where both
-                              // lsb's were == 1 (only valid for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
-                                               // for each byte
-            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
-                                               // bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
-                                               // Avg for each Active byte
-            "cmpl %%eax, %%ecx           \n\t" // MMXLength
-            // now ready to write back to memory
-            "movq %%mm0, -8(%1," PCX ",) \n\t"
-            // prep Raw(x-bpp) for next loop
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "jb avg_4lp                  \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),    // esi/rsi    // input regs
-              "1" (row),         // edi/rdi
-              "2" (diff),        // ecx
-              "3" (MMXLength)    // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 4 bpp
-
-      case 1:
-      {
-         __asm__ __volatile__ (
-            // re-init address pointers and offset
-// preload  "movl diff, %%ecx            \n\t" // ecx: x = offset to align. bdry
-// preload  "movl row, %1                \n\t" // edi/rdi:  Avg(x)
-// preload  "movl FullLength, %%eax      \n\t"
-            "cmpl %%eax, %%ecx           \n\t" // test if offset at end of array
-            "jnb avg_1end                \n\t"
-
-            SAVE_ebp
-
-            // do Avg decode for remaining bytes
-// preload  "movl prev_row, %0           \n\t" // esi/rsi:  Prior(x)
-            "mov  %1, " PBP "            \n\t" // copy of row pointer...
-            "dec  " PBP "                \n\t" // ebp/rbp:  Raw(x-bpp)
-            "xorl %%edx, %%edx           \n\t" // zero edx before using dl & dx
-                                               //  in loop below
-            SAVE_GOT_ebx
-
-         "avg_1lp:                       \n\t"
-            // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
-            "xorl %%ebx, %%ebx           \n\t"
-            "movb (%0," PCX ",), %%dl    \n\t" // load dl with Prior(x)
-            "movb (" PBP "," PCX ",), %%bl \n\t" // load bl with Raw(x-bpp)
-            "addw %%dx, %%bx             \n\t"
-            "incl %%ecx                  \n\t"
-            "shrw %%bx                   \n\t" // divide by 2
-            "addb -1(%1," PCX ",), %%bl  \n\t" // add Avg(x); -1 to offset
-                                               // inc ecx
-            "cmpl %%eax, %%ecx           \n\t" // check if at end of array
-            "movb %%bl, -1(%1," PCX ",)  \n\t" // write back Raw(x);
-                         // mov does not affect flags; -1 to offset inc ecx
-            "jb avg_1lp                  \n\t"
-
-            RESTORE_GOT_ebx
-            RESTORE_ebp
-
-         "avg_1end:                      \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),    // esi/rsi    // input regs
-              "1" (row),         // edi/rdi
-              "2" (diff),        // ecx
-              "3" (FullLength)   // eax
-
-            : "%edx"                           // clobber list
-              _CLOBBER_GOT_ebx
-              _CLOBBER_ebp
-         );
-      }
-      return;  // end 1 bpp
-
-      case 2:
-      {
-//       _ShiftBpp = 16;   // == 2 * 8
-//       _ShiftRem = 48;   // == 64 - _ShiftBpp
-
-         __asm__ __volatile__ (
-            LOAD_GOT_rbp
-            // load (former) _ActiveMask
-            "movq " AMASK6_2_0 ", %%mm7    \n\t" // _amask6_2_0 -> mm7
-            // re-init address pointers and offset
-// preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
-                                                 // alignment boundary
-            "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
-// preload  "movl  row, %1                 \n\t" // edi:  Avg(x)
-            "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
-// preload  "movl  prev_row, %0            \n\t" // esi:  Prior(x)
-            RESTORE_rbp
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
-                                             // (we correct pos. in loop below)
-         "avg_2lp:                       \n\t"
-            "movq (%1," PCX ",), %%mm0   \n\t"
-            "psrlq $48, %%mm2            \n\t" // shift data to pos. correctly
-            "movq (%0," PCX ",), %%mm1   \n\t" //  (GRR BUGFIX:  was psllq)
-            // add (Prev_row/2) to average
-            "movq %%mm5, %%mm3           \n\t"
-            "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
-            "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
-            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "movq %%mm7, %%mm6           \n\t"
-            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
-                                               // each byte
-
-            // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                               // LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                               // where both
-                                               // lsb's were == 1 (only valid
-                                               // for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
-                                               // for each byte
-            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 1
-                                               // bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
-                                               // for each Active byte
-
-            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
-            "psllq $16, %%mm6            \n\t" // shift the mm6 mask to cover
-                                               // bytes 2 & 3
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $16, %%mm2            \n\t" // shift data to pos. correctly
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                               // LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                               // where both
-                                               // lsb's were == 1 (only valid
-                                               // for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
-                                               // for each byte
-            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
-                                               // bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
-                                               // Avg for each Active byte
-
-            // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
-            "psllq $16, %%mm6            \n\t" // shift the mm6 mask to cover
-                                               // bytes 4 & 5
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $16, %%mm2            \n\t" // shift data to pos. correctly
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                               // LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                               // where both lsb's were == 1
-                                               // (only valid for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
-                                               // for each byte
-            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
-                                               // bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
-                                               // Avg for each Active byte
-
-            // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
-            "psllq $16, %%mm6            \n\t" // shift the mm6 mask to cover
-                                               // bytes 6 & 7
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $16, %%mm2            \n\t" // shift data to pos. correctly
-            "addl $8, %%ecx              \n\t"
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                               // LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                               // where both
-                                               // lsb's were == 1 (only valid
-                                               // for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
-                                               // for each byte
-            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
-                                               // bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
-                                               // Avg for each Active byte
-            "cmpl %%eax, %%ecx           \n\t" // MMXLength
-            // now ready to write back to memory
-            "movq %%mm0, -8(%1," PCX ",) \n\t"
-            // prep Raw(x-bpp) for next loop
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "jb avg_2lp                  \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),    // esi/rsi    // input regs
-              "1" (row),         // edi/rdi
-              "2" (diff),        // ecx
-              "3" (MMXLength)    // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 2 bpp
-
-      case 6:   // formerly shared with 4 bpp case (see comments there)
-      {
-//       _ShiftBpp = bpp << 3;        // 48 (psllq)
-//       _ShiftRem = 64 - _ShiftBpp;  // 16 (psrlq)
-
-         __asm__ __volatile__ (
-            LOAD_GOT_rbp
-            "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
-            "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
-            // re-init address pointers and offset
-// preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
-                                                 // alignment boundary
-            "movq " AMASK0_8_0 ", %%mm7    \n\t" // _amask0_8_0 -> mm7
-            RESTORE_rbp
-
-            // ... and clear all bytes except for 1st active group
-// preload  "movl  row, %1               \n\t" // edi:  Avg(x)
-            "psrlq $16, %%mm7            \n\t"
-// preload  "movl  prev_row, %0          \n\t" // esi:  Prior(x)
-            "movq  %%mm7, %%mm6          \n\t"
-            "psllq $48, %%mm6            \n\t" // mask for 2nd active group
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
-                                             // (we correct pos. in loop below)
-         "avg_6lp:                       \n\t"
-            "movq (%1," PCX ",), %%mm0   \n\t"
-            "psrlq $16, %%mm2            \n\t" // shift data to pos. correctly
-            "movq (%0," PCX ",), %%mm1   \n\t"
-            // add (Prev_row/2) to average
-            "movq %%mm5, %%mm3           \n\t"
-            "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
-            "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
-            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
-                                               // each byte
-            // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                               // LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                               // where both
-                              // lsb's were == 1 (only valid for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
-                                               // for each byte
-            "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
-                                               // bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
-                                               // for each Active
-                              // byte
-            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $48, %%mm2            \n\t" // shift data to pos. correctly
-            "addl $8, %%ecx              \n\t"
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                               // LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                               // where both
-                              // lsb's were == 1 (only valid for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
-                                               // for each byte
-            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
-                                               // bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
-                                               // Avg for each Active byte
-            "cmpl %%eax, %%ecx           \n\t" // MMXLength
-            // now ready to write back to memory
-            "movq %%mm0, -8(%1," PCX ",) \n\t"
-            // prep Raw(x-bpp) for next loop
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "jb avg_6lp                  \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),    // esi/rsi    // input regs
-              "1" (row),         // edi/rdi
-              "2" (diff),        // ecx
-              "3" (MMXLength)    // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 6 bpp
-
-      case 8:
-      {
-         __asm__ __volatile__ (
-            // re-init address pointers and offset
-// preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
-                                                 // alignment boundary
-            LOAD_GOT_rbp
-            "movq " LB_CARRY_MASK ", %%mm5 \n\t" // [interleave for parallel.?]
-// preload  "movl  row, %1                 \n\t" // edi:  Avg(x)
-            "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
-// preload  "movl  prev_row, %0            \n\t" // esi:  Prior(x)
-            RESTORE_rbp
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
-                                      // (NO NEED to correct pos. in loop below)
-
-         "avg_8lp:                       \n\t"
-            "movq (%1," PCX ",), %%mm0   \n\t"
-            "movq %%mm5, %%mm3           \n\t"
-            "movq (%0," PCX ",), %%mm1   \n\t"
-            "addl $8, %%ecx              \n\t"
-            "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
-            "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
-            "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
-                                               //  where both lsb's were == 1
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7, each byte
-            "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg, each byte
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7, each byte
-            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg, each
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
-            "cmpl %%eax, %%ecx           \n\t" // MMXLength
-            "movq %%mm0, -8(%1," PCX ",) \n\t"
-            "movq %%mm0, %%mm2           \n\t" // reuse as Raw(x-bpp)
-            "jb avg_8lp                  \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),    // esi/rsi    // input regs
-              "1" (row),         // edi/rdi
-              "2" (diff),        // ecx
-              "3" (MMXLength)    // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2"           // clobber list
-            , "%mm3", "%mm4", "%mm5"
-#endif
-         );
-      }
-      break;  // end 8 bpp
-
-      default:                // bpp != 1,2,3,4,6,8:  doesn't exist
-      {
-         // ERROR:  SHOULD NEVER BE REACHED
-#if defined(PNG_DEBUG)
-         png_debug(1, "Internal libpng logic error (GCC "
-           "png_read_filter_row_mmx_avg())\n");
-#endif
-      }
-      break;
-
-   } // end switch (bpp)
-
-   __asm__ __volatile__ (
-      // MMX acceleration complete; now do clean-up
-      // check if any remaining bytes left to decode
-//pre "movl FullLength, %%edx      \n\t"
-//pre "movl MMXLength, %%eax       \n\t" // eax:  x == offset bytes after MMX
-//pre "movl row, %2                \n\t" // edi:  Avg(x)
-      "cmpl %%edx, %%eax           \n\t" // test if offset at end of array
-      "jnb avg_end                 \n\t"
-
-      SAVE_ebp
-
-      // do Avg decode for remaining bytes
-//pre "movl prev_row, %1           \n\t" // esi:  Prior(x)
-      "mov  %2, " PBP "            \n\t" // copy of row pointer...
-//pre "subl bpp, " PBP "           \n\t" // (bpp is preloaded into ecx)
-      "sub  " PCX "," PBP "        \n\t" // ebp:  Raw(x-bpp)
-      "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
-
-      SAVE_GOT_ebx
-
-   "avg_lp2:                       \n\t"
-      // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
-      "xorl %%ebx, %%ebx           \n\t"
-      "movb (%1," PAX ",), %%cl    \n\t" // load cl with Prior(x)
-      "movb (" PBP "," PAX ",), %%bl \n\t" // load bl with Raw(x-bpp)
-      "addw %%cx, %%bx             \n\t"
-      "incl %%eax                  \n\t"
-      "shrw %%bx                   \n\t" // divide by 2
-      "addb -1(%2," PAX ",), %%bl  \n\t" // add Avg(x); -1 to offset inc eax
-      "cmpl %%edx, %%eax           \n\t" // check if at end of array
-      "movb %%bl, -1(%2," PAX ",)  \n\t" // write back Raw(x) [mov does not
-      "jb avg_lp2                  \n\t" //  affect flags; -1 to offset inc eax]
-
-      RESTORE_GOT_ebx
-      RESTORE_ebp
-
-   "avg_end:                       \n\t"
-      "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
-
-      : "=c" (dummy_value_c),            // output regs (dummy)
-        "=S" (dummy_value_S),
-        "=D" (dummy_value_D),
-        "=a" (dummy_value_a),
-        "=d" (dummy_value_d)
-
-      : "0" (bpp),         // ecx        // input regs
-        "1" (prev_row),    // esi/rsi
-        "2" (row),         // edi/rdi
-        "3" (MMXLength),   // eax
-        "4" (FullLength)   // edx
-
-      CLOB_COLON_ebx_ebp                 // clobber list
-        CLOBBER_GOT_ebx
-        CLOB_COMMA_ebx_ebp
-        CLOBBER_ebp
-   );
-
-} /* end png_read_filter_row_mmx_avg() */
-
-#endif /* PNG_MMX_READ_FILTER_AVG_SUPPORTED */
-
-
-
-#if defined(PNG_MMX_READ_FILTER_PAETH_SUPPORTED)
-#if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
-
-//===========================================================================//
-//                                                                           //
-//         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         //
-//                                                                           //
-//===========================================================================//
-
-// Optimized code for PNG Paeth filter decoder
-
-static void /* PRIVATE */
-png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
-                              png_bytep prev_row)
-{
-   unsigned FullLength, MMXLength;  // png_uint_32 is actually 64-bit on x86-64
-   int bpp;
-   int dummy_value_a;
-   int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
-   int dummy_value_d;
-   png_charp dummy_value_S;
-   png_charp dummy_value_D;
-   int diff; //     __attribute__((used));
-
-   bpp = (row_info->pixel_depth + 7) >> 3;  // calc number of bytes per pixel
-   FullLength = row_info->rowbytes;         // number of bytes to filter
-
-   __asm__ __volatile__ (
-      SAVE_GOT_ebx
-      SAVE_r15
-      SAVE_ebp
-//pre "movl row, %2                \n\t" // edi/rdi
-      "xorl %%ebx, %%ebx           \n\t" // ebx:  x offset
-//pre "movl prev_row, %1           \n\t" // esi/rsi
-      "xorl %%edx, %%edx           \n\t" // edx:  x-bpp offset
-//pre "movl FullLength, %%eax      \n\t" // bring in via eax...
-      SAVE_FullLength                    // ...but store for later use
-      "xorl %%eax, %%eax           \n\t"
-
-      // Compute the Raw value for the first bpp bytes
-      // Note: the formula works out to be always
-      //   Paeth(x) = Raw(x) + Prior(x)      where x < bpp
-   "paeth_rlp:                     \n\t"
-      "movb (%2," PBX ",), %%al    \n\t"
-      "addb (%1," PBX ",), %%al    \n\t"
-      "incl %%ebx                  \n\t"
-//pre "cmpl bpp, %%ebx             \n\t" (bpp is preloaded into ecx)
-      "cmpl %%ecx, %%ebx           \n\t"
-      "movb %%al, -1(%2," PBX ",)  \n\t"
-      "jb paeth_rlp                \n\t"
-
-      // get # of bytes to alignment (note:  computing _delta_ of two pointers,
-      // so hereafter %%ebp is sufficient even on 64-bit)
-      "mov  %2, " PBP "            \n\t" // take start of row
-      "add  " PBX "," PBP "        \n\t" // add bpp
-      "add  $0xf, " PBP "          \n\t" // add 7+8 to incr past alignment bdry
-//    "andl $0xfffffff8, %%ebp     \n\t" // mask to alignment boundary (32-bit!)
-      CLEAR_BOTTOM_3_BITS  PBP    "\n\t" // mask to alignment boundary
-      "sub  %2, " PBP "            \n\t" // subtract row ptr again => ebp =
-      "jz paeth_go                 \n\t" //  target value of ebx at alignment
-
-      "xorl %%ecx, %%ecx           \n\t"
-
-      SAVE_r11_r12_r13
-
-      // fix alignment
-   "paeth_lp1:                     \n\t"
-      "xorl %%eax, %%eax           \n\t"
-      // pav = p - a = (a + b - c) - a = b - c
-      "movb (%1," PBX ",), %%al    \n\t" // load Prior(x) into al
-      "movb (%1," PDX ",), %%cl    \n\t" // load Prior(x-bpp) into cl
-      "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
-      "movl %%eax, " pa_TEMP "     \n\t" // Save pav for later use
-      "xorl %%eax, %%eax           \n\t"
-      // pbv = p - b = (a + b - c) - b = a - c
-      "movb (%2," PDX ",), %%al    \n\t" // load Raw(x-bpp) into al
-      "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
-      "movl %%eax, %%ecx           \n\t"
-      // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
-      "addl " pa_TEMP ", %%eax     \n\t" // pcv = pav + pbv
-      // pc = abs(pcv)
-      "testl $0x80000000, %%eax    \n\t"
-      "jz paeth_pca                \n\t"
-      "negl %%eax                  \n\t" // reverse sign of neg values
-
-   "paeth_pca:                     \n\t"
-      "movl %%eax, " pc_TEMP "     \n\t" // save pc for later use
-      // pb = abs(pbv)
-      "testl $0x80000000, %%ecx    \n\t"
-      "jz paeth_pba                \n\t"
-      "negl %%ecx                  \n\t" // reverse sign of neg values
-
-   "paeth_pba:                     \n\t"
-      "movl %%ecx, " pb_TEMP "     \n\t" // save pb for later use
-      // pa = abs(pav)
-      "movl " pa_TEMP ", %%eax     \n\t"
-      "testl $0x80000000, %%eax    \n\t"
-      "jz paeth_paa                \n\t"
-      "negl %%eax                  \n\t" // reverse sign of neg values
-
-   "paeth_paa:                     \n\t"
-      "movl %%eax, " pa_TEMP "     \n\t" // save pa for later use
-      // test if pa <= pb
-      "cmpl %%ecx, %%eax           \n\t"
-      "jna paeth_abb               \n\t"
-      // pa > pb; now test if pb <= pc
-      "cmpl " pc_TEMP ", %%ecx     \n\t"
-      "jna paeth_bbc               \n\t"
-      // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-      "movb (%1," PDX ",), %%cl    \n\t" // load Prior(x-bpp) into cl
-      "jmp paeth_paeth             \n\t"
-
-   "paeth_bbc:                     \n\t"
-      // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
-      "movb (%1," PBX ",), %%cl    \n\t" // load Prior(x) into cl
-      "jmp paeth_paeth             \n\t"
-
-   "paeth_abb:                     \n\t"
-      // pa <= pb; now test if pa <= pc
-      "cmpl " pc_TEMP ", %%eax     \n\t"
-      "jna paeth_abc               \n\t"
-      // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-      "movb (%1," PDX ",), %%cl    \n\t" // load Prior(x-bpp) into cl
-      "jmp paeth_paeth             \n\t"
-
-   "paeth_abc:                     \n\t"
-      // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
-      "movb (%2," PDX ",), %%cl    \n\t" // load Raw(x-bpp) into cl
-
-   "paeth_paeth:                   \n\t"
-      "incl %%ebx                  \n\t"
-      "incl %%edx                  \n\t"
-      // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
-      "addb %%cl, -1(%2," PBX ",)  \n\t"
-      "cmpl %%ebp, %%ebx           \n\t"
-      "jb paeth_lp1                \n\t"
-
-      RESTORE_r11_r12_r13
-
-   "paeth_go:                      \n\t"
-      RESTORE_FullLength "%%ecx    \n\t" // FullLength -> ecx
-      "movl %%ecx, %%eax           \n\t"
-      "subl %%ebx, %%eax           \n\t" // subtract alignment fix
-      "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
-      "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
-//out "movl %%ecx, MMXLength       \n\t"
-      "movl %%ebp, %%eax           \n\t" // ebp = diff, but no reg constraint(?)
-      RESTORE_ebp                        //  (could swap ebp and edx functions)
-      RESTORE_r15
-      RESTORE_GOT_ebx
-
-      : "=c" (MMXLength),                // output regs
-        "=S" (dummy_value_S),
-        "=D" (dummy_value_D),
-        "=a" (diff)
-
-      : "0" (bpp),         // ecx        // input regs
-        "1" (prev_row),    // esi/rsi
-        "2" (row),         // edi/rdi
-        "3" (FullLength)   // eax
-
-      : "%edx"                           // clobber list
-        _CLOBBER_r11_r12_r13
-        _CLOBBER_r15
-        _CLOBBER_ebp
-        _CLOBBER_GOT_ebx
-   );
-
-   // now do the math for the rest of the row
-   switch (bpp)
-   {
-      case 3:
-      {
-//       _ShiftBpp = 24;    // == bpp * 8
-//       _ShiftRem = 40;    // == 64 - _ShiftBpp
-
-         __asm__ __volatile__ (
-            LOAD_GOT_rbp
-// preload  "movl diff, %%ecx            \n\t"
-// preload  "movl row, %1                \n\t" // edi/rdi
-// preload  "movl prev_row, %0           \n\t" // esi/rsi
-            "pxor %%mm0, %%mm0           \n\t"
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PCX ",), %%mm1 \n\t"
-         "paeth_3lp:                     \n\t"
-            "psrlq $40, %%mm1            \n\t" // shift last 3 bytes to 1st
-                                               // 3 bytes
-            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
-            "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
-            "movq -8(%0," PCX ",), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
-            "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
-            "psrlq $40, %%mm3            \n\t" // shift last 3 bytes to 1st
-                                               // 3 bytes
-            // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            "psubw %%mm3, %%mm4          \n\t"
-            "pxor %%mm7, %%mm7           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "movq %%mm4, %%mm6           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
-            "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
-            "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm7, %%mm5          \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "pxor %%mm1, %%mm1           \n\t"
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "packuswb %%mm1, %%mm7       \n\t"
-            "movq (%0," PCX ",), %%mm3   \n\t" // load c=Prior(x-bpp)
-            "pand " AMASK5_3_0 ", %%mm7  \n\t" // _amask5_3_0 (was _ActiveMask)
-            "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
-            "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
-            "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
-            "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
-            "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as
-                                               // Raw(x-bpp)
-            // now do Paeth for 2nd set of bytes (3-5)
-            "psrlq $24, %%mm2            \n\t" // load b=Prior(x) step 2
-            "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
-            "pxor %%mm7, %%mm7           \n\t"
-            "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-            "psubw %%mm3, %%mm4          \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
-            //       pav + pbv = pbv + pav
-            "movq %%mm5, %%mm6           \n\t"
-            "paddw %%mm4, %%mm6          \n\t"
-
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm5, %%mm0        \n\t" // create mask pbv bytes < 0
-            "pcmpgtw %%mm4, %%mm7        \n\t" // create mask pav bytes < 0
-            "pand %%mm5, %%mm0           \n\t" // only pbv bytes < 0 in mm0
-            "pand %%mm4, %%mm7           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm0, %%mm5          \n\t"
-            "psubw %%mm7, %%mm4          \n\t"
-            "psubw %%mm0, %%mm5          \n\t"
-            "psubw %%mm7, %%mm4          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "pxor %%mm1, %%mm1           \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "packuswb %%mm1, %%mm7       \n\t"
-            "movq %%mm2, %%mm3           \n\t" // load c=Prior(x-bpp) step 1
-            "pand " AMASK5_3_0 ", %%mm7  \n\t" // _amask5_3_0 (was _ActiveMask)
-            "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
-            "psllq $24, %%mm7            \n\t" // shift bytes to 2nd group of
-                                               // 3 bytes
-             // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
-            "psllq $24, %%mm3            \n\t" // load c=Prior(x-bpp) step 2
-            "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
-            "movq %%mm7, %%mm1           \n\t"
-            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
-            "psllq $24, %%mm1            \n\t" // shift bytes (was _ShiftBpp)
-                                    // now mm1 will be used as Raw(x-bpp)
-            // now do Paeth for 3rd, and final, set of bytes (6-7)
-            "pxor %%mm7, %%mm7           \n\t"
-            "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
-            "psubw %%mm3, %%mm4          \n\t"
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "movq %%mm4, %%mm6           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "paddw %%mm5, %%mm6          \n\t"
-
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
-            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
-            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm1, %%mm1           \n\t"
-            "packuswb %%mm7, %%mm1       \n\t"
-            // step ecx to next set of 8 bytes and repeat loop til done
-            "addl $8, %%ecx              \n\t"
-            "pand " AMASK0_2_6 ", %%mm1  \n\t" // _amask0_2_6 (_ActiveMaskEnd)
-            "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
-            "cmpl %%eax, %%ecx           \n\t" // MMXLength
-            "pxor %%mm0, %%mm0           \n\t" // pxor does not affect flags
-            "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
-                                 // mm1 will be used as Raw(x-bpp) next loop
-                           // mm3 ready to be used as Prior(x-bpp) next loop
-            "jb paeth_3lp                \n\t"
-            RESTORE_rbp
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),  // esi/rsi      // input regs
-              "1" (row),       // edi/rdi
-              "2" (diff),      // ecx
-              "3" (MMXLength)  // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 3 bpp
-
-      case 4:
-      {
-         __asm__ __volatile__ (
-// preload  "movl diff, %%ecx            \n\t"
-// preload  "movl row, %1                \n\t" // edi/rdi
-// preload  "movl prev_row, %0           \n\t" // esi/rsi
-            "pxor %%mm0, %%mm0           \n\t"
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PCX ",), %%mm1 \n\t" // only time should need to read
-                                               //  a=Raw(x-bpp) bytes
-         "paeth_4lp:                     \n\t"
-            // do first set of 4 bytes
-            "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
-            "punpckhbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
-            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
-            "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
-            // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            "psubw %%mm3, %%mm4          \n\t"
-            "pxor %%mm7, %%mm7           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "movq %%mm4, %%mm6           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
-            "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
-            "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm7, %%mm5          \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "pxor %%mm1, %%mm1           \n\t"
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "packuswb %%mm1, %%mm7       \n\t"
-            "movq (%0," PCX ",), %%mm3   \n\t" // load c=Prior(x-bpp)
-            LOAD_GOT_rbp
-            "pand " AMASK4_4_0 ", %%mm7  \n\t" // _amask4_4_0 (was _ActiveMask)
-            RESTORE_rbp
-            "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
-            "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
-            "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
-            "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
-            "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as
-                                               // Raw(x-bpp)
-            // do second set of 4 bytes
-            "punpckhbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
-            "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
-            // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            "psubw %%mm3, %%mm4          \n\t"
-            "pxor %%mm7, %%mm7           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "movq %%mm4, %%mm6           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
-            "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
-            "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm7, %%mm5          \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "pxor %%mm1, %%mm1           \n\t"
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "pxor %%mm1, %%mm1           \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            // step ecx to next set of 8 bytes and repeat loop til done
-            "addl $8, %%ecx              \n\t"
-            "packuswb %%mm7, %%mm1       \n\t"
-            "paddb -8(%1," PCX ",), %%mm1 \n\t" // add predictor with Raw(x)
-            "cmpl %%eax, %%ecx           \n\t" // MMXLength
-            "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
-                                 // mm1 will be used as Raw(x-bpp) next loop
-            "jb paeth_4lp                \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),  // esi/rsi      // input regs
-              "1" (row),       // edi/rdi
-              "2" (diff),      // ecx
-              "3" (MMXLength)  // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 4 bpp
-
-      case 1:
-      case 2:
-      {
-         __asm__ __volatile__ (
-// preload  "movl diff, %%eax            \n\t" // eax: x = offset to align. bdry
-// preload  "movl FullLength, %%edx      \n\t"
-            "cmpl %%edx, %%eax           \n\t"
-            "jnb paeth_dend              \n\t"
-
-            SAVE_ebp
-
-// preload  "movl row, %2                \n\t" // edi/rdi
-            // do Paeth decode for remaining bytes
-// preload  "movl prev_row, %1           \n\t" // esi/rsi
-            "movl %%eax, %%ebp           \n\t"
-// preload  "subl bpp, %%ebp             \n\t" // (bpp is preloaded into ecx)
-            "subl %%ecx, %%ebp           \n\t" // ebp = eax - bpp
-            "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
-
-            SAVE_GOT_ebx
-            SAVE_r11_r12_r13
-
-         "paeth_dlp:                     \n\t"
-            "xorl %%ebx, %%ebx           \n\t"
-            // pav = p - a = (a + b - c) - a = b - c
-            "movb (%1," PAX ",), %%bl    \n\t" // load Prior(x) into bl
-            "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
-            "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
-            "movl %%ebx, " pa_TEMP "     \n\t" // Save pav for later use
-            "xorl %%ebx, %%ebx           \n\t"
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movb (%2," PBP ",), %%bl    \n\t" // load Raw(x-bpp) into bl
-            "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
-            "movl %%ebx, %%ecx           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "addl " pa_TEMP ", %%ebx     \n\t" // pcv = pav + pbv
-            // pc = abs(pcv)
-            "testl $0x80000000, %%ebx    \n\t"
-            "jz paeth_dpca               \n\t"
-            "negl %%ebx                  \n\t" // reverse sign of neg values
-
-         "paeth_dpca:                    \n\t"
-            "movl %%ebx, " pc_TEMP "     \n\t" // save pc for later use
-            // pb = abs(pbv)
-            "testl $0x80000000, %%ecx    \n\t"
-            "jz paeth_dpba               \n\t"
-            "negl %%ecx                  \n\t" // reverse sign of neg values
-
-         "paeth_dpba:                    \n\t"
-            "movl %%ecx, " pb_TEMP "     \n\t" // save pb for later use
-            // pa = abs(pav)
-            "movl " pa_TEMP ", %%ebx     \n\t"
-            "testl $0x80000000, %%ebx    \n\t"
-            "jz paeth_dpaa               \n\t"
-            "negl %%ebx                  \n\t" // reverse sign of neg values
-
-         "paeth_dpaa:                    \n\t"
-            "movl %%ebx, " pa_TEMP "     \n\t" // save pa for later use
-            // test if pa <= pb
-            "cmpl %%ecx, %%ebx           \n\t"
-            "jna paeth_dabb              \n\t"
-            // pa > pb; now test if pb <= pc
-            "cmpl " pc_TEMP ", %%ecx     \n\t"
-            "jna paeth_dbbc              \n\t"
-            // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-            "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
-            "jmp paeth_dpaeth            \n\t"
-
-         "paeth_dbbc:                    \n\t"
-            // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
-            "movb (%1," PAX ",), %%cl    \n\t" // load Prior(x) into cl
-            "jmp paeth_dpaeth            \n\t"
-
-         "paeth_dabb:                    \n\t"
-            // pa <= pb; now test if pa <= pc
-            "cmpl " pc_TEMP ", %%ebx     \n\t"
-            "jna paeth_dabc              \n\t"
-            // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-            "movb (%1," PBP ",), %%cl   \n\t" // load Prior(x-bpp) into cl
-            "jmp paeth_dpaeth            \n\t"
-
-         "paeth_dabc:                    \n\t"
-            // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
-            "movb (%2," PBP ",), %%cl    \n\t" // load Raw(x-bpp) into cl
-
-         "paeth_dpaeth:                  \n\t"
-            "incl %%eax                  \n\t"
-            "incl %%ebp                  \n\t"
-            // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
-            "addb %%cl, -1(%2," PAX ",)  \n\t"
-            "cmpl %%edx, %%eax           \n\t" // check against FullLength
-            "jb paeth_dlp                \n\t"
-
-            RESTORE_r11_r12_r13
-            RESTORE_GOT_ebx
-            RESTORE_ebp
-
-         "paeth_dend:                    \n\t"
-
-            : "=c" (dummy_value_c),            // output regs (dummy)
-              "=S" (dummy_value_S),
-              "=D" (dummy_value_D),
-              "=a" (dummy_value_a),
-              "=d" (dummy_value_d)
-
-            : "0" (bpp),         // ecx        // input regs
-              "1" (prev_row),    // esi/rsi
-              "2" (row),         // edi/rdi
-              "3" (diff),        // eax
-              "4" (FullLength)   // edx
-
-            CLOB_COLON_ebx_ebp_r1X             // clobber list
-              CLOBBER_GOT_ebx
-              CLOB_COMMA_ebx_ebp
-              CLOBBER_ebp
-              CLOB_COMMA_ebX_r1X
-              CLOBBER_r11_r12_r13
-         );
-      }
-      return; // end 1 or 2 bpp (no need to go further with this one)
-
-      case 6:
-      {
-//       _ActiveMask2 = 0xffffffff00000000LL;  // NOT USED ("_amask_0_4_4")
-//       _ShiftBpp = 48;       // bpp << 3 == bpp * 8
-//       _ShiftRem = 16;       // 64 - _ShiftBpp
-
-         __asm__ __volatile__ (
-// preload  "movl diff, %%ecx            \n\t"
-// preload  "movl row, %1                \n\t" // edi/rdi
-// preload  "movl prev_row, %0           \n\t" // esi/rsi
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PCX ",), %%mm1 \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-
-         "paeth_6lp:                     \n\t"
-            // must shift to position Raw(x-bpp) data
-            "psrlq $16, %%mm1            \n\t" // was _ShiftRem
-            // do first set of 4 bytes
-            "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
-            "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
-            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
-            "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
-            // must shift to position Prior(x-bpp) data
-            "psrlq $16, %%mm3            \n\t" // was _ShiftRem
-            // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            "psubw %%mm3, %%mm4          \n\t"
-            "pxor %%mm7, %%mm7           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "movq %%mm4, %%mm6           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
-            "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
-            "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm7, %%mm5          \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "pxor %%mm1, %%mm1           \n\t"
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "packuswb %%mm1, %%mm7       \n\t"
-            "movq -8(%0," PCX ",), %%mm3 \n\t" // load c=Prior(x-bpp)
-            LOAD_GOT_rbp
-            "pand " AMASK4_4_0 ", %%mm7  \n\t" // _amask4_4_0 (was _ActiveMask)
-            RESTORE_rbp
-            "psrlq $16, %%mm3            \n\t"
-            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x) step 1
-            "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
-            "movq %%mm2, %%mm6           \n\t"
-            "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
-            "movq -8(%1," PCX ",), %%mm1 \n\t"
-            "psllq $48, %%mm6            \n\t" // bpp * 8 = bits per pixel
-            "movq %%mm7, %%mm5           \n\t"
-            "psrlq $16, %%mm1            \n\t" // 64 - (bpp * 8) = remainder
-            "por %%mm6, %%mm3            \n\t"
-            "psllq $48, %%mm5            \n\t" // was _ShiftBpp
-            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
-            "por %%mm5, %%mm1            \n\t"
-            // do second set of 4 bytes
-            "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
-            "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
-            // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            "psubw %%mm3, %%mm4          \n\t"
-            "pxor %%mm7, %%mm7           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "movq %%mm4, %%mm6           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
-            "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
-            "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm7, %%mm5          \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "pxor %%mm1, %%mm1           \n\t"
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "pxor %%mm1, %%mm1           \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            // step ecx to next set of 8 bytes and repeat loop til done
-            "addl $8, %%ecx              \n\t"
-            "packuswb %%mm7, %%mm1       \n\t"
-            "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
-            "cmpl %%eax, %%ecx           \n\t" // MMXLength
-            "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
-                                 // mm1 will be used as Raw(x-bpp) next loop
-            "jb paeth_6lp                \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),  // esi/rsi      // input regs
-              "1" (row),       // edi/rdi
-              "2" (diff),      // ecx
-              "3" (MMXLength)  // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 6 bpp
-
-      case 8:                          // bpp == 8
-      {
-         __asm__ __volatile__ (
-// preload  "movl diff, %%ecx            \n\t"
-// preload  "movl row, %1                \n\t" // edi/rdi
-// preload  "movl prev_row, %0           \n\t" // esi/rsi
-            "pxor %%mm0, %%mm0           \n\t"
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PCX ",), %%mm1 \n\t" // only time should need to read
-                                               //  a=Raw(x-bpp) bytes
-         "paeth_8lp:                     \n\t"
-            // do first set of 4 bytes
-            "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
-            "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
-            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
-            "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
-            // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            "psubw %%mm3, %%mm4          \n\t"
-            "pxor %%mm7, %%mm7           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "movq %%mm4, %%mm6           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
-            "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
-            "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm7, %%mm5          \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "pxor %%mm1, %%mm1           \n\t"
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "packuswb %%mm1, %%mm7       \n\t"
-            "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
-            LOAD_GOT_rbp
-            "pand " AMASK4_4_0 ", %%mm7  \n\t" // _amask4_4_0 (was _ActiveMask)
-            RESTORE_rbp
-            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
-            "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
-            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
-            "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
-            "movq -8(%1," PCX ",), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
-
-            // do second set of 4 bytes
-            "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
-            "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
-            // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            "psubw %%mm3, %%mm4          \n\t"
-            "pxor %%mm7, %%mm7           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "movq %%mm4, %%mm6           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
-            "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
-            "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm7, %%mm5          \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "pxor %%mm1, %%mm1           \n\t"
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "pxor %%mm1, %%mm1           \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            // step ecx to next set of 8 bytes and repeat loop til done
-            "addl $8, %%ecx              \n\t"
-            "packuswb %%mm7, %%mm1       \n\t"
-            "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
-            "cmpl %%eax, %%ecx           \n\t" // MMXLength
-            "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
-                                 // mm1 will be used as Raw(x-bpp) next loop
-            "jb paeth_8lp                \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),  // esi/rsi      // input regs
-              "1" (row),       // edi/rdi
-              "2" (diff),      // ecx
-              "3" (MMXLength)  // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 8 bpp
-
-      default:                // bpp != 1,2,3,4,6,8:  doesn't exist
-      {
-         // ERROR:  SHOULD NEVER BE REACHED
-#if defined(PNG_DEBUG)
-         png_debug(1, "Internal libpng logic error (GCC "
-           "png_read_filter_row_mmx_paeth())\n");
-#endif
-      }
-      break;
-
-   } // end switch (bpp)
-
-   __asm__ __volatile__ (
-      // MMX acceleration complete; now do clean-up
-      // check if any remaining bytes left to decode
-//pre "movl FullLength, %%edx      \n\t"
-//pre "movl MMXLength, %%eax       \n\t"
-      "cmpl %%edx, %%eax           \n\t"
-      "jnb paeth_end               \n\t"
-
-      SAVE_ebp
-
-//pre "movl row, %2                \n\t" // edi/rdi
-//pre "movl prev_row, %1           \n\t" // esi/rsi
-      // do Paeth decode for remaining bytes
-      "movl %%eax, %%ebp           \n\t"
-//pre "subl bpp, %%ebp             \n\t" // (bpp is preloaded into ecx)
-      "subl %%ecx, %%ebp           \n\t" // ebp = eax - bpp
-      "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
-
-      SAVE_GOT_ebx
-      SAVE_r11_r12_r13
-
-   "paeth_lp2:                     \n\t"
-      "xorl %%ebx, %%ebx           \n\t"
-      // pav = p - a = (a + b - c) - a = b - c
-      "movb (%1," PAX ",), %%bl    \n\t" // load Prior(x) into bl
-      "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
-      "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
-      "movl %%ebx, " pa_TEMP "     \n\t" // Save pav for later use
-      "xorl %%ebx, %%ebx           \n\t"
-      // pbv = p - b = (a + b - c) - b = a - c
-      "movb (%2," PBP ",), %%bl    \n\t" // load Raw(x-bpp) into bl
-      "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
-      "movl %%ebx, %%ecx           \n\t"
-      // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
-      "addl " pa_TEMP ", %%ebx     \n\t" // pcv = pav + pbv
-      // pc = abs(pcv)
-      "testl $0x80000000, %%ebx    \n\t"
-      "jz paeth_pca2               \n\t"
-      "negl %%ebx                  \n\t" // reverse sign of neg values
-
-   "paeth_pca2:                    \n\t"
-      "movl %%ebx, " pc_TEMP "     \n\t" // save pc for later use
-      // pb = abs(pbv)
-      "testl $0x80000000, %%ecx    \n\t"
-      "jz paeth_pba2               \n\t"
-      "negl %%ecx                  \n\t" // reverse sign of neg values
-
-   "paeth_pba2:                    \n\t"
-      "movl %%ecx, " pb_TEMP "     \n\t" // save pb for later use
-      // pa = abs(pav)
-      "movl " pa_TEMP ", %%ebx     \n\t"
-      "testl $0x80000000, %%ebx    \n\t"
-      "jz paeth_paa2               \n\t"
-      "negl %%ebx                  \n\t" // reverse sign of neg values
-
-   "paeth_paa2:                    \n\t"
-      "movl %%ebx, " pa_TEMP "     \n\t" // save pa for later use
-      // test if pa <= pb
-      "cmpl %%ecx, %%ebx           \n\t"
-      "jna paeth_abb2              \n\t"
-      // pa > pb; now test if pb <= pc
-      "cmpl " pc_TEMP ", %%ecx     \n\t"
-      "jna paeth_bbc2              \n\t"
-      // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-      "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
-      "jmp paeth_paeth2            \n\t"
-
-   "paeth_bbc2:                    \n\t"
-      // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
-      "movb (%1," PAX ",), %%cl    \n\t" // load Prior(x) into cl
-      "jmp paeth_paeth2            \n\t"
-
-   "paeth_abb2:                    \n\t"
-      // pa <= pb; now test if pa <= pc
-      "cmpl " pc_TEMP ", %%ebx     \n\t"
-      "jna paeth_abc2              \n\t"
-      // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-      "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
-      "jmp paeth_paeth2            \n\t"
-
-   "paeth_abc2:                    \n\t"
-      // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
-      "movb (%2," PBP ",), %%cl    \n\t" // load Raw(x-bpp) into cl
-
-   "paeth_paeth2:                  \n\t"
-      "incl %%eax                  \n\t"
-      "incl %%ebp                  \n\t"
-      // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
-      "addb %%cl, -1(%2," PAX ",)  \n\t"
-      "cmpl %%edx, %%eax           \n\t" // check against FullLength
-      "jb paeth_lp2                \n\t"
-
-      RESTORE_r11_r12_r13
-      RESTORE_GOT_ebx
-      RESTORE_ebp
-
-   "paeth_end:                     \n\t"
-      "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
-
-      : "=c" (dummy_value_c),            // output regs (dummy)
-        "=S" (dummy_value_S),
-        "=D" (dummy_value_D),
-        "=a" (dummy_value_a),
-        "=d" (dummy_value_d)
-
-      : "0" (bpp),         // ecx        // input regs
-        "1" (prev_row),    // esi/rsi
-        "2" (row),         // edi/rdi
-        "3" (MMXLength),   // eax
-        "4" (FullLength)   // edx
-
-      CLOB_COLON_ebx_ebp_r1X             // clobber list
-        CLOBBER_GOT_ebx
-        CLOB_COMMA_ebx_ebp
-        CLOBBER_ebp
-        CLOB_COMMA_ebX_r1X
-        CLOBBER_r11_r12_r13
-   );
-
-} /* end png_read_filter_row_mmx_paeth() */
-
-#endif // PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK
-#endif /* PNG_MMX_READ_FILTER_PAETH_SUPPORTED */
-
-
-
-
-#if defined(PNG_MMX_READ_FILTER_SUB_SUPPORTED)
-
-//===========================================================================//
-//                                                                           //
-//           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           //
-//                                                                           //
-//===========================================================================//
-
-// Optimized code for PNG Sub filter decoder
-
-static void /* PRIVATE */
-png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
-{
-   unsigned FullLength, MMXLength;  // png_uint_32 is actually 64-bit on x86-64
-   int bpp;
-   int dummy_value_a;
-   int dummy_value_c;
-   int dummy_value_d;
-   png_bytep dummy_value_D;
-   int diff; //     __attribute__((used));
-
-   bpp = (row_info->pixel_depth + 7) >> 3;  // calc number of bytes per pixel
-   FullLength = row_info->rowbytes - bpp;   // number of bytes to filter
-     // (why do we subtract off bpp?  not so in avg or paeth...)
-
-   __asm__ __volatile__ (
-      SAVE_r15
-      SAVE_ebp
-//pre "movl row, %1                \n\t" // edi/rdi
-      "mov  %1, " PSI "            \n\t" // lp = row
-//pre "movl bpp, %%ecx             \n\t"
-      "add  " PCX ", %1            \n\t" // rp = row + bpp
-//pre "movl FullLength, %%eax      \n\t" // bring in via eax...
-      SAVE_FullLength                    // ...but store for later use
-
-      "xorl %%eax, %%eax           \n\t"
-
-      // get # of bytes to alignment (note:  computing _delta_ of two pointers,
-      // so hereafter %%ebp is sufficient even on 64-bit)
-      "mov  %1, " PBP "            \n\t" // take start of row
-      "add  $0xf, " PBP "          \n\t" // add 7+8 to incr past alignment bdry
-//    "andl $0xfffffff8, %%ebp     \n\t" // mask to alignment boundary (32-bit!)
-      CLEAR_BOTTOM_3_BITS  PBP    "\n\t" // mask to alignment boundary
-      "sub  %1, " PBP "            \n\t" // subtract row ptr again => ebp =
-      "jz sub_go                   \n\t" //  target value of eax at alignment
-
-   "sub_lp1:                       \n\t" // fix alignment
-      "movb (" PSI "," PAX ",), %%cl \n\t"
-      "addb %%cl, (%1," PAX ",)    \n\t"
-      "incl %%eax                  \n\t"
-      "cmpl %%ebp, %%eax           \n\t"
-      "jb sub_lp1                  \n\t"
-
-   "sub_go:                        \n\t"
-      RESTORE_FullLength "%%ecx    \n\t" // FullLength -> ecx
-      "movl %%ecx, %%edx           \n\t"
-      "subl %%eax, %%edx           \n\t" // subtract alignment fix
-      "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
-      "subl %%edx, %%ecx           \n\t" // drop over bytes from length
-//out "movl %%ecx, MMXLength       \n\t"
-      "movl %%ebp, %%eax           \n\t" // ebp = diff, but no reg constraint(?)
-      RESTORE_ebp                        //  (could swap ebp and ecx functions,
-      RESTORE_r15                        //  but %%cl issues...)
-
-      : "=c" (MMXLength),       // 0     // output regs
-        "=D" (dummy_value_D),   // 1
-        "=a" (diff)             // 2
-
-      : "0" (bpp),              // ecx   // input regs
-        "1" (row),              // edi
-        "2" (FullLength)        // eax
-
-      : "%esi", "%edx"                   // clobber list
-        _CLOBBER_r15
-        _CLOBBER_ebp
-   );
-
-   // now do the math for the rest of the row
-   switch (bpp)
-   {
-      case 3:
-      {
-//       _ShiftBpp = 24;       // == 3 * 8
-//       _ShiftRem  = 40;      // == 64 - 24
-
-         __asm__ __volatile__ (
-// preload  "mov  row, %1                 \n\t" // edi/rdi
-            LOAD_GOT_rbp
-            // load (former) _ActiveMask for 2nd active byte group
-            "movq " AMASK2_3_3 ", %%mm7   \n\t" // _amask2_3_3
-            RESTORE_rbp
-
-// notused  "mov  %1, " PSI "             \n\t" // lp = row
-// preload  "movl bpp, %%ecx              \n\t"
-            "add  " PCX ", %1             \n\t" // rp = row + bpp
-            "movq %%mm7, %%mm6            \n\t"
-// preload  "movl diff, %%edx             \n\t"
-            "psllq $24, %%mm6             \n\t" // move mask in mm6 to cover
-                                                //  3rd active byte group
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PDX ",), %%mm1  \n\t"
-
-         "sub_3lp:                        \n\t" // shift data for adding first
-            "psrlq $40, %%mm1             \n\t" //  bpp bytes (no need for mask;
-                                                //  shift clears inactive bytes)
-            // add 1st active group
-            "movq (%1," PDX ",), %%mm0    \n\t"
-            "paddb %%mm1, %%mm0           \n\t"
-
-            // add 2nd active group
-            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
-            "psllq $24, %%mm1             \n\t" // shift data to pos. correctly
-            "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
-            "paddb %%mm1, %%mm0           \n\t"
-
-            // add 3rd active group
-            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
-            "psllq $24, %%mm1             \n\t" // shift data to pos. correctly
-            "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
-            "addl $8, %%edx               \n\t"
-            "paddb %%mm1, %%mm0           \n\t"
-
-            "cmpl %%eax, %%edx            \n\t" // MMXLength
-            "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
-            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
-            "jb sub_3lp                   \n\t"
-
-            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
-              "=D" (dummy_value_D),   // 1
-              "=d" (dummy_value_d),   // 2
-              "=a" (dummy_value_a)    // 3
-
-            : "0" (bpp),              // ecx    // input regs
-              "1" (row),              // edi
-              "2" (diff),             // edx
-              "3" (MMXLength)         // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm6", "%mm7"    // clobber list
-#endif
-         );
-      }
-      break;  // end 3 bpp
-
-      case 4:   // formerly shared with 6 bpp case via _ShiftBpp and _ShiftRem,
-      {         // but 64-bit PIC/.so problems (could still share, moving vars
-                // into unused MMX regs via ecx/edx, but kludgy)
-//       _ShiftBpp = bpp << 3;        // 32 (psllq)
-//       _ShiftRem = 64 - _ShiftBpp;  // 32 (psrlq)
-
-         __asm__ __volatile__ (
-// preload  "mov  row, %1                 \n\t" // edi/rdi
-// preload  "movl diff, %%edx             \n\t"
-// notused  "mov  %1, " PSI "             \n\t" // lp = row
-// preload  "movl bpp, %%ecx              \n\t"
-            "add  " PCX ", %1             \n\t" // rp = row + bpp
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PDX ",), %%mm1  \n\t"
-
-         "sub_4lp:                        \n\t" // shift data for adding first
-            "psrlq $32, %%mm1             \n\t" //  bpp bytes (no need for mask;
-                                                //  shift clears inactive bytes)
-            "movq (%1," PDX ",), %%mm0    \n\t"
-            "paddb %%mm1, %%mm0           \n\t"
-
-            // add 2nd active group
-            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
-            "psllq $32, %%mm1             \n\t" // shift data to pos. correctly
-            "addl $8, %%edx               \n\t"
-            "paddb %%mm1, %%mm0           \n\t"
-
-            "cmpl %%eax, %%edx            \n\t" // MMXLength
-            "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
-            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
-            "jb sub_4lp                   \n\t"
-
-            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
-              "=D" (dummy_value_D),   // 1
-              "=d" (dummy_value_d),   // 2
-              "=a" (dummy_value_a)    // 3
-
-            : "0" (bpp),              // ecx    // input regs
-              "1" (row),              // edi
-              "2" (diff),             // edx
-              "3" (MMXLength)         // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1"                    // clobber list
-#endif
-         );
-      }
-      break;  // end 4 bpp
-
-      case 1:
-      {
-         __asm__ __volatile__ (
-// preload  "movl diff, %%edx              \n\t"
-// preload  "mov  row, %1                  \n\t" // edi/rdi
-// preload  "cmpl FullLength, %%edx        \n\t"
-            "cmpl %%eax, %%edx             \n\t"
-            "jnb sub_1end                  \n\t"
-            "mov  %1, " PSI "              \n\t" // lp = row
-// irrel.   "xorl %%ecx, %%ecx             \n\t" // (actually bug with preload)
-// preload  "movl bpp, %%ecx               \n\t"
-            "add  " PCX ", %1              \n\t" // rp = row + bpp
-
-         "sub_1lp:                         \n\t"
-            "movb (" PSI "," PDX ",), %%cl \n\t"
-            "addb %%cl, (%1," PDX ",)      \n\t"
-            "incl %%edx                    \n\t"
-            "cmpl %%eax, %%edx             \n\t" // compare with FullLength
-            "jb sub_1lp                    \n\t"
-
-         "sub_1end:                        \n\t"
-
-            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
-              "=D" (dummy_value_D),   // 1
-              "=d" (dummy_value_d),   // 2
-              "=a" (dummy_value_a)    // 3
-
-            : "0" (bpp),              // ecx    // input regs
-              "1" (row),              // edi
-              "2" (diff),             // edx
-              "3" (FullLength)        // eax
-
-            : "%esi"                            // clobber list
-         );
-      }
-      return;  // end 1 bpp (bypassing cleanup block!)
-
-      case 2:
-      {
-//       _ShiftBpp = 16;       // == 2 * 8
-//       _ShiftRem = 48;       // == 64 - 16
-
-         __asm__ __volatile__ (
-            LOAD_GOT_rbp
-            // load (former) _ActiveMask for 2nd active byte group
-            "movq " AMASK4_2_2 ", %%mm7   \n\t" // _amask4_2_2
-            RESTORE_rbp
-// preload  "movl diff, %%edx             \n\t"
-            "movq %%mm7, %%mm6            \n\t"
-// preload  "mov  row, %1                 \n\t" // edi/rdi
-            "psllq $16, %%mm6             \n\t" // move mask in mm6 to cover
-                                                //  3rd active byte group
-// notused  "mov  %1, " PSI "             \n\t" // lp = row
-            "movq %%mm6, %%mm5            \n\t"
-// preload  "movl bpp, %%ecx              \n\t"
-            "add  " PCX ", %1             \n\t" // rp = row + bpp
-            "psllq $16, %%mm5             \n\t" // move mask in mm5 to cover
-                                                //  4th active byte group
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PDX ",), %%mm1  \n\t"
-
-         "sub_2lp:                        \n\t" // shift data for adding first
-            "psrlq $48, %%mm1             \n\t" //  bpp bytes (no need for mask;
-                                                //  shift clears inactive bytes)
-            // add 1st active group
-            "movq (%1," PDX ",), %%mm0    \n\t"
-            "paddb %%mm1, %%mm0           \n\t"
-
-            // add 2nd active group
-            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
-            "psllq $16, %%mm1             \n\t" // shift data to pos. correctly
-            "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
-            "paddb %%mm1, %%mm0           \n\t"
-
-            // add 3rd active group
-            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
-            "psllq $16, %%mm1             \n\t" // shift data to pos. correctly
-            "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
-            "paddb %%mm1, %%mm0           \n\t"
-
-            // add 4th active group
-            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
-            "psllq $16, %%mm1             \n\t" // shift data to pos. correctly
-            "pand %%mm5, %%mm1            \n\t" // mask to use 4th active group
-            "addl $8, %%edx               \n\t"
-            "paddb %%mm1, %%mm0           \n\t"
-            "cmpl %%eax, %%edx            \n\t" // MMXLength
-            "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
-            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
-            "jb sub_2lp                   \n\t"
-
-            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
-              "=D" (dummy_value_D),   // 1
-              "=d" (dummy_value_d),   // 2
-              "=a" (dummy_value_a)    // 3
-
-            : "0" (bpp),              // ecx    // input regs
-              "1" (row),              // edi
-              "2" (diff),             // edx
-              "3" (MMXLength)         // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm5", "%mm6"    // clobber list
-            , "%mm7"
-#endif
-         );
-      }
-      break;  // end 2 bpp
-
-      case 6:   // formerly shared with 4 bpp case (see comments there)
-      {
-//       _ShiftBpp = bpp << 3;        // 48 (psllq)
-//       _ShiftRem = 64 - _ShiftBpp;  // 16 (psrlq)
-
-         __asm__ __volatile__ (
-// preload  "mov  row, %1                 \n\t" // edi/rdi
-// preload  "movl diff, %%edx             \n\t"
-// notused  "mov  %1, " PSI "             \n\t" // lp = row
-// preload  "movl bpp, %%ecx              \n\t"
-            "add  " PCX ", %1             \n\t" // rp = row + bpp
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PDX ",), %%mm1  \n\t"
-
-         "sub_6lp:                        \n\t" // shift data for adding first
-            "psrlq $16, %%mm1             \n\t" //  bpp bytes (no need for mask;
-                                                //  shift clears inactive bytes)
-            "movq (%1," PDX ",), %%mm0    \n\t"
-            "paddb %%mm1, %%mm0           \n\t"
-
-            // add 2nd active group
-            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
-            "psllq $48, %%mm1             \n\t" // shift data to pos. correctly
-            "addl $8, %%edx               \n\t"
-            "paddb %%mm1, %%mm0           \n\t"
-
-            "cmpl %%eax, %%edx            \n\t" // MMXLength
-            "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
-            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
-            "jb sub_6lp                   \n\t"
-
-            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
-              "=D" (dummy_value_D),   // 1
-              "=d" (dummy_value_d),   // 2
-              "=a" (dummy_value_a)    // 3
-
-            : "0" (bpp),              // ecx    // input regs
-              "1" (row),              // edi
-              "2" (diff),             // edx
-              "3" (MMXLength)         // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1"                    // clobber list
-#endif
-         );
-      }
-      break;  // end 6 bpp
-
-      case 8:
-      {
-         __asm__ __volatile__ (
-// preload  "mov  row, %1                 \n\t" // edi/rdi
-// preload  "movl diff, %%edx             \n\t"
-// notused  "mov  %1, " PSI "             \n\t" // lp = row
-// preload  "movl bpp, %%ecx              \n\t"
-            "add  " PCX ", %1             \n\t" // rp = row + bpp
-// preload  "movl MMXLength, %%eax        \n\t"
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PDX ",), %%mm7  \n\t"
-            "movl %%eax, %%esi            \n\t" // copy of MMXLength -> esi
-            "andl $0x0000003f, %%esi      \n\t" // calc bytes over mult of 64
-
-         "sub_8lp:                        \n\t"
-            "movq (%1," PDX ",), %%mm0    \n\t" // load Sub(x) for 1st 8 bytes
-            "paddb %%mm7, %%mm0           \n\t"
-            "movq 8(%1," PDX ",), %%mm1   \n\t" // load Sub(x) for 2nd 8 bytes
-            "movq %%mm0, (%1," PDX ",)    \n\t" // write Raw(x) for 1st 8 bytes
-
-            // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
-            // This will be repeated for each group of 8 bytes with the 8th
-            // group being used as the Raw(x-bpp) for the 1st group of the
-            // next loop.
-
-            "paddb %%mm0, %%mm1           \n\t"
-            "movq 16(%1," PDX ",), %%mm2  \n\t" // load Sub(x) for 3rd 8 bytes
-            "movq %%mm1, 8(%1," PDX ",)   \n\t" // write Raw(x) for 2nd 8 bytes
-            "paddb %%mm1, %%mm2           \n\t"
-            "movq 24(%1," PDX ",), %%mm3  \n\t" // load Sub(x) for 4th 8 bytes
-            "movq %%mm2, 16(%1," PDX ",)  \n\t" // write Raw(x) for 3rd 8 bytes
-            "paddb %%mm2, %%mm3           \n\t"
-            "movq 32(%1," PDX ",), %%mm4  \n\t" // load Sub(x) for 5th 8 bytes
-            "movq %%mm3, 24(%1," PDX ",)  \n\t" // write Raw(x) for 4th 8 bytes
-            "paddb %%mm3, %%mm4           \n\t"
-            "movq 40(%1," PDX ",), %%mm5  \n\t" // load Sub(x) for 6th 8 bytes
-            "movq %%mm4, 32(%1," PDX ",)  \n\t" // write Raw(x) for 5th 8 bytes
-            "paddb %%mm4, %%mm5           \n\t"
-            "movq 48(%1," PDX ",), %%mm6  \n\t" // load Sub(x) for 7th 8 bytes
-            "movq %%mm5, 40(%1," PDX ",)  \n\t" // write Raw(x) for 6th 8 bytes
-            "paddb %%mm5, %%mm6           \n\t"
-            "movq 56(%1," PDX ",), %%mm7  \n\t" // load Sub(x) for 8th 8 bytes
-            "movq %%mm6, 48(%1," PDX ",)  \n\t" // write Raw(x) for 7th 8 bytes
-            "addl $64, %%edx              \n\t"
-            "paddb %%mm6, %%mm7           \n\t"
-            "cmpl %%esi, %%edx            \n\t" // cmp to bytes over mult of 64
-            "movq %%mm7, -8(%1," PDX ",)  \n\t" // write Raw(x) for 8th 8 bytes
-            "jb sub_8lp                   \n\t"
-
-            "cmpl %%eax, %%edx            \n\t" // compare to MMXLength
-            "jnb sub_8lt8                 \n\t"
-
-         "sub_8lpA:                       \n\t"
-            "movq (%1," PDX ",), %%mm0    \n\t"
-            "addl $8, %%edx               \n\t"
-            "paddb %%mm7, %%mm0           \n\t"
-            "cmpl %%eax, %%edx            \n\t" // compare to MMXLength
-            "movq %%mm0, -8(%1," PDX ",)  \n\t" // -8 to offset early addl edx
-            "movq %%mm0, %%mm7            \n\t" // move calculated Raw(x) data
-            "jb sub_8lpA                  \n\t" //  to mm7 to be new Raw(x-bpp)
-                                                //  for next loop
-         "sub_8lt8:                       \n\t"
-
-            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
-              "=D" (dummy_value_D),   // 1
-              "=d" (dummy_value_d),   // 2
-              "=a" (dummy_value_a)    // 3
-
-            : "0" (bpp),              // ecx    // input regs
-              "1" (row),              // edi
-              "2" (diff),             // edx
-              "3" (MMXLength)         // eax
-
-            : "%esi"                            // clobber list
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            , "%mm0", "%mm1", "%mm2", "%mm3"
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 8 bpp
-
-      default:                // bpp != 1,2,3,4,6,8:  doesn't exist
-      {
-         // ERROR:  SHOULD NEVER BE REACHED
-#if defined(PNG_DEBUG)
-         png_debug(1, "Internal libpng logic error (GCC "
-           "png_read_filter_row_mmx_sub())\n");
-#endif
-      }
-      break;
-
-   } // end switch (bpp)
-
-   __asm__ __volatile__ (
-//pre "movl MMXLength, %%eax         \n\t"
-//pre "mov  row, %1                  \n\t" // edi/rdi
-//pre "cmpl FullLength, %%eax        \n\t"
-      "cmpl %%edx, %%eax             \n\t"
-      "jnb sub_end                   \n\t"
-
-      "mov  %1, " PSI "              \n\t" // lp = row
-//pre "movl bpp, %%ecx               \n\t"
-      "add  " PCX ", %1              \n\t" // rp = row + bpp
-      "xorl %%ecx, %%ecx             \n\t"
-
-   "sub_lp2:                         \n\t"
-      "movb (" PSI "," PAX ",), %%cl \n\t"
-      "addb %%cl, (%1," PAX ",)      \n\t"
-      "incl %%eax                    \n\t"
-      "cmpl %%edx, %%eax             \n\t" // FullLength
-      "jb sub_lp2                    \n\t"
-
-   "sub_end:                         \n\t"
-      "EMMS                          \n\t" // end MMX instructions
-
-      : "=c" (dummy_value_c),   // 0      // output regs (dummy)
-        "=D" (dummy_value_D),   // 1
-        "=a" (dummy_value_a),   // 2
-        "=d" (dummy_value_d)    // 3
-
-      : "0" (bpp),              // ecx    // input regs
-        "1" (row),              // edi
-        "2" (MMXLength),        // eax
-        "3" (FullLength)        // edx
-
-      : "%esi"                            // clobber list
-   );
-
-} // end of png_read_filter_row_mmx_sub()
-
-#endif /* PNG_MMX_READ_FILTER_SUB_SUPPORTED */
-
-
-
-
-#if defined(PNG_MMX_READ_FILTER_UP_SUPPORTED)
-
-//===========================================================================//
-//                                                                           //
-//            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            //
-//                                                                           //
-//===========================================================================//
-
-// Optimized code for PNG Up filter decoder
-
-static void /* PRIVATE */
-png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
-                           png_bytep prev_row)
-{
-   unsigned len;        // png_uint_32 is actually 64-bit on x86-64
-   int dummy_value_d;   // fix 'forbidden register 3 (dx) was spilled' error
-   png_bytep dummy_value_S;
-   png_bytep dummy_value_D;
-
-   len = row_info->rowbytes;              // number of bytes to filter
-
-   __asm__ __volatile__ (
-      SAVE_GOT_ebx
-//pre "mov  prev_row, %1           \n\t" // esi/rsi
-//pre "movl row, %2                \n\t" // edi/rdi
-
-      "xorl %%ebx, %%ebx           \n\t"
-      "xorl %%eax, %%eax           \n\t"
-
-      // get # of bytes to alignment (note:  computing _delta_ of two pointers,
-      // so hereafter %%ecx is sufficient even on 64-bit)
-      "mov  %2, " PCX "            \n\t" // take start of row
-      "add  $0x7, " PCX "          \n\t" // add 7 to incr past alignment bdry
-//    "andl $0xfffffff8, %%ecx     \n\t" // mask to alignment boundary (32-bit!)
-      CLEAR_BOTTOM_3_BITS  PCX    "\n\t" // mask to alignment boundary
-      "sub  %2, " PCX "            \n\t" // subtract row ptr again => ebp =
-      "jz up_go                    \n\t" //  target value of ecx at alignment
-
-   "up_lp1:                        \n\t" // fix alignment
-      "movb (%2," PBX ",), %%al    \n\t"
-      "addb (%1," PBX ",), %%al    \n\t"
-      "incl %%ebx                  \n\t"
-      "cmpl %%ecx, %%ebx           \n\t"
-      "movb %%al, -1(%2," PBX ",)  \n\t" // mov does not affect flags; -1 to
-      "jb up_lp1                   \n\t" //  offset incl ebx
-
-   "up_go:                         \n\t"
-//pre "movl len, %%edx             \n\t"
-      "movl %%edx, %%ecx           \n\t"
-      "subl %%ebx, %%edx           \n\t" // subtract alignment fix
-      "andl $0x0000003f, %%edx     \n\t" // calc bytes over mult of 64
-      "subl %%edx, %%ecx           \n\t" // sub over-bytes from original length
-
-      // unrolled loop - use all MMX registers and interleave to reduce
-      // number of branch instructions (loops) and reduce partial stalls
-   "up_loop:                       \n\t"
-      "movq (%1," PBX ",), %%mm1   \n\t"
-      "movq (%2," PBX ",), %%mm0   \n\t"
-      "movq 8(%1," PBX ",), %%mm3  \n\t"
-      "paddb %%mm1, %%mm0          \n\t"
-      "movq 8(%2," PBX ",), %%mm2  \n\t"
-      "movq %%mm0, (%2," PBX ",)   \n\t"
-      "paddb %%mm3, %%mm2          \n\t"
-      "movq 16(%1," PBX ",), %%mm5 \n\t"
-      "movq %%mm2, 8(%2," PBX ",)  \n\t"
-      "movq 16(%2," PBX ",), %%mm4 \n\t"
-      "movq 24(%1," PBX ",), %%mm7 \n\t"
-      "paddb %%mm5, %%mm4          \n\t"
-      "movq 24(%2," PBX ",), %%mm6 \n\t"
-      "movq %%mm4, 16(%2," PBX ",) \n\t"
-      "paddb %%mm7, %%mm6          \n\t"
-      "movq 32(%1," PBX ",), %%mm1 \n\t"
-      "movq %%mm6, 24(%2," PBX ",) \n\t"
-      "movq 32(%2," PBX ",), %%mm0 \n\t"
-      "movq 40(%1," PBX ",), %%mm3 \n\t"
-      "paddb %%mm1, %%mm0          \n\t"
-      "movq 40(%2," PBX ",), %%mm2 \n\t"
-      "movq %%mm0, 32(%2," PBX ",) \n\t"
-      "paddb %%mm3, %%mm2          \n\t"
-      "movq 48(%1," PBX ",), %%mm5 \n\t"
-      "movq %%mm2, 40(%2," PBX ",) \n\t"
-      "movq 48(%2," PBX ",), %%mm4 \n\t"
-      "movq 56(%1," PBX ",), %%mm7 \n\t"
-      "paddb %%mm5, %%mm4          \n\t"
-      "movq 56(%2," PBX ",), %%mm6 \n\t"
-      "movq %%mm4, 48(%2," PBX ",) \n\t"
-      "addl $64, %%ebx             \n\t"
-      "paddb %%mm7, %%mm6          \n\t"
-      "cmpl %%ecx, %%ebx           \n\t"
-      "movq %%mm6, -8(%2," PBX ",) \n\t" // (+56)movq does not affect flags;
-      "jb up_loop                  \n\t" //  -8 to offset addl ebx
-
-      "cmpl $0, %%edx              \n\t" // test for bytes over mult of 64
-      "jz up_end                   \n\t"
-
-      "cmpl $8, %%edx              \n\t" // test for less than 8 bytes
-      "jb up_lt8                   \n\t" //  [added by lcreeve at netins.net]
-
-      "addl %%edx, %%ecx           \n\t"
-      "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
-      "subl %%edx, %%ecx           \n\t" // drop over-bytes from length
-      "jz up_lt8                   \n\t"
-
-   "up_lpA:                        \n\t" // use MMX regs to update 8 bytes sim.
-      "movq (%1," PBX ",), %%mm1   \n\t"
-      "movq (%2," PBX ",), %%mm0   \n\t"
-      "addl $8, %%ebx              \n\t"
-      "paddb %%mm1, %%mm0          \n\t"
-      "cmpl %%ecx, %%ebx           \n\t"
-      "movq %%mm0, -8(%2," PBX ",) \n\t" // movq does not affect flags; -8 to
-      "jb up_lpA                   \n\t" //  offset add ebx
-      "cmpl $0, %%edx              \n\t" // test for bytes over mult of 8
-      "jz up_end                   \n\t"
-
-   "up_lt8:                        \n\t"
-      "xorl %%eax, %%eax           \n\t"
-      "addl %%edx, %%ecx           \n\t" // move over byte count into counter
-
-   "up_lp2:                        \n\t" // use x86 regs for remaining bytes
-      "movb (%2," PBX ",), %%al    \n\t"
-      "addb (%1," PBX ",), %%al    \n\t"
-      "incl %%ebx                  \n\t"
-      "cmpl %%ecx, %%ebx           \n\t"
-      "movb %%al, -1(%2," PBX ",)  \n\t" // mov does not affect flags; -1 to
-      "jb up_lp2                   \n\t" //  offset inc ebx
-
-   "up_end:                        \n\t"
-      "EMMS                        \n\t" // conversion of filtered row complete
-      RESTORE_GOT_ebx
-
-      : "=d" (dummy_value_d),   // 0     // output regs (dummy)
-        "=S" (dummy_value_S),   // 1
-        "=D" (dummy_value_D)    // 2
-
-      : "0" (len),              // edx   // input regs
-        "1" (prev_row),         // esi
-        "2" (row)               // edi
-
-      : "%eax", "%ecx"                   // clobber list (no input regs!)
-        _CLOBBER_GOT_ebx
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-      , "%mm0", "%mm1", "%mm2", "%mm3"
-      , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-   );
-
-} // end of png_read_filter_row_mmx_up()
-
-#endif /* PNG_MMX_READ_FILTER_UP_SUPPORTED */
-
-
-
-
-/*===========================================================================*/
-/*                                                                           */
-/*                   P N G _ R E A D _ F I L T E R _ R O W                   */
-/*                                                                           */
-/*===========================================================================*/
-
-/* Optimized png_read_filter_row routines */
-
-void /* PRIVATE */
-png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
-   row, png_bytep prev_row, int filter)
-{
-#if defined(PNG_DEBUG)
-   char filtname[10];
-#endif
-
-   if (_mmx_supported == 2) {
-#if !defined(PNG_1_0_X)
-       /* this should have happened in png_init_mmx_flags() already */
-       png_warning(png_ptr, "asm_flags may not have been initialized");
-#endif
-       png_mmx_support();
-   }
-
-#if defined(PNG_DEBUG)
-   png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
-   switch (filter)
-   {
-      case 0:
-         png_snprintf(filtname, 10, "none");
-         break;
-
-      case 1:
-         png_snprintf(filtname, 10, "sub-%s",
-#ifdef PNG_MMX_READ_FILTER_SUB_SUPPORTED
-#if !defined(PNG_1_0_X)
-           ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
-            (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-            (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-           _mmx_supported
-#endif
-           ? "MMX" :
-#endif
-           "C");
-         break;
-
-      case 2:
-         png_snprintf(filtname, 10, "up-%s",
-#ifdef PNG_MMX_READ_FILTER_UP_SUPPORTED
-#if !defined(PNG_1_0_X)
-           ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
-            (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-            (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-           _mmx_supported
-#endif
-           ? "MMX" :
-#endif
-           "C");
-         break;
-
-      case 3:
-         png_snprintf(filtname, 10, "avg-%s",
-#ifdef PNG_MMX_READ_FILTER_AVG_SUPPORTED
-#if !defined(PNG_1_0_X)
-           ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
-            (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-            (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-           _mmx_supported
-#endif
-           ? "MMX" :
-#endif
-           "C");
-         break;
-
-      case 4:
-         png_snprintf(filtname, 10, "paeth-%s",
-#ifdef PNG_MMX_READ_FILTER_PAETH_SUPPORTED
-#if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
-#if !defined(PNG_1_0_X)
-           ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
-            (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-            (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-           _mmx_supported
-#endif
-           ? "MMX" :
-#endif /* PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK */
-#endif
-           "C");
-         break;
-
-      default:
-         png_snprintf(filtname, 10, "unknown");
-         break;
-   }
-   png_debug2(2, "row_number=%ld, %s, ", png_ptr->row_number, filtname);
-   //png_debug1(0, "png_ptr=%10p, ", png_ptr);
-   //png_debug1(0, "asm_flags=0x%08lx, ", png_ptr->asm_flags);
-   png_debug1(0, "row=%10p, ", row);
-   png_debug2(0, "pixdepth=%d, bytes=%d, ", (int)row_info->pixel_depth,
-      (int)((row_info->pixel_depth + 7) >> 3));
-   png_debug1(0, "rowbytes=%ld\n", row_info->rowbytes);
-#endif /* PNG_DEBUG */
-
-   switch (filter)
-   {
-      case PNG_FILTER_VALUE_NONE:
-         break;
-
-      case PNG_FILTER_VALUE_SUB:
-#ifdef PNG_MMX_READ_FILTER_SUB_SUPPORTED
-#if !defined(PNG_1_0_X)
-         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
-             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-         if (_mmx_supported)
-#endif
-         {
-            png_read_filter_row_mmx_sub(row_info, row);
-         }
-         else
-#endif
-         {
-            png_uint_32 i;
-            png_uint_32 istop = row_info->rowbytes;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_bytep rp = row + bpp;
-            png_bytep lp = row;
-
-            for (i = bpp; i < istop; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
-               rp++;
-            }
-         }  /* end !UseMMX_sub */
-         break;
-
-      case PNG_FILTER_VALUE_UP:
-#ifdef PNG_MMX_READ_FILTER_UP_SUPPORTED
-#if !defined(PNG_1_0_X)
-         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
-             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-         if (_mmx_supported)
-#endif
-         {
-            png_read_filter_row_mmx_up(row_info, row, prev_row);
-         }
-          else
-#endif
-         {
-            png_uint_32 i;
-            png_uint_32 istop = row_info->rowbytes;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-
-            for (i = 0; i < istop; ++i)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
-               rp++;
-            }
-         }  /* end !UseMMX_up */
-         break;
-
-      case PNG_FILTER_VALUE_AVG:
-#ifdef PNG_MMX_READ_FILTER_AVG_SUPPORTED
-#if !defined(PNG_1_0_X)
-         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
-             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-         if (_mmx_supported)
-#endif
-         {
-            png_read_filter_row_mmx_avg(row_info, row, prev_row);
-         }
-         else
-#endif
-         {
-            png_uint_32 i;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-            png_bytep lp = row;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_uint_32 istop = row_info->rowbytes - bpp;
-
-            for (i = 0; i < bpp; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) +
-                  ((int)(*pp++) >> 1)) & 0xff);
-               rp++;
-            }
-
-            for (i = 0; i < istop; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) +
-                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
-               rp++;
-            }
-         }  /* end !UseMMX_avg */
-         break;
-
-      case PNG_FILTER_VALUE_PAETH:
-#ifdef PNG_MMX_READ_FILTER_PAETH_SUPPORTED
-#if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
-#if !defined(PNG_1_0_X)
-         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
-             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-         if (_mmx_supported)
-#endif
-         {
-            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
-         }
-         else
-#endif /* PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK */
-#endif
-         {
-            png_uint_32 i;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-            png_bytep lp = row;
-            png_bytep cp = prev_row;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_uint_32 istop = row_info->rowbytes - bpp;
-
-            for (i = 0; i < bpp; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
-               rp++;
-            }
-
-            for (i = 0; i < istop; i++)   /* use leftover rp,pp */
-            {
-               int a, b, c, pa, pb, pc, p;
-
-               a = *lp++;
-               b = *pp++;
-               c = *cp++;
-
-               p = b - c;
-               pc = a - c;
-
-#if defined(PNG_USE_ABS)
-               pa = abs(p);
-               pb = abs(pc);
-               pc = abs(p + pc);
-#else
-               pa = p < 0 ? -p : p;
-               pb = pc < 0 ? -pc : pc;
-               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
-#endif
-
-               /*
-                  if (pa <= pb && pa <= pc)
-                     p = a;
-                  else if (pb <= pc)
-                     p = b;
-                  else
-                     p = c;
-                */
-
-               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
-
-               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
-               rp++;
-            }
-         }  /* end !UseMMX_paeth */
-         break;
-
-      default:
-         png_warning(png_ptr, "Ignoring bad row-filter type");
-         *row=0;
-         break;
-   }
-}
-
-#endif /* PNG_HAVE_MMX_READ_FILTER_ROW */
-
-
-#endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGGCCRD */
-#endif /* __GNUC__ */
+/* pnggccrd.c was removed from libpng-1.2.20. */
diff --git a/pngget.c b/pngget.c
index 75d2ca015..9cf1b845b 100644
--- a/pngget.c
+++ b/pngget.c
@@ -841,6 +841,7 @@ png_get_compression_buffer_size(png_structp png_ptr)
 png_uint_32 PNGAPI
 png_get_asm_flags (png_structp png_ptr)
 {
+    /* obsolete, to be removed from libpng-1.4.0 */
 #ifdef PNG_MMX_CODE_SUPPORTED
     return (png_uint_32)(png_ptr? png_ptr->asm_flags : 0L);
 #else
@@ -852,94 +853,36 @@ png_get_asm_flags (png_structp png_ptr)
 png_uint_32 PNGAPI
 png_get_asm_flagmask (int flag_select)
 {
-#ifdef PNG_MMX_CODE_SUPPORTED
-    png_uint_32 settable_asm_flags = 0;
-
-    if (flag_select & PNG_SELECT_READ)
-        settable_asm_flags |=
-          PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  |
-          PNG_ASM_FLAG_MMX_READ_INTERLACE    |
-          PNG_ASM_FLAG_MMX_READ_FILTER_SUB   |
-          PNG_ASM_FLAG_MMX_READ_FILTER_UP    |
-          PNG_ASM_FLAG_MMX_READ_FILTER_AVG   |
-          PNG_ASM_FLAG_MMX_READ_FILTER_PAETH ;
-          /* no non-MMX flags yet */
-
-#if 0
-    /* GRR:  no write-flags yet, either, but someday... */
-    if (flag_select & PNG_SELECT_WRITE)
-        settable_asm_flags |=
-          PNG_ASM_FLAG_MMX_WRITE_ [whatever] ;
-#endif /* 0 */
-
-    return settable_asm_flags;  /* _theoretically_ settable capabilities only */
-#else
-    return (0L);
-#endif /* PNG_MMX_CODE_SUPPORTED */
+    /* obsolete, to be removed from libpng-1.4.0 */
+    flag_select=flag_select;
+    return 0L;
 }
 
-
     /* GRR:  could add this:   && defined(PNG_MMX_CODE_SUPPORTED) */
 /* this function was added to libpng 1.2.0 */
 png_uint_32 PNGAPI
 png_get_mmx_flagmask (int flag_select, int *compilerID)
 {
-#if defined(PNG_MMX_CODE_SUPPORTED)
-    png_uint_32 settable_mmx_flags = 0;
-
-    if (flag_select & PNG_SELECT_READ)
-        settable_mmx_flags |=
-          PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  |
-          PNG_ASM_FLAG_MMX_READ_INTERLACE    |
-          PNG_ASM_FLAG_MMX_READ_FILTER_SUB   |
-          PNG_ASM_FLAG_MMX_READ_FILTER_UP    |
-          PNG_ASM_FLAG_MMX_READ_FILTER_AVG   |
-          PNG_ASM_FLAG_MMX_READ_FILTER_PAETH ;
-#if 0
-    /* GRR:  no MMX write support yet, but someday... */
-    if (flag_select & PNG_SELECT_WRITE)
-        settable_mmx_flags |=
-          PNG_ASM_FLAG_MMX_WRITE_ [whatever] ;
-#endif /* 0 */
-
-    if (compilerID != NULL) {
-#ifdef PNG_USE_PNGVCRD
-        *compilerID = 1;    /* MSVC */
-#else
-#ifdef PNG_USE_PNGGCCRD
-        *compilerID = 2;    /* gcc/gas */
-#else
-        *compilerID = -1;   /* unknown (i.e., no asm/MMX code compiled) */
-#endif
-#endif
-    }
-
-    return settable_mmx_flags;  /* _theoretically_ settable capabilities only */
-#else
-    return (0L);
-#endif /* ?PNG_MMX_CODE_SUPPORTED */
+    /* obsolete, to be removed from libpng-1.4.0 */
+    flag_select=flag_select;
+    *compilerID = -1;   /* unknown (i.e., no asm/MMX code compiled) */
+    return 0L;
 }
 
 /* this function was added to libpng 1.2.0 */
 png_byte PNGAPI
 png_get_mmx_bitdepth_threshold (png_structp png_ptr)
 {
-#if defined(PNG_MMX_CODE_SUPPORTED)
-    return (png_byte)(png_ptr? png_ptr->mmx_bitdepth_threshold : 0);
-#else
+    /* obsolete, to be removed from libpng-1.4.0 */
     return (png_ptr? 0: 0);
-#endif /* ?PNG_MMX_CODE_SUPPORTED */
 }
 
 /* this function was added to libpng 1.2.0 */
 png_uint_32 PNGAPI
 png_get_mmx_rowbytes_threshold (png_structp png_ptr)
 {
-#if defined(PNG_MMX_CODE_SUPPORTED)
-    return (png_uint_32)(png_ptr? png_ptr->mmx_rowbytes_threshold : 0L);
-#else
+    /* obsolete, to be removed from libpng-1.4.0 */
     return (png_ptr? 0L: 0L);
-#endif /* ?PNG_MMX_CODE_SUPPORTED */
 }
 #endif /* ?PNG_1_0_X */
 #endif /* ?PNG_ASSEMBLER_CODE_SUPPORTED */
diff --git a/pngpread.c b/pngpread.c
index 3d910c30b..d5952bbd9 100644
--- a/pngpread.c
+++ b/pngpread.c
@@ -1,7 +1,7 @@
 
 /* pngpread.c - read a png file in push mode
  *
- * Last changed in libpng 1.2.20 August 30, 2007
+ * Last changed in libpng 1.2.20 September 1, 2007
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
@@ -1001,11 +1001,6 @@ png_read_push_finish_row(png_structp png_ptr)
    /* offset to next interlace block in the y direction */
    PNG_CONST int FARDATA png_pass_yinc[] = {8, 8, 8, 4, 4, 2, 2};
 
-   /* Width of interlace block.  This is not currently used - if you need
-    * it, uncomment it here and in png.h
-   PNG_CONST int FARDATA png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
-   */
-
    /* Height of interlace block.  This is not currently used - if you need
     * it, uncomment it here and in png.h
    PNG_CONST int FARDATA png_pass_height[] = {8, 8, 4, 4, 2, 2, 1};
diff --git a/pngread.c b/pngread.c
index c26ce6112..c9140309e 100644
--- a/pngread.c
+++ b/pngread.c
@@ -55,12 +55,6 @@ png_create_read_struct_2(png_const_charp user_png_ver, png_voidp error_ptr,
    if (png_ptr == NULL)
       return (NULL);
 
-#if !defined(PNG_1_0_X)
-#ifdef PNG_MMX_CODE_SUPPORTED
-   png_init_mmx_flags(png_ptr);   /* 1.2.0 addition */
-#endif
-#endif /* PNG_1_0_X */
-
    /* added at libpng-1.2.6 */
 #ifdef PNG_SET_USER_LIMITS_SUPPORTED
    png_ptr->user_width_max=PNG_USER_WIDTH_MAX;
diff --git a/pngrutil.c b/pngrutil.c
index 4da862e3c..99bca1933 100644
--- a/pngrutil.c
+++ b/pngrutil.c
@@ -2277,1051 +2277,10 @@ png_check_chunk_name(png_structp png_ptr, png_bytep chunk_name)
    to any alpha or transparency value associated with the pixel.  If
    you want all pixels to be combined, pass 0xff (255) in mask.  */
 
-/* Optimized C version of utilities to read a PNG file
- *
- * Based on code contributed by Nirav Chhatrapati, Intel Corp., 1998.
- * Interface to libpng contributed by Gilles Vollant, 1999.
- * GNU C port by Greg Roelofs, 1999-2001.
- *
- */
-
-#if defined(PNG_OPTIMIZED_CODE_SUPPORTED)
-#if !defined(PNG_HAVE_MMX_COMBINE_ROW)
-
-/*===========================================================================*/
-/*                                                                           */
-/*                       P N G _ C O M B I N E _ R O W                       */
-/*                                                                           */
-/*===========================================================================*/
-
-
-#define BPP2  2
-#define BPP3  3 /* bytes per pixel (a.k.a. pixel_bytes) */
-#define BPP4  4
-#define BPP6  6 /* (defined only to help avoid cut-and-paste errors) */
-#define BPP8  8
-
-/* Combines the row recently read in with the previous row.
-   This routine takes care of alpha and transparency if requested.
-   This routine also handles the two methods of progressive display
-   of interlaced images, depending on the mask value.
-   The mask value describes which pixels are to be combined with
-   the row.  The pattern always repeats every 8 pixels, so just 8
-   bits are needed.  A one indicates the pixel is to be combined; a
-   zero indicates the pixel is to be skipped.  This is in addition
-   to any alpha or transparency value associated with the pixel.
-   If you want all pixels to be combined, pass 0xff (255) in mask. */
-
-/* Use this routine for the x86 platform - it uses a faster MMX routine
-   if the machine supports MMX. */
-
-void /* PRIVATE */
-png_combine_row(png_structp png_ptr, png_bytep row, int mask)
-{
-
-#if defined(PNG_USE_LOCAL_ARRAYS)
-static PNG_CONST int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
-static PNG_CONST int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
-static PNG_CONST int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
-#endif
-
-   png_debug(1, "in png_combine_row (pngrutil.c OPTIMIZED)\n");
-
-   if (mask == 0xff)
-   {
-      png_debug(2,"mask == 0xff:  doing single png_memcpy()\n");
-      png_memcpy(row, png_ptr->row_buf + 1,
-       (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
-   }
-   else   /* (png_combine_row() is never called with mask == 0) */
-   {
-      switch (png_ptr->row_info.pixel_depth)
-      {
-         /* most common case:  combining 24-bit RGB */
-         case 24:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = BPP3 * png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7; /* reduce to mult of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff*BPP3;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            } /* end of else (_mmx_supported) */
-
-            break;
-         }       /* end 24 bpp */
-
-         case 32:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = BPP4 * png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7; /* reduce to mult of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff*BPP4;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            }
-
-            break;
-         }       /* end 32 bpp */
-
-         case 8:        /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7; /* reduce to mult of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = len;  /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff /* *BPP1 */ ;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            }
-
-            break;
-         }       /* end 8 bpp */
-
-         case 1:        /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_inc, s_start, s_end;
-            int m;
-            int shift;
-            png_uint_32 i;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-                s_start = 0;
-                s_end = 7;
-                s_inc = 1;
-            }
-            else
-#endif
-            {
-                s_start = 7;
-                s_end = 0;
-                s_inc = -1;
-            }
-
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  int value;
-
-                  value = (*sp >> shift) & 0x1;
-                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }       /* end 1 bpp */
-
-         case 2:        /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_start, s_end, s_inc;
-            int m;
-            int shift;
-            png_uint_32 i;
-            int value;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-               s_start = 0;
-               s_end = 6;
-               s_inc = 2;
-            }
-            else
-#endif
-            {
-               s_start = 6;
-               s_end = 0;
-               s_inc = -2;
-            }
-
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  value = (*sp >> shift) & 0x3;
-                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }       /* end 2 bpp */
-
-         case 4:        /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_start, s_end, s_inc;
-            int m;
-            int shift;
-            png_uint_32 i;
-            int value;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-               s_start = 0;
-               s_end = 4;
-               s_inc = 4;
-            }
-            else
-#endif
-            {
-               s_start = 4;
-               s_end = 0;
-               s_inc = -4;
-            }
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  value = (*sp >> shift) & 0xf;
-                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }       /* end 4 bpp */
-
-         case 16:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = BPP2 * png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = BPP2 * len;   /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff*BPP2;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            } /* end of else (_mmx_supported) */
-
-            break;
-         }       /* end 16 bpp */
-
-
-
-         case 48:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = BPP6 * png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7; /* reduce to mult of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff*BPP6;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            }
-            break;
-         }       /* end 48 bpp */
-
-         case 64:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            register png_uint_32 i;
-            png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
-              /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-            register int stride = BPP8 * png_pass_inc[png_ptr->pass];
-              /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-            register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
-              /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-            png_uint_32 len = png_ptr->width &~7;  /* reduce to mult of 8 */
-            int diff = (int) (png_ptr->width & 7); /* amount lost */
-            register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */
-
-            srcptr = png_ptr->row_buf + 1 + initial_val;
-            dstptr = row + initial_val;
-
-            for (i = initial_val; i < final_val; i += stride)
-            {
-               png_memcpy(dstptr, srcptr, rep_bytes);
-               srcptr += stride;
-               dstptr += stride;
-            }
-            if (diff)  /* number of leftover pixels:  3 for pngtest */
-            {
-               final_val += diff*BPP8;
-               for (; i < final_val; i += stride)
-               {
-                  if (rep_bytes > (int)(final_val-i))
-                     rep_bytes = (int)(final_val-i);
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-            }
-
-            break;
-         }       /* end 64 bpp */
-
-         default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
-         {
-            /* this should never happen */
-            png_warning(png_ptr, "Invalid row_info.pixel_depth in pngrutil");
-            break;
-         }
-      } /* end switch (png_ptr->row_info.pixel_depth) */
-
-   } /* end if (non-trivial mask) */
-
-} /* end png_combine_row() */
-#endif /* PNG_HAVE_MMX_COMBINE_ROW */
-
-
-
-/*===========================================================================*/
-/*                                                                           */
-/*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
-/*                                                                           */
-/*===========================================================================*/
-
-#if defined(PNG_READ_INTERLACING_SUPPORTED)
-#if !defined(PNG_HAVE_MMX_READ_INTERLACE)
-
-/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
- * has taken place.  [GRR: what other steps come before and/or after?]
- */
-
-void /* PRIVATE */
-png_do_read_interlace(png_structp png_ptr)
-{
-#if defined(PNG_USE_LOCAL_ARRAYS)
-static PNG_CONST int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
-#endif
-   png_row_infop row_info = &(png_ptr->row_info);
-   png_bytep row = png_ptr->row_buf + 1;
-   int pass = png_ptr->pass;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-   png_uint_32 transformations = png_ptr->transformations;
-#endif
-   png_debug(1,"in png_do_read_interlace (pngrutil.c OPTIMIZED)\n");
-
-   if (row != NULL && row_info != NULL)
-   {
-      png_uint_32 final_width;
-
-      final_width = row_info->width * png_pass_inc[pass];
-
-      switch (row_info->pixel_depth)
-      {
-         case 1:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_byte v;
-            png_uint_32 i;
-            int j;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 3);
-            dp = row + (png_size_t)((final_width - 1) >> 3);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (int)((row_info->width + 7) & 7);
-               dshift = (int)((final_width + 7) & 7);
-               s_start = 7;
-               s_end = 0;
-               s_inc = -1;
-            }
-            else
-#endif
-            {
-               sshift = 7 - (int)((row_info->width + 7) & 7);
-               dshift = 7 - (int)((final_width + 7) & 7);
-               s_start = 0;
-               s_end = 7;
-               s_inc = 1;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               v = (png_byte)((*sp >> sshift) & 0x1);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-         case 2:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_uint_32 i;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 2);
-            dp = row + (png_size_t)((final_width - 1) >> 2);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
-               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
-               s_start = 6;
-               s_end = 0;
-               s_inc = -2;
-            }
-            else
-#endif
-            {
-               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
-               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
-               s_start = 0;
-               s_end = 6;
-               s_inc = 2;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               png_byte v;
-               int j;
-
-               v = (png_byte)((*sp >> sshift) & 0x3);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-         case 4:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_uint_32 i;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 1);
-            dp = row + (png_size_t)((final_width - 1) >> 1);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
-               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
-               s_start = 4;
-               s_end = 0;
-               s_inc = -4;
-            }
-            else
-#endif
-            {
-               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
-               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
-               s_start = 0;
-               s_end = 4;
-               s_inc = 4;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               png_byte v;
-               int j;
-
-               v = (png_byte)((*sp >> sshift) & 0xf);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-       /*====================================================================*/
-
-         default: /* 8-bit or larger (this is where the routine is modified) */
-         {
-            png_bytep sptr, dp;
-            png_uint_32 i;
-            png_size_t pixel_bytes;
-            int width = (int)row_info->width;
-
-            pixel_bytes = (row_info->pixel_depth >> 3);
-
-            /* point sptr at the last pixel in the pre-expanded row: */
-            sptr = row + (width - 1) * pixel_bytes;
-
-            /* point dp at the last pixel position in the expanded row: */
-            dp = row + (final_width - 1) * pixel_bytes;
-
-            /* MMX not supported:  use modified C code - takes advantage
-             *   of inlining of png_memcpy for a constant */
-            /* GRR 19991007:  does it?  or should pixel_bytes in each
-             *   block be replaced with immediate value (e.g., 1)? */
-            /* GRR 19991017:  replaced with constants in each case */
-            {
-               if (pixel_bytes == 1)
-               {
-                  for (i = width; i; i--)
-                  {
-                     int j;
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        *dp-- = *sptr;
-                     }
-                     --sptr;
-                  }
-               }
-               else if (pixel_bytes == 3)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, 3);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, 3);
-                        dp -= 3;
-                     }
-                     sptr -= 3;
-                  }
-               }
-               else if (pixel_bytes == 2)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, 2);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, 2);
-                        dp -= 2;
-                     }
-                     sptr -= 2;
-                  }
-               }
-               else if (pixel_bytes == 4)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, 4);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-#if defined(PNG_DEBUG) && defined(PNG_1_0_X)
-                        if (dp < row || dp+3 > row+png_ptr->row_buf_size)
-                        {
-                           printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
-                             row, dp, row+png_ptr->row_buf_size);
-                           printf("row_buf=%d\n",png_ptr->row_buf_size);
-                        }
-#endif
-                        png_memcpy(dp, v, 4);
-                        dp -= 4;
-                     }
-                     sptr -= 4;
-                  }
-               }
-               else if (pixel_bytes == 6)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, 6);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, 6);
-                        dp -= 6;
-                     }
-                     sptr -= 6;
-                  }
-               }
-               else if (pixel_bytes == 8)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, 8);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, 8);
-                        dp -= 8;
-                     }
-                     sptr -= 8;
-                  }
-               }
-               else     /* GRR:  should never be reached */
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, pixel_bytes);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, pixel_bytes);
-                        dp -= pixel_bytes;
-                     }
-                     sptr -= pixel_bytes;
-                  }
-               }
-
-            }
-            break;
-         }
-      } /* end switch (row_info->pixel_depth) */
-
-      row_info->width = final_width;
-
-      row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
-   }
-
-} /* end png_do_read_interlace() */
-
-#endif /* PNG_HAVE_MMX_READ_INTERLACE */
-#endif /* PNG_READ_INTERLACING_SUPPORTED */
-
-
-
-#if !defined(PNG_HAVE_MMX_READ_FILTER_ROW)
-/*===========================================================================*/
-/*                                                                           */
-/*                   P N G _ R E A D _ F I L T E R _ R O W                   */
-/*                                                                           */
-/*===========================================================================*/
-
-
-/* Optimized png_read_filter_row routines */
-
-void /* PRIVATE */
-png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
-   row, png_bytep prev_row, int filter)
-{
-#if defined(PNG_DEBUG)
-   char filnm[10];
-#endif
-
-
-#if defined(PNG_DEBUG)
-   png_debug(1, "in png_read_filter_row (pngrutil.c OPTIMIZED)\n");
-   switch (filter)
-   {
-      case 0:
-         png_snprintf(filnm, 10, "none");
-         break;
-
-      case 1:
-         png_snprintf(filnm, 10, "sub-%s",
-             "x86");
-         break;
-
-      case 2:
-         png_snprintf(filnm, 10, "up-%s",
-             "x86");
-         break;
-
-      case 3:
-         png_snprintf(filnm, 10, "avg-%s",
-             "x86");
-         break;
-
-      case 4:
-         png_snprintf(filnm, 10, "Paeth-%s",
-             "x86");
-         break;
-
-      default:
-         png_snprintf(filnm, 10, "unknown");
-         break;
-   }
-   png_debug2(0, "row_number=%5ld, %10s, ", png_ptr->row_number, filnm);
-   png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
-   png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
-      (int)((row_info->pixel_depth + 7) >> 3));
-   png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
-#endif /* PNG_DEBUG */
-
-   switch (filter)
-   {
-      case PNG_FILTER_VALUE_NONE:
-         break;
-
-      case PNG_FILTER_VALUE_SUB:
-         {
-            png_uint_32 i;
-            png_uint_32 istop = row_info->rowbytes;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_bytep rp = row + bpp;
-            png_bytep lp = row;
-
-            for (i = bpp; i < istop; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
-               rp++;
-            }
-         }
-         break;
-
-      case PNG_FILTER_VALUE_UP:
-         {
-            png_uint_32 i;
-            png_uint_32 istop = row_info->rowbytes;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-
-            for (i = 0; i < istop; ++i)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
-               rp++;
-            }
-         }
-         break;
-
-      case PNG_FILTER_VALUE_AVG:
-         {
-            png_uint_32 i;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-            png_bytep lp = row;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_uint_32 istop = row_info->rowbytes - bpp;
-
-            for (i = 0; i < bpp; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) +
-                  ((int)(*pp++) >> 1)) & 0xff);
-               rp++;
-            }
-
-            for (i = 0; i < istop; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) +
-                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
-               rp++;
-            }
-         }
-         break;
-
-      case PNG_FILTER_VALUE_PAETH:
-         {
-            png_uint_32 i;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-            png_bytep lp = row;
-            png_bytep cp = prev_row;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_uint_32 istop = row_info->rowbytes - bpp;
-
-            for (i = 0; i < bpp; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
-               rp++;
-            }
-
-            for (i = 0; i < istop; i++)   /* use leftover rp,pp */
-            {
-               int a, b, c, pa, pb, pc, p;
-
-               a = *lp++;
-               b = *pp++;
-               c = *cp++;
-
-               p = b - c;
-               pc = a - c;
-
-#if defined(PNG_USE_ABS)
-               pa = abs(p);
-               pb = abs(pc);
-               pc = abs(p + pc);
-#else
-               pa = p < 0 ? -p : p;
-               pb = pc < 0 ? -pc : pc;
-               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
-#endif
-
-               /*
-                  if (pa <= pb && pa <= pc)
-                     p = a;
-                  else if (pb <= pc)
-                     p = b;
-                  else
-                     p = c;
-                */
-
-               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
-
-               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
-               rp++;
-            }
-         }
-         break;
-
-      default:
-         png_warning(png_ptr, "Ignoring bad row-filter type");
-         *row=0;
-         break;
-   }
-}
-
-#endif /* PNG_HAVE_MMX_READ_FILTER_ROW */
-#endif /* PNG_OPTIMIZED_CODE_SUPPORTED */
-
-#if !defined(PNG_ASSEMBLER_CODE_SUPPORTED) || \
-    !defined(PNG_MMX_CODE_SUPPORTED) || \
-    (!defined(PNG_USE_PNGGCCRD) && !defined(PNG_USE_PNGVCRD))
-#if !defined(PNG_OPTIMIZED_CODE_SUPPORTED)
-/* Use the unoptimized original C code.  This might be removed from a future
- * version of libpng if testing proves it to be worthless. */
 void /* PRIVATE */
 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
 {
-   png_debug(1,"in png_combine_row NOT OPTIMIZED\n");
+   png_debug(1,"in png_combine_row\n");
    if (mask == 0xff)
    {
       png_memcpy(row, png_ptr->row_buf + 1,
@@ -3537,7 +2496,7 @@ png_do_read_interlace(png_structp png_ptr)
    PNG_CONST int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
 #endif
 
-   png_debug(1,"in png_do_read_interlace (pngrutil.c NOT OPTIMIZED)\n");
+   png_debug(1,"in png_do_read_interlace\n");
    if (row != NULL && row_info != NULL)
    {
       png_uint_32 final_width;
@@ -3750,7 +2709,7 @@ void /* PRIVATE */
 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep row,
    png_bytep prev_row, int filter)
 {
-   png_debug(1, "in png_read_filter_row (NOT OPTIMIZED)\n");
+   png_debug(1, "in png_read_filter_row\n");
    png_debug2(2,"row = %lu, filter = %d\n", png_ptr->row_number, filter);
    switch (filter)
    {
@@ -3868,9 +2827,6 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep row,
          break;
    }
 }
-#endif /* !PNG_OPTIMIZED_CODE_SUPPORTED */
-#endif /* !PNG_ASSEMBLER_CODE_SUPPORTED || !PNG_MMX_CODE_SUPPORTED || */
-       /* (!PNG_USE_PNGGCCRD && !PNG_USE_PNGVCRD) */
 
 void /* PRIVATE */
 png_read_finish_row(png_structp png_ptr)
@@ -4165,9 +3121,6 @@ defined(PNG_USER_TRANSFORM_PTR_SUPPORTED)
 #endif
    png_ptr->big_row_buf = (png_bytep)png_malloc(png_ptr, row_bytes+64);
    png_ptr->row_buf = png_ptr->big_row_buf+32;
-#if defined(PNG_DEBUG) && defined(PNG_USE_PNGGCCRD) && defined(PNG_1_0_X)
-   png_ptr->row_buf_size = row_bytes;
-#endif
 
 #ifdef PNG_MAX_MALLOC_64K
    if ((png_uint_32)png_ptr->rowbytes + 1 > (png_uint_32)65536L)
diff --git a/pngset.c b/pngset.c
index 8d1cbfcd0..2871140d6 100644
--- a/pngset.c
+++ b/pngset.c
@@ -1202,51 +1202,13 @@ png_set_invalid(png_structp png_ptr, png_infop info_ptr, int mask)
 
 #ifndef PNG_1_0_X
 #ifdef PNG_ASSEMBLER_CODE_SUPPORTED
-/* this function was added to libpng 1.2.0 and should always exist by default */
+/* function was added to libpng 1.2.0 and should always exist by default */
 void PNGAPI
 png_set_asm_flags (png_structp png_ptr, png_uint_32 asm_flags)
 {
-#ifdef PNG_MMX_CODE_SUPPORTED
-    png_uint_32 settable_asm_flags;
-    png_uint_32 settable_mmx_flags;
-#endif
-    if (png_ptr == NULL)
-       return;
-#ifdef PNG_MMX_CODE_SUPPORTED
-
-    settable_mmx_flags =
-#ifdef PNG_HAVE_MMX_COMBINE_ROW
-                         PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  |
-#endif
-#ifdef PNG_HAVE_MMX_READ_INTERLACE
-                         PNG_ASM_FLAG_MMX_READ_INTERLACE    |
-#endif
-#ifdef PNG_HAVE_MMX_READ_FILTER_ROW
-                         PNG_ASM_FLAG_MMX_READ_FILTER_SUB   |
-                         PNG_ASM_FLAG_MMX_READ_FILTER_UP    |
-                         PNG_ASM_FLAG_MMX_READ_FILTER_AVG   |
-                         PNG_ASM_FLAG_MMX_READ_FILTER_PAETH |
-#endif
-                         0;
-
-    /* could be some non-MMX ones in the future, but not currently: */
-    settable_asm_flags = settable_mmx_flags;
-
-    if (!(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_SUPPORT_COMPILED) ||
-        !(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_SUPPORT_IN_CPU))
-    {
-        /* clear all MMX flags if MMX isn't supported */
-        settable_asm_flags &= ~settable_mmx_flags;
-        png_ptr->asm_flags &= ~settable_mmx_flags;
-    }
-
-    /* we're replacing the settable bits with those passed in by the user,
-     * so first zero them out of the master copy, then bitwise-OR in the
-     * allowed subset that was requested */
-
-    png_ptr->asm_flags &= ~settable_asm_flags;               /* zero them */
-    png_ptr->asm_flags |= (asm_flags & settable_asm_flags);  /* set them */
-#endif /* ?PNG_MMX_CODE_SUPPORTED */
+/* Obsolete as of libpng-1.2.20 and will be removed from libpng-1.4.0 */
+    if (png_ptr != NULL)
+    png_ptr->asm_flags = 0;
 }
 
 /* this function was added to libpng 1.2.0 */
@@ -1255,12 +1217,9 @@ png_set_mmx_thresholds (png_structp png_ptr,
                         png_byte mmx_bitdepth_threshold,
                         png_uint_32 mmx_rowbytes_threshold)
 {
+/* Obsolete as of libpng-1.2.20 and will be removed from libpng-1.4.0 */
     if (png_ptr == NULL)
        return;
-#ifdef PNG_MMX_CODE_SUPPORTED
-    png_ptr->mmx_bitdepth_threshold = mmx_bitdepth_threshold;
-    png_ptr->mmx_rowbytes_threshold = mmx_rowbytes_threshold;
-#endif /* ?PNG_MMX_CODE_SUPPORTED */
 }
 #endif /* ?PNG_ASSEMBLER_CODE_SUPPORTED */
 
diff --git a/pngtest.c b/pngtest.c
index c46d91778..70734a366 100644
--- a/pngtest.c
+++ b/pngtest.c
@@ -1548,4 +1548,4 @@ main(int argc, char *argv[])
 }
 
 /* Generate a compiler error if there is an old png.h in the search path. */
-typedef version_1_2_20rc3 your_png_h_is_not_version_1_2_20rc3;
+typedef version_1_2_20rc4 your_png_h_is_not_version_1_2_20rc4;
diff --git a/pngvcrd.c b/pngvcrd.c
index edd8ff6be..ce4233efe 100644
--- a/pngvcrd.c
+++ b/pngvcrd.c
@@ -1,3917 +1 @@
-
-/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
- *
- * For Intel x86 CPU and Microsoft Visual C++ compiler
- *
- * Last changed in libpng 1.2.20 August 30, 2007
- * For conditions of distribution and use, see copyright notice in png.h
- * Copyright (c) 1998-2007 Glenn Randers-Pehrson
- * Copyright (c) 1998, Intel Corporation
- *
- * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
- * Interface to libpng contributed by Gilles Vollant, 1999
- *
- *
- * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
- * a sign error in the post-MMX cleanup code for each pixel_depth resulted
- * in bad pixels at the beginning of some rows of some images, and also
- * (due to out-of-range memory reads and writes) caused heap corruption
- * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e.
- *
- * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
- *
- * [runtime MMX configuration, GRR 20010102]
- *
- * [Copy 6 bytes per pixel, not 4, and use stride of 6, not 4, in the
- *  second loop of interlace processing of 48-bit pixels, GR-P 20070717]
- *
- * [move instances of uAll union into local, except for two constant
- * instances, GR-P 20070805]
- *
- * [revised handling of static constants for efficiency, Steve Snyder 20070821]
- */
-
-#define PNG_INTERNAL
-#include "png.h"
-
-#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && \
-    defined(PNG_USE_PNGVCRD) && defined(PNG_MMX_CODE_SUPPORTED)
-
-#ifdef PNG_USE_LOCAL_ARRAYS
-static PNG_CONST int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
-#endif
-
-static int mmx_supported=2;
-
-int PNGAPI
-png_mmx_support(void)
-{
-  int mmx_supported_local = 0;
-  _asm {
-    push ebx          //CPUID will trash these
-    push ecx
-    push edx
-
-    pushfd            //Save Eflag to stack
-    pop eax           //Get Eflag from stack into eax
-    mov ecx, eax      //Make another copy of Eflag in ecx
-    xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
-    push eax          //Save modified Eflag back to stack
-
-    popfd             //Restored modified value back to Eflag reg
-    pushfd            //Save Eflag to stack
-    pop eax           //Get Eflag from stack
-    push ecx          // save original Eflag to stack
-    popfd             // restore original Eflag
-    xor eax, ecx      //Compare the new Eflag with the original Eflag
-    jz NOT_SUPPORTED  //If the same, CPUID instruction is not supported,
-                      //skip following instructions and jump to
-                      //NOT_SUPPORTED label
-
-    xor eax, eax      //Set eax to zero
-
-    _asm _emit 0x0f   //CPUID instruction  (two bytes opcode)
-    _asm _emit 0xa2
-
-    cmp eax, 1        //make sure eax return non-zero value
-    jl NOT_SUPPORTED  //If eax is zero, mmx not supported
-
-    xor eax, eax      //set eax to zero
-    inc eax           //Now increment eax to 1.  This instruction is
-                      //faster than the instruction "mov eax, 1"
-
-    _asm _emit 0x0f   //CPUID instruction
-    _asm _emit 0xa2
-
-    and edx, 0x00800000  //mask out all bits but mmx bit(24)
-    cmp edx, 0        // 0 = mmx not supported
-    jz  NOT_SUPPORTED // non-zero = Yes, mmx IS supported
-
-    mov  mmx_supported_local, 1  //set return value to 1
-
-NOT_SUPPORTED:
-    mov  eax, mmx_supported_local  //move return value to eax
-    pop edx          //CPUID trashed these
-    pop ecx
-    pop ebx
-  }
-
-  //mmx_supported_local=0; // test code for force don't support MMX
-  //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
-
-  mmx_supported = mmx_supported_local;
-  return mmx_supported_local;
-}
-
-/* Combines the row recently read in with the previous row.
-   This routine takes care of alpha and transparency if requested.
-   This routine also handles the two methods of progressive display
-   of interlaced images, depending on the mask value.
-   The mask value describes which pixels are to be combined with
-   the row.  The pattern always repeats every 8 pixels, so just 8
-   bits are needed.  A one indicates the pixel is to be combined; a
-   zero indicates the pixel is to be skipped.  This is in addition
-   to any alpha or transparency value associated with the pixel.  If
-   you want all pixels to be combined, pass 0xff (255) in mask.  */
-
-/* Use this routine for x86 platform - uses faster MMX routine if machine
-   supports MMX */
-
-void /* PRIVATE */
-png_combine_row(png_structp png_ptr, png_bytep row, int mask)
-{
-   static PNG_CONST int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
-
-   png_debug(1,"in png_combine_row_asm\n");
-
-   if (mmx_supported == 2) {
-#if !defined(PNG_1_0_X)
-       /* this should have happened in png_init_mmx_flags() already */
-       png_warning(png_ptr, "asm_flags may not have been initialized");
-#endif
-       png_mmx_support();
-   }
-
-   if (mask == 0xff)
-   {
-      png_memcpy(row, png_ptr->row_buf + 1,
-       (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
-       png_ptr->width));
-   }
-   /* GRR:  add "else if (mask == 0)" case?
-    *       or does png_combine_row() not even get called in that case? */
-   else
-   {
-      switch (png_ptr->row_info.pixel_depth)
-      {
-         case 24:
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            png_uint_32 len;
-            int unmask, diff;
-
-            static PNG_CONST __int64 bpp24_mask2=0x0101010202020404,  //24bpp
-                                     bpp24_mask1=0x0408080810101020,
-                                     bpp24_mask0=0x2020404040808080;
-
-            srcptr = png_ptr->row_buf + 1;
-            dstptr = row;
-
-            unmask = ~mask;
-            len     = (png_ptr->width)&~7;
-            diff = (png_ptr->width)&7;
-
-#if !defined(PNG_1_0_X)
-            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-                /* && mmx_supported */ )
-#else
-            if (mmx_supported)
-#endif
-            {
-               _asm
-               {
-                  movd       mm7, unmask       //load bit pattern
-                  psubb      mm6,mm6           //zero mm6
-                  punpcklbw  mm7,mm7
-                  punpcklwd  mm7,mm7
-                  punpckldq  mm7,mm7           //fill register with 8 masks
-
-                  movq       mm0,bpp24_mask0
-                  movq       mm1,bpp24_mask1
-                  movq       mm2,bpp24_mask2
-
-                  pand       mm0,mm7
-                  pand       mm1,mm7
-                  pand       mm2,mm7
-
-                  pcmpeqb    mm0,mm6
-                  pcmpeqb    mm1,mm6
-                  pcmpeqb    mm2,mm6
-
-                  mov        ecx,len           //load length of line
-                  mov        esi,srcptr        //load source
-                  mov        ebx,dstptr        //load dest
-                  cmp        ecx,0
-                  jz         mainloop24end
-
-mainloop24:
-                  movq       mm4,[esi]
-                  pand       mm4,mm0
-                  movq       mm6,mm0
-                  movq       mm7,[ebx]
-                  pandn      mm6,mm7
-                  por        mm4,mm6
-                  movq       [ebx],mm4
-
-
-                  movq       mm5,[esi+8]
-                  pand       mm5,mm1
-                  movq       mm7,mm1
-                  movq       mm6,[ebx+8]
-                  pandn      mm7,mm6
-                  por        mm5,mm7
-                  movq       [ebx+8],mm5
-
-                  movq       mm6,[esi+16]
-                  pand       mm6,mm2
-                  movq       mm4,mm2
-                  movq       mm7,[ebx+16]
-                  pandn      mm4,mm7
-                  por        mm6,mm4
-                  movq       [ebx+16],mm6
-
-                  add        esi,24            //inc by 24 bytes processed
-                  add        ebx,24
-                  sub        ecx,8             //dec by 8 pixels processed
-
-                  ja         mainloop24
-
-mainloop24end:
-                  mov        ecx,diff
-                  cmp        ecx,0
-                  jz         end24
-
-                  mov        edx,mask
-                  sal        edx,24            //make low byte the high byte
-secondloop24:
-                  sal        edx,1             //move high bit to CF
-                  jnc        skip24            //if CF = 0
-                  mov        ax,[esi]
-                  mov        [ebx],ax
-                  xor        eax,eax
-                  mov        al,[esi+2]
-                  mov        [ebx+2],al
-skip24:
-                  add        esi,3
-                  add        ebx,3
-
-                  dec        ecx
-                  jnz        secondloop24
-
-end24:
-                  emms
-               }
-            }
-            else /* mmx not supported - use modified C routine */
-            {
-               register unsigned int incr1, initial_val, final_val;
-               png_size_t pixel_bytes;
-               png_uint_32 i;
-               register int disp = png_pass_inc[png_ptr->pass];
-
-               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
-                  pixel_bytes;
-               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
-               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-               final_val = png_ptr->width*pixel_bytes;
-               incr1 = (disp)*pixel_bytes;
-               for (i = initial_val; i < final_val; i += incr1)
-               {
-                  png_memcpy(dstptr, srcptr, pixel_bytes);
-                  srcptr += incr1;
-                  dstptr += incr1;
-               }
-            } /* end of else */
-
-            break;
-         }       // end 24 bpp
-
-         case 32:
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            png_uint_32 len;
-            int unmask, diff;
-
-            static PNG_CONST __int64 bpp32_mask3=0x0101010102020202,  //32bpp
-                                     bpp32_mask2=0x0404040408080808,
-                                     bpp32_mask1=0x1010101020202020,
-                                     bpp32_mask0=0x4040404080808080;
-
-            srcptr = png_ptr->row_buf + 1;
-            dstptr = row;
-
-            unmask = ~mask;
-            len     = (png_ptr->width)&~7;
-            diff = (png_ptr->width)&7;
-
-#if !defined(PNG_1_0_X)
-            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-                /* && mmx_supported */ )
-#else
-            if (mmx_supported)
-#endif
-            {
-               _asm
-               {
-                  movd       mm7, unmask       //load bit pattern
-                  psubb      mm6,mm6           //zero mm6
-                  punpcklbw  mm7,mm7
-                  punpcklwd  mm7,mm7
-                  punpckldq  mm7,mm7           //fill register with 8 masks
-
-                  movq       mm0,bpp32_mask0
-                  movq       mm1,bpp32_mask1
-                  movq       mm2,bpp32_mask2
-                  movq       mm3,bpp32_mask3
-
-                  pand       mm0,mm7
-                  pand       mm1,mm7
-                  pand       mm2,mm7
-                  pand       mm3,mm7
-
-                  pcmpeqb    mm0,mm6
-                  pcmpeqb    mm1,mm6
-                  pcmpeqb    mm2,mm6
-                  pcmpeqb    mm3,mm6
-
-                  mov        ecx,len           //load length of line
-                  mov        esi,srcptr        //load source
-                  mov        ebx,dstptr        //load dest
-
-                  cmp        ecx,0             //lcr
-                  jz         mainloop32end
-
-mainloop32:
-                  movq       mm4,[esi]
-                  pand       mm4,mm0
-                  movq       mm6,mm0
-                  movq       mm7,[ebx]
-                  pandn      mm6,mm7
-                  por        mm4,mm6
-                  movq       [ebx],mm4
-
-                  movq       mm5,[esi+8]
-                  pand       mm5,mm1
-                  movq       mm7,mm1
-                  movq       mm6,[ebx+8]
-                  pandn      mm7,mm6
-                  por        mm5,mm7
-                  movq       [ebx+8],mm5
-
-                  movq       mm6,[esi+16]
-                  pand       mm6,mm2
-                  movq       mm4,mm2
-                  movq       mm7,[ebx+16]
-                  pandn      mm4,mm7
-                  por        mm6,mm4
-                  movq       [ebx+16],mm6
-
-                  movq       mm7,[esi+24]
-                  pand       mm7,mm3
-                  movq       mm5,mm3
-                  movq       mm4,[ebx+24]
-                  pandn      mm5,mm4
-                  por        mm7,mm5
-                  movq       [ebx+24],mm7
-
-                  add        esi,32            //inc by 32 bytes processed
-                  add        ebx,32
-                  sub        ecx,8             //dec by 8 pixels processed
-
-                  ja         mainloop32
-
-mainloop32end:
-                  mov        ecx,diff
-                  cmp        ecx,0
-                  jz         end32
-
-                  mov        edx,mask
-                  sal        edx,24            //make low byte the high byte
-secondloop32:
-                  sal        edx,1             //move high bit to CF
-                  jnc        skip32            //if CF = 0
-                  mov        eax,[esi]
-                  mov        [ebx],eax
-skip32:
-                  add        esi,4
-                  add        ebx,4
-
-                  dec        ecx
-                  jnz        secondloop32
-
-end32:
-                  emms
-               }
-            }
-            else /* mmx _not supported - Use modified C routine */
-            {
-               register unsigned int incr1, initial_val, final_val;
-               png_size_t pixel_bytes;
-               png_uint_32 i;
-               register int disp = png_pass_inc[png_ptr->pass];
-
-               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
-                  pixel_bytes;
-               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
-               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-               final_val = png_ptr->width*pixel_bytes;
-               incr1 = (disp)*pixel_bytes;
-               for (i = initial_val; i < final_val; i += incr1)
-               {
-                  png_memcpy(dstptr, srcptr, pixel_bytes);
-                  srcptr += incr1;
-                  dstptr += incr1;
-               }
-            } /* end of else */
-
-            break;
-         }       // end 32 bpp
-
-         case 8:
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            png_uint_32 len;
-            int m;
-            int diff, unmask;
-
-            static PNG_CONST __int64 bpp08_mask0=0x0102040810204080;
-
-#if !defined(PNG_1_0_X)
-            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-                /* && mmx_supported */ )
-#else
-            if (mmx_supported)
-#endif
-            {
-               srcptr = png_ptr->row_buf + 1;
-               dstptr = row;
-               m = 0x80;
-               unmask = ~mask;
-               len  = png_ptr->width &~7;  //reduce to multiple of 8
-               diff = png_ptr->width & 7;  //amount lost
-
-               _asm
-               {
-                  movd       mm7, unmask   //load bit pattern
-                  psubb      mm6,mm6       //zero mm6
-                  punpcklbw  mm7,mm7
-                  punpcklwd  mm7,mm7
-                  punpckldq  mm7,mm7       //fill register with 8 masks
-
-                  movq       mm0,bpp08_mask0
-
-                  pand       mm0,mm7       //nonzero if keep byte
-                  pcmpeqb    mm0,mm6       //zeros->1s, v versa
-
-                  mov        ecx,len       //load length of line (pixels)
-                  mov        esi,srcptr    //load source
-                  mov        ebx,dstptr    //load dest
-                  cmp        ecx,0         //lcr
-                  je         mainloop8end
-
-mainloop8:
-                  movq       mm4,[esi]
-                  pand       mm4,mm0
-                  movq       mm6,mm0
-                  pandn      mm6,[ebx]
-                  por        mm4,mm6
-                  movq       [ebx],mm4
-
-                  add        esi,8         //inc by 8 bytes processed
-                  add        ebx,8
-                  sub        ecx,8         //dec by 8 pixels processed
-
-                  ja         mainloop8
-mainloop8end:
-
-                  mov        ecx,diff
-                  cmp        ecx,0
-                  jz         end8
-
-                  mov        edx,mask
-                  sal        edx,24        //make low byte the high byte
-
-secondloop8:
-                  sal        edx,1         //move high bit to CF
-                  jnc        skip8         //if CF = 0
-                  mov        al,[esi]
-                  mov        [ebx],al
-skip8:
-                  inc        esi
-                  inc        ebx
-
-                  dec        ecx
-                  jnz        secondloop8
-end8:
-                  emms
-               }
-            }
-            else /* mmx not supported - use modified C routine */
-            {
-               register unsigned int incr1, initial_val, final_val;
-               png_size_t pixel_bytes;
-               png_uint_32 i;
-               register int disp = png_pass_inc[png_ptr->pass];
-
-               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
-                  pixel_bytes;
-               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
-               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-               final_val = png_ptr->width*pixel_bytes;
-               incr1 = (disp)*pixel_bytes;
-               for (i = initial_val; i < final_val; i += incr1)
-               {
-                  png_memcpy(dstptr, srcptr, pixel_bytes);
-                  srcptr += incr1;
-                  dstptr += incr1;
-               }
-            } /* end of else */
-
-            break;
-         }       // end 8 bpp
-
-         case 1:
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_inc, s_start, s_end;
-            int m;
-            int shift;
-            png_uint_32 i;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-                s_start = 0;
-                s_end = 7;
-                s_inc = 1;
-            }
-            else
-#endif
-            {
-                s_start = 7;
-                s_end = 0;
-                s_inc = -1;
-            }
-
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  int value;
-
-                  value = (*sp >> shift) & 0x1;
-                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }
-
-         case 2:
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_start, s_end, s_inc;
-            int m;
-            int shift;
-            png_uint_32 i;
-            int value;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-               s_start = 0;
-               s_end = 6;
-               s_inc = 2;
-            }
-            else
-#endif
-            {
-               s_start = 6;
-               s_end = 0;
-               s_inc = -2;
-            }
-
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  value = (*sp >> shift) & 0x3;
-                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }
-
-         case 4:
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_start, s_end, s_inc;
-            int m;
-            int shift;
-            png_uint_32 i;
-            int value;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-               s_start = 0;
-               s_end = 4;
-               s_inc = 4;
-            }
-            else
-#endif
-            {
-               s_start = 4;
-               s_end = 0;
-               s_inc = -4;
-            }
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  value = (*sp >> shift) & 0xf;
-                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }
-
-         case 16:
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            png_uint_32 len;
-            int unmask, diff;
-            static PNG_CONST __int64 bpp16_mask1=0x0101020204040808,
-                                     bpp16_mask0=0x1010202040408080;
-
-#if !defined(PNG_1_0_X)
-            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-                /* && mmx_supported */ )
-#else
-            if (mmx_supported)
-#endif
-            {
-               srcptr = png_ptr->row_buf + 1;
-               dstptr = row;
-
-               unmask = ~mask;
-               len     = (png_ptr->width)&~7;
-               diff = (png_ptr->width)&7;
-               _asm
-               {
-                  movd       mm7, unmask       //load bit pattern
-                  psubb      mm6,mm6           //zero mm6
-                  punpcklbw  mm7,mm7
-                  punpcklwd  mm7,mm7
-                  punpckldq  mm7,mm7           //fill register with 8 masks
-
-                  movq       mm0,bpp16_mask0
-                  movq       mm1,bpp16_mask1
-
-                  pand       mm0,mm7
-                  pand       mm1,mm7
-
-                  pcmpeqb    mm0,mm6
-                  pcmpeqb    mm1,mm6
-
-                  mov        ecx,len           //load length of line
-                  mov        esi,srcptr        //load source
-                  mov        ebx,dstptr        //load dest
-                  cmp        ecx,0             //lcr
-                  jz         mainloop16end
-
-mainloop16:
-                  movq       mm4,[esi]
-                  pand       mm4,mm0
-                  movq       mm6,mm0
-                  movq       mm7,[ebx]
-                  pandn      mm6,mm7
-                  por        mm4,mm6
-                  movq       [ebx],mm4
-
-                  movq       mm5,[esi+8]
-                  pand       mm5,mm1
-                  movq       mm7,mm1
-                  movq       mm6,[ebx+8]
-                  pandn      mm7,mm6
-                  por        mm5,mm7
-                  movq       [ebx+8],mm5
-
-                  add        esi,16            //inc by 16 bytes processed
-                  add        ebx,16
-                  sub        ecx,8             //dec by 8 pixels processed
-
-                  ja         mainloop16
-
-mainloop16end:
-                  mov        ecx,diff
-                  cmp        ecx,0
-                  jz         end16
-
-                  mov        edx,mask
-                  sal        edx,24            //make low byte the high byte
-secondloop16:
-                  sal        edx,1             //move high bit to CF
-                  jnc        skip16            //if CF = 0
-                  mov        ax,[esi]
-                  mov        [ebx],ax
-skip16:
-                  add        esi,2
-                  add        ebx,2
-
-                  dec        ecx
-                  jnz        secondloop16
-end16:
-                  emms
-               }
-            }
-            else /* mmx not supported - use modified C routine */
-            {
-               register unsigned int incr1, initial_val, final_val;
-               png_size_t pixel_bytes;
-               png_uint_32 i;
-               register int disp = png_pass_inc[png_ptr->pass];
-
-               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
-                  pixel_bytes;
-               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
-               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-               final_val = png_ptr->width*pixel_bytes;
-               incr1 = (disp)*pixel_bytes;
-               for (i = initial_val; i < final_val; i += incr1)
-               {
-                  png_memcpy(dstptr, srcptr, pixel_bytes);
-                  srcptr += incr1;
-                  dstptr += incr1;
-               }
-            } /* end of else */
-
-            break;
-         }       // end 16 bpp
-
-         case 48:
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            png_uint_32 len;
-            int unmask, diff;
-
-            static PNG_CONST __int64 bpp48_mask5=0x0101010101010202,
-                                     bpp48_mask4=0x0202020204040404,
-                                     bpp48_mask3=0x0404080808080808,
-                                     bpp48_mask2=0x1010101010102020,
-                                     bpp48_mask1=0x2020202040404040,
-                                     bpp48_mask0=0x4040808080808080;
-
-#if !defined(PNG_1_0_X)
-            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-                /* && mmx_supported */ )
-#else
-            if (mmx_supported)
-#endif
-            {
-               srcptr = png_ptr->row_buf + 1;
-               dstptr = row;
-
-               unmask = ~mask;
-               len     = (png_ptr->width)&~7;
-               diff = (png_ptr->width)&7;
-               _asm
-               {
-                  movd       mm7, unmask       //load bit pattern
-                  psubb      mm6,mm6           //zero mm6
-                  punpcklbw  mm7,mm7
-                  punpcklwd  mm7,mm7
-                  punpckldq  mm7,mm7           //fill register with 8 masks
-
-                  movq       mm0,bpp48_mask0
-                  movq       mm1,bpp48_mask1
-                  movq       mm2,bpp48_mask2
-                  movq       mm3,bpp48_mask3
-                  movq       mm4,bpp48_mask4
-                  movq       mm5,bpp48_mask5
-
-                  pand       mm0,mm7
-                  pand       mm1,mm7
-                  pand       mm2,mm7
-                  pand       mm3,mm7
-                  pand       mm4,mm7
-                  pand       mm5,mm7
-
-                  pcmpeqb    mm0,mm6
-                  pcmpeqb    mm1,mm6
-                  pcmpeqb    mm2,mm6
-                  pcmpeqb    mm3,mm6
-                  pcmpeqb    mm4,mm6
-                  pcmpeqb    mm5,mm6
-
-                  mov        ecx,len           //load length of line
-                  mov        esi,srcptr        //load source
-                  mov        ebx,dstptr        //load dest
-
-                  cmp        ecx,0
-                  jz         mainloop48end
-
-mainloop48:
-                  movq       mm7,[esi]
-                  pand       mm7,mm0
-                  movq       mm6,mm0
-                  pandn      mm6,[ebx]
-                  por        mm7,mm6
-                  movq       [ebx],mm7
-
-                  movq       mm6,[esi+8]
-                  pand       mm6,mm1
-                  movq       mm7,mm1
-                  pandn      mm7,[ebx+8]
-                  por        mm6,mm7
-                  movq       [ebx+8],mm6
-
-                  movq       mm6,[esi+16]
-                  pand       mm6,mm2
-                  movq       mm7,mm2
-                  pandn      mm7,[ebx+16]
-                  por        mm6,mm7
-                  movq       [ebx+16],mm6
-
-                  movq       mm7,[esi+24]
-                  pand       mm7,mm3
-                  movq       mm6,mm3
-                  pandn      mm6,[ebx+24]
-                  por        mm7,mm6
-                  movq       [ebx+24],mm7
-
-                  movq       mm6,[esi+32]
-                  pand       mm6,mm4
-                  movq       mm7,mm4
-                  pandn      mm7,[ebx+32]
-                  por        mm6,mm7
-                  movq       [ebx+32],mm6
-
-                  movq       mm7,[esi+40]
-                  pand       mm7,mm5
-                  movq       mm6,mm5
-                  pandn      mm6,[ebx+40]
-                  por        mm7,mm6
-                  movq       [ebx+40],mm7
-
-                  add        esi,48            //inc by 32 bytes processed
-                  add        ebx,48
-                  sub        ecx,8             //dec by 8 pixels processed
-
-                  ja         mainloop48
-mainloop48end:
-
-                  mov        ecx,diff
-                  cmp        ecx,0
-                  jz         end48
-
-                  mov        edx,mask
-                  sal        edx,24            //make low byte the high byte
-
-secondloop48:
-                  sal        edx,1             //move high bit to CF
-                  jnc        skip48            //if CF = 0
-                  mov        eax,[esi]
-                  mov        [ebx],eax
-                  mov        ax,[esi+4]       // These 2 lines added 20070717
-                  mov        [ebx+4],ax       // Glenn R-P
-skip48:
-                  add        esi,6            // Changed 4 to 6 on these 2
-                  add        ebx,6            // lines.  Glenn R-P 20070717
-
-                  dec        ecx
-                  jnz        secondloop48
-
-end48:
-                  emms
-               }
-            }
-            else /* mmx _not supported - Use modified C routine */
-            {
-               register unsigned int incr1, initial_val, final_val;
-               png_size_t pixel_bytes;
-               png_uint_32 i;
-               register int disp = png_pass_inc[png_ptr->pass];
-
-               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
-                  pixel_bytes;
-               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
-               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-               final_val = png_ptr->width*pixel_bytes;
-               incr1 = (disp)*pixel_bytes;
-               for (i = initial_val; i < final_val; i += incr1)
-               {
-                  png_memcpy(dstptr, srcptr, pixel_bytes);
-                  srcptr += incr1;
-                  dstptr += incr1;
-               }
-            } /* end of else */
-
-            break;
-         }       // end 48 bpp
-
-         default:
-         {
-            png_bytep sptr;
-            png_bytep dp;
-            png_size_t pixel_bytes;
-            unsigned int i;
-            register int disp = png_pass_inc[png_ptr->pass];  // get the offset
-            register unsigned int incr1, initial_val, final_val;
-
-            pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-            sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
-               pixel_bytes;
-            dp = row + offset_table[png_ptr->pass]*pixel_bytes;
-            initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-            final_val = png_ptr->width*pixel_bytes;
-            incr1 = (disp)*pixel_bytes;
-            for (i = initial_val; i < final_val; i += incr1)
-            {
-               png_memcpy(dp, sptr, pixel_bytes);
-               sptr += incr1;
-               dp += incr1;
-            }
-            break;
-         }
-      } /* end switch (png_ptr->row_info.pixel_depth) */
-   } /* end if (non-trivial mask) */
-
-} /* end png_combine_row() */
-
-
-#if defined(PNG_READ_INTERLACING_SUPPORTED)
-
-void /* PRIVATE */
-png_do_read_interlace(png_structp png_ptr)
-{
-   png_row_infop row_info = &(png_ptr->row_info);
-   png_bytep row = png_ptr->row_buf + 1;
-   int pass = png_ptr->pass;
-   png_uint_32 transformations = png_ptr->transformations;
-
-   png_debug(1,"in png_do_read_interlace\n");
-
-   if (mmx_supported == 2) {
-#if !defined(PNG_1_0_X)
-       /* this should have happened in png_init_mmx_flags() already */
-       png_warning(png_ptr, "asm_flags may not have been initialized");
-#endif
-       png_mmx_support();
-   }
-
-   if (row != NULL && row_info != NULL)
-   {
-      png_uint_32 final_width;
-
-      final_width = row_info->width * png_pass_inc[pass];
-
-      switch (row_info->pixel_depth)
-      {
-         case 1:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_byte v;
-            png_uint_32 i;
-            int j;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 3);
-            dp = row + (png_size_t)((final_width - 1) >> 3);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (int)((row_info->width + 7) & 7);
-               dshift = (int)((final_width + 7) & 7);
-               s_start = 7;
-               s_end = 0;
-               s_inc = -1;
-            }
-            else
-#endif
-            {
-               sshift = 7 - (int)((row_info->width + 7) & 7);
-               dshift = 7 - (int)((final_width + 7) & 7);
-               s_start = 0;
-               s_end = 7;
-               s_inc = 1;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               v = (png_byte)((*sp >> sshift) & 0x1);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-         case 2:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_uint_32 i;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 2);
-            dp = row + (png_size_t)((final_width - 1) >> 2);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
-               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
-               s_start = 6;
-               s_end = 0;
-               s_inc = -2;
-            }
-            else
-#endif
-            {
-               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
-               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
-               s_start = 0;
-               s_end = 6;
-               s_inc = 2;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               png_byte v;
-               int j;
-
-               v = (png_byte)((*sp >> sshift) & 0x3);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-         case 4:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_uint_32 i;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 1);
-            dp = row + (png_size_t)((final_width - 1) >> 1);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
-               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
-               s_start = 4;
-               s_end = 0;
-               s_inc = -4;
-            }
-            else
-#endif
-            {
-               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
-               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
-               s_start = 0;
-               s_end = 4;
-               s_inc = 4;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               png_byte v;
-               int j;
-
-               v = (png_byte)((*sp >> sshift) & 0xf);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-         default:         // This is the place where the routine is modified
-         {
-            static PNG_CONST __int64 const4 = 0x0000000000FFFFFF;
-            // static PNG_CONST __int64 const5 = 0x000000FFFFFF0000;  // unused...
-            static PNG_CONST __int64 const6 = 0x00000000000000FF;
-            png_bytep sptr, dp;
-            png_uint_32 i;
-            png_size_t pixel_bytes;
-            int width = row_info->width;
-
-            pixel_bytes = (row_info->pixel_depth >> 3);
-
-            sptr = row + (width - 1) * pixel_bytes;
-            dp = row + (final_width - 1) * pixel_bytes;
-            // New code by Nirav Chhatrapati - Intel Corporation
-            // sign fix by GRR
-            // NOTE:  there is NO MMX code for 48-bit and 64-bit images
-
-            // use MMX routine if machine supports it
-#if !defined(PNG_1_0_X)
-            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
-                /* && mmx_supported */ )
-#else
-            if (mmx_supported)
-#endif
-            {
-               if (pixel_bytes == 3)
-               {
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) - 8;
-                     if (width_mmx < 0)
-                         width_mmx = 0;
-                     width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub esi, 3
-                           sub edi, 9
-loop_pass4:
-                           movq mm0, [esi]     ; X X v2 v1 v0 v5 v4 v3
-                           movq mm7, mm0       ; X X v2 v1 v0 v5 v4 v3
-                           movq mm6, mm0       ; X X v2 v1 v0 v5 v4 v3
-                           psllq mm0, 24       ; v1 v0 v5 v4 v3 0 0 0
-                           pand mm7, const4    ; 0 0 0 0 0 v5 v4 v3
-                           psrlq mm6, 24       ; 0 0 0 X X v2 v1 v0
-                           por mm0, mm7        ; v1 v0 v5 v4 v3 v5 v4 v3
-                           movq mm5, mm6       ; 0 0 0 X X v2 v1 v0
-                           psllq mm6, 8        ; 0 0 X X v2 v1 v0 0
-                           movq [edi], mm0     ; move quad to memory
-                           psrlq mm5, 16       ; 0 0 0 0 0 X X v2
-                           pand mm5, const6    ; 0 0 0 0 0 0 0 v2
-                           por mm6, mm5        ; 0 0 X X v2 v1 v0 v2
-                           movd [edi+8], mm6   ; move double to memory
-                           sub esi, 6
-                           sub edi, 12
-                           sub ecx, 2
-                           jnz loop_pass4
-                           EMMS
-                        }
-                     }
-
-                     sptr -= width_mmx*3;
-                     dp -= width_mmx*6;
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-
-                        png_memcpy(v, sptr, 3);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           png_memcpy(dp, v, 3);
-                           dp -= 3;
-                        }
-                        sptr -= 3;
-                     }
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     _asm
-                     {
-                        mov esi, sptr
-                        mov edi, dp
-                        mov ecx, width
-                        sub edi, 9   // (png_pass_inc[pass] - 1)*pixel_bytes
-loop_pass2:
-                        movd mm0, [esi]     ; X X X X X v2 v1 v0
-                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
-                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
-                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
-                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
-                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
-                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
-                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
-                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
-                        movq [edi+4], mm0   ; move to memory
-                        psrlq mm0, 16       ; 0 0 v2 v1 v0 v2 v1 v0
-                        movd [edi], mm0     ; move to memory
-                        sub esi, 3
-                        sub edi, 12
-                        dec ecx
-                        jnz loop_pass2
-                        EMMS
-                     }
-                  }
-                  else if (width) /* && ((pass == 0) || (pass == 1))) */
-                  {
-                     _asm
-                     {
-                        mov esi, sptr
-                        mov edi, dp
-                        mov ecx, width
-                        sub edi, 21   // (png_pass_inc[pass] - 1)*pixel_bytes
-loop_pass0:
-                        movd mm0, [esi]     ; X X X X X v2 v1 v0
-                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
-                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
-                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
-                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
-                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
-                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
-                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
-                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
-                        movq mm3, mm0       ; v2 v1 v0 v2 v1 v0 v2 v1
-                        psllq mm0, 16       ; v0 v2 v1 v0 v2 v1 0 0
-                        movq mm4, mm3       ; v2 v1 v0 v2 v1 v0 v2 v1
-                        punpckhdq mm3, mm0  ; v0 v2 v1 v0 v2 v1 v0 v2
-                        movq [edi+16] , mm4
-                        psrlq mm0, 32       ; 0 0 0 0 v0 v2 v1 v0
-                        movq [edi+8] , mm3
-                        punpckldq mm0, mm4  ; v1 v0 v2 v1 v0 v2 v1 v0
-                        sub esi, 3
-                        movq [edi], mm0
-                        sub edi, 24
-                        //sub esi, 3
-                        dec ecx
-                        jnz loop_pass0
-                        EMMS
-                     }
-                  }
-               } /* end of pixel_bytes == 3 */
-
-               else if (pixel_bytes == 1)
-               {
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     int width_mmx = ((width >> 3) << 3);
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub edi, 15
-                           sub esi, 7
-loop1_pass4:
-                           movq mm0, [esi]     ; v0 v1 v2 v3 v4 v5 v6 v7
-                           movq mm1, mm0       ; v0 v1 v2 v3 v4 v5 v6 v7
-                           punpcklbw mm0, mm0  ; v4 v4 v5 v5 v6 v6 v7 v7
-                           //movq mm1, mm0     ; v0 v0 v1 v1 v2 v2 v3 v3
-                           punpckhbw mm1, mm1  ;v0 v0 v1 v1 v2 v2 v3 v3
-                           movq [edi+8], mm1   ; move to memory v0 v1 v2 and v3
-                           sub esi, 8
-                           movq [edi], mm0     ; move to memory v4 v5 v6 and v7
-                           //sub esi, 4
-                           sub edi, 16
-                           sub ecx, 8
-                           jnz loop1_pass4
-                           EMMS
-                        }
-                     }
-
-                     sptr -= width_mmx;
-                     dp -= width_mmx*2;
-                     for (i = width; i; i--)
-                     {
-                        int j;
-
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           *dp-- = *sptr;
-                        }
-                        sptr --;
-                     }
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     int width_mmx = ((width >> 2) << 2);
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub edi, 15
-                           sub esi, 3
-loop1_pass2:
-                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
-                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
-                           movq mm1, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
-                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
-                           punpckhwd mm1, mm1  ; v0 v0 v0 v0 v1 v1 v1 v1
-                           movq [edi], mm0     ; move to memory v2 and v3
-                           sub esi, 4
-                           movq [edi+8], mm1   ; move to memory v1     and v0
-                           sub edi, 16
-                           sub ecx, 4
-                           jnz loop1_pass2
-                           EMMS
-                        }
-                     }
-
-                     sptr -= width_mmx;
-                     dp -= width_mmx*4;
-                     for (i = width; i; i--)
-                     {
-                        int j;
-
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           *dp-- = *sptr;
-                        }
-                        sptr --;
-                     }
-                  }
-                  else if (width) /* && ((pass == 0) || (pass == 1))) */
-                  {
-                     int width_mmx = ((width >> 2) << 2);
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub edi, 31
-                           sub esi, 3
-loop1_pass0:
-                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
-                           movq mm1, mm0       ; X X X X v0 v1 v2 v3
-                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
-                           movq mm2, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
-                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
-                           movq mm3, mm0       ; v2 v2 v2 v2 v3 v3 v3 v3
-                           punpckldq mm0, mm0  ; v3 v3 v3 v3 v3 v3 v3 v3
-                           punpckhdq mm3, mm3  ; v2 v2 v2 v2 v2 v2 v2 v2
-                           movq [edi], mm0     ; move to memory v3
-                           punpckhwd mm2, mm2  ; v0 v0 v0 v0 v1 v1 v1 v1
-                           movq [edi+8], mm3   ; move to memory v2
-                           movq mm4, mm2       ; v0 v0 v0 v0 v1 v1 v1 v1
-                           punpckldq mm2, mm2  ; v1 v1 v1 v1 v1 v1 v1 v1
-                           punpckhdq mm4, mm4  ; v0 v0 v0 v0 v0 v0 v0 v0
-                           movq [edi+16], mm2  ; move to memory v1
-                           movq [edi+24], mm4  ; move to memory v0
-                           sub esi, 4
-                           sub edi, 32
-                           sub ecx, 4
-                           jnz loop1_pass0
-                           EMMS
-                        }
-                     }
-
-                     sptr -= width_mmx;
-                     dp -= width_mmx*8;
-                     for (i = width; i; i--)
-                     {
-                        int j;
-
-                       /* I simplified this part in version 1.0.4e
-                        * here and in several other instances where
-                        * pixel_bytes == 1  -- GR-P
-                        *
-                        * Original code:
-                        *
-                        * png_byte v[8];
-                        * png_memcpy(v, sptr, pixel_bytes);
-                        * for (j = 0; j < png_pass_inc[pass]; j++)
-                        * {
-                        *    png_memcpy(dp, v, pixel_bytes);
-                        *    dp -= pixel_bytes;
-                        * }
-                        * sptr -= pixel_bytes;
-                        *
-                        * Replacement code is in the next three lines:
-                        */
-
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                           *dp-- = *sptr;
-                        sptr--;
-                     }
-                  }
-               } /* end of pixel_bytes == 1 */
-
-               else if (pixel_bytes == 2)
-               {
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) ;
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub esi, 2
-                           sub edi, 6
-loop2_pass4:
-                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
-                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
-                           sub esi, 4
-                           movq [edi], mm0
-                           sub edi, 8
-                           sub ecx, 2
-                           jnz loop2_pass4
-                           EMMS
-                        }
-                     }
-
-                     sptr -= (width_mmx*2 - 2);            // sign fixed
-                     dp -= (width_mmx*4 - 2);            // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 2;
-                        png_memcpy(v, sptr, 2);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 2;
-                           png_memcpy(dp, v, 2);
-                        }
-                     }
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) ;
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub esi, 2
-                           sub edi, 14
-loop2_pass2:
-                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
-                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
-                           movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
-                           punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
-                           punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
-                           movq [edi], mm0
-                           sub esi, 4
-                           movq [edi + 8], mm1
-                           //sub esi, 4
-                           sub edi, 16
-                           sub ecx, 2
-                           jnz loop2_pass2
-                           EMMS
-                        }
-                     }
-
-                     sptr -= (width_mmx*2 - 2);            // sign fixed
-                     dp -= (width_mmx*8 - 2);            // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 2;
-                        png_memcpy(v, sptr, 2);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 2;
-                           png_memcpy(dp, v, 2);
-                        }
-                     }
-                  }
-                  else if (width) /* && ((pass == 0) || (pass == 1))) */
-                  {
-                     int width_mmx = ((width >> 1) << 1);
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub esi, 2
-                           sub edi, 30
-loop2_pass0:
-                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
-                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
-                           movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
-                           punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
-                           punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
-                           movq [edi], mm0
-                           movq [edi + 8], mm0
-                           movq [edi + 16], mm1
-                           movq [edi + 24], mm1
-                           sub esi, 4
-                           sub edi, 32
-                           sub ecx, 2
-                           jnz loop2_pass0
-                           EMMS
-                        }
-                     }
-
-                     sptr -= (width_mmx*2 - 2);            // sign fixed
-                     dp -= (width_mmx*16 - 2);            // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 2;
-                        png_memcpy(v, sptr, 2);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 2;
-                           png_memcpy(dp, v, 2);
-                        }
-                     }
-                  }
-               } /* end of pixel_bytes == 2 */
-
-               else if (pixel_bytes == 4)
-               {
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) ;
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub esi, 4
-                           sub edi, 12
-loop4_pass4:
-                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
-                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
-                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
-                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
-                           movq [edi], mm0
-                           sub esi, 8
-                           movq [edi + 8], mm1
-                           sub edi, 16
-                           sub ecx, 2
-                           jnz loop4_pass4
-                           EMMS
-                        }
-                     }
-
-                     sptr -= (width_mmx*4 - 4);          // sign fixed
-                     dp -= (width_mmx*8 - 4);            // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 4;
-                        png_memcpy(v, sptr, 4);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 4;
-                           png_memcpy(dp, v, 4);
-                        }
-                     }
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) ;
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub esi, 4
-                           sub edi, 28
-loop4_pass2:
-                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
-                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
-                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
-                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
-                           movq [edi], mm0
-                           movq [edi + 8], mm0
-                           movq [edi+16], mm1
-                           movq [edi + 24], mm1
-                           sub esi, 8
-                           sub edi, 32
-                           sub ecx, 2
-                           jnz loop4_pass2
-                           EMMS
-                        }
-                     }
-
-                     sptr -= (width_mmx*4 - 4);            // sign fixed
-                     dp -= (width_mmx*16 - 4);            // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 4;
-                        png_memcpy(v, sptr, 4);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 4;
-                           png_memcpy(dp, v, 4);
-                        }
-                     }
-                  }
-                  else if (width) /* && ((pass == 0) || (pass == 1))) */
-                  {
-                     int width_mmx = ((width >> 1) << 1) ;
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub esi, 4
-                           sub edi, 60
-loop4_pass0:
-                           movq mm0, [esi]        ; v3 v2 v1 v0 v7 v6 v5 v4
-                           movq mm1, mm0          ; v3 v2 v1 v0 v7 v6 v5 v4
-                           punpckldq mm0, mm0     ; v7 v6 v5 v4 v7 v6 v5 v4
-                           punpckhdq mm1, mm1     ; v3 v2 v1 v0 v3 v2 v1 v0
-                           movq [edi], mm0
-                           movq [edi + 8], mm0
-                           movq [edi + 16], mm0
-                           movq [edi + 24], mm0
-                           movq [edi+32], mm1
-                           movq [edi + 40], mm1
-                           movq [edi+ 48], mm1
-                           sub esi, 8
-                           movq [edi + 56], mm1
-                           sub edi, 64
-                           sub ecx, 2
-                           jnz loop4_pass0
-                           EMMS
-                        }
-                     }
-
-                     sptr -= (width_mmx*4 - 4);            // sign fixed
-                     dp -= (width_mmx*32 - 4);            // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 4;
-                        png_memcpy(v, sptr, 4);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 4;
-                           png_memcpy(dp, v, 4);
-                        }
-                     }
-                  }
-
-               } /* end of pixel_bytes == 4 */
-
-               else if (pixel_bytes == 6)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, 6);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, 6);
-                        dp -= 6;
-                     }
-                     sptr -= 6;
-                  }
-               } /* end of pixel_bytes == 6 */
-
-               else
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, pixel_bytes);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, pixel_bytes);
-                        dp -= pixel_bytes;
-                     }
-                     sptr-= pixel_bytes;
-                  }
-               }
-            } /* end of mmx_supported */
-
-            else /* MMX not supported:  use modified C code - takes advantage
-                  * of inlining of memcpy for a constant */
-            {
-               if (pixel_bytes == 1)
-               {
-                  for (i = width; i; i--)
-                  {
-                     int j;
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                        *dp-- = *sptr;
-                     sptr--;
-                  }
-               }
-               else if (pixel_bytes == 3)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, pixel_bytes);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, pixel_bytes);
-                        dp -= pixel_bytes;
-                     }
-                     sptr -= pixel_bytes;
-                  }
-               }
-               else if (pixel_bytes == 2)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, pixel_bytes);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, pixel_bytes);
-                        dp -= pixel_bytes;
-                     }
-                     sptr -= pixel_bytes;
-                  }
-               }
-               else if (pixel_bytes == 4)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, pixel_bytes);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, pixel_bytes);
-                        dp -= pixel_bytes;
-                     }
-                     sptr -= pixel_bytes;
-                  }
-               }
-               else if (pixel_bytes == 6)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, pixel_bytes);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, pixel_bytes);
-                        dp -= pixel_bytes;
-                     }
-                     sptr -= pixel_bytes;
-                  }
-               }
-               else
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, pixel_bytes);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, pixel_bytes);
-                        dp -= pixel_bytes;
-                     }
-                     sptr -= pixel_bytes;
-                  }
-               }
-
-            } /* end of MMX not supported */
-            break;
-         }
-      } /* end switch (row_info->pixel_depth) */
-
-      row_info->width = final_width;
-
-      row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
-   }
-
-}
-
-#endif /* PNG_READ_INTERLACING_SUPPORTED */
-
-
-// These global constants are declared
-// here to ensure alignment on 8-byte boundaries.
-  union uAll {
-     __int64 use;
-     double  double_align;
-     long long long_long_align;
-  } ;
-  static PNG_CONST union uAll LBCarryMask = {0x0101010101010101},
-                              HBClearMask = {0x7f7f7f7f7f7f7f7f};
-
-// Optimized code for PNG Average filter decoder
-void /* PRIVATE */
-png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
-                            , png_bytep prev_row)
-{
-  // These variables are declared
-  // here to ensure alignment on 8-byte boundaries.
-  union uAll ActiveMask, ShiftBpp, ShiftRem;
-
-   int bpp;
-   png_uint_32 FullLength;
-   png_uint_32 MMXLength;
-   //png_uint_32 len;
-   int diff;
-
-   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
-   FullLength  = row_info->rowbytes; // # of bytes to filter
-   _asm {
-         // Init address pointers and offset
-         mov edi, row          // edi ==> Avg(x)
-         xor ebx, ebx          // ebx ==> x
-         mov edx, edi
-         mov esi, prev_row           // esi ==> Prior(x)
-         sub edx, bpp          // edx ==> Raw(x-bpp)
-
-         xor eax, eax
-         // Compute the Raw value for the first bpp bytes
-         //    Raw(x) = Avg(x) + (Prior(x)/2)
-davgrlp:
-         mov al, [esi + ebx]   // Load al with Prior(x)
-         inc ebx
-         shr al, 1             // divide by 2
-         add al, [edi+ebx-1]   // Add Avg(x); -1 to offset inc ebx
-         cmp ebx, bpp
-         mov [edi+ebx-1], al    // Write back Raw(x);
-                            // mov does not affect flags; -1 to offset inc ebx
-         jb davgrlp
-         // get # of bytes to alignment
-         mov diff, edi         // take start of row
-         add diff, ebx         // add bpp
-         add diff, 0xf         // add 7 + 8 to incr past alignment boundary
-         and diff, 0xfffffff8  // mask to alignment boundary
-         sub diff, edi         // subtract from start ==> value ebx at alignment
-         jz davggo
-         // fix alignment
-         // Compute the Raw value for the bytes upto the alignment boundary
-         //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
-         xor ecx, ecx
-davglp1:
-         xor eax, eax
-         mov cl, [esi + ebx]        // load cl with Prior(x)
-         mov al, [edx + ebx]  // load al with Raw(x-bpp)
-         add ax, cx
-         inc ebx
-         shr ax, 1            // divide by 2
-         add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
-         cmp ebx, diff              // Check if at alignment boundary
-         mov [edi+ebx-1], al        // Write back Raw(x);
-                            // mov does not affect flags; -1 to offset inc ebx
-         jb davglp1               // Repeat until at alignment boundary
-davggo:
-         mov eax, FullLength
-         mov ecx, eax
-         sub eax, ebx          // subtract alignment fix
-         and eax, 0x00000007   // calc bytes over mult of 8
-         sub ecx, eax          // drop over bytes from original length
-         mov MMXLength, ecx
-   } // end _asm block
-   // Now do the math for the rest of the row
-   switch ( bpp )
-   {
-      case 3:
-      {
-         ActiveMask.use  = 0x0000000000ffffff;
-         ShiftBpp.use = 24;    // == 3 * 8
-         ShiftRem.use = 40;    // == 64 - 24
-         _asm {
-            // Re-init address pointers and offset
-            movq mm7, ActiveMask
-            mov ebx, diff      // ebx ==> x = offset to alignment boundary
-            movq mm5, LBCarryMask
-            mov edi, row       // edi ==> Avg(x)
-            movq mm4, HBClearMask
-            mov esi, prev_row        // esi ==> Prior(x)
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
-                               // (we correct position in loop below)
-davg3lp:
-            movq mm0, [edi + ebx]      // Load mm0 with Avg(x)
-            // Add (Prev_row/2) to Average
-            movq mm3, mm5
-            psrlq mm2, ShiftRem      // Correct position Raw(x-bpp) data
-            movq mm1, [esi + ebx]    // Load mm1 with Prior(x)
-            movq mm6, mm7
-            pand mm3, mm1      // get lsb for each prev_row byte
-            psrlq mm1, 1       // divide prev_row bytes by 2
-            pand  mm1, mm4     // clear invalid bit 7 of each byte
-            paddb mm0, mm1     // add (Prev_row/2) to Avg for each byte
-            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
-            movq mm1, mm3      // now use mm1 for getting LBCarrys
-            pand mm1, mm2      // get LBCarrys for each byte where both
-                               // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1       // divide raw bytes by 2
-            pand  mm2, mm4     // clear invalid bit 7 of each byte
-            paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6      // Leave only Active Group 1 bytes to add to Avg
-            paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
-                               //  byte
-            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
-            psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 3-5
-            movq mm2, mm0        // mov updated Raws to mm2
-            psllq mm2, ShiftBpp  // shift data to position correctly
-            movq mm1, mm3        // now use mm1 for getting LBCarrys
-            pand mm1, mm2      // get LBCarrys for each byte where both
-                               // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1       // divide raw bytes by 2
-            pand  mm2, mm4     // clear invalid bit 7 of each byte
-            paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6      // Leave only Active Group 2 bytes to add to Avg
-            paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
-                               //  byte
-
-            // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
-            psllq mm6, ShiftBpp  // shift the mm6 mask to cover the last two
-                                 // bytes
-            movq mm2, mm0        // mov updated Raws to mm2
-            psllq mm2, ShiftBpp  // shift data to position correctly
-                              // Data only needs to be shifted once here to
-                              // get the correct x-bpp offset.
-            movq mm1, mm3     // now use mm1 for getting LBCarrys
-            pand mm1, mm2     // get LBCarrys for each byte where both
-                              // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1      // divide raw bytes by 2
-            pand  mm2, mm4    // clear invalid bit 7 of each byte
-            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6     // Leave only Active Group 2 bytes to add to Avg
-            add ebx, 8
-            paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
-                              // byte
-
-            // Now ready to write back to memory
-            movq [edi + ebx - 8], mm0
-            // Move updated Raw(x) to use as Raw(x-bpp) for next loop
-            cmp ebx, MMXLength
-            movq mm2, mm0     // mov updated Raw(x) to mm2
-            jb davg3lp
-         } // end _asm block
-      }
-      break;
-
-      case 6:
-      case 4:
-      case 7:
-      case 5:
-      {
-         ActiveMask.use  = 0xffffffffffffffff;  // use shift below to clear
-                                                // appropriate inactive bytes
-         ShiftBpp.use = bpp << 3;
-         ShiftRem.use = 64 - ShiftBpp.use;
-         _asm {
-            movq mm4, HBClearMask
-            // Re-init address pointers and offset
-            mov ebx, diff       // ebx ==> x = offset to alignment boundary
-            // Load ActiveMask and clear all bytes except for 1st active group
-            movq mm7, ActiveMask
-            mov edi, row         // edi ==> Avg(x)
-            psrlq mm7, ShiftRem
-            mov esi, prev_row    // esi ==> Prior(x)
-            movq mm6, mm7
-            movq mm5, LBCarryMask
-            psllq mm6, ShiftBpp  // Create mask for 2nd active group
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
-                                 // (we correct position in loop below)
-davg4lp:
-            movq mm0, [edi + ebx]
-            psrlq mm2, ShiftRem  // shift data to position correctly
-            movq mm1, [esi + ebx]
-            // Add (Prev_row/2) to Average
-            movq mm3, mm5
-            pand mm3, mm1     // get lsb for each prev_row byte
-            psrlq mm1, 1      // divide prev_row bytes by 2
-            pand  mm1, mm4    // clear invalid bit 7 of each byte
-            paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
-            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
-            movq mm1, mm3     // now use mm1 for getting LBCarrys
-            pand mm1, mm2     // get LBCarrys for each byte where both
-                              // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1      // divide raw bytes by 2
-            pand  mm2, mm4    // clear invalid bit 7 of each byte
-            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm7     // Leave only Active Group 1 bytes to add to Avg
-            paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
-                              // byte
-            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
-            movq mm2, mm0     // mov updated Raws to mm2
-            psllq mm2, ShiftBpp // shift data to position correctly
-            add ebx, 8
-            movq mm1, mm3     // now use mm1 for getting LBCarrys
-            pand mm1, mm2     // get LBCarrys for each byte where both
-                              // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1      // divide raw bytes by 2
-            pand  mm2, mm4    // clear invalid bit 7 of each byte
-            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6     // Leave only Active Group 2 bytes to add to Avg
-            paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
-                              // byte
-            cmp ebx, MMXLength
-            // Now ready to write back to memory
-            movq [edi + ebx - 8], mm0
-            // Prep Raw(x-bpp) for next loop
-            movq mm2, mm0     // mov updated Raws to mm2
-            jb davg4lp
-         } // end _asm block
-      }
-      break;
-      case 2:
-      {
-         ActiveMask.use  = 0x000000000000ffff;
-         ShiftBpp.use = 16;   // == 2 * 8     [BUGFIX]
-         ShiftRem.use = 48;   // == 64 - 16   [BUGFIX]
-         _asm {
-            // Load ActiveMask
-            movq mm7, ActiveMask
-            // Re-init address pointers and offset
-            mov ebx, diff     // ebx ==> x = offset to alignment boundary
-            movq mm5, LBCarryMask
-            mov edi, row      // edi ==> Avg(x)
-            movq mm4, HBClearMask
-            mov esi, prev_row  // esi ==> Prior(x)
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
-                              // (we correct position in loop below)
-davg2lp:
-            movq mm0, [edi + ebx]
-            psrlq mm2, ShiftRem  // shift data to position correctly   [BUGFIX]
-            movq mm1, [esi + ebx]
-            // Add (Prev_row/2) to Average
-            movq mm3, mm5
-            pand mm3, mm1     // get lsb for each prev_row byte
-            psrlq mm1, 1      // divide prev_row bytes by 2
-            pand  mm1, mm4    // clear invalid bit 7 of each byte
-            movq mm6, mm7
-            paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
-            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
-            movq mm1, mm3     // now use mm1 for getting LBCarrys
-            pand mm1, mm2     // get LBCarrys for each byte where both
-                              // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1      // divide raw bytes by 2
-            pand  mm2, mm4    // clear invalid bit 7 of each byte
-            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6     // Leave only Active Group 1 bytes to add to Avg
-            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
-            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
-            psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
-            movq mm2, mm0       // mov updated Raws to mm2
-            psllq mm2, ShiftBpp // shift data to position correctly
-            movq mm1, mm3       // now use mm1 for getting LBCarrys
-            pand mm1, mm2       // get LBCarrys for each byte where both
-                                // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1        // divide raw bytes by 2
-            pand  mm2, mm4      // clear invalid bit 7 of each byte
-            paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6       // Leave only Active Group 2 bytes to add to Avg
-            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
-
-            // Add rdd active group (Raw(x-bpp)/2) to Average with LBCarry
-            psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
-            movq mm2, mm0       // mov updated Raws to mm2
-            psllq mm2, ShiftBpp // shift data to position correctly
-                                // Data only needs to be shifted once here to
-                                // get the correct x-bpp offset.
-            movq mm1, mm3       // now use mm1 for getting LBCarrys
-            pand mm1, mm2       // get LBCarrys for each byte where both
-                                // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1        // divide raw bytes by 2
-            pand  mm2, mm4      // clear invalid bit 7 of each byte
-            paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6       // Leave only Active Group 2 bytes to add to Avg
-            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
-
-            // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
-            psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 6 & 7
-            movq mm2, mm0        // mov updated Raws to mm2
-            psllq mm2, ShiftBpp  // shift data to position correctly
-                                 // Data only needs to be shifted once here to
-                                 // get the correct x-bpp offset.
-            add ebx, 8
-            movq mm1, mm3    // now use mm1 for getting LBCarrys
-            pand mm1, mm2    // get LBCarrys for each byte where both
-                             // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1     // divide raw bytes by 2
-            pand  mm2, mm4   // clear invalid bit 7 of each byte
-            paddb mm2, mm1   // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6    // Leave only Active Group 2 bytes to add to Avg
-            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
-
-            cmp ebx, MMXLength
-            // Now ready to write back to memory
-            movq [edi + ebx - 8], mm0
-            // Prep Raw(x-bpp) for next loop
-            movq mm2, mm0    // mov updated Raws to mm2
-            jb davg2lp
-        } // end _asm block
-      }
-      break;
-
-      case 1:                 // bpp == 1
-      {
-         _asm {
-            // Re-init address pointers and offset
-            mov ebx, diff     // ebx ==> x = offset to alignment boundary
-            mov edi, row      // edi ==> Avg(x)
-            cmp ebx, FullLength  // Test if offset at end of array
-            jnb davg1end
-            // Do Paeth decode for remaining bytes
-            mov esi, prev_row    // esi ==> Prior(x)
-            mov edx, edi
-            xor ecx, ecx         // zero ecx before using cl & cx in loop below
-            sub edx, bpp         // edx ==> Raw(x-bpp)
-davg1lp:
-            // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
-            xor eax, eax
-            mov cl, [esi + ebx]  // load cl with Prior(x)
-            mov al, [edx + ebx]  // load al with Raw(x-bpp)
-            add ax, cx
-            inc ebx
-            shr ax, 1            // divide by 2
-            add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
-            cmp ebx, FullLength  // Check if at end of array
-            mov [edi+ebx-1], al  // Write back Raw(x);
-                         // mov does not affect flags; -1 to offset inc ebx
-            jb davg1lp
-davg1end:
-         } // end _asm block
-      }
-      return;
-
-      case 8:             // bpp == 8
-      {
-         _asm {
-            // Re-init address pointers and offset
-            mov ebx, diff           // ebx ==> x = offset to alignment boundary
-            movq mm5, LBCarryMask
-            mov edi, row            // edi ==> Avg(x)
-            movq mm4, HBClearMask
-            mov esi, prev_row       // esi ==> Prior(x)
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
-                                // (NO NEED to correct position in loop below)
-davg8lp:
-            movq mm0, [edi + ebx]
-            movq mm3, mm5
-            movq mm1, [esi + ebx]
-            add ebx, 8
-            pand mm3, mm1       // get lsb for each prev_row byte
-            psrlq mm1, 1        // divide prev_row bytes by 2
-            pand mm3, mm2       // get LBCarrys for each byte where both
-                                // lsb's were == 1
-            psrlq mm2, 1        // divide raw bytes by 2
-            pand  mm1, mm4      // clear invalid bit 7 of each byte
-            paddb mm0, mm3      // add LBCarrys to Avg for each byte
-            pand  mm2, mm4      // clear invalid bit 7 of each byte
-            paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
-            paddb mm0, mm2      // add (Raw/2) to Avg for each byte
-            cmp ebx, MMXLength
-            movq [edi + ebx - 8], mm0
-            movq mm2, mm0       // reuse as Raw(x-bpp)
-            jb davg8lp
-        } // end _asm block
-      }
-      break;
-      default:                  // bpp greater than 8
-      {
-        _asm {
-            movq mm5, LBCarryMask
-            // Re-init address pointers and offset
-            mov ebx, diff       // ebx ==> x = offset to alignment boundary
-            mov edi, row        // edi ==> Avg(x)
-            movq mm4, HBClearMask
-            mov edx, edi
-            mov esi, prev_row   // esi ==> Prior(x)
-            sub edx, bpp        // edx ==> Raw(x-bpp)
-davgAlp:
-            movq mm0, [edi + ebx]
-            movq mm3, mm5
-            movq mm1, [esi + ebx]
-            pand mm3, mm1       // get lsb for each prev_row byte
-            movq mm2, [edx + ebx]
-            psrlq mm1, 1        // divide prev_row bytes by 2
-            pand mm3, mm2       // get LBCarrys for each byte where both
-                                // lsb's were == 1
-            psrlq mm2, 1        // divide raw bytes by 2
-            pand  mm1, mm4      // clear invalid bit 7 of each byte
-            paddb mm0, mm3      // add LBCarrys to Avg for each byte
-            pand  mm2, mm4      // clear invalid bit 7 of each byte
-            paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
-            add ebx, 8
-            paddb mm0, mm2      // add (Raw/2) to Avg for each byte
-            cmp ebx, MMXLength
-            movq [edi + ebx - 8], mm0
-            jb davgAlp
-        } // end _asm block
-      }
-      break;
-   }                         // end switch ( bpp )
-
-   _asm {
-         // MMX acceleration complete now do clean-up
-         // Check if any remaining bytes left to decode
-         mov ebx, MMXLength    // ebx ==> x = offset bytes remaining after MMX
-         mov edi, row          // edi ==> Avg(x)
-         cmp ebx, FullLength   // Test if offset at end of array
-         jnb davgend
-         // Do Paeth decode for remaining bytes
-         mov esi, prev_row     // esi ==> Prior(x)
-         mov edx, edi
-         xor ecx, ecx          // zero ecx before using cl & cx in loop below
-         sub edx, bpp          // edx ==> Raw(x-bpp)
-davglp2:
-         // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
-         xor eax, eax
-         mov cl, [esi + ebx]   // load cl with Prior(x)
-         mov al, [edx + ebx]   // load al with Raw(x-bpp)
-         add ax, cx
-         inc ebx
-         shr ax, 1              // divide by 2
-         add al, [edi+ebx-1]    // Add Avg(x); -1 to offset inc ebx
-         cmp ebx, FullLength    // Check if at end of array
-         mov [edi+ebx-1], al    // Write back Raw(x);
-                          // mov does not affect flags; -1 to offset inc ebx
-         jb davglp2
-davgend:
-         emms             // End MMX instructions; prep for possible FP instrs.
-   } // end _asm block
-}
-
-// Optimized code for PNG Paeth filter decoder
-void /* PRIVATE */
-png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
-                              png_bytep prev_row)
-{
-  // These variables are declared
-  // here to ensure alignment on 8-byte boundaries.
-  union uAll  ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
-
-   png_uint_32 FullLength;
-   png_uint_32 MMXLength;
-   //png_uint_32 len;
-   int bpp;
-   int diff;
-   //int ptemp;
-   int patemp, pbtemp, pctemp;
-
-   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
-   FullLength  = row_info->rowbytes; // # of bytes to filter
-   _asm
-   {
-         xor ebx, ebx        // ebx ==> x offset
-         mov edi, row
-         xor edx, edx        // edx ==> x-bpp offset
-         mov esi, prev_row
-         xor eax, eax
-
-         // Compute the Raw value for the first bpp bytes
-         // Note: the formula works out to be always
-         //   Paeth(x) = Raw(x) + Prior(x)      where x < bpp
-dpthrlp:
-         mov al, [edi + ebx]
-         add al, [esi + ebx]
-         inc ebx
-         cmp ebx, bpp
-         mov [edi + ebx - 1], al
-         jb dpthrlp
-         // get # of bytes to alignment
-         mov diff, edi         // take start of row
-         add diff, ebx         // add bpp
-         xor ecx, ecx
-         add diff, 0xf         // add 7 + 8 to incr past alignment boundary
-         and diff, 0xfffffff8  // mask to alignment boundary
-         sub diff, edi         // subtract from start ==> value ebx at alignment
-         jz dpthgo
-         // fix alignment
-dpthlp1:
-         xor eax, eax
-         // pav = p - a = (a + b - c) - a = b - c
-         mov al, [esi + ebx]   // load Prior(x) into al
-         mov cl, [esi + edx]   // load Prior(x-bpp) into cl
-         sub eax, ecx          // subtract Prior(x-bpp)
-         mov patemp, eax       // Save pav for later use
-         xor eax, eax
-         // pbv = p - b = (a + b - c) - b = a - c
-         mov al, [edi + edx]   // load Raw(x-bpp) into al
-         sub eax, ecx          // subtract Prior(x-bpp)
-         mov ecx, eax
-         // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-         add eax, patemp       // pcv = pav + pbv
-         // pc = abs(pcv)
-         test eax, 0x80000000
-         jz dpthpca
-         neg eax               // reverse sign of neg values
-dpthpca:
-         mov pctemp, eax       // save pc for later use
-         // pb = abs(pbv)
-         test ecx, 0x80000000
-         jz dpthpba
-         neg ecx               // reverse sign of neg values
-dpthpba:
-         mov pbtemp, ecx       // save pb for later use
-         // pa = abs(pav)
-         mov eax, patemp
-         test eax, 0x80000000
-         jz dpthpaa
-         neg eax               // reverse sign of neg values
-dpthpaa:
-         mov patemp, eax       // save pa for later use
-         // test if pa <= pb
-         cmp eax, ecx
-         jna dpthabb
-         // pa > pb; now test if pb <= pc
-         cmp ecx, pctemp
-         jna dpthbbc
-         // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
-         jmp dpthpaeth
-dpthbbc:
-         // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
-         mov cl, [esi + ebx]   // load Prior(x) into cl
-         jmp dpthpaeth
-dpthabb:
-         // pa <= pb; now test if pa <= pc
-         cmp eax, pctemp
-         jna dpthabc
-         // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
-         jmp dpthpaeth
-dpthabc:
-         // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
-         mov cl, [edi + edx]  // load Raw(x-bpp) into cl
-dpthpaeth:
-         inc ebx
-         inc edx
-         // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
-         add [edi + ebx - 1], cl
-         cmp ebx, diff
-         jb dpthlp1
-dpthgo:
-         mov ecx, FullLength
-         mov eax, ecx
-         sub eax, ebx          // subtract alignment fix
-         and eax, 0x00000007   // calc bytes over mult of 8
-         sub ecx, eax          // drop over bytes from original length
-         mov MMXLength, ecx
-   } // end _asm block
-   // Now do the math for the rest of the row
-   switch ( bpp )
-   {
-      case 3:
-      {
-         ActiveMask.use = 0x0000000000ffffff;
-         ActiveMaskEnd.use = 0xffff000000000000;
-         ShiftBpp.use = 24;    // == bpp(3) * 8
-         ShiftRem.use = 40;    // == 64 - 24
-         _asm
-         {
-            mov ebx, diff
-            mov edi, row
-            mov esi, prev_row
-            pxor mm0, mm0
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm1, [edi+ebx-8]
-dpth3lp:
-            psrlq mm1, ShiftRem     // shift last 3 bytes to 1st 3 bytes
-            movq mm2, [esi + ebx]   // load b=Prior(x)
-            punpcklbw mm1, mm0      // Unpack High bytes of a
-            movq mm3, [esi+ebx-8]   // Prep c=Prior(x-bpp) bytes
-            punpcklbw mm2, mm0      // Unpack High bytes of b
-            psrlq mm3, ShiftRem     // shift last 3 bytes to 1st 3 bytes
-            // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            punpcklbw mm3, mm0      // Unpack High bytes of c
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            psubw mm4, mm3
-            pxor mm7, mm7
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            movq mm6, mm4
-            psubw mm5, mm3
-
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4    // Create mask pav bytes < 0
-            paddw mm6, mm5
-            pand mm0, mm4       // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
-            psubw mm4, mm0
-            pand mm7, mm5       // Only pbv bytes < 0 in mm0
-            psubw mm4, mm0
-            psubw mm5, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
-            pand mm0, mm6       // Only pav bytes < 0 in mm7
-            psubw mm5, mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5    // pa > pb?
-            movq mm0, mm7
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            pandn mm7, mm4
-            pandn mm0, mm1
-            paddw mm7, mm5
-            paddw mm0, mm2
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6       // pab > pc?
-            pxor mm1, mm1
-            pand mm3, mm7
-            pandn mm7, mm0
-            paddw mm7, mm3
-            pxor mm0, mm0
-            packuswb mm7, mm1
-            movq mm3, [esi + ebx]   // load c=Prior(x-bpp)
-            pand mm7, ActiveMask
-            movq mm2, mm3           // load b=Prior(x) step 1
-            paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
-            punpcklbw mm3, mm0      // Unpack High bytes of c
-            movq [edi + ebx], mm7   // write back updated value
-            movq mm1, mm7           // Now mm1 will be used as Raw(x-bpp)
-            // Now do Paeth for 2nd set of bytes (3-5)
-            psrlq mm2, ShiftBpp     // load b=Prior(x) step 2
-            punpcklbw mm1, mm0      // Unpack High bytes of a
-            pxor mm7, mm7
-            punpcklbw mm2, mm0      // Unpack High bytes of b
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            psubw mm5, mm3
-            psubw mm4, mm3
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
-            //       pav + pbv = pbv + pav
-            movq mm6, mm5
-            paddw mm6, mm4
-
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm5       // Create mask pbv bytes < 0
-            pcmpgtw mm7, mm4       // Create mask pav bytes < 0
-            pand mm0, mm5          // Only pbv bytes < 0 in mm0
-            pand mm7, mm4          // Only pav bytes < 0 in mm7
-            psubw mm5, mm0
-            psubw mm4, mm7
-            psubw mm5, mm0
-            psubw mm4, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
-            pand mm0, mm6          // Only pav bytes < 0 in mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5       // pa > pb?
-            movq mm0, mm7
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            pandn mm7, mm4
-            pandn mm0, mm1
-            paddw mm7, mm5
-            paddw mm0, mm2
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6       // pab > pc?
-            movq mm2, [esi + ebx]  // load b=Prior(x)
-            pand mm3, mm7
-            pandn mm7, mm0
-            pxor mm1, mm1
-            paddw mm7, mm3
-            pxor mm0, mm0
-            packuswb mm7, mm1
-            movq mm3, mm2           // load c=Prior(x-bpp) step 1
-            pand mm7, ActiveMask
-            punpckhbw mm2, mm0      // Unpack High bytes of b
-            psllq mm7, ShiftBpp     // Shift bytes to 2nd group of 3 bytes
-             // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
-            psllq mm3, ShiftBpp     // load c=Prior(x-bpp) step 2
-            movq [edi + ebx], mm7   // write back updated value
-            movq mm1, mm7
-            punpckhbw mm3, mm0      // Unpack High bytes of c
-            psllq mm1, ShiftBpp     // Shift bytes
-                                    // Now mm1 will be used as Raw(x-bpp)
-            // Now do Paeth for 3rd, and final, set of bytes (6-7)
-            pxor mm7, mm7
-            punpckhbw mm1, mm0      // Unpack High bytes of a
-            psubw mm4, mm3
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            movq mm6, mm4
-            psubw mm5, mm3
-            pxor mm0, mm0
-            paddw mm6, mm5
-
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4    // Create mask pav bytes < 0
-            pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
-            pand mm0, mm4       // Only pav bytes < 0 in mm7
-            pand mm7, mm5       // Only pbv bytes < 0 in mm0
-            psubw mm4, mm0
-            psubw mm5, mm7
-            psubw mm4, mm0
-            psubw mm5, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
-            pand mm0, mm6       // Only pav bytes < 0 in mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5    // pa > pb?
-            movq mm0, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            pandn mm0, mm1
-            pandn mm7, mm4
-            paddw mm0, mm2
-            paddw mm7, mm5
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6    // pab > pc?
-            pand mm3, mm7
-            pandn mm7, mm0
-            paddw mm7, mm3
-            pxor mm1, mm1
-            packuswb mm1, mm7
-            // Step ebx to next set of 8 bytes and repeat loop til done
-            add ebx, 8
-            pand mm1, ActiveMaskEnd
-            paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
-
-            cmp ebx, MMXLength
-            pxor mm0, mm0              // pxor does not affect flags
-            movq [edi + ebx - 8], mm1  // write back updated value
-                                 // mm1 will be used as Raw(x-bpp) next loop
-                           // mm3 ready to be used as Prior(x-bpp) next loop
-            jb dpth3lp
-         } // end _asm block
-      }
-      break;
-
-      case 6:
-      case 7:
-      case 5:
-      {
-         ActiveMask.use  = 0x00000000ffffffff;
-         ActiveMask2.use = 0xffffffff00000000;
-         ShiftBpp.use = bpp << 3;    // == bpp * 8
-         ShiftRem.use = 64 - ShiftBpp.use;
-         _asm
-         {
-            mov ebx, diff
-            mov edi, row
-            mov esi, prev_row
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm1, [edi+ebx-8]
-            pxor mm0, mm0
-dpth6lp:
-            // Must shift to position Raw(x-bpp) data
-            psrlq mm1, ShiftRem
-            // Do first set of 4 bytes
-            movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
-            punpcklbw mm1, mm0      // Unpack Low bytes of a
-            movq mm2, [esi + ebx]   // load b=Prior(x)
-            punpcklbw mm2, mm0      // Unpack Low bytes of b
-            // Must shift to position Prior(x-bpp) data
-            psrlq mm3, ShiftRem
-            // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            punpcklbw mm3, mm0      // Unpack Low bytes of c
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            psubw mm4, mm3
-            pxor mm7, mm7
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            movq mm6, mm4
-            psubw mm5, mm3
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4    // Create mask pav bytes < 0
-            paddw mm6, mm5
-            pand mm0, mm4       // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
-            psubw mm4, mm0
-            pand mm7, mm5       // Only pbv bytes < 0 in mm0
-            psubw mm4, mm0
-            psubw mm5, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
-            pand mm0, mm6       // Only pav bytes < 0 in mm7
-            psubw mm5, mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5    // pa > pb?
-            movq mm0, mm7
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            pandn mm7, mm4
-            pandn mm0, mm1
-            paddw mm7, mm5
-            paddw mm0, mm2
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6    // pab > pc?
-            pxor mm1, mm1
-            pand mm3, mm7
-            pandn mm7, mm0
-            paddw mm7, mm3
-            pxor mm0, mm0
-            packuswb mm7, mm1
-            movq mm3, [esi + ebx - 8]  // load c=Prior(x-bpp)
-            pand mm7, ActiveMask
-            psrlq mm3, ShiftRem
-            movq mm2, [esi + ebx]      // load b=Prior(x) step 1
-            paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
-            movq mm6, mm2
-            movq [edi + ebx], mm7      // write back updated value
-            movq mm1, [edi+ebx-8]
-            psllq mm6, ShiftBpp
-            movq mm5, mm7
-            psrlq mm1, ShiftRem
-            por mm3, mm6
-            psllq mm5, ShiftBpp
-            punpckhbw mm3, mm0         // Unpack High bytes of c
-            por mm1, mm5
-            // Do second set of 4 bytes
-            punpckhbw mm2, mm0         // Unpack High bytes of b
-            punpckhbw mm1, mm0         // Unpack High bytes of a
-            // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            psubw mm4, mm3
-            pxor mm7, mm7
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            movq mm6, mm4
-            psubw mm5, mm3
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
-            paddw mm6, mm5
-            pand mm0, mm4          // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
-            psubw mm4, mm0
-            pand mm7, mm5          // Only pbv bytes < 0 in mm0
-            psubw mm4, mm0
-            psubw mm5, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
-            pand mm0, mm6          // Only pav bytes < 0 in mm7
-            psubw mm5, mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5       // pa > pb?
-            movq mm0, mm7
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            pandn mm7, mm4
-            pandn mm0, mm1
-            paddw mm7, mm5
-            paddw mm0, mm2
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6           // pab > pc?
-            pxor mm1, mm1
-            pand mm3, mm7
-            pandn mm7, mm0
-            pxor mm1, mm1
-            paddw mm7, mm3
-            pxor mm0, mm0
-            // Step ex to next set of 8 bytes and repeat loop til done
-            add ebx, 8
-            packuswb mm1, mm7
-            paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
-            cmp ebx, MMXLength
-            movq [edi + ebx - 8], mm1      // write back updated value
-                                // mm1 will be used as Raw(x-bpp) next loop
-            jb dpth6lp
-         } // end _asm block
-      }
-      break;
-
-      case 4:
-      {
-         ActiveMask.use  = 0x00000000ffffffff;
-         _asm {
-            mov ebx, diff
-            mov edi, row
-            mov esi, prev_row
-            pxor mm0, mm0
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm1, [edi+ebx-8]    // Only time should need to read
-                                     //  a=Raw(x-bpp) bytes
-dpth4lp:
-            // Do first set of 4 bytes
-            movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
-            punpckhbw mm1, mm0       // Unpack Low bytes of a
-            movq mm2, [esi + ebx]    // load b=Prior(x)
-            punpcklbw mm2, mm0       // Unpack High bytes of b
-            // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            punpckhbw mm3, mm0       // Unpack High bytes of c
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            psubw mm4, mm3
-            pxor mm7, mm7
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            movq mm6, mm4
-            psubw mm5, mm3
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
-            paddw mm6, mm5
-            pand mm0, mm4          // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
-            psubw mm4, mm0
-            pand mm7, mm5          // Only pbv bytes < 0 in mm0
-            psubw mm4, mm0
-            psubw mm5, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
-            pand mm0, mm6          // Only pav bytes < 0 in mm7
-            psubw mm5, mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5       // pa > pb?
-            movq mm0, mm7
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            pandn mm7, mm4
-            pandn mm0, mm1
-            paddw mm7, mm5
-            paddw mm0, mm2
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6       // pab > pc?
-            pxor mm1, mm1
-            pand mm3, mm7
-            pandn mm7, mm0
-            paddw mm7, mm3
-            pxor mm0, mm0
-            packuswb mm7, mm1
-            movq mm3, [esi + ebx]      // load c=Prior(x-bpp)
-            pand mm7, ActiveMask
-            movq mm2, mm3              // load b=Prior(x) step 1
-            paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
-            punpcklbw mm3, mm0         // Unpack High bytes of c
-            movq [edi + ebx], mm7      // write back updated value
-            movq mm1, mm7              // Now mm1 will be used as Raw(x-bpp)
-            // Do second set of 4 bytes
-            punpckhbw mm2, mm0         // Unpack Low bytes of b
-            punpcklbw mm1, mm0         // Unpack Low bytes of a
-            // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            psubw mm4, mm3
-            pxor mm7, mm7
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            movq mm6, mm4
-            psubw mm5, mm3
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
-            paddw mm6, mm5
-            pand mm0, mm4          // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
-            psubw mm4, mm0
-            pand mm7, mm5          // Only pbv bytes < 0 in mm0
-            psubw mm4, mm0
-            psubw mm5, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
-            pand mm0, mm6          // Only pav bytes < 0 in mm7
-            psubw mm5, mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5       // pa > pb?
-            movq mm0, mm7
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            pandn mm7, mm4
-            pandn mm0, mm1
-            paddw mm7, mm5
-            paddw mm0, mm2
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6       // pab > pc?
-            pxor mm1, mm1
-            pand mm3, mm7
-            pandn mm7, mm0
-            pxor mm1, mm1
-            paddw mm7, mm3
-            pxor mm0, mm0
-            // Step ex to next set of 8 bytes and repeat loop til done
-            add ebx, 8
-            packuswb mm1, mm7
-            paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
-            cmp ebx, MMXLength
-            movq [edi + ebx - 8], mm1      // write back updated value
-                                // mm1 will be used as Raw(x-bpp) next loop
-            jb dpth4lp
-         } // end _asm block
-      }
-      break;
-      case 8:                          // bpp == 8
-      {
-         ActiveMask.use  = 0x00000000ffffffff;
-         _asm {
-            mov ebx, diff
-            mov edi, row
-            mov esi, prev_row
-            pxor mm0, mm0
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm1, [edi+ebx-8]      // Only time should need to read
-                                       //  a=Raw(x-bpp) bytes
-dpth8lp:
-            // Do first set of 4 bytes
-            movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
-            punpcklbw mm1, mm0         // Unpack Low bytes of a
-            movq mm2, [esi + ebx]      // load b=Prior(x)
-            punpcklbw mm2, mm0         // Unpack Low bytes of b
-            // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            punpcklbw mm3, mm0         // Unpack Low bytes of c
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            psubw mm4, mm3
-            pxor mm7, mm7
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            movq mm6, mm4
-            psubw mm5, mm3
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
-            paddw mm6, mm5
-            pand mm0, mm4          // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
-            psubw mm4, mm0
-            pand mm7, mm5          // Only pbv bytes < 0 in mm0
-            psubw mm4, mm0
-            psubw mm5, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
-            pand mm0, mm6          // Only pav bytes < 0 in mm7
-            psubw mm5, mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5       // pa > pb?
-            movq mm0, mm7
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            pandn mm7, mm4
-            pandn mm0, mm1
-            paddw mm7, mm5
-            paddw mm0, mm2
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6       // pab > pc?
-            pxor mm1, mm1
-            pand mm3, mm7
-            pandn mm7, mm0
-            paddw mm7, mm3
-            pxor mm0, mm0
-            packuswb mm7, mm1
-            movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
-            pand mm7, ActiveMask
-            movq mm2, [esi + ebx]    // load b=Prior(x)
-            paddb mm7, [edi + ebx]   // add Paeth predictor with Raw(x)
-            punpckhbw mm3, mm0       // Unpack High bytes of c
-            movq [edi + ebx], mm7    // write back updated value
-            movq mm1, [edi+ebx-8]    // read a=Raw(x-bpp) bytes
-
-            // Do second set of 4 bytes
-            punpckhbw mm2, mm0       // Unpack High bytes of b
-            punpckhbw mm1, mm0       // Unpack High bytes of a
-            // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            psubw mm4, mm3
-            pxor mm7, mm7
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            movq mm6, mm4
-            psubw mm5, mm3
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
-            paddw mm6, mm5
-            pand mm0, mm4          // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
-            psubw mm4, mm0
-            pand mm7, mm5          // Only pbv bytes < 0 in mm0
-            psubw mm4, mm0
-            psubw mm5, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
-            pand mm0, mm6          // Only pav bytes < 0 in mm7
-            psubw mm5, mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5       // pa > pb?
-            movq mm0, mm7
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            pandn mm7, mm4
-            pandn mm0, mm1
-            paddw mm7, mm5
-            paddw mm0, mm2
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6       // pab > pc?
-            pxor mm1, mm1
-            pand mm3, mm7
-            pandn mm7, mm0
-            pxor mm1, mm1
-            paddw mm7, mm3
-            pxor mm0, mm0
-            // Step ex to next set of 8 bytes and repeat loop til done
-            add ebx, 8
-            packuswb mm1, mm7
-            paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
-            cmp ebx, MMXLength
-            movq [edi + ebx - 8], mm1      // write back updated value
-                            // mm1 will be used as Raw(x-bpp) next loop
-            jb dpth8lp
-         } // end _asm block
-      }
-      break;
-
-      case 1:                // bpp = 1
-      case 2:                // bpp = 2
-      default:               // bpp > 8
-      {
-         _asm {
-            mov ebx, diff
-            cmp ebx, FullLength
-            jnb dpthdend
-            mov edi, row
-            mov esi, prev_row
-            // Do Paeth decode for remaining bytes
-            mov edx, ebx
-            xor ecx, ecx        // zero ecx before using cl & cx in loop below
-            sub edx, bpp        // Set edx = ebx - bpp
-dpthdlp:
-            xor eax, eax
-            // pav = p - a = (a + b - c) - a = b - c
-            mov al, [esi + ebx]        // load Prior(x) into al
-            mov cl, [esi + edx]        // load Prior(x-bpp) into cl
-            sub eax, ecx                 // subtract Prior(x-bpp)
-            mov patemp, eax                 // Save pav for later use
-            xor eax, eax
-            // pbv = p - b = (a + b - c) - b = a - c
-            mov al, [edi + edx]        // load Raw(x-bpp) into al
-            sub eax, ecx                 // subtract Prior(x-bpp)
-            mov ecx, eax
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            add eax, patemp                 // pcv = pav + pbv
-            // pc = abs(pcv)
-            test eax, 0x80000000
-            jz dpthdpca
-            neg eax                     // reverse sign of neg values
-dpthdpca:
-            mov pctemp, eax             // save pc for later use
-            // pb = abs(pbv)
-            test ecx, 0x80000000
-            jz dpthdpba
-            neg ecx                     // reverse sign of neg values
-dpthdpba:
-            mov pbtemp, ecx             // save pb for later use
-            // pa = abs(pav)
-            mov eax, patemp
-            test eax, 0x80000000
-            jz dpthdpaa
-            neg eax                     // reverse sign of neg values
-dpthdpaa:
-            mov patemp, eax             // save pa for later use
-            // test if pa <= pb
-            cmp eax, ecx
-            jna dpthdabb
-            // pa > pb; now test if pb <= pc
-            cmp ecx, pctemp
-            jna dpthdbbc
-            // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-            mov cl, [esi + edx]  // load Prior(x-bpp) into cl
-            jmp dpthdpaeth
-dpthdbbc:
-            // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
-            mov cl, [esi + ebx]        // load Prior(x) into cl
-            jmp dpthdpaeth
-dpthdabb:
-            // pa <= pb; now test if pa <= pc
-            cmp eax, pctemp
-            jna dpthdabc
-            // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-            mov cl, [esi + edx]  // load Prior(x-bpp) into cl
-            jmp dpthdpaeth
-dpthdabc:
-            // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
-            mov cl, [edi + edx]  // load Raw(x-bpp) into cl
-dpthdpaeth:
-            inc ebx
-            inc edx
-            // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
-            add [edi + ebx - 1], cl
-            cmp ebx, FullLength
-            jb dpthdlp
-dpthdend:
-         } // end _asm block
-      }
-      return;                   // No need to go further with this one
-   }                         // end switch ( bpp )
-   _asm
-   {
-         // MMX acceleration complete now do clean-up
-         // Check if any remaining bytes left to decode
-         mov ebx, MMXLength
-         cmp ebx, FullLength
-         jnb dpthend
-         mov edi, row
-         mov esi, prev_row
-         // Do Paeth decode for remaining bytes
-         mov edx, ebx
-         xor ecx, ecx         // zero ecx before using cl & cx in loop below
-         sub edx, bpp         // Set edx = ebx - bpp
-dpthlp2:
-         xor eax, eax
-         // pav = p - a = (a + b - c) - a = b - c
-         mov al, [esi + ebx]  // load Prior(x) into al
-         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
-         sub eax, ecx         // subtract Prior(x-bpp)
-         mov patemp, eax      // Save pav for later use
-         xor eax, eax
-         // pbv = p - b = (a + b - c) - b = a - c
-         mov al, [edi + edx]  // load Raw(x-bpp) into al
-         sub eax, ecx         // subtract Prior(x-bpp)
-         mov ecx, eax
-         // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-         add eax, patemp      // pcv = pav + pbv
-         // pc = abs(pcv)
-         test eax, 0x80000000
-         jz dpthpca2
-         neg eax              // reverse sign of neg values
-dpthpca2:
-         mov pctemp, eax      // save pc for later use
-         // pb = abs(pbv)
-         test ecx, 0x80000000
-         jz dpthpba2
-         neg ecx              // reverse sign of neg values
-dpthpba2:
-         mov pbtemp, ecx      // save pb for later use
-         // pa = abs(pav)
-         mov eax, patemp
-         test eax, 0x80000000
-         jz dpthpaa2
-         neg eax              // reverse sign of neg values
-dpthpaa2:
-         mov patemp, eax      // save pa for later use
-         // test if pa <= pb
-         cmp eax, ecx
-         jna dpthabb2
-         // pa > pb; now test if pb <= pc
-         cmp ecx, pctemp
-         jna dpthbbc2
-         // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
-         jmp dpthpaeth2
-dpthbbc2:
-         // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
-         mov cl, [esi + ebx]        // load Prior(x) into cl
-         jmp dpthpaeth2
-dpthabb2:
-         // pa <= pb; now test if pa <= pc
-         cmp eax, pctemp
-         jna dpthabc2
-         // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
-         jmp dpthpaeth2
-dpthabc2:
-         // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
-         mov cl, [edi + edx]  // load Raw(x-bpp) into cl
-dpthpaeth2:
-         inc ebx
-         inc edx
-         // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
-         add [edi + ebx - 1], cl
-         cmp ebx, FullLength
-         jb dpthlp2
-dpthend:
-         emms             // End MMX instructions; prep for possible FP instrs.
-   } // end _asm block
-}
-
-// Optimized code for PNG Sub filter decoder
-void /* PRIVATE */
-png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
-{
-  // These variables are declared
-  // here to ensure alignment on 8-byte boundaries.
-  union uAll ActiveMask, ShiftBpp, ShiftRem;
-
-   //int test;
-   int bpp;
-   png_uint_32 FullLength;
-   png_uint_32 MMXLength;
-   int diff;
-
-   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
-   FullLength  = row_info->rowbytes - bpp; // # of bytes to filter
-   _asm {
-        mov edi, row
-        mov esi, edi               // lp = row
-        add edi, bpp               // rp = row + bpp
-        xor eax, eax
-        // get # of bytes to alignment
-        mov diff, edi               // take start of row
-        add diff, 0xf               // add 7 + 8 to incr past
-                                        // alignment boundary
-        xor ebx, ebx
-        and diff, 0xfffffff8        // mask to alignment boundary
-        sub diff, edi               // subtract from start ==> value
-                                        //  ebx at alignment
-        jz dsubgo
-        // fix alignment
-dsublp1:
-        mov al, [esi+ebx]
-        add [edi+ebx], al
-        inc ebx
-        cmp ebx, diff
-        jb dsublp1
-dsubgo:
-        mov ecx, FullLength
-        mov edx, ecx
-        sub edx, ebx                  // subtract alignment fix
-        and edx, 0x00000007           // calc bytes over mult of 8
-        sub ecx, edx                  // drop over bytes from length
-        mov MMXLength, ecx
-   } // end _asm block
-
-   // Now do the math for the rest of the row
-   switch ( bpp )
-   {
-        case 3:
-        {
-         ActiveMask.use  = 0x0000ffffff000000;
-         ShiftBpp.use = 24;       // == 3 * 8
-         ShiftRem.use  = 40;      // == 64 - 24
-         _asm {
-            mov edi, row
-            movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
-            mov esi, edi              // lp = row
-            add edi, bpp          // rp = row + bpp
-            movq mm6, mm7
-            mov ebx, diff
-            psllq mm6, ShiftBpp   // Move mask in mm6 to cover 3rd active
-                                  // byte group
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm1, [edi+ebx-8]
-dsub3lp:
-            psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes
-                          // no need for mask; shift clears inactive bytes
-            // Add 1st active group
-            movq mm0, [edi+ebx]
-            paddb mm0, mm1
-            // Add 2nd active group
-            movq mm1, mm0         // mov updated Raws to mm1
-            psllq mm1, ShiftBpp   // shift data to position correctly
-            pand mm1, mm7         // mask to use only 2nd active group
-            paddb mm0, mm1
-            // Add 3rd active group
-            movq mm1, mm0         // mov updated Raws to mm1
-            psllq mm1, ShiftBpp   // shift data to position correctly
-            pand mm1, mm6         // mask to use only 3rd active group
-            add ebx, 8
-            paddb mm0, mm1
-            cmp ebx, MMXLength
-            movq [edi+ebx-8], mm0     // Write updated Raws back to array
-            // Prep for doing 1st add at top of loop
-            movq mm1, mm0
-            jb dsub3lp
-         } // end _asm block
-      }
-      break;
-
-      case 1:
-      {
-         // Placed here just in case this is a duplicate of the
-         // non-MMX code for the SUB filter in png_read_filter_row below
-         //
-         //         png_bytep rp;
-         //         png_bytep lp;
-         //         png_uint_32 i;
-         //         bpp = (row_info->pixel_depth + 7) >> 3;
-         //         for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
-         //            i < row_info->rowbytes; i++, rp++, lp++)
-         //      {
-         //            *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
-         //      }
-         _asm {
-            mov ebx, diff
-            mov edi, row
-            cmp ebx, FullLength
-            jnb dsub1end
-            mov esi, edi          // lp = row
-            xor eax, eax
-            add edi, bpp      // rp = row + bpp
-dsub1lp:
-            mov al, [esi+ebx]
-            add [edi+ebx], al
-            inc ebx
-            cmp ebx, FullLength
-            jb dsub1lp
-dsub1end:
-         } // end _asm block
-      }
-      return;
-
-      case 6:
-      case 7:
-      case 4:
-      case 5:
-      {
-         ShiftBpp.use = bpp << 3;
-         ShiftRem.use = 64 - ShiftBpp.use;
-         _asm {
-            mov edi, row
-            mov ebx, diff
-            mov esi, edi               // lp = row
-            add edi, bpp           // rp = row + bpp
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm1, [edi+ebx-8]
-dsub4lp:
-            psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
-                          // no need for mask; shift clears inactive bytes
-            movq mm0, [edi+ebx]
-            paddb mm0, mm1
-            // Add 2nd active group
-            movq mm1, mm0          // mov updated Raws to mm1
-            psllq mm1, ShiftBpp    // shift data to position correctly
-                                   // there is no need for any mask
-                                   // since shift clears inactive bits/bytes
-            add ebx, 8
-            paddb mm0, mm1
-            cmp ebx, MMXLength
-            movq [edi+ebx-8], mm0
-            movq mm1, mm0          // Prep for doing 1st add at top of loop
-            jb dsub4lp
-         } // end _asm block
-      }
-      break;
-
-      case 2:
-      {
-         ActiveMask.use  = 0x00000000ffff0000;
-         ShiftBpp.use = 16;       // == 2 * 8
-         ShiftRem.use = 48;       // == 64 - 16
-         _asm {
-            movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
-            mov ebx, diff
-            movq mm6, mm7
-            mov edi, row
-            psllq mm6, ShiftBpp     // Move mask in mm6 to cover 3rd active
-                                    //  byte group
-            mov esi, edi            // lp = row
-            movq mm5, mm6
-            add edi, bpp            // rp = row + bpp
-            psllq mm5, ShiftBpp     // Move mask in mm5 to cover 4th active
-                                    //  byte group
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm1, [edi+ebx-8]
-dsub2lp:
-            // Add 1st active group
-            psrlq mm1, ShiftRem     // Shift data for adding 1st bpp bytes
-                                    // no need for mask; shift clears inactive
-                                    //  bytes
-            movq mm0, [edi+ebx]
-            paddb mm0, mm1
-            // Add 2nd active group
-            movq mm1, mm0           // mov updated Raws to mm1
-            psllq mm1, ShiftBpp     // shift data to position correctly
-            pand mm1, mm7           // mask to use only 2nd active group
-            paddb mm0, mm1
-            // Add 3rd active group
-            movq mm1, mm0           // mov updated Raws to mm1
-            psllq mm1, ShiftBpp     // shift data to position correctly
-            pand mm1, mm6           // mask to use only 3rd active group
-            paddb mm0, mm1
-            // Add 4th active group
-            movq mm1, mm0           // mov updated Raws to mm1
-            psllq mm1, ShiftBpp     // shift data to position correctly
-            pand mm1, mm5           // mask to use only 4th active group
-            add ebx, 8
-            paddb mm0, mm1
-            cmp ebx, MMXLength
-            movq [edi+ebx-8], mm0   // Write updated Raws back to array
-            movq mm1, mm0           // Prep for doing 1st add at top of loop
-            jb dsub2lp
-         } // end _asm block
-      }
-      break;
-      case 8:
-      {
-         _asm {
-            mov edi, row
-            mov ebx, diff
-            mov esi, edi            // lp = row
-            add edi, bpp            // rp = row + bpp
-            mov ecx, MMXLength
-            movq mm7, [edi+ebx-8]   // PRIME the pump (load the first
-                                    // Raw(x-bpp) data set
-            and ecx, 0x0000003f     // calc bytes over mult of 64
-dsub8lp:
-            movq mm0, [edi+ebx]     // Load Sub(x) for 1st 8 bytes
-            paddb mm0, mm7
-            movq mm1, [edi+ebx+8]   // Load Sub(x) for 2nd 8 bytes
-            movq [edi+ebx], mm0    // Write Raw(x) for 1st 8 bytes
-                                   // Now mm0 will be used as Raw(x-bpp) for
-                                   // the 2nd group of 8 bytes.  This will be
-                                   // repeated for each group of 8 bytes with
-                                   // the 8th group being used as the Raw(x-bpp)
-                                   // for the 1st group of the next loop.
-            paddb mm1, mm0
-            movq mm2, [edi+ebx+16]  // Load Sub(x) for 3rd 8 bytes
-            movq [edi+ebx+8], mm1   // Write Raw(x) for 2nd 8 bytes
-            paddb mm2, mm1
-            movq mm3, [edi+ebx+24]  // Load Sub(x) for 4th 8 bytes
-            movq [edi+ebx+16], mm2  // Write Raw(x) for 3rd 8 bytes
-            paddb mm3, mm2
-            movq mm4, [edi+ebx+32]  // Load Sub(x) for 5th 8 bytes
-            movq [edi+ebx+24], mm3  // Write Raw(x) for 4th 8 bytes
-            paddb mm4, mm3
-            movq mm5, [edi+ebx+40]  // Load Sub(x) for 6th 8 bytes
-            movq [edi+ebx+32], mm4  // Write Raw(x) for 5th 8 bytes
-            paddb mm5, mm4
-            movq mm6, [edi+ebx+48]  // Load Sub(x) for 7th 8 bytes
-            movq [edi+ebx+40], mm5  // Write Raw(x) for 6th 8 bytes
-            paddb mm6, mm5
-            movq mm7, [edi+ebx+56]  // Load Sub(x) for 8th 8 bytes
-            movq [edi+ebx+48], mm6  // Write Raw(x) for 7th 8 bytes
-            add ebx, 64
-            paddb mm7, mm6
-            cmp ebx, ecx
-            movq [edi+ebx-8], mm7   // Write Raw(x) for 8th 8 bytes
-            jb dsub8lp
-            cmp ebx, MMXLength
-            jnb dsub8lt8
-dsub8lpA:
-            movq mm0, [edi+ebx]
-            add ebx, 8
-            paddb mm0, mm7
-            cmp ebx, MMXLength
-            movq [edi+ebx-8], mm0   // use -8 to offset early add to ebx
-            movq mm7, mm0           // Move calculated Raw(x) data to mm1 to
-                                    // be the new Raw(x-bpp) for the next loop
-            jb dsub8lpA
-dsub8lt8:
-         } // end _asm block
-      }
-      break;
-
-      default:                // bpp greater than 8 bytes
-      {
-         _asm {
-            mov ebx, diff
-            mov edi, row
-            mov esi, edi           // lp = row
-            add edi, bpp           // rp = row + bpp
-dsubAlp:
-            movq mm0, [edi+ebx]
-            movq mm1, [esi+ebx]
-            add ebx, 8
-            paddb mm0, mm1
-            cmp ebx, MMXLength
-            movq [edi+ebx-8], mm0  // mov does not affect flags; -8 to offset
-                                   //  add ebx
-            jb dsubAlp
-         } // end _asm block
-      }
-      break;
-
-   } // end switch ( bpp )
-
-   _asm {
-        mov ebx, MMXLength
-        mov edi, row
-        cmp ebx, FullLength
-        jnb dsubend
-        mov esi, edi               // lp = row
-        xor eax, eax
-        add edi, bpp               // rp = row + bpp
-dsublp2:
-        mov al, [esi+ebx]
-        add [edi+ebx], al
-        inc ebx
-        cmp ebx, FullLength
-        jb dsublp2
-dsubend:
-        emms             // End MMX instructions; prep for possible FP instrs.
-   } // end _asm block
-}
-
-// Optimized code for PNG Up filter decoder
-void /* PRIVATE */
-png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
-   png_bytep prev_row)
-{
-   png_uint_32 len;
-   len  = row_info->rowbytes;       // # of bytes to filter
-   _asm {
-      mov edi, row
-      // get # of bytes to alignment
-      mov ecx, edi
-      xor ebx, ebx
-      add ecx, 0x7
-      xor eax, eax
-      and ecx, 0xfffffff8
-      mov esi, prev_row
-      sub ecx, edi
-      jz dupgo
-      // fix alignment
-duplp1:
-      mov al, [edi+ebx]
-      add al, [esi+ebx]
-      inc ebx
-      cmp ebx, ecx
-      mov [edi + ebx-1], al  // mov does not affect flags; -1 to offset inc ebx
-      jb duplp1
-dupgo:
-      mov ecx, len
-      mov edx, ecx
-      sub edx, ebx                  // subtract alignment fix
-      and edx, 0x0000003f           // calc bytes over mult of 64
-      sub ecx, edx                  // drop over bytes from length
-      // Unrolled loop - use all MMX registers and interleave to reduce
-      // number of branch instructions (loops) and reduce partial stalls
-duploop:
-      movq mm1, [esi+ebx]
-      movq mm0, [edi+ebx]
-      movq mm3, [esi+ebx+8]
-      paddb mm0, mm1
-      movq mm2, [edi+ebx+8]
-      movq [edi+ebx], mm0
-      paddb mm2, mm3
-      movq mm5, [esi+ebx+16]
-      movq [edi+ebx+8], mm2
-      movq mm4, [edi+ebx+16]
-      movq mm7, [esi+ebx+24]
-      paddb mm4, mm5
-      movq mm6, [edi+ebx+24]
-      movq [edi+ebx+16], mm4
-      paddb mm6, mm7
-      movq mm1, [esi+ebx+32]
-      movq [edi+ebx+24], mm6
-      movq mm0, [edi+ebx+32]
-      movq mm3, [esi+ebx+40]
-      paddb mm0, mm1
-      movq mm2, [edi+ebx+40]
-      movq [edi+ebx+32], mm0
-      paddb mm2, mm3
-      movq mm5, [esi+ebx+48]
-      movq [edi+ebx+40], mm2
-      movq mm4, [edi+ebx+48]
-      movq mm7, [esi+ebx+56]
-      paddb mm4, mm5
-      movq mm6, [edi+ebx+56]
-      movq [edi+ebx+48], mm4
-      add ebx, 64
-      paddb mm6, mm7
-      cmp ebx, ecx
-      movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
-                                     // -8 to offset add ebx
-      jb duploop
-
-      cmp edx, 0                     // Test for bytes over mult of 64
-      jz dupend
-
-
-      // 2 lines added by lcreeve at netins.net
-      // (mail 11 Jul 98 in png-implement list)
-      cmp edx, 8 //test for less than 8 bytes
-      jb duplt8
-
-
-      add ecx, edx
-      and edx, 0x00000007           // calc bytes over mult of 8
-      sub ecx, edx                  // drop over bytes from length
-      jz duplt8
-      // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
-duplpA:
-      movq mm1, [esi+ebx]
-      movq mm0, [edi+ebx]
-      add ebx, 8
-      paddb mm0, mm1
-      cmp ebx, ecx
-      movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
-      jb duplpA
-      cmp edx, 0            // Test for bytes over mult of 8
-      jz dupend
-duplt8:
-      xor eax, eax
-      add ecx, edx          // move over byte count into counter
-      // Loop using x86 registers to update remaining bytes
-duplp2:
-      mov al, [edi + ebx]
-      add al, [esi + ebx]
-      inc ebx
-      cmp ebx, ecx
-      mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
-      jb duplp2
-dupend:
-      // Conversion of filtered row completed
-      emms          // End MMX instructions; prep for possible FP instrs.
-   } // end _asm block
-}
-
-
-// Optimized png_read_filter_row routines
-void /* PRIVATE */
-png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
-   row, png_bytep prev_row, int filter)
-{
-#ifdef PNG_DEBUG
-   char filnm[10];
-#endif
-
-   if (mmx_supported == 2) {
-#if !defined(PNG_1_0_X)
-       /* this should have happened in png_init_mmx_flags() already */
-       png_warning(png_ptr, "asm_flags may not have been initialized");
-#endif
-       png_mmx_support();
-   }
-
-#ifdef PNG_DEBUG
-   png_debug(1, "in png_read_filter_row\n");
-   switch (filter)
-   {
-      case 0: png_snprintf(filnm, 10, "none");
-         break;
-#if !defined(PNG_1_0_X)
-      case 1: png_snprintf(filnm, 10, "sub-%s",
-        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
-         break;
-      case 2: png_snprintf(filnm, 10, "up-%s",
-        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
-         break;
-      case 3: png_snprintf(filnm, 10, "avg-%s",
-        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
-         break;
-      case 4: png_snprintf(filnm, 10, "Paeth-%s",
-        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
-         break;
-#else
-      case 1: png_snprintf(filnm, 10, "sub");
-         break;
-      case 2: png_snprintf(filnm, 10, "up");
-         break;
-      case 3: png_snprintf(filnm, 10, "avg");
-         break;
-      case 4: png_snprintf(filnm, 10, "Paeth");
-         break;
-#endif
-      default: png_snprintf(filnm, 10, "unknw");
-         break;
-   }
-   png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
-   png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
-      (int)((row_info->pixel_depth + 7) >> 3));
-   png_debug1(0,"len=%8d, ", row_info->rowbytes);
-#endif /* PNG_DEBUG */
-
-   switch (filter)
-   {
-      case PNG_FILTER_VALUE_NONE:
-         break;
-
-      case PNG_FILTER_VALUE_SUB:
-      {
-#if !defined(PNG_1_0_X)
-         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
-             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-         if (mmx_supported)
-#endif
-         {
-            png_read_filter_row_mmx_sub(row_info, row);
-         }
-         else
-         {
-            png_uint_32 i;
-            png_uint_32 istop = row_info->rowbytes;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_bytep rp = row + bpp;
-            png_bytep lp = row;
-
-            for (i = bpp; i < istop; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
-               rp++;
-            }
-         }
-         break;
-      }
-
-      case PNG_FILTER_VALUE_UP:
-      {
-#if !defined(PNG_1_0_X)
-         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
-             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-         if (mmx_supported)
-#endif
-         {
-            png_read_filter_row_mmx_up(row_info, row, prev_row);
-         }
-         else
-         {
-            png_uint_32 i;
-            png_uint_32 istop = row_info->rowbytes;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-
-            for (i = 0; i < istop; ++i)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
-               rp++;
-            }
-         }
-         break;
-      }
-
-      case PNG_FILTER_VALUE_AVG:
-      {
-#if !defined(PNG_1_0_X)
-         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
-             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-         if (mmx_supported)
-#endif
-         {
-            png_read_filter_row_mmx_avg(row_info, row, prev_row);
-         }
-         else
-         {
-            png_uint_32 i;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-            png_bytep lp = row;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_uint_32 istop = row_info->rowbytes - bpp;
-
-            for (i = 0; i < bpp; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) +
-                  ((int)(*pp++) >> 1)) & 0xff);
-               rp++;
-            }
-
-            for (i = 0; i < istop; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) +
-                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
-               rp++;
-            }
-         }
-         break;
-      }
-
-      case PNG_FILTER_VALUE_PAETH:
-      {
-#if !defined(PNG_1_0_X)
-         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
-             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-         if (mmx_supported)
-#endif
-         {
-            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
-         }
-         else
-         {
-            png_uint_32 i;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-            png_bytep lp = row;
-            png_bytep cp = prev_row;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_uint_32 istop=row_info->rowbytes - bpp;
-
-            for (i = 0; i < bpp; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
-               rp++;
-            }
-
-            for (i = 0; i < istop; i++)   // use leftover rp,pp
-            {
-               int a, b, c, pa, pb, pc, p;
-
-               a = *lp++;
-               b = *pp++;
-               c = *cp++;
-
-               p = b - c;
-               pc = a - c;
-
-#ifdef PNG_USE_ABS
-               pa = abs(p);
-               pb = abs(pc);
-               pc = abs(p + pc);
-#else
-               pa = p < 0 ? -p : p;
-               pb = pc < 0 ? -pc : pc;
-               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
-#endif
-
-               /*
-                  if (pa <= pb && pa <= pc)
-                     p = a;
-                  else if (pb <= pc)
-                     p = b;
-                  else
-                     p = c;
-                */
-
-               p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
-
-               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
-               rp++;
-            }
-         }
-         break;
-      }
-
-      default:
-         png_warning(png_ptr, "Ignoring bad row filter type");
-         *row=0;
-         break;
-   }
-}
-
-#endif /* PNG_MMX_CODE_SUPPORTED && PNG_USE_PNGVCRD */
+/* pnggvrd.c was removed from libpng-1.2.20. */
diff --git a/pngwrite.c b/pngwrite.c
index 8d5b98acf..c6df1ef34 100644
--- a/pngwrite.c
+++ b/pngwrite.c
@@ -454,14 +454,6 @@ png_create_write_struct_2(png_const_charp user_png_ver, png_voidp error_ptr,
    if (png_ptr == NULL)
       return (NULL);
 
-#if !defined(PNG_1_0_X)
-#ifdef PNG_ASSEMBLER_CODE_SUPPORTED
-#ifdef PNG_MMX_CODE_SUPPORTED
-   png_init_mmx_flags(png_ptr);   /* 1.2.0 addition */
-#endif
-#endif
-#endif /* PNG_1_0_X */
-
    /* added at libpng-1.2.6 */
 #ifdef PNG_SET_USER_LIMITS_SUPPORTED
    png_ptr->user_width_max=PNG_USER_WIDTH_MAX;
@@ -670,14 +662,6 @@ png_write_init_3(png_structpp ptr_ptr, png_const_charp user_png_ver,
    png_ptr->user_height_max=PNG_USER_HEIGHT_MAX;
 #endif
 
-#if !defined(PNG_1_0_X)
-#ifdef PNG_ASSEMBLER_CODE_SUPPORTED
-#ifdef PNG_MMX_CODE_SUPPORTED
-   png_init_mmx_flags(png_ptr);   /* 1.2.0 addition */
-#endif
-#endif
-#endif /* PNG_1_0_X */
-
 #ifdef PNG_SETJMP_SUPPORTED
    /* restore jump buffer */
    png_memcpy(png_ptr->jmpbuf, tmp_jmp, png_sizeof (jmp_buf));
diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt
index 47c57a26d..b948607e2 100644
--- a/scripts/CMakeLists.txt
+++ b/scripts/CMakeLists.txt
@@ -62,8 +62,6 @@ else(NOT WIN32)
  endif("$ENV{${TEXT}}" STREQUAL "")
 endif(NOT WIN32)
 
-option(PNG_MMX "Use MMX assembler code (x86 only)" ${png_asm_tmp})
-
 # SET LIBNAME
 # msvc does not append 'lib' - do it here to have consistent name
 if(MSVC)
@@ -101,21 +99,8 @@ set(pngtest_sources
 # SOME NEEDED DEFINITIONS
 add_definitions(-DZLIB_DLL)
 
-if(PNG_MMX)
- if(MSVC)
-  add_definitions(-DPNG_NO_MODULEDEF -D_CRT_SECURE_NO_DEPRECATE)
-  set(libpng_sources ${libpng_sources}
-          pngvcrd.c
-  )
- else(MSVC)
-  set(libpng_sources ${libpng_sources}
-          pnggccrd.c
-  )
- endif(MSVC)
-else(PNG_MMX)
-  add_definitions(-DLIBPNG_NO_MMX)
-  add_definitions(-DPNG_NO_MMX_CODE)
-endif(PNG_MMX)
+add_definitions(-DLIBPNG_NO_MMX)
+add_definitions(-DPNG_NO_MMX_CODE)
 
 if(PNG_CONSOLE_IO_SUPPORTED)
  add_definitions(-DPNG_CONSOLE_IO_SUPPORTED)
@@ -180,7 +165,7 @@ configure_file(${PNG_SOURCE_DIR}/scripts/libpng-config.in
 
 # SET UP LINKS
 set_target_properties(${PNG_LIB_NAME} PROPERTIES
-#    VERSION 0.${PNGLIB_RELEASE}.1.2.20rc3
+#    VERSION 0.${PNGLIB_RELEASE}.1.2.20rc4
      VERSION 0.${PNGLIB_RELEASE}.0
      SOVERSION 0
      CLEAN_DIRECT_OUTPUT 1)
diff --git a/scripts/libpng-config-head.in b/scripts/libpng-config-head.in
index 0289369ff..8f8e29b56 100755
--- a/scripts/libpng-config-head.in
+++ b/scripts/libpng-config-head.in
@@ -8,7 +8,7 @@
 
 # Modeled after libxml-config.
 
-version=1.2.20rc3
+version=1.2.20rc4
 prefix=""
 libdir=""
 libs=""
diff --git a/scripts/libpng.pc-configure.in b/scripts/libpng.pc-configure.in
index 03e08e20e..ab39e205b 100644
--- a/scripts/libpng.pc-configure.in
+++ b/scripts/libpng.pc-configure.in
@@ -5,6 +5,6 @@ includedir=@includedir@/libpng12
 
 Name: libpng
 Description: Loads and saves PNG files
-Version: 1.2.20rc3
+Version: 1.2.20rc4
 Libs: -L${libdir} -lpng12
 Cflags: -I${includedir} @LIBPNG_NO_MMX@
diff --git a/scripts/libpng.pc.in b/scripts/libpng.pc.in
index d3e688d9e..d83e2d8b3 100644
--- a/scripts/libpng.pc.in
+++ b/scripts/libpng.pc.in
@@ -5,6 +5,6 @@ includedir=@includedir@/libpng12
 
 Name: libpng
 Description: Loads and saves PNG files
-Version: 1.2.20rc3
+Version: 1.2.20rc4
 Libs: -L${libdir} -lpng12
 Cflags: -I${includedir}
diff --git a/scripts/makefile.32sunu b/scripts/makefile.32sunu
index fb4dc4598..125599888 100644
--- a/scripts/makefile.32sunu
+++ b/scripts/makefile.32sunu
@@ -8,7 +8,7 @@
 # Library name:
 LIBNAME=libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.64sunu b/scripts/makefile.64sunu
index 0b8564f78..aed69ed46 100644
--- a/scripts/makefile.64sunu
+++ b/scripts/makefile.64sunu
@@ -8,7 +8,7 @@
 # Library name:
 LIBNAME=libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.aix b/scripts/makefile.aix
index 1d31f0d07..f32233f23 100644
--- a/scripts/makefile.aix
+++ b/scripts/makefile.aix
@@ -20,7 +20,7 @@ LN_SF = ln -f -s
 
 LIBNAME=libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 prefix=/usr/local
diff --git a/scripts/makefile.beos b/scripts/makefile.beos
index d59523378..15449c434 100644
--- a/scripts/makefile.beos
+++ b/scripts/makefile.beos
@@ -8,7 +8,7 @@
 # Library name:
 LIBNAME=libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.cygwin b/scripts/makefile.cygwin
index 701e6b12b..2812ac25d 100644
--- a/scripts/makefile.cygwin
+++ b/scripts/makefile.cygwin
@@ -31,9 +31,6 @@ endif
 
 DESTDIR=
 
-# To disable assembler optimizations, add '-DPNG_NO_MMX_CODE' to
-# $CFLAGS.  To enable, add pnggccrd.o to the dependencies.
-
 CC=gcc
 ifdef MINGW
 MINGW_CCFLAGS=-mno-cygwin -I/usr/include/mingw
@@ -77,7 +74,7 @@ CFLAGS= $(strip $(MINGW_CCFLAGS) $(addprefix -I,$(ZLIBINC)) \
 LIBNAME = libpng12
 PNGMAJ = 0
 CYGDLL = 12
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 SHAREDLIB=cygpng$(CYGDLL).dll
@@ -112,7 +109,7 @@ DL =$(D)$(LIBPATH)
 
 OBJS = png.o pngset.o pngget.o pngrutil.o pngtrans.o pngwutil.o \
 	pngread.o pngrio.o pngwio.o pngwrite.o pngrtran.o \
-	pngwtran.o pngmem.o pngerror.o pngpread.o # pnggccrd.o
+	pngwtran.o pngmem.o pngerror.o pngpread.o
 
 OBJSDLL = $(OBJS:.o=.pic.o)
 
@@ -156,20 +153,6 @@ shared: all-shared
 all-static: $(STATLIB) pngtest-stat$(EXE)
 all-shared: $(SHAREDLIB) pngtest$(EXE)
 
-# pnggccrd.o: pnggccrd.c png.h pngconf.h
-#	@echo ""
-#	@echo '    You can ignore the "control reaches end of non-void function"'
-#	@echo '    warning and "<variable> defined but not used" warnings:'
-#	@echo ""
-#	$(CC) -c $(CFLAGS) -o $@ $<
-
-# pnggccrd.pic.o:	pnggccrd.c png.h pngconf.h
-#	@echo ""
-#	@echo '    You can ignore the "control reaches end of non-void function"'
-#	@echo '    warning and "<variable> defined but not used" warnings:'
-#	@echo ""
-#	$(CC) -c $(CFLAGS) -DPNG_BUILD_DLL -o $@ $<
-
 $(STATLIB): $(OBJS)
 	ar rc $@ $(OBJS)
 	$(RANLIB) $@
diff --git a/scripts/makefile.darwin b/scripts/makefile.darwin
index f84c07070..2a27d1f74 100644
--- a/scripts/makefile.darwin
+++ b/scripts/makefile.darwin
@@ -19,7 +19,7 @@ ZLIBINC=../zlib
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.dec b/scripts/makefile.dec
index b18730fa7..a9d243e14 100644
--- a/scripts/makefile.dec
+++ b/scripts/makefile.dec
@@ -5,7 +5,7 @@
 
 # Library name:
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 LIBNAME = libpng12
 
diff --git a/scripts/makefile.elf b/scripts/makefile.elf
index d7f49bedd..4d8e04c45 100644
--- a/scripts/makefile.elf
+++ b/scripts/makefile.elf
@@ -12,7 +12,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.gcmmx b/scripts/makefile.gcmmx
index a99bef585..1bc5215b7 100644
--- a/scripts/makefile.gcmmx
+++ b/scripts/makefile.gcmmx
@@ -7,8 +7,6 @@
 
 # CAUTION: Do not use this makefile with gcc versions 2.7.2.2 and earlier.
 
-# WARNING: The assembler code in pnggccrd.c may not be thread safe.
-
 # NOTE:  When testing MMX performance on a multitasking system, make sure
 #        there are no floating-point programs (e.g., SETI@Home) running in
 #        the background!  Context switches between MMX and FPU are expensive.
@@ -16,7 +14,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
@@ -98,7 +96,7 @@ DM=$(DESTDIR)$(MANPATH)
 
 OBJS = png.o pngset.o pngget.o pngrutil.o pngtrans.o pngwutil.o \
 	pngread.o pngrio.o pngwio.o pngwrite.o pngrtran.o \
-	pngwtran.o pngmem.o pngerror.o pngpread.o pnggccrd.o
+	pngwtran.o pngmem.o pngerror.o pngpread.o
 
 OBJSDLL = $(OBJS:.o=.pic.o)
 
@@ -127,12 +125,6 @@ libpng-config:
 	cat scripts/libpng-config-body.in ) > libpng-config
 	chmod +x libpng-config
 
-pnggccrd.o:	pnggccrd.c png.h pngconf.h
-	$(CC) -c $(CFLAGS) -o $@ $*.c
-
-pnggccrd.pic.o:	pnggccrd.c png.h pngconf.h
-	$(CC) -c $(CFLAGS) -fPIC -o $@ pnggccrd.c
-
 $(LIBSO): $(LIBSOMAJ)
 	$(LN_SF) $(LIBSOMAJ) $(LIBSO)
 
diff --git a/scripts/makefile.hp64 b/scripts/makefile.hp64
index 646661216..38c896cdb 100644
--- a/scripts/makefile.hp64
+++ b/scripts/makefile.hp64
@@ -18,7 +18,7 @@ ZLIBINC=/opt/zlib/include
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.hpgcc b/scripts/makefile.hpgcc
index 3cfa3e3fa..34053ce72 100644
--- a/scripts/makefile.hpgcc
+++ b/scripts/makefile.hpgcc
@@ -8,7 +8,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.hpux b/scripts/makefile.hpux
index 543a14da2..d92222d59 100644
--- a/scripts/makefile.hpux
+++ b/scripts/makefile.hpux
@@ -18,7 +18,7 @@ ZLIBINC=/opt/zlib/include
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.intel b/scripts/makefile.intel
index a155ddd58..b0b523af0 100644
--- a/scripts/makefile.intel
+++ b/scripts/makefile.intel
@@ -36,7 +36,7 @@ O=.obj
 
 OBJS=png$(O) pngset$(O) pngget$(O) pngrutil$(O) pngtrans$(O) pngwutil$(O) \
 pngmem$(O) pngpread$(O) pngread$(O) pngerror$(O) pngwrite$(O) \
-pngrtran$(O) pngwtran$(O) pngrio$(O) pngwio$(O) pngvcrd$(O) pnggccrd$(O)
+pngrtran$(O) pngwtran$(O) pngrio$(O) pngwio$(O)
 
 all: test
 
@@ -61,12 +61,6 @@ pngrtran$(O): png.h pngconf.h
 pngrutil$(O): png.h pngconf.h
 	$(CC) $(CFLAGS) $*.c $(ERRFILE)
 
-pnggccrd$(O): png.h pngconf.h
-	$(CC) $(CFLAGS) $*.c $(ERRFILE)
-
-pngvcrd$(O): png.h pngconf.h
-	$(CC) $(CFLAGS) $*.c $(ERRFILE)
-
 pngerror$(O): png.h pngconf.h
 	$(CC) $(CFLAGS) $*.c $(ERRFILE)
 
diff --git a/scripts/makefile.linux b/scripts/makefile.linux
index 9ee81f66f..2c83c21f2 100644
--- a/scripts/makefile.linux
+++ b/scripts/makefile.linux
@@ -6,7 +6,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.mingw b/scripts/makefile.mingw
index 09faefd91..43963e850 100644
--- a/scripts/makefile.mingw
+++ b/scripts/makefile.mingw
@@ -39,10 +39,6 @@ RANLIB=ranlib
 
 MKDIR_P=/bin/mkdir -pv
 
-
-# To disable assembler optimizations, add '-DPNG_NO_MMX_CODE' to
-# $CFLAGS.  To enable them, add pnggccrd.o to the dependencies.
-
 # Where "make install" puts libpng*.a, *png*.dll, png.h, and pngconf.h
 ifndef prefix
 prefix=/usr
@@ -78,7 +74,7 @@ CFLAGS= $(strip $(MINGW_CCFLAGS) $(addprefix -I,$(ZLIBINC)) \
 LIBNAME = libpng12
 PNGMAJ = 0
 MINGDLL = 12
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 SHAREDLIB=libpng$(MINGDLL).dll
@@ -109,7 +105,7 @@ DL =$(D)$(LIBPATH)
 
 OBJS = png.o pngset.o pngget.o pngrutil.o pngtrans.o pngwutil.o \
 	pngread.o pngrio.o pngwio.o pngwrite.o pngrtran.o \
-	pngwtran.o pngmem.o pngerror.o pngpread.o # pnggccrd.o
+	pngwtran.o pngmem.o pngerror.o pngpread.o
 
 OBJSDLL = $(OBJS:.o=.pic.o)
 
@@ -282,7 +278,6 @@ pngwrite.o pngwrite.pic.o:	png.h pngconf.h pngwrite.c
 pngwtran.o pngwtran.pic.o:	png.h pngconf.h pngwtran.c
 pngwutil.o pngwutil.pic.o:	png.h pngconf.h pngwutil.c
 pngpread.o pngpread.pic.o:	png.h pngconf.h pngpread.c
-# pnggccrd.o pnggccrd.pic.o:	png.h pngconf.h pnggccrd.c
 
 pngtest.o pngtest.pic.o:	png.h pngconf.h pngtest.c
 
diff --git a/scripts/makefile.ne12bsd b/scripts/makefile.ne12bsd
index 422e5167d..de9c56a85 100644
--- a/scripts/makefile.ne12bsd
+++ b/scripts/makefile.ne12bsd
@@ -14,10 +14,10 @@ INCSDIR=${LOCALBASE}/include/libpng12
 
 LIB=	png12
 SHLIB_MAJOR=	0
-SHLIB_MINOR=	1.2.20rc3
-SRCS=	pnggccrd.c png.c pngset.c pngget.c pngrutil.c pngtrans.c pngwutil.c \
-		pngread.c pngrio.c pngwio.c pngwrite.c pngrtran.c \
-		pngwtran.c pngmem.c pngerror.c pngpread.c
+SHLIB_MINOR=	1.2.20rc4
+SRCS=	png.c pngset.c pngget.c pngrutil.c pngtrans.c pngwutil.c \
+	pngread.c pngrio.c pngwio.c pngwrite.c pngrtran.c \
+	pngwtran.c pngmem.c pngerror.c pngpread.c
 INCS=	png.h pngconf.h
 MAN=	libpng.3 libpngpf.3 png.5
 
diff --git a/scripts/makefile.netbsd b/scripts/makefile.netbsd
index c1219e2ad..26be87f11 100644
--- a/scripts/makefile.netbsd
+++ b/scripts/makefile.netbsd
@@ -14,10 +14,10 @@ INCSDIR=${LOCALBASE}/include/libpng
 
 LIB=	png
 SHLIB_MAJOR=	3
-SHLIB_MINOR=	1.2.20rc3
-SRCS=	pnggccrd.c png.c pngset.c pngget.c pngrutil.c pngtrans.c pngwutil.c \
-		pngread.c pngrio.c pngwio.c pngwrite.c pngrtran.c \
-		pngwtran.c pngmem.c pngerror.c pngpread.c
+SHLIB_MINOR=	1.2.20rc4
+SRCS=	png.c pngset.c pngget.c pngrutil.c pngtrans.c pngwutil.c \
+	pngread.c pngrio.c pngwio.c pngwrite.c pngrtran.c \
+	pngwtran.c pngmem.c pngerror.c pngpread.c
 INCS=	png.h pngconf.h
 MAN=	libpng.3 libpngpf.3 png.5
 
diff --git a/scripts/makefile.nommx b/scripts/makefile.nommx
index a2d840bf7..e1b8c01d0 100644
--- a/scripts/makefile.nommx
+++ b/scripts/makefile.nommx
@@ -7,7 +7,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.openbsd b/scripts/makefile.openbsd
index 9e85b6ff3..912596246 100644
--- a/scripts/makefile.openbsd
+++ b/scripts/makefile.openbsd
@@ -8,10 +8,10 @@ LIBDIR=	${PREFIX}/lib
 MANDIR= ${PREFIX}/man/cat
 
 SHLIB_MAJOR=	0
-SHLIB_MINOR=	1.2.20rc3
+SHLIB_MINOR=	1.2.20rc4
 
 LIB=	png
-SRCS=	png.c pngerror.c pnggccrd.c pngget.c pngmem.c pngpread.c \
+SRCS=	png.c pngerror.c pngget.c pngmem.c pngpread.c \
 	pngread.c pngrio.c pngrtran.c pngrutil.c pngset.c pngtrans.c \
 	pngwio.c pngwrite.c pngwtran.c pngwutil.c
 
diff --git a/scripts/makefile.sco b/scripts/makefile.sco
index daa5661d3..3827f5158 100644
--- a/scripts/makefile.sco
+++ b/scripts/makefile.sco
@@ -9,7 +9,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.sggcc b/scripts/makefile.sggcc
index 94f428cd6..58f7ec6f7 100644
--- a/scripts/makefile.sggcc
+++ b/scripts/makefile.sggcc
@@ -6,7 +6,7 @@
 # Library name:
 LIBNAME=libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.sgi b/scripts/makefile.sgi
index 371fc32ed..64a659506 100644
--- a/scripts/makefile.sgi
+++ b/scripts/makefile.sgi
@@ -6,7 +6,7 @@
 # Library name:
 LIBNAME=libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.so9 b/scripts/makefile.so9
index 4dace2907..3b1a2864c 100644
--- a/scripts/makefile.so9
+++ b/scripts/makefile.so9
@@ -8,7 +8,7 @@
 
 # Library name:
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 LIBNAME = libpng12
 
diff --git a/scripts/makefile.solaris b/scripts/makefile.solaris
index daf96f877..01b10f491 100644
--- a/scripts/makefile.solaris
+++ b/scripts/makefile.solaris
@@ -8,7 +8,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.solaris-x86 b/scripts/makefile.solaris-x86
index 7a9fc164a..e4e2b74d3 100644
--- a/scripts/makefile.solaris-x86
+++ b/scripts/makefile.solaris-x86
@@ -8,7 +8,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
@@ -69,7 +69,7 @@ DM=$(DESTDIR)$(MANPATH)
 
 OBJS = png.o pngset.o pngget.o pngrutil.o pngtrans.o pngwutil.o \
 	pngread.o pngrio.o pngwio.o pngwrite.o pngrtran.o \
-	pngwtran.o pngmem.o pngerror.o pngpread.o pnggccrd.o
+	pngwtran.o pngmem.o pngerror.o pngpread.o
 
 OBJSDLL = $(OBJS:.o=.pic.o)
 
@@ -232,7 +232,6 @@ pngwio.o pngwio.pic.o: png.h pngconf.h
 pngmem.o pngmem.pic.o: png.h pngconf.h
 pngset.o pngset.pic.o: png.h pngconf.h
 pngget.o pngget.pic.o: png.h pngconf.h
-pnggccrd.o pnggccrd.pic.o: png.h pngconf.h
 pngread.o pngread.pic.o: png.h pngconf.h
 pngrtran.o pngrtran.pic.o: png.h pngconf.h
 pngrutil.o pngrutil.pic.o: png.h pngconf.h
diff --git a/scripts/makefile.vcawin32 b/scripts/makefile.vcawin32
index 4c2f3cc0e..99f543053 100644
--- a/scripts/makefile.vcawin32
+++ b/scripts/makefile.vcawin32
@@ -26,8 +26,7 @@ O=.obj
 OBJS1 = png$(O) pngerror$(O) pngget$(O) pngmem$(O) pngpread$(O)
 OBJS2 = pngread$(O) pngrio$(O) pngrtran$(O) pngrutil$(O) pngset$(O)
 OBJS3 = pngtrans$(O) pngwio$(O) pngwrite$(O) pngwtran$(O) pngwutil$(O)
-OBJS4 = pngvcrd$(O)
-OBJS  = $(OBJS1) $(OBJS2) $(OBJS3) $(OBJS4)
+OBJS  = $(OBJS1) $(OBJS2) $(OBJS3)
 
 # Targets
 all: libpng.lib
@@ -80,9 +79,6 @@ pngwtran$(O): png.h pngconf.h
 pngwutil$(O): png.h pngconf.h
 	$(CC) -c $(CFLAGS) $*.c $(ERRFILE)
 
-pngvcrd$(O): png.h pngconf.h
-	$(CC) -c $(CFLAGS) $*.c $(ERRFILE)
-
 libpng.lib: $(OBJS)
 	-$(RM) $@
 	$(AR) $(ARFLAGS) -out:$@ $(OBJS) $(ERRFILE)
diff --git a/scripts/pngos2.def b/scripts/pngos2.def
index d6074f364..4a6b3f1d0 100644
--- a/scripts/pngos2.def
+++ b/scripts/pngos2.def
@@ -2,7 +2,7 @@
 ; PNG.LIB module definition file for OS/2
 ;----------------------------------------
 
-; Version 1.2.20rc3
+; Version 1.2.20rc4
 
 LIBRARY		PNG
 DESCRIPTION	"PNG image compression library for OS/2"
diff --git a/scripts/pngw32.def b/scripts/pngw32.def
index 05a97cf92..b31fa79ea 100644
--- a/scripts/pngw32.def
+++ b/scripts/pngw32.def
@@ -5,7 +5,7 @@
 LIBRARY
 
 EXPORTS
-;Version 1.2.20rc3
+;Version 1.2.20rc4
   png_build_grayscale_palette  @1
   png_check_sig        @2
   png_chunk_error      @3
-- 
cgit v1.2.1