summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Glenn Randers-Pehrson <glennrp at users.sourceforge.net> 2007-06-28 16:04:25 -0500
committer Glenn Randers-Pehrson <glennrp at users.sourceforge.net> 2009-04-06 16:13:41 -0500
commit db40ca4acaec070e5628c2519599eb203a7f0287 (patch)
tree 6151c98d983730d0bc24334bff7921338c7de069
parent f456d0d6bb30b46a99b3a33872fec370d660381a (diff)
download libpng-db40ca4acaec070e5628c2519599eb203a7f0287.tar.gz
Imported from libpng-1.2.19beta19.tar [v1.2.19beta19]
-rw-r--r-- ANNOUNCE | 33
-rw-r--r-- CHANGES | 7
-rw-r--r-- INSTALL | 20
-rw-r--r-- KNOWNBUG | 10
-rw-r--r-- LICENSE | 4
-rw-r--r-- README | 14
-rw-r--r-- Y2KINFO | 4
-rwxr-xr-x configure | 22
-rw-r--r-- configure.ac | 4
-rw-r--r-- libpng-1.2.19beta19.txt (renamed from libpng-1.2.19beta18.txt) | 6
-rw-r--r-- libpng.3 | 18
-rw-r--r-- libpngpf.3 | 4
-rw-r--r-- png.5 | 2
-rw-r--r-- png.c | 6
-rw-r--r-- png.h | 22
-rw-r--r-- pngconf.h | 2
-rw-r--r-- pngerror.c | 2
-rw-r--r-- pnggccrd.c | 3263
-rw-r--r-- pngpread.c | 2
-rw-r--r-- pngread.c | 2
-rw-r--r-- pngrtran.c | 8
-rw-r--r-- pngrutil.c | 173
-rw-r--r-- pngtest.c | 2
-rw-r--r-- pngvcrd.c | 1088
-rw-r--r-- pngwutil.c | 2
-rw-r--r-- scripts/CMakeLists.txt | 2
-rwxr-xr-x scripts/libpng-config-head.in | 2
-rw-r--r-- scripts/libpng.pc-configure.in | 2
-rw-r--r-- scripts/libpng.pc.in | 2
-rw-r--r-- scripts/makefile.32sunu | 2
-rw-r--r-- scripts/makefile.64sunu | 2
-rw-r--r-- scripts/makefile.aix | 2
-rw-r--r-- scripts/makefile.beos | 2
-rw-r--r-- scripts/makefile.cygwin | 2
-rw-r--r-- scripts/makefile.darwin | 2
-rw-r--r-- scripts/makefile.dec | 2
-rw-r--r-- scripts/makefile.elf | 2
-rw-r--r-- scripts/makefile.gcmmx | 2
-rw-r--r-- scripts/makefile.hp64 | 2
-rw-r--r-- scripts/makefile.hpgcc | 2
-rw-r--r-- scripts/makefile.hpux | 2
-rw-r--r-- scripts/makefile.linux | 2
-rw-r--r-- scripts/makefile.mingw | 2
-rw-r--r-- scripts/makefile.ne12bsd | 2
-rw-r--r-- scripts/makefile.netbsd | 2
-rw-r--r-- scripts/makefile.nommx | 2
-rw-r--r-- scripts/makefile.openbsd | 2
-rw-r--r-- scripts/makefile.sco | 2
-rw-r--r-- scripts/makefile.sggcc | 2
-rw-r--r-- scripts/makefile.sgi | 2
-rw-r--r-- scripts/makefile.so9 | 2
-rw-r--r-- scripts/makefile.solaris | 2
-rw-r--r-- scripts/makefile.solaris-x86 | 2
-rw-r--r-- scripts/pngos2.def | 2
-rw-r--r-- scripts/pngw32.def | 2
55 files changed, 2386 insertions, 2394 deletions
diff --git a/ANNOUNCE b/ANNOUNCE
index edb5bc958..0eb6e9b78 100644
--- a/ANNOUNCE
+++ b/ANNOUNCE
@@ -1,5 +1,5 @@
-Libpng 1.2.19beta18 - June 23, 2007
+Libpng 1.2.19beta19 - June 28, 2007
This is not intended to be a public release. It will be replaced
within a few weeks by a public version or by another test version.
@@ -9,32 +9,32 @@ Files available for download:
Source files with LF line endings (for Unix/Linux) and with a
"configure" script
- libpng-1.2.19beta18.tar.gz
- libpng-1.2.19beta18.tar.bz2
+ libpng-1.2.19beta19.tar.gz
+ libpng-1.2.19beta19.tar.bz2
Source files with LF line endings (for Unix/Linux) without the
"configure" script
- libpng-1.2.19beta18-no-config.tar.gz
- libpng-1.2.19beta18-no-config.tar.bz2
+ libpng-1.2.19beta19-no-config.tar.gz
+ libpng-1.2.19beta19-no-config.tar.bz2
Source files with CRLF line endings (for Windows), without the
"configure" script
- lp1219b18.zip
- lp1219b18.tar.bz2
+ lp1219b19.zip
+ lp1219b19.tar.bz2
Project files
- libpng-1.2.19beta18-project-netware.zip
- libpng-1.2.19beta18-project-wince.zip
+ libpng-1.2.19beta19-project-netware.zip
+ libpng-1.2.19beta19-project-wince.zip
Other information:
- libpng-1.2.19beta18-README.txt
- libpng-1.2.19beta18-KNOWNBUGS.txt
- libpng-1.2.19beta18-LICENSE.txt
- libpng-1.2.19beta18-Y2K-compliance.txt
+ libpng-1.2.19beta19-README.txt
+ libpng-1.2.19beta19-KNOWNBUGS.txt
+ libpng-1.2.19beta19-LICENSE.txt
+ libpng-1.2.19beta19-Y2K-compliance.txt
Changes since the last public release (1.2.18):
@@ -113,6 +113,13 @@ version 1.2.19beta18 [June 23, 2007]
on pnggccrd.o from many makefiles.
Added sl and dylib to list of extensions be installed by Makefile.am
+version 1.2.19beta18 [June 22, 2007]
+ More cleanup of pnggccrd.c and pngrutil.c
+
+version 1.2.19beta19 [June 28, 2007]
+ Fixed testing PNG_RGB_TO_GRAY_ERR & PNG_RGB_TO_GRAY_WARN in pngrtran.c
+ More cleanup of pnggccrd.c and pngvcrd.c
+
Send comments/corrections/commendations to png-mng-implement at lists.sf.net
(subscription required; visit
diff --git a/CHANGES b/CHANGES
index e17bb2635..137bb9d6f 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1806,6 +1806,13 @@ version 1.2.19beta18 [June 23, 2007]
on pnggccrd.o from many makefiles.
Added sl and dylib to list of extensions be installed by Makefile.am
+version 1.2.19beta18 [June 22, 2007]
+ More cleanup of pnggccrd.c and pngrutil.c
+
+version 1.2.19beta19 [June 28, 2007]
+ Fixed testing PNG_RGB_TO_GRAY_ERR & PNG_RGB_TO_GRAY_WARN in pngrtran.c
+ More cleanup of pnggccrd.c and pngvcrd.c
+
Send comments/corrections/commendations to png-mng-implement at lists.sf.net
(subscription required; visit
https://lists.sourceforge.net/lists/listinfo/png-mng-implement
diff --git a/INSTALL b/INSTALL
index 51a79acfa..99031ceeb 100644
--- a/INSTALL
+++ b/INSTALL
@@ -1,5 +1,5 @@
-Installing libpng version 1.2.19beta18 - June 23, 2007
+Installing libpng version 1.2.19beta19 - June 28, 2007
On Unix/Linux and similar systems, you can simply type
@@ -44,7 +44,7 @@ to have access to the zlib.h and zconf.h include files that
correspond to the version of zlib that's installed.
You can rename the directories that you downloaded (they
-might be called "libpng-1.2.19beta18" or "lpng109" and "zlib-1.2.1"
+might be called "libpng-1.2.19beta19" or "lpng109" and "zlib-1.2.1"
or "zlib121") so that you have directories called "zlib" and "libpng".
Your directory structure should look like this:
@@ -101,14 +101,14 @@ include
CMakeLists.txt => "cmake" script
makefile.std => Generic UNIX makefile (cc, creates static libpng.a)
makefile.elf => Linux/ELF makefile symbol versioning,
- gcc, creates libpng12.so.0.1.2.19beta18)
+ gcc, creates libpng12.so.0.1.2.19beta19)
makefile.linux => Linux/ELF makefile
- (gcc, creates libpng12.so.0.1.2.19beta18)
+ (gcc, creates libpng12.so.0.1.2.19beta19)
makefile.gcmmx => Linux/ELF makefile
- (gcc, creates libpng12.so.0.1.2.19beta18,
+ (gcc, creates libpng12.so.0.1.2.19beta19,
uses assembler code tuned for Intel MMX platform)
makefile.nommx => Linux/ELF makefile
- (gcc, creates libpng12.so.0.1.2.19beta18
+ (gcc, creates libpng12.so.0.1.2.19beta19
does not use Intel MMX assembler code)
makefile.gcc => Generic makefile (gcc, creates static libpng.a)
makefile.knr => Archaic UNIX Makefile that converts files with
@@ -131,14 +131,14 @@ include
makefile.openbsd => OpenBSD makefile
makefile.sgi => Silicon Graphics IRIX makefile (cc, creates static lib)
makefile.sggcc => Silicon Graphics (gcc,
- creates libpng12.so.0.1.2.19beta18)
+ creates libpng12.so.0.1.2.19beta19)
makefile.sunos => Sun makefile
makefile.solaris => Solaris 2.X makefile (gcc,
- creates libpng12.so.0.1.2.19beta18)
+ creates libpng12.so.0.1.2.19beta19)
makefile.solaris-x86 => Solaris/intelMMX 2.X makefile (gcc,
- creates libpng12.so.0.1.2.19beta18)
+ creates libpng12.so.0.1.2.19beta19)
makefile.so9 => Solaris 9 makefile (gcc,
- creates libpng12.so.0.1.2.19beta18)
+ creates libpng12.so.0.1.2.19beta19)
makefile.32sunu => Sun Ultra 32-bit makefile
makefile.64sunu => Sun Ultra 64-bit makefile
makefile.sco => For SCO OSr5 ELF and Unixware 7 with Native cc
diff --git a/KNOWNBUG b/KNOWNBUG
index a0749519c..1ba1885e0 100644
--- a/KNOWNBUG
+++ b/KNOWNBUG
@@ -1,5 +1,5 @@
-Known bugs in libpng version 1.2.19beta18
+Known bugs in libpng version 1.2.19beta19
1. April 22, 2001: pnggccrd.c has been reported to crash on NetBSD when
reading interlaced PNG files, when assembler code is enabled but running
@@ -19,5 +19,11 @@ Known bugs in libpng version 1.2.19beta18
libpng12.so => libpng12.so.0.1.2.9betaN
that are generated by the custom makefiles.
- STATUS: For now, system library builders should use the custom makefiles.
+4. June 28, 2007: Pnggccrd.c has been reported to be unreliable on
+ some 64-bit platforms. It appears to build properly, but it does
+ not always read files correctly, leading to corrupted output or
+ to a crash of the application.
+
+ STATUS: Under investigation. A workaround is to use -DPNG_NO_MMX_CODE
+ on 64-bit platforms.
diff --git a/LICENSE b/LICENSE
index 9dc09d4f4..9390f38e8 100644
--- a/LICENSE
+++ b/LICENSE
@@ -8,7 +8,7 @@ COPYRIGHT NOTICE, DISCLAIMER, and LICENSE:
If you modify libpng you may insert additional notices immediately following
this sentence.
-libpng versions 1.2.6, August 15, 2004, through 1.2.19beta18, June 23, 2007, are
+libpng versions 1.2.6, August 15, 2004, through 1.2.19beta19, June 28, 2007, are
Copyright (c) 2004, 2006-2007 Glenn Randers-Pehrson, and are
distributed according to the same disclaimer and license as libpng-1.2.5
with the following individual added to the list of Contributing Authors
@@ -106,4 +106,4 @@ certification mark of the Open Source Initiative.
Glenn Randers-Pehrson
glennrp at users.sourceforge.net
-June 23, 2007
+June 28, 2007
diff --git a/README b/README
index d301106a5..df0ca89e3 100644
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-README for libpng version 1.2.19beta18 - June 23, 2007 (shared library 12.0)
+README for libpng version 1.2.19beta19 - June 28, 2007 (shared library 12.0)
See the note about version numbers near the top of png.h
See INSTALL for instructions on how to install libpng.
@@ -190,11 +190,11 @@ Files in this distribution:
descrip.mms => VMS makefile for MMS or MMK
makefile.std => Generic UNIX makefile (cc, creates static libpng.a)
makefile.elf => Linux/ELF makefile symbol versioning,
- gcc, creates libpng12.so.0.1.2.19beta18)
+ gcc, creates libpng12.so.0.1.2.19beta19)
makefile.linux => Linux/ELF makefile
- (gcc, creates libpng12.so.0.1.2.19beta18)
+ (gcc, creates libpng12.so.0.1.2.19beta19)
makefile.gcmmx => Linux/ELF makefile
- (gcc, creates libpng12.so.0.1.2.19beta18,
+ (gcc, creates libpng12.so.0.1.2.19beta19,
uses assembler code tuned for Intel MMX platform)
makefile.gcc => Generic makefile (gcc, creates static libpng.a)
makefile.knr => Archaic UNIX Makefile that converts files with
@@ -216,12 +216,12 @@ Files in this distribution:
makefile.openbsd => OpenBSD makefile
makefile.sgi => Silicon Graphics IRIX (cc, creates static lib)
makefile.sggcc => Silicon Graphics
- (gcc, creates libpng12.so.0.1.2.19beta18)
+ (gcc, creates libpng12.so.0.1.2.19beta19)
makefile.sunos => Sun makefile
makefile.solaris => Solaris 2.X makefile
- (gcc, creates libpng12.so.0.1.2.19beta18)
+ (gcc, creates libpng12.so.0.1.2.19beta19)
makefile.so9 => Solaris 9 makefile
- (gcc, creates libpng12.so.0.1.2.19beta18)
+ (gcc, creates libpng12.so.0.1.2.19beta19)
makefile.32sunu => Sun Ultra 32-bit makefile
makefile.64sunu => Sun Ultra 64-bit makefile
makefile.sco => For SCO OSr5 ELF and Unixware 7 with Native cc
diff --git a/Y2KINFO b/Y2KINFO
index afd5357b2..3f3fc9c8a 100644
--- a/Y2KINFO
+++ b/Y2KINFO
@@ -1,13 +1,13 @@
Y2K compliance in libpng:
=========================
- June 23, 2007
+ June 28, 2007
Since the PNG Development group is an ad-hoc body, we can't make
an official declaration.
This is your unofficial assurance that libpng from version 0.71 and
- upward through 1.2.19beta18 are Y2K compliant. It is my belief that earlier
+ upward through 1.2.19beta19 are Y2K compliant. It is my belief that earlier
versions were also Y2K compliant.
Libpng only has three year fields. One is a 2-byte unsigned integer
diff --git a/configure b/configure
index 614cc88f3..a2ae72f46 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.61 for libpng 1.2.19beta18.
+# Generated by GNU Autoconf 2.61 for libpng 1.2.19beta19.
#
# Report bugs to <png-mng-implement@lists.sourceforge.net>.
#
@@ -728,8 +728,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
# Identity of this package.
PACKAGE_NAME='libpng'
PACKAGE_TARNAME='libpng'
-PACKAGE_VERSION='1.2.19beta18'
-PACKAGE_STRING='libpng 1.2.19beta18'
+PACKAGE_VERSION='1.2.19beta19'
+PACKAGE_STRING='libpng 1.2.19beta19'
PACKAGE_BUGREPORT='png-mng-implement@lists.sourceforge.net'
ac_unique_file="pngget.c"
@@ -1405,7 +1405,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures libpng 1.2.19beta18 to adapt to many kinds of systems.
+\`configure' configures libpng 1.2.19beta19 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1475,7 +1475,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of libpng 1.2.19beta18:";;
+ short | recursive ) echo "Configuration of libpng 1.2.19beta19:";;
esac
cat <<\_ACEOF
@@ -1585,7 +1585,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-libpng configure 1.2.19beta18
+libpng configure 1.2.19beta19
generated by GNU Autoconf 2.61
Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1599,7 +1599,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by libpng $as_me 1.2.19beta18, which was
+It was created by libpng $as_me 1.2.19beta19, which was
generated by GNU Autoconf 2.61. Invocation command line was
$ $0 $@
@@ -2269,7 +2269,7 @@ fi
# Define the identity of the package.
PACKAGE='libpng'
- VERSION='1.2.19beta18'
+ VERSION='1.2.19beta19'
cat >>confdefs.h <<_ACEOF
@@ -2440,7 +2440,7 @@ fi
-PNGLIB_VERSION=1.2.19beta18
+PNGLIB_VERSION=1.2.19beta19
PNGLIB_MAJOR=1
PNGLIB_MINOR=2
PNGLIB_RELEASE=19
@@ -21119,7 +21119,7 @@ exec 6>&1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by libpng $as_me 1.2.19beta18, which was
+This file was extended by libpng $as_me 1.2.19beta19, which was
generated by GNU Autoconf 2.61. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -21172,7 +21172,7 @@ Report bugs to <bug-autoconf@gnu.org>."
_ACEOF
cat >>$CONFIG_STATUS <<_ACEOF
ac_cs_version="\\
-libpng config.status 1.2.19beta18
+libpng config.status 1.2.19beta19
configured by $0, generated by GNU Autoconf 2.61,
with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
diff --git a/configure.ac b/configure.ac
index 2af258115..9c89b3a5a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -18,12 +18,12 @@ AC_PREREQ(2.59)
dnl Version number stuff here:
-AC_INIT([libpng], [1.2.19beta18], [png-mng-implement@lists.sourceforge.net])
+AC_INIT([libpng], [1.2.19beta19], [png-mng-implement@lists.sourceforge.net])
AM_INIT_AUTOMAKE
dnl stop configure from automagically running automake
AM_MAINTAINER_MODE
-PNGLIB_VERSION=1.2.19beta18
+PNGLIB_VERSION=1.2.19beta19
PNGLIB_MAJOR=1
PNGLIB_MINOR=2
PNGLIB_RELEASE=19
diff --git a/libpng-1.2.19beta18.txt b/libpng-1.2.19beta19.txt
index 208149b2c..e623011ee 100644
--- a/libpng-1.2.19beta18.txt
+++ b/libpng-1.2.19beta19.txt
@@ -1,6 +1,6 @@
libpng.txt - A description on how to use and modify libpng
- libpng version 1.2.19beta18 - June 23, 2007
+ libpng version 1.2.19beta19 - June 28, 2007
Updated and distributed by Glenn Randers-Pehrson
<glennrp at users.sourceforge.net>
Copyright (c) 1998-2007 Glenn Randers-Pehrson
@@ -2921,13 +2921,13 @@ application:
IX. Y2K Compliance in libpng
-June 23, 2007
+June 28, 2007
Since the PNG Development group is an ad-hoc body, we can't make
an official declaration.
This is your unofficial assurance that libpng from version 0.71 and
-upward through 1.2.19beta18 are Y2K compliant. It is my belief that earlier
+upward through 1.2.19beta19 are Y2K compliant. It is my belief that earlier
versions were also Y2K compliant.
Libpng only has three year fields. One is a 2-byte unsigned integer that
diff --git a/libpng.3 b/libpng.3
index a9ce21821..d9ac651e8 100644
--- a/libpng.3
+++ b/libpng.3
@@ -1,6 +1,6 @@
-.TH LIBPNG 3 "June 23, 2007"
+.TH LIBPNG 3 "June 28, 2007"
.SH NAME
-libpng \- Portable Network Graphics (PNG) Reference Library 1.2.19beta18
+libpng \- Portable Network Graphics (PNG) Reference Library 1.2.19beta19
.SH SYNOPSIS
\fB
#include <png.h>\fP
@@ -410,7 +410,7 @@ Following is a copy of the libpng.txt file that accompanies libpng.
.SH LIBPNG.TXT
libpng.txt - A description on how to use and modify libpng
- libpng version 1.2.19beta18 - June 23, 2007
+ libpng version 1.2.19beta19 - June 28, 2007
Updated and distributed by Glenn Randers-Pehrson
<glennrp at users.sourceforge.net>
Copyright (c) 1998-2007 Glenn Randers-Pehrson
@@ -3331,13 +3331,13 @@ application:
.SH IX. Y2K Compliance in libpng
-June 23, 2007
+June 28, 2007
Since the PNG Development group is an ad-hoc body, we can't make
an official declaration.
This is your unofficial assurance that libpng from version 0.71 and
-upward through 1.2.19beta18 are Y2K compliant. It is my belief that earlier
+upward through 1.2.19beta19 are Y2K compliant. It is my belief that earlier
versions were also Y2K compliant.
Libpng only has three year fields. One is a 2-byte unsigned integer that
@@ -3526,7 +3526,7 @@ the first widely used release:
1.2.17 13 10217 12.so.0.17[.0]
1.0.26 10 10026 10.so.0.26[.0]
1.2.18 13 10218 12.so.0.18[.0]
- 1.2.19beta1-18 13 10219 12.so.0.19[.0]
+ 1.2.19beta1-19 13 10219 12.so.0.19[.0]
Henceforth the source version will match the shared-library minor
and patch numbers; the shared-library major version number will be
@@ -3582,7 +3582,7 @@ possible without all of you.
Thanks to Frank J. T. Wojcik for helping with the documentation.
-Libpng version 1.2.19beta18 - June 23, 2007:
+Libpng version 1.2.19beta19 - June 28, 2007:
Initially created in 1995 by Guy Eric Schalnat, then of Group 42, Inc.
Currently maintained by Glenn Randers-Pehrson (glennrp at users.sourceforge.net).
@@ -3603,7 +3603,7 @@ included in the libpng distribution, the latter shall prevail.)
If you modify libpng you may insert additional notices immediately following
this sentence.
-libpng versions 1.2.6, August 15, 2004, through 1.2.19beta18, June 23, 2007, are
+libpng versions 1.2.6, August 15, 2004, through 1.2.19beta19, June 28, 2007, are
Copyright (c) 2004,2006-2007 Glenn Randers-Pehrson, and are
distributed according to the same disclaimer and license as libpng-1.2.5
with the following individual added to the list of Contributing Authors
@@ -3702,7 +3702,7 @@ certification mark of the Open Source Initiative.
Glenn Randers-Pehrson
glennrp at users.sourceforge.net
-June 23, 2007
+June 28, 2007
.\" end of man page
diff --git a/libpngpf.3 b/libpngpf.3
index 005b3c5fb..59f0d5e6b 100644
--- a/libpngpf.3
+++ b/libpngpf.3
@@ -1,6 +1,6 @@
-.TH LIBPNGPF 3 "June 23, 2007"
+.TH LIBPNGPF 3 "June 28, 2007"
.SH NAME
-libpng \- Portable Network Graphics (PNG) Reference Library 1.2.19beta18
+libpng \- Portable Network Graphics (PNG) Reference Library 1.2.19beta19
(private functions)
.SH SYNOPSIS
\fB#include <png.h>\fP
diff --git a/png.5 b/png.5
index 2e8c42bc3..16c3e3ac4 100644
--- a/png.5
+++ b/png.5
@@ -1,4 +1,4 @@
-.TH PNG 5 "June 23, 2007"
+.TH PNG 5 "June 28, 2007"
.SH NAME
png \- Portable Network Graphics (PNG) format
.SH DESCRIPTION
diff --git a/png.c b/png.c
index b9d036102..399098ed4 100644
--- a/png.c
+++ b/png.c
@@ -1,7 +1,7 @@
/* png.c - location for general purpose libpng functions
*
- * Last changed in libpng 1.2.19 June 23, 2007
+ * Last changed in libpng 1.2.19 June 28, 2007
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1998-2007 Glenn Randers-Pehrson
* (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
@@ -13,7 +13,7 @@
#include "png.h"
/* Generate a compiler error if there is an old png.h in the search path. */
-typedef version_1_2_19beta18 Your_png_h_is_not_version_1_2_19beta18;
+typedef version_1_2_19beta19 Your_png_h_is_not_version_1_2_19beta19;
/* Version information for C files. This had better match the version
* string defined in png.h. */
@@ -706,7 +706,7 @@ png_charp PNGAPI
png_get_copyright(png_structp png_ptr)
{
png_ptr = png_ptr; /* silence compiler warning about unused png_ptr */
- return ((png_charp) "\n libpng version 1.2.19beta18 - June 23, 2007\n\
+ return ((png_charp) "\n libpng version 1.2.19beta19 - June 28, 2007\n\
Copyright (c) 1998-2007 Glenn Randers-Pehrson\n\
Copyright (c) 1996-1997 Andreas Dilger\n\
Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.\n");
diff --git a/png.h b/png.h
index 65b0f2393..736a53db2 100644
--- a/png.h
+++ b/png.h
@@ -1,7 +1,7 @@
/* png.h - header file for PNG reference library
*
- * libpng version 1.2.19beta18 - June 23, 2007
+ * libpng version 1.2.19beta19 - June 28, 2007
* Copyright (c) 1998-2007 Glenn Randers-Pehrson
* (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
* (Version 0.88 Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.)
@@ -9,7 +9,7 @@
* Authors and maintainers:
* libpng versions 0.71, May 1995, through 0.88, January 1996: Guy Schalnat
* libpng versions 0.89c, June 1996, through 0.96, May 1997: Andreas Dilger
- * libpng versions 0.97, January 1998, through 1.2.19beta18 - June 23, 2007: Glenn
+ * libpng versions 0.97, January 1998, through 1.2.19beta19 - June 28, 2007: Glenn
* See also "Contributing Authors", below.
*
* Note about libpng version numbers:
@@ -150,7 +150,7 @@
* 1.2.17 13 10217 12.so.0.17[.0]
* 1.0.26 10 10026 10.so.0.26[.0]
* 1.2.18 13 10218 12.so.0.18[.0]
- * 1.2.19beta1-18 13 10219 12.so.0.19[.0]
+ * 1.2.19beta1-19 13 10219 12.so.0.19[.0]
*
* Henceforth the source version will match the shared-library major
* and minor numbers; the shared-library major version number will be
@@ -180,7 +180,7 @@
* If you modify libpng you may insert additional notices immediately following
* this sentence.
*
- * libpng versions 1.2.6, August 15, 2004, through 1.2.19beta18, June 23, 2007, are
+ * libpng versions 1.2.6, August 15, 2004, through 1.2.19beta19, June 28, 2007, are
* Copyright (c) 2004, 2006-2007 Glenn Randers-Pehrson, and are
* distributed according to the same disclaimer and license as libpng-1.2.5
* with the following individual added to the list of Contributing Authors:
@@ -292,13 +292,13 @@
* Y2K compliance in libpng:
* =========================
*
- * June 23, 2007
+ * June 28, 2007
*
* Since the PNG Development group is an ad-hoc body, we can't make
* an official declaration.
*
* This is your unofficial assurance that libpng from version 0.71 and
- * upward through 1.2.19beta18 are Y2K compliant. It is my belief that earlier
+ * upward through 1.2.19beta19 are Y2K compliant. It is my belief that earlier
* versions were also Y2K compliant.
*
* Libpng only has three year fields. One is a 2-byte unsigned integer
@@ -354,9 +354,9 @@
*/
/* Version information for png.h - this should match the version in png.c */
-#define PNG_LIBPNG_VER_STRING "1.2.19beta18"
+#define PNG_LIBPNG_VER_STRING "1.2.19beta19"
#define PNG_HEADER_VERSION_STRING \
- " libpng version 1.2.19beta18 - June 23, 2007 (header)\n"
+ " libpng version 1.2.19beta19 - June 28, 2007 (header)\n"
#define PNG_LIBPNG_VER_SONUM 0
#define PNG_LIBPNG_VER_DLLNUM 13
@@ -368,7 +368,7 @@
/* This should match the numeric part of the final component of
* PNG_LIBPNG_VER_STRING, omitting any leading zero: */
-#define PNG_LIBPNG_VER_BUILD 18
+#define PNG_LIBPNG_VER_BUILD 19
/* Release Status */
#define PNG_LIBPNG_BUILD_ALPHA 1
@@ -1353,7 +1353,7 @@ struct png_struct_def
png_byte filter_type;
#endif
-#if defined(PNG_1_0_X) || defined(PNG_DEBUG)
+#if defined(PNG_1_0_X)
/* New member added in libpng-1.0.10, ifdef'ed out in 1.2.0 */
png_uint_32 row_buf_size;
#endif
@@ -1407,7 +1407,7 @@ struct png_struct_def
/* This triggers a compiler error in png.c, if png.c and png.h
* do not agree upon the version number.
*/
-typedef png_structp version_1_2_19beta18;
+typedef png_structp version_1_2_19beta19;
typedef png_struct FAR * FAR * png_structpp;
diff --git a/pngconf.h b/pngconf.h
index 1d75317de..c9f880eea 100644
--- a/pngconf.h
+++ b/pngconf.h
@@ -1,7 +1,7 @@
/* pngconf.h - machine configurable file for libpng
*
- * libpng version 1.2.19beta18 - June 23, 2007
+ * libpng version 1.2.19beta19 - June 28, 2007
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1998-2007 Glenn Randers-Pehrson
* (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
diff --git a/pngerror.c b/pngerror.c
index 9f6489283..c29e0d87c 100644
--- a/pngerror.c
+++ b/pngerror.c
@@ -1,7 +1,7 @@
/* pngerror.c - stub functions for i/o and memory allocation
*
- * Last changed in libpng 1.2.19 June 23, 2007
+ * Last changed in libpng 1.2.19 June 28, 2007
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1998-2007 Glenn Randers-Pehrson
* (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
diff --git a/pnggccrd.c b/pnggccrd.c
index 39b470630..a9e239a2e 100644
--- a/pnggccrd.c
+++ b/pnggccrd.c
@@ -3,7 +3,7 @@
*
* For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
*
- * Last changed in libpng 1.2.19 June 23, 2007
+ * Last changed in libpng 1.2.19 June 28, 2007
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1998 Intel Corporation
* Copyright (c) 1999-2002,2007 Greg Roelofs
@@ -51,7 +51,7 @@
* - write MMX code for 48-bit case (pixel_bytes == 6)
* - figure out what's up with 24-bit case (pixel_bytes == 3):
* why subtract 8 from width_mmx in the pass 4/5 case?
- * (only width_mmx case) (near line 1606)
+ * (only width_mmx case) (near line 2335)
* x [DONE] replace pixel_bytes within each block with the true
* constant value (or are compilers smart enough to do that?)
* - rewrite all MMX interlacing code so it's aligned with
@@ -70,8 +70,8 @@
* inconsistent, and don't match the MMX Programmer's Reference
* Manual conventions anyway. They should be changed to
* "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
- * was lowest in memory (e.g., corresponding to a left pixel)
- * and b7 is the byte that was highest (e.g., a right pixel).
+ * was lowest in memory (i.e., corresponding to a left pixel)
+ * and b7 is the byte that was highest (i.e., a right pixel).
*
* 19991016:
* - Brennan's Guide notwithstanding, gcc under Linux does *not*
@@ -83,6 +83,10 @@
* defined within the scope of a single function, but both
* static and truly global (multi-module) variables work fine.
*
+ * 19991017:
+ * - replaced pixel_bytes in each png_memcpy() call with constant value for
+ * inlining (png_do_read_interlace() "non-MMX/modified C code" block)
+ *
* 19991023:
* - fixed png_combine_row() non-MMX replication bug (odd passes only?)
* - switched from string-concatenation-with-macros to cleaner method of
@@ -216,12 +220,30 @@
* 20010310:
* - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
*
+ * 20010808:
+ * - added PNG_THREAD_UNSAFE_OK around code using global variables [GRP]
+ *
+ * 20011124:
+ * - fixed missing save of Eflag in png_mmx_support() [Maxim Sobolev]
+ *
* 20020304:
* - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
*
+ * 20020407:
+ * - fixed insufficient preservation of ebx register [Sami Farin]
+ *
* 20040724:
* - more tinkering with clobber list at lines 4529 and 5033 to get it to
- * compile with gcc 3.4
+ * compile with gcc 3.4 [GRP]
+ *
+ * 20040809:
+ * - added "rim" definitions for CONST4 and CONST6 [GRP]
+ *
+ * 20060303:
+ * - added "OS2" to list of systems that don't need leading underscores [GRP]
+ *
+ * 20060320:
+ * - made PIC-compliant [Christian Aichinger]
*
* 20070313:
* - finally applied Giuseppe Ghibò's 64-bit patch of 20060803 (completely
@@ -238,7 +260,7 @@
* 20070527:
* - revised png_combine_row() to reuse mask in lieu of external _unmask
* - moved 32-bit (RGBA) case to top of png_combine_row(): most common
- * - just about ready to give up on x86_64 -fPIC mode; can't even access 16
+ * - just about ready to give up on x86-64 -fPIC mode; can't even access 16
* _mask*_* constants without triggering link error on shared library:
* /usr/bin/ld: pnggccrd.pic.o: relocation R_X86_64_32S against `a local
* symbol' can not be used when making a shared object; recompile with
@@ -254,12 +276,12 @@
*
* 20070603:
* - revised png_combine_row() to use @GOTPCREL(%%rip) addressing on _c64
- * struct of _mask*_* constants for x86_64 -fPIC; see sam.zoy.org link
+ * struct of _mask*_* constants for x86-64 -fPIC; see sam.zoy.org link
* above for details
* - moved _const4 and _const6 into _c64 struct, renamed to _amask5_3_0 and
* _amask7_1_0, respectively
* - can't figure out how to use _c64._mask*_* vars within asm code, so still
- * need single variables for non-x86_64/-fPIC half :-(
+ * need single variables for non-x86-64/-fPIC half :-(
* - replaced various __PIC__ ifdefs with *_GOT_ebx macros
* - moved _LBCarryMask and _HBClearMask into _c64 struct
* - conditionally replaced _p*temp variables with %r11d-%r13d (via p*_TEMP
@@ -268,18 +290,18 @@
* 20070604:
* - replaced all _ActiveMask and _ActiveMaskEnd with new _amask*_*_* consts
* (_amask naming convention: numbers of 00-bytes, ff-bytes, 00-bytes)
- * - _ActiveMask // (10) // avg/paeth/sub; read-only; consts; movq/pand
- * 0x0000000000ffffffLL (bpp 3, avg) _amask5_3_0
- * 0xffffffffffffffffLL (bpp 4, 6, avg) _amask0_8_0
- * 0x000000000000ffffLL (bpp 2, avg) _amask6_2_0
- * 0x0000000000ffffffLL (bpp 3, paeth) _amask5_3_0
- * 0x00000000ffffffffLL (bpp 6, paeth) _amask4_4_0
- * 0x00000000ffffffffLL (bpp 4, paeth) _amask4_4_0
- * 0x00000000ffffffffLL (bpp 8, paeth) _amask4_4_0
- * 0x0000ffffff000000LL (bpp 3, sub) _amask2_3_3
- * 0x00000000ffff0000LL (bpp 2, sub) _amask4_2_2
- * - _ActiveMaskEnd // (1) // paeth only; read-only; const; pand
- * 0xffff000000000000LL (bpp 3, paeth) _amask0_2_6
+ * - _ActiveMask // (10) // avg/paeth/sub; read-only; consts; movq/pand
+ * 0x0000000000ffffffLL (bpp 3, avg) _amask5_3_0
+ * 0xffffffffffffffffLL (bpp 4, 6, avg) _amask0_8_0
+ * 0x000000000000ffffLL (bpp 2, avg) _amask6_2_0
+ * 0x0000000000ffffffLL (bpp 3, paeth) _amask5_3_0
+ * 0x00000000ffffffffLL (bpp 6, paeth) _amask4_4_0
+ * 0x00000000ffffffffLL (bpp 4, paeth) _amask4_4_0
+ * 0x00000000ffffffffLL (bpp 8, paeth) _amask4_4_0
+ * 0x0000ffffff000000LL (bpp 3, sub) _amask2_3_3
+ * 0x00000000ffff0000LL (bpp 2, sub) _amask4_2_2
+ * - _ActiveMaskEnd // (1) // paeth only; read-only; const; pand
+ * 0xffff000000000000LL (bpp 3, paeth) _amask0_2_6
* - changed all "#if defined(__x86_64__) // later // && defined(__PIC__)"
* lines to "#ifdef PNG_x86_64_USE_GOTPCREL" for easier/safer testing
*
@@ -327,7 +349,44 @@
* 20070616:
* - finished replacing direct _FullLength accesses with register constraints
* (*ugly* conditional clobber-separator macros for avg and paeth, sigh)
- * Changed all "ifdef *" to "if defined(*)"
+ *
+ * 20070618:
+ * - fixed misplaced PNG_THREAD_UNSAFE_OK endif (was missing LOAD_GOT_rbp/
+ * RESTORE_rbp in 32-bit thread-safe case)
+ * - changed all "ifdef *" to "if defined(*)" [GRP]
+ *
+ * 20070619:
+ * - rearranged most bitdepth-related case statements to put most frequent
+ * cases at top (24-bit, 32-bit, 8-bit, rest)
+ *
+ * 20070623:
+ * - cleaned up png_debug() warnings/formatting
+ * - removed PNG_MMX_CODE_SUPPORTED ifdefs and added outer __GNUC__ ifdef
+ * (module no longer used by non-x86/non-GCC builds as of libpng 1.2.19)
+ * - removed single libpng-1.2.x PNG_DEBUG dependency on 1.0.x png_struct
+ * member (row_buf_size)
+ * - rearranged pass-related if-blocks in png_do_read_interlace() to put most
+ * frequent cases (4, 5) at top [GRP suggestion]
+ *
+ * 20070624-28:
+ * - fixed 64-bit crash bug: pointers -> rsi/rdi, not esi/edi (switched to
+ * %0/%1/%2/%3/%4 notation; eliminated size suffixes from relevant add/
+ * inc/sub/mov instructions; changed dummy vars to pointers)
+ * - png_combine_row()
+ * - png_do_read_interlace()
+ * - png_read_filter_row_mmx_avg()
+ * - png_read_filter_row_mmx_paeth()
+ * - NOTE: this fix makes use of the fact that modifying a 32-bit reg (e.g.,
+ * %%ebx) clears the top half of its corresponding 64-bit reg (%%rbx), so
+ * it's safe to mix 32-bit operations with 64-bit base/index addressing
+ * (see new PSI/PDI/PAX/PDX/PBP/etc. "pointer-register" macros)
+ *
+
+ * 200706xx:
+ * - continued fixing intermittent 64-bit crash bug:
+ * - png_read_filter_row_mmx_sub()
+ * - png_read_filter_row_mmx_up()
+
*
*
* STILL TO DO:
@@ -339,29 +398,32 @@
* - write MMX code for 48-bit case (pixel_bytes == 6)
* - figure out what's up with 24-bit case (pixel_bytes == 3):
* why subtract 8 from width_mmx in the pass 4/5 case? due to
- * odd number of bytes? (only width_mmx case) (near line 1606)
+ * odd number of bytes? (only width_mmx case) (near line 2335)
* - rewrite all MMX interlacing code so it's aligned with beginning
* of the row buffer, not the end (see 19991007 for details)
* - add error messages to any remaining bogus default cases
* - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
- * - try =r, etc., as reg constraints? (would gcc use 64-bit ones on x86_64?)
+ * - try =r, etc., as reg constraints? (would gcc use 64-bit ones on x86-64?)
* - need full, non-graphical, CRC-based test suite... maybe autogenerate
* random data of various height/width/depth, compute CRCs, write (C
* funcs), read (asm/MMX), recompute CRCs, and compare?
- * - write true x86_64 version using 128-bit "media instructions", %xmm0-15,
+ * - write true x86-64 version using 128-bit "media instructions", %xmm0-15,
* and extra general-purpose registers
*/
+#if defined(__GNUC__)
+
#define PNG_INTERNAL
#include "png.h"
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
+#include "pngpriv.h"
-#if defined(PNG_MMX_CODE_SUPPORTED)
-#if defined(__x86_64__) && defined(__PIC__) /* optionally comment __PIC__: */
-# define PNG_x86_64_USE_GOTPCREL /* GOTPCREL => full thread-safety */
-# define PNG_CLOBBER_x86_64_REGS_SUPPORTED /* works as of gcc 3.4.3 ... */
-#endif
+/* if you want/need full thread-safety on x86-64 even when linking statically,
+ * comment out the "&& defined(__PIC__)" part here: */
+#if defined(__x86_64__) && defined(__PIC__)
+# define PNG_x86_64_USE_GOTPCREL // GOTPCREL => full thread-safety
+# define PNG_CLOBBER_x86_64_REGS_SUPPORTED // works as of gcc 3.4.3 ...
#endif
int PNGAPI png_mmx_support(void);
@@ -372,8 +434,6 @@ static PNG_CONST int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
static PNG_CONST int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
#endif
-#if defined(PNG_MMX_CODE_SUPPORTED)
-
/* djgpp, Win32, Cygwin, and OS2 add their own underscores to global variables,
* so define them without: */
#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
@@ -536,8 +596,8 @@ static PNG_CONST ull _mask48_1 __attribute__((used, aligned(8))) = 0x20202020404
static PNG_CONST ull _mask48_0 __attribute__((used, aligned(8))) = 0x4040808080808080LL;
// png_do_read_interlace() constants:
-static PNG_CONST ull _amask5_3_0 __attribute__((aligned(8))) = 0x0000000000FFFFFFLL; // was _const4
-static PNG_CONST ull _amask7_1_0 __attribute__((aligned(8))) = 0x00000000000000FFLL; // was _const6
+static PNG_CONST ull _amask5_3_0 __attribute__((aligned(8))) = 0x0000000000FFFFFFLL; // was _const4
+static PNG_CONST ull _amask7_1_0 __attribute__((aligned(8))) = 0x00000000000000FFLL; // was _const6
// png_read_filter_row_mmx_avg() constants:
static PNG_CONST ull _LBCarryMask __attribute__((used, aligned(8))) = 0x0101010101010101LL;
@@ -626,8 +686,8 @@ static PNG_CONST ull _amask4_2_2 __attribute__((used, aligned(8))) = 0x00000000
# define _CLOBBER_r11_r12_r13 // not using regs => not clobbering
# define CLOBBER_r11_r12_r13
# endif // PNG_THREAD_UNSAFE_OK
-# define LOAD_GOT_rbp
-# define RESTORE_rbp
+# define LOAD_GOT_rbp
+# define RESTORE_rbp
#endif
#if defined(__x86_64__)
@@ -637,7 +697,7 @@ static PNG_CONST ull _amask4_2_2 __attribute__((used, aligned(8))) = 0x00000000
# define CLOBBER_ebp "%ebp"
# define SAVE_FullLength "movl %%eax, %%r15d \n\t"
# define RESTORE_FullLength "movl %%r15d, " // may go into eax or ecx
-# if defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED) // works as of gcc 3.4.3 ...
+# if defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED) // works as of gcc 3.4.3 ...
# define SAVE_r15
# define RESTORE_r15
# define _CLOBBER_r15 ,"%r15"
@@ -716,11 +776,17 @@ static PNG_CONST ull _amask4_2_2 __attribute__((used, aligned(8))) = 0x00000000
# define CLOBBER_GOT_ebx "%ebx"
#endif
-#endif // PNG_MMX_CODE_SUPPORTED
+#if defined(PNG_HAVE_MMX_COMBINE_ROW) || defined(PNG_HAVE_MMX_READ_INTERLACE)
+# define BPP2 2
+# define BPP3 3 // bytes per pixel (a.k.a. pixel_bytes)
+# define BPP4 4 // (defined only to help avoid cut-and-paste errors)
+# define BPP6 6
+# define BPP8 8
+#endif
-static int _mmx_supported = 2; /* 0: no MMX; 1: MMX supported; 2: not tested */
+static int _mmx_supported = 2; // 0: no MMX; 1: MMX supported; 2: not tested
/*===========================================================================*/
/* */
@@ -728,29 +794,28 @@ static int _mmx_supported = 2; /* 0: no MMX; 1: MMX supported; 2: not tested */
/* */
/*===========================================================================*/
-/* GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
- * (2) all instructions compile with gcc 2.7.2.3 and later
- * x (3) the function is moved down here to prevent gcc from
- * x inlining it in multiple places and then barfing be-
- * x cause the ".NOT_SUPPORTED" label is multiply defined
- * [need to retest with gcc 2.7.2.3]
- */
-
-/* GRR 20070524: This declaration apparently is compatible with but supersedes
- * the one in png.h; in any case, the generated object file is slightly
- * smaller. It is unnecessary with gcc 4.1.2, but gcc 2.x apparently
- * replicated the ".NOT_SUPPORTED" label in each location the function was
- * inlined, leading to compilation errors due to the "multiply defined"
- * label. Old workaround was to leave the function at the end of this
- * file; new one (still testing) is to use a gcc-specific function attribute
- * to prevent inlining. */
+// GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
+// (2) all instructions compile with gcc 2.7.2.3 and later
+// x (3) the function is moved down here to prevent gcc from
+// x inlining it in multiple places and then barfing be-
+// x cause the ".NOT_SUPPORTED" label is multiply defined
+// [need to retest with gcc 2.7.2.3]
+
+// GRR 20070524: This declaration apparently is compatible with but supersedes
+// the one in png.h; in any case, the generated object file is slightly
+// smaller. It is unnecessary with gcc 4.1.2, but gcc 2.x apparently
+// replicated the ".NOT_SUPPORTED" label in each location the function was
+// inlined, leading to compilation errors due to the "multiply defined"
+// label. Old workaround was to leave the function at the end of this
+// file; new one (still testing) is to use a gcc-specific function attribute
+// to prevent local inlining.
int PNGAPI
png_mmx_support(void) __attribute__((noinline));
int PNGAPI
png_mmx_support(void)
{
-#if defined(PNG_MMX_CODE_SUPPORTED)
+#if defined(PNG_MMX_CODE_SUPPORTED) // superfluous, but what the heck
int result;
__asm__ __volatile__ (
#if defined(__x86_64__)
@@ -844,12 +909,6 @@ png_mmx_support(void)
#if defined(PNG_HAVE_MMX_COMBINE_ROW)
-#define BPP2 2
-#define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
-#define BPP4 4
-#define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
-#define BPP8 8
-
/* Combines the row recently read in with the previous row.
This routine takes care of alpha and transparency if requested.
This routine also handles the two methods of progressive display
@@ -869,7 +928,6 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
{
png_debug(1, "in png_combine_row (pnggccrd.c)\n");
-#if defined(PNG_MMX_CODE_SUPPORTED)
if (_mmx_supported == 2) {
#if !defined(PNG_1_0_X)
/* this should have happened in png_init_mmx_flags() already */
@@ -877,7 +935,6 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
#endif
png_mmx_support();
}
-#endif
if (mask == 0xff)
{
@@ -889,16 +946,14 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
{
switch (png_ptr->row_info.pixel_depth)
{
- /* most common case: combining 32-bit RGBA */
+ // most common case: combining 32-bit RGBA
case 32: /* png_ptr->row_info.pixel_depth */
{
png_bytep srcptr;
png_bytep dstptr;
-#if defined(PNG_MMX_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
- if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
- /* && _mmx_supported */ )
+ if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
#else
if (_mmx_supported)
#endif
@@ -906,10 +961,10 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
png_uint_32 len;
int diff;
int dummy_value_a; // fix 'forbidden register spilled' error
- int dummy_value_d;
int dummy_value_c;
- int dummy_value_S;
- int dummy_value_D;
+ int dummy_value_d;
+ png_bytep dummy_value_S;
+ png_bytep dummy_value_D;
srcptr = png_ptr->row_buf + 1;
dstptr = row;
@@ -943,47 +998,47 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
"pcmpeqb %%mm6, %%mm3 \n\t"
// preload "movl len, %%ecx \n\t" // load length of line
-// preload "movl srcptr, %%esi \n\t" // load source
-// preload "movl dstptr, %%edi \n\t" // load dest
+// preload "movl srcptr, %3 \n\t" // load source
+// preload "movl dstptr, %4 \n\t" // load dest
"cmpl $0, %%ecx \n\t" // lcr
"jz mainloop32end \n\t"
"mainloop32: \n\t"
- "movq (%%esi), %%mm4 \n\t"
+ "movq (%3), %%mm4 \n\t"
"pand %%mm0, %%mm4 \n\t"
"movq %%mm0, %%mm6 \n\t"
- "movq (%%edi), %%mm7 \n\t"
+ "movq (%4), %%mm7 \n\t"
"pandn %%mm7, %%mm6 \n\t"
"por %%mm6, %%mm4 \n\t"
- "movq %%mm4, (%%edi) \n\t"
+ "movq %%mm4, (%4) \n\t"
- "movq 8(%%esi), %%mm5 \n\t"
+ "movq 8(%3), %%mm5 \n\t"
"pand %%mm1, %%mm5 \n\t"
"movq %%mm1, %%mm7 \n\t"
- "movq 8(%%edi), %%mm6 \n\t"
+ "movq 8(%4), %%mm6 \n\t"
"pandn %%mm6, %%mm7 \n\t"
"por %%mm7, %%mm5 \n\t"
- "movq %%mm5, 8(%%edi) \n\t"
+ "movq %%mm5, 8(%4) \n\t"
- "movq 16(%%esi), %%mm6 \n\t"
+ "movq 16(%3), %%mm6 \n\t"
"pand %%mm2, %%mm6 \n\t"
"movq %%mm2, %%mm4 \n\t"
- "movq 16(%%edi), %%mm7 \n\t"
+ "movq 16(%4), %%mm7 \n\t"
"pandn %%mm7, %%mm4 \n\t"
"por %%mm4, %%mm6 \n\t"
- "movq %%mm6, 16(%%edi) \n\t"
+ "movq %%mm6, 16(%4) \n\t"
- "movq 24(%%esi), %%mm7 \n\t"
+ "movq 24(%3), %%mm7 \n\t"
"pand %%mm3, %%mm7 \n\t"
"movq %%mm3, %%mm5 \n\t"
- "movq 24(%%edi), %%mm4 \n\t"
+ "movq 24(%4), %%mm4 \n\t"
"pandn %%mm4, %%mm5 \n\t"
"por %%mm5, %%mm7 \n\t"
- "movq %%mm7, 24(%%edi) \n\t"
+ "movq %%mm7, 24(%4) \n\t"
- "addl $32, %%esi \n\t" // inc by 32 bytes processed
- "addl $32, %%edi \n\t"
+ "add $32, %3 \n\t" // inc by 32 bytes processed
+ "add $32, %4 \n\t"
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
"ja mainloop32 \n\t"
@@ -998,12 +1053,12 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
"secondloop32: \n\t"
"sall %%edx \n\t" // move high bit to CF
"jnc skip32 \n\t" // if CF = 0
- "movl (%%esi), %%eax \n\t"
- "movl %%eax, (%%edi) \n\t"
+ "movl (%3), %%eax \n\t"
+ "movl %%eax, (%4) \n\t"
"skip32: \n\t"
- "addl $4, %%esi \n\t"
- "addl $4, %%edi \n\t"
+ "add $4, %3 \n\t"
+ "add $4, %4 \n\t"
"decl %%ecx \n\t"
"jnz secondloop32 \n\t"
@@ -1016,12 +1071,12 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
"=S" (dummy_value_S),
"=D" (dummy_value_D)
- : "3" (srcptr), // esi // input regs
- "4" (dstptr), // edi
- "0" (diff), // eax
-// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
+ : "0" (diff), // eax // input regs
+ "1" (mask), // edx
"2" (len), // ecx
- "1" (mask) // edx
+// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
+ "3" (srcptr), // esi/rsi
+ "4" (dstptr) // edi/rdi
#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
: "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
@@ -1029,8 +1084,7 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
#endif
);
}
- else /* mmx _not supported - Use modified C routine */
-#endif /* PNG_MMX_CODE_SUPPORTED */
+ else /* not _mmx_supported - use modified C routine */
{
register png_uint_32 i;
png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
@@ -1069,181 +1123,13 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
break;
} /* end 32 bpp */
- case 1: /* png_ptr->row_info.pixel_depth */
- {
- png_bytep sp;
- png_bytep dp;
- int s_inc, s_start, s_end;
- int m;
- int shift;
- png_uint_32 i;
-
- sp = png_ptr->row_buf + 1;
- dp = row;
- m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
- if (png_ptr->transformations & PNG_PACKSWAP)
- {
- s_start = 0;
- s_end = 7;
- s_inc = 1;
- }
- else
-#endif
- {
- s_start = 7;
- s_end = 0;
- s_inc = -1;
- }
-
- shift = s_start;
-
- for (i = 0; i < png_ptr->width; i++)
- {
- if (m & mask)
- {
- int value;
-
- value = (*sp >> shift) & 0x1;
- *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
- *dp |= (png_byte)(value << shift);
- }
-
- if (shift == s_end)
- {
- shift = s_start;
- sp++;
- dp++;
- }
- else
- shift += s_inc;
-
- if (m == 1)
- m = 0x80;
- else
- m >>= 1;
- }
- break;
- } /* end 1 bpp */
-
- case 2: /* png_ptr->row_info.pixel_depth */
- {
- png_bytep sp;
- png_bytep dp;
- int s_start, s_end, s_inc;
- int m;
- int shift;
- png_uint_32 i;
- int value;
-
- sp = png_ptr->row_buf + 1;
- dp = row;
- m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
- if (png_ptr->transformations & PNG_PACKSWAP)
- {
- s_start = 0;
- s_end = 6;
- s_inc = 2;
- }
- else
-#endif
- {
- s_start = 6;
- s_end = 0;
- s_inc = -2;
- }
-
- shift = s_start;
-
- for (i = 0; i < png_ptr->width; i++)
- {
- if (m & mask)
- {
- value = (*sp >> shift) & 0x3;
- *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
- *dp |= (png_byte)(value << shift);
- }
-
- if (shift == s_end)
- {
- shift = s_start;
- sp++;
- dp++;
- }
- else
- shift += s_inc;
- if (m == 1)
- m = 0x80;
- else
- m >>= 1;
- }
- break;
- } /* end 2 bpp */
-
- case 4: /* png_ptr->row_info.pixel_depth */
- {
- png_bytep sp;
- png_bytep dp;
- int s_start, s_end, s_inc;
- int m;
- int shift;
- png_uint_32 i;
- int value;
-
- sp = png_ptr->row_buf + 1;
- dp = row;
- m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
- if (png_ptr->transformations & PNG_PACKSWAP)
- {
- s_start = 0;
- s_end = 4;
- s_inc = 4;
- }
- else
-#endif
- {
- s_start = 4;
- s_end = 0;
- s_inc = -4;
- }
- shift = s_start;
-
- for (i = 0; i < png_ptr->width; i++)
- {
- if (m & mask)
- {
- value = (*sp >> shift) & 0xf;
- *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
- *dp |= (png_byte)(value << shift);
- }
-
- if (shift == s_end)
- {
- shift = s_start;
- sp++;
- dp++;
- }
- else
- shift += s_inc;
- if (m == 1)
- m = 0x80;
- else
- m >>= 1;
- }
- break;
- } /* end 4 bpp */
-
- case 8: /* png_ptr->row_info.pixel_depth */
+ case 24: /* png_ptr->row_info.pixel_depth */
{
png_bytep srcptr;
png_bytep dstptr;
-#if defined(PNG_MMX_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
- if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
- /* && _mmx_supported */ )
+ if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
#else
if (_mmx_supported)
#endif
@@ -1253,8 +1139,8 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
int dummy_value_a; // fix 'forbidden register spilled' error
int dummy_value_d;
int dummy_value_c;
- int dummy_value_S;
- int dummy_value_D;
+ png_bytep dummy_value_S;
+ png_bytep dummy_value_D;
srcptr = png_ptr->row_buf + 1;
dstptr = row;
@@ -1271,53 +1157,82 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
LOAD_GOT_rbp
- "movq " MASK8_0 ", %%mm0 \n\t" // _mask8_0 -> mm0
+ "movq " MASK24_0 ", %%mm0 \n\t" // _mask24_0 -> mm0
+ "movq " MASK24_1 ", %%mm1 \n\t" // _mask24_1 -> mm1
+ "movq " MASK24_2 ", %%mm2 \n\t" // _mask24_2 -> mm2
RESTORE_rbp
- "pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
- "pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
+ "pand %%mm7, %%mm0 \n\t"
+ "pand %%mm7, %%mm1 \n\t"
+ "pand %%mm7, %%mm2 \n\t"
+
+ "pcmpeqb %%mm6, %%mm0 \n\t"
+ "pcmpeqb %%mm6, %%mm1 \n\t"
+ "pcmpeqb %%mm6, %%mm2 \n\t"
// preload "movl len, %%ecx \n\t" // load length of line
-// preload "movl srcptr, %%esi \n\t" // load source
-// preload "movl dstptr, %%edi \n\t" // load dest
+// preload "movl srcptr, %3 \n\t" // load source
+// preload "movl dstptr, %4 \n\t" // load dest
- "cmpl $0, %%ecx \n\t" // len == 0 ?
- "je mainloop8end \n\t"
+ "cmpl $0, %%ecx \n\t"
+ "jz mainloop24end \n\t"
- "mainloop8: \n\t"
- "movq (%%esi), %%mm4 \n\t" // *srcptr
+ "mainloop24: \n\t"
+ "movq (%3), %%mm4 \n\t"
"pand %%mm0, %%mm4 \n\t"
"movq %%mm0, %%mm6 \n\t"
- "pandn (%%edi), %%mm6 \n\t" // *dstptr
+ "movq (%4), %%mm7 \n\t" // GRR PTR CRASH HERE
+ "pandn %%mm7, %%mm6 \n\t"
"por %%mm6, %%mm4 \n\t"
- "movq %%mm4, (%%edi) \n\t"
- "addl $8, %%esi \n\t" // inc by 8 bytes processed
- "addl $8, %%edi \n\t"
+ "movq %%mm4, (%4) \n\t"
+
+ "movq 8(%3), %%mm5 \n\t"
+ "pand %%mm1, %%mm5 \n\t"
+ "movq %%mm1, %%mm7 \n\t"
+ "movq 8(%4), %%mm6 \n\t"
+ "pandn %%mm6, %%mm7 \n\t"
+ "por %%mm7, %%mm5 \n\t"
+ "movq %%mm5, 8(%4) \n\t"
+
+ "movq 16(%3), %%mm6 \n\t"
+ "pand %%mm2, %%mm6 \n\t"
+ "movq %%mm2, %%mm4 \n\t"
+ "movq 16(%4), %%mm7 \n\t"
+ "pandn %%mm7, %%mm4 \n\t"
+ "por %%mm4, %%mm6 \n\t"
+ "movq %%mm6, 16(%4) \n\t"
+
+ "add $24, %3 \n\t" // inc by 24 bytes processed
+ "add $24, %4 \n\t"
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
- "ja mainloop8 \n\t"
- "mainloop8end: \n\t"
+ "ja mainloop24 \n\t"
+
+ "mainloop24end: \n\t"
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
"movl %%eax, %%ecx \n\t"
"cmpl $0, %%ecx \n\t"
- "jz end8 \n\t"
+ "jz end24 \n\t"
// preload "movl mask, %%edx \n\t"
"sall $24, %%edx \n\t" // make low byte, high byte
- "secondloop8: \n\t"
+ "secondloop24: \n\t"
"sall %%edx \n\t" // move high bit to CF
- "jnc skip8 \n\t" // if CF = 0
- "movb (%%esi), %%al \n\t"
- "movb %%al, (%%edi) \n\t"
+ "jnc skip24 \n\t" // if CF = 0
+ "movw (%3), %%ax \n\t"
+ "movw %%ax, (%4) \n\t"
+ "xorl %%eax, %%eax \n\t"
+ "movb 2(%3), %%al \n\t"
+ "movb %%al, 2(%4) \n\t"
- "skip8: \n\t"
- "incl %%esi \n\t"
- "incl %%edi \n\t"
+ "skip24: \n\t"
+ "add $3, %3 \n\t"
+ "add $3, %4 \n\t"
"decl %%ecx \n\t"
- "jnz secondloop8 \n\t"
+ "jnz secondloop24 \n\t"
- "end8: \n\t"
- "EMMS \n\t" // DONE
+ "end24: \n\t"
+ "EMMS \n\t" // DONE
: "=a" (dummy_value_a), // output regs (dummy)
"=d" (dummy_value_d),
@@ -1325,31 +1240,31 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
"=S" (dummy_value_S),
"=D" (dummy_value_D)
- : "3" (srcptr), // esi // input regs
- "4" (dstptr), // edi
- "0" (diff), // eax
-// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
+ : "0" (diff), // eax // input regs
+ "1" (mask), // edx
"2" (len), // ecx
- "1" (mask) // edx
+// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
+ "3" (srcptr), // esi/rsi
+ "4" (dstptr) // edi/rdi
#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
- : "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
+ : "%mm0", "%mm1", "%mm2" // clobber list
+ , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
);
}
- else /* mmx _not supported - Use modified C routine */
-#endif /* PNG_MMX_CODE_SUPPORTED */
+ else /* not _mmx_supported - use modified C routine */
{
register png_uint_32 i;
- png_uint_32 initial_val = png_pass_start[png_ptr->pass];
+ png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
- register int stride = png_pass_inc[png_ptr->pass];
+ register int stride = BPP3 * png_pass_inc[png_ptr->pass];
/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
- register int rep_bytes = png_pass_width[png_ptr->pass];
+ register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
int diff = (int) (png_ptr->width & 7); /* amount lost */
- register png_uint_32 final_val = len; /* GRR bugfix */
+ register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */
srcptr = png_ptr->row_buf + 1 + initial_val;
dstptr = row + initial_val;
@@ -1362,7 +1277,7 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
}
if (diff) /* number of leftover pixels: 3 for pngtest */
{
- final_val += diff /* *BPP1 */ ;
+ final_val += diff*BPP3;
for (; i < final_val; i += stride)
{
if (rep_bytes > (int)(final_val-i))
@@ -1372,21 +1287,18 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
dstptr += stride;
}
}
-
} /* end of else (_mmx_supported) */
break;
- } /* end 8 bpp */
+ } /* end 24 bpp */
- case 16: /* png_ptr->row_info.pixel_depth */
+ case 8: /* png_ptr->row_info.pixel_depth */
{
png_bytep srcptr;
png_bytep dstptr;
-#if defined(PNG_MMX_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
- if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
- /* && _mmx_supported */ )
+ if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
#else
if (_mmx_supported)
#endif
@@ -1396,8 +1308,8 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
int dummy_value_a; // fix 'forbidden register spilled' error
int dummy_value_d;
int dummy_value_c;
- int dummy_value_S;
- int dummy_value_D;
+ png_bytep dummy_value_S;
+ png_bytep dummy_value_D;
srcptr = png_ptr->row_buf + 1;
dstptr = row;
@@ -1414,100 +1326,84 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
LOAD_GOT_rbp
- "movq " MASK16_0 ", %%mm0 \n\t" // _mask16_0 -> mm0
- "movq " MASK16_1 ", %%mm1 \n\t" // _mask16_1 -> mm1
+ "movq " MASK8_0 ", %%mm0 \n\t" // _mask8_0 -> mm0
RESTORE_rbp
- "pand %%mm7, %%mm0 \n\t"
- "pand %%mm7, %%mm1 \n\t"
-
- "pcmpeqb %%mm6, %%mm0 \n\t"
- "pcmpeqb %%mm6, %%mm1 \n\t"
+ "pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
+ "pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s and vice versa
// preload "movl len, %%ecx \n\t" // load length of line
-// preload "movl srcptr, %%esi \n\t" // load source
-// preload "movl dstptr, %%edi \n\t" // load dest
+// preload "movl srcptr, %3 \n\t" // load source
+// preload "movl dstptr, %4 \n\t" // load dest
- "cmpl $0, %%ecx \n\t"
- "jz mainloop16end \n\t"
+ "cmpl $0, %%ecx \n\t" // len == 0 ?
+ "je mainloop8end \n\t"
- "mainloop16: \n\t"
- "movq (%%esi), %%mm4 \n\t"
+ "mainloop8: \n\t"
+ "movq (%3), %%mm4 \n\t" // *srcptr
"pand %%mm0, %%mm4 \n\t"
"movq %%mm0, %%mm6 \n\t"
- "movq (%%edi), %%mm7 \n\t"
- "pandn %%mm7, %%mm6 \n\t"
+ "pandn (%4), %%mm6 \n\t" // *dstptr
"por %%mm6, %%mm4 \n\t"
- "movq %%mm4, (%%edi) \n\t"
-
- "movq 8(%%esi), %%mm5 \n\t"
- "pand %%mm1, %%mm5 \n\t"
- "movq %%mm1, %%mm7 \n\t"
- "movq 8(%%edi), %%mm6 \n\t"
- "pandn %%mm6, %%mm7 \n\t"
- "por %%mm7, %%mm5 \n\t"
- "movq %%mm5, 8(%%edi) \n\t"
-
- "addl $16, %%esi \n\t" // inc by 16 bytes processed
- "addl $16, %%edi \n\t"
+ "movq %%mm4, (%4) \n\t"
+ "add $8, %3 \n\t" // inc by 8 bytes processed
+ "add $8, %4 \n\t"
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
- "ja mainloop16 \n\t"
+ "ja mainloop8 \n\t"
- "mainloop16end: \n\t"
+ "mainloop8end: \n\t"
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
"movl %%eax, %%ecx \n\t"
"cmpl $0, %%ecx \n\t"
- "jz end16 \n\t"
+ "jz end8 \n\t"
// preload "movl mask, %%edx \n\t"
"sall $24, %%edx \n\t" // make low byte, high byte
- "secondloop16: \n\t"
+ "secondloop8: \n\t"
"sall %%edx \n\t" // move high bit to CF
- "jnc skip16 \n\t" // if CF = 0
- "movw (%%esi), %%ax \n\t"
- "movw %%ax, (%%edi) \n\t"
+ "jnc skip8 \n\t" // if CF = 0
+ "movb (%3), %%al \n\t"
+ "movb %%al, (%4) \n\t"
- "skip16: \n\t"
- "addl $2, %%esi \n\t"
- "addl $2, %%edi \n\t"
+ "skip8: \n\t"
+ "inc %3 \n\t"
+ "inc %4 \n\t"
"decl %%ecx \n\t"
- "jnz secondloop16 \n\t"
+ "jnz secondloop8 \n\t"
- "end16: \n\t"
+ "end8: \n\t"
"EMMS \n\t" // DONE
: "=a" (dummy_value_a), // output regs (dummy)
- "=c" (dummy_value_c),
"=d" (dummy_value_d),
+ "=c" (dummy_value_c),
"=S" (dummy_value_S),
"=D" (dummy_value_D)
: "0" (diff), // eax // input regs
-// was (unmask) " " RESERVED // ebx // Global Offset Table idx
- "1" (len), // ecx
- "2" (mask), // edx
- "3" (srcptr), // esi
- "4" (dstptr) // edi
+ "1" (mask), // edx
+ "2" (len), // ecx
+// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
+ "3" (srcptr), // esi/rsi
+ "4" (dstptr) // edi/rdi
#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
- : "%mm0", "%mm1", "%mm4" // clobber list
- , "%mm5", "%mm6", "%mm7"
+ : "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
#endif
);
}
- else /* mmx _not supported - Use modified C routine */
-#endif /* PNG_MMX_CODE_SUPPORTED */
+ else /* not _mmx_supported - use modified C routine */
{
register png_uint_32 i;
- png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
+ png_uint_32 initial_val = png_pass_start[png_ptr->pass];
/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
- register int stride = BPP2 * png_pass_inc[png_ptr->pass];
+ register int stride = png_pass_inc[png_ptr->pass];
/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
- register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
+ register int rep_bytes = png_pass_width[png_ptr->pass];
/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
int diff = (int) (png_ptr->width & 7); /* amount lost */
- register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
+ register png_uint_32 final_val = len; /* GRR bugfix */
srcptr = png_ptr->row_buf + 1 + initial_val;
dstptr = row + initial_val;
@@ -1520,7 +1416,7 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
}
if (diff) /* number of leftover pixels: 3 for pngtest */
{
- final_val += diff*BPP2;
+ final_val += diff /* *BPP1 */ ;
for (; i < final_val; i += stride)
{
if (rep_bytes > (int)(final_val-i))
@@ -1530,20 +1426,185 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
dstptr += stride;
}
}
+
} /* end of else (_mmx_supported) */
break;
- } /* end 16 bpp */
+ } /* end 8 bpp */
- case 24: /* png_ptr->row_info.pixel_depth */
+ case 1: /* png_ptr->row_info.pixel_depth */
+ {
+ png_bytep sp;
+ png_bytep dp;
+ int s_inc, s_start, s_end;
+ int m;
+ int shift;
+ png_uint_32 i;
+
+ sp = png_ptr->row_buf + 1;
+ dp = row;
+ m = 0x80;
+#if defined(PNG_READ_PACKSWAP_SUPPORTED)
+ if (png_ptr->transformations & PNG_PACKSWAP)
+ {
+ s_start = 0;
+ s_end = 7;
+ s_inc = 1;
+ }
+ else
+#endif
+ {
+ s_start = 7;
+ s_end = 0;
+ s_inc = -1;
+ }
+
+ shift = s_start;
+
+ for (i = 0; i < png_ptr->width; i++)
+ {
+ if (m & mask)
+ {
+ int value;
+
+ value = (*sp >> shift) & 0x1;
+ *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
+ *dp |= (png_byte)(value << shift);
+ }
+
+ if (shift == s_end)
+ {
+ shift = s_start;
+ sp++;
+ dp++;
+ }
+ else
+ shift += s_inc;
+
+ if (m == 1)
+ m = 0x80;
+ else
+ m >>= 1;
+ }
+ break;
+ } /* end 1 bpp */
+
+ case 2: /* png_ptr->row_info.pixel_depth */
+ {
+ png_bytep sp;
+ png_bytep dp;
+ int s_start, s_end, s_inc;
+ int m;
+ int shift;
+ png_uint_32 i;
+ int value;
+
+ sp = png_ptr->row_buf + 1;
+ dp = row;
+ m = 0x80;
+#if defined(PNG_READ_PACKSWAP_SUPPORTED)
+ if (png_ptr->transformations & PNG_PACKSWAP)
+ {
+ s_start = 0;
+ s_end = 6;
+ s_inc = 2;
+ }
+ else
+#endif
+ {
+ s_start = 6;
+ s_end = 0;
+ s_inc = -2;
+ }
+
+ shift = s_start;
+
+ for (i = 0; i < png_ptr->width; i++)
+ {
+ if (m & mask)
+ {
+ value = (*sp >> shift) & 0x3;
+ *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
+ *dp |= (png_byte)(value << shift);
+ }
+
+ if (shift == s_end)
+ {
+ shift = s_start;
+ sp++;
+ dp++;
+ }
+ else
+ shift += s_inc;
+ if (m == 1)
+ m = 0x80;
+ else
+ m >>= 1;
+ }
+ break;
+ } /* end 2 bpp */
+
+ case 4: /* png_ptr->row_info.pixel_depth */
+ {
+ png_bytep sp;
+ png_bytep dp;
+ int s_start, s_end, s_inc;
+ int m;
+ int shift;
+ png_uint_32 i;
+ int value;
+
+ sp = png_ptr->row_buf + 1;
+ dp = row;
+ m = 0x80;
+#if defined(PNG_READ_PACKSWAP_SUPPORTED)
+ if (png_ptr->transformations & PNG_PACKSWAP)
+ {
+ s_start = 0;
+ s_end = 4;
+ s_inc = 4;
+ }
+ else
+#endif
+ {
+ s_start = 4;
+ s_end = 0;
+ s_inc = -4;
+ }
+ shift = s_start;
+
+ for (i = 0; i < png_ptr->width; i++)
+ {
+ if (m & mask)
+ {
+ value = (*sp >> shift) & 0xf;
+ *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
+ *dp |= (png_byte)(value << shift);
+ }
+
+ if (shift == s_end)
+ {
+ shift = s_start;
+ sp++;
+ dp++;
+ }
+ else
+ shift += s_inc;
+ if (m == 1)
+ m = 0x80;
+ else
+ m >>= 1;
+ }
+ break;
+ } /* end 4 bpp */
+
+ case 16: /* png_ptr->row_info.pixel_depth */
{
png_bytep srcptr;
png_bytep dstptr;
-#if defined(PNG_MMX_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
- if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
- /* && _mmx_supported */ )
+ if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
#else
if (_mmx_supported)
#endif
@@ -1553,8 +1614,8 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
int dummy_value_a; // fix 'forbidden register spilled' error
int dummy_value_d;
int dummy_value_c;
- int dummy_value_S;
- int dummy_value_D;
+ png_bytep dummy_value_S;
+ png_bytep dummy_value_D;
srcptr = png_ptr->row_buf + 1;
dstptr = row;
@@ -1571,81 +1632,66 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
LOAD_GOT_rbp
- "movq " MASK24_0 ", %%mm0 \n\t" // _mask24_0 -> mm0
- "movq " MASK24_1 ", %%mm1 \n\t" // _mask24_1 -> mm1
- "movq " MASK24_2 ", %%mm2 \n\t" // _mask24_2 -> mm2
+ "movq " MASK16_0 ", %%mm0 \n\t" // _mask16_0 -> mm0
+ "movq " MASK16_1 ", %%mm1 \n\t" // _mask16_1 -> mm1
RESTORE_rbp
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm1 \n\t"
- "pand %%mm7, %%mm2 \n\t"
"pcmpeqb %%mm6, %%mm0 \n\t"
"pcmpeqb %%mm6, %%mm1 \n\t"
- "pcmpeqb %%mm6, %%mm2 \n\t"
// preload "movl len, %%ecx \n\t" // load length of line
-// preload "movl srcptr, %%esi \n\t" // load source
-// preload "movl dstptr, %%edi \n\t" // load dest
+// preload "movl srcptr, %3 \n\t" // load source
+// preload "movl dstptr, %4 \n\t" // load dest
"cmpl $0, %%ecx \n\t"
- "jz mainloop24end \n\t"
+ "jz mainloop16end \n\t"
- "mainloop24: \n\t"
- "movq (%%esi), %%mm4 \n\t"
+ "mainloop16: \n\t"
+ "movq (%3), %%mm4 \n\t"
"pand %%mm0, %%mm4 \n\t"
"movq %%mm0, %%mm6 \n\t"
- "movq (%%edi), %%mm7 \n\t"
+ "movq (%4), %%mm7 \n\t"
"pandn %%mm7, %%mm6 \n\t"
"por %%mm6, %%mm4 \n\t"
- "movq %%mm4, (%%edi) \n\t"
+ "movq %%mm4, (%4) \n\t"
- "movq 8(%%esi), %%mm5 \n\t"
+ "movq 8(%3), %%mm5 \n\t"
"pand %%mm1, %%mm5 \n\t"
"movq %%mm1, %%mm7 \n\t"
- "movq 8(%%edi), %%mm6 \n\t"
+ "movq 8(%4), %%mm6 \n\t"
"pandn %%mm6, %%mm7 \n\t"
"por %%mm7, %%mm5 \n\t"
- "movq %%mm5, 8(%%edi) \n\t"
-
- "movq 16(%%esi), %%mm6 \n\t"
- "pand %%mm2, %%mm6 \n\t"
- "movq %%mm2, %%mm4 \n\t"
- "movq 16(%%edi), %%mm7 \n\t"
- "pandn %%mm7, %%mm4 \n\t"
- "por %%mm4, %%mm6 \n\t"
- "movq %%mm6, 16(%%edi) \n\t"
+ "movq %%mm5, 8(%4) \n\t"
- "addl $24, %%esi \n\t" // inc by 24 bytes processed
- "addl $24, %%edi \n\t"
+ "add $16, %3 \n\t" // inc by 16 bytes processed
+ "add $16, %4 \n\t"
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
+ "ja mainloop16 \n\t"
- "ja mainloop24 \n\t"
-
- "mainloop24end: \n\t"
+ "mainloop16end: \n\t"
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
"movl %%eax, %%ecx \n\t"
"cmpl $0, %%ecx \n\t"
- "jz end24 \n\t"
+ "jz end16 \n\t"
// preload "movl mask, %%edx \n\t"
"sall $24, %%edx \n\t" // make low byte, high byte
- "secondloop24: \n\t"
+ "secondloop16: \n\t"
"sall %%edx \n\t" // move high bit to CF
- "jnc skip24 \n\t" // if CF = 0
- "movw (%%esi), %%ax \n\t"
- "movw %%ax, (%%edi) \n\t"
- "xorl %%eax, %%eax \n\t"
- "movb 2(%%esi), %%al \n\t"
- "movb %%al, 2(%%edi) \n\t"
+ "jnc skip16 \n\t" // if CF = 0
+ "movw (%3), %%ax \n\t"
+ "movw %%ax, (%4) \n\t"
- "skip24: \n\t"
- "addl $3, %%esi \n\t"
- "addl $3, %%edi \n\t"
+ "skip16: \n\t"
+ "add $2, %3 \n\t"
+ "add $2, %4 \n\t"
"decl %%ecx \n\t"
- "jnz secondloop24 \n\t"
+ "jnz secondloop16 \n\t"
- "end24: \n\t"
+ "end16: \n\t"
"EMMS \n\t" // DONE
: "=a" (dummy_value_a), // output regs (dummy)
@@ -1654,32 +1700,31 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
"=S" (dummy_value_S),
"=D" (dummy_value_D)
- : "3" (srcptr), // esi // input regs
- "4" (dstptr), // edi
- "0" (diff), // eax
-// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
+ : "0" (diff), // eax // input regs
+ "1" (mask), // edx
"2" (len), // ecx
- "1" (mask) // edx
+// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
+ "3" (srcptr), // esi/rsi
+ "4" (dstptr) // edi/rdi
#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
- : "%mm0", "%mm1", "%mm2" // clobber list
- , "%mm4", "%mm5", "%mm6", "%mm7"
+ : "%mm0", "%mm1", "%mm4" // clobber list
+ , "%mm5", "%mm6", "%mm7"
#endif
);
}
- else /* mmx _not supported - Use modified C routine */
-#endif /* PNG_MMX_CODE_SUPPORTED */
+ else /* not _mmx_supported - use modified C routine */
{
register png_uint_32 i;
- png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
+ png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
- register int stride = BPP3 * png_pass_inc[png_ptr->pass];
+ register int stride = BPP2 * png_pass_inc[png_ptr->pass];
/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
- register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
+ register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
int diff = (int) (png_ptr->width & 7); /* amount lost */
- register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */
+ register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
srcptr = png_ptr->row_buf + 1 + initial_val;
dstptr = row + initial_val;
@@ -1692,7 +1737,7 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
}
if (diff) /* number of leftover pixels: 3 for pngtest */
{
- final_val += diff*BPP3;
+ final_val += diff*BPP2;
for (; i < final_val; i += stride)
{
if (rep_bytes > (int)(final_val-i))
@@ -1705,17 +1750,15 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
} /* end of else (_mmx_supported) */
break;
- } /* end 24 bpp */
+ } /* end 16 bpp */
case 48: /* png_ptr->row_info.pixel_depth */
{
png_bytep srcptr;
png_bytep dstptr;
-#if defined(PNG_MMX_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
- if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
- /* && _mmx_supported */ )
+ if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
#else
if (_mmx_supported)
#endif
@@ -1725,8 +1768,8 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
int dummy_value_a; // fix 'forbidden register spilled' error
int dummy_value_d;
int dummy_value_c;
- int dummy_value_S;
- int dummy_value_D;
+ png_bytep dummy_value_S;
+ png_bytep dummy_value_D;
srcptr = png_ptr->row_buf + 1;
dstptr = row;
@@ -1766,57 +1809,57 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
"pcmpeqb %%mm6, %%mm5 \n\t"
// preload "movl len, %%ecx \n\t" // load length of line
-// preload "movl srcptr, %%esi \n\t" // load source
-// preload "movl dstptr, %%edi \n\t" // load dest
+// preload "movl srcptr, %3 \n\t" // load source
+// preload "movl dstptr, %4 \n\t" // load dest
"cmpl $0, %%ecx \n\t"
"jz mainloop48end \n\t"
"mainloop48: \n\t"
- "movq (%%esi), %%mm7 \n\t"
+ "movq (%3), %%mm7 \n\t"
"pand %%mm0, %%mm7 \n\t"
"movq %%mm0, %%mm6 \n\t"
- "pandn (%%edi), %%mm6 \n\t"
+ "pandn (%4), %%mm6 \n\t"
"por %%mm6, %%mm7 \n\t"
- "movq %%mm7, (%%edi) \n\t"
+ "movq %%mm7, (%4) \n\t"
- "movq 8(%%esi), %%mm6 \n\t"
+ "movq 8(%3), %%mm6 \n\t"
"pand %%mm1, %%mm6 \n\t"
"movq %%mm1, %%mm7 \n\t"
- "pandn 8(%%edi), %%mm7 \n\t"
+ "pandn 8(%4), %%mm7 \n\t"
"por %%mm7, %%mm6 \n\t"
- "movq %%mm6, 8(%%edi) \n\t"
+ "movq %%mm6, 8(%4) \n\t"
- "movq 16(%%esi), %%mm6 \n\t"
+ "movq 16(%3), %%mm6 \n\t"
"pand %%mm2, %%mm6 \n\t"
"movq %%mm2, %%mm7 \n\t"
- "pandn 16(%%edi), %%mm7 \n\t"
+ "pandn 16(%4), %%mm7 \n\t"
"por %%mm7, %%mm6 \n\t"
- "movq %%mm6, 16(%%edi) \n\t"
+ "movq %%mm6, 16(%4) \n\t"
- "movq 24(%%esi), %%mm7 \n\t"
+ "movq 24(%3), %%mm7 \n\t"
"pand %%mm3, %%mm7 \n\t"
"movq %%mm3, %%mm6 \n\t"
- "pandn 24(%%edi), %%mm6 \n\t"
+ "pandn 24(%4), %%mm6 \n\t"
"por %%mm6, %%mm7 \n\t"
- "movq %%mm7, 24(%%edi) \n\t"
+ "movq %%mm7, 24(%4) \n\t"
- "movq 32(%%esi), %%mm6 \n\t"
+ "movq 32(%3), %%mm6 \n\t"
"pand %%mm4, %%mm6 \n\t"
"movq %%mm4, %%mm7 \n\t"
- "pandn 32(%%edi), %%mm7 \n\t"
+ "pandn 32(%4), %%mm7 \n\t"
"por %%mm7, %%mm6 \n\t"
- "movq %%mm6, 32(%%edi) \n\t"
+ "movq %%mm6, 32(%4) \n\t"
- "movq 40(%%esi), %%mm7 \n\t"
+ "movq 40(%3), %%mm7 \n\t"
"pand %%mm5, %%mm7 \n\t"
"movq %%mm5, %%mm6 \n\t"
- "pandn 40(%%edi), %%mm6 \n\t"
+ "pandn 40(%4), %%mm6 \n\t"
"por %%mm6, %%mm7 \n\t"
- "movq %%mm7, 40(%%edi) \n\t"
+ "movq %%mm7, 40(%4) \n\t"
- "addl $48, %%esi \n\t" // inc by 48 bytes processed
- "addl $48, %%edi \n\t"
+ "add $48, %3 \n\t" // inc by 48 bytes processed
+ "add $48, %4 \n\t"
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
"ja mainloop48 \n\t"
@@ -1832,12 +1875,12 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
"secondloop48: \n\t"
"sall %%edx \n\t" // move high bit to CF
"jnc skip48 \n\t" // if CF = 0
- "movl (%%esi), %%eax \n\t"
- "movl %%eax, (%%edi) \n\t"
+ "movl (%3), %%eax \n\t"
+ "movl %%eax, (%4) \n\t"
"skip48: \n\t"
- "addl $4, %%esi \n\t"
- "addl $4, %%edi \n\t"
+ "add $4, %3 \n\t"
+ "add $4, %4 \n\t"
"decl %%ecx \n\t"
"jnz secondloop48 \n\t"
@@ -1850,12 +1893,12 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
"=S" (dummy_value_S),
"=D" (dummy_value_D)
- : "3" (srcptr), // esi // input regs
- "4" (dstptr), // edi
- "0" (diff), // eax
-// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
+ : "0" (diff), // eax // input regs
+ "1" (mask), // edx
"2" (len), // ecx
- "1" (mask) // edx
+// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
+ "3" (srcptr), // esi/rsi
+ "4" (dstptr) // edi/rdi
#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
: "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
@@ -1863,8 +1906,7 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
#endif
);
}
- else /* mmx _not supported - Use modified C routine */
-#endif /* PNG_MMX_CODE_SUPPORTED */
+ else /* not _mmx_supported - use modified C routine */
{
register png_uint_32 i;
png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
@@ -1945,8 +1987,11 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
{
- /* this should never happen */
- png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
+ // ERROR: SHOULD NEVER BE REACHED
+#if defined(PNG_DEBUG)
+ png_debug(1, "Internal libpng logic error (GCC "
+ "png_combine_row() pixel_depth)\n");
+#endif
break;
}
} /* end switch (png_ptr->row_info.pixel_depth) */
@@ -1985,7 +2030,6 @@ png_do_read_interlace(png_structp png_ptr)
png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
-#if defined(PNG_MMX_CODE_SUPPORTED)
if (_mmx_supported == 2) {
#if !defined(PNG_1_0_X)
/* this should have happened in png_init_mmx_flags() already */
@@ -1993,7 +2037,6 @@ png_do_read_interlace(png_structp png_ptr)
#endif
png_mmx_support();
}
-#endif
if (row != NULL && row_info != NULL)
{
@@ -2192,30 +2235,106 @@ png_do_read_interlace(png_structp png_ptr)
/* New code by Nirav Chhatrapati - Intel Corporation */
-#if defined(PNG_MMX_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
- if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
- /* && _mmx_supported */ )
+ if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
#else
if (_mmx_supported)
#endif
{
+ int dummy_value_c; // fix 'forbidden register spilled'
+ png_bytep dummy_value_S;
+ png_bytep dummy_value_D;
+ png_bytep dummy_value_a;
+ png_bytep dummy_value_d;
+
//--------------------------------------------------------------
- if (pixel_bytes == 3)
+ if (pixel_bytes == BPP3)
{
- if (((pass == 0) || (pass == 1)) && width)
+ if (((pass == 4) || (pass == 5)) && width)
{
- int dummy_value_c; // fix 'forbidden register spilled'
- int dummy_value_S;
- int dummy_value_D;
- long dummy_value_a;
+ int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh?
+ if (width_mmx < 0)
+ width_mmx = 0;
+ width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
+ if (width_mmx)
+ {
+ // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
+ // sptr points at last pixel in pre-expanded row
+ // dp points at last pixel position in expanded row
+ __asm__ __volatile__ (
+ "sub $3, %1 \n\t"
+ "sub $9, %2 \n\t"
+ // (png_pass_inc[pass] + 1)*pixel_bytes
+
+ ".loop3_pass4: \n\t"
+ "movq (%1), %%mm0 \n\t" // x x 5 4 3 2 1 0
+ "movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
+ "movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
+ "psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
+ "pand (%3), %%mm1 \n\t" // z z z z z 2 1 0
+ "psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
+ "por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
+ "movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
+ "psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
+ "movq %%mm0, (%2) \n\t"
+ "psrlq $16, %%mm3 \n\t" // z z z z z x x 5
+ "pand (%4), %%mm3 \n\t" // z z z z z z z 5
+ "por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
+ "sub $6, %1 \n\t"
+ "movd %%mm2, 8(%2) \n\t"
+ "sub $12, %2 \n\t"
+ "subl $2, %%ecx \n\t"
+ "jnz .loop3_pass4 \n\t"
+ "EMMS \n\t" // DONE
+
+ : "=c" (dummy_value_c), // output regs (dummy)
+ "=S" (dummy_value_S),
+ "=D" (dummy_value_D),
+ "=a" (dummy_value_a),
+ "=d" (dummy_value_d)
+
+ : "0" (width_mmx), // ecx // input regs
+ "1" (sptr), // esi/rsi
+ "2" (dp), // edi/rdi
+#if defined(PNG_x86_64_USE_GOTPCREL) // formerly _const4 and _const6:
+ "3" (&_c64._amask5_3_0), // (0x0000000000FFFFFFLL)
+ "4" (&_c64._amask7_1_0) // (0x00000000000000FFLL)
+#else
+ "3" (&_amask5_3_0), // eax (0x0000000000FFFFFFLL)
+ "4" (&_amask7_1_0) // edx (0x00000000000000FFLL)
+#endif
+
+#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
+ : "%mm0", "%mm1" // clobber list
+ , "%mm2", "%mm3"
+#endif
+ );
+ }
+
+ sptr -= width_mmx*BPP3;
+ dp -= width_mmx*2*BPP3;
+ for (i = width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+ png_memcpy(v, sptr, BPP3);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ png_memcpy(dp, v, BPP3);
+ dp -= BPP3;
+ }
+ sptr -= BPP3;
+ }
+ }
+ else if (((pass == 2) || (pass == 3)) && width)
+ {
__asm__ __volatile__ (
- "subl $21, %%edi \n\t"
+ "sub $9, %2 \n\t"
// (png_pass_inc[pass] - 1)*pixel_bytes
- ".loop3_pass0: \n\t"
- "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
+ ".loop3_pass2: \n\t"
+ "movd (%1), %%mm0 \n\t" // x x x x x 2 1 0
"pand (%3), %%mm0 \n\t" // z z z z z 2 1 0
"movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
"psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
@@ -2224,19 +2343,13 @@ png_do_read_interlace(png_structp png_ptr)
"psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
"por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
"por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
- "movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
- "psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
- "movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
- "punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
- "movq %%mm4, 16(%%edi) \n\t"
- "psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
- "movq %%mm3, 8(%%edi) \n\t"
- "punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
- "subl $3, %%esi \n\t"
- "movq %%mm0, (%%edi) \n\t"
- "subl $24, %%edi \n\t"
+ "movq %%mm0, 4(%2) \n\t"
+ "psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
+ "sub $3, %1 \n\t"
+ "movd %%mm0, (%2) \n\t"
+ "sub $12, %2 \n\t"
"decl %%ecx \n\t"
- "jnz .loop3_pass0 \n\t"
+ "jnz .loop3_pass2 \n\t"
"EMMS \n\t" // DONE
: "=c" (dummy_value_c), // output regs (dummy)
@@ -2244,34 +2357,28 @@ png_do_read_interlace(png_structp png_ptr)
"=D" (dummy_value_D),
"=a" (dummy_value_a)
- : "1" (sptr), // esi // input regs
- "2" (dp), // edi
- "0" (width), // ecx
-#if defined(PNG_x86_64_USE_GOTPCREL) // formerly _const4:
+ : "0" (width), // ecx // input regs
+ "1" (sptr), // esi/rsi
+ "2" (dp), // edi/rdi
+#if defined(PNG_x86_64_USE_GOTPCREL) // formerly _const4:
"3" (&_c64._amask5_3_0) // (0x0000000000FFFFFFLL)
#else
- "3" (&_amask5_3_0) // (0x0000000000FFFFFFLL)
+ "3" (&_amask5_3_0) // eax (0x0000000000FFFFFFLL)
#endif
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
+#if defined(CLOBBER_MMX_REGS_SUPPORTED)
: "%mm0", "%mm1", "%mm2" // clobber list
- , "%mm3", "%mm4"
#endif
);
}
- else if (((pass == 2) || (pass == 3)) && width)
+ else if (width) // && ((pass == 0) || (pass == 1))
{
- int dummy_value_c; // fix 'forbidden register spilled'
- int dummy_value_S;
- int dummy_value_D;
- long dummy_value_a;
-
__asm__ __volatile__ (
- "subl $9, %%edi \n\t"
+ "sub $21, %2 \n\t"
// (png_pass_inc[pass] - 1)*pixel_bytes
- ".loop3_pass2: \n\t"
- "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
+ ".loop3_pass0: \n\t"
+ "movd (%1), %%mm0 \n\t" // x x x x x 2 1 0
"pand (%3), %%mm0 \n\t" // z z z z z 2 1 0
"movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
"psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
@@ -2280,13 +2387,19 @@ png_do_read_interlace(png_structp png_ptr)
"psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
"por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
"por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
- "movq %%mm0, 4(%%edi) \n\t"
- "psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
- "subl $3, %%esi \n\t"
- "movd %%mm0, (%%edi) \n\t"
- "subl $12, %%edi \n\t"
+ "movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
+ "psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
+ "movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
+ "punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
+ "movq %%mm4, 16(%2) \n\t"
+ "psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
+ "movq %%mm3, 8(%2) \n\t"
+ "punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
+ "sub $3, %1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "sub $24, %2 \n\t"
"decl %%ecx \n\t"
- "jnz .loop3_pass2 \n\t"
+ "jnz .loop3_pass0 \n\t"
"EMMS \n\t" // DONE
: "=c" (dummy_value_c), // output regs (dummy)
@@ -2294,226 +2407,168 @@ png_do_read_interlace(png_structp png_ptr)
"=D" (dummy_value_D),
"=a" (dummy_value_a)
- : "1" (sptr), // esi // input regs
- "2" (dp), // edi
- "0" (width), // ecx
-#if defined(PNG_x86_64_USE_GOTPCREL) // formerly _const4:
+ : "0" (width), // ecx // input regs
+ "1" (sptr), // esi/rsi
+ "2" (dp), // edi/rdi
+#if defined(PNG_x86_64_USE_GOTPCREL) // formerly _const4:
"3" (&_c64._amask5_3_0) // (0x0000000000FFFFFFLL)
#else
- "3" (&_amask5_3_0) // (0x0000000000FFFFFFLL)
+ "3" (&_amask5_3_0) // eax (0x0000000000FFFFFFLL)
#endif
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
+#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
: "%mm0", "%mm1", "%mm2" // clobber list
+ , "%mm3", "%mm4"
#endif
);
}
- else if (width) /* && ((pass == 4) || (pass == 5)) */
+ } /* end of pixel_bytes == 3 */
+
+ //--------------------------------------------------------------
+ else if (pixel_bytes == BPP4)
+ {
+ if (((pass == 4) || (pass == 5)) && width)
{
- int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh?
- if (width_mmx < 0)
- width_mmx = 0;
- width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
+ int width_mmx = ((width >> 1) << 1) ;
+ width -= width_mmx; // 0,1 pixels => 0,4 bytes
if (width_mmx)
{
- // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
- // sptr points at last pixel in pre-expanded row
- // dp points at last pixel position in expanded row
- int dummy_value_c; // fix 'forbidden register spilled'
- int dummy_value_S;
- int dummy_value_D;
- long dummy_value_a;
- long dummy_value_d;
-
__asm__ __volatile__ (
- "subl $3, %%esi \n\t"
- "subl $9, %%edi \n\t"
- // (png_pass_inc[pass] + 1)*pixel_bytes
+ "sub $4, %1 \n\t"
+ "sub $12, %2 \n\t"
- ".loop3_pass4: \n\t"
- "movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
- "movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
- "movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
- "psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
- "pand (%3), %%mm1 \n\t" // z z z z z 2 1 0
- "psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
- "por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
- "movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
- "psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
- "movq %%mm0, (%%edi) \n\t"
- "psrlq $16, %%mm3 \n\t" // z z z z z x x 5
- "pand (%4), %%mm3 \n\t" // z z z z z z z 5
- "por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
- "subl $6, %%esi \n\t"
- "movd %%mm2, 8(%%edi) \n\t"
- "subl $12, %%edi \n\t"
+ ".loop4_pass4: \n\t"
+ "movq (%1), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
+ "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
+ "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
+ "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
+ "movq %%mm0, (%2) \n\t"
+ "sub $8, %1 \n\t"
+ "movq %%mm1, 8(%2) \n\t"
+ "sub $16, %2 \n\t"
"subl $2, %%ecx \n\t"
- "jnz .loop3_pass4 \n\t"
+ "jnz .loop4_pass4 \n\t"
"EMMS \n\t" // DONE
: "=c" (dummy_value_c), // output regs (dummy)
"=S" (dummy_value_S),
- "=D" (dummy_value_D),
- "=a" (dummy_value_a),
- "=d" (dummy_value_d)
+ "=D" (dummy_value_D)
- : "1" (sptr), // esi // input regs
- "2" (dp), // edi
- "0" (width_mmx), // ecx
-#if defined(PNG_x86_64_USE_GOTPCREL) // formerly _const4 and _const6:
- "3" (&_c64._amask5_3_0), // (0x0000000000FFFFFFLL)
- "4" (&_c64._amask7_1_0) // (0x00000000000000FFLL)
-#else
- "3" (&_amask5_3_0), // (0x0000000000FFFFFFLL)
- "4" (&_amask7_1_0) // (0x00000000000000FFLL)
-#endif
+ : "0" (width_mmx), // ecx // input regs
+ "1" (sptr), // esi/rsi
+ "2" (dp) // edi/rdi
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
+#if defined(CLOBBER_MMX_REGS_SUPPORTED)
: "%mm0", "%mm1" // clobber list
- , "%mm2", "%mm3"
#endif
);
}
- sptr -= width_mmx*3;
- dp -= width_mmx*6;
+ sptr -= (width_mmx*BPP4 - BPP4); // sign fixed
+ dp -= (width_mmx*2*BPP4 - BPP4); // sign fixed
for (i = width; i; i--)
{
png_byte v[8];
int j;
-
- png_memcpy(v, sptr, 3);
+ sptr -= BPP4;
+ png_memcpy(v, sptr, BPP4);
for (j = 0; j < png_pass_inc[pass]; j++)
{
- png_memcpy(dp, v, 3);
- dp -= 3;
+ dp -= BPP4;
+ png_memcpy(dp, v, BPP4);
}
- sptr -= 3;
}
}
- } /* end of pixel_bytes == 3 */
-
- //--------------------------------------------------------------
- else if (pixel_bytes == 1)
- {
- if (((pass == 0) || (pass == 1)) && width)
+ else if (((pass == 2) || (pass == 3)) && width)
{
- int width_mmx = ((width >> 2) << 2);
- width -= width_mmx; // 0-3 pixels => 0-3 bytes
+ int width_mmx = ((width >> 1) << 1);
+ width -= width_mmx; // 0,1 pixels => 0,4 bytes
if (width_mmx)
{
- int dummy_value_c; // fix 'forbidden register spilled'
- int dummy_value_S;
- int dummy_value_D;
-
__asm__ __volatile__ (
- "subl $3, %%esi \n\t"
- "subl $31, %%edi \n\t"
+ "sub $4, %1 \n\t"
+ "sub $28, %2 \n\t"
- ".loop1_pass0: \n\t"
- "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
- "movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
- "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
- "movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
- "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
- "movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
- "punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
- "punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
- "movq %%mm0, (%%edi) \n\t"
- "punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
- "movq %%mm3, 8(%%edi) \n\t"
- "movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
- "punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
- "punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
- "movq %%mm2, 16(%%edi) \n\t"
- "subl $4, %%esi \n\t"
- "movq %%mm4, 24(%%edi) \n\t"
- "subl $32, %%edi \n\t"
- "subl $4, %%ecx \n\t"
- "jnz .loop1_pass0 \n\t"
+ ".loop4_pass2: \n\t"
+ "movq (%1), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
+ "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
+ "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
+ "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm0, 8(%2) \n\t"
+ "movq %%mm1, 16(%2) \n\t"
+ "movq %%mm1, 24(%2) \n\t"
+ "sub $8, %1 \n\t"
+ "sub $32, %2 \n\t"
+ "subl $2, %%ecx \n\t"
+ "jnz .loop4_pass2 \n\t"
"EMMS \n\t" // DONE
: "=c" (dummy_value_c), // output regs (dummy)
"=S" (dummy_value_S),
"=D" (dummy_value_D)
- : "1" (sptr), // esi // input regs
- "2" (dp), // edi
- "0" (width_mmx) // ecx
+ : "0" (width_mmx), // ecx // input regs
+ "1" (sptr), // esi/rsi
+ "2" (dp) // edi/rdi
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
- : "%mm0", "%mm1", "%mm2" // clobber list
- , "%mm3", "%mm4"
+#if defined(CLOBBER_MMX_REGS_SUPPORTED)
+ : "%mm0", "%mm1" // clobber list
#endif
);
}
- sptr -= width_mmx;
- dp -= width_mmx*8;
+ sptr -= (width_mmx*4 - 4); // sign fixed
+ dp -= (width_mmx*16 - 4); // sign fixed
for (i = width; i; i--)
{
+ png_byte v[8];
int j;
-
- /* I simplified this part in version 1.0.4e
- * here and in several other instances where
- * pixel_bytes == 1 -- GR-P
- *
- * Original code:
- *
- * png_byte v[8];
- * png_memcpy(v, sptr, pixel_bytes);
- * for (j = 0; j < png_pass_inc[pass]; j++)
- * {
- * png_memcpy(dp, v, pixel_bytes);
- * dp -= pixel_bytes;
- * }
- * sptr -= pixel_bytes;
- *
- * Replacement code is in the next three lines:
- */
-
+ sptr -= 4;
+ png_memcpy(v, sptr, 4);
for (j = 0; j < png_pass_inc[pass]; j++)
{
- *dp-- = *sptr;
+ dp -= 4;
+ png_memcpy(dp, v, 4);
}
- --sptr;
}
}
- else if (((pass == 2) || (pass == 3)) && width)
+ else if (width) // && ((pass == 0) || (pass == 1))
{
- int width_mmx = ((width >> 2) << 2);
- width -= width_mmx; // 0-3 pixels => 0-3 bytes
+ int width_mmx = ((width >> 1) << 1);
+ width -= width_mmx; // 0,1 pixels => 0,4 bytes
if (width_mmx)
{
- int dummy_value_c; // fix 'forbidden register spilled'
- int dummy_value_S;
- int dummy_value_D;
-
__asm__ __volatile__ (
- "subl $3, %%esi \n\t"
- "subl $15, %%edi \n\t"
+ "sub $4, %1 \n\t"
+ "sub $60, %2 \n\t"
- ".loop1_pass2: \n\t"
- "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
- "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
- "movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
- "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
- "punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
- "movq %%mm0, (%%edi) \n\t"
- "subl $4, %%esi \n\t"
- "movq %%mm1, 8(%%edi) \n\t"
- "subl $16, %%edi \n\t"
- "subl $4, %%ecx \n\t"
- "jnz .loop1_pass2 \n\t"
+ ".loop4_pass0: \n\t"
+ "movq (%1), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
+ "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
+ "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
+ "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm0, 8(%2) \n\t"
+ "movq %%mm0, 16(%2) \n\t"
+ "movq %%mm0, 24(%2) \n\t"
+ "movq %%mm1, 32(%2) \n\t"
+ "movq %%mm1, 40(%2) \n\t"
+ "movq %%mm1, 48(%2) \n\t"
+ "sub $8, %1 \n\t"
+ "movq %%mm1, 56(%2) \n\t"
+ "sub $64, %2 \n\t"
+ "subl $2, %%ecx \n\t"
+ "jnz .loop4_pass0 \n\t"
"EMMS \n\t" // DONE
: "=c" (dummy_value_c), // output regs (dummy)
"=S" (dummy_value_S),
"=D" (dummy_value_D)
- : "1" (sptr), // esi // input regs
- "2" (dp), // edi
- "0" (width_mmx) // ecx
+ : "0" (width_mmx), // ecx // input regs
+ "1" (sptr), // esi/rsi
+ "2" (dp) // edi/rdi
#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
: "%mm0", "%mm1" // clobber list
@@ -2521,53 +2576,56 @@ png_do_read_interlace(png_structp png_ptr)
);
}
- sptr -= width_mmx;
- dp -= width_mmx*4;
+ sptr -= (width_mmx*4 - 4); // sign fixed
+ dp -= (width_mmx*32 - 4); // sign fixed
for (i = width; i; i--)
{
+ png_byte v[8];
int j;
-
+ sptr -= 4;
+ png_memcpy(v, sptr, 4);
for (j = 0; j < png_pass_inc[pass]; j++)
{
- *dp-- = *sptr;
+ dp -= 4;
+ png_memcpy(dp, v, 4);
}
- --sptr;
}
}
- else if (width) /* && ((pass == 4) || (pass == 5)) */
+ } /* end of pixel_bytes == 4 */
+
+ //--------------------------------------------------------------
+ else if (pixel_bytes == 1)
+ {
+ if (((pass == 4) || (pass == 5)) && width)
{
int width_mmx = ((width >> 3) << 3);
width -= width_mmx; // 0-3 pixels => 0-3 bytes
if (width_mmx)
{
- int dummy_value_c; // fix 'forbidden register spilled'
- int dummy_value_S;
- int dummy_value_D;
-
__asm__ __volatile__ (
- "subl $7, %%esi \n\t"
- "subl $15, %%edi \n\t"
+ "sub $7, %1 \n\t"
+ "sub $15, %2 \n\t"
".loop1_pass4: \n\t"
- "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
+ "movq (%1), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
"punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
"punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
- "movq %%mm1, 8(%%edi) \n\t"
- "subl $8, %%esi \n\t"
- "movq %%mm0, (%%edi) \n\t"
- "subl $16, %%edi \n\t"
+ "movq %%mm1, 8(%2) \n\t"
+ "sub $8, %1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "sub $16, %2 \n\t"
"subl $8, %%ecx \n\t"
"jnz .loop1_pass4 \n\t"
"EMMS \n\t" // DONE
- : "=c" (dummy_value_c), // output regs (none)
+ : "=c" (dummy_value_c), // output regs (dummy)
"=S" (dummy_value_S),
"=D" (dummy_value_D)
- : "1" (sptr), // esi // input regs
- "2" (dp), // edi
- "0" (width_mmx) // ecx
+ : "0" (width_mmx), // ecx // input regs
+ "1" (sptr), // esi/rsi
+ "2" (dp) // edi/rdi
#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
: "%mm0", "%mm1" // clobber list
@@ -2588,147 +2646,157 @@ png_do_read_interlace(png_structp png_ptr)
--sptr;
}
}
- } /* end of pixel_bytes == 1 */
-
- //--------------------------------------------------------------
- else if (pixel_bytes == 2)
- {
- if (((pass == 0) || (pass == 1)) && width)
+ else if (((pass == 2) || (pass == 3)) && width)
{
- int width_mmx = ((width >> 1) << 1);
- width -= width_mmx; // 0,1 pixels => 0,2 bytes
+ int width_mmx = ((width >> 2) << 2);
+ width -= width_mmx; // 0-3 pixels => 0-3 bytes
if (width_mmx)
{
- int dummy_value_c; // fix 'forbidden register spilled'
- int dummy_value_S;
- int dummy_value_D;
-
__asm__ __volatile__ (
- "subl $2, %%esi \n\t"
- "subl $30, %%edi \n\t"
+ "sub $3, %1 \n\t"
+ "sub $15, %2 \n\t"
- ".loop2_pass0: \n\t"
- "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
- "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
- "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
- "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
- "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
- "movq %%mm0, (%%edi) \n\t"
- "movq %%mm0, 8(%%edi) \n\t"
- "movq %%mm1, 16(%%edi) \n\t"
- "subl $4, %%esi \n\t"
- "movq %%mm1, 24(%%edi) \n\t"
- "subl $32, %%edi \n\t"
- "subl $2, %%ecx \n\t"
- "jnz .loop2_pass0 \n\t"
+ ".loop1_pass2: \n\t"
+ "movd (%1), %%mm0 \n\t" // x x x x 3 2 1 0
+ "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
+ "movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
+ "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
+ "punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
+ "movq %%mm0, (%2) \n\t"
+ "sub $4, %1 \n\t"
+ "movq %%mm1, 8(%2) \n\t"
+ "sub $16, %2 \n\t"
+ "subl $4, %%ecx \n\t"
+ "jnz .loop1_pass2 \n\t"
"EMMS \n\t" // DONE
: "=c" (dummy_value_c), // output regs (dummy)
"=S" (dummy_value_S),
"=D" (dummy_value_D)
- : "1" (sptr), // esi // input regs
- "2" (dp), // edi
- "0" (width_mmx) // ecx
+ : "0" (width_mmx), // ecx // input regs
+ "1" (sptr), // esi/rsi
+ "2" (dp) // edi/rdi
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
+#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
: "%mm0", "%mm1" // clobber list
#endif
);
}
- sptr -= (width_mmx*2 - 2); // sign fixed
- dp -= (width_mmx*16 - 2); // sign fixed
+ sptr -= width_mmx;
+ dp -= width_mmx*4;
for (i = width; i; i--)
{
- png_byte v[8];
int j;
- sptr -= 2;
- png_memcpy(v, sptr, 2);
+
for (j = 0; j < png_pass_inc[pass]; j++)
{
- dp -= 2;
- png_memcpy(dp, v, 2);
+ *dp-- = *sptr;
}
+ --sptr;
}
}
- else if (((pass == 2) || (pass == 3)) && width)
+ else if (width) // && ((pass == 0) || (pass == 1))
{
- int width_mmx = ((width >> 1) << 1) ;
- width -= width_mmx; // 0,1 pixels => 0,2 bytes
+ int width_mmx = ((width >> 2) << 2);
+ width -= width_mmx; // 0-3 pixels => 0-3 bytes
if (width_mmx)
{
- int dummy_value_c; // fix 'forbidden register spilled'
- int dummy_value_S;
- int dummy_value_D;
-
__asm__ __volatile__ (
- "subl $2, %%esi \n\t"
- "subl $14, %%edi \n\t"
+ "sub $3, %1 \n\t"
+ "sub $31, %2 \n\t"
- ".loop2_pass2: \n\t"
- "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
- "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
- "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
- "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
- "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
- "movq %%mm0, (%%edi) \n\t"
- "subl $4, %%esi \n\t"
- "movq %%mm1, 8(%%edi) \n\t"
- "subl $16, %%edi \n\t"
- "subl $2, %%ecx \n\t"
- "jnz .loop2_pass2 \n\t"
+ ".loop1_pass0: \n\t"
+ "movd (%1), %%mm0 \n\t" // x x x x 3 2 1 0
+ "movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
+ "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
+ "movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
+ "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
+ "movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
+ "punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
+ "punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
+ "movq %%mm0, (%2) \n\t"
+ "punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
+ "movq %%mm3, 8(%2) \n\t"
+ "movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
+ "punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
+ "punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
+ "movq %%mm2, 16(%2) \n\t"
+ "sub $4, %1 \n\t"
+ "movq %%mm4, 24(%2) \n\t"
+ "sub $32, %2 \n\t"
+ "subl $4, %%ecx \n\t"
+ "jnz .loop1_pass0 \n\t"
"EMMS \n\t" // DONE
: "=c" (dummy_value_c), // output regs (dummy)
"=S" (dummy_value_S),
"=D" (dummy_value_D)
- : "1" (sptr), // esi // input regs
- "2" (dp), // edi
- "0" (width_mmx) // ecx
+ : "0" (width_mmx), // ecx // input regs
+ "1" (sptr), // esi/rsi
+ "2" (dp) // edi/rdi
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
- : "%mm0", "%mm1" // clobber list
+#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
+ : "%mm0", "%mm1", "%mm2" // clobber list
+ , "%mm3", "%mm4"
#endif
);
}
- sptr -= (width_mmx*2 - 2); // sign fixed
- dp -= (width_mmx*8 - 2); // sign fixed
+ sptr -= width_mmx;
+ dp -= width_mmx*8;
for (i = width; i; i--)
{
- png_byte v[8];
int j;
- sptr -= 2;
- png_memcpy(v, sptr, 2);
+
+ /* I simplified this part in version 1.0.4e
+ * here and in several other instances where
+ * pixel_bytes == 1 -- GR-P
+ *
+ * Original code:
+ *
+ * png_byte v[8];
+ * png_memcpy(v, sptr, pixel_bytes);
+ * for (j = 0; j < png_pass_inc[pass]; j++)
+ * {
+ * png_memcpy(dp, v, pixel_bytes);
+ * dp -= pixel_bytes;
+ * }
+ * sptr -= pixel_bytes;
+ *
+ * Replacement code is in the next three lines:
+ */
+
for (j = 0; j < png_pass_inc[pass]; j++)
{
- dp -= 2;
- png_memcpy(dp, v, 2);
+ *dp-- = *sptr;
}
+ --sptr;
}
}
- else if (width) // pass == 4 or 5
+ } /* end of pixel_bytes == 1 */
+
+ //--------------------------------------------------------------
+ else if (pixel_bytes == BPP2)
+ {
+ if (((pass == 4) || (pass == 5)) && width)
{
int width_mmx = ((width >> 1) << 1) ;
width -= width_mmx; // 0,1 pixels => 0,2 bytes
if (width_mmx)
{
- int dummy_value_c; // fix 'forbidden register spilled'
- int dummy_value_S;
- int dummy_value_D;
-
__asm__ __volatile__ (
- "subl $2, %%esi \n\t"
- "subl $6, %%edi \n\t"
+ "sub $2, %1 \n\t"
+ "sub $6, %2 \n\t"
".loop2_pass4: \n\t"
- "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
+ "movd (%1), %%mm0 \n\t" // x x x x 3 2 1 0
"punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
- "subl $4, %%esi \n\t"
- "movq %%mm0, (%%edi) \n\t"
- "subl $8, %%edi \n\t"
+ "sub $4, %1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "sub $8, %2 \n\t"
"subl $2, %%ecx \n\t"
"jnz .loop2_pass4 \n\t"
"EMMS \n\t" // DONE
@@ -2737,9 +2805,9 @@ png_do_read_interlace(png_structp png_ptr)
"=S" (dummy_value_S),
"=D" (dummy_value_D)
- : "1" (sptr), // esi // input regs
- "2" (dp), // edi
- "0" (width_mmx) // ecx
+ : "0" (width_mmx), // ecx // input regs
+ "1" (sptr), // esi/rsi
+ "2" (dp) // edi/rdi
#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
: "%mm0" // clobber list
@@ -2747,124 +2815,52 @@ png_do_read_interlace(png_structp png_ptr)
);
}
- sptr -= (width_mmx*2 - 2); // sign fixed
- dp -= (width_mmx*4 - 2); // sign fixed
- for (i = width; i; i--)
- {
- png_byte v[8];
- int j;
- sptr -= 2;
- png_memcpy(v, sptr, 2);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- dp -= 2;
- png_memcpy(dp, v, 2);
- }
- }
- }
- } /* end of pixel_bytes == 2 */
-
- //--------------------------------------------------------------
- else if (pixel_bytes == 4)
- {
- if (((pass == 0) || (pass == 1)) && width)
- {
- int width_mmx = ((width >> 1) << 1);
- width -= width_mmx; // 0,1 pixels => 0,4 bytes
- if (width_mmx)
- {
- int dummy_value_c; // fix 'forbidden register spilled'
- int dummy_value_S;
- int dummy_value_D;
-
- __asm__ __volatile__ (
- "subl $4, %%esi \n\t"
- "subl $60, %%edi \n\t"
-
- ".loop4_pass0: \n\t"
- "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
- "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
- "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
- "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
- "movq %%mm0, (%%edi) \n\t"
- "movq %%mm0, 8(%%edi) \n\t"
- "movq %%mm0, 16(%%edi) \n\t"
- "movq %%mm0, 24(%%edi) \n\t"
- "movq %%mm1, 32(%%edi) \n\t"
- "movq %%mm1, 40(%%edi) \n\t"
- "movq %%mm1, 48(%%edi) \n\t"
- "subl $8, %%esi \n\t"
- "movq %%mm1, 56(%%edi) \n\t"
- "subl $64, %%edi \n\t"
- "subl $2, %%ecx \n\t"
- "jnz .loop4_pass0 \n\t"
- "EMMS \n\t" // DONE
-
- : "=c" (dummy_value_c), // output regs (dummy)
- "=S" (dummy_value_S),
- "=D" (dummy_value_D)
-
- : "1" (sptr), // esi // input regs
- "2" (dp), // edi
- "0" (width_mmx) // ecx
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
- : "%mm0", "%mm1" // clobber list
-#endif
- );
- }
-
- sptr -= (width_mmx*4 - 4); // sign fixed
- dp -= (width_mmx*32 - 4); // sign fixed
+ sptr -= (width_mmx*BPP2 - BPP2); // sign fixed
+ dp -= (width_mmx*2*BPP2 - BPP2); // sign fixed
for (i = width; i; i--)
{
png_byte v[8];
int j;
- sptr -= 4;
- png_memcpy(v, sptr, 4);
+ sptr -= BPP2;
+ png_memcpy(v, sptr, BPP2);
for (j = 0; j < png_pass_inc[pass]; j++)
{
- dp -= 4;
- png_memcpy(dp, v, 4);
+ dp -= BPP2;
+ png_memcpy(dp, v, BPP2);
}
}
}
else if (((pass == 2) || (pass == 3)) && width)
{
- int width_mmx = ((width >> 1) << 1);
- width -= width_mmx; // 0,1 pixels => 0,4 bytes
+ int width_mmx = ((width >> 1) << 1) ;
+ width -= width_mmx; // 0,1 pixels => 0,2 bytes
if (width_mmx)
{
- int dummy_value_c; // fix 'forbidden register spilled'
- int dummy_value_S;
- int dummy_value_D;
-
__asm__ __volatile__ (
- "subl $4, %%esi \n\t"
- "subl $28, %%edi \n\t"
+ "sub $2, %1 \n\t"
+ "sub $14, %2 \n\t"
- ".loop4_pass2: \n\t"
- "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
- "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
- "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
- "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
- "movq %%mm0, (%%edi) \n\t"
- "movq %%mm0, 8(%%edi) \n\t"
- "movq %%mm1, 16(%%edi) \n\t"
- "movq %%mm1, 24(%%edi) \n\t"
- "subl $8, %%esi \n\t"
- "subl $32, %%edi \n\t"
+ ".loop2_pass2: \n\t"
+ "movd (%1), %%mm0 \n\t" // x x x x 3 2 1 0
+ "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
+ "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
+ "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
+ "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
+ "movq %%mm0, (%2) \n\t"
+ "sub $4, %1 \n\t"
+ "movq %%mm1, 8(%2) \n\t"
+ "sub $16, %2 \n\t"
"subl $2, %%ecx \n\t"
- "jnz .loop4_pass2 \n\t"
+ "jnz .loop2_pass2 \n\t"
"EMMS \n\t" // DONE
: "=c" (dummy_value_c), // output regs (dummy)
"=S" (dummy_value_S),
"=D" (dummy_value_D)
- : "1" (sptr), // esi // input regs
- "2" (dp), // edi
- "0" (width_mmx) // ecx
+ : "0" (width_mmx), // ecx // input regs
+ "1" (sptr), // esi/rsi
+ "2" (dp) // edi/rdi
#if defined(CLOBBER_MMX_REGS_SUPPORTED)
: "%mm0", "%mm1" // clobber list
@@ -2872,55 +2868,54 @@ png_do_read_interlace(png_structp png_ptr)
);
}
- sptr -= (width_mmx*4 - 4); // sign fixed
- dp -= (width_mmx*16 - 4); // sign fixed
+ sptr -= (width_mmx*2 - 2); // sign fixed
+ dp -= (width_mmx*8 - 2); // sign fixed
for (i = width; i; i--)
{
png_byte v[8];
int j;
- sptr -= 4;
- png_memcpy(v, sptr, 4);
+ sptr -= 2;
+ png_memcpy(v, sptr, 2);
for (j = 0; j < png_pass_inc[pass]; j++)
{
- dp -= 4;
- png_memcpy(dp, v, 4);
+ dp -= 2;
+ png_memcpy(dp, v, 2);
}
}
}
- else if (width) // pass == 4 or 5
+ else if (width) // && ((pass == 0) || (pass == 1))
{
- int width_mmx = ((width >> 1) << 1) ;
- width -= width_mmx; // 0,1 pixels => 0,4 bytes
+ int width_mmx = ((width >> 1) << 1);
+ width -= width_mmx; // 0,1 pixels => 0,2 bytes
if (width_mmx)
{
- int dummy_value_c; // fix 'forbidden register spilled'
- int dummy_value_S;
- int dummy_value_D;
-
__asm__ __volatile__ (
- "subl $4, %%esi \n\t"
- "subl $12, %%edi \n\t"
+ "sub $2, %1 \n\t"
+ "sub $30, %2 \n\t"
- ".loop4_pass4: \n\t"
- "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
- "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
- "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
- "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
- "movq %%mm0, (%%edi) \n\t"
- "subl $8, %%esi \n\t"
- "movq %%mm1, 8(%%edi) \n\t"
- "subl $16, %%edi \n\t"
+ ".loop2_pass0: \n\t"
+ "movd (%1), %%mm0 \n\t" // x x x x 3 2 1 0
+ "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
+ "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
+ "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
+ "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm0, 8(%2) \n\t"
+ "movq %%mm1, 16(%2) \n\t"
+ "sub $4, %1 \n\t"
+ "movq %%mm1, 24(%2) \n\t"
+ "sub $32, %2 \n\t"
"subl $2, %%ecx \n\t"
- "jnz .loop4_pass4 \n\t"
+ "jnz .loop2_pass0 \n\t"
"EMMS \n\t" // DONE
: "=c" (dummy_value_c), // output regs (dummy)
"=S" (dummy_value_S),
"=D" (dummy_value_D)
- : "1" (sptr), // esi // input regs
- "2" (dp), // edi
- "0" (width_mmx) // ecx
+ : "0" (width_mmx), // ecx // input regs
+ "1" (sptr), // esi/rsi
+ "2" (dp) // edi/rdi
#if defined(CLOBBER_MMX_REGS_SUPPORTED)
: "%mm0", "%mm1" // clobber list
@@ -2928,64 +2923,54 @@ png_do_read_interlace(png_structp png_ptr)
);
}
- sptr -= (width_mmx*4 - 4); // sign fixed
- dp -= (width_mmx*8 - 4); // sign fixed
+ sptr -= (width_mmx*2 - 2); // sign fixed
+ dp -= (width_mmx*16 - 2); // sign fixed
for (i = width; i; i--)
{
png_byte v[8];
int j;
- sptr -= 4;
- png_memcpy(v, sptr, 4);
+ sptr -= 2;
+ png_memcpy(v, sptr, 2);
for (j = 0; j < png_pass_inc[pass]; j++)
{
- dp -= 4;
- png_memcpy(dp, v, 4);
+ dp -= 2;
+ png_memcpy(dp, v, 2);
}
}
}
- } /* end of pixel_bytes == 4 */
+ } /* end of pixel_bytes == 2 */
//--------------------------------------------------------------
- else if (pixel_bytes == 8)
+ else if (pixel_bytes == BPP8)
{
// GRR TEST: should work, but needs testing (special 64-bit version of rpng2?)
- /* GRR NOTE: no need to combine passes here! */
- if (((pass == 0) || (pass == 1)) && width)
+ // GRR NOTE: no need to combine passes here!
+ if (((pass == 4) || (pass == 5)) && width)
{
- int dummy_value_c; /* fix 'forbidden register spilled' */
- int dummy_value_S;
- int dummy_value_D;
-
- /* source is 8-byte RRGGBBAA */
- /* dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA */
+ // source is 8-byte RRGGBBAA
+ // dest is 16-byte RRGGBBAA RRGGBBAA
__asm__ __volatile__ (
- "subl $56, %%edi \n\t" // start of last block
-
- ".loop8_pass0: \n\t"
- "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
- "movq %%mm0, (%%edi) \n\t"
- "movq %%mm0, 8(%%edi) \n\t"
- "movq %%mm0, 16(%%edi) \n\t"
- "movq %%mm0, 24(%%edi) \n\t"
- "movq %%mm0, 32(%%edi) \n\t"
- "movq %%mm0, 40(%%edi) \n\t"
- "movq %%mm0, 48(%%edi) \n\t"
- "subl $8, %%esi \n\t"
- "movq %%mm0, 56(%%edi) \n\t"
- "subl $64, %%edi \n\t"
+ "sub $8, %2 \n\t" // start of last block
+
+ ".loop8_pass4: \n\t"
+ "movq (%1), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
+ "movq %%mm0, (%2) \n\t"
+ "sub $8, %1 \n\t"
+ "movq %%mm0, 8(%2) \n\t"
+ "sub $16, %2 \n\t"
"decl %%ecx \n\t"
- "jnz .loop8_pass0 \n\t"
+ "jnz .loop8_pass4 \n\t"
"EMMS \n\t" // DONE
: "=c" (dummy_value_c), // output regs (dummy)
"=S" (dummy_value_S),
"=D" (dummy_value_D)
- : "1" (sptr), // esi // input regs
- "2" (dp), // edi
- "0" (width) // ecx
+ : "0" (width), // ecx // input regs
+ "1" (sptr), // esi/rsi
+ "2" (dp) // edi/rdi
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
+#if defined(CLOBBER_MMX_REGS_SUPPORTED)
: "%mm0" // clobber list
#endif
);
@@ -2996,235 +2981,211 @@ png_do_read_interlace(png_structp png_ptr)
// dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
// (recall that expansion is _in place_: sptr and dp
// both point at locations within same row buffer)
- {
- int dummy_value_c; // fix 'forbidden register spilled'
- int dummy_value_S;
- int dummy_value_D;
-
- __asm__ __volatile__ (
- "subl $24, %%edi \n\t" // start of last block
-
- ".loop8_pass2: \n\t"
- "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
- "movq %%mm0, (%%edi) \n\t"
- "movq %%mm0, 8(%%edi) \n\t"
- "movq %%mm0, 16(%%edi) \n\t"
- "subl $8, %%esi \n\t"
- "movq %%mm0, 24(%%edi) \n\t"
- "subl $32, %%edi \n\t"
- "decl %%ecx \n\t"
- "jnz .loop8_pass2 \n\t"
- "EMMS \n\t" // DONE
+ __asm__ __volatile__ (
+ "sub $24, %2 \n\t" // start of last block
+
+ ".loop8_pass2: \n\t"
+ "movq (%1), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm0, 8(%2) \n\t"
+ "movq %%mm0, 16(%2) \n\t"
+ "sub $8, %1 \n\t"
+ "movq %%mm0, 24(%2) \n\t"
+ "sub $32, %2 \n\t"
+ "decl %%ecx \n\t"
+ "jnz .loop8_pass2 \n\t"
+ "EMMS \n\t" // DONE
- : "=c" (dummy_value_c), // output regs (dummy)
- "=S" (dummy_value_S),
- "=D" (dummy_value_D)
+ : "=c" (dummy_value_c), // output regs (dummy)
+ "=S" (dummy_value_S),
+ "=D" (dummy_value_D)
- : "1" (sptr), // esi // input regs
- "2" (dp), // edi
- "0" (width) // ecx
+ : "0" (width), // ecx // input regs
+ "1" (sptr), // esi/rsi
+ "2" (dp) // edi/rdi
#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
- : "%mm0" // clobber list
+ : "%mm0" // clobber list
#endif
- );
- }
+ );
}
- else if (width) // pass == 4 or 5
+ else if (width) // && ((pass == 0) || (pass == 1))
{
// source is 8-byte RRGGBBAA
- // dest is 16-byte RRGGBBAA RRGGBBAA
- {
- int dummy_value_c; // fix 'forbidden register spilled'
- int dummy_value_S;
- int dummy_value_D;
+ // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
+ __asm__ __volatile__ (
+ "sub $56, %2 \n\t" // start of last block
- __asm__ __volatile__ (
- "subl $8, %%edi \n\t" // start of last block
-
- ".loop8_pass4: \n\t"
- "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
- "movq %%mm0, (%%edi) \n\t"
- "subl $8, %%esi \n\t"
- "movq %%mm0, 8(%%edi) \n\t"
- "subl $16, %%edi \n\t"
- "decl %%ecx \n\t"
- "jnz .loop8_pass4 \n\t"
- "EMMS \n\t" // DONE
+ ".loop8_pass0: \n\t"
+ "movq (%1), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm0, 8(%2) \n\t"
+ "movq %%mm0, 16(%2) \n\t"
+ "movq %%mm0, 24(%2) \n\t"
+ "movq %%mm0, 32(%2) \n\t"
+ "movq %%mm0, 40(%2) \n\t"
+ "movq %%mm0, 48(%2) \n\t"
+ "sub $8, %1 \n\t"
+ "movq %%mm0, 56(%2) \n\t"
+ "sub $64, %2 \n\t"
+ "decl %%ecx \n\t"
+ "jnz .loop8_pass0 \n\t"
+ "EMMS \n\t" // DONE
- : "=c" (dummy_value_c), // output regs (dummy)
- "=S" (dummy_value_S),
- "=D" (dummy_value_D)
+ : "=c" (dummy_value_c), // output regs (dummy)
+ "=S" (dummy_value_S),
+ "=D" (dummy_value_D)
- : "1" (sptr), // esi // input regs
- "2" (dp), // edi
- "0" (width) // ecx
+ : "0" (width), // ecx // input regs
+ "1" (sptr), // esi/rsi
+ "2" (dp) // edi/rdi
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
- : "%mm0" // clobber list
+#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
+ : "%mm0" // clobber list
#endif
- );
- }
+ );
}
-
} /* end of pixel_bytes == 8 */
//--------------------------------------------------------------
- else if (pixel_bytes == 6)
+ else if (pixel_bytes == BPP6) // why no MMX for this case?
{
for (i = width; i; i--)
{
png_byte v[8];
int j;
- png_memcpy(v, sptr, 6);
+ png_memcpy(v, sptr, BPP6);
for (j = 0; j < png_pass_inc[pass]; j++)
{
- png_memcpy(dp, v, 6);
- dp -= 6;
+ png_memcpy(dp, v, BPP6);
+ dp -= BPP6;
}
- sptr -= 6;
+ sptr -= BPP6;
}
} /* end of pixel_bytes == 6 */
//--------------------------------------------------------------
else
{
- for (i = width; i; i--)
- {
- png_byte v[8];
- int j;
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- png_memcpy(dp, v, pixel_bytes);
- dp -= pixel_bytes;
- }
- sptr-= pixel_bytes;
- }
+ // ERROR: SHOULD NEVER BE REACHED
+#if defined(PNG_DEBUG)
+ png_debug(1, "Internal libpng logic error (GCC "
+ "png_do_read_interlace() _mmx_supported)\n");
+#endif
}
+
} // end of _mmx_supported ========================================
else /* MMX not supported: use modified C code - takes advantage
* of inlining of png_memcpy for a constant */
- /* GRR 19991007: does it? or should pixel_bytes in each
- * block be replaced with immediate value (e.g., 1)? */
- /* GRR 19991017: replaced with constants in each case */
-#endif /* PNG_MMX_CODE_SUPPORTED */
{
- if (pixel_bytes == 1)
+ if (pixel_bytes == BPP3)
{
for (i = width; i; i--)
{
+ png_byte v[8];
int j;
+ png_memcpy(v, sptr, BPP3);
for (j = 0; j < png_pass_inc[pass]; j++)
{
- *dp-- = *sptr;
+ png_memcpy(dp, v, BPP3);
+ dp -= BPP3;
}
- --sptr;
+ sptr -= BPP3;
}
}
- else if (pixel_bytes == 3)
+ else if (pixel_bytes == BPP4)
{
for (i = width; i; i--)
{
png_byte v[8];
int j;
- png_memcpy(v, sptr, 3);
+ png_memcpy(v, sptr, BPP4);
for (j = 0; j < png_pass_inc[pass]; j++)
{
- png_memcpy(dp, v, 3);
- dp -= 3;
+#if defined(PNG_DEBUG) && defined(PNG_1_0_X) // row_buf_size gone in 1.2.x
+ if (dp < row || dp+3 > row+png_ptr->row_buf_size)
+ {
+ printf("dp out of bounds: row=%10p, dp=%10p, "
+ "rp=%10p\n", row, dp, row+png_ptr->row_buf_size);
+ printf("row_buf_size=%lu\n", png_ptr->row_buf_size);
+ }
+#endif
+ png_memcpy(dp, v, BPP4);
+ dp -= BPP4;
}
- sptr -= 3;
+ sptr -= BPP4;
}
}
- else if (pixel_bytes == 2)
+ else if (pixel_bytes == 1)
{
for (i = width; i; i--)
{
- png_byte v[8];
int j;
- png_memcpy(v, sptr, 2);
for (j = 0; j < png_pass_inc[pass]; j++)
{
- png_memcpy(dp, v, 2);
- dp -= 2;
+ *dp-- = *sptr;
}
- sptr -= 2;
+ --sptr;
}
}
- else if (pixel_bytes == 4)
+ else if (pixel_bytes == BPP2)
{
for (i = width; i; i--)
{
png_byte v[8];
int j;
- png_memcpy(v, sptr, 4);
+ png_memcpy(v, sptr, BPP2);
for (j = 0; j < png_pass_inc[pass]; j++)
{
-#if defined(PNG_DEBUG)
- if (dp < row || dp+3 > row+png_ptr->row_buf_size)
- {
- printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
- row, dp, row+png_ptr->row_buf_size);
- printf("row_buf=%d\n",png_ptr->row_buf_size);
- }
-#endif
- png_memcpy(dp, v, 4);
- dp -= 4;
+ png_memcpy(dp, v, BPP2);
+ dp -= BPP2;
}
- sptr -= 4;
+ sptr -= BPP2;
}
}
- else if (pixel_bytes == 6)
+ else if (pixel_bytes == BPP6)
{
for (i = width; i; i--)
{
png_byte v[8];
int j;
- png_memcpy(v, sptr, 6);
+ png_memcpy(v, sptr, BPP6);
for (j = 0; j < png_pass_inc[pass]; j++)
{
- png_memcpy(dp, v, 6);
- dp -= 6;
+ png_memcpy(dp, v, BPP6);
+ dp -= BPP6;
}
- sptr -= 6;
+ sptr -= BPP6;
}
}
- else if (pixel_bytes == 8)
+ else if (pixel_bytes == BPP8)
{
for (i = width; i; i--)
{
png_byte v[8];
int j;
- png_memcpy(v, sptr, 8);
+ png_memcpy(v, sptr, BPP8);
for (j = 0; j < png_pass_inc[pass]; j++)
{
- png_memcpy(dp, v, 8);
- dp -= 8;
+ png_memcpy(dp, v, BPP8);
+ dp -= BPP8;
}
- sptr -= 8;
+ sptr -= BPP8;
}
}
- else /* GRR: should never be reached */
+ else
{
- for (i = width; i; i--)
- {
- png_byte v[8];
- int j;
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- png_memcpy(dp, v, pixel_bytes);
- dp -= pixel_bytes;
- }
- sptr -= pixel_bytes;
- }
+ // ERROR: SHOULD NEVER BE REACHED
+#if defined(PNG_DEBUG)
+ png_debug(1, "Internal libpng logic error (GCC "
+ "png_do_read_interlace() !_mmx_supported)\n");
+#endif
}
} /* end if (MMX not supported) */
break;
- }
+ } /* end default (8-bit or larger) */
} /* end switch (row_info->pixel_depth) */
row_info->width = final_width;
@@ -3240,7 +3201,6 @@ png_do_read_interlace(png_structp png_ptr)
#if defined(PNG_HAVE_MMX_READ_FILTER_ROW)
-#if defined(PNG_MMX_CODE_SUPPORTED)
//===========================================================================//
// //
@@ -3259,24 +3219,39 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
int dummy_value_a;
int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
int dummy_value_d;
- int dummy_value_S;
- int dummy_value_D;
+ png_bytep dummy_value_S;
+ png_bytep dummy_value_D;
int diff; // __attribute__((used));
bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel
FullLength = row_info->rowbytes; // number of bytes to filter
+#ifdef __x86_64__ // regs used for pointers or together with pointer-regs
+# define PBP "%%rbp"
+# define PAX "%%rax"
+# define PBX "%%rbx"
+# define PCX "%%rcx"
+# define PDX "%%rdx"
+# define CLEAR_BOTTOM_3_BITS "and $0xfffffffffffffff8, "
+#else
+# define PBP "%%ebp"
+# define PAX "%%eax"
+# define PBX "%%ebx"
+# define PCX "%%ecx"
+# define PDX "%%edx"
+# define CLEAR_BOTTOM_3_BITS "and $0xfffffff8, "
+#endif
__asm__ __volatile__ (
SAVE_GOT_ebx
SAVE_r15
SAVE_ebp
// initialize address pointers and offset
-//pre "movl row, %%edi \n\t" // edi: ptr to Avg(x)
+//pre "movl row, %2 \n\t" // edi/rdi: ptr to Avg(x)
"xorl %%ebx, %%ebx \n\t" // ebx: x
-//pre "movl prev_row, %%esi \n\t" // esi: ptr to Prior(x)
- "movl %%edi, %%edx \n\t"
-//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
- "subl %%ecx, %%edx \n\t" // edx: ptr to Raw(x-bpp)
+//pre "movl prev_row, %1 \n\t" // esi/rsi: ptr to Prior(x)
+ "mov %2, " PDX " \n\t" // copy of row ptr...
+//pre "subl bpp, " PDX " \n\t" // (bpp is preloaded into ecx)
+ "sub " PCX "," PDX " \n\t" // edx/rdx: ptr to Raw(x-bpp)
//pre "movl FullLength, %%eax \n\t" // bring in via eax...
SAVE_FullLength // ...but store for later use
"xorl %%eax, %%eax \n\t"
@@ -3284,22 +3259,25 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
// Compute the Raw value for the first bpp bytes
// Raw(x) = Avg(x) + (Prior(x)/2)
"avg_rlp: \n\t"
- "movb (%%esi,%%ebx,), %%al \n\t" // load al with Prior(x)
+ "movb (%1," PBX ",), %%al \n\t" // load al with Prior(x)
"incl %%ebx \n\t"
"shrb %%al \n\t" // divide by 2
- "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
+ "addb -1(%2," PBX ",), %%al \n\t" // add Avg(x); -1 to offset inc ebx
//pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
"cmpl %%ecx, %%ebx \n\t"
- "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
+ "movb %%al, -1(%2," PBX ",) \n\t" // write Raw(x); -1 to offset inc ebx
"jb avg_rlp \n\t" // mov does not affect flags
- // get # of bytes to alignment
- "movl %%edi, %%ebp \n\t" // take start of row
- "addl %%ebx, %%ebp \n\t" // add bpp
- "addl $0xf, %%ebp \n\t" // add 7+8 to incr past alignment bdry
- "andl $0xfffffff8, %%ebp \n\t" // mask to alignment boundary
- "subl %%edi, %%ebp \n\t" // subtract from start => value ebx at
- "jz avg_go \n\t" // alignment
+ // get # of bytes to alignment (32-bit mask _would_ be good enough
+ // [computing delta], but 32-bit ops are zero-extended on 64-bit, argh)
+ // (if swapped edx and ebp, could do 8-bit or 16-bit mask...FIXME?)
+ "mov %2, " PBP " \n\t" // take start of row
+ "add " PBX "," PBP " \n\t" // add bpp
+ "add $0xf, " PBP " \n\t" // add 7+8 to incr past alignment bdry
+// "andl $0xfffffff8, %%ebp \n\t" // mask to alignment boundary (32-bit!)
+ CLEAR_BOTTOM_3_BITS PBP "\n\t" // mask to alignment boundary
+ "sub %2, " PBP " \n\t" // subtract row ptr again => ebp =
+ "jz avg_go \n\t" // target value of ebx at alignment
// fix alignment
// Compute the Raw value for the bytes up to the alignment boundary
@@ -3308,36 +3286,36 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
"avg_lp1: \n\t"
"xorl %%eax, %%eax \n\t"
- "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
- "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
+ "movb (%1," PBX ",), %%cl \n\t" // load cl with Prior(x)
+ "movb (" PDX "," PBX ",), %%al \n\t" // load al with Raw(x-bpp)
"addw %%cx, %%ax \n\t"
"incl %%ebx \n\t"
"shrw %%ax \n\t" // divide by 2
- "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
+ "addb -1(%2," PBX ",), %%al \n\t" // add Avg(x); -1 to offset inc ebx
"cmpl %%ebp, %%ebx \n\t" // check if at alignment boundary
- "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
+ "movb %%al, -1(%2," PBX ",) \n\t" // write Raw(x); -1 to offset inc ebx
"jb avg_lp1 \n\t" // repeat until at alignment boundary
"avg_go: \n\t"
RESTORE_FullLength "%%eax \n\t" // FullLength -> eax
- "movl %%eax, %%ecx \n\t"
+ "movl %%eax, %%ecx \n\t" // copy -> ecx
"subl %%ebx, %%eax \n\t" // subtract alignment fix
"andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
- "subl %%eax, %%ecx \n\t" // drop over bytes from original length
+ "subl %%eax, %%ecx \n\t" // sub over-bytes from original length
//out "movl %%ecx, MMXLength \n\t"
- "movl %%ebp, %%edi \n\t" // ebp = diff, but no reg constraint(?)
- RESTORE_ebp // (could swap ebp and ecx functions)
+ "movl %%ebp, %%eax \n\t" // ebp = diff, but no reg constraint(?)
+ RESTORE_ebp // (could swap ebp and edx functions)
RESTORE_r15
RESTORE_GOT_ebx
: "=c" (MMXLength), // output regs
"=S" (dummy_value_S),
- "=D" (diff),
- "=a" (dummy_value_a)
+ "=D" (dummy_value_D),
+ "=a" (diff)
: "0" (bpp), // ecx // input regs
- "1" (prev_row), // esi
- "2" (row), // edi
+ "1" (prev_row), // esi/rsi
+ "2" (row), // edi/rdi
"3" (FullLength) // eax
: "%edx" // clobber list
@@ -3361,20 +3339,20 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
// preload "movl diff, %%ecx \n\t" // ecx: x = offset to
// alignment boundary
"movq " LB_CARRY_MASK ", %%mm5 \n\t" // [interleave for parallel.?]
-// preload "movl row, %%edi \n\t" // edi: Avg(x)
+// preload "movl row, %1 \n\t" // edi: Avg(x)
"movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
-// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
+// preload "movl prev_row, %0 \n\t" // esi: Prior(x)
RESTORE_rbp
// prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%ecx,), %%mm2 \n\t"// load previous aligned 8 bytes
- // (correct pos. in loop below)
+ "movq -8(%1," PCX ",), %%mm2 \n\t"// load previous aligned 8 bytes
+ // (correct pos. in loop below)
"avg_3lp: \n\t"
- "movq (%%edi,%%ecx,), %%mm0 \n\t" // load mm0 with Avg(x)
+ "movq (%1," PCX ",), %%mm0 \n\t" // load mm0 with Avg(x)
"movq %%mm5, %%mm3 \n\t"
"psrlq $40, %%mm2 \n\t" // correct position Raw(x-bpp)
// data
- "movq (%%esi,%%ecx,), %%mm1 \n\t" // load mm1 with Prior(x)
+ "movq (%0," PCX ",), %%mm1 \n\t" // load mm1 with Prior(x)
"movq %%mm7, %%mm6 \n\t"
"pand %%mm1, %%mm3 \n\t" // get lsb for each prevrow byte
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
@@ -3440,7 +3418,7 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
// Avg for each Active byte
// now ready to write back to memory
- "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
+ "movq %%mm0, -8(%1," PCX ",) \n\t"
// move updated Raw(x) to use as Raw(x-bpp) for next loop
"cmpl %%eax, %%ecx \n\t" // MMXLength
"movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2
@@ -3451,10 +3429,10 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
"=c" (dummy_value_c),
"=a" (dummy_value_a)
- : "0" (prev_row), // esi // input regs
- "1" (row), // edi
- "2" (diff), // ecx
- "3" (MMXLength) // eax
+ : "0" (prev_row), // esi/rsi // input regs
+ "1" (row), // edi/rdi
+ "2" (diff), // ecx
+ "3" (MMXLength) // eax
#if defined(CLOBBER_MMX_REGS_SUPPORTED)
: "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
@@ -3466,7 +3444,7 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
case 4: // formerly shared with 6 bpp case via _ShiftBpp and _ShiftRem,
{ // but loop uses all 8 MMX regs, and psrlq/psllq require 64-bit
- // mem (PIC/.so problems), MMX reg (none avail.), or immediate
+ // mem (PIC/.so problems), MMX reg (none left), or immediate
// _ShiftBpp = bpp << 3; // 32 (psllq)
// _ShiftRem = 64 - _ShiftBpp; // 32 (psrlq)
@@ -3481,19 +3459,19 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
RESTORE_rbp
// ... and clear all bytes except for 1st active group
-// preload "movl row, %%edi \n\t" // edi: Avg(x)
+// preload "movl row, %1 \n\t" // edi: Avg(x)
"psrlq $32, %%mm7 \n\t" // was _ShiftRem
-// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
+// preload "movl prev_row, %0 \n\t" // esi: Prior(x)
"movq %%mm7, %%mm6 \n\t"
"psllq $32, %%mm6 \n\t" // mask for 2nd active group
// prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
- // (we correct pos. in loop below)
+ "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
+ // (we correct pos. in loop below)
"avg_4lp: \n\t"
- "movq (%%edi,%%ecx,), %%mm0 \n\t"
+ "movq (%1," PCX ",), %%mm0 \n\t"
"psrlq $32, %%mm2 \n\t" // shift data to pos. correctly
- "movq (%%esi,%%ecx,), %%mm1 \n\t"
+ "movq (%0," PCX ",), %%mm1 \n\t"
// add (Prev_row/2) to average
"movq %%mm5, %%mm3 \n\t"
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
@@ -3538,7 +3516,7 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
// Avg for each Active byte
"cmpl %%eax, %%ecx \n\t" // MMXLength
// now ready to write back to memory
- "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
+ "movq %%mm0, -8(%1," PCX ",) \n\t"
// prep Raw(x-bpp) for next loop
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
"jb avg_4lp \n\t"
@@ -3548,10 +3526,10 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
"=c" (dummy_value_c),
"=a" (dummy_value_a)
- : "0" (prev_row), // esi // input regs
- "1" (row), // edi
- "2" (diff), // ecx
- "3" (MMXLength) // eax
+ : "0" (prev_row), // esi/rsi // input regs
+ "1" (row), // edi/rdi
+ "2" (diff), // ecx
+ "3" (MMXLength) // eax
#if defined(CLOBBER_MMX_REGS_SUPPORTED)
: "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
@@ -3561,101 +3539,62 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
}
break; // end 4 bpp
- case 6: // formerly shared with 4 bpp case (see comments there)
+ case 1:
{
-// _ShiftBpp = bpp << 3; // 48 (psllq)
-// _ShiftRem = 64 - _ShiftBpp; // 16 (psrlq)
-
__asm__ __volatile__ (
- LOAD_GOT_rbp
- "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
- "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
// re-init address pointers and offset
-// preload "movl diff, %%ecx \n\t" // ecx: x = offset to
- // alignment boundary
- "movq " AMASK0_8_0 ", %%mm7 \n\t" // _amask0_8_0 -> mm7
- RESTORE_rbp
+// preload "movl diff, %%ecx \n\t" // ecx: x = offset to align. bdry
+// preload "movl row, %1 \n\t" // edi/rdi: Avg(x)
+// preload "movl FullLength, %%eax \n\t"
+ "cmpl %%eax, %%ecx \n\t" // test if offset at end of array
+ "jnb avg_1end \n\t"
- // ... and clear all bytes except for 1st active group
-// preload "movl row, %%edi \n\t" // edi: Avg(x)
- "psrlq $16, %%mm7 \n\t"
-// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
- "movq %%mm7, %%mm6 \n\t"
- "psllq $48, %%mm6 \n\t" // mask for 2nd active group
+ SAVE_ebp
- // prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
- // (we correct pos. in loop below)
- "avg_6lp: \n\t"
- "movq (%%edi,%%ecx,), %%mm0 \n\t"
- "psrlq $16, %%mm2 \n\t" // shift data to pos. correctly
- "movq (%%esi,%%ecx,), %%mm1 \n\t"
- // add (Prev_row/2) to average
- "movq %%mm5, %%mm3 \n\t"
- "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
- "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
- "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
- // byte
- "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
- // each byte
- // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
- "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
- // LBCarrys
- "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
- // where both
- // lsb's were == 1 (only valid for active group)
- "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
- "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
- // byte
- "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
- // for each byte
- "pand %%mm7, %%mm2 \n\t" // leave only Active Group 1
- // bytes to add to Avg
- "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
- // for each Active
- // byte
- // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
- "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
- "psllq $48, %%mm2 \n\t" // shift data to pos. correctly
- "addl $8, %%ecx \n\t"
- "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
- // LBCarrys
- "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
- // where both
- // lsb's were == 1 (only valid for active group)
- "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
- "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
- // byte
- "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
- // for each byte
- "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
- // bytes to add to Avg
- "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
- // Avg for each Active byte
- "cmpl %%eax, %%ecx \n\t" // MMXLength
- // now ready to write back to memory
- "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
- // prep Raw(x-bpp) for next loop
- "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
- "jb avg_6lp \n\t"
+ // do Avg decode for remaining bytes
+// preload "movl prev_row, %0 \n\t" // esi/rsi: Prior(x)
+ "mov %1, " PBP " \n\t" // copy of row pointer...
+ "dec " PBP " \n\t" // ebp/rbp: Raw(x-bpp)
+ "xorl %%edx, %%edx \n\t" // zero edx before using dl & dx
+ // in loop below
+ SAVE_GOT_ebx
+
+ "avg_1lp: \n\t"
+ // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
+ "xorl %%ebx, %%ebx \n\t"
+ "movb (%0," PCX ",), %%dl \n\t" // load dl with Prior(x)
+ "movb (" PBP "," PCX ",), %%bl \n\t" // load bl with Raw(x-bpp)
+ "addw %%dx, %%bx \n\t"
+ "incl %%ecx \n\t"
+ "shrw %%bx \n\t" // divide by 2
+ "addb -1(%1," PCX ",), %%bl \n\t" // add Avg(x); -1 to offset
+ // inc ecx
+ "cmpl %%eax, %%ecx \n\t" // check if at end of array
+ "movb %%bl, -1(%1," PCX ",) \n\t" // write back Raw(x);
+ // mov does not affect flags; -1 to offset inc ecx
+ "jb avg_1lp \n\t"
+
+ RESTORE_GOT_ebx
+ RESTORE_ebp
+
+ "avg_1end: \n\t"
: "=S" (dummy_value_S), // output regs (dummy)
"=D" (dummy_value_D),
"=c" (dummy_value_c),
"=a" (dummy_value_a)
- : "0" (prev_row), // esi // input regs
- "1" (row), // edi
- "2" (diff), // ecx
- "3" (MMXLength) // eax
+ : "0" (prev_row), // esi/rsi // input regs
+ "1" (row), // edi/rdi
+ "2" (diff), // ecx
+ "3" (FullLength) // eax
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
- : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
- , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
+ : "%edx" // clobber list
+ _CLOBBER_GOT_ebx
+ _CLOBBER_ebp
);
}
- break; // end 6 bpp
+ return; // end 1 bpp
case 2:
{
@@ -3670,18 +3609,18 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
// preload "movl diff, %%ecx \n\t" // ecx: x = offset to
// alignment boundary
"movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
-// preload "movl row, %%edi \n\t" // edi: Avg(x)
+// preload "movl row, %1 \n\t" // edi: Avg(x)
"movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
-// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
+// preload "movl prev_row, %0 \n\t" // esi: Prior(x)
RESTORE_rbp
// prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
- // (we correct pos. in loop below)
+ "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
+ // (we correct pos. in loop below)
"avg_2lp: \n\t"
- "movq (%%edi,%%ecx,), %%mm0 \n\t"
+ "movq (%1," PCX ",), %%mm0 \n\t"
"psrlq $48, %%mm2 \n\t" // shift data to pos. correctly
- "movq (%%esi,%%ecx,), %%mm1 \n\t" // (GRR BUGFIX: was psllq)
+ "movq (%0," PCX ",), %%mm1 \n\t" // (GRR BUGFIX: was psllq)
// add (Prev_row/2) to average
"movq %%mm5, %%mm3 \n\t"
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
@@ -3773,7 +3712,7 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
// Avg for each Active byte
"cmpl %%eax, %%ecx \n\t" // MMXLength
// now ready to write back to memory
- "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
+ "movq %%mm0, -8(%1," PCX ",) \n\t"
// prep Raw(x-bpp) for next loop
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
"jb avg_2lp \n\t"
@@ -3783,10 +3722,10 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
"=c" (dummy_value_c),
"=a" (dummy_value_a)
- : "0" (prev_row), // esi // input regs
- "1" (row), // edi
- "2" (diff), // ecx
- "3" (MMXLength) // eax
+ : "0" (prev_row), // esi/rsi // input regs
+ "1" (row), // edi/rdi
+ "2" (diff), // ecx
+ "3" (MMXLength) // eax
#if defined(CLOBBER_MMX_REGS_SUPPORTED)
: "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
@@ -3796,66 +3735,101 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
}
break; // end 2 bpp
- case 1:
+ case 6: // formerly shared with 4 bpp case (see comments there)
{
+// _ShiftBpp = bpp << 3; // 48 (psllq)
+// _ShiftRem = 64 - _ShiftBpp; // 16 (psrlq)
+
__asm__ __volatile__ (
+ LOAD_GOT_rbp
+ "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
+ "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
// re-init address pointers and offset
-// preload "movl diff, %%eax \n\t" // eax: x = offset to align. bdry
-// preload "movl row, %%edi \n\t" // edi: Avg(x)
-// preload "movl FullLength, %%edx \n\t"
- "cmpl %%edx, %%eax \n\t" // test if offset at end of array
- "jnb avg_1end \n\t"
-
- SAVE_ebp
-
- // do Avg decode for remaining bytes
-// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
- "movl %%edi, %%ebp \n\t"
-// preload "subl bpp, %%ebp \n\t" // (bpp is preloaded into ecx)
- "subl %%ecx, %%ebp \n\t" // ebp: Raw(x-bpp)
- "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
- // in loop below
- SAVE_GOT_ebx
-
- "avg_1lp: \n\t"
- // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
- "xorl %%ebx, %%ebx \n\t"
- "movb (%%esi,%%eax,), %%cl \n\t" // load cl with Prior(x)
- "movb (%%ebp,%%eax,), %%bl \n\t" // load bl with Raw(x-bpp)
- "addw %%cx, %%bx \n\t"
- "incl %%eax \n\t"
- "shrw %%bx \n\t" // divide by 2
- "addb -1(%%edi,%%eax,), %%bl \n\t" // add Avg(x); -1 to offset
- // inc eax
- "cmpl %%edx, %%eax \n\t" // check if at end of array
- "movb %%bl, -1(%%edi,%%eax,) \n\t" // write back Raw(x);
- // mov does not affect flags; -1 to offset inc eax
- "jb avg_1lp \n\t"
+// preload "movl diff, %%ecx \n\t" // ecx: x = offset to
+ // alignment boundary
+ "movq " AMASK0_8_0 ", %%mm7 \n\t" // _amask0_8_0 -> mm7
+ RESTORE_rbp
- RESTORE_GOT_ebx
- RESTORE_ebp
+ // ... and clear all bytes except for 1st active group
+// preload "movl row, %1 \n\t" // edi: Avg(x)
+ "psrlq $16, %%mm7 \n\t"
+// preload "movl prev_row, %0 \n\t" // esi: Prior(x)
+ "movq %%mm7, %%mm6 \n\t"
+ "psllq $48, %%mm6 \n\t" // mask for 2nd active group
- "avg_1end: \n\t"
+ // prime the pump: load the first Raw(x-bpp) data set
+ "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
+ // (we correct pos. in loop below)
+ "avg_6lp: \n\t"
+ "movq (%1," PCX ",), %%mm0 \n\t"
+ "psrlq $16, %%mm2 \n\t" // shift data to pos. correctly
+ "movq (%0," PCX ",), %%mm1 \n\t"
+ // add (Prev_row/2) to average
+ "movq %%mm5, %%mm3 \n\t"
+ "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
+ "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
+ "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
+ // byte
+ "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
+ // each byte
+ // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
+ "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
+ // LBCarrys
+ "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
+ // where both
+ // lsb's were == 1 (only valid for active group)
+ "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
+ "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
+ // byte
+ "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
+ // for each byte
+ "pand %%mm7, %%mm2 \n\t" // leave only Active Group 1
+ // bytes to add to Avg
+ "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
+ // for each Active
+ // byte
+ // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
+ "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
+ "psllq $48, %%mm2 \n\t" // shift data to pos. correctly
+ "addl $8, %%ecx \n\t"
+ "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
+ // LBCarrys
+ "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
+ // where both
+ // lsb's were == 1 (only valid for active group)
+ "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
+ "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
+ // byte
+ "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
+ // for each byte
+ "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
+ // bytes to add to Avg
+ "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
+ // Avg for each Active byte
+ "cmpl %%eax, %%ecx \n\t" // MMXLength
+ // now ready to write back to memory
+ "movq %%mm0, -8(%1," PCX ",) \n\t"
+ // prep Raw(x-bpp) for next loop
+ "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
+ "jb avg_6lp \n\t"
- : "=c" (dummy_value_c), // output regs (dummy)
- "=S" (dummy_value_S),
+ : "=S" (dummy_value_S), // output regs (dummy)
"=D" (dummy_value_D),
- "=a" (dummy_value_a),
- "=d" (dummy_value_d)
+ "=c" (dummy_value_c),
+ "=a" (dummy_value_a)
- : "0" (bpp), // ecx // input regs
- "1" (prev_row), // esi
- "2" (row), // edi
- "3" (diff), // eax
- "4" (FullLength) // edx
+ : "0" (prev_row), // esi/rsi // input regs
+ "1" (row), // edi/rdi
+ "2" (diff), // ecx
+ "3" (MMXLength) // eax
- CLOB_COLON_ebx_ebp // clobber list
- CLOBBER_GOT_ebx
- CLOB_COMMA_ebx_ebp
- CLOBBER_ebp
+#if defined(CLOBBER_MMX_REGS_SUPPORTED)
+ : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
+ , "%mm4", "%mm5", "%mm6", "%mm7"
+#endif
);
}
- return; // end 1 bpp
+ break; // end 6 bpp
case 8:
{
@@ -3865,19 +3839,19 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
// alignment boundary
LOAD_GOT_rbp
"movq " LB_CARRY_MASK ", %%mm5 \n\t" // [interleave for parallel.?]
-// preload "movl row, %%edi \n\t" // edi: Avg(x)
+// preload "movl row, %1 \n\t" // edi: Avg(x)
"movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
-// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
+// preload "movl prev_row, %0 \n\t" // esi: Prior(x)
RESTORE_rbp
// prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
+ "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
// (NO NEED to correct pos. in loop below)
"avg_8lp: \n\t"
- "movq (%%edi,%%ecx,), %%mm0 \n\t"
+ "movq (%1," PCX ",), %%mm0 \n\t"
"movq %%mm5, %%mm3 \n\t"
- "movq (%%esi,%%ecx,), %%mm1 \n\t"
+ "movq (%0," PCX ",), %%mm1 \n\t"
"addl $8, %%ecx \n\t"
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
@@ -3890,7 +3864,7 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
"cmpl %%eax, %%ecx \n\t" // MMXLength
- "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
+ "movq %%mm0, -8(%1," PCX ",) \n\t"
"movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
"jb avg_8lp \n\t"
@@ -3899,10 +3873,10 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
"=c" (dummy_value_c),
"=a" (dummy_value_a)
- : "0" (prev_row), // esi // input regs
- "1" (row), // edi
- "2" (diff), // ecx
- "3" (MMXLength) // eax
+ : "0" (prev_row), // esi/rsi // input regs
+ "1" (row), // edi/rdi
+ "2" (diff), // ecx
+ "3" (MMXLength) // eax
#if defined(CLOBBER_MMX_REGS_SUPPORTED)
: "%mm0", "%mm1", "%mm2" // clobber list
@@ -3914,10 +3888,10 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
default: // bpp != 1,2,3,4,6,8: doesn't exist
{
-#if defined(PNG_DEBUG)
// ERROR: SHOULD NEVER BE REACHED
- png_debug(1,
- "Internal libpng logic error (GCC png_read_filter_row_mmx_avg())\n");
+#if defined(PNG_DEBUG)
+ png_debug(1, "Internal libpng logic error (GCC "
+ "png_read_filter_row_mmx_avg())\n");
#endif
}
break;
@@ -3929,17 +3903,17 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
// check if any remaining bytes left to decode
//pre "movl FullLength, %%edx \n\t"
//pre "movl MMXLength, %%eax \n\t" // eax: x == offset bytes after MMX
-//pre "movl row, %%edi \n\t" // edi: Avg(x)
+//pre "movl row, %2 \n\t" // edi: Avg(x)
"cmpl %%edx, %%eax \n\t" // test if offset at end of array
"jnb avg_end \n\t"
SAVE_ebp
// do Avg decode for remaining bytes
-//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
- "movl %%edi, %%ebp \n\t"
-//pre "subl bpp, %%ebp \n\t" // (bpp is preloaded into ecx)
- "subl %%ecx, %%ebp \n\t" // ebp: Raw(x-bpp)
+//pre "movl prev_row, %1 \n\t" // esi: Prior(x)
+ "mov %2, " PBP " \n\t" // copy of row pointer...
+//pre "subl bpp, " PBP " \n\t" // (bpp is preloaded into ecx)
+ "sub " PCX "," PBP " \n\t" // ebp: Raw(x-bpp)
"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
SAVE_GOT_ebx
@@ -3947,14 +3921,14 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
"avg_lp2: \n\t"
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
"xorl %%ebx, %%ebx \n\t"
- "movb (%%esi,%%eax,), %%cl \n\t" // load cl with Prior(x)
- "movb (%%ebp,%%eax,), %%bl \n\t" // load bl with Raw(x-bpp)
+ "movb (%1," PAX ",), %%cl \n\t" // load cl with Prior(x)
+ "movb (" PBP "," PAX ",), %%bl \n\t" // load bl with Raw(x-bpp)
"addw %%cx, %%bx \n\t"
"incl %%eax \n\t"
"shrw %%bx \n\t" // divide by 2
- "addb -1(%%edi,%%eax,), %%bl \n\t" // add Avg(x); -1 to offset inc eax
+ "addb -1(%2," PAX ",), %%bl \n\t" // add Avg(x); -1 to offset inc eax
"cmpl %%edx, %%eax \n\t" // check if at end of array
- "movb %%bl, -1(%%edi,%%eax,) \n\t" // write back Raw(x) [mov does not
+ "movb %%bl, -1(%2," PAX ",) \n\t" // write back Raw(x) [mov does not
"jb avg_lp2 \n\t" // affect flags; -1 to offset inc eax]
RESTORE_GOT_ebx
@@ -3970,8 +3944,8 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
"=d" (dummy_value_d)
: "0" (bpp), // ecx // input regs
- "1" (prev_row), // esi
- "2" (row), // edi
+ "1" (prev_row), // esi/rsi
+ "2" (row), // edi/rdi
"3" (MMXLength), // eax
"4" (FullLength) // edx
@@ -4004,8 +3978,8 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
int dummy_value_a;
int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
int dummy_value_d;
- int dummy_value_S;
- int dummy_value_D;
+ png_charp dummy_value_S;
+ png_charp dummy_value_D;
int diff; // __attribute__((used));
bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel
@@ -4015,9 +3989,9 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
SAVE_GOT_ebx
SAVE_r15
SAVE_ebp
-//pre "movl row, %%edi \n\t"
+//pre "movl row, %2 \n\t" // edi/rdi
"xorl %%ebx, %%ebx \n\t" // ebx: x offset
-//pre "movl prev_row, %%esi \n\t"
+//pre "movl prev_row, %1 \n\t" // esi/rsi
"xorl %%edx, %%edx \n\t" // edx: x-bpp offset
//pre "movl FullLength, %%eax \n\t" // bring in via eax...
SAVE_FullLength // ...but store for later use
@@ -4027,37 +4001,39 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
// Note: the formula works out to be always
// Paeth(x) = Raw(x) + Prior(x) where x < bpp
"paeth_rlp: \n\t"
- "movb (%%edi,%%ebx,), %%al \n\t"
- "addb (%%esi,%%ebx,), %%al \n\t"
+ "movb (%2," PBX ",), %%al \n\t"
+ "addb (%1," PBX ",), %%al \n\t"
"incl %%ebx \n\t"
//pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx)
"cmpl %%ecx, %%ebx \n\t"
- "movb %%al, -1(%%edi,%%ebx,) \n\t"
+ "movb %%al, -1(%2," PBX ",) \n\t"
"jb paeth_rlp \n\t"
- // get # of bytes to alignment
- "movl %%edi, %%ebp \n\t" // take start of row
- "addl %%ebx, %%ebp \n\t" // add bpp
- "xorl %%ecx, %%ecx \n\t"
- "addl $0xf, %%ebp \n\t" // add 7+8 to incr past alignment bdry
- "andl $0xfffffff8, %%ebp \n\t" // mask to alignment boundary
- "subl %%edi, %%ebp \n\t" // subtract from start ==> value ebx
- // at alignment
- "jz paeth_go \n\t"
+ // get # of bytes to alignment (note: computing _delta_ of two pointers,
+ // so hereafter %%ebp is sufficient even on 64-bit)
+ "mov %2, " PBP " \n\t" // take start of row
+ "add " PBX "," PBP " \n\t" // add bpp
+ "add $0xf, " PBP " \n\t" // add 7+8 to incr past alignment bdry
+// "andl $0xfffffff8, %%ebp \n\t" // mask to alignment boundary (32-bit!)
+ CLEAR_BOTTOM_3_BITS PBP "\n\t" // mask to alignment boundary
+ "sub %2, " PBP " \n\t" // subtract row ptr again => ebp =
+ "jz paeth_go \n\t" // target value of ebx at alignment
+
// fix alignment
+ "xorl %%ecx, %%ecx \n\t"
SAVE_r11_r12_r13
"paeth_lp1: \n\t"
"xorl %%eax, %%eax \n\t"
// pav = p - a = (a + b - c) - a = b - c
- "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
- "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
+ "movb (%1," PBX ",), %%al \n\t" // load Prior(x) into al
+ "movb (%1," PDX ",), %%cl \n\t" // load Prior(x-bpp) into cl
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
"movl %%eax, " pa_TEMP " \n\t" // Save pav for later use
"xorl %%eax, %%eax \n\t"
// pbv = p - b = (a + b - c) - b = a - c
- "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
+ "movb (%2," PDX ",), %%al \n\t" // load Raw(x-bpp) into al
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
"movl %%eax, %%ecx \n\t"
// pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
@@ -4091,12 +4067,12 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
"cmpl " pc_TEMP ", %%ecx \n\t"
"jna paeth_bbc \n\t"
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
- "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
+ "movb (%1," PDX ",), %%cl \n\t" // load Prior(x-bpp) into cl
"jmp paeth_paeth \n\t"
"paeth_bbc: \n\t"
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
- "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
+ "movb (%1," PBX ",), %%cl \n\t" // load Prior(x) into cl
"jmp paeth_paeth \n\t"
"paeth_abb: \n\t"
@@ -4104,18 +4080,18 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
"cmpl " pc_TEMP ", %%eax \n\t"
"jna paeth_abc \n\t"
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
- "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
+ "movb (%1," PDX ",), %%cl \n\t" // load Prior(x-bpp) into cl
"jmp paeth_paeth \n\t"
"paeth_abc: \n\t"
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
- "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
+ "movb (%2," PDX ",), %%cl \n\t" // load Raw(x-bpp) into cl
"paeth_paeth: \n\t"
"incl %%ebx \n\t"
"incl %%edx \n\t"
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
- "addb %%cl, -1(%%edi,%%ebx,) \n\t"
+ "addb %%cl, -1(%2," PBX ",) \n\t"
"cmpl %%ebp, %%ebx \n\t"
"jb paeth_lp1 \n\t"
@@ -4128,19 +4104,19 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
"andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
"subl %%eax, %%ecx \n\t" // drop over bytes from original length
//out "movl %%ecx, MMXLength \n\t"
- "movl %%ebp, %%edi \n\t" // ebp = diff, but no reg constraint(?)
- RESTORE_ebp // (could swap ebp and ecx functions)
+ "movl %%ebp, %%eax \n\t" // ebp = diff, but no reg constraint(?)
+ RESTORE_ebp // (could swap ebp and edx functions)
RESTORE_r15
RESTORE_GOT_ebx
: "=c" (MMXLength), // output regs
"=S" (dummy_value_S),
- "=D" (diff),
- "=a" (dummy_value_a)
+ "=D" (dummy_value_D),
+ "=a" (diff)
: "0" (bpp), // ecx // input regs
- "1" (prev_row), // esi
- "2" (row), // edi
+ "1" (prev_row), // esi/rsi
+ "2" (row), // edi/rdi
"3" (FullLength) // eax
: "%edx" // clobber list
@@ -4161,18 +4137,18 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
__asm__ __volatile__ (
LOAD_GOT_rbp
// preload "movl diff, %%ecx \n\t"
-// preload "movl row, %%edi \n\t"
-// preload "movl prev_row, %%esi \n\t"
+// preload "movl row, %1 \n\t" // edi/rdi
+// preload "movl prev_row, %0 \n\t" // esi/rsi
"pxor %%mm0, %%mm0 \n\t"
// prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
+ "movq -8(%1," PCX ",), %%mm1 \n\t"
"paeth_3lp: \n\t"
"psrlq $40, %%mm1 \n\t" // shift last 3 bytes to 1st
// 3 bytes
- "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
+ "movq (%0," PCX ",), %%mm2 \n\t" // load b=Prior(x)
"punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
- "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
+ "movq -8(%0," PCX ",), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
"punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
"psrlq $40, %%mm3 \n\t" // shift last 3 bytes to 1st
// 3 bytes
@@ -4224,12 +4200,12 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
"paddw %%mm3, %%mm7 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm7 \n\t"
- "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
+ "movq (%0," PCX ",), %%mm3 \n\t" // load c=Prior(x-bpp)
"pand " AMASK5_3_0 ", %%mm7 \n\t" // _amask5_3_0 (was _ActiveMask)
"movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
- "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor + Raw(x)
+ "paddb (%1," PCX ",), %%mm7 \n\t" // add Paeth predictor + Raw(x)
"punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
- "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
+ "movq %%mm7, (%1," PCX ",) \n\t" // write back updated value
"movq %%mm7, %%mm1 \n\t" // now mm1 will be used as
// Raw(x-bpp)
// now do Paeth for 2nd set of bytes (3-5)
@@ -4278,7 +4254,7 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
"paddw %%mm2, %%mm0 \n\t"
// test ((pa <= pb)? pa:pb) <= pc
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
- "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
+ "movq (%0," PCX ",), %%mm2 \n\t" // load b=Prior(x)
"pand %%mm7, %%mm3 \n\t"
"pandn %%mm0, %%mm7 \n\t"
"pxor %%mm1, %%mm1 \n\t"
@@ -4292,9 +4268,9 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
// 3 bytes
// pav = p - a = (a + b - c) - a = b - c
"movq %%mm2, %%mm4 \n\t"
- "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor + Raw(x)
+ "paddb (%1," PCX ",), %%mm7 \n\t" // add Paeth predictor + Raw(x)
"psllq $24, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
- "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
+ "movq %%mm7, (%1," PCX ",) \n\t" // write back updated value
"movq %%mm7, %%mm1 \n\t"
"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
"psllq $24, %%mm1 \n\t" // shift bytes (was _ShiftBpp)
@@ -4349,11 +4325,10 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
// step ecx to next set of 8 bytes and repeat loop til done
"addl $8, %%ecx \n\t"
"pand " AMASK0_2_6 ", %%mm1 \n\t" // _amask0_2_6 (_ActiveMaskEnd)
- "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor + Raw(x)
-
+ "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
"cmpl %%eax, %%ecx \n\t" // MMXLength
"pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
- "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
+ "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
// mm1 will be used as Raw(x-bpp) next loop
// mm3 ready to be used as Prior(x-bpp) next loop
"jb paeth_3lp \n\t"
@@ -4364,8 +4339,8 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
"=c" (dummy_value_c),
"=a" (dummy_value_a)
- : "0" (prev_row), // esi // input regs
- "1" (row), // edi
+ : "0" (prev_row), // esi/rsi // input regs
+ "1" (row), // edi/rdi
"2" (diff), // ecx
"3" (MMXLength) // eax
@@ -4377,33 +4352,25 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
}
break; // end 3 bpp
- case 6:
+ case 4:
{
-// _ActiveMask2 = 0xffffffff00000000LL; // NOT USED ("_amask_0_4_4")
-// _ShiftBpp = 48; // bpp << 3 == bpp * 8
-// _ShiftRem = 16; // 64 - _ShiftBpp
-
__asm__ __volatile__ (
// preload "movl diff, %%ecx \n\t"
-// preload "movl row, %%edi \n\t"
-// preload "movl prev_row, %%esi \n\t"
- // prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
+// preload "movl row, %1 \n\t" // edi/rdi
+// preload "movl prev_row, %0 \n\t" // esi/rsi
"pxor %%mm0, %%mm0 \n\t"
-
- "paeth_6lp: \n\t"
- // must shift to position Raw(x-bpp) data
- "psrlq $16, %%mm1 \n\t" // was _ShiftRem
+ // prime the pump: load the first Raw(x-bpp) data set
+ "movq -8(%1," PCX ",), %%mm1 \n\t" // only time should need to read
+ // a=Raw(x-bpp) bytes
+ "paeth_4lp: \n\t"
// do first set of 4 bytes
- "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
- "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
- "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
- "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
- // must shift to position Prior(x-bpp) data
- "psrlq $16, %%mm3 \n\t" // was _ShiftRem
+ "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
+ "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
+ "movq (%0," PCX ",), %%mm2 \n\t" // load b=Prior(x)
+ "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
// pav = p - a = (a + b - c) - a = b - c
"movq %%mm2, %%mm4 \n\t"
- "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
+ "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
// pbv = p - b = (a + b - c) - b = a - c
"movq %%mm1, %%mm5 \n\t"
"psubw %%mm3, %%mm4 \n\t"
@@ -4448,26 +4415,19 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
"paddw %%mm3, %%mm7 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm7 \n\t"
- "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
+ "movq (%0," PCX ",), %%mm3 \n\t" // load c=Prior(x-bpp)
LOAD_GOT_rbp
- "pand " AMASK4_4_0 ", %%mm7 \n\t" // _amask4_4_0 (was _ActiveMask)
+ "pand " AMASK4_4_0 ", %%mm7 \n\t" // _amask4_4_0 (was _ActiveMask)
RESTORE_rbp
- "psrlq $16, %%mm3 \n\t"
- "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x) step 1
- "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor + Raw(x)
- "movq %%mm2, %%mm6 \n\t"
- "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
- "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
- "psllq $48, %%mm6 \n\t" // bpp * 8 = bits per pixel
- "movq %%mm7, %%mm5 \n\t"
- "psrlq $16, %%mm1 \n\t" // 64 - (bpp * 8) = remainder
- "por %%mm6, %%mm3 \n\t"
- "psllq $48, %%mm5 \n\t" // was _ShiftBpp
- "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
- "por %%mm5, %%mm1 \n\t"
+ "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
+ "paddb (%1," PCX ",), %%mm7 \n\t" // add Paeth predictor + Raw(x)
+ "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
+ "movq %%mm7, (%1," PCX ",) \n\t" // write back updated value
+ "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as
+ // Raw(x-bpp)
// do second set of 4 bytes
- "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
- "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
+ "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
+ "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
// pav = p - a = (a + b - c) - a = b - c
"movq %%mm2, %%mm4 \n\t"
// pbv = p - b = (a + b - c) - b = a - c
@@ -4517,19 +4477,19 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
// step ecx to next set of 8 bytes and repeat loop til done
"addl $8, %%ecx \n\t"
"packuswb %%mm7, %%mm1 \n\t"
- "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor + Raw(x)
+ "paddb -8(%1," PCX ",), %%mm1 \n\t" // add predictor with Raw(x)
"cmpl %%eax, %%ecx \n\t" // MMXLength
- "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
- // mm1 will be used as Raw(x-bpp) next loop
- "jb paeth_6lp \n\t"
+ "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
+ // mm1 will be used as Raw(x-bpp) next loop
+ "jb paeth_4lp \n\t"
: "=S" (dummy_value_S), // output regs (dummy)
"=D" (dummy_value_D),
"=c" (dummy_value_c),
"=a" (dummy_value_a)
- : "0" (prev_row), // esi // input regs
- "1" (row), // edi
+ : "0" (prev_row), // esi/rsi // input regs
+ "1" (row), // edi/rdi
"2" (diff), // ecx
"3" (MMXLength) // eax
@@ -4539,27 +4499,156 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
#endif
);
}
- break; // end 6 bpp
+ break; // end 4 bpp
- case 4:
+ case 1:
+ case 2:
{
__asm__ __volatile__ (
+// preload "movl diff, %%eax \n\t" // eax: x = offset to align. bdry
+// preload "movl FullLength, %%edx \n\t"
+ "cmpl %%edx, %%eax \n\t"
+ "jnb paeth_dend \n\t"
+
+ SAVE_ebp
+
+// preload "movl row, %2 \n\t" // edi/rdi
+ // do Paeth decode for remaining bytes
+// preload "movl prev_row, %1 \n\t" // esi/rsi
+ "movl %%eax, %%ebp \n\t"
+// preload "subl bpp, %%ebp \n\t" // (bpp is preloaded into ecx)
+ "subl %%ecx, %%ebp \n\t" // ebp = eax - bpp
+ "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
+
+ SAVE_GOT_ebx
+ SAVE_r11_r12_r13
+
+ "paeth_dlp: \n\t"
+ "xorl %%ebx, %%ebx \n\t"
+ // pav = p - a = (a + b - c) - a = b - c
+ "movb (%1," PAX ",), %%bl \n\t" // load Prior(x) into bl
+ "movb (%1," PBP ",), %%cl \n\t" // load Prior(x-bpp) into cl
+ "subl %%ecx, %%ebx \n\t" // subtract Prior(x-bpp)
+ "movl %%ebx, " pa_TEMP " \n\t" // Save pav for later use
+ "xorl %%ebx, %%ebx \n\t"
+ // pbv = p - b = (a + b - c) - b = a - c
+ "movb (%2," PBP ",), %%bl \n\t" // load Raw(x-bpp) into bl
+ "subl %%ecx, %%ebx \n\t" // subtract Prior(x-bpp)
+ "movl %%ebx, %%ecx \n\t"
+ // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
+ "addl " pa_TEMP ", %%ebx \n\t" // pcv = pav + pbv
+ // pc = abs(pcv)
+ "testl $0x80000000, %%ebx \n\t"
+ "jz paeth_dpca \n\t"
+ "negl %%ebx \n\t" // reverse sign of neg values
+
+ "paeth_dpca: \n\t"
+ "movl %%ebx, " pc_TEMP " \n\t" // save pc for later use
+ // pb = abs(pbv)
+ "testl $0x80000000, %%ecx \n\t"
+ "jz paeth_dpba \n\t"
+ "negl %%ecx \n\t" // reverse sign of neg values
+
+ "paeth_dpba: \n\t"
+ "movl %%ecx, " pb_TEMP " \n\t" // save pb for later use
+ // pa = abs(pav)
+ "movl " pa_TEMP ", %%ebx \n\t"
+ "testl $0x80000000, %%ebx \n\t"
+ "jz paeth_dpaa \n\t"
+ "negl %%ebx \n\t" // reverse sign of neg values
+
+ "paeth_dpaa: \n\t"
+ "movl %%ebx, " pa_TEMP " \n\t" // save pa for later use
+ // test if pa <= pb
+ "cmpl %%ecx, %%ebx \n\t"
+ "jna paeth_dabb \n\t"
+ // pa > pb; now test if pb <= pc
+ "cmpl " pc_TEMP ", %%ecx \n\t"
+ "jna paeth_dbbc \n\t"
+ // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
+ "movb (%1," PBP ",), %%cl \n\t" // load Prior(x-bpp) into cl
+ "jmp paeth_dpaeth \n\t"
+
+ "paeth_dbbc: \n\t"
+ // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
+ "movb (%1," PAX ",), %%cl \n\t" // load Prior(x) into cl
+ "jmp paeth_dpaeth \n\t"
+
+ "paeth_dabb: \n\t"
+ // pa <= pb; now test if pa <= pc
+ "cmpl " pc_TEMP ", %%ebx \n\t"
+ "jna paeth_dabc \n\t"
+ // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
+ "movb (%1," PBP ",), %%cl \n\t" // load Prior(x-bpp) into cl
+ "jmp paeth_dpaeth \n\t"
+
+ "paeth_dabc: \n\t"
+ // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
+ "movb (%2," PBP ",), %%cl \n\t" // load Raw(x-bpp) into cl
+
+ "paeth_dpaeth: \n\t"
+ "incl %%eax \n\t"
+ "incl %%ebp \n\t"
+ // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
+ "addb %%cl, -1(%2," PAX ",) \n\t"
+ "cmpl %%edx, %%eax \n\t" // check against FullLength
+ "jb paeth_dlp \n\t"
+
+ RESTORE_r11_r12_r13
+ RESTORE_GOT_ebx
+ RESTORE_ebp
+
+ "paeth_dend: \n\t"
+
+ : "=c" (dummy_value_c), // output regs (dummy)
+ "=S" (dummy_value_S),
+ "=D" (dummy_value_D),
+ "=a" (dummy_value_a),
+ "=d" (dummy_value_d)
+
+ : "0" (bpp), // ecx // input regs
+ "1" (prev_row), // esi/rsi
+ "2" (row), // edi/rdi
+ "3" (diff), // eax
+ "4" (FullLength) // edx
+
+ CLOB_COLON_ebx_ebp_r1X // clobber list
+ CLOBBER_GOT_ebx
+ CLOB_COMMA_ebx_ebp
+ CLOBBER_ebp
+ CLOB_COMMA_ebX_r1X
+ CLOBBER_r11_r12_r13
+ );
+ }
+ return; // end 1 or 2 bpp (no need to go further with this one)
+
+ case 6:
+ {
+// _ActiveMask2 = 0xffffffff00000000LL; // NOT USED ("_amask_0_4_4")
+// _ShiftBpp = 48; // bpp << 3 == bpp * 8
+// _ShiftRem = 16; // 64 - _ShiftBpp
+
+ __asm__ __volatile__ (
// preload "movl diff, %%ecx \n\t"
-// preload "movl row, %%edi \n\t"
-// preload "movl prev_row, %%esi \n\t"
- "pxor %%mm0, %%mm0 \n\t"
+// preload "movl row, %1 \n\t" // edi/rdi
+// preload "movl prev_row, %0 \n\t" // esi/rsi
// prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
- // a=Raw(x-bpp) bytes
- "paeth_4lp: \n\t"
+ "movq -8(%1," PCX ",), %%mm1 \n\t"
+ "pxor %%mm0, %%mm0 \n\t"
+
+ "paeth_6lp: \n\t"
+ // must shift to position Raw(x-bpp) data
+ "psrlq $16, %%mm1 \n\t" // was _ShiftRem
// do first set of 4 bytes
- "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
- "punpckhbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
- "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
- "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
+ "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
+ "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
+ "movq (%0," PCX ",), %%mm2 \n\t" // load b=Prior(x)
+ "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
+ // must shift to position Prior(x-bpp) data
+ "psrlq $16, %%mm3 \n\t" // was _ShiftRem
// pav = p - a = (a + b - c) - a = b - c
"movq %%mm2, %%mm4 \n\t"
- "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
+ "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
// pbv = p - b = (a + b - c) - b = a - c
"movq %%mm1, %%mm5 \n\t"
"psubw %%mm3, %%mm4 \n\t"
@@ -4604,19 +4693,26 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
"paddw %%mm3, %%mm7 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm7 \n\t"
- "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
+ "movq -8(%0," PCX ",), %%mm3 \n\t" // load c=Prior(x-bpp)
LOAD_GOT_rbp
- "pand " AMASK4_4_0 ", %%mm7 \n\t" // _amask4_4_0 (was _ActiveMask)
+ "pand " AMASK4_4_0 ", %%mm7 \n\t" // _amask4_4_0 (was _ActiveMask)
RESTORE_rbp
- "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
- "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor + Raw(x)
- "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
- "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
- "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as
- // Raw(x-bpp)
+ "psrlq $16, %%mm3 \n\t"
+ "movq (%0," PCX ",), %%mm2 \n\t" // load b=Prior(x) step 1
+ "paddb (%1," PCX ",), %%mm7 \n\t" // add Paeth predictor + Raw(x)
+ "movq %%mm2, %%mm6 \n\t"
+ "movq %%mm7, (%1," PCX ",) \n\t" // write back updated value
+ "movq -8(%1," PCX ",), %%mm1 \n\t"
+ "psllq $48, %%mm6 \n\t" // bpp * 8 = bits per pixel
+ "movq %%mm7, %%mm5 \n\t"
+ "psrlq $16, %%mm1 \n\t" // 64 - (bpp * 8) = remainder
+ "por %%mm6, %%mm3 \n\t"
+ "psllq $48, %%mm5 \n\t" // was _ShiftBpp
+ "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
+ "por %%mm5, %%mm1 \n\t"
// do second set of 4 bytes
- "punpckhbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
- "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
+ "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
+ "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
// pav = p - a = (a + b - c) - a = b - c
"movq %%mm2, %%mm4 \n\t"
// pbv = p - b = (a + b - c) - b = a - c
@@ -4666,19 +4762,19 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
// step ecx to next set of 8 bytes and repeat loop til done
"addl $8, %%ecx \n\t"
"packuswb %%mm7, %%mm1 \n\t"
- "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
+ "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
"cmpl %%eax, %%ecx \n\t" // MMXLength
- "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
- // mm1 will be used as Raw(x-bpp) next loop
- "jb paeth_4lp \n\t"
+ "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
+ // mm1 will be used as Raw(x-bpp) next loop
+ "jb paeth_6lp \n\t"
: "=S" (dummy_value_S), // output regs (dummy)
"=D" (dummy_value_D),
"=c" (dummy_value_c),
"=a" (dummy_value_a)
- : "0" (prev_row), // esi // input regs
- "1" (row), // edi
+ : "0" (prev_row), // esi/rsi // input regs
+ "1" (row), // edi/rdi
"2" (diff), // ecx
"3" (MMXLength) // eax
@@ -4688,23 +4784,23 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
#endif
);
}
- break; // end 4 bpp
+ break; // end 6 bpp
case 8: // bpp == 8
{
__asm__ __volatile__ (
// preload "movl diff, %%ecx \n\t"
-// preload "movl row, %%edi \n\t"
-// preload "movl prev_row, %%esi \n\t"
+// preload "movl row, %1 \n\t" // edi/rdi
+// preload "movl prev_row, %0 \n\t" // esi/rsi
"pxor %%mm0, %%mm0 \n\t"
// prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
+ "movq -8(%1," PCX ",), %%mm1 \n\t" // only time should need to read
// a=Raw(x-bpp) bytes
"paeth_8lp: \n\t"
// do first set of 4 bytes
- "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
+ "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
- "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
+ "movq (%0," PCX ",), %%mm2 \n\t" // load b=Prior(x)
"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
// pav = p - a = (a + b - c) - a = b - c
"movq %%mm2, %%mm4 \n\t"
@@ -4753,15 +4849,15 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
"paddw %%mm3, %%mm7 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm7 \n\t"
- "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
+ "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
LOAD_GOT_rbp
"pand " AMASK4_4_0 ", %%mm7 \n\t" // _amask4_4_0 (was _ActiveMask)
RESTORE_rbp
- "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
- "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor + Raw(x)
+ "movq (%0," PCX ",), %%mm2 \n\t" // load b=Prior(x)
+ "paddb (%1," PCX ",), %%mm7 \n\t" // add Paeth predictor + Raw(x)
"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
- "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
- "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
+ "movq %%mm7, (%1," PCX ",) \n\t" // write back updated value
+ "movq -8(%1," PCX ",), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
// do second set of 4 bytes
"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
@@ -4815,10 +4911,10 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
// step ecx to next set of 8 bytes and repeat loop til done
"addl $8, %%ecx \n\t"
"packuswb %%mm7, %%mm1 \n\t"
- "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor + Raw(x)
+ "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
"cmpl %%eax, %%ecx \n\t" // MMXLength
- "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
- // mm1 will be used as Raw(x-bpp) next loop
+ "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
+ // mm1 will be used as Raw(x-bpp) next loop
"jb paeth_8lp \n\t"
: "=S" (dummy_value_S), // output regs (dummy)
@@ -4826,8 +4922,8 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
"=c" (dummy_value_c),
"=a" (dummy_value_a)
- : "0" (prev_row), // esi // input regs
- "1" (row), // edi
+ : "0" (prev_row), // esi/rsi // input regs
+ "1" (row), // edi/rdi
"2" (diff), // ecx
"3" (MMXLength) // eax
@@ -4839,125 +4935,15 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
}
break; // end 8 bpp
- default: // bpp = 1 or 2
+ default: // bpp != 1,2,3,4,6,8: doesn't exist
{
- __asm__ __volatile__ (
-// preload "movl diff, %%eax \n\t" // eax: x = offset to align. bdry
-// preload "movl FullLength, %%edx \n\t"
- "cmpl %%edx, %%eax \n\t"
- "jnb paeth_dend \n\t"
-
- SAVE_ebp
-
-// preload "movl row, %%edi \n\t"
- // do Paeth decode for remaining bytes
-// preload "movl prev_row, %%esi \n\t"
- "movl %%eax, %%ebp \n\t"
-// preload "subl bpp, %%ebp \n\t" // (bpp is preloaded into ecx)
- "subl %%ecx, %%ebp \n\t" // ebp = eax - bpp
- "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
-
- SAVE_GOT_ebx
- SAVE_r11_r12_r13
-
- "paeth_dlp: \n\t"
- "xorl %%ebx, %%ebx \n\t"
- // pav = p - a = (a + b - c) - a = b - c
- "movb (%%esi,%%eax,), %%bl \n\t" // load Prior(x) into bl
- "movb (%%esi,%%ebp,), %%cl \n\t" // load Prior(x-bpp) into cl
- "subl %%ecx, %%ebx \n\t" // subtract Prior(x-bpp)
- "movl %%ebx, " pa_TEMP " \n\t" // Save pav for later use
- "xorl %%ebx, %%ebx \n\t"
- // pbv = p - b = (a + b - c) - b = a - c
- "movb (%%edi,%%ebp,), %%bl \n\t" // load Raw(x-bpp) into bl
- "subl %%ecx, %%ebx \n\t" // subtract Prior(x-bpp)
- "movl %%ebx, %%ecx \n\t"
- // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
- "addl " pa_TEMP ", %%ebx \n\t" // pcv = pav + pbv
- // pc = abs(pcv)
- "testl $0x80000000, %%ebx \n\t"
- "jz paeth_dpca \n\t"
- "negl %%ebx \n\t" // reverse sign of neg values
-
- "paeth_dpca: \n\t"
- "movl %%ebx, " pc_TEMP " \n\t" // save pc for later use
- // pb = abs(pbv)
- "testl $0x80000000, %%ecx \n\t"
- "jz paeth_dpba \n\t"
- "negl %%ecx \n\t" // reverse sign of neg values
-
- "paeth_dpba: \n\t"
- "movl %%ecx, " pb_TEMP " \n\t" // save pb for later use
- // pa = abs(pav)
- "movl " pa_TEMP ", %%ebx \n\t"
- "testl $0x80000000, %%ebx \n\t"
- "jz paeth_dpaa \n\t"
- "negl %%ebx \n\t" // reverse sign of neg values
-
- "paeth_dpaa: \n\t"
- "movl %%ebx, " pa_TEMP " \n\t" // save pa for later use
- // test if pa <= pb
- "cmpl %%ecx, %%ebx \n\t"
- "jna paeth_dabb \n\t"
- // pa > pb; now test if pb <= pc
- "cmpl " pc_TEMP ", %%ecx \n\t"
- "jna paeth_dbbc \n\t"
- // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
- "movb (%%esi,%%ebp,), %%cl \n\t" // load Prior(x-bpp) into cl
- "jmp paeth_dpaeth \n\t"
-
- "paeth_dbbc: \n\t"
- // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
- "movb (%%esi,%%eax,), %%cl \n\t" // load Prior(x) into cl
- "jmp paeth_dpaeth \n\t"
-
- "paeth_dabb: \n\t"
- // pa <= pb; now test if pa <= pc
- "cmpl " pc_TEMP ", %%ebx \n\t"
- "jna paeth_dabc \n\t"
- // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
- "movb (%%esi,%%ebp,), %%cl \n\t" // load Prior(x-bpp) into cl
- "jmp paeth_dpaeth \n\t"
-
- "paeth_dabc: \n\t"
- // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
- "movb (%%edi,%%ebp,), %%cl \n\t" // load Raw(x-bpp) into cl
-
- "paeth_dpaeth: \n\t"
- "incl %%eax \n\t"
- "incl %%ebp \n\t"
- // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
- "addb %%cl, -1(%%edi,%%eax,) \n\t"
- "cmpl %%edx, %%eax \n\t" // check against FullLength
- "jb paeth_dlp \n\t"
-
- RESTORE_r11_r12_r13
- RESTORE_GOT_ebx
- RESTORE_ebp
-
- "paeth_dend: \n\t"
-
- : "=c" (dummy_value_c), // output regs (dummy)
- "=S" (dummy_value_S),
- "=D" (dummy_value_D),
- "=a" (dummy_value_a),
- "=d" (dummy_value_d)
-
- : "0" (bpp), // ecx // input regs
- "1" (prev_row), // esi
- "2" (row), // edi
- "3" (diff), // eax
- "4" (FullLength) // edx
-
- CLOB_COLON_ebx_ebp_r1X // clobber list
- CLOBBER_GOT_ebx
- CLOB_COMMA_ebx_ebp
- CLOBBER_ebp
- CLOB_COMMA_ebX_r1X
- CLOBBER_r11_r12_r13
- );
+ // ERROR: SHOULD NEVER BE REACHED
+#if defined(PNG_DEBUG)
+ png_debug(1, "Internal libpng logic error (GCC "
+ "png_read_filter_row_mmx_paeth())\n");
+#endif
}
- return; // No need to go further with this one
+ break;
} // end switch (bpp)
@@ -4971,8 +4957,8 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
SAVE_ebp
-//pre "movl row, %%edi \n\t"
-//pre "movl prev_row, %%esi \n\t"
+//pre "movl row, %2 \n\t" // edi/rdi
+//pre "movl prev_row, %1 \n\t" // esi/rsi
// do Paeth decode for remaining bytes
"movl %%eax, %%ebp \n\t"
//pre "subl bpp, %%ebp \n\t" // (bpp is preloaded into ecx)
@@ -4985,13 +4971,13 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
"paeth_lp2: \n\t"
"xorl %%ebx, %%ebx \n\t"
// pav = p - a = (a + b - c) - a = b - c
- "movb (%%esi,%%eax,), %%bl \n\t" // load Prior(x) into bl
- "movb (%%esi,%%ebp,), %%cl \n\t" // load Prior(x-bpp) into cl
+ "movb (%1," PAX ",), %%bl \n\t" // load Prior(x) into bl
+ "movb (%1," PBP ",), %%cl \n\t" // load Prior(x-bpp) into cl
"subl %%ecx, %%ebx \n\t" // subtract Prior(x-bpp)
"movl %%ebx, " pa_TEMP " \n\t" // Save pav for later use
"xorl %%ebx, %%ebx \n\t"
// pbv = p - b = (a + b - c) - b = a - c
- "movb (%%edi,%%ebp,), %%bl \n\t" // load Raw(x-bpp) into bl
+ "movb (%2," PBP ",), %%bl \n\t" // load Raw(x-bpp) into bl
"subl %%ecx, %%ebx \n\t" // subtract Prior(x-bpp)
"movl %%ebx, %%ecx \n\t"
// pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
@@ -5025,12 +5011,12 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
"cmpl " pc_TEMP ", %%ecx \n\t"
"jna paeth_bbc2 \n\t"
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
- "movb (%%esi,%%ebp,), %%cl \n\t" // load Prior(x-bpp) into cl
+ "movb (%1," PBP ",), %%cl \n\t" // load Prior(x-bpp) into cl
"jmp paeth_paeth2 \n\t"
"paeth_bbc2: \n\t"
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
- "movb (%%esi,%%eax,), %%cl \n\t" // load Prior(x) into cl
+ "movb (%1," PAX ",), %%cl \n\t" // load Prior(x) into cl
"jmp paeth_paeth2 \n\t"
"paeth_abb2: \n\t"
@@ -5038,18 +5024,18 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
"cmpl " pc_TEMP ", %%ebx \n\t"
"jna paeth_abc2 \n\t"
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
- "movb (%%esi,%%ebp,), %%cl \n\t" // load Prior(x-bpp) into cl
+ "movb (%1," PBP ",), %%cl \n\t" // load Prior(x-bpp) into cl
"jmp paeth_paeth2 \n\t"
"paeth_abc2: \n\t"
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
- "movb (%%edi,%%ebp,), %%cl \n\t" // load Raw(x-bpp) into cl
+ "movb (%2," PBP ",), %%cl \n\t" // load Raw(x-bpp) into cl
"paeth_paeth2: \n\t"
"incl %%eax \n\t"
"incl %%ebp \n\t"
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
- "addb %%cl, -1(%%edi,%%eax,) \n\t"
+ "addb %%cl, -1(%2," PAX ",) \n\t"
"cmpl %%edx, %%eax \n\t" // check against FullLength
"jb paeth_lp2 \n\t"
@@ -5067,8 +5053,8 @@ png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
"=d" (dummy_value_d)
: "0" (bpp), // ecx // input regs
- "1" (prev_row), // esi
- "2" (row), // edi
+ "1" (prev_row), // esi/rsi
+ "2" (row), // edi/rdi
"3" (MMXLength), // eax
"4" (FullLength) // edx
@@ -5113,7 +5099,7 @@ png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
__asm__ __volatile__ (
SAVE_r15
SAVE_ebp
-//pre "movl row, %%edi \n\t"
+//pre "movl row, %%edi \n\t" // edi/rdi
"movl %%edi, %%esi \n\t" // lp = row
//pre "movl bpp, %%ecx \n\t"
"addl %%ecx, %%edi \n\t" // rp = row + bpp
@@ -5168,7 +5154,7 @@ png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
// _ShiftRem = 40; // == 64 - 24
__asm__ __volatile__ (
-// preload "movl row, %%edi \n\t"
+// preload "movl row, %%edi \n\t" // edi/rdi
LOAD_GOT_rbp
// load (former) _ActiveMask for 2nd active byte group
"movq " AMASK2_3_3 ", %%mm7 \n\t" // _amask2_3_3
@@ -5234,7 +5220,7 @@ png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
// _ShiftRem = 64 - _ShiftBpp; // 32 (psrlq)
__asm__ __volatile__ (
-// preload "movl row, %%edi \n\t"
+// preload "movl row, %%edi \n\t" // edi/rdi
// preload "movl diff, %%edx \n\t"
"movl %%edi, %%esi \n\t" // lp = row
// preload "movl bpp, %%ecx \n\t"
@@ -5278,61 +5264,11 @@ png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
}
break; // end 4 bpp
- case 6: // formerly shared with 4 bpp case (see comments there)
- {
-// _ShiftBpp = bpp << 3; // 48 (psllq)
-// _ShiftRem = 64 - _ShiftBpp; // 16 (psrlq)
-
- __asm__ __volatile__ (
-// preload "movl row, %%edi \n\t"
-// preload "movl diff, %%edx \n\t"
- "movl %%edi, %%esi \n\t" // lp = row
-// preload "movl bpp, %%ecx \n\t"
- "addl %%ecx, %%edi \n\t" // rp = row + bpp
-
- // prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%edx,), %%mm1 \n\t"
-
- "sub_6lp: \n\t" // shift data for adding first
- "psrlq $16, %%mm1 \n\t" // bpp bytes (no need for mask;
- // shift clears inactive bytes)
- "movq (%%edi,%%edx,), %%mm0 \n\t"
- "paddb %%mm1, %%mm0 \n\t"
-
- // add 2nd active group
- "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
- "psllq $48, %%mm1 \n\t" // shift data to pos. correctly
- "addl $8, %%edx \n\t"
- "paddb %%mm1, %%mm0 \n\t"
-
- "cmpl %%eax, %%edx \n\t" // MMXLength
- "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
- "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
- "jb sub_6lp \n\t"
-
- : "=c" (dummy_value_c), // 0 // output regs (dummy)
- "=D" (dummy_value_D), // 1
- "=d" (dummy_value_d), // 2
- "=a" (dummy_value_a) // 3
-
- : "0" (bpp), // ecx // input regs
- "1" (row), // edi
- "2" (diff), // edx
- "3" (MMXLength) // eax
-
- : "%esi" // clobber list
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
- , "%mm0", "%mm1"
-#endif
- );
- }
- break; // end 6 bpp
-
case 1:
{
__asm__ __volatile__ (
// preload "movl diff, %%edx \n\t"
-// preload "movl row, %%edi \n\t"
+// preload "movl row, %%edi \n\t" // edi/rdi
// preload "cmpl FullLength, %%edx \n\t"
"cmpl %%eax, %%edx \n\t"
"jnb sub_1end \n\t"
@@ -5377,7 +5313,7 @@ png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
RESTORE_rbp
// preload "movl diff, %%edx \n\t"
"movq %%mm7, %%mm6 \n\t"
-// preload "movl row, %%edi \n\t"
+// preload "movl row, %%edi \n\t" // edi/rdi
"psllq $16, %%mm6 \n\t" // move mask in mm6 to cover
// 3rd active byte group
"movl %%edi, %%esi \n\t" // lp = row
@@ -5437,11 +5373,61 @@ png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
}
break; // end 2 bpp
+ case 6: // formerly shared with 4 bpp case (see comments there)
+ {
+// _ShiftBpp = bpp << 3; // 48 (psllq)
+// _ShiftRem = 64 - _ShiftBpp; // 16 (psrlq)
+
+ __asm__ __volatile__ (
+// preload "movl row, %%edi \n\t" // edi/rdi
+// preload "movl diff, %%edx \n\t"
+ "movl %%edi, %%esi \n\t" // lp = row
+// preload "movl bpp, %%ecx \n\t"
+ "addl %%ecx, %%edi \n\t" // rp = row + bpp
+
+ // prime the pump: load the first Raw(x-bpp) data set
+ "movq -8(%%edi,%%edx,), %%mm1 \n\t"
+
+ "sub_6lp: \n\t" // shift data for adding first
+ "psrlq $16, %%mm1 \n\t" // bpp bytes (no need for mask;
+ // shift clears inactive bytes)
+ "movq (%%edi,%%edx,), %%mm0 \n\t"
+ "paddb %%mm1, %%mm0 \n\t"
+
+ // add 2nd active group
+ "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
+ "psllq $48, %%mm1 \n\t" // shift data to pos. correctly
+ "addl $8, %%edx \n\t"
+ "paddb %%mm1, %%mm0 \n\t"
+
+ "cmpl %%eax, %%edx \n\t" // MMXLength
+ "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
+ "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
+ "jb sub_6lp \n\t"
+
+ : "=c" (dummy_value_c), // 0 // output regs (dummy)
+ "=D" (dummy_value_D), // 1
+ "=d" (dummy_value_d), // 2
+ "=a" (dummy_value_a) // 3
+
+ : "0" (bpp), // ecx // input regs
+ "1" (row), // edi
+ "2" (diff), // edx
+ "3" (MMXLength) // eax
+
+ : "%esi" // clobber list
+#if defined(CLOBBER_MMX_REGS_SUPPORTED)
+ , "%mm0", "%mm1"
+#endif
+ );
+ }
+ break; // end 6 bpp
+
case 8:
{
__asm__ __volatile__ (
SAVE_ebp
-// preload "movl row, %%edi \n\t"
+// preload "movl row, %%edi \n\t" // edi/rdi
// preload "movl diff, %%edx \n\t"
"movl %%edi, %%esi \n\t" // lp = row
// preload "movl bpp, %%ecx \n\t"
@@ -5525,10 +5511,10 @@ png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
default: // bpp != 1,2,3,4,6,8: doesn't exist
{
-#if defined(PNG_DEBUG)
// ERROR: SHOULD NEVER BE REACHED
- png_debug(1,
- "Internal libpng logic error (GCC png_read_filter_row_mmx_sub())\n");
+#if defined(PNG_DEBUG)
+ png_debug(1, "Internal libpng logic error (GCC "
+ "png_read_filter_row_mmx_sub())\n");
#endif
}
break;
@@ -5537,7 +5523,7 @@ png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
__asm__ __volatile__ (
//pre "movl MMXLength, %%eax \n\t"
-//pre "movl row, %%edi \n\t"
+//pre "movl row, %%edi \n\t" // edi/rdi
//pre "cmpl FullLength, %%eax \n\t"
"cmpl %%edx, %%eax \n\t"
"jnb sub_end \n\t"
@@ -5596,14 +5582,14 @@ png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
__asm__ __volatile__ (
SAVE_GOT_ebx
-//pre "movl row, %%edi \n\t"
+//pre "movl row, %%edi \n\t" // edi/rdi
// get # of bytes to alignment
"movl %%edi, %%ecx \n\t"
"xorl %%ebx, %%ebx \n\t"
"addl $0x7, %%ecx \n\t"
"xorl %%eax, %%eax \n\t"
"andl $0xfffffff8, %%ecx \n\t"
-//pre "movl prev_row, %%esi \n\t"
+//pre "movl prev_row, %%esi \n\t" // esi/rsi
"subl %%edi, %%ecx \n\t"
"jz up_go \n\t"
@@ -5718,8 +5704,6 @@ png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
} // end of png_read_filter_row_mmx_up()
-#endif /* PNG_MMX_CODE_SUPPORTED */
-
@@ -5737,10 +5721,9 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
row, png_bytep prev_row, int filter)
{
#if defined(PNG_DEBUG)
- char filnm[10];
+ char filtname[10];
#endif
-#if defined(PNG_MMX_CODE_SUPPORTED)
if (_mmx_supported == 2) {
#if !defined(PNG_1_0_X)
/* this should have happened in png_init_mmx_flags() already */
@@ -5748,19 +5731,17 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
#endif
png_mmx_support();
}
-#endif /* PNG_MMX_CODE_SUPPORTED */
#if defined(PNG_DEBUG)
png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
switch (filter)
{
case 0:
- png_snprintf(filnm, 10, "none");
+ png_snprintf(filtname, 10, "none");
break;
case 1:
- png_snprintf(filnm, 10, "sub-%s",
-#if defined(PNG_MMX_CODE_SUPPORTED)
+ png_snprintf(filtname, 10, "sub-%s",
#if !defined(PNG_1_0_X)
((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
@@ -5768,14 +5749,11 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
#else
_mmx_supported
#endif
- ? "MMX" :
-#endif
- "x86");
+ ? "MMX" : "x86");
break;
case 2:
- png_snprintf(filnm, 10, "up-%s",
-#if defined(PNG_MMX_CODE_SUPPORTED)
+ png_snprintf(filtname, 10, "up-%s",
#if !defined(PNG_1_0_X)
((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
@@ -5783,14 +5761,11 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
#else
_mmx_supported
#endif
- ? "MMX" :
-#endif
- "x86");
+ ? "MMX" : "x86");
break;
case 3:
- png_snprintf(filnm, 10, "avg-%s",
-#if defined(PNG_MMX_CODE_SUPPORTED)
+ png_snprintf(filtname, 10, "avg-%s",
#if !defined(PNG_1_0_X)
((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
@@ -5798,14 +5773,12 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
#else
_mmx_supported
#endif
- ? "MMX" :
-#endif
- "x86");
+ ? "MMX" : "x86");
break;
case 4:
- png_snprintf(filnm, 10, "Paeth-%s",
-#if defined(PNG_MMX_CODE_SUPPORTED)
+ png_snprintf(filtname, 10, "paeth-%s",
+#if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
@@ -5814,19 +5787,21 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
_mmx_supported
#endif
? "MMX" :
-#endif
+#endif /* PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK */
"x86");
break;
default:
- png_snprintf(filnm, 10, "unknown");
+ png_snprintf(filtname, 10, "unknown");
break;
}
- png_debug2(0, "row_number=%5ld, %10s, ", png_ptr->row_number, filnm);
- png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
- png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
+ png_debug2(2, "row_number=%ld, %s, ", png_ptr->row_number, filtname);
+ //png_debug1(0, "png_ptr=%10p, ", png_ptr);
+ //png_debug1(0, "asm_flags=0x%08lx, ", png_ptr->asm_flags);
+ png_debug1(0, "row=%10p, ", row);
+ png_debug2(0, "pixdepth=%d, bytes=%d, ", (int)row_info->pixel_depth,
(int)((row_info->pixel_depth + 7) >> 3));
- png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
+ png_debug1(0, "rowbytes=%ld\n", row_info->rowbytes);
#endif /* PNG_DEBUG */
switch (filter)
@@ -5835,7 +5810,6 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
break;
case PNG_FILTER_VALUE_SUB:
-#if defined(PNG_MMX_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
@@ -5847,7 +5821,6 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
png_read_filter_row_mmx_sub(row_info, row);
}
else
-#endif /* PNG_MMX_CODE_SUPPORTED */
{
png_uint_32 i;
png_uint_32 istop = row_info->rowbytes;
@@ -5864,7 +5837,6 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
break;
case PNG_FILTER_VALUE_UP:
-#if defined(PNG_MMX_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
@@ -5876,7 +5848,6 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
png_read_filter_row_mmx_up(row_info, row, prev_row);
}
else
-#endif /* PNG_MMX_CODE_SUPPORTED */
{
png_uint_32 i;
png_uint_32 istop = row_info->rowbytes;
@@ -5892,7 +5863,6 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
break;
case PNG_FILTER_VALUE_AVG:
-#if defined(PNG_MMX_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
@@ -5904,7 +5874,6 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
png_read_filter_row_mmx_avg(row_info, row, prev_row);
}
else
-#endif /* PNG_MMX_CODE_SUPPORTED */
{
png_uint_32 i;
png_bytep rp = row;
@@ -5930,7 +5899,6 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
break;
case PNG_FILTER_VALUE_PAETH:
-#if defined(PNG_MMX_CODE_SUPPORTED)
#if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
@@ -5944,7 +5912,6 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
}
else
#endif /* PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK */
-#endif /* PNG_MMX_CODE_SUPPORTED */
{
png_uint_32 i;
png_bytep rp = row;
@@ -6009,3 +5976,5 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGGCCRD */
+#endif /* __GNUC__ */
+
diff --git a/pngpread.c b/pngpread.c
index b68244aa7..6eff87d03 100644
--- a/pngpread.c
+++ b/pngpread.c
@@ -1,7 +1,7 @@
/* pngpread.c - read a png file in push mode
*
- * Last changed in libpng 1.2.19 June 23, 2007
+ * Last changed in libpng 1.2.19 June 28, 2007
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1998-2007 Glenn Randers-Pehrson
* (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
diff --git a/pngread.c b/pngread.c
index f5a270513..f8fcdc442 100644
--- a/pngread.c
+++ b/pngread.c
@@ -1,7 +1,7 @@
/* pngread.c - read a PNG file
*
- * Last changed in libpng 1.2.19 June 23, 2007
+ * Last changed in libpng 1.2.19 June 28, 2007
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1998-2007 Glenn Randers-Pehrson
* (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
diff --git a/pngrtran.c b/pngrtran.c
index f8dce7198..fa0671265 100644
--- a/pngrtran.c
+++ b/pngrtran.c
@@ -1,7 +1,7 @@
/* pngrtran.c - transforms the data in a row for PNG readers
*
- * Last changed in libpng 1.2.19 June 23, 2007
+ * Last changed in libpng 1.2.19 June 28, 2007
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1998-2007 Glenn Randers-Pehrson
* (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
@@ -1293,9 +1293,11 @@ png_do_read_transformations(png_structp png_ptr)
if(rgb_error)
{
png_ptr->rgb_to_gray_status=1;
- if(png_ptr->transformations & PNG_RGB_TO_GRAY_WARN)
+ if((png_ptr->transformations & PNG_RGB_TO_GRAY) ==
+ PNG_RGB_TO_GRAY_WARN)
png_warning(png_ptr, "png_do_rgb_to_gray found nongray pixel");
- if(png_ptr->transformations & PNG_RGB_TO_GRAY_ERR)
+ if((png_ptr->transformations & PNG_RGB_TO_GRAY) ==
+ PNG_RGB_TO_GRAY_ERR)
png_error(png_ptr, "png_do_rgb_to_gray found nongray pixel");
}
}
diff --git a/pngrutil.c b/pngrutil.c
index b782530da..673d812fd 100644
--- a/pngrutil.c
+++ b/pngrutil.c
@@ -1,7 +1,7 @@
/* pngrutil.c - utilities to read a PNG file
*
- * Last changed in libpng 1.2.19 June 23, 2007
+ * Last changed in libpng 1.2.19 June 28, 2007
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1998-2007 Glenn Randers-Pehrson
* (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
@@ -2325,7 +2325,50 @@ static PNG_CONST int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
{
switch (png_ptr->row_info.pixel_depth)
{
- /* most common case: combining 32-bit RGBA */
+ /* most common case: combining 24-bit RGB */
+ case 24: /* png_ptr->row_info.pixel_depth */
+ {
+ png_bytep srcptr;
+ png_bytep dstptr;
+
+ {
+ register png_uint_32 i;
+ png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
+ /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
+ register int stride = BPP3 * png_pass_inc[png_ptr->pass];
+ /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
+ register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
+ /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
+ png_uint_32 len = png_ptr->width &~7; /* reduce to mult of 8 */
+ int diff = (int) (png_ptr->width & 7); /* amount lost */
+ register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */
+
+ srcptr = png_ptr->row_buf + 1 + initial_val;
+ dstptr = row + initial_val;
+
+ for (i = initial_val; i < final_val; i += stride)
+ {
+ png_memcpy(dstptr, srcptr, rep_bytes);
+ srcptr += stride;
+ dstptr += stride;
+ }
+ if (diff) /* number of leftover pixels: 3 for pngtest */
+ {
+ final_val += diff*BPP3;
+ for (; i < final_val; i += stride)
+ {
+ if (rep_bytes > (int)(final_val-i))
+ rep_bytes = (int)(final_val-i);
+ png_memcpy(dstptr, srcptr, rep_bytes);
+ srcptr += stride;
+ dstptr += stride;
+ }
+ }
+ } /* end of pixel-copy block */
+
+ break;
+ } /* end 24 bpp */
+
case 32: /* png_ptr->row_info.pixel_depth */
{
png_bytep srcptr;
@@ -2369,6 +2412,48 @@ static PNG_CONST int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
break;
} /* end 32 bpp */
+ case 8: /* png_ptr->row_info.pixel_depth */
+ {
+ png_bytep srcptr;
+ png_bytep dstptr;
+ {
+ register png_uint_32 i;
+ png_uint_32 initial_val = png_pass_start[png_ptr->pass];
+ /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
+ register int stride = png_pass_inc[png_ptr->pass];
+ /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
+ register int rep_bytes = png_pass_width[png_ptr->pass];
+ /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
+ png_uint_32 len = png_ptr->width &~7; /* reduce to mult of 8 */
+ int diff = (int) (png_ptr->width & 7); /* amount lost */
+ register png_uint_32 final_val = len; /* GRR bugfix */
+
+ srcptr = png_ptr->row_buf + 1 + initial_val;
+ dstptr = row + initial_val;
+
+ for (i = initial_val; i < final_val; i += stride)
+ {
+ png_memcpy(dstptr, srcptr, rep_bytes);
+ srcptr += stride;
+ dstptr += stride;
+ }
+ if (diff) /* number of leftover pixels: 3 for pngtest */
+ {
+ final_val += diff /* *BPP1 */ ;
+ for (; i < final_val; i += stride)
+ {
+ if (rep_bytes > (int)(final_val-i))
+ rep_bytes = (int)(final_val-i);
+ png_memcpy(dstptr, srcptr, rep_bytes);
+ srcptr += stride;
+ dstptr += stride;
+ }
+ }
+ }
+
+ break;
+ } /* end 8 bpp */
+
case 1: /* png_ptr->row_info.pixel_depth */
{
png_bytep sp;
@@ -2535,48 +2620,6 @@ static PNG_CONST int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
break;
} /* end 4 bpp */
- case 8: /* png_ptr->row_info.pixel_depth */
- {
- png_bytep srcptr;
- png_bytep dstptr;
- {
- register png_uint_32 i;
- png_uint_32 initial_val = png_pass_start[png_ptr->pass];
- /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
- register int stride = png_pass_inc[png_ptr->pass];
- /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
- register int rep_bytes = png_pass_width[png_ptr->pass];
- /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
- png_uint_32 len = png_ptr->width &~7; /* reduce to mult of 8 */
- int diff = (int) (png_ptr->width & 7); /* amount lost */
- register png_uint_32 final_val = len; /* GRR bugfix */
-
- srcptr = png_ptr->row_buf + 1 + initial_val;
- dstptr = row + initial_val;
-
- for (i = initial_val; i < final_val; i += stride)
- {
- png_memcpy(dstptr, srcptr, rep_bytes);
- srcptr += stride;
- dstptr += stride;
- }
- if (diff) /* number of leftover pixels: 3 for pngtest */
- {
- final_val += diff /* *BPP1 */ ;
- for (; i < final_val; i += stride)
- {
- if (rep_bytes > (int)(final_val-i))
- rep_bytes = (int)(final_val-i);
- png_memcpy(dstptr, srcptr, rep_bytes);
- srcptr += stride;
- dstptr += stride;
- }
- }
- }
-
- break;
- } /* end 8 bpp */
-
case 16: /* png_ptr->row_info.pixel_depth */
{
png_bytep srcptr;
@@ -2621,48 +2664,6 @@ static PNG_CONST int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
} /* end 16 bpp */
- case 24: /* png_ptr->row_info.pixel_depth */
- {
- png_bytep srcptr;
- png_bytep dstptr;
-
- {
- register png_uint_32 i;
- png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
- /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
- register int stride = BPP3 * png_pass_inc[png_ptr->pass];
- /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
- register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
- /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
- png_uint_32 len = png_ptr->width &~7; /* reduce to mult of 8 */
- int diff = (int) (png_ptr->width & 7); /* amount lost */
- register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */
-
- srcptr = png_ptr->row_buf + 1 + initial_val;
- dstptr = row + initial_val;
-
- for (i = initial_val; i < final_val; i += stride)
- {
- png_memcpy(dstptr, srcptr, rep_bytes);
- srcptr += stride;
- dstptr += stride;
- }
- if (diff) /* number of leftover pixels: 3 for pngtest */
- {
- final_val += diff*BPP3;
- for (; i < final_val; i += stride)
- {
- if (rep_bytes > (int)(final_val-i))
- rep_bytes = (int)(final_val-i);
- png_memcpy(dstptr, srcptr, rep_bytes);
- srcptr += stride;
- dstptr += stride;
- }
- }
- } /* end of else (_mmx_supported) */
-
- break;
- } /* end 24 bpp */
case 48: /* png_ptr->row_info.pixel_depth */
{
diff --git a/pngtest.c b/pngtest.c
index 71c6f337f..d5a6cce62 100644
--- a/pngtest.c
+++ b/pngtest.c
@@ -1553,4 +1553,4 @@ main(int argc, char *argv[])
}
/* Generate a compiler error if there is an old png.h in the search path. */
-typedef version_1_2_19beta18 your_png_h_is_not_version_1_2_19beta18;
+typedef version_1_2_19beta19 your_png_h_is_not_version_1_2_19beta19;
diff --git a/pngvcrd.c b/pngvcrd.c
index 4b76feae9..41b21c0f9 100644
--- a/pngvcrd.c
+++ b/pngvcrd.c
@@ -3,7 +3,7 @@
*
* For Intel x86 CPU and Microsoft Visual C++ compiler
*
- * Last changed in libpng 1.2.19 June 23, 2007
+ * Last changed in libpng 1.2.19 June 28, 2007
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1998-2007 Glenn Randers-Pehrson
* Copyright (c) 1998, Intel Corporation
@@ -136,181 +136,23 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
{
switch (png_ptr->row_info.pixel_depth)
{
- case 1:
- {
- png_bytep sp;
- png_bytep dp;
- int s_inc, s_start, s_end;
- int m;
- int shift;
- png_uint_32 i;
-
- sp = png_ptr->row_buf + 1;
- dp = row;
- m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
- if (png_ptr->transformations & PNG_PACKSWAP)
- {
- s_start = 0;
- s_end = 7;
- s_inc = 1;
- }
- else
-#endif
- {
- s_start = 7;
- s_end = 0;
- s_inc = -1;
- }
-
- shift = s_start;
-
- for (i = 0; i < png_ptr->width; i++)
- {
- if (m & mask)
- {
- int value;
-
- value = (*sp >> shift) & 0x1;
- *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
- *dp |= (png_byte)(value << shift);
- }
-
- if (shift == s_end)
- {
- shift = s_start;
- sp++;
- dp++;
- }
- else
- shift += s_inc;
-
- if (m == 1)
- m = 0x80;
- else
- m >>= 1;
- }
- break;
- }
-
- case 2:
- {
- png_bytep sp;
- png_bytep dp;
- int s_start, s_end, s_inc;
- int m;
- int shift;
- png_uint_32 i;
- int value;
-
- sp = png_ptr->row_buf + 1;
- dp = row;
- m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
- if (png_ptr->transformations & PNG_PACKSWAP)
- {
- s_start = 0;
- s_end = 6;
- s_inc = 2;
- }
- else
-#endif
- {
- s_start = 6;
- s_end = 0;
- s_inc = -2;
- }
-
- shift = s_start;
-
- for (i = 0; i < png_ptr->width; i++)
- {
- if (m & mask)
- {
- value = (*sp >> shift) & 0x3;
- *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
- *dp |= (png_byte)(value << shift);
- }
-
- if (shift == s_end)
- {
- shift = s_start;
- sp++;
- dp++;
- }
- else
- shift += s_inc;
- if (m == 1)
- m = 0x80;
- else
- m >>= 1;
- }
- break;
- }
-
- case 4:
- {
- png_bytep sp;
- png_bytep dp;
- int s_start, s_end, s_inc;
- int m;
- int shift;
- png_uint_32 i;
- int value;
-
- sp = png_ptr->row_buf + 1;
- dp = row;
- m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
- if (png_ptr->transformations & PNG_PACKSWAP)
- {
- s_start = 0;
- s_end = 4;
- s_inc = 4;
- }
- else
-#endif
- {
- s_start = 4;
- s_end = 0;
- s_inc = -4;
- }
- shift = s_start;
-
- for (i = 0; i < png_ptr->width; i++)
- {
- if (m & mask)
- {
- value = (*sp >> shift) & 0xf;
- *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
- *dp |= (png_byte)(value << shift);
- }
-
- if (shift == s_end)
- {
- shift = s_start;
- sp++;
- dp++;
- }
- else
- shift += s_inc;
- if (m == 1)
- m = 0x80;
- else
- m >>= 1;
- }
- break;
- }
-
- case 8:
+ case 24:
{
png_bytep srcptr;
png_bytep dstptr;
png_uint_32 len;
- int m;
- int diff, unmask;
+ int unmask, diff;
- __int64 mask0=0x0102040810204080;
+ __int64 mask2=0x0101010202020404, //24bpp
+ mask1=0x0408080810101020,
+ mask0=0x2020404040808080;
+
+ srcptr = png_ptr->row_buf + 1;
+ dstptr = row;
+
+ unmask = ~mask;
+ len = (png_ptr->width)&~7;
+ diff = (png_ptr->width)&7;
#if !defined(PNG_1_0_X)
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
@@ -319,66 +161,87 @@ png_combine_row(png_structp png_ptr, png_bytep row, int mask)
if (mmx_supported)
#endif
{
- srcptr = png_ptr->row_buf + 1;
- dstptr = row;
- m = 0x80;
- unmask = ~mask;
- len = png_ptr->width &~7; //reduce to multiple of 8
- diff = png_ptr->width & 7; //amount lost
-
_asm
{
- movd mm7, unmask //load bit pattern
- psubb mm6,mm6 //zero mm6
+ movd mm7, unmask //load bit pattern
+ psubb mm6,mm6 //zero mm6
punpcklbw mm7,mm7
punpcklwd mm7,mm7
- punpckldq mm7,mm7 //fill register with 8 masks
+ punpckldq mm7,mm7 //fill register with 8 masks
movq mm0,mask0
+ movq mm1,mask1
+ movq mm2,mask2
- pand mm0,mm7 //nonzero if keep byte
- pcmpeqb mm0,mm6 //zeros->1s, v versa
+ pand mm0,mm7
+ pand mm1,mm7
+ pand mm2,mm7
- mov ecx,len //load length of line (pixels)
- mov esi,srcptr //load source
- mov ebx,dstptr //load dest
- cmp ecx,0 //lcr
- je mainloop8end
+ pcmpeqb mm0,mm6
+ pcmpeqb mm1,mm6
+ pcmpeqb mm2,mm6
-mainloop8:
+ mov ecx,len //load length of line
+ mov esi,srcptr //load source
+ mov ebx,dstptr //load dest
+ cmp ecx,0
+ jz mainloop24end
+
+mainloop24:
movq mm4,[esi]
pand mm4,mm0
movq mm6,mm0
- pandn mm6,[ebx]
+ movq mm7,[ebx]
+ pandn mm6,mm7
por mm4,mm6
movq [ebx],mm4
- add esi,8 //inc by 8 bytes processed
- add ebx,8
- sub ecx,8 //dec by 8 pixels processed
- ja mainloop8
-mainloop8end:
+ movq mm5,[esi+8]
+ pand mm5,mm1
+ movq mm7,mm1
+ movq mm6,[ebx+8]
+ pandn mm7,mm6
+ por mm5,mm7
+ movq [ebx+8],mm5
+ movq mm6,[esi+16]
+ pand mm6,mm2
+ movq mm4,mm2
+ movq mm7,[ebx+16]
+ pandn mm4,mm7
+ por mm6,mm4
+ movq [ebx+16],mm6
+
+ add esi,24 //inc by 24 bytes processed
+ add ebx,24
+ sub ecx,8 //dec by 8 pixels processed
+
+ ja mainloop24
+
+mainloop24end:
mov ecx,diff
cmp ecx,0
- jz end8
+ jz end24
mov edx,mask
- sal edx,24 //make low byte the high byte
-
-secondloop8:
- sal edx,1 //move high bit to CF
- jnc skip8 //if CF = 0
- mov al,[esi]
- mov [ebx],al
-skip8:
- inc esi
- inc ebx
+ sal edx,24 //make low byte the high byte
+secondloop24:
+ sal edx,1 //move high bit to CF
+ jnc skip24 //if CF = 0
+ mov ax,[esi]
+ mov [ebx],ax
+ xor eax,eax
+ mov al,[esi+2]
+ mov [ebx+2],al
+skip24:
+ add esi,3
+ add ebx,3
dec ecx
- jnz secondloop8
-end8:
+ jnz secondloop24
+
+end24:
emms
}
}
@@ -406,16 +269,26 @@ end8:
} /* end of else */
break;
- } // end 8 bpp
+ } // end 24 bpp
- case 16:
+ case 32:
{
png_bytep srcptr;
png_bytep dstptr;
png_uint_32 len;
int unmask, diff;
- __int64 mask1=0x0101020204040808,
- mask0=0x1010202040408080;
+
+ __int64 mask3=0x0101010102020202, //32bpp
+ mask2=0x0404040408080808,
+ mask1=0x1010101020202020,
+ mask0=0x4040404080808080;
+
+ srcptr = png_ptr->row_buf + 1;
+ dstptr = row;
+
+ unmask = ~mask;
+ len = (png_ptr->width)&~7;
+ diff = (png_ptr->width)&7;
#if !defined(PNG_1_0_X)
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
@@ -424,12 +297,6 @@ end8:
if (mmx_supported)
#endif
{
- srcptr = png_ptr->row_buf + 1;
- dstptr = row;
-
- unmask = ~mask;
- len = (png_ptr->width)&~7;
- diff = (png_ptr->width)&7;
_asm
{
movd mm7, unmask //load bit pattern
@@ -440,20 +307,27 @@ end8:
movq mm0,mask0
movq mm1,mask1
+ movq mm2,mask2
+ movq mm3,mask3
pand mm0,mm7
pand mm1,mm7
+ pand mm2,mm7
+ pand mm3,mm7
pcmpeqb mm0,mm6
pcmpeqb mm1,mm6
+ pcmpeqb mm2,mm6
+ pcmpeqb mm3,mm6
mov ecx,len //load length of line
mov esi,srcptr //load source
mov ebx,dstptr //load dest
+
cmp ecx,0 //lcr
- jz mainloop16end
+ jz mainloop32end
-mainloop16:
+mainloop32:
movq mm4,[esi]
pand mm4,mm0
movq mm6,mm0
@@ -470,35 +344,52 @@ mainloop16:
por mm5,mm7
movq [ebx+8],mm5
- add esi,16 //inc by 16 bytes processed
- add ebx,16
+ movq mm6,[esi+16]
+ pand mm6,mm2
+ movq mm4,mm2
+ movq mm7,[ebx+16]
+ pandn mm4,mm7
+ por mm6,mm4
+ movq [ebx+16],mm6
+
+ movq mm7,[esi+24]
+ pand mm7,mm3
+ movq mm5,mm3
+ movq mm4,[ebx+24]
+ pandn mm5,mm4
+ por mm7,mm5
+ movq [ebx+24],mm7
+
+ add esi,32 //inc by 32 bytes processed
+ add ebx,32
sub ecx,8 //dec by 8 pixels processed
- ja mainloop16
+ ja mainloop32
-mainloop16end:
+mainloop32end:
mov ecx,diff
cmp ecx,0
- jz end16
+ jz end32
mov edx,mask
sal edx,24 //make low byte the high byte
-secondloop16:
+secondloop32:
sal edx,1 //move high bit to CF
- jnc skip16 //if CF = 0
- mov ax,[esi]
- mov [ebx],ax
-skip16:
- add esi,2
- add ebx,2
+ jnc skip32 //if CF = 0
+ mov eax,[esi]
+ mov [ebx],eax
+skip32:
+ add esi,4
+ add ebx,4
dec ecx
- jnz secondloop16
-end16:
+ jnz secondloop32
+
+end32:
emms
}
}
- else /* mmx not supported - use modified C routine */
+ else /* mmx _not supported - Use modified C routine */
{
register unsigned int incr1, initial_val, final_val;
png_size_t pixel_bytes;
@@ -522,25 +413,17 @@ end16:
} /* end of else */
break;
- } // end 16 bpp
+ } // end 32 bpp
- case 24:
+ case 8:
{
png_bytep srcptr;
png_bytep dstptr;
png_uint_32 len;
- int unmask, diff;
-
- __int64 mask2=0x0101010202020404, //24bpp
- mask1=0x0408080810101020,
- mask0=0x2020404040808080;
-
- srcptr = png_ptr->row_buf + 1;
- dstptr = row;
+ int m;
+ int diff, unmask;
- unmask = ~mask;
- len = (png_ptr->width)&~7;
- diff = (png_ptr->width)&7;
+ __int64 mask0=0x0102040810204080;
#if !defined(PNG_1_0_X)
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
@@ -549,87 +432,66 @@ end16:
if (mmx_supported)
#endif
{
+ srcptr = png_ptr->row_buf + 1;
+ dstptr = row;
+ m = 0x80;
+ unmask = ~mask;
+ len = png_ptr->width &~7; //reduce to multiple of 8
+ diff = png_ptr->width & 7; //amount lost
+
_asm
{
- movd mm7, unmask //load bit pattern
- psubb mm6,mm6 //zero mm6
+ movd mm7, unmask //load bit pattern
+ psubb mm6,mm6 //zero mm6
punpcklbw mm7,mm7
punpcklwd mm7,mm7
- punpckldq mm7,mm7 //fill register with 8 masks
+ punpckldq mm7,mm7 //fill register with 8 masks
movq mm0,mask0
- movq mm1,mask1
- movq mm2,mask2
-
- pand mm0,mm7
- pand mm1,mm7
- pand mm2,mm7
- pcmpeqb mm0,mm6
- pcmpeqb mm1,mm6
- pcmpeqb mm2,mm6
+ pand mm0,mm7 //nonzero if keep byte
+ pcmpeqb mm0,mm6 //zeros->1s, v versa
- mov ecx,len //load length of line
- mov esi,srcptr //load source
- mov ebx,dstptr //load dest
- cmp ecx,0
- jz mainloop24end
+ mov ecx,len //load length of line (pixels)
+ mov esi,srcptr //load source
+ mov ebx,dstptr //load dest
+ cmp ecx,0 //lcr
+ je mainloop8end
-mainloop24:
+mainloop8:
movq mm4,[esi]
pand mm4,mm0
movq mm6,mm0
- movq mm7,[ebx]
- pandn mm6,mm7
+ pandn mm6,[ebx]
por mm4,mm6
movq [ebx],mm4
+ add esi,8 //inc by 8 bytes processed
+ add ebx,8
+ sub ecx,8 //dec by 8 pixels processed
- movq mm5,[esi+8]
- pand mm5,mm1
- movq mm7,mm1
- movq mm6,[ebx+8]
- pandn mm7,mm6
- por mm5,mm7
- movq [ebx+8],mm5
-
- movq mm6,[esi+16]
- pand mm6,mm2
- movq mm4,mm2
- movq mm7,[ebx+16]
- pandn mm4,mm7
- por mm6,mm4
- movq [ebx+16],mm6
-
- add esi,24 //inc by 24 bytes processed
- add ebx,24
- sub ecx,8 //dec by 8 pixels processed
-
- ja mainloop24
+ ja mainloop8
+mainloop8end:
-mainloop24end:
mov ecx,diff
cmp ecx,0
- jz end24
+ jz end8
mov edx,mask
- sal edx,24 //make low byte the high byte
-secondloop24:
- sal edx,1 //move high bit to CF
- jnc skip24 //if CF = 0
- mov ax,[esi]
- mov [ebx],ax
- xor eax,eax
- mov al,[esi+2]
- mov [ebx+2],al
-skip24:
- add esi,3
- add ebx,3
+ sal edx,24 //make low byte the high byte
- dec ecx
- jnz secondloop24
+secondloop8:
+ sal edx,1 //move high bit to CF
+ jnc skip8 //if CF = 0
+ mov al,[esi]
+ mov [ebx],al
+skip8:
+ inc esi
+ inc ebx
-end24:
+ dec ecx
+ jnz secondloop8
+end8:
emms
}
}
@@ -657,26 +519,182 @@ end24:
} /* end of else */
break;
- } // end 24 bpp
+ } // end 8 bpp
- case 32:
+ case 1:
+ {
+ png_bytep sp;
+ png_bytep dp;
+ int s_inc, s_start, s_end;
+ int m;
+ int shift;
+ png_uint_32 i;
+
+ sp = png_ptr->row_buf + 1;
+ dp = row;
+ m = 0x80;
+#if defined(PNG_READ_PACKSWAP_SUPPORTED)
+ if (png_ptr->transformations & PNG_PACKSWAP)
+ {
+ s_start = 0;
+ s_end = 7;
+ s_inc = 1;
+ }
+ else
+#endif
+ {
+ s_start = 7;
+ s_end = 0;
+ s_inc = -1;
+ }
+
+ shift = s_start;
+
+ for (i = 0; i < png_ptr->width; i++)
+ {
+ if (m & mask)
+ {
+ int value;
+
+ value = (*sp >> shift) & 0x1;
+ *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
+ *dp |= (png_byte)(value << shift);
+ }
+
+ if (shift == s_end)
+ {
+ shift = s_start;
+ sp++;
+ dp++;
+ }
+ else
+ shift += s_inc;
+
+ if (m == 1)
+ m = 0x80;
+ else
+ m >>= 1;
+ }
+ break;
+ }
+
+ case 2:
+ {
+ png_bytep sp;
+ png_bytep dp;
+ int s_start, s_end, s_inc;
+ int m;
+ int shift;
+ png_uint_32 i;
+ int value;
+
+ sp = png_ptr->row_buf + 1;
+ dp = row;
+ m = 0x80;
+#if defined(PNG_READ_PACKSWAP_SUPPORTED)
+ if (png_ptr->transformations & PNG_PACKSWAP)
+ {
+ s_start = 0;
+ s_end = 6;
+ s_inc = 2;
+ }
+ else
+#endif
+ {
+ s_start = 6;
+ s_end = 0;
+ s_inc = -2;
+ }
+
+ shift = s_start;
+
+ for (i = 0; i < png_ptr->width; i++)
+ {
+ if (m & mask)
+ {
+ value = (*sp >> shift) & 0x3;
+ *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
+ *dp |= (png_byte)(value << shift);
+ }
+
+ if (shift == s_end)
+ {
+ shift = s_start;
+ sp++;
+ dp++;
+ }
+ else
+ shift += s_inc;
+ if (m == 1)
+ m = 0x80;
+ else
+ m >>= 1;
+ }
+ break;
+ }
+
+ case 4:
+ {
+ png_bytep sp;
+ png_bytep dp;
+ int s_start, s_end, s_inc;
+ int m;
+ int shift;
+ png_uint_32 i;
+ int value;
+
+ sp = png_ptr->row_buf + 1;
+ dp = row;
+ m = 0x80;
+#if defined(PNG_READ_PACKSWAP_SUPPORTED)
+ if (png_ptr->transformations & PNG_PACKSWAP)
+ {
+ s_start = 0;
+ s_end = 4;
+ s_inc = 4;
+ }
+ else
+#endif
+ {
+ s_start = 4;
+ s_end = 0;
+ s_inc = -4;
+ }
+ shift = s_start;
+
+ for (i = 0; i < png_ptr->width; i++)
+ {
+ if (m & mask)
+ {
+ value = (*sp >> shift) & 0xf;
+ *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
+ *dp |= (png_byte)(value << shift);
+ }
+
+ if (shift == s_end)
+ {
+ shift = s_start;
+ sp++;
+ dp++;
+ }
+ else
+ shift += s_inc;
+ if (m == 1)
+ m = 0x80;
+ else
+ m >>= 1;
+ }
+ break;
+ }
+
+ case 16:
{
png_bytep srcptr;
png_bytep dstptr;
png_uint_32 len;
int unmask, diff;
-
- __int64 mask3=0x0101010102020202, //32bpp
- mask2=0x0404040408080808,
- mask1=0x1010101020202020,
- mask0=0x4040404080808080;
-
- srcptr = png_ptr->row_buf + 1;
- dstptr = row;
-
- unmask = ~mask;
- len = (png_ptr->width)&~7;
- diff = (png_ptr->width)&7;
+ __int64 mask1=0x0101020204040808,
+ mask0=0x1010202040408080;
#if !defined(PNG_1_0_X)
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
@@ -685,6 +703,12 @@ end24:
if (mmx_supported)
#endif
{
+ srcptr = png_ptr->row_buf + 1;
+ dstptr = row;
+
+ unmask = ~mask;
+ len = (png_ptr->width)&~7;
+ diff = (png_ptr->width)&7;
_asm
{
movd mm7, unmask //load bit pattern
@@ -695,27 +719,20 @@ end24:
movq mm0,mask0
movq mm1,mask1
- movq mm2,mask2
- movq mm3,mask3
pand mm0,mm7
pand mm1,mm7
- pand mm2,mm7
- pand mm3,mm7
pcmpeqb mm0,mm6
pcmpeqb mm1,mm6
- pcmpeqb mm2,mm6
- pcmpeqb mm3,mm6
mov ecx,len //load length of line
mov esi,srcptr //load source
mov ebx,dstptr //load dest
-
cmp ecx,0 //lcr
- jz mainloop32end
+ jz mainloop16end
-mainloop32:
+mainloop16:
movq mm4,[esi]
pand mm4,mm0
movq mm6,mm0
@@ -732,52 +749,35 @@ mainloop32:
por mm5,mm7
movq [ebx+8],mm5
- movq mm6,[esi+16]
- pand mm6,mm2
- movq mm4,mm2
- movq mm7,[ebx+16]
- pandn mm4,mm7
- por mm6,mm4
- movq [ebx+16],mm6
-
- movq mm7,[esi+24]
- pand mm7,mm3
- movq mm5,mm3
- movq mm4,[ebx+24]
- pandn mm5,mm4
- por mm7,mm5
- movq [ebx+24],mm7
-
- add esi,32 //inc by 32 bytes processed
- add ebx,32
+ add esi,16 //inc by 16 bytes processed
+ add ebx,16
sub ecx,8 //dec by 8 pixels processed
- ja mainloop32
+ ja mainloop16
-mainloop32end:
+mainloop16end:
mov ecx,diff
cmp ecx,0
- jz end32
+ jz end16
mov edx,mask
sal edx,24 //make low byte the high byte
-secondloop32:
+secondloop16:
sal edx,1 //move high bit to CF
- jnc skip32 //if CF = 0
- mov eax,[esi]
- mov [ebx],eax
-skip32:
- add esi,4
- add ebx,4
+ jnc skip16 //if CF = 0
+ mov ax,[esi]
+ mov [ebx],ax
+skip16:
+ add esi,2
+ add ebx,2
dec ecx
- jnz secondloop32
-
-end32:
+ jnz secondloop16
+end16:
emms
}
}
- else /* mmx _not supported - Use modified C routine */
+ else /* mmx not supported - use modified C routine */
{
register unsigned int incr1, initial_val, final_val;
png_size_t pixel_bytes;
@@ -801,7 +801,7 @@ end32:
} /* end of else */
break;
- } // end 32 bpp
+ } // end 16 bpp
case 48:
{
@@ -1225,70 +1225,7 @@ png_do_read_interlace(png_structp png_ptr)
{
if (pixel_bytes == 3)
{
- if (((pass == 0) || (pass == 1)) && width)
- {
- _asm
- {
- mov esi, sptr
- mov edi, dp
- mov ecx, width
- sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes
-loop_pass0:
- movd mm0, [esi] ; X X X X X v2 v1 v0
- pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
- movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
- psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
- movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
- psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
- psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
- por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
- por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
- movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
- psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
- movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
- punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
- movq [edi+16] , mm4
- psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
- movq [edi+8] , mm3
- punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
- sub esi, 3
- movq [edi], mm0
- sub edi, 24
- //sub esi, 3
- dec ecx
- jnz loop_pass0
- EMMS
- }
- }
- else if (((pass == 2) || (pass == 3)) && width)
- {
- _asm
- {
- mov esi, sptr
- mov edi, dp
- mov ecx, width
- sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes
-loop_pass2:
- movd mm0, [esi] ; X X X X X v2 v1 v0
- pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
- movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
- psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
- movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
- psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
- psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
- por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
- por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
- movq [edi+4], mm0 ; move to memory
- psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
- movd [edi], mm0 ; move to memory
- sub esi, 3
- sub edi, 12
- dec ecx
- jnz loop_pass2
- EMMS
- }
- }
- else if (width) /* && ((pass == 4) || (pass == 5)) */
+ if (((pass == 4) || (pass == 5)) && width)
{
int width_mmx = ((width >> 1) << 1) - 8;
if (width_mmx < 0)
@@ -1342,13 +1279,76 @@ loop_pass4:
sptr -= 3;
}
}
+ else if (((pass == 2) || (pass == 3)) && width)
+ {
+ _asm
+ {
+ mov esi, sptr
+ mov edi, dp
+ mov ecx, width
+ sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes
+loop_pass2:
+ movd mm0, [esi] ; X X X X X v2 v1 v0
+ pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
+ movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
+ psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
+ movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
+ psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
+ psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
+ por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
+ por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
+ movq [edi+4], mm0 ; move to memory
+ psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
+ movd [edi], mm0 ; move to memory
+ sub esi, 3
+ sub edi, 12
+ dec ecx
+ jnz loop_pass2
+ EMMS
+ }
+ }
+ else if (width) /* && ((pass == 0) || (pass == 1))) */
+ {
+ _asm
+ {
+ mov esi, sptr
+ mov edi, dp
+ mov ecx, width
+ sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes
+loop_pass0:
+ movd mm0, [esi] ; X X X X X v2 v1 v0
+ pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
+ movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
+ psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
+ movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
+ psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
+ psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
+ por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
+ por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
+ movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
+ psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
+ movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
+ punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
+ movq [edi+16] , mm4
+ psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
+ movq [edi+8] , mm3
+ punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
+ sub esi, 3
+ movq [edi], mm0
+ sub edi, 24
+ //sub esi, 3
+ dec ecx
+ jnz loop_pass0
+ EMMS
+ }
+ }
} /* end of pixel_bytes == 3 */
else if (pixel_bytes == 1)
{
- if (((pass == 0) || (pass == 1)) && width)
+ if (((pass == 4) || (pass == 5)) && width)
{
- int width_mmx = ((width >> 2) << 2);
+ int width_mmx = ((width >> 3) << 3);
width -= width_mmx;
if (width_mmx)
{
@@ -1357,60 +1357,36 @@ loop_pass4:
mov esi, sptr
mov edi, dp
mov ecx, width_mmx
- sub edi, 31
- sub esi, 3
-loop1_pass0:
- movd mm0, [esi] ; X X X X v0 v1 v2 v3
- movq mm1, mm0 ; X X X X v0 v1 v2 v3
- punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
- movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
- punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
- movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
- punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
- punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
- movq [edi], mm0 ; move to memory v3
- punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
- movq [edi+8], mm3 ; move to memory v2
- movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
- punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
- punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
- movq [edi+16], mm2 ; move to memory v1
- movq [edi+24], mm4 ; move to memory v0
- sub esi, 4
- sub edi, 32
- sub ecx, 4
- jnz loop1_pass0
+ sub edi, 15
+ sub esi, 7
+loop1_pass4:
+ movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
+ movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
+ punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
+ //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
+ punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
+ movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
+ sub esi, 8
+ movq [edi], mm0 ; move to memory v4 v5 v6 and v7
+ //sub esi, 4
+ sub edi, 16
+ sub ecx, 8
+ jnz loop1_pass4
EMMS
}
}
sptr -= width_mmx;
- dp -= width_mmx*8;
+ dp -= width_mmx*2;
for (i = width; i; i--)
{
int j;
- /* I simplified this part in version 1.0.4e
- * here and in several other instances where
- * pixel_bytes == 1 -- GR-P
- *
- * Original code:
- *
- * png_byte v[8];
- * png_memcpy(v, sptr, pixel_bytes);
- * for (j = 0; j < png_pass_inc[pass]; j++)
- * {
- * png_memcpy(dp, v, pixel_bytes);
- * dp -= pixel_bytes;
- * }
- * sptr -= pixel_bytes;
- *
- * Replacement code is in the next three lines:
- */
-
for (j = 0; j < png_pass_inc[pass]; j++)
+ {
*dp-- = *sptr;
- sptr--;
+ }
+ sptr --;
}
}
else if (((pass == 2) || (pass == 3)) && width)
@@ -1455,9 +1431,9 @@ loop1_pass2:
sptr --;
}
}
- else if (width) /* && ((pass == 4) || (pass == 5))) */
+ else if (width) /* && ((pass == 0) || (pass == 1))) */
{
- int width_mmx = ((width >> 3) << 3);
+ int width_mmx = ((width >> 2) << 2);
width -= width_mmx;
if (width_mmx)
{
@@ -1466,45 +1442,69 @@ loop1_pass2:
mov esi, sptr
mov edi, dp
mov ecx, width_mmx
- sub edi, 15
- sub esi, 7
-loop1_pass4:
- movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
- movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
- punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
- //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
- punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
- movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
- sub esi, 8
- movq [edi], mm0 ; move to memory v4 v5 v6 and v7
- //sub esi, 4
- sub edi, 16
- sub ecx, 8
- jnz loop1_pass4
+ sub edi, 31
+ sub esi, 3
+loop1_pass0:
+ movd mm0, [esi] ; X X X X v0 v1 v2 v3
+ movq mm1, mm0 ; X X X X v0 v1 v2 v3
+ punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
+ movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
+ punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
+ movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
+ punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
+ punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
+ movq [edi], mm0 ; move to memory v3
+ punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
+ movq [edi+8], mm3 ; move to memory v2
+ movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
+ punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
+ punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
+ movq [edi+16], mm2 ; move to memory v1
+ movq [edi+24], mm4 ; move to memory v0
+ sub esi, 4
+ sub edi, 32
+ sub ecx, 4
+ jnz loop1_pass0
EMMS
}
}
sptr -= width_mmx;
- dp -= width_mmx*2;
+ dp -= width_mmx*8;
for (i = width; i; i--)
{
int j;
+ /* I simplified this part in version 1.0.4e
+ * here and in several other instances where
+ * pixel_bytes == 1 -- GR-P
+ *
+ * Original code:
+ *
+ * png_byte v[8];
+ * png_memcpy(v, sptr, pixel_bytes);
+ * for (j = 0; j < png_pass_inc[pass]; j++)
+ * {
+ * png_memcpy(dp, v, pixel_bytes);
+ * dp -= pixel_bytes;
+ * }
+ * sptr -= pixel_bytes;
+ *
+ * Replacement code is in the next three lines:
+ */
+
for (j = 0; j < png_pass_inc[pass]; j++)
- {
*dp-- = *sptr;
- }
- sptr --;
+ sptr--;
}
}
} /* end of pixel_bytes == 1 */
else if (pixel_bytes == 2)
{
- if (((pass == 0) || (pass == 1)) && width)
+ if (((pass == 4) || (pass == 5)) && width)
{
- int width_mmx = ((width >> 1) << 1);
+ int width_mmx = ((width >> 1) << 1) ;
width -= width_mmx;
if (width_mmx)
{
@@ -1514,27 +1514,21 @@ loop1_pass4:
mov edi, dp
mov ecx, width_mmx
sub esi, 2
- sub edi, 30
-loop2_pass0:
+ sub edi, 6
+loop2_pass4:
movd mm0, [esi] ; X X X X v1 v0 v3 v2
punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
- movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
- punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
- punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
- movq [edi], mm0
- movq [edi + 8], mm0
- movq [edi + 16], mm1
- movq [edi + 24], mm1
sub esi, 4
- sub edi, 32
+ movq [edi], mm0
+ sub edi, 8
sub ecx, 2
- jnz loop2_pass0
+ jnz loop2_pass4
EMMS
}
}
sptr -= (width_mmx*2 - 2); // sign fixed
- dp -= (width_mmx*16 - 2); // sign fixed
+ dp -= (width_mmx*4 - 2); // sign fixed
for (i = width; i; i--)
{
png_byte v[8];
@@ -1593,9 +1587,9 @@ loop2_pass2:
}
}
}
- else if (width) // pass == 4 or 5
+ else if (width) /* && ((pass == 0) || (pass == 1))) */
{
- int width_mmx = ((width >> 1) << 1) ;
+ int width_mmx = ((width >> 1) << 1);
width -= width_mmx;
if (width_mmx)
{
@@ -1605,21 +1599,27 @@ loop2_pass2:
mov edi, dp
mov ecx, width_mmx
sub esi, 2
- sub edi, 6
-loop2_pass4:
+ sub edi, 30
+loop2_pass0:
movd mm0, [esi] ; X X X X v1 v0 v3 v2
punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
- sub esi, 4
+ movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
+ punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
+ punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
movq [edi], mm0
- sub edi, 8
+ movq [edi + 8], mm0
+ movq [edi + 16], mm1
+ movq [edi + 24], mm1
+ sub esi, 4
+ sub edi, 32
sub ecx, 2
- jnz loop2_pass4
+ jnz loop2_pass0
EMMS
}
}
sptr -= (width_mmx*2 - 2); // sign fixed
- dp -= (width_mmx*4 - 2); // sign fixed
+ dp -= (width_mmx*16 - 2); // sign fixed
for (i = width; i; i--)
{
png_byte v[8];
@@ -1637,7 +1637,7 @@ loop2_pass4:
else if (pixel_bytes == 4)
{
- if (((pass == 0) || (pass == 1)) && width)
+ if (((pass == 4) || (pass == 5)) && width)
{
int width_mmx = ((width >> 1) << 1) ;
width -= width_mmx;
@@ -1649,30 +1649,24 @@ loop2_pass4:
mov edi, dp
mov ecx, width_mmx
sub esi, 4
- sub edi, 60
-loop4_pass0:
- movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
- movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
- punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
- punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
+ sub edi, 12
+loop4_pass4:
+ movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
+ movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
+ punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
+ punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
movq [edi], mm0
- movq [edi + 8], mm0
- movq [edi + 16], mm0
- movq [edi + 24], mm0
- movq [edi+32], mm1
- movq [edi + 40], mm1
- movq [edi+ 48], mm1
sub esi, 8
- movq [edi + 56], mm1
- sub edi, 64
+ movq [edi + 8], mm1
+ sub edi, 16
sub ecx, 2
- jnz loop4_pass0
+ jnz loop4_pass4
EMMS
}
}
- sptr -= (width_mmx*4 - 4); // sign fixed
- dp -= (width_mmx*32 - 4); // sign fixed
+ sptr -= (width_mmx*4 - 4); // sign fixed
+ dp -= (width_mmx*8 - 4); // sign fixed
for (i = width; i; i--)
{
png_byte v[8];
@@ -1731,7 +1725,7 @@ loop4_pass2:
}
}
}
- else if (width) // pass == 4 or 5
+ else if (width) /* && ((pass == 0) || (pass == 1))) */
{
int width_mmx = ((width >> 1) << 1) ;
width -= width_mmx;
@@ -1743,24 +1737,30 @@ loop4_pass2:
mov edi, dp
mov ecx, width_mmx
sub esi, 4
- sub edi, 12
-loop4_pass4:
- movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
- movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
- punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
- punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
+ sub edi, 60
+loop4_pass0:
+ movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
+ movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
+ punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
+ punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
movq [edi], mm0
+ movq [edi + 8], mm0
+ movq [edi + 16], mm0
+ movq [edi + 24], mm0
+ movq [edi+32], mm1
+ movq [edi + 40], mm1
+ movq [edi+ 48], mm1
sub esi, 8
- movq [edi + 8], mm1
- sub edi, 16
+ movq [edi + 56], mm1
+ sub edi, 64
sub ecx, 2
- jnz loop4_pass4
+ jnz loop4_pass0
EMMS
}
}
- sptr -= (width_mmx*4 - 4); // sign fixed
- dp -= (width_mmx*8 - 4); // sign fixed
+ sptr -= (width_mmx*4 - 4); // sign fixed
+ dp -= (width_mmx*32 - 4); // sign fixed
for (i = width; i; i--)
{
png_byte v[8];
diff --git a/pngwutil.c b/pngwutil.c
index 2d56f7faf..c40bfee41 100644
--- a/pngwutil.c
+++ b/pngwutil.c
@@ -1,7 +1,7 @@
/* pngwutil.c - utilities to write a PNG file
*
- * Last changed in libpng 1.2.19 June 23, 2007
+ * Last changed in libpng 1.2.19 June 28, 2007
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1998-2007 Glenn Randers-Pehrson
* (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt
index ca266a5f8..751087c95 100644
--- a/scripts/CMakeLists.txt
+++ b/scripts/CMakeLists.txt
@@ -169,7 +169,7 @@ configure_file(${PNG_SOURCE_DIR}/scripts/libpng-config.in
# SET UP LINKS
set_target_properties(${PNG_LIB_NAME} PROPERTIES
-# VERSION 0.${PNGLIB_RELEASE}.1.2.19beta18
+# VERSION 0.${PNGLIB_RELEASE}.1.2.19beta19
VERSION 0.${PNGLIB_RELEASE}.0
SOVERSION 0
CLEAN_DIRECT_OUTPUT 1)
diff --git a/scripts/libpng-config-head.in b/scripts/libpng-config-head.in
index 0657d73b4..d47abc876 100755
--- a/scripts/libpng-config-head.in
+++ b/scripts/libpng-config-head.in
@@ -8,7 +8,7 @@
# Modeled after libxml-config.
-version=1.2.19beta18
+version=1.2.19beta19
prefix=""
libdir=""
libs=""
diff --git a/scripts/libpng.pc-configure.in b/scripts/libpng.pc-configure.in
index beb569178..75ffc306b 100644
--- a/scripts/libpng.pc-configure.in
+++ b/scripts/libpng.pc-configure.in
@@ -5,6 +5,6 @@ includedir=@includedir@/libpng12
Name: libpng
Description: Loads and saves PNG files
-Version: 1.2.19beta18
+Version: 1.2.19beta19
Libs: -L${libdir} -lpng12
Cflags: -I${includedir} @LIBPNG_NO_MMX@
diff --git a/scripts/libpng.pc.in b/scripts/libpng.pc.in
index dd6fc050e..d5670fbca 100644
--- a/scripts/libpng.pc.in
+++ b/scripts/libpng.pc.in
@@ -5,6 +5,6 @@ includedir=@includedir@/libpng12
Name: libpng
Description: Loads and saves PNG files
-Version: 1.2.19beta18
+Version: 1.2.19beta19
Libs: -L${libdir} -lpng12
Cflags: -I${includedir}
diff --git a/scripts/makefile.32sunu b/scripts/makefile.32sunu
index 801eaeb92..9b9adadc9 100644
--- a/scripts/makefile.32sunu
+++ b/scripts/makefile.32sunu
@@ -8,7 +8,7 @@
# Library name:
LIBNAME=libpng12
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
# Shared library names:
diff --git a/scripts/makefile.64sunu b/scripts/makefile.64sunu
index f30d886b5..9218c09cc 100644
--- a/scripts/makefile.64sunu
+++ b/scripts/makefile.64sunu
@@ -8,7 +8,7 @@
# Library name:
LIBNAME=libpng12
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
# Shared library names:
diff --git a/scripts/makefile.aix b/scripts/makefile.aix
index 7b85ddd13..453605629 100644
--- a/scripts/makefile.aix
+++ b/scripts/makefile.aix
@@ -20,7 +20,7 @@ LN_SF = ln -f -s
LIBNAME=libpng12
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
prefix=/usr/local
diff --git a/scripts/makefile.beos b/scripts/makefile.beos
index fa11ad70d..7bcb37161 100644
--- a/scripts/makefile.beos
+++ b/scripts/makefile.beos
@@ -8,7 +8,7 @@
# Library name:
LIBNAME=libpng12
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
# Shared library names:
diff --git a/scripts/makefile.cygwin b/scripts/makefile.cygwin
index 52730d25b..96bc65ec7 100644
--- a/scripts/makefile.cygwin
+++ b/scripts/makefile.cygwin
@@ -77,7 +77,7 @@ CFLAGS= $(strip $(MINGW_CCFLAGS) $(addprefix -I,$(ZLIBINC)) \
LIBNAME = libpng12
PNGMAJ = 0
CYGDLL = 12
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
SHAREDLIB=cygpng$(CYGDLL).dll
diff --git a/scripts/makefile.darwin b/scripts/makefile.darwin
index 9be157472..a66d2248a 100644
--- a/scripts/makefile.darwin
+++ b/scripts/makefile.darwin
@@ -19,7 +19,7 @@ ZLIBINC=../zlib
# Library name:
LIBNAME = libpng12
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
# Shared library names:
diff --git a/scripts/makefile.dec b/scripts/makefile.dec
index b35f463a1..102d8c62a 100644
--- a/scripts/makefile.dec
+++ b/scripts/makefile.dec
@@ -5,7 +5,7 @@
# Library name:
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
LIBNAME = libpng12
diff --git a/scripts/makefile.elf b/scripts/makefile.elf
index 0ea51448d..495e29e15 100644
--- a/scripts/makefile.elf
+++ b/scripts/makefile.elf
@@ -12,7 +12,7 @@
# Library name:
LIBNAME = libpng12
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
# Shared library names:
diff --git a/scripts/makefile.gcmmx b/scripts/makefile.gcmmx
index 76e7b6623..6a50b2d0c 100644
--- a/scripts/makefile.gcmmx
+++ b/scripts/makefile.gcmmx
@@ -16,7 +16,7 @@
# Library name:
LIBNAME = libpng12
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
# Shared library names:
diff --git a/scripts/makefile.hp64 b/scripts/makefile.hp64
index b16403201..53dd7d107 100644
--- a/scripts/makefile.hp64
+++ b/scripts/makefile.hp64
@@ -18,7 +18,7 @@ ZLIBINC=/opt/zlib/include
# Library name:
LIBNAME = libpng12
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
# Shared library names:
diff --git a/scripts/makefile.hpgcc b/scripts/makefile.hpgcc
index 6ce77f99d..2264c24fc 100644
--- a/scripts/makefile.hpgcc
+++ b/scripts/makefile.hpgcc
@@ -8,7 +8,7 @@
# Library name:
LIBNAME = libpng12
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
# Shared library names:
diff --git a/scripts/makefile.hpux b/scripts/makefile.hpux
index 2a2f6068c..96937cadf 100644
--- a/scripts/makefile.hpux
+++ b/scripts/makefile.hpux
@@ -18,7 +18,7 @@ ZLIBINC=/opt/zlib/include
# Library name:
LIBNAME = libpng12
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
# Shared library names:
diff --git a/scripts/makefile.linux b/scripts/makefile.linux
index f830a7fd0..659b8af86 100644
--- a/scripts/makefile.linux
+++ b/scripts/makefile.linux
@@ -6,7 +6,7 @@
# Library name:
LIBNAME = libpng12
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
# Shared library names:
diff --git a/scripts/makefile.mingw b/scripts/makefile.mingw
index b7d53dd3b..9121cc074 100644
--- a/scripts/makefile.mingw
+++ b/scripts/makefile.mingw
@@ -78,7 +78,7 @@ CFLAGS= $(strip $(MINGW_CCFLAGS) $(addprefix -I,$(ZLIBINC)) \
LIBNAME = libpng12
PNGMAJ = 0
MINGDLL = 12
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
SHAREDLIB=libpng$(MINGDLL).dll
diff --git a/scripts/makefile.ne12bsd b/scripts/makefile.ne12bsd
index 7a52528c8..e27ef5fb2 100644
--- a/scripts/makefile.ne12bsd
+++ b/scripts/makefile.ne12bsd
@@ -14,7 +14,7 @@ INCSDIR=${LOCALBASE}/include/libpng12
LIB= png12
SHLIB_MAJOR= 0
-SHLIB_MINOR= 1.2.19beta18
+SHLIB_MINOR= 1.2.19beta19
SRCS= pnggccrd.c png.c pngset.c pngget.c pngrutil.c pngtrans.c pngwutil.c \
pngread.c pngrio.c pngwio.c pngwrite.c pngrtran.c \
pngwtran.c pngmem.c pngerror.c pngpread.c
diff --git a/scripts/makefile.netbsd b/scripts/makefile.netbsd
index e865035b1..375c7e2d3 100644
--- a/scripts/makefile.netbsd
+++ b/scripts/makefile.netbsd
@@ -14,7 +14,7 @@ INCSDIR=${LOCALBASE}/include/libpng
LIB= png
SHLIB_MAJOR= 3
-SHLIB_MINOR= 1.2.19beta18
+SHLIB_MINOR= 1.2.19beta19
SRCS= pnggccrd.c png.c pngset.c pngget.c pngrutil.c pngtrans.c pngwutil.c \
pngread.c pngrio.c pngwio.c pngwrite.c pngrtran.c \
pngwtran.c pngmem.c pngerror.c pngpread.c
diff --git a/scripts/makefile.nommx b/scripts/makefile.nommx
index 97e6526ce..456270818 100644
--- a/scripts/makefile.nommx
+++ b/scripts/makefile.nommx
@@ -7,7 +7,7 @@
# Library name:
LIBNAME = libpng12
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
# Shared library names:
diff --git a/scripts/makefile.openbsd b/scripts/makefile.openbsd
index f0a19d38e..e6f4d99b6 100644
--- a/scripts/makefile.openbsd
+++ b/scripts/makefile.openbsd
@@ -8,7 +8,7 @@ LIBDIR= ${PREFIX}/lib
MANDIR= ${PREFIX}/man/cat
SHLIB_MAJOR= 0
-SHLIB_MINOR= 1.2.19beta18
+SHLIB_MINOR= 1.2.19beta19
LIB= png
SRCS= png.c pngerror.c pnggccrd.c pngget.c pngmem.c pngpread.c \
diff --git a/scripts/makefile.sco b/scripts/makefile.sco
index ff509c0c1..3db172ccb 100644
--- a/scripts/makefile.sco
+++ b/scripts/makefile.sco
@@ -9,7 +9,7 @@
# Library name:
LIBNAME = libpng12
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
# Shared library names:
diff --git a/scripts/makefile.sggcc b/scripts/makefile.sggcc
index 7cff3fc5d..38679a8a6 100644
--- a/scripts/makefile.sggcc
+++ b/scripts/makefile.sggcc
@@ -6,7 +6,7 @@
# Library name:
LIBNAME=libpng12
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
# Shared library names:
diff --git a/scripts/makefile.sgi b/scripts/makefile.sgi
index d8e6ce8a3..bd45bb3a5 100644
--- a/scripts/makefile.sgi
+++ b/scripts/makefile.sgi
@@ -6,7 +6,7 @@
# Library name:
LIBNAME=libpng12
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
# Shared library names:
diff --git a/scripts/makefile.so9 b/scripts/makefile.so9
index a2ef5535e..643290d73 100644
--- a/scripts/makefile.so9
+++ b/scripts/makefile.so9
@@ -8,7 +8,7 @@
# Library name:
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
LIBNAME = libpng12
diff --git a/scripts/makefile.solaris b/scripts/makefile.solaris
index 344bd05cb..44978f26e 100644
--- a/scripts/makefile.solaris
+++ b/scripts/makefile.solaris
@@ -8,7 +8,7 @@
# Library name:
LIBNAME = libpng12
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
# Shared library names:
diff --git a/scripts/makefile.solaris-x86 b/scripts/makefile.solaris-x86
index 41b23ab46..3131fdec3 100644
--- a/scripts/makefile.solaris-x86
+++ b/scripts/makefile.solaris-x86
@@ -8,7 +8,7 @@
# Library name:
LIBNAME = libpng12
PNGMAJ = 0
-PNGMIN = 1.2.19beta18
+PNGMIN = 1.2.19beta19
PNGVER = $(PNGMAJ).$(PNGMIN)
# Shared library names:
diff --git a/scripts/pngos2.def b/scripts/pngos2.def
index acf9fe6cc..2098ce2d8 100644
--- a/scripts/pngos2.def
+++ b/scripts/pngos2.def
@@ -2,7 +2,7 @@
; PNG.LIB module definition file for OS/2
;----------------------------------------
-; Version 1.2.19beta18
+; Version 1.2.19beta19
LIBRARY PNG
DESCRIPTION "PNG image compression library for OS/2"
diff --git a/scripts/pngw32.def b/scripts/pngw32.def
index e186f6338..3ba50e92b 100644
--- a/scripts/pngw32.def
+++ b/scripts/pngw32.def
@@ -5,7 +5,7 @@
LIBRARY
EXPORTS
-;Version 1.2.19beta18
+;Version 1.2.19beta19
png_build_grayscale_palette @1
png_check_sig @2
png_chunk_error @3