summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGlenn Randers-Pehrson <glennrp at users.sourceforge.net>1999-10-01 14:22:25 -0500
committerGlenn Randers-Pehrson <glennrp at users.sourceforge.net>2009-04-06 16:04:36 -0500
commitbcfd15d9f2142d1e5d2a0e21a643ba9e70e953eb (patch)
tree5f9dd7f1ca6145f049934ebfbf7462ef09513d18
parent54a066a8a9a40c91065a7941e877a0ba4ad20097 (diff)
downloadlibpng-1.0.4c.tar.gz
Imported from libpng-1.0.4c.tarv1.0.4c
-rw-r--r--ANNOUNCE35
-rw-r--r--CHANGES18
-rw-r--r--INSTALL13
-rw-r--r--KNOWNBUG6
-rw-r--r--LICENSE24
-rw-r--r--README11
-rw-r--r--Y2KINFO4
-rw-r--r--example.c3
-rw-r--r--libpng.361
-rw-r--r--libpng.txt26
-rw-r--r--libpngpf.34
-rw-r--r--png.52
-rw-r--r--png.c17
-rw-r--r--png.h73
-rw-r--r--pngasmrd.h4
-rw-r--r--pngconf.h2
-rw-r--r--pngerror.c2
-rw-r--r--pngget.c2
-rw-r--r--pngmem.c2
-rw-r--r--pngnow.pngbin0 -> 2196 bytes
-rw-r--r--pngpread.c2
-rw-r--r--pngread.c6
-rw-r--r--pngrio.c2
-rw-r--r--pngrtran.c3
-rw-r--r--pngrutil.c25
-rw-r--r--pngset.c2
-rw-r--r--pngtest.c67
-rw-r--r--pngtrans.c2
-rw-r--r--pngvcrd.c4728
-rw-r--r--pngwio.c2
-rw-r--r--pngwrite.c2
-rw-r--r--pngwtran.c2
-rw-r--r--pngwutil.c2
-rw-r--r--scripts/makefile.beos4
-rw-r--r--scripts/makefile.borland4
-rw-r--r--scripts/makefile.dec2
-rw-r--r--scripts/makefile.linux4
-rw-r--r--scripts/makefile.msc2
-rw-r--r--scripts/makefile.sco2
-rw-r--r--scripts/makefile.solaris4
-rw-r--r--scripts/makefile.turboc32
-rw-r--r--scripts/makefile.vcawin32 (renamed from scripts/makefile.win32vc)10
-rw-r--r--scripts/makefile.vcwin3287
-rw-r--r--scripts/makefile.watcom2
-rw-r--r--scripts/pngdef.pas4
45 files changed, 2736 insertions, 2545 deletions
diff --git a/ANNOUNCE b/ANNOUNCE
index d98869175..dcdd13593 100644
--- a/ANNOUNCE
+++ b/ANNOUNCE
@@ -1,10 +1,12 @@
-Libpng 1.0.4 - September 19, 1999
+Libpng 1.0.4c - October 1, 1999
-This is a public release of libpng, intended for use in production codes.
+This is not intended to be a public release. It will be replaced
+within a few weeks by a public version or by another test version.
Changes since the last public release (1.0.3):
+version 1.0.3a [August 12, 1999]
Added check for PNG_READ_INTERLACE_SUPPORTED in pngread.c; issue a warning
if an attempt is made to read an interlaced image when it's not supported.
Added check if png_ptr->trans is defined before free'ing it in pngread.c
@@ -32,25 +34,50 @@ Changes since the last public release (1.0.3):
consistent with PNG-1.2, and allow variance of 500 before complaining.
Added assembler code contributed by Intel in file pngvcrd.c and modified
makefile.w32 to use it (Nirav Chhatrapati, INTEL Corporation, Gilles Vollant)
- Define PNG_USE_PNGVCRD in makefile.w32, to get MMX assembler code.
Changed "ln -s -f" to "ln -f -s" in the makefiles to make Solaris happy.
+ Added some aliases for png_set_expand() in pngrtran.c, namely
+ png_set_expand_PLTE(), png_set_expand_depth(), and png_set_expand_tRNS()
+ (Greg Roelofs, in "PNG: The Definitive Guide").
Added makefile.beo for BEOS on X86, contributed by Sander Stok.
+version 1.0.3b [August 26, 1999]
Replaced 2147483647L several places with PNG_MAX_UINT macro, defined in png.h
Changed leading blanks to tabs in all makefiles.
+ Define PNG_USE_PNGVCRD in makefile.w32, to get MMX assembler code.
Made alternate versions of png_set_expand() in pngrtran.c, namely
png_set_gray_1_2_4_to_8, png_set_palette_to_rgb, and png_set_tRNS_to_alpha
- (Greg Roelofs, in "PNG: The Definitive Guide").
+ (Greg Roelofs, in "PNG: The Definitive Guide"). Deleted the 1.0.3a aliases.
Relocated start of 'extern "C"' block in png.h so it doesn't include pngconf.h
Revised calculation of num_blocks in pngmem.c to avoid a potentially
negative shift distance, whose results are undefined in the C language.
Added a check in pngset.c to prevent writing multiple tIME chunks.
Added a check in pngwrite.c to detect invalid small window_bits sizes.
+version 1.0.3d [September 4, 1999]
+ Fixed type casting of igamma in pngrutil.c
+ Added new png_expand functions to scripts/pngdef.pas and pngos2.def
Added a demo read_user_transform_fn that examines the row filters in pngtest.c
+version 1.0.4 [September 24, 1999]
Define PNG_ALWAYS_EXTERN in pngconf.h if __STDC__ is defined
+ Delete #define PNG_INTERNAL and include "png.h" from pngasmrd.h
Made several minor corrections to pngtest.c
Changed "hptr += 16L" to "hptr = hptr + 16L" in pngmem.c for Turbo 3.0
Renamed the makefiles with longer but more user friendly extensions.
Copied the PNG copyright and license to a separate LICENSE file.
+ Revised documentation, png.h, and example.c to remove reference to
+ "viewing_gamma" which no longer appears in the PNG specification.
+ Revised pngvcrd.c to use MMX code for interlacing only on the final pass.
+ Updated pngvcrd.c to use the faster C filter algorithms from libpng-1.0.1a
+ Split makefile.win32vc into two versions, makefile.vcawin32 (uses MMX
+ assembler code) and makefile.vcwin32 (doesn't).
+ Added a CPU timing report to pngtest.c (enabled by defining PNGTEST_TIMING)
+version 1.0.4a [September 25, 1999]
+ Increase max_pixel_depth in pngrutil.c if a user transform needs it.
+ Changed several division operations to right-shifts in pngvcrd.c
+version 1.0.4b [September 30, 1999]
+ Added parentheses in line 3732 of pngvcrd.c
+ Added a comment in makefile.linux warning about buggy -O3 in pgcc 2.95.1
+version 1.0.4c [October 1, 1999]
+ Added a "png_check_version" function in png.c and pngtest.c that will generate
+ a helpful compiler error if an old png.h is found in the search path.
Send comments/corrections/commendations to
png-implement@ccrc.wustl.edu or to randeg@alum.rpi.edu
diff --git a/CHANGES b/CHANGES
index 7135912ec..ee7c17f45 100644
--- a/CHANGES
+++ b/CHANGES
@@ -436,9 +436,25 @@ version 1.0.3d [September 4, 1999]
Fixed type casting of igamma in pngrutil.c
Added new png_expand functions to scripts/pngdef.pas and pngos2.def
Added a demo read_user_transform_fn that examines the row filters in pngtest.c
-version 1.0.4 [September 19, 1999]
+version 1.0.4 [September 24, 1999]
Define PNG_ALWAYS_EXTERN in pngconf.h if __STDC__ is defined
Delete #define PNG_INTERNAL and include "png.h" from pngasmrd.h
Made several minor corrections to pngtest.c
Renamed the makefiles with longer but more user friendly extensions.
Copied the PNG copyright and license to a separate LICENSE file.
+ Revised documentation, png.h, and example.c to remove reference to
+ "viewing_gamma" which no longer appears in the PNG specification.
+ Revised pngvcrd.c to use MMX code for interlacing only on the final pass.
+ Updated pngvcrd.c to use the faster C filter algorithms from libpng-1.0.1a
+ Split makefile.win32vc into two versions, makefile.vcawin32 (uses MMX
+ assembler code) and makefile.vcwin32 (doesn't).
+ Added a CPU timing report to pngtest.c (enabled by defining PNGTEST_TIMING)
+version 1.0.4a [September 25, 1999]
+ Increase max_pixel_depth in pngrutil.c if a user transform needs it.
+ Changed several division operations to right-shifts in pngvcrd.c
+version 1.0.4b [September 30, 1999]
+ Added parentheses in line 3732 of pngvcrd.c
+ Added a comment in makefile.linux warning about buggy -O3 in pgcc 2.95.1
+version 1.0.4c [October 1, 1999]
+ Added a "png_check_version" function in png.c and pngtest.c that will generate
+ a helpful compiler error if an old png.h is found in the search path.
diff --git a/INSTALL b/INSTALL
index 1cd8b6323..475a8d477 100644
--- a/INSTALL
+++ b/INSTALL
@@ -1,5 +1,5 @@
-Installing libpng version 1.0.4 - September 19, 1999
+Installing libpng version 1.0.4c - October 1, 1999
Before installing libpng, you must first install zlib. zlib
can usually be found wherever you got libpng. zlib can be
@@ -10,7 +10,7 @@ zlib.h and zconf.h include files that correspond to the
version of zlib that's installed.
You can rename the directories that you downloaded (they
-might be called "libpng-1.0.4" or "lpng103" and "zlib-1.1.3"
+might be called "libpng-1.0.4c" or "lpng103" and "zlib-1.1.3"
or "zlib113") so that you have directories called "zlib" and "libpng".
Your directory structure should look like this:
@@ -47,8 +47,8 @@ include
makefile.hpux => HPUX (10.20 and 11.00) makefile
makefile.sgi => Silicon Graphics IRIX makefile
makefile.sunos => Sun makefile
- makefile.solaris => Solaris 2.X makefile (gcc, creates libpng.so.2.1.0.4)
- makefile.linux => Linux/ELF makefile (gcc, creates libpng.so.2.1.0.4)
+ makefile.solaris => Solaris 2.X makefile (gcc, creates libpng.so.2.1.0.4c)
+ makefile.linux => Linux/ELF makefile (gcc, creates libpng.so.2.1.0.4c)
makefile.sco => For SCO OSr5 ELF and Unixware 7 with Native cc
makefile.mips => MIPS makefile
makefile.acorn => Acorn makefile
@@ -61,7 +61,10 @@ include
build.bat => MS-DOS batch file for Borland compiler
makefile.dj2 => DJGPP 2 makefile
makefile.msc => Microsoft C makefile
- makefile.win32vc => makefile for Microsoft Visual C++ 4.0 and later
+ makefile.vcawin32 => makefile for Microsoft Visual C++ 5.0 and later (uses
+ assembler code)
+ makefile.vcwin32 => makefile for Microsoft Visual C++ 4.0 and later (does not
+ use assembler code)
makefile.turboc3 => Turbo C 3.0 makefile
makefile.os2 => OS/2 Makefile (gcc and emx, requires pngos2.def)
pngos2.def => OS/2 module definition file used by makefile.os2
diff --git a/KNOWNBUG b/KNOWNBUG
index 7c640a8ab..1c5629dcd 100644
--- a/KNOWNBUG
+++ b/KNOWNBUG
@@ -14,12 +14,12 @@ Known bugs and suggested enhancements in libpng-1.0.4
Question whether i-- or --i is better.
STATUS: Under investigation, postponed until after
- libpng-1.0.4. About 160 loops will be turned around
+ libpng-1.0.5. About 160 loops will be turned around
in libpng-1.0.Nn, for testing.
2. July 4, 1998 -- ENHANCEMENT -- Glenn R-P
- libpng-1.0.4 and earlier transform colors to gamma=1.0 space for
+ libpng-1.0.5 and earlier transform colors to gamma=1.0 space for
merging with background, and then back to the image's gamma. The
bit_depth of the intermediate (gamma=1.0) representation is probably
not sufficient. In the typical gamma=1/2.2 situation, the linear
@@ -34,7 +34,7 @@ Known bugs and suggested enhancements in libpng-1.0.4
It should be possible to use libpng without floating-point arithmetic.
STATUS: Under investigation, implementation postponed until after
- libpng-1.0.4. The application interface will change because replacements
+ libpng-1.0.5. The application interface will change because replacements
for the png_set_gAMA(), png_set_cHRM(), and corresponding png_get_()
functions will be needed.
diff --git a/LICENSE b/LICENSE
index 92dd0eaef..22fc3f20b 100644
--- a/LICENSE
+++ b/LICENSE
@@ -5,7 +5,25 @@ Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
Copyright (c) 1996, 1997 Andreas Dilger
(libpng versions 0.90, December 1996, through 0.96, May 1997)
Copyright (c) 1998, 1999 Glenn Randers-Pehrson
-(libpng versions 0.97, January 1998, through 1.0.4, September 19, 1999)
+(libpng versions 0.97, January 1998, through 1.0.4c, October 1, 1999)
+
+For the purposes of this copyright and license, "Contributing Authors"
+is defined as the following set of individuals:
+
+ John Bowler
+ Kevin Bracey
+ Sam Bushell
+ Andreas Dilger
+ Magnus Holmgren
+ Tom Lane
+ Dave Martindale
+ Glenn Randers-Pehrson
+ Greg Roelofs
+ Guy Eric Schalnat
+ Paul Schmidt
+ Tom Tanner
+ Willem van Schaik
+ Tim Wegner
The PNG Reference Library is supplied "AS IS". The Contributing Authors
and Group 42, Inc. disclaim all warranties, expressed or implied,
@@ -37,5 +55,5 @@ source code in a product, acknowledgment is not required but would be
appreciated.
Glenn Randers-Pehrson
-randeg at alum.rpi.edu
-September 19, 1999
+randeg@alum.rpi.edu
+October 1, 1999
diff --git a/README b/README
index ae4f72858..e19f30608 100644
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-README for libpng 1.0.4 - September 19, 1999 (shared library 2.1)
+README for libpng 1.0.4c - October 1, 1999 (shared library 2.1)
See the note about version numbers near the top of png.h
See INSTALL for instructions on how to install libpng.
@@ -163,9 +163,9 @@ Files in this distribution:
makefile.sgi => Silicon Graphics IRIX makefile
makefile.sunos => Sun makefile
makefile.solaris => Solaris 2.X makefile
- (gcc, creates libpng.so.2.1.0.4)
+ (gcc, creates libpng.so.2.1.0.4c)
makefile.linux => Linux/ELF makefile
- (gcc, creates libpng.so.2.1.0.4)
+ (gcc, creates libpng.so.2.1.0.4c)
makefile.sco => For SCO OSr5 ELF and Unixware 7 with Native cc
makefile.mips => MIPS makefile
makefile.acorn => Acorn makefile
@@ -179,7 +179,10 @@ Files in this distribution:
build.bat => MS-DOS batch file for Borland compiler
makefile.dj2 => DJGPP 2 makefile
makefile.msc => Microsoft C makefile
- makefile.win32vc => makefile for Microsoft Visual C++ 4.0 and later
+ makefile.vcawin32 => makefile for Microsoft Visual C++ 5.0 and
+ later (uses assembler code)
+ makefile.vcwin32 => makefile for Microsoft Visual C++ 4.0 and
+ later (does not use assembler code)
makefile.turboc3 => Turbo C 3.0 makefile
makefile.os2 => OS/2 Makefile (gcc and emx, requires pngos2.def)
pngos2.def => OS/2 module definition file used by makefile.os2
diff --git a/Y2KINFO b/Y2KINFO
index bdd22447e..103579ad5 100644
--- a/Y2KINFO
+++ b/Y2KINFO
@@ -1,13 +1,13 @@
Y2K compliance in libpng:
=========================
- September 19, 1999
+ October 1, 1999
Since the PNG Development group is an ad-hoc body, we can't make
an official declaration.
This is your unofficial assurance that libpng from version 0.71 and
- upward through 1.0.4 are Y2K compliant. It is my belief that earlier
+ upward through 1.0.4c are Y2K compliant. It is my belief that earlier
versions were also Y2K compliant.
Libpng only has three year fields. One is a 2-byte unsigned integer
diff --git a/example.c b/example.c
index a83ea48b5..52afbdfce 100644
--- a/example.c
+++ b/example.c
@@ -197,7 +197,8 @@ void read_png(FILE *fp, unsigned int sig_read) /* file is already open */
/* Some suggestions as to how to get a screen gamma value */
- /* Note that screen gamma is (display_gamma/viewing_gamma) */
+ /* Note that screen gamma is the display_exponent, which includes
+ * the CRT_exponent and any correction for viewing conditions */
if (/* We have a user-defined screen gamma value */)
{
screen_gamma = user-defined screen_gamma;
diff --git a/libpng.3 b/libpng.3
index 48f3eb537..8fbfa9980 100644
--- a/libpng.3
+++ b/libpng.3
@@ -1,6 +1,6 @@
-.TH LIBPNG 3 "September 19, 1999"
+.TH LIBPNG 3 "October 1, 1999"
.SH NAME
-libpng \- Portable Network Graphics (PNG) Reference Library 1.0.4 - September 19, 1999
+libpng \- Portable Network Graphics (PNG) Reference Library 1.0.4c - October 1, 1999
.SH SYNOPSIS
\fI\fB
@@ -617,7 +617,7 @@ Following is a copy of the libpng.txt file that accompanies libpng.
.SH LIBPNG.TXT
libpng.txt - A description on how to use and modify libpng
- libpng version 1.0.4 - September 19, 1999
+ libpng version 1.0.4c - October 1, 1999
Updated and distributed by Glenn Randers-Pehrson
<randeg@alum.rpi.edu>
Copyright (c) 1998, 1999 Glenn Randers-Pehrson
@@ -1314,17 +1314,15 @@ or as an RGB triplet that may or may not be in the palette (need_expand = 0).
To properly display PNG images on any kind of system, the application needs
to know what the display gamma is. Ideally, the user will know this, and
the application will allow them to set it. One method of allowing the user
-to set the display gamma separately for each system is to check for the
-DISPLAY_GAMMA and VIEWING_GAMMA environment variables or for a SCREEN_GAMMA
-environment variable, which will hopefully be correctly set.
-
-Note that display_gamma is the gamma of your display, while screen_gamma is
-the overall gamma correction required to produce pleasing results,
-which depends on the lighting conditions in the surrounding environment.
-Screen_gamma is display_gamma/viewing_gamma, where viewing_gamma is
-the amount of additional gamma correction needed to compensate for
-a (viewing_gamma=1.25) environment. In a dim or brightly lit room, no
-compensation other than the display_gamma is needed (viewing_gamma=1.0).
+to set the display gamma separately for each system is to check for a
+SCREEN_GAMMA or DISPLAY_GAMMA environment variable, which will hopefully be
+correctly set.
+
+Note that display_gamma is the overall gamma correction required to produce
+pleasing results, which depends on the lighting conditions in the surrounding
+environment. In a dim or brightly lit room, no compensation other than
+the physical gamma exponent of the monitor is needed, while in a dark room
+a slightly smaller exponent is better.
double gamma, screen_gamma;
@@ -2677,13 +2675,13 @@ the old method.
.SH VII. Y2K Compliance in libpng
-January 13, 1999
+October 1, 1999
Since the PNG Development group is an ad-hoc body, we can't make
an official declaration.
This is your unofficial assurance that libpng from version 0.71 and
-upward through 1.0.4 are Y2K compliant. It is my belief that earlier
+upward through 1.0.4c are Y2K compliant. It is my belief that earlier
versions were also Y2K compliant.
Libpng only has three year fields. One is a 2-byte unsigned integer that
@@ -2802,12 +2800,6 @@ and this library, the specification takes precedence.
.SH AUTHORS
This man page: Glenn Randers-Pehrson
<randeg@alum.rpi.edu>
-
-Contributing Authors: John Bowler, Kevin Bracey, Sam Bushell, Andreas Dilger,
-Magnus Holmgren, Tom Lane, Dave Martindale, Glenn Randers-Pehrson,
-Greg Roelofs, Guy Eric Schalnat, Paul Schmidt, Tom Tanner, Willem van
-Schaik, Tim Wegner.
-<png-implement@ccrc.wustl.edu>
The contributing authors would like to thank all those who helped
with testing, bug fixes, and patience. This wouldn't have been
@@ -2815,7 +2807,7 @@ possible without all of you.
Thanks to Frank J. T. Wojcik for helping with the documentation.
-Libpng version 1.0.4 - September 19, 1999:
+Libpng version 1.0.4c - October 1, 1999:
Initially created in 1995 by Guy Eric Schalnat, then of Group 42, Inc.
Currently maintained by Glenn Randers-Pehrson (randeg@alum.rpi.edu).
@@ -2830,7 +2822,25 @@ Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
Copyright (c) 1996, 1997 Andreas Dilger
(libpng versions 0.90, December 1996, through 0.96, May 1997)
Copyright (c) 1998, 1999 Glenn Randers-Pehrson
-(libpng versions 0.97, January 1998, through 1.0.4, September 19, 1999)
+(libpng versions 0.97, January 1998, through 1.0.4c, October 1, 1999)
+
+For the purposes of this copyright and license, "Contributing Authors"
+is defined as the following set of individuals:
+
+ John Bowler
+ Kevin Bracey
+ Sam Bushell
+ Andreas Dilger
+ Magnus Holmgren
+ Tom Lane
+ Dave Martindale
+ Glenn Randers-Pehrson
+ Greg Roelofs
+ Guy Eric Schalnat
+ Paul Schmidt
+ Tom Tanner
+ Willem van Schaik
+ Tim Wegner
The PNG Reference Library (libpng) is supplied "AS IS". The Contributing
Authors and Group 42, Inc. disclaim all warranties, expressed or implied,
@@ -2869,5 +2879,8 @@ boxes and the like:
Also, the PNG logo (in PNG format, of course) is supplied in the
file "pngnow.png".
+Libpng is OSI Certified Open Source Software. OSI Certified is a
+certification mark of the Open Source Initiative.
+
.\" end of man page
diff --git a/libpng.txt b/libpng.txt
index 7fd60af35..8023e79e0 100644
--- a/libpng.txt
+++ b/libpng.txt
@@ -1,6 +1,6 @@
libpng.txt - A description on how to use and modify libpng
- libpng version 1.0.4 - September 19, 1999
+ libpng version 1.0.4c - October 1, 1999
Updated and distributed by Glenn Randers-Pehrson
<randeg@alum.rpi.edu>
Copyright (c) 1998, 1999 Glenn Randers-Pehrson
@@ -697,17 +697,15 @@ or as an RGB triplet that may or may not be in the palette (need_expand = 0).
To properly display PNG images on any kind of system, the application needs
to know what the display gamma is. Ideally, the user will know this, and
the application will allow them to set it. One method of allowing the user
-to set the display gamma separately for each system is to check for the
-DISPLAY_GAMMA and VIEWING_GAMMA environment variables or for a SCREEN_GAMMA
-environment variable, which will hopefully be correctly set.
-
-Note that display_gamma is the gamma of your display, while screen_gamma is
-the overall gamma correction required to produce pleasing results,
-which depends on the lighting conditions in the surrounding environment.
-Screen_gamma is display_gamma/viewing_gamma, where viewing_gamma is
-the amount of additional gamma correction needed to compensate for
-a (viewing_gamma=1.25) environment. In a dim or brightly lit room, no
-compensation other than the display_gamma is needed (viewing_gamma=1.0).
+to set the display gamma separately for each system is to check for a
+SCREEN_GAMMA or DISPLAY_GAMMA environment variable, which will hopefully be
+correctly set.
+
+Note that display_gamma is the overall gamma correction required to produce
+pleasing results, which depends on the lighting conditions in the surrounding
+environment. In a dim or brightly lit room, no compensation other than
+the physical gamma exponent of the monitor is needed, while in a dark room
+a slightly smaller exponent is better.
double gamma, screen_gamma;
@@ -2060,13 +2058,13 @@ the old method.
VII. Y2K Compliance in libpng
-January 13, 1999
+October 1, 1999
Since the PNG Development group is an ad-hoc body, we can't make
an official declaration.
This is your unofficial assurance that libpng from version 0.71 and
-upward through 1.0.4 are Y2K compliant. It is my belief that earlier
+upward through 1.0.4c are Y2K compliant. It is my belief that earlier
versions were also Y2K compliant.
Libpng only has three year fields. One is a 2-byte unsigned integer that
diff --git a/libpngpf.3 b/libpngpf.3
index c808ae8ed..fb0803f9d 100644
--- a/libpngpf.3
+++ b/libpngpf.3
@@ -1,6 +1,6 @@
-.TH LIBPNGPF 3 September 19, 1999
+.TH LIBPNGPF 3 October 1, 1999
.SH NAME
-libpng \- Portable Network Graphics (PNG) Reference Library 1.0.4 - September 19, 1999
+libpng \- Portable Network Graphics (PNG) Reference Library 1.0.4c - October 1, 1999
(private functions)
.SH SYNOPSIS
\fB#include <png.h>\fP
diff --git a/png.5 b/png.5
index 0bfb13015..081adb520 100644
--- a/png.5
+++ b/png.5
@@ -1,4 +1,4 @@
-.TH PNG 5 "September 19, 1999"
+.TH PNG 5 "October 1, 1999"
.SH NAME
png \- Portable Network Graphics (PNG) format
.SH DESCRIPTION
diff --git a/png.c b/png.c
index dfca0501f..1b0141b36 100644
--- a/png.c
+++ b/png.c
@@ -1,7 +1,7 @@
/* png.c - location for general purpose libpng functions
*
- * libpng version 1.0.4 - September 19, 1999
+ * libpng version 1.0.4c - October 1, 1999
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
* Copyright (c) 1998, 1999 Glenn Randers-Pehrson
@@ -16,7 +16,7 @@
* string defined in png.h.
*/
-char png_libpng_ver[12] = "1.0.4";
+char png_libpng_ver[12] = "1.0.4c";
/* Place to hold the signature string for a PNG file. */
png_byte FARDATA png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10};
@@ -73,12 +73,12 @@ int FARDATA png_pass_mask[] = {0x80, 0x08, 0x88, 0x22, 0xaa, 0x55, 0xff};
/* Mask to determine which pixels to overwrite while displaying */
int FARDATA png_pass_dsp_mask[] = {0xff, 0x0f, 0xff, 0x33, 0xff, 0x55, 0xff};
-
/* Tells libpng that we have already handled the first "num_bytes" bytes
* of the PNG file signature. If the PNG data is embedded into another
* stream we can set num_bytes = 8 so that libpng will not attempt to read
* or write any of the magic bytes before it starts on the IHDR.
*/
+
void
png_set_sig_bytes(png_structp png_ptr, int num_bytes)
{
@@ -352,8 +352,17 @@ png_get_copyright(png_structp png_ptr)
{
if(png_ptr == NULL)
/* silence compiler warning about unused png_ptr */ ;
- return("\n libpng version 1.0.4 - September 19, 1999\n\
+ return("\n libpng version 1.0.4c - October 1, 1999\n\
Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.\n\
Copyright (c) 1996, 1997 Andreas Dilger\n\
Copyright (c) 1998, 1999 Glenn Randers-Pehrson\n");
}
+
+/* Generate a compiler error if there is an old png.h in the search path. */
+void
+png_check_version
+ (version_1_0_4c png_h_is_not_version_1_0_4c)
+{
+ if(png_h_is_not_version_1_0_4c == NULL)
+ /* silence compiler warning about unused parameter */ ;
+}
diff --git a/png.h b/png.h
index d37070383..67a29e575 100644
--- a/png.h
+++ b/png.h
@@ -1,7 +1,7 @@
/* png.h - header file for PNG reference library
*
- * libpng version 1.0.4 - September 19, 1999
+ * libpng version 1.0.4c - October 1, 1999
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
* Copyright (c) 1998, 1999 Glenn Randers-Pehrson
@@ -9,19 +9,19 @@
* Authors and maintainers:
* libpng versions 0.71, May 1995, through 0.89c, May 1996: Guy Schalnat
* libpng versions 0.90, December 1996, through 0.96, May 1997: Andreas Dilger
- * libpng versions 0.97, January 1998, through 1.0.4 - September 19, 1999: Glenn R-P
+ * libpng versions 0.97, January 1998, through 1.0.4c - October 1, 1999: Glenn
* See also "Contributing Authors", below.
*
* Y2K compliance in libpng:
* =========================
*
- * January 13, 1999
+ * October 1, 1999
*
* Since the PNG Development group is an ad-hoc body, we can't make
* an official declaration.
*
* This is your unofficial assurance that libpng from version 0.71 and
- * upward through 1.0.4 are Y2K compliant. It is my belief that earlier
+ * upward through 1.0.4c are Y2K compliant. It is my belief that earlier
* versions were also Y2K compliant.
*
* Libpng only has three year fields. One is a 2-byte unsigned integer
@@ -86,8 +86,8 @@
* 0.98 0.98 98 2.0.98
* 0.99 0.99 98 2.0.99
* 0.99a-m 0.99 99 2.0.99
- * 1.00 1.00 100 2.1.0 [int should be 10000]
- * 1.0.0 1.0.0 100 2.1.0 [int should be 10000]
+ * 1.00 1.00 100 2.1.0 [100 should be 10000]
+ * 1.0.0 1.0.0 100 2.1.0 [100 should be 10000]
* 1.0.1 1.0.1 10001 2.1.0
* 1.0.1a-e 1.0.1a-e 10002 2.1.0.1a-e
* 1.0.2 1.0.2 10002 2.1.0.2
@@ -95,6 +95,8 @@
* 1.0.3 1.0.3 10003 2.1.0.3
* 1.0.3a-d 1.0.3a-d 10004 2.1.0.3a-d
* 1.0.4 1.0.4 10004 2.1.0.4
+ * 1.0.4a-c 1.0.4a-c 10005 2.1.0.4a-c
+ * 1.0.5 1.0.5 10005 2.1.0.5
*
* Henceforth the source version will match the shared-library minor
* and patch numbers; the shared-library major version number will be
@@ -108,7 +110,18 @@
* is available as RFC 2083 <ftp://ftp.uu.net/graphics/png/documents/>
* and as a W3C Recommendation <http://www.w3.org/TR/REC.png.html>
*
- * Contributing Authors:
+ * COPYRIGHT NOTICE:
+ *
+ * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
+ * (libpng versions 0.5, May 1995, through 0.89c, May 1996)
+ * Copyright (c) 1996, 1997 Andreas Dilger
+ * (libpng versions 0.90, December 1996, through 0.96, May 1997)
+ * Copyright (c) 1998, 1999 Glenn Randers-Pehrson
+ * (libpng versions 0.97, January 1998, through 1.0.4c, October 1, 1999)
+ *
+ * For the purposes of this copyright and license, "Contributing Authors"
+ * is defined as the following set of individuals:
+ *
* John Bowler
* Kevin Bracey
* Sam Bushell
@@ -124,21 +137,6 @@
* Willem van Schaik
* Tim Wegner
*
- * The contributing authors would like to thank all those who helped
- * with testing, bug fixes, and patience. This wouldn't have been
- * possible without all of you.
- *
- * Thanks to Frank J. T. Wojcik for helping with the documentation.
- *
- * COPYRIGHT NOTICE:
- *
- * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
- * (libpng versions 0.5, May 1995, through 0.89c, May 1996)
- * Copyright (c) 1996, 1997 Andreas Dilger
- * (libpng versions 0.90, December 1996, through 0.96, May 1997)
- * Copyright (c) 1998, 1999 Glenn Randers-Pehrson
- * (libpng versions 0.97, January 1998, through 1.0.4, September 19, 1999)
- *
* The PNG Reference Library is supplied "AS IS". The Contributing Authors
* and Group 42, Inc. disclaim all warranties, expressed or implied,
* including, without limitation, the warranties of merchantability and of
@@ -169,6 +167,20 @@
* appreciated.
*/
+/*
+ *
+ * Libpng is OSI Certified Open Source Software. OSI Certified is a
+ * certification mark of the Open Source Initiative.
+ */
+
+/*
+ * The contributing authors would like to thank all those who helped
+ * with testing, bug fixes, and patience. This wouldn't have been
+ * possible without all of you.
+ *
+ * Thanks to Frank J. T. Wojcik for helping with the documentation.
+ */
+
#ifndef _PNG_H
#define _PNG_H
@@ -196,14 +208,16 @@ extern "C" {
*/
/* Version information for png.h - this should match the version in png.c */
-#define PNG_LIBPNG_VER_STRING "1.0.4"
+#define PNG_LIBPNG_VER_STRING "1.0.4c"
/* Careful here. At one time, Guy wanted to use 082, but that would be octal.
* We must not include leading zeros.
* Versions 0.7 through 1.0.0 were in the range 0 to 100 here (only
* version 1.0.0 was mis-numbered 100 instead of 10000). From
* version 1.0.1 it's xxyyzz, where x=major, y=minor, z=bugfix */
-#define PNG_LIBPNG_VER 10004 /* 1.0.4 */
+#define PNG_LIBPNG_VER 10005 /* 1.0.5 */
+
+/* Note to maintainer: update this number in scripts/pngdef.pas as well */
/* variables declared in png.c - only it needs to define PNG_NO_EXTERN */
#if !defined(PNG_NO_EXTERN) || defined(PNG_ALWAYS_EXTERN)
@@ -701,7 +715,7 @@ struct png_struct_def
#if defined(PNG_READ_GAMMA_SUPPORTED) || defined(PNG_READ_BACKGROUND_SUPPORTED)
int gamma_shift; /* number of "insignificant" bits 16-bit gamma */
float gamma; /* file gamma value */
- float screen_gamma; /* screen gamma value (display_gamma/viewing_gamma */
+ float screen_gamma; /* screen gamma value (display_exponent) */
#endif /* PNG_READ_GAMMA_SUPPORTED */
#if defined(PNG_READ_GAMMA_SUPPORTED) || defined(PNG_READ_BACKGROUND_SUPPORTED)
png_bytep gamma_table; /* gamma table for 8-bit depth files */
@@ -791,6 +805,11 @@ struct png_struct_def
#endif
};
+/* This prevents a compiler error in png_check_version() in png.c if png.c
+ * and png.h are both at version 1.0.4c
+ */
+typedef png_structp version_1_0_4c;
+
typedef png_struct FAR * FAR * png_structpp;
/* Here are the function definitions most commonly used. This is not
@@ -993,7 +1012,7 @@ extern PNG_EXPORT(void,png_set_dither) PNGARG((png_structp png_ptr,
#endif /* PNG_READ_DITHER_SUPPORTED */
#if defined(PNG_READ_GAMMA_SUPPORTED)
-/* Handle gamma correction. Screen_gamma=(display_gamma/viewing_gamma) */
+/* Handle gamma correction. Screen_gamma=(display_exponent) */
extern PNG_EXPORT(void,png_set_gamma) PNGARG((png_structp png_ptr,
double screen_gamma, double default_file_gamma));
#endif /* PNG_READ_GAMMA_SUPPORTED */
@@ -1610,7 +1629,7 @@ png_get_header_version(png_structp png_ptr)
{
if(png_ptr == NULL)
/* silence compiler warning about unused png_ptr */ ;
- return("\n libpng version 1.0.4 - September 19, 1999 (header)\n");
+ return("\n libpng version 1.0.4c - October 1, 1999 (header)\n");
}
#endif
diff --git a/pngasmrd.h b/pngasmrd.h
index ae9853cc1..e6c9c02e0 100644
--- a/pngasmrd.h
+++ b/pngasmrd.h
@@ -1,6 +1,6 @@
/* pngasmrd.h - assembler version of utilities to read a PNG file
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1999 Glenn Randers-Pehrson
*
@@ -21,7 +21,7 @@
/* Set this in the makefile for gcc on Pentium, not in pngconf.h */
#ifdef PNG_USE_PNGGCCRD
/* Platform must be Pentium. Makefile must assemble and load pnggccrd.c
- * (not available in libpng 1.0.4).
+ * (not available in libpng 1.0.4c).
* MMX will be detected at run time and used if present.
*/
#define PNG_HAVE_ASSEMBLER_COMBINE_ROW
diff --git a/pngconf.h b/pngconf.h
index 3f546bc50..41316d9e9 100644
--- a/pngconf.h
+++ b/pngconf.h
@@ -1,7 +1,7 @@
/* pngconf.h - machine configurable file for libpng
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngerror.c b/pngerror.c
index 2d2cede02..63a2b4814 100644
--- a/pngerror.c
+++ b/pngerror.c
@@ -1,7 +1,7 @@
/* pngerror.c - stub functions for i/o and memory allocation
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngget.c b/pngget.c
index 248db8e36..c27d9b9c1 100644
--- a/pngget.c
+++ b/pngget.c
@@ -1,7 +1,7 @@
/* pngget.c - retrieval of values from info struct
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngmem.c b/pngmem.c
index 06eb09098..cbaa27e19 100644
--- a/pngmem.c
+++ b/pngmem.c
@@ -1,7 +1,7 @@
/* pngmem.c - stub functions for memory allocation
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngnow.png b/pngnow.png
new file mode 100644
index 000000000..16280e7d8
--- /dev/null
+++ b/pngnow.png
Binary files differ
diff --git a/pngpread.c b/pngpread.c
index 9d2fe4888..d703b8120 100644
--- a/pngpread.c
+++ b/pngpread.c
@@ -1,7 +1,7 @@
/* pngpread.c - read a png file in push mode
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngread.c b/pngread.c
index fc700f487..1517535a2 100644
--- a/pngread.c
+++ b/pngread.c
@@ -1,7 +1,7 @@
/* pngread.c - read a PNG file
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
@@ -531,7 +531,7 @@ png_read_row(png_structp png_ptr, png_bytep row, png_bytep dsp_row)
* not called png_set_interlace_handling(), the display_row buffer will
* be ignored, so pass NULL to it.
*
- * [*] png_handle_alpha() does not exist yet, as of libpng version 1.0.4.
+ * [*] png_handle_alpha() does not exist yet, as of libpng version 1.0.4c.
*/
void
@@ -580,7 +580,7 @@ png_read_rows(png_structp png_ptr, png_bytepp row,
* only call this function once. If you desire to have an image for
* each pass of a interlaced image, use png_read_rows() instead.
*
- * [*] png_handle_alpha() does not exist yet, as of libpng version 1.0.4.
+ * [*] png_handle_alpha() does not exist yet, as of libpng version 1.0.4c.
*/
void
png_read_image(png_structp png_ptr, png_bytepp image)
diff --git a/pngrio.c b/pngrio.c
index 8d4390c2d..4cc33a7fd 100644
--- a/pngrio.c
+++ b/pngrio.c
@@ -1,7 +1,7 @@
/* pngrio.c - functions for data input
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngrtran.c b/pngrtran.c
index f8b8e80de..9c2b0edfd 100644
--- a/pngrtran.c
+++ b/pngrtran.c
@@ -1,7 +1,7 @@
/* pngrtran.c - transforms the data in a row for PNG readers
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
@@ -1069,6 +1069,7 @@ png_read_transform_info(png_structp png_ptr, png_infop info_ptr)
info_ptr->pixel_depth = (png_byte)(info_ptr->channels *
info_ptr->bit_depth);
info_ptr->rowbytes = ((info_ptr->width * info_ptr->pixel_depth + 7) >> 3);
+
}
/* Transform the row. The order of transformations is significant,
diff --git a/pngrutil.c b/pngrutil.c
index c49ac6190..bbf0838da 100644
--- a/pngrutil.c
+++ b/pngrutil.c
@@ -1,7 +1,7 @@
/* pngrutil.c - utilities to read a PNG file
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
@@ -945,7 +945,7 @@ png_handle_hIST(png_structp png_ptr, png_infop info_ptr, png_uint_32 length)
return;
}
- num = (int)length / 2;
+ num = (int)length / 2 ;
png_ptr->hist = (png_uint_16p)png_malloc(png_ptr,
(png_uint_32)(num * sizeof (png_uint_16)));
png_ptr->flags |= PNG_FLAG_FREE_HIST;
@@ -1892,6 +1892,7 @@ png_do_read_interlace
png_size_t pixel_bytes = (row_info->pixel_depth >> 3);
png_bytep sp = row + (png_size_t)(row_info->width - 1) * pixel_bytes;
png_bytep dp = row + (png_size_t)(final_width - 1) * pixel_bytes;
+
int jstop = png_pass_inc[pass];
png_uint_32 i;
@@ -1937,7 +1938,7 @@ png_read_filter_row
{
png_uint_32 i;
png_uint_32 istop = row_info->rowbytes;
- png_uint_32 bpp = (row_info->pixel_depth + 7) / 8;
+ png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
png_bytep rp = row + bpp;
png_bytep lp = row;
@@ -1968,20 +1969,20 @@ png_read_filter_row
png_bytep rp = row;
png_bytep pp = prev_row;
png_bytep lp = row;
- png_uint_32 bpp = (row_info->pixel_depth + 7) / 8;
+ png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
png_uint_32 istop = row_info->rowbytes - bpp;
for (i = 0; i < bpp; i++)
{
*rp = (png_byte)(((int)(*rp) +
- ((int)(*pp++) / 2)) & 0xff);
+ ((int)(*pp++) / 2 )) & 0xff);
rp++;
}
for (i = 0; i < istop; i++)
{
*rp = (png_byte)(((int)(*rp) +
- (int)(*pp++ + *lp++) / 2) & 0xff);
+ (int)(*pp++ + *lp++) / 2 ) & 0xff);
rp++;
}
break;
@@ -1993,7 +1994,7 @@ png_read_filter_row
png_bytep pp = prev_row;
png_bytep lp = row;
png_bytep cp = prev_row;
- png_uint_32 bpp = (row_info->pixel_depth + 7) / 8;
+ png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
png_uint_32 istop=row_info->rowbytes - bpp;
for (i = 0; i < bpp; i++)
@@ -2267,6 +2268,16 @@ png_read_start_row(png_structp png_ptr)
}
#endif
+#if defined(PNG_READ_USER_TRANSFORM_SUPPORTED)
+ if(png_ptr->transformations & PNG_USER_TRANSFORM)
+ {
+ int user_pixel_depth=png_ptr->user_transform_depth*
+ png_ptr->user_transform_channels;
+ if(user_pixel_depth > max_pixel_depth)
+ max_pixel_depth=user_pixel_depth;
+ }
+#endif
+
/* align the width on the next larger 8 pixels. Mainly used
for interlacing */
row_bytes = ((png_ptr->width + 7) & ~((png_uint_32)7));
diff --git a/pngset.c b/pngset.c
index e0f9e0ac3..bfec9072b 100644
--- a/pngset.c
+++ b/pngset.c
@@ -1,7 +1,7 @@
/* pngset.c - storage of image information into info struct
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngtest.c b/pngtest.c
index afd5c376d..51d289c15 100644
--- a/pngtest.c
+++ b/pngtest.c
@@ -1,7 +1,7 @@
/* pngtest.c - a simple test program to test libpng
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
@@ -35,8 +35,24 @@
#define PNG_DEBUG 0
#endif
+/* Turn on CPU timing
+#define PNGTEST_TIMING
+*/
+
+#ifdef PNGTEST_TIMING
+static float t_start, t_stop, t_decode, t_encode, t_misc;
+#include <time.h>
+#endif
+
#include "png.h"
+#ifdef PNGTEST_TIMING
+static float t_start, t_stop, t_decode, t_encode, t_misc; /* NOTE(review): repeats the tentative definition already made before #include "png.h" */
+#if !defined(PNG_READ_tIME_SUPPORTED) && !defined(PNG_WRITE_tIME_SUPPORTED)
+#include <time.h> /* NOTE(review): <time.h> is already included earlier whenever PNGTEST_TIMING is defined */
+#endif
+#endif
+
#if defined(PNG_TIME_RFC1123_SUPPORTED)
static int tIME_chunk_present=0;
static char tIME_string[30] = "no tIME chunk present in file";
@@ -800,16 +816,36 @@ test_one_file(PNG_CONST char *inname, PNG_CONST char *outname)
}
png_debug(0, "Writing row data\n");
+#if defined(PNG_READ_INTERLACING_SUPPORTED) || \
+ defined(PNG_WRITE_INTERLACING_SUPPORTED)
num_pass = png_set_interlace_handling(read_ptr);
png_set_interlace_handling(write_ptr);
+#else
+ num_pass=1;
+#endif
+#ifdef PNGTEST_TIMING
+ t_stop = (float)clock();
+ t_misc += (t_stop - t_start);
+ t_start = t_stop;
+#endif
for (pass = 0; pass < num_pass; pass++)
{
png_debug1(0, "Writing row data for pass %d\n",pass);
for (y = 0; y < height; y++)
{
png_read_rows(read_ptr, (png_bytepp)&row_buf, (png_bytepp)NULL, 1);
+#ifdef PNGTEST_TIMING
+ t_stop = (float)clock();
+ t_decode += (t_stop - t_start);
+ t_start = t_stop;
+#endif
png_write_rows(write_ptr, (png_bytepp)&row_buf, 1);
+#ifdef PNGTEST_TIMING
+ t_stop = (float)clock();
+ t_encode += (t_stop - t_start);
+ t_start = t_stop;
+#endif
}
}
@@ -1040,7 +1076,9 @@ main(int argc, char *argv[])
#endif
}
#ifdef PNG_USER_MEM_SUPPORTED
- fprintf(STDERR, "Maximum memory allocation: %d bytes\n",
+ fprintf(STDERR, " Current memory allocation: %d bytes\n",
+ current_allocation);
+ fprintf(STDERR, " Maximum memory allocation: %d bytes\n",
maximum_allocation);
#endif
}
@@ -1103,11 +1141,27 @@ main(int argc, char *argv[])
#endif
}
#ifdef PNG_USER_MEM_SUPPORTED
- fprintf(STDERR, "Maximum memory allocation: %d bytes\n",
+ fprintf(STDERR, " Current memory allocation: %d bytes\n",
+ current_allocation);
+ fprintf(STDERR, " Maximum memory allocation: %d bytes\n",
maximum_allocation);
#endif
}
+#ifdef PNGTEST_TIMING
+ t_stop = (float)clock();
+ t_misc += (t_stop - t_start);
+ t_start = t_stop;
+ fprintf(STDERR," CPU time used = %.3f seconds",
+ (t_misc+t_decode+t_encode)/(float)CLOCKS_PER_SEC);
+ fprintf(STDERR," (decoding %.3f,\n",
+ t_decode/(float)CLOCKS_PER_SEC);
+ fprintf(STDERR," encoding %.3f ,",
+ t_encode/(float)CLOCKS_PER_SEC);
+ fprintf(STDERR," other %.3f seconds)\n\n",
+ t_misc/(float)CLOCKS_PER_SEC);
+#endif
+
if (ierror == 0)
fprintf(STDERR, "libpng passes test\n");
else
@@ -1115,3 +1169,10 @@ main(int argc, char *argv[])
return (int)(ierror != 0);
}
+/* Generate a compiler error if there is an old png.h in the search path. */
+void
+png_check_pngtest_version
+ (version_1_0_4c png_h_is_not_version_1_0_4c) /* parameter type is presumably supplied only by the 1.0.4c png.h -- TODO confirm */
+{
+ if(png_h_is_not_version_1_0_4c == NULL) return; /* returns either way; the test merely references the parameter */
+}
diff --git a/pngtrans.c b/pngtrans.c
index bf1401858..57a1f9426 100644
--- a/pngtrans.c
+++ b/pngtrans.c
@@ -1,7 +1,7 @@
/* pngtrans.c - transforms the data in a row (used by both readers and writers)
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngvcrd.c b/pngvcrd.c
index 8f429d92a..4ab0a913c 100644
--- a/pngvcrd.c
+++ b/pngvcrd.c
@@ -1,13 +1,13 @@
-/* pngvcrd.c - assembler version of utilities to read a PNG file
+/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
*
- * For Intel CPU and Microsoft Visual C++ compiler
+ * For Intel x86 CPU and Microsoft Visual C++ compiler
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1998, Intel Corporation
* Copyright (c) 1998, 1999 Glenn Randers-Pehrson
*
- * Contributed by Nirav Chhatrapati, INTEL Corporation, 1998
+ * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
* Interface to libpng contributed by Gilles Vollant, 1999
*
*/
@@ -15,7 +15,7 @@
#define PNG_INTERNAL
#include "png.h"
-#ifdef PNG_ASSEMBLER_CODE_SUPPORTED
+#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
static int mmx_supported=2;
@@ -68,8 +68,8 @@ NOT_SUPPORTED:
}
-//mmx_supported_local=0; // test code for force don't support MMX
- //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
+ //mmx_supported_local=0; // test code to force MMX support off
+ //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
return mmx_supported_local;
}
@@ -85,858 +85,857 @@ NOT_SUPPORTED:
to any alpha or transparency value associated with the pixel. If
you want all pixels to be combined, pass 0xff (255) in mask. */
-/* Use this routine for X86 platform - uses faster MMX routine if machine
-supports MMX */
+/* Use this routine on x86 platforms - it uses the faster MMX code if the machine
+ supports MMX, but MMX is turned off when PNG_INTERLACE is set and the pass is not 6 */
void
-png_combine_row(png_structp png_ptr, png_bytep row,
- int mask)
+png_combine_row(png_structp png_ptr, png_bytep row, int mask)
{
- //int mmx_supported=0; // another test code for remove MMX in this routine
+ int save_mmx_supported = mmx_supported;
png_debug(1,"in png_combine_row_asm\n");
- //if (mmx_supported==2)
- // mmx_supported=mmxsupport();
+
+ if ((png_ptr->transformations & PNG_INTERLACE) && png_ptr->pass != 6)
+ mmx_supported = 0;
+ else
+ if (mmx_supported == 2)
+ mmx_supported = mmxsupport();
if (mask == 0xff)
{
png_memcpy(row, png_ptr->row_buf + 1,
- (png_size_t)((png_ptr->width *
- png_ptr->row_info.pixel_depth + 7) >> 3));
+ (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
}
else
{
- switch (png_ptr->row_info.pixel_depth)
+ switch (png_ptr->row_info.pixel_depth)
{
- case 1:
- {
- png_bytep sp;
- png_bytep dp;
- int s_inc, s_start, s_end;
- int m;
- int shift;
- png_uint_32 i;
-
- sp = png_ptr->row_buf + 1;
- dp = row;
- m = 0x80;
+ case 1:
+ {
+ png_bytep sp;
+ png_bytep dp;
+ int s_inc, s_start, s_end;
+ int m;
+ int shift;
+ png_uint_32 i;
+
+ sp = png_ptr->row_buf + 1;
+ dp = row;
+ m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
- if (png_ptr->transformations & PNG_PACKSWAP)
- {
- s_start = 0;
- s_end = 7;
- s_inc = 1;
- }
- else
+ if (png_ptr->transformations & PNG_PACKSWAP)
+ {
+ s_start = 0;
+ s_end = 7;
+ s_inc = 1;
+ }
+ else
#endif
- {
- s_start = 7;
- s_end = 0;
- s_inc = -1;
- }
+ {
+ s_start = 7;
+ s_end = 0;
+ s_inc = -1;
+ }
- shift = s_start;
+ shift = s_start;
- for (i = 0; i < png_ptr->width; i++)
- {
- if (m & mask)
- {
- int value;
+ for (i = 0; i < png_ptr->width; i++)
+ {
+ if (m & mask)
+ {
+ int value;
- value = (*sp >> shift) & 0x1;
- *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
- *dp |= (png_byte)(value << shift);
+ value = (*sp >> shift) & 0x1;
+ *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
+ *dp |= (png_byte)(value << shift);
+ }
+
+ if (shift == s_end)
+ {
+ shift = s_start;
+ sp++;
+ dp++;
+ }
+ else
+ shift += s_inc;
+
+ if (m == 1)
+ m = 0x80;
+ else
+ m >>= 1;
+ }
+ break;
}
- if (shift == s_end)
+ case 2:
{
- shift = s_start;
- sp++;
- dp++;
- }
- else
- shift += s_inc;
+ png_bytep sp;
+ png_bytep dp;
+ int s_start, s_end, s_inc;
+ int m;
+ int shift;
+ png_uint_32 i;
+ int value;
- if (m == 1)
- m = 0x80;
- else
- m >>= 1;
- }
- break;
- }
- case 2:
- {
- png_bytep sp;
- png_bytep dp;
- int s_start, s_end, s_inc;
- int m;
- int shift;
- png_uint_32 i;
- int value;
-
- sp = png_ptr->row_buf + 1;
- dp = row;
- m = 0x80;
+ sp = png_ptr->row_buf + 1;
+ dp = row;
+ m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
- if (png_ptr->transformations & PNG_PACKSWAP)
- {
- s_start = 0;
- s_end = 6;
- s_inc = 2;
- }
- else
+ if (png_ptr->transformations & PNG_PACKSWAP)
+ {
+ s_start = 0;
+ s_end = 6;
+ s_inc = 2;
+ }
+ else
#endif
- {
- s_start = 6;
- s_end = 0;
- s_inc = -2;
- }
+ {
+ s_start = 6;
+ s_end = 0;
+ s_inc = -2;
+ }
- shift = s_start;
+ shift = s_start;
- for (i = 0; i < png_ptr->width; i++)
- {
- if (m & mask)
- {
- value = (*sp >> shift) & 0x3;
- *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
- *dp |= (png_byte)(value << shift);
- }
+ for (i = 0; i < png_ptr->width; i++)
+ {
+ if (m & mask)
+ {
+ value = (*sp >> shift) & 0x3;
+ *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
+ *dp |= (png_byte)(value << shift);
+ }
- if (shift == s_end)
- {
- shift = s_start;
- sp++;
- dp++;
+ if (shift == s_end)
+ {
+ shift = s_start;
+ sp++;
+ dp++;
+ }
+ else
+ shift += s_inc;
+ if (m == 1)
+ m = 0x80;
+ else
+ m >>= 1;
+ }
+ break;
}
- else
- shift += s_inc;
- if (m == 1)
- m = 0x80;
- else
- m >>= 1;
- }
- break;
- }
- case 4:
- {
- png_bytep sp;
- png_bytep dp;
- int s_start, s_end, s_inc;
- int m;
- int shift;
- png_uint_32 i;
- int value;
-
- sp = png_ptr->row_buf + 1;
- dp = row;
- m = 0x80;
+ case 4:
+ {
+ png_bytep sp;
+ png_bytep dp;
+ int s_start, s_end, s_inc;
+ int m;
+ int shift;
+ png_uint_32 i;
+ int value;
+
+ sp = png_ptr->row_buf + 1;
+ dp = row;
+ m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
- if (png_ptr->transformations & PNG_PACKSWAP)
- {
- s_start = 0;
- s_end = 4;
- s_inc = 4;
- }
- else
+ if (png_ptr->transformations & PNG_PACKSWAP)
+ {
+ s_start = 0;
+ s_end = 4;
+ s_inc = 4;
+ }
+ else
#endif
- {
- s_start = 4;
- s_end = 0;
- s_inc = -4;
- }
- shift = s_start;
+ {
+ s_start = 4;
+ s_end = 0;
+ s_inc = -4;
+ }
+ shift = s_start;
- for (i = 0; i < png_ptr->width; i++)
- {
- if (m & mask)
- {
- value = (*sp >> shift) & 0xf;
- *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
- *dp |= (png_byte)(value << shift);
- }
+ for (i = 0; i < png_ptr->width; i++)
+ {
+ if (m & mask)
+ {
+ value = (*sp >> shift) & 0xf;
+ *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
+ *dp |= (png_byte)(value << shift);
+ }
- if (shift == s_end)
- {
- shift = s_start;
- sp++;
- dp++;
+ if (shift == s_end)
+ {
+ shift = s_start;
+ sp++;
+ dp++;
+ }
+ else
+ shift += s_inc;
+ if (m == 1)
+ m = 0x80;
+ else
+ m >>= 1;
+ }
+ break;
}
- else
- shift += s_inc;
- if (m == 1)
- m = 0x80;
- else
- m >>= 1;
- }
- break;
- }
- case 8:
- {
- png_bytep srcptr;
- png_bytep dstptr;
- png_uint_32 len;
- int m;
- int diff, unmask;
+ case 8:
+ {
+ png_bytep srcptr;
+ png_bytep dstptr;
+ png_uint_32 len;
+ int m;
+ int diff, unmask;
- __int64 mask0=0x0102040810204080;
+ __int64 mask0=0x0102040810204080;
- if (mmx_supported)
- {
- srcptr = png_ptr->row_buf + 1;
- dstptr = row;
- m = 0x80;
- unmask = ~mask;
- len = png_ptr->width &~7; //reduce to multiple of 8
- diff = png_ptr->width & 7; //amount lost
- _asm {
- movd mm7, unmask //load bit pattern
- psubb mm6,mm6 //zero mm6
- punpcklbw mm7,mm7
- punpcklwd mm7,mm7
- punpckldq mm7,mm7 //fill register with 8 masks
+ if (mmx_supported)
+ {
+ srcptr = png_ptr->row_buf + 1;
+ dstptr = row;
+ m = 0x80;
+ unmask = ~mask;
+ len = png_ptr->width &~7; //reduce to multiple of 8
+ diff = png_ptr->width & 7; //amount lost
+
+ _asm
+ {
+ movd mm7, unmask //load bit pattern
+ psubb mm6,mm6 //zero mm6
+ punpcklbw mm7,mm7
+ punpcklwd mm7,mm7
+ punpckldq mm7,mm7 //fill register with 8 masks
- movq mm0,mask0
+ movq mm0,mask0
- pand mm0,mm7 //nonzero if keep byte
- pcmpeqb mm0,mm6 //zeros->1s, v versa
+ pand mm0,mm7 //nonzero if keep byte
+ pcmpeqb mm0,mm6 //zeros->1s, v versa
- mov ecx,len //load length of line
- mov esi,srcptr //load source
- mov ebx,dstptr //load dest
- cmp ecx,0 //lcr
- je mainloop8end
+ mov ecx,len //load length of line
+ mov esi,srcptr //load source
+ mov ebx,dstptr //load dest
+ cmp ecx,0 //lcr
+ je mainloop8end
mainloop8:
- movq mm4,[esi]
- pand mm4,mm0
- movq mm6,mm0
- pandn mm6,[ebx]
- por mm4,mm6
- movq [ebx],mm4
-
- add esi,8 //inc by 8 bytes processed
- add ebx,8
- sub ecx,8 //dec by 8 pixels processed
-
- ja mainloop8
+ movq mm4,[esi]
+ pand mm4,mm0
+ movq mm6,mm0
+ pandn mm6,[ebx]
+ por mm4,mm6
+ movq [ebx],mm4
+
+ add esi,8 //inc by 8 bytes processed
+ add ebx,8
+ sub ecx,8 //dec by 8 pixels processed
+
+ ja mainloop8
mainloop8end:
- mov ecx,diff
- cmp ecx,0
- jz end8
+ mov ecx,diff
+ cmp ecx,0
+ jz end8
- mov edx,mask
- sal edx,24 //make low byte the high byte
+ mov edx,mask
+ sal edx,24 //make low byte the high byte
secondloop8:
- sal edx,1 //move high bit to CF
- jnc skip8 //if CF = 0
- mov al,[esi]
- mov [ebx],al
+ sal edx,1 //move high bit to CF
+ jnc skip8 //if CF = 0
+ mov al,[esi]
+ mov [ebx],al
skip8:
- inc esi
- inc ebx
+ inc esi
+ inc ebx
- dec ecx
- jnz secondloop8
+ dec ecx
+ jnz secondloop8
end8:
- emms
- }
- }
- else /* mmx _not supported - Use modified C routine*/
- {
- register unsigned int incr1, initial_val, final_val;
- png_size_t pixel_bytes;
- png_uint_32 i;
- //if ((mask != 0x0f) && (mask != 0x33))
- register int disp = png_pass_inc[png_ptr->pass];
- int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
- pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
- srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
- pixel_bytes;
- dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
- initial_val = offset_table[png_ptr->pass]*pixel_bytes;
- final_val = png_ptr->width*pixel_bytes;
- incr1 = (disp)*pixel_bytes;
- for (i = initial_val; i < final_val; i += incr1)
- {
- png_memcpy(dstptr, srcptr, pixel_bytes);
- srcptr += incr1;
- dstptr += incr1;
- }
- } /* end of else */
-
- break;
- } //end 8bpp
-
- case 16:
- {
- png_bytep srcptr;
- png_bytep dstptr;
- png_uint_32 len;
- int unmask, diff;
+ emms
+ }
+ }
+ else /* mmx not supported - use modified C routine */
+ {
+ register unsigned int incr1, initial_val, final_val;
+ png_size_t pixel_bytes;
+ png_uint_32 i;
+ //if ((mask != 0x0f) && (mask != 0x33))
+ register int disp = png_pass_inc[png_ptr->pass];
+ int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
+ pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
+ srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
+ pixel_bytes;
+ dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
+ initial_val = offset_table[png_ptr->pass]*pixel_bytes;
+ final_val = png_ptr->width*pixel_bytes;
+ incr1 = (disp)*pixel_bytes;
+ for (i = initial_val; i < final_val; i += incr1)
+ {
+ png_memcpy(dstptr, srcptr, pixel_bytes);
+ srcptr += incr1;
+ dstptr += incr1;
+ }
+ } /* end of else */
- __int64 mask1=0x0101020204040808,
- mask0=0x1010202040408080;
+ break;
+ } //end 8bpp
- if (mmx_supported)
- {
- srcptr = png_ptr->row_buf + 1;
- dstptr = row;
+ case 16:
+ {
+ png_bytep srcptr;
+ png_bytep dstptr;
+ png_uint_32 len;
+ int unmask, diff;
+ __int64 mask1=0x0101020204040808,
+ mask0=0x1010202040408080;
+
+ if (mmx_supported)
+ {
+ srcptr = png_ptr->row_buf + 1;
+ dstptr = row;
- unmask = ~mask;
- len = (png_ptr->width)&~7;
- diff = (png_ptr->width)&7;
- _asm {
- movd mm7, unmask //load bit pattern
- psubb mm6,mm6 //zero mm6
- punpcklbw mm7,mm7
- punpcklwd mm7,mm7
- punpckldq mm7,mm7 //fill register with 8 masks
+ unmask = ~mask;
+ len = (png_ptr->width)&~7;
+ diff = (png_ptr->width)&7;
+ _asm
+ {
+ movd mm7, unmask //load bit pattern
+ psubb mm6,mm6 //zero mm6
+ punpcklbw mm7,mm7
+ punpcklwd mm7,mm7
+ punpckldq mm7,mm7 //fill register with 8 masks
- movq mm0,mask0
- movq mm1,mask1
+ movq mm0,mask0
+ movq mm1,mask1
- pand mm0,mm7
- pand mm1,mm7
+ pand mm0,mm7
+ pand mm1,mm7
- pcmpeqb mm0,mm6
- pcmpeqb mm1,mm6
+ pcmpeqb mm0,mm6
+ pcmpeqb mm1,mm6
- mov ecx,len //load length of line
- mov esi,srcptr //load source
- mov ebx,dstptr //load dest
- cmp ecx,0 //lcr
- jz mainloop16end
+ mov ecx,len //load length of line
+ mov esi,srcptr //load source
+ mov ebx,dstptr //load dest
+ cmp ecx,0 //lcr
+ jz mainloop16end
mainloop16:
- movq mm4,[esi]
- pand mm4,mm0
- movq mm6,mm0
- movq mm7,[ebx]
- pandn mm6,mm7
- por mm4,mm6
- movq [ebx],mm4
-
- movq mm5,[esi+8]
- pand mm5,mm1
- movq mm7,mm1
- movq mm6,[ebx+8]
- pandn mm7,mm6
- por mm5,mm7
- movq [ebx+8],mm5
-
- add esi,16 //inc by 16 bytes processed
- add ebx,16
- sub ecx,8 //dec by 8 pixels processed
-
- ja mainloop16
-mainloop16end:
-
- mov ecx,diff
- cmp ecx,0
- jz end16
+ movq mm4,[esi]
+ pand mm4,mm0
+ movq mm6,mm0
+ movq mm7,[ebx]
+ pandn mm6,mm7
+ por mm4,mm6
+ movq [ebx],mm4
+
+ movq mm5,[esi+8]
+ pand mm5,mm1
+ movq mm7,mm1
+ movq mm6,[ebx+8]
+ pandn mm7,mm6
+ por mm5,mm7
+ movq [ebx+8],mm5
+
+ add esi,16 //inc by 16 bytes processed
+ add ebx,16
+ sub ecx,8 //dec by 8 pixels processed
+
+ ja mainloop16
- mov edx,mask
- sal edx,24 //make low byte the high byte
+mainloop16end:
+ mov ecx,diff
+ cmp ecx,0
+ jz end16
+ mov edx,mask
+ sal edx,24 //make low byte the high byte
secondloop16:
- sal edx,1 //move high bit to CF
- jnc skip16 //if CF = 0
- mov ax,[esi]
- mov [ebx],ax
+ sal edx,1 //move high bit to CF
+ jnc skip16 //if CF = 0
+ mov ax,[esi]
+ mov [ebx],ax
skip16:
- add esi,2
- add ebx,2
-
- dec ecx
- jnz secondloop16
+ add esi,2
+ add ebx,2
+ dec ecx
+ jnz secondloop16
end16:
- emms
- }
- }
- else /* mmx _not supported - Use modified C routine */
- {
- register unsigned int incr1, initial_val, final_val;
- png_size_t pixel_bytes;
- png_uint_32 i;
- register int disp = png_pass_inc[png_ptr->pass];
- int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
- pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
- srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
- pixel_bytes;
- dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
- initial_val = offset_table[png_ptr->pass]*pixel_bytes;
- final_val = png_ptr->width*pixel_bytes;
- incr1 = (disp)*pixel_bytes;
- for (i = initial_val; i < final_val; i += incr1)
- {
- png_memcpy(dstptr, srcptr, pixel_bytes);
- srcptr += incr1;
- dstptr += incr1;
- }
- } /* end of else */
-
- break;
- }
- case 24:
- {
- png_bytep srcptr;
- png_bytep dstptr;
- png_uint_32 len;
- int unmask, diff;
-
- __int64 mask2=0x0101010202020404, //24bpp
- mask1=0x0408080810101020,
- mask0=0x2020404040808080;
-
- srcptr = png_ptr->row_buf + 1;
- dstptr = row;
-
- unmask = ~mask;
- len = (png_ptr->width)&~7;
- diff = (png_ptr->width)&7;
+ emms
+ }
+ }
+ else /* mmx not supported - use modified C routine */
+ {
+ register unsigned int incr1, initial_val, final_val;
+ png_size_t pixel_bytes;
+ png_uint_32 i;
+ register int disp = png_pass_inc[png_ptr->pass];
+ int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
+
+ pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
+ srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
+ pixel_bytes;
+ dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
+ initial_val = offset_table[png_ptr->pass]*pixel_bytes;
+ final_val = png_ptr->width*pixel_bytes;
+ incr1 = (disp)*pixel_bytes;
+ for (i = initial_val; i < final_val; i += incr1)
+ {
+ png_memcpy(dstptr, srcptr, pixel_bytes);
+ srcptr += incr1;
+ dstptr += incr1;
+ }
+ } /* end of else */
- if (mmx_supported)
- {
- _asm {
- movd mm7, unmask //load bit pattern
- psubb mm6,mm6 //zero mm6
- punpcklbw mm7,mm7
- punpcklwd mm7,mm7
- punpckldq mm7,mm7 //fill register with 8 masks
+ break;
+ }
- movq mm0,mask0
- movq mm1,mask1
- movq mm2,mask2
+ case 24:
+ {
+ png_bytep srcptr;
+ png_bytep dstptr;
+ png_uint_32 len;
+ int unmask, diff;
+ __int64 mask2=0x0101010202020404, //24bpp
+ mask1=0x0408080810101020,
+ mask0=0x2020404040808080;
- pand mm0,mm7
- pand mm1,mm7
- pand mm2,mm7
+ srcptr = png_ptr->row_buf + 1;
+ dstptr = row;
- pcmpeqb mm0,mm6
- pcmpeqb mm1,mm6
- pcmpeqb mm2,mm6
+ unmask = ~mask;
+ len = (png_ptr->width)&~7;
+ diff = (png_ptr->width)&7;
- mov ecx,len //load length of line
- mov esi,srcptr //load source
- mov ebx,dstptr //load dest
- cmp ecx,0
- jz mainloop24end
+ if (mmx_supported)
+ {
+ _asm
+ {
+ movd mm7, unmask //load bit pattern
+ psubb mm6,mm6 //zero mm6
+ punpcklbw mm7,mm7
+ punpcklwd mm7,mm7
+ punpckldq mm7,mm7 //fill register with 8 masks
+
+ movq mm0,mask0
+ movq mm1,mask1
+ movq mm2,mask2
+
+ pand mm0,mm7
+ pand mm1,mm7
+ pand mm2,mm7
+
+ pcmpeqb mm0,mm6
+ pcmpeqb mm1,mm6
+ pcmpeqb mm2,mm6
+
+ mov ecx,len //load length of line
+ mov esi,srcptr //load source
+ mov ebx,dstptr //load dest
+ cmp ecx,0
+ jz mainloop24end
mainloop24:
- movq mm4,[esi]
- pand mm4,mm0
- movq mm6,mm0
- movq mm7,[ebx]
- pandn mm6,mm7
- por mm4,mm6
- movq [ebx],mm4
-
-
- movq mm5,[esi+8]
- pand mm5,mm1
- movq mm7,mm1
- movq mm6,[ebx+8]
- pandn mm7,mm6
- por mm5,mm7
- movq [ebx+8],mm5
-
- movq mm6,[esi+16]
- pand mm6,mm2
- movq mm4,mm2
- movq mm7,[ebx+16]
- pandn mm4,mm7
- por mm6,mm4
- movq [ebx+16],mm6
-
- add esi,24 //inc by 24 bytes processed
- add ebx,24
- sub ecx,8 //dec by 8 pixels processed
-
- ja mainloop24
-mainloop24end:
-
- mov ecx,diff
- cmp ecx,0
- jz end24
+ movq mm4,[esi]
+ pand mm4,mm0
+ movq mm6,mm0
+ movq mm7,[ebx]
+ pandn mm6,mm7
+ por mm4,mm6
+ movq [ebx],mm4
+
+
+ movq mm5,[esi+8]
+ pand mm5,mm1
+ movq mm7,mm1
+ movq mm6,[ebx+8]
+ pandn mm7,mm6
+ por mm5,mm7
+ movq [ebx+8],mm5
+
+ movq mm6,[esi+16]
+ pand mm6,mm2
+ movq mm4,mm2
+ movq mm7,[ebx+16]
+ pandn mm4,mm7
+ por mm6,mm4
+ movq [ebx+16],mm6
+
+ add esi,24 //inc by 24 bytes processed
+ add ebx,24
+ sub ecx,8 //dec by 8 pixels processed
+
+ ja mainloop24
- mov edx,mask
- sal edx,24 //make low byte the high byte
+mainloop24end:
+ mov ecx,diff
+ cmp ecx,0
+ jz end24
+ mov edx,mask
+ sal edx,24 //make low byte the high byte
secondloop24:
- sal edx,1 //move high bit to CF
- jnc skip24 //if CF = 0
- mov ax,[esi]
- mov [ebx],ax
- xor eax,eax
- mov al,[esi+2]
- mov [ebx+2],al
+ sal edx,1 //move high bit to CF
+ jnc skip24 //if CF = 0
+ mov ax,[esi]
+ mov [ebx],ax
+ xor eax,eax
+ mov al,[esi+2]
+ mov [ebx+2],al
skip24:
- add esi,3
- add ebx,3
+ add esi,3
+ add ebx,3
- dec ecx
- jnz secondloop24
+ dec ecx
+ jnz secondloop24
end24:
- emms
-
- }
- }
- else /* mmx _not supported - Use modified C routine */
- {
- register unsigned int incr1, initial_val, final_val;
- png_size_t pixel_bytes;
- png_uint_32 i;
- register int disp = png_pass_inc[png_ptr->pass];
- int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
- pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
- srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]
- *pixel_bytes;
- dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
- initial_val = offset_table[png_ptr->pass]*pixel_bytes;
- final_val = png_ptr->width*pixel_bytes;
- incr1 = (disp)*pixel_bytes;
- for (i = initial_val; i < final_val; i += incr1)
- {
- png_memcpy(dstptr, srcptr, pixel_bytes);
- srcptr += incr1;
- dstptr += incr1;
- }
- } /* end of else */
-
- break;
- } //end 24bpp
- case 32:
- {
- png_bytep srcptr;
- png_bytep dstptr;
- png_uint_32 len;
- int unmask, diff;
-
-
-
- __int64 mask3=0x0101010102020202, //32bpp
- mask2=0x0404040408080808,
- mask1=0x1010101020202020,
- mask0=0x4040404080808080;
-
- srcptr = png_ptr->row_buf + 1;
- dstptr = row;
-
- unmask = ~mask;
- len = (png_ptr->width)&~7;
- diff = (png_ptr->width)&7;
-
- if (mmx_supported)
- {
- _asm {
- movd mm7, unmask //load bit pattern
- psubb mm6,mm6 //zero mm6
- punpcklbw mm7,mm7
- punpcklwd mm7,mm7
- punpckldq mm7,mm7 //fill register with 8 masks
+ emms
+ }
+ }
+ else /* mmx not supported - use modified C routine */
+ {
+ register unsigned int incr1, initial_val, final_val;
+ png_size_t pixel_bytes;
+ png_uint_32 i;
+ register int disp = png_pass_inc[png_ptr->pass];
+ int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
+ pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
+ srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
+ pixel_bytes;
+ dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
+ initial_val = offset_table[png_ptr->pass]*pixel_bytes;
+ final_val = png_ptr->width*pixel_bytes;
+ incr1 = (disp)*pixel_bytes;
+ for (i = initial_val; i < final_val; i += incr1)
+ {
+ png_memcpy(dstptr, srcptr, pixel_bytes);
+ srcptr += incr1;
+ dstptr += incr1;
+ }
+ } /* end of else */
- movq mm0,mask0
- movq mm1,mask1
- movq mm2,mask2
- movq mm3,mask3
+ break;
+ } //end 24bpp
+ case 32:
+ {
+ png_bytep srcptr;
+ png_bytep dstptr;
+ png_uint_32 len;
+ int unmask, diff;
- pand mm0,mm7
- pand mm1,mm7
- pand mm2,mm7
- pand mm3,mm7
+ __int64 mask3=0x0101010102020202, //32bpp
+ mask2=0x0404040408080808,
+ mask1=0x1010101020202020,
+ mask0=0x4040404080808080;
- pcmpeqb mm0,mm6
- pcmpeqb mm1,mm6
- pcmpeqb mm2,mm6
- pcmpeqb mm3,mm6
+ srcptr = png_ptr->row_buf + 1;
+ dstptr = row;
- mov ecx,len //load length of line
- mov esi,srcptr //load source
- mov ebx,dstptr //load dest
+ unmask = ~mask;
+ len = (png_ptr->width)&~7;
+ diff = (png_ptr->width)&7;
- cmp ecx,0 //lcr
- jz mainloop32end
+ if (mmx_supported)
+ {
+ _asm
+ {
+ movd mm7, unmask //load bit pattern
+ psubb mm6,mm6 //zero mm6
+ punpcklbw mm7,mm7
+ punpcklwd mm7,mm7
+ punpckldq mm7,mm7 //fill register with 8 masks
+
+ movq mm0,mask0
+ movq mm1,mask1
+ movq mm2,mask2
+ movq mm3,mask3
+
+ pand mm0,mm7
+ pand mm1,mm7
+ pand mm2,mm7
+ pand mm3,mm7
+
+ pcmpeqb mm0,mm6
+ pcmpeqb mm1,mm6
+ pcmpeqb mm2,mm6
+ pcmpeqb mm3,mm6
+
+ mov ecx,len //load length of line
+ mov esi,srcptr //load source
+ mov ebx,dstptr //load dest
+
+ cmp ecx,0 //lcr
+ jz mainloop32end
mainloop32:
- movq mm4,[esi]
- pand mm4,mm0
- movq mm6,mm0
- movq mm7,[ebx]
- pandn mm6,mm7
- por mm4,mm6
- movq [ebx],mm4
-
-
- movq mm5,[esi+8]
- pand mm5,mm1
- movq mm7,mm1
- movq mm6,[ebx+8]
- pandn mm7,mm6
- por mm5,mm7
- movq [ebx+8],mm5
-
- movq mm6,[esi+16]
- pand mm6,mm2
- movq mm4,mm2
- movq mm7,[ebx+16]
- pandn mm4,mm7
- por mm6,mm4
- movq [ebx+16],mm6
-
- movq mm7,[esi+24]
- pand mm7,mm3
- movq mm5,mm3
- movq mm4,[ebx+24]
- pandn mm5,mm4
- por mm7,mm5
- movq [ebx+24],mm7
-
-
- add esi,32 //inc by 32 bytes processed
- add ebx,32
- sub ecx,8 //dec by 8 pixels processed
-
- ja mainloop32
-mainloop32end:
+ movq mm4,[esi]
+ pand mm4,mm0
+ movq mm6,mm0
+ movq mm7,[ebx]
+ pandn mm6,mm7
+ por mm4,mm6
+ movq [ebx],mm4
+
+ movq mm5,[esi+8]
+ pand mm5,mm1
+ movq mm7,mm1
+ movq mm6,[ebx+8]
+ pandn mm7,mm6
+ por mm5,mm7
+ movq [ebx+8],mm5
+
+ movq mm6,[esi+16]
+ pand mm6,mm2
+ movq mm4,mm2
+ movq mm7,[ebx+16]
+ pandn mm4,mm7
+ por mm6,mm4
+ movq [ebx+16],mm6
+
+ movq mm7,[esi+24]
+ pand mm7,mm3
+ movq mm5,mm3
+ movq mm4,[ebx+24]
+ pandn mm5,mm4
+ por mm7,mm5
+ movq [ebx+24],mm7
+
+ add esi,32 //inc by 32 bytes processed
+ add ebx,32
+ sub ecx,8 //dec by 8 pixels processed
+
+ ja mainloop32
- mov ecx,diff
- cmp ecx,0
- jz end32
-
- mov edx,mask
- sal edx,24 //make low byte the high byte
+mainloop32end:
+ mov ecx,diff
+ cmp ecx,0
+ jz end32
+ mov edx,mask
+ sal edx,24 //make low byte the high byte
secondloop32:
- sal edx,1 //move high bit to CF
- jnc skip32 //if CF = 0
- mov eax,[esi]
- mov [ebx],eax
+ sal edx,1 //move high bit to CF
+ jnc skip32 //if CF = 0
+ mov eax,[esi]
+ mov [ebx],eax
skip32:
- add esi,4
- add ebx,4
+ add esi,4
+ add ebx,4
- dec ecx
- jnz secondloop32
+ dec ecx
+ jnz secondloop32
end32:
- emms
-
- }
- }
- else /* mmx _not supported - Use modified C routine */
- {
- register unsigned int incr1, initial_val, final_val;
- png_size_t pixel_bytes;
- png_uint_32 i;
- register int disp = png_pass_inc[png_ptr->pass];
- int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
- pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
- srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
- pixel_bytes;
- dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
- initial_val = offset_table[png_ptr->pass]*pixel_bytes;
- final_val = png_ptr->width*pixel_bytes;
- incr1 = (disp)*pixel_bytes;
- for (i = initial_val; i < final_val; i += incr1)
- {
- png_memcpy(dstptr, srcptr, pixel_bytes);
- srcptr += incr1;
- dstptr += incr1;
- }
- } /* end of else */
-
- break;
- } //end 32bpp
+ emms
+ }
+ }
+ else /* mmx _not supported - Use modified C routine */
+ {
+ register unsigned int incr1, initial_val, final_val;
+ png_size_t pixel_bytes;
+ png_uint_32 i;
+ register int disp = png_pass_inc[png_ptr->pass];
+ int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
+ pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
+ srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
+ pixel_bytes;
+ dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
+ initial_val = offset_table[png_ptr->pass]*pixel_bytes;
+ final_val = png_ptr->width*pixel_bytes;
+ incr1 = (disp)*pixel_bytes;
+ for (i = initial_val; i < final_val; i += incr1)
+ {
+ png_memcpy(dstptr, srcptr, pixel_bytes);
+ srcptr += incr1;
+ dstptr += incr1;
+ }
+ } /* end of else */
+ break;
+ } //end 32bpp
- case 48:
- {
- png_bytep srcptr;
- png_bytep dstptr;
- png_uint_32 len;
- int unmask, diff;
-
- __int64 mask5=0x0101010101010202,
- mask4=0x0202020204040404,
- mask3=0x0404080808080808,
- mask2=0x1010101010102020,
- mask1=0x2020202040404040,
- mask0=0x4040808080808080;
-
- if (mmx_supported)
- {
+ case 48:
+ {
+ png_bytep srcptr;
+ png_bytep dstptr;
+ png_uint_32 len;
+ int unmask, diff;
+
+ __int64 mask5=0x0101010101010202,
+ mask4=0x0202020204040404,
+ mask3=0x0404080808080808,
+ mask2=0x1010101010102020,
+ mask1=0x2020202040404040,
+ mask0=0x4040808080808080;
+
+ if (mmx_supported)
+ {
+ srcptr = png_ptr->row_buf + 1;
+ dstptr = row;
- srcptr = png_ptr->row_buf + 1;
- dstptr = row;
-
- unmask = ~mask;
- len = (png_ptr->width)&~7;
- diff = (png_ptr->width)&7;
- _asm {
- movd mm7, unmask //load bit pattern
- psubb mm6,mm6 //zero mm6
- punpcklbw mm7,mm7
- punpcklwd mm7,mm7
- punpckldq mm7,mm7 //fill register with 8 masks
-
- movq mm0,mask0
- movq mm1,mask1
- movq mm2,mask2
- movq mm3,mask3
- movq mm4,mask4
- movq mm5,mask5
-
- pand mm0,mm7
- pand mm1,mm7
- pand mm2,mm7
- pand mm3,mm7
- pand mm4,mm7
- pand mm5,mm7
-
- pcmpeqb mm0,mm6
- pcmpeqb mm1,mm6
- pcmpeqb mm2,mm6
- pcmpeqb mm3,mm6
- pcmpeqb mm4,mm6
- pcmpeqb mm5,mm6
-
- mov ecx,len //load length of line
- mov esi,srcptr //load source
- mov ebx,dstptr //load dest
-
- cmp ecx,0
- jz mainloop48end
+ unmask = ~mask;
+ len = (png_ptr->width)&~7;
+ diff = (png_ptr->width)&7;
+ _asm
+ {
+ movd mm7, unmask //load bit pattern
+ psubb mm6,mm6 //zero mm6
+ punpcklbw mm7,mm7
+ punpcklwd mm7,mm7
+ punpckldq mm7,mm7 //fill register with 8 masks
+
+ movq mm0,mask0
+ movq mm1,mask1
+ movq mm2,mask2
+ movq mm3,mask3
+ movq mm4,mask4
+ movq mm5,mask5
+
+ pand mm0,mm7
+ pand mm1,mm7
+ pand mm2,mm7
+ pand mm3,mm7
+ pand mm4,mm7
+ pand mm5,mm7
+
+ pcmpeqb mm0,mm6
+ pcmpeqb mm1,mm6
+ pcmpeqb mm2,mm6
+ pcmpeqb mm3,mm6
+ pcmpeqb mm4,mm6
+ pcmpeqb mm5,mm6
+
+ mov ecx,len //load length of line
+ mov esi,srcptr //load source
+ mov ebx,dstptr //load dest
+
+ cmp ecx,0
+ jz mainloop48end
mainloop48:
- movq mm7,[esi]
- pand mm7,mm0
- movq mm6,mm0
- pandn mm6,[ebx]
- por mm7,mm6
- movq [ebx],mm7
-
-
- movq mm6,[esi+8]
- pand mm6,mm1
- movq mm7,mm1
- pandn mm7,[ebx+8]
- por mm6,mm7
- movq [ebx+8],mm6
-
- movq mm6,[esi+16]
- pand mm6,mm2
- movq mm7,mm2
- pandn mm7,[ebx+16]
- por mm6,mm7
- movq [ebx+16],mm6
-
- movq mm7,[esi+24]
- pand mm7,mm3
- movq mm6,mm3
- pandn mm6,[ebx+24]
- por mm7,mm6
- movq [ebx+24],mm7
-
- movq mm6,[esi+32]
- pand mm6,mm4
- movq mm7,mm4
- pandn mm7,[ebx+32]
- por mm6,mm7
- movq [ebx+32],mm6
-
- movq mm7,[esi+40]
- pand mm7,mm5
- movq mm6,mm5
- pandn mm6,[ebx+40]
- por mm7,mm6
- movq [ebx+40],mm7
-
- add esi,48 //inc by 32 bytes processed
- add ebx,48
- sub ecx,8 //dec by 8 pixels processed
-
- ja mainloop48
+ movq mm7,[esi]
+ pand mm7,mm0
+ movq mm6,mm0
+ pandn mm6,[ebx]
+ por mm7,mm6
+ movq [ebx],mm7
+
+ movq mm6,[esi+8]
+ pand mm6,mm1
+ movq mm7,mm1
+ pandn mm7,[ebx+8]
+ por mm6,mm7
+ movq [ebx+8],mm6
+
+ movq mm6,[esi+16]
+ pand mm6,mm2
+ movq mm7,mm2
+ pandn mm7,[ebx+16]
+ por mm6,mm7
+ movq [ebx+16],mm6
+
+ movq mm7,[esi+24]
+ pand mm7,mm3
+ movq mm6,mm3
+ pandn mm6,[ebx+24]
+ por mm7,mm6
+ movq [ebx+24],mm7
+
+ movq mm6,[esi+32]
+ pand mm6,mm4
+ movq mm7,mm4
+ pandn mm7,[ebx+32]
+ por mm6,mm7
+ movq [ebx+32],mm6
+
+ movq mm7,[esi+40]
+ pand mm7,mm5
+ movq mm6,mm5
+ pandn mm6,[ebx+40]
+ por mm7,mm6
+ movq [ebx+40],mm7
+
+ add esi,48 //inc by 32 bytes processed
+ add ebx,48
+ sub ecx,8 //dec by 8 pixels processed
+
+ ja mainloop48
mainloop48end:
- mov ecx,diff
- cmp ecx,0
- jz end48
+ mov ecx,diff
+ cmp ecx,0
+ jz end48
- mov edx,mask
- sal edx,24 //make low byte the high byte
+ mov edx,mask
+ sal edx,24 //make low byte the high byte
secondloop48:
- sal edx,1 //move high bit to CF
- jnc skip48 //if CF = 0
- mov eax,[esi]
- mov [ebx],eax
+ sal edx,1 //move high bit to CF
+ jnc skip48 //if CF = 0
+ mov eax,[esi]
+ mov [ebx],eax
skip48:
- add esi,4
- add ebx,4
+ add esi,4
+ add ebx,4
- dec ecx
- jnz secondloop48
+ dec ecx
+ jnz secondloop48
end48:
- emms
- }
- }
- else /* mmx _not supported - Use modified C routine */
- {
- register unsigned int incr1, initial_val, final_val;
- png_size_t pixel_bytes;
- png_uint_32 i;
- register int disp = png_pass_inc[png_ptr->pass];
- int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
- pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
- srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
- pixel_bytes;
- dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
- initial_val = offset_table[png_ptr->pass]*pixel_bytes;
- final_val = png_ptr->width*pixel_bytes;
- incr1 = (disp)*pixel_bytes;
- for (i = initial_val; i < final_val; i += incr1)
- {
- png_memcpy(dstptr, srcptr, pixel_bytes);
- srcptr += incr1;
- dstptr += incr1;
- }
- } /* end of else */
- break; // end 48 bpp
- }
- default:
- {
- png_bytep sptr;
- png_bytep dp;
- png_size_t pixel_bytes;
- int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
- unsigned int i;
- register int disp = png_pass_inc[png_ptr->pass]; // get the offset
- register unsigned int incr1, initial_val, final_val;
- pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
- sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*pixel_bytes;
- dp = row + offset_table[png_ptr->pass]*pixel_bytes;
- initial_val = offset_table[png_ptr->pass]*pixel_bytes;
- final_val = png_ptr->width*pixel_bytes;
- incr1 = (disp)*pixel_bytes;
- for (i = initial_val; i < final_val; i += incr1)
- {
- png_memcpy(dp, sptr, pixel_bytes);
- sptr += incr1;
- dp += incr1;
- }
+ emms
+ }
+ }
+ else /* mmx _not supported - Use modified C routine */
+ {
+ register unsigned int incr1, initial_val, final_val;
+ png_size_t pixel_bytes;
+ png_uint_32 i;
+ register int disp = png_pass_inc[png_ptr->pass];
+ int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
+ pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
+ srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
+ pixel_bytes;
+ dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
+ initial_val = offset_table[png_ptr->pass]*pixel_bytes;
+ final_val = png_ptr->width*pixel_bytes;
+ incr1 = (disp)*pixel_bytes;
+ for (i = initial_val; i < final_val; i += incr1)
+ {
+ png_memcpy(dstptr, srcptr, pixel_bytes);
+ srcptr += incr1;
+ dstptr += incr1;
+ }
+ } /* end of else */
+ break; // end 48 bpp
+ }
- break;
- }
- }
- }
-}
+ default:
+ {
+ png_bytep sptr;
+ png_bytep dp;
+ png_size_t pixel_bytes;
+ int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
+ unsigned int i;
+ register int disp = png_pass_inc[png_ptr->pass]; // get the offset
+ register unsigned int incr1, initial_val, final_val;
+ pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
+ sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
+ pixel_bytes;
+ dp = row + offset_table[png_ptr->pass]*pixel_bytes;
+ initial_val = offset_table[png_ptr->pass]*pixel_bytes;
+ final_val = png_ptr->width*pixel_bytes;
+ incr1 = (disp)*pixel_bytes;
+ for (i = initial_val; i < final_val; i += incr1)
+ {
+ png_memcpy(dp, sptr, pixel_bytes);
+ sptr += incr1;
+ dp += incr1;
+ }
+ break;
+ }
+ } /* end switch (png_ptr->row_info.pixel_depth) */
+ }
+ mmx_supported = save_mmx_supported;
+
+} /* end png_combine_row() */
#if defined(PNG_READ_INTERLACING_SUPPORTED)
@@ -946,9 +945,11 @@ png_do_read_interlace(png_row_infop row_info, png_bytep row, int pass,
png_uint_32 transformations)
{
+ int save_mmx_supported = mmx_supported;
png_debug(1,"in png_do_read_interlace\n");
- if (mmx_supported==2)
- mmx_supported=mmxsupport();
+
+ // mmx_supported = mmxsupport(); // doesn't work
+ mmx_supported = 0;
if (row != NULL && row_info != NULL)
{
@@ -958,1068 +959,931 @@ png_do_read_interlace(png_row_infop row_info, png_bytep row, int pass,
switch (row_info->pixel_depth)
{
- case 1:
- {
- png_bytep sp, dp;
- int sshift, dshift;
- int s_start, s_end, s_inc;
- png_byte v;
- png_uint_32 i;
- int j;
-
- sp = row + (png_size_t)((row_info->width - 1) >> 3);
- dp = row + (png_size_t)((final_width - 1) >> 3);
+ case 1:
+ {
+ png_bytep sp, dp;
+ int sshift, dshift;
+ int s_start, s_end, s_inc;
+ png_byte v;
+ png_uint_32 i;
+ int j;
+
+ sp = row + (png_size_t)((row_info->width - 1) >> 3);
+ dp = row + (png_size_t)((final_width - 1) >> 3);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
- if (transformations & PNG_PACKSWAP)
- {
- sshift = (int)((row_info->width + 7) & 7);
- dshift = (int)((final_width + 7) & 7);
- s_start = 7;
- s_end = 0;
- s_inc = -1;
- }
- else
+ if (transformations & PNG_PACKSWAP)
+ {
+ sshift = (int)((row_info->width + 7) & 7);
+ dshift = (int)((final_width + 7) & 7);
+ s_start = 7;
+ s_end = 0;
+ s_inc = -1;
+ }
+ else
#endif
- {
- sshift = 7 - (int)((row_info->width + 7) & 7);
- dshift = 7 - (int)((final_width + 7) & 7);
- s_start = 0;
- s_end = 7;
- s_inc = 1;
- }
+ {
+ sshift = 7 - (int)((row_info->width + 7) & 7);
+ dshift = 7 - (int)((final_width + 7) & 7);
+ s_start = 0;
+ s_end = 7;
+ s_inc = 1;
+ }
- for (i = row_info->width; i; i--)
- {
- v = (png_byte)((*sp >> sshift) & 0x1);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
- *dp |= (png_byte)(v << dshift);
- if (dshift == s_end)
- {
- dshift = s_start;
- dp--;
- }
- else
- dshift += s_inc;
+ for (i = row_info->width; i; i--)
+ {
+ v = (png_byte)((*sp >> sshift) & 0x1);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
+ *dp |= (png_byte)(v << dshift);
+ if (dshift == s_end)
+ {
+ dshift = s_start;
+ dp--;
+ }
+ else
+ dshift += s_inc;
+ }
+ if (sshift == s_end)
+ {
+ sshift = s_start;
+ sp--;
+ }
+ else
+ sshift += s_inc;
+ }
+ break;
}
- if (sshift == s_end)
+
+ case 2:
{
- sshift = s_start;
- sp--;
- }
- else
- sshift += s_inc;
- }
- break;
- }
- case 2:
- {
- png_bytep sp, dp;
- int sshift, dshift;
- int s_start, s_end, s_inc;
- png_uint_32 i;
+ png_bytep sp, dp;
+ int sshift, dshift;
+ int s_start, s_end, s_inc;
+ png_uint_32 i;
- sp = row + (png_size_t)((row_info->width - 1) >> 2);
- dp = row + (png_size_t)((final_width - 1) >> 2);
+ sp = row + (png_size_t)((row_info->width - 1) >> 2);
+ dp = row + (png_size_t)((final_width - 1) >> 2);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
- if (transformations & PNG_PACKSWAP)
- {
- sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
- dshift = (png_size_t)(((final_width + 3) & 3) << 1);
- s_start = 6;
- s_end = 0;
- s_inc = -2;
- }
- else
+ if (transformations & PNG_PACKSWAP)
+ {
+ sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
+ dshift = (png_size_t)(((final_width + 3) & 3) << 1);
+ s_start = 6;
+ s_end = 0;
+ s_inc = -2;
+ }
+ else
#endif
- {
- sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
- dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
- s_start = 0;
- s_end = 6;
- s_inc = 2;
- }
+ {
+ sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
+ dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
+ s_start = 0;
+ s_end = 6;
+ s_inc = 2;
+ }
- for (i = row_info->width; i; i--)
- {
- png_byte v;
- int j;
+ for (i = row_info->width; i; i--)
+ {
+ png_byte v;
+ int j;
- v = (png_byte)((*sp >> sshift) & 0x3);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
- *dp |= (png_byte)(v << dshift);
- if (dshift == s_end)
- {
- dshift = s_start;
- dp--;
- }
- else
- dshift += s_inc;
+ v = (png_byte)((*sp >> sshift) & 0x3);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
+ *dp |= (png_byte)(v << dshift);
+ if (dshift == s_end)
+ {
+ dshift = s_start;
+ dp--;
+ }
+ else
+ dshift += s_inc;
+ }
+ if (sshift == s_end)
+ {
+ sshift = s_start;
+ sp--;
+ }
+ else
+ sshift += s_inc;
+ }
+ break;
}
- if (sshift == s_end)
+
+ case 4:
{
- sshift = s_start;
- sp--;
- }
- else
- sshift += s_inc;
- }
- break;
- }
- case 4:
- {
- png_bytep sp, dp;
- int sshift, dshift;
- int s_start, s_end, s_inc;
- png_uint_32 i;
+ png_bytep sp, dp;
+ int sshift, dshift;
+ int s_start, s_end, s_inc;
+ png_uint_32 i;
- sp = row + (png_size_t)((row_info->width - 1) >> 1);
- dp = row + (png_size_t)((final_width - 1) >> 1);
+ sp = row + (png_size_t)((row_info->width - 1) >> 1);
+ dp = row + (png_size_t)((final_width - 1) >> 1);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
- if (transformations & PNG_PACKSWAP)
- {
- sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
- dshift = (png_size_t)(((final_width + 1) & 1) << 2);
- s_start = 4;
- s_end = 0;
- s_inc = -4;
- }
- else
+ if (transformations & PNG_PACKSWAP)
+ {
+ sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
+ dshift = (png_size_t)(((final_width + 1) & 1) << 2);
+ s_start = 4;
+ s_end = 0;
+ s_inc = -4;
+ }
+ else
#endif
- {
- sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
- dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
- s_start = 0;
- s_end = 4;
- s_inc = 4;
- }
+ {
+ sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
+ dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
+ s_start = 0;
+ s_end = 4;
+ s_inc = 4;
+ }
- for (i = row_info->width; i; i--)
- {
- png_byte v;
- int j;
+ for (i = row_info->width; i; i--)
+ {
+ png_byte v;
+ int j;
- v = (png_byte)((*sp >> sshift) & 0xf);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
- *dp |= (png_byte)(v << dshift);
- if (dshift == s_end)
- {
- dshift = s_start;
- dp--;
- }
- else
- dshift += s_inc;
- }
- if (sshift == s_end)
- {
- sshift = s_start;
- sp--;
+ v = (png_byte)((*sp >> sshift) & 0xf);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
+ *dp |= (png_byte)(v << dshift);
+ if (dshift == s_end)
+ {
+ dshift = s_start;
+ dp--;
+ }
+ else
+ dshift += s_inc;
+ }
+ if (sshift == s_end)
+ {
+ sshift = s_start;
+ sp--;
+ }
+ else
+ sshift += s_inc;
+ }
+ break;
}
- else
- sshift += s_inc;
- }
- break;
- }
- default: // This is the place where the routine is modified
- {
- __int64 const4 = 0x0000000000FFFFFF;
- __int64 const5 = 0x000000FFFFFF0000;
- __int64 const6 = 0x00000000000000FF;
- //int mmx_supported = 1;
-
- png_bytep sptr, dp;
- png_uint_32 i;
- png_size_t pixel_bytes;
- int width = row_info->width;
-
- pixel_bytes = (row_info->pixel_depth >> 3);
-
- sptr = row + (row_info->width - 1) * pixel_bytes;
- dp = row + (final_width - 1) * pixel_bytes;
- // New code by Nirav Chhatrapati - Intel Corporation
+ default: // This is the place where the routine is modified
+ {
+ __int64 const4 = 0x0000000000FFFFFF;
+ __int64 const5 = 0x000000FFFFFF0000;
+ __int64 const6 = 0x00000000000000FF;
+ //int mmx_supported = 1;
- if (mmx_supported) // If machine supports MMX technology use MMX routine
- {
- if (pixel_bytes == 3)
- {
- if ((pass == 0) || (pass == 1))
- {
- _asm
- {
- mov esi, sptr
+ png_bytep sptr, dp;
+ png_uint_32 i;
+ png_size_t pixel_bytes;
- mov edi, dp
+ int width = row_info->width;
- mov ecx, width
+ pixel_bytes = (row_info->pixel_depth >> 3);
- sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes
+ sptr = row + (row_info->width - 1) * pixel_bytes;
+ dp = row + (final_width - 1) * pixel_bytes;
+ // New code by Nirav Chhatrapati - Intel Corporation
+ if (mmx_supported) // use MMX routine if machine supports it
+ {
+ if (pixel_bytes == 3)
+ {
+ if ((pass == 0) || (pass == 1))
+ {
+ _asm
+ {
+ mov esi, sptr
+ mov edi, dp
+ mov ecx, width
+ sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes
loop_pass0:
-
- movd mm0, [esi] ; X X X X X val2 val1 val0
-
- pand mm0, const4 ; 0 0 0 0 0 val2 val1 val0
-
- movq mm1, mm0 ; 0 0 0 0 0 val2 val1 val0
-
- psllq mm0, 16 ; 0 0 0 val2 val1 val0 0 0
-
- movq mm2, mm0 ; 0 0 0 val2 val1 val0 0 0
-
- psllq mm0, 24 ; val2 val1 val0 0 0 0 0 0
-
- psrlq mm1, 8 ; 0 0 0 0 0 0 val2 val1
-
- por mm0, mm2 ; val2 val1 val0 val2 val1 val0 0 0
-
- por mm0, mm1 ; val2 val1 val0 val2 val1 val0 val2 val1
-
- movq mm3, mm0 ; val2 val1 val0 val2 val1 val0 val2 val1
-
- psllq mm0, 16 ; val0 val2 val1 val0 val2 val1 0 0
-
- movq mm4, mm3 ; val2 val1 val0 val2 val1 val0 val2 val1
-
- punpckhdq mm3, mm0 ; val0 val2 val1 val0 val2 val1 val0 val2
-
- movq [edi+16] , mm4
-
- psrlq mm0, 32 ; 0 0 0 0 val0 val2 val1 val0
-
- movq [edi+8] , mm3
-
- punpckldq mm0, mm4 ; val1 val0 val2 val1 val0 val2 val1 val0
-
- sub esi, 3
-
- movq [edi], mm0
-
- sub edi, 24
-
- //sub esi, 3
-
- dec ecx
-
- jnz loop_pass0
-
- EMMS
- }
-
- }
-
- else if ((pass == 2) || (pass == 3))
- {
- _asm
- {
- mov esi, sptr
-
- mov edi, dp
-
- mov ecx, width
-
- sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes
-
+ movd mm0, [esi] ; X X X X X v2 v1 v0
+ pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
+ movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
+ psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
+ movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
+ psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
+ psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
+ por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
+ por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
+ movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
+ psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
+ movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
+ punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
+ movq [edi+16] , mm4
+ psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
+ movq [edi+8] , mm3
+ punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
+ sub esi, 3
+ movq [edi], mm0
+ sub edi, 24
+ //sub esi, 3
+ dec ecx
+ jnz loop_pass0
+ EMMS
+ }
+ }
+ else if ((pass == 2) || (pass == 3))
+ {
+ _asm
+ {
+ mov esi, sptr
+ mov edi, dp
+ mov ecx, width
+ sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes
loop_pass2:
-
- movd mm0, [esi] ; X X X X X val2 val1 val0
-
- pand mm0, const4 ; 0 0 0 0 0 val2 val1 val0
-
- movq mm1, mm0 ; 0 0 0 0 0 val2 val1 val0
-
- psllq mm0, 16 ; 0 0 0 val2 val1 val0 0 0
-
- movq mm2, mm0 ; 0 0 0 val2 val1 val0 0 0
-
- psllq mm0, 24 ; val2 val1 val0 0 0 0 0 0
-
- psrlq mm1, 8 ; 0 0 0 0 0 0 val2 val1
-
- por mm0, mm2 ; val2 val1 val0 val2 val1 val0 0 0
-
- por mm0, mm1 ; val2 val1 val0 val2 val1 val0 val2 val1
-
- movq [edi+4], mm0 ; move to memory
-
- psrlq mm0, 16 ; 0 0 val2 val1 val0 val2 val1 val0
-
- movd [edi], mm0 ; move to memory
-
- sub esi, 3
-
- sub edi, 12
-
- dec ecx
-
- jnz loop_pass2
-
- EMMS
- }
- }
-
- else /*if ((pass == 4) || (pass == 5)) */
- {
-
- int width_mmx = ((width >> 1) << 1) - 8;
- width -= width_mmx;
- if(width_mmx)
- _asm
- {
- mov esi, sptr
-
- mov edi, dp
-
- mov ecx, width_mmx
-
- sub esi, 3
-
- sub edi, 9
-
+ movd mm0, [esi] ; X X X X X v2 v1 v0
+ pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
+ movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
+ psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
+ movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
+ psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
+ psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
+ por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
+ por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
+ movq [edi+4], mm0 ; move to memory
+ psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
+ movd [edi], mm0 ; move to memory
+ sub esi, 3
+ sub edi, 12
+ dec ecx
+ jnz loop_pass2
+ EMMS
+ }
+ }
+ else /* if ((pass == 4) || (pass == 5)) */
+ {
+ int width_mmx = ((width >> 1) << 1) - 8;
+ width -= width_mmx;
+ if (width_mmx)
+ {
+ _asm
+ {
+ mov esi, sptr
+ mov edi, dp
+ mov ecx, width_mmx
+ sub esi, 3
+ sub edi, 9
loop_pass4:
-
- movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
-
- movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
-
- movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
-
- psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
-
- pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
-
- psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
-
- por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
-
- movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
-
- psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
-
- movq [edi], mm0 ; move quad to memory
-
- psrlq mm5, 16 ; 0 0 0 0 0 X X v2
-
- pand mm5, const6 ; 0 0 0 0 0 0 0 v2
-
- por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
-
- movd [edi+8], mm6 ; move double to memory
-
- sub esi, 6
-
- sub edi, 12
-
- sub ecx, 2
-
- jnz loop_pass4
-
- EMMS
- }
-
- sptr -= width_mmx*3;
- dp -= width_mmx*6;
- for (i = width; i; i--)
- {
- png_byte v[8];
- int j;
-
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- png_memcpy(dp, v, pixel_bytes);
- dp -= pixel_bytes;
- }
- sptr -= pixel_bytes;
- }
-
- }
-
- } /* end of pixel_bytes == 3 */
-
- else if (pixel_bytes == 1)
- {
-
- if ((pass == 0) || (pass == 1))
- {
- int width_mmx = ((width >> 2) << 2);
- width -= width_mmx;
- if(width_mmx)
- _asm
- {
-
- mov esi, sptr
-
- mov edi, dp
-
- mov ecx, width_mmx
-
- sub edi, 31
-
- sub esi, 3
-
+ movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
+ movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
+ movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
+ psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
+ pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
+ psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
+ por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
+ movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
+ psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
+ movq [edi], mm0 ; move quad to memory
+ psrlq mm5, 16 ; 0 0 0 0 0 X X v2
+ pand mm5, const6 ; 0 0 0 0 0 0 0 v2
+ por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
+ movd [edi+8], mm6 ; move double to memory
+ sub esi, 6
+ sub edi, 12
+ sub ecx, 2
+ jnz loop_pass4
+ EMMS
+ }
+ }
+
+ sptr -= width_mmx*3;
+ dp -= width_mmx*6;
+ for (i = width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ png_memcpy(dp, v, pixel_bytes);
+ dp -= pixel_bytes;
+ }
+ sptr -= pixel_bytes;
+ }
+ }
+ } /* end of pixel_bytes == 3 */
+
+ else if (pixel_bytes == 1)
+ {
+ if ((pass == 0) || (pass == 1))
+ {
+ int width_mmx = ((width >> 2) << 2);
+ width -= width_mmx;
+ if (width_mmx)
+ {
+ _asm
+ {
+ mov esi, sptr
+ mov edi, dp
+ mov ecx, width_mmx
+ sub edi, 31
+ sub esi, 3
loop1_pass0:
-
- movd mm0, [esi] ; X X X X v0 v1 v2 v3
-
- movq mm1, mm0 ; X X X X v0 v1 v2 v3
-
- punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
-
- movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
-
- punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
-
- movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
-
- punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
-
- punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
-
- movq [edi], mm0 ; move to memory v3
-
- punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
-
- movq [edi+8], mm3 ; move to memory v2
-
- movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
-
- punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
-
- punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
-
- movq [edi+16], mm2 ; move to memory v1
-
- movq [edi+24], mm4 ; move to memory v0
-
- sub esi, 4
-
- sub edi, 32
-
- sub ecx, 4
-
- jnz loop1_pass0
-
- EMMS
- }
-
- sptr -= width_mmx;
- dp -= width_mmx*8;
- for (i = width; i; i--)
- {
- png_byte v[8];
- int j;
-
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- png_memcpy(dp, v, pixel_bytes);
- dp -= pixel_bytes;
- }
- sptr -= pixel_bytes;
- }
-
- }
-
-
- else if ((pass == 2) || (pass == 3))
- {
- int width_mmx = ((width >> 2) << 2);
- width -= width_mmx;
- if(width_mmx)
- _asm
- {
-
- mov esi, sptr
-
- mov edi, dp
-
- mov ecx, width_mmx
-
- sub edi, 15
-
- sub esi, 3
-
+ movd mm0, [esi] ; X X X X v0 v1 v2 v3
+ movq mm1, mm0 ; X X X X v0 v1 v2 v3
+ punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
+ movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
+ punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
+ movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
+ punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
+ punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
+ movq [edi], mm0 ; move to memory v3
+ punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
+ movq [edi+8], mm3 ; move to memory v2
+ movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
+ punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
+ punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
+ movq [edi+16], mm2 ; move to memory v1
+ movq [edi+24], mm4 ; move to memory v0
+ sub esi, 4
+ sub edi, 32
+ sub ecx, 4
+ jnz loop1_pass0
+ EMMS
+ }
+ }
+
+ sptr -= width_mmx;
+ dp -= width_mmx*8;
+ for (i = width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ png_memcpy(dp, v, pixel_bytes);
+ dp -= pixel_bytes;
+ }
+ sptr -= pixel_bytes;
+ }
+ }
+ else if ((pass == 2) || (pass == 3))
+ {
+ int width_mmx = ((width >> 2) << 2);
+ width -= width_mmx;
+ if (width_mmx)
+ {
+ _asm
+ {
+ mov esi, sptr
+ mov edi, dp
+ mov ecx, width_mmx
+ sub edi, 15
+ sub esi, 3
loop1_pass2:
-
- movd mm0, [esi] ; X X X X v0 v1 v2 v3
-
- punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
-
- movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
-
- punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
-
- punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
-
- movq [edi], mm0 ; move to memory v2 and v3
-
- sub esi, 4
-
- movq [edi+8], mm1 ; move to memory v1 and v0
-
- sub edi, 16
-
- sub ecx, 4
-
- jnz loop1_pass2
-
- EMMS
- }
-
- sptr -= width_mmx;
- dp -= width_mmx*4;
- for (i = width; i; i--)
- {
- png_byte v[8];
- int j;
-
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- png_memcpy(dp, v, pixel_bytes);
- dp -= pixel_bytes;
- }
- sptr -= pixel_bytes;
- }
-
- }
-
- else //if ((pass == 4) || (pass == 5))
- {
- int width_mmx = ((width >> 3) << 3);
- width -= width_mmx;
- if(width_mmx)
- _asm
- {
-
- mov esi, sptr
- mov edi, dp
- mov ecx, width_mmx
- sub edi, 15
- sub esi, 7
-
+ movd mm0, [esi] ; X X X X v0 v1 v2 v3
+ punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
+ movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
+ punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
+ punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
+ movq [edi], mm0 ; move to memory v2 and v3
+ sub esi, 4
+ movq [edi+8], mm1 ; move to memory v1 and v0
+ sub edi, 16
+ sub ecx, 4
+ jnz loop1_pass2
+ EMMS
+ }
+ }
+
+ sptr -= width_mmx;
+ dp -= width_mmx*4;
+ for (i = width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ png_memcpy(dp, v, pixel_bytes);
+ dp -= pixel_bytes;
+ }
+ sptr -= pixel_bytes;
+ }
+ }
+ else //if ((pass == 4) || (pass == 5))
+ {
+ int width_mmx = ((width >> 3) << 3);
+ width -= width_mmx;
+ if (width_mmx)
+ {
+ _asm
+ {
+ mov esi, sptr
+ mov edi, dp
+ mov ecx, width_mmx
+ sub edi, 15
+ sub esi, 7
loop1_pass4:
-
- movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
- movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
- punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
- //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
- punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
- movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
- sub esi, 8
- movq [edi], mm0 ; move to memory v4 v5 v6 and v7
- //sub esi, 4
- sub edi, 16
- sub ecx, 8
- jnz loop1_pass4
-
- EMMS
- }
-
- sptr -= width_mmx;
- dp -= width_mmx*2;
- for (i = width; i; i--)
- {
- png_byte v[8];
- int j;
-
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- png_memcpy(dp, v, pixel_bytes);
- dp -= pixel_bytes;
- }
- sptr -= pixel_bytes;
- }
-
- }
-
- } /* end of pixel_bytes == 1 */
-
- else if (pixel_bytes == 2)
- {
-
- if ((pass == 0) || (pass == 1))
- {
- int width_mmx = ((width >> 1) << 1);
- width -= width_mmx;
- if(width_mmx)
- _asm
- {
- mov esi, sptr
- mov edi, dp
- mov ecx, width_mmx
- sub esi, 2
- sub edi, 30
-
+ movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
+ movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
+ punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
+ //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
+ punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
+ movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
+ sub esi, 8
+ movq [edi], mm0 ; move to memory v4 v5 v6 and v7
+ //sub esi, 4
+ sub edi, 16
+ sub ecx, 8
+ jnz loop1_pass4
+ EMMS
+ }
+ }
+
+ sptr -= width_mmx;
+ dp -= width_mmx*2;
+ for (i = width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ png_memcpy(dp, v, pixel_bytes);
+ dp -= pixel_bytes;
+ }
+ sptr -= pixel_bytes;
+ }
+ }
+ } /* end of pixel_bytes == 1 */
+
+ else if (pixel_bytes == 2)
+ {
+ if ((pass == 0) || (pass == 1))
+ {
+ int width_mmx = ((width >> 1) << 1);
+ width -= width_mmx;
+ if (width_mmx)
+ {
+ _asm
+ {
+ mov esi, sptr
+ mov edi, dp
+ mov ecx, width_mmx
+ sub esi, 2
+ sub edi, 30
loop2_pass0:
- movd mm0, [esi] ; X X X X v1 v0 v3 v2
- punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
- movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
- punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
- punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
- movq [edi], mm0
- movq [edi + 8], mm0
- movq [edi + 16], mm1
- movq [edi + 24], mm1
- sub esi, 4
- sub edi, 32
- sub ecx, 2
- jnz loop2_pass0
-
- EMMS
- }
-
- sptr -= (width_mmx*2 + 2);
- dp -= (width_mmx*16 + 2);
-
- for (i = width; i; i--)
- {
-
- png_byte v[8];
- int j;
- sptr -= pixel_bytes;
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- dp -= pixel_bytes;
- png_memcpy(dp, v, pixel_bytes);
- //dp -= pixel_bytes;
- }
- //sptr -= pixel_bytes;
- }
- }
-
- else if ((pass == 2) || (pass == 3))
- {
- int width_mmx = ((width >> 1) << 1) ;
- width -= width_mmx;
- if(width_mmx)
- _asm
- {
- mov esi, sptr
- mov edi, dp
- mov ecx, width_mmx
- sub esi, 2
- sub edi, 14
-
+ movd mm0, [esi] ; X X X X v1 v0 v3 v2
+ punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
+ movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
+ punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
+ punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
+ movq [edi], mm0
+ movq [edi + 8], mm0
+ movq [edi + 16], mm1
+ movq [edi + 24], mm1
+ sub esi, 4
+ sub edi, 32
+ sub ecx, 2
+ jnz loop2_pass0
+ EMMS
+ }
+ }
+
+ sptr -= (width_mmx*2 + 2);
+ dp -= (width_mmx*16 + 2);
+ for (i = width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+ sptr -= pixel_bytes;
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ dp -= pixel_bytes;
+ png_memcpy(dp, v, pixel_bytes);
+ //dp -= pixel_bytes;
+ }
+ //sptr -= pixel_bytes;
+ }
+ }
+
+ else if ((pass == 2) || (pass == 3))
+ {
+ int width_mmx = ((width >> 1) << 1) ;
+ width -= width_mmx;
+ if (width_mmx)
+ {
+ _asm
+ {
+ mov esi, sptr
+ mov edi, dp
+ mov ecx, width_mmx
+ sub esi, 2
+ sub edi, 14
loop2_pass2:
- movd mm0, [esi] ; X X X X v1 v0 v3 v2
- punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
- movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
- punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
- punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
- movq [edi], mm0
- sub esi, 4
- movq [edi + 8], mm1
- //sub esi, 4
- sub edi, 16
- sub ecx, 2
- jnz loop2_pass2
-
- EMMS
- }
-
- sptr -= (width_mmx*2 + 2);
- dp -= (width_mmx*8 + 2);
-
- for (i = width; i; i--)
- {
-
- png_byte v[8];
- int j;
- sptr -= pixel_bytes;
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- dp -= pixel_bytes;
- png_memcpy(dp, v, pixel_bytes);
- //dp -= pixel_bytes;
- }
- //sptr -= pixel_bytes;
- }
- }
-
- else // pass == 4 or 5
- {
- int width_mmx = ((width >> 1) << 1) ;
- width -= width_mmx;
- if(width_mmx)
- _asm
- {
- mov esi, sptr
- mov edi, dp
- mov ecx, width_mmx
- sub esi, 2
- sub edi, 6
-
+ movd mm0, [esi] ; X X X X v1 v0 v3 v2
+ punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
+ movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
+ punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
+ punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
+ movq [edi], mm0
+ sub esi, 4
+ movq [edi + 8], mm1
+ //sub esi, 4
+ sub edi, 16
+ sub ecx, 2
+ jnz loop2_pass2
+ EMMS
+ }
+ }
+
+ sptr -= (width_mmx*2 + 2);
+ dp -= (width_mmx*8 + 2);
+ for (i = width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+ sptr -= pixel_bytes;
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ dp -= pixel_bytes;
+ png_memcpy(dp, v, pixel_bytes);
+ //dp -= pixel_bytes;
+ }
+ //sptr -= pixel_bytes;
+ }
+ }
+
+ else // pass == 4 or 5
+ {
+ int width_mmx = ((width >> 1) << 1) ;
+ width -= width_mmx;
+ if (width_mmx)
+ {
+ _asm
+ {
+ mov esi, sptr
+ mov edi, dp
+ mov ecx, width_mmx
+ sub esi, 2
+ sub edi, 6
loop2_pass4:
- movd mm0, [esi] ; X X X X v1 v0 v3 v2
- punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
- sub esi, 4
- movq [edi], mm0
- sub edi, 8
- sub ecx, 2
- jnz loop2_pass4
-
- EMMS
- }
-
- sptr -= (width_mmx*2 + 2);
- dp -= (width_mmx*4 + 2);
-
- for (i = width; i; i--)
- {
-
- png_byte v[8];
- int j;
- sptr -= pixel_bytes;
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- dp -= pixel_bytes;
- png_memcpy(dp, v, pixel_bytes);
- //dp -= pixel_bytes;
- }
- //sptr -= pixel_bytes;
- }
- }
-
- } /* end of pixel_bytes == 2 */
-
- else if (pixel_bytes == 4)
- {
- if ((pass == 0) || (pass == 1))
- {
- int width_mmx = ((width >> 1) << 1) ;
- width -= width_mmx;
- if(width_mmx)
- _asm
- {
- mov esi, sptr
- mov edi, dp
- mov ecx, width_mmx
- sub esi, 4
- sub edi, 60
-
+ movd mm0, [esi] ; X X X X v1 v0 v3 v2
+ punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
+ sub esi, 4
+ movq [edi], mm0
+ sub edi, 8
+ sub ecx, 2
+ jnz loop2_pass4
+ EMMS
+ }
+ }
+
+ sptr -= (width_mmx*2 + 2);
+ dp -= (width_mmx*4 + 2);
+ for (i = width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+ sptr -= pixel_bytes;
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ dp -= pixel_bytes;
+ png_memcpy(dp, v, pixel_bytes);
+ //dp -= pixel_bytes;
+ }
+ //sptr -= pixel_bytes;
+ }
+ }
+ } /* end of pixel_bytes == 2 */
+
+ else if (pixel_bytes == 4)
+ {
+ if ((pass == 0) || (pass == 1))
+ {
+ int width_mmx = ((width >> 1) << 1) ;
+ width -= width_mmx;
+ if (width_mmx)
+ {
+ _asm
+ {
+ mov esi, sptr
+ mov edi, dp
+ mov ecx, width_mmx
+ sub esi, 4
+ sub edi, 60
loop4_pass0:
- movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
- movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
- punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
- punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
- movq [edi], mm0
- movq [edi + 8], mm0
- movq [edi + 16], mm0
- movq [edi + 24], mm0
- movq [edi+32], mm1
- movq [edi + 40], mm1
- movq [edi+ 48], mm1
- sub esi, 8
- movq [edi + 56], mm1
- sub edi, 64
- sub ecx, 2
- jnz loop4_pass0
-
- EMMS
- }
-
- sptr -= (width_mmx*4 + 4);
- dp -= (width_mmx*32 + 4);
-
- for (i = width; i; i--)
- {
-
- png_byte v[8];
- int j;
- sptr -= pixel_bytes;
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- dp -= pixel_bytes;
- png_memcpy(dp, v, pixel_bytes);
- //dp -= pixel_bytes;
- }
- //sptr -= pixel_bytes;
- }
- }
-
- else if ((pass == 2) || (pass == 3))
- {
- int width_mmx = ((width >> 1) << 1) ;
- width -= width_mmx;
- if(width_mmx)
- _asm
- {
- mov esi, sptr
- mov edi, dp
- mov ecx, width_mmx
- sub esi, 4
- sub edi, 28
-
+ movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
+ movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
+ punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
+ punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
+ movq [edi], mm0
+ movq [edi + 8], mm0
+ movq [edi + 16], mm0
+ movq [edi + 24], mm0
+ movq [edi+32], mm1
+ movq [edi + 40], mm1
+ movq [edi+ 48], mm1
+ sub esi, 8
+ movq [edi + 56], mm1
+ sub edi, 64
+ sub ecx, 2
+ jnz loop4_pass0
+ EMMS
+ }
+ }
+
+ sptr -= (width_mmx*4 + 4);
+ dp -= (width_mmx*32 + 4);
+ for (i = width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+ sptr -= pixel_bytes;
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ dp -= pixel_bytes;
+ png_memcpy(dp, v, pixel_bytes);
+ //dp -= pixel_bytes;
+ }
+ //sptr -= pixel_bytes;
+ }
+ }
+
+ else if ((pass == 2) || (pass == 3))
+ {
+ int width_mmx = ((width >> 1) << 1) ;
+ width -= width_mmx;
+ if (width_mmx)
+ {
+ _asm
+ {
+ mov esi, sptr
+ mov edi, dp
+ mov ecx, width_mmx
+ sub esi, 4
+ sub edi, 28
loop4_pass2:
- movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
- movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
- punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
- punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
- movq [edi], mm0
- movq [edi + 8], mm0
- movq [edi+16], mm1
- movq [edi + 24], mm1
- sub esi, 8
- sub edi, 32
- sub ecx, 2
- jnz loop4_pass2
-
- EMMS
- }
-
- sptr -= (width_mmx*4 + 4);
- dp -= (width_mmx*16 + 4);
-
- for (i = width; i; i--)
- {
-
- png_byte v[8];
- int j;
- sptr -= pixel_bytes;
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- dp -= pixel_bytes;
- png_memcpy(dp, v, pixel_bytes);
- //dp -= pixel_bytes;
- }
- //sptr -= pixel_bytes;
- }
- }
-
- else // pass == 4 or 5
- {
- int width_mmx = ((width >> 1) << 1) ;
- width -= width_mmx;
- if(width_mmx)
- _asm
- {
- mov esi, sptr
- mov edi, dp
- mov ecx, width_mmx
- sub esi, 4
- sub edi, 12
-
+ movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
+ movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
+ punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
+ punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
+ movq [edi], mm0
+ movq [edi + 8], mm0
+ movq [edi+16], mm1
+ movq [edi + 24], mm1
+ sub esi, 8
+ sub edi, 32
+ sub ecx, 2
+ jnz loop4_pass2
+ EMMS
+ }
+ }
+
+ sptr -= (width_mmx*4 + 4);
+ dp -= (width_mmx*16 + 4);
+ for (i = width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+ sptr -= pixel_bytes;
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ dp -= pixel_bytes;
+ png_memcpy(dp, v, pixel_bytes);
+ //dp -= pixel_bytes;
+ }
+ //sptr -= pixel_bytes;
+ }
+ }
+
+ else // pass == 4 or 5
+ {
+ int width_mmx = ((width >> 1) << 1) ;
+ width -= width_mmx;
+ if (width_mmx)
+ {
+ _asm
+ {
+ mov esi, sptr
+ mov edi, dp
+ mov ecx, width_mmx
+ sub esi, 4
+ sub edi, 12
loop4_pass4:
- movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
- movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
- punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
- punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
- movq [edi], mm0
- sub esi, 8
- movq [edi + 8], mm1
- sub edi, 16
- sub ecx, 2
- jnz loop4_pass4
-
- EMMS
- }
-
- sptr -= (width_mmx*4 + 4);
- dp -= (width_mmx*8 + 4);
-
- for (i = width; i; i--)
- {
-
- png_byte v[8];
- int j;
- sptr -= pixel_bytes;
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- dp -= pixel_bytes;
- png_memcpy(dp, v, pixel_bytes);
- //dp -= pixel_bytes;
- }
- //sptr -= pixel_bytes;
- }
- }
-
- } /* end of pixel_bytes == 4 */
-
- else if (pixel_bytes == 6)
- {
- for (i = row_info->width; i; i--)
- {
-
- png_byte v[8];
- int j;
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- png_memcpy(dp, v, pixel_bytes);
- dp -= pixel_bytes;
- }
- sptr -= pixel_bytes;
- }
- } /* end of pixel_bytes == 6 */
-
- else
- {
- for (i = row_info->width; i; i--)
- {
-
- png_byte v[8];
- int j;
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- png_memcpy(dp, v, pixel_bytes);
- dp -= pixel_bytes;
- }
- sptr-= pixel_bytes;
- }
- }
- } /* end of mmx_supported */
-
- else /* MMX not supported */
- /* use modified C code - takes advantage of inlining of memcpy for
- a constant */
- {
- if (pixel_bytes == 1)
- {
- for (i = row_info->width; i; i--)
- {
- png_byte v[8];
- int j;
+ movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
+ movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
+ punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
+ punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
+ movq [edi], mm0
+ sub esi, 8
+ movq [edi + 8], mm1
+ sub edi, 16
+ sub ecx, 2
+ jnz loop4_pass4
+ EMMS
+ }
+ }
+
+ sptr -= (width_mmx*4 + 4);
+ dp -= (width_mmx*8 + 4);
+ for (i = width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+ sptr -= pixel_bytes;
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ dp -= pixel_bytes;
+ png_memcpy(dp, v, pixel_bytes);
+ //dp -= pixel_bytes;
+ }
+ //sptr -= pixel_bytes;
+ }
+ }
+
+ } /* end of pixel_bytes == 4 */
+
+ else if (pixel_bytes == 6)
+ {
+ for (i = row_info->width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ png_memcpy(dp, v, pixel_bytes);
+ dp -= pixel_bytes;
+ }
+ sptr -= pixel_bytes;
+ }
+ } /* end of pixel_bytes == 6 */
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- png_memcpy(dp, v, pixel_bytes);
- dp -= pixel_bytes;
- }
- sptr -= pixel_bytes;
- }
- }
- else if (pixel_bytes == 3)
- {
- for (i = row_info->width; i; i--)
- {
- png_byte v[8];
- int j;
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
+ else
+ {
+ for (i = row_info->width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ png_memcpy(dp, v, pixel_bytes);
+ dp -= pixel_bytes;
+ }
+ sptr-= pixel_bytes;
+ }
+ }
+ } /* end of mmx_supported */
+
+ else /* MMX not supported: use modified C code - takes advantage
+ * of inlining of memcpy for a constant */
{
- png_memcpy(dp, v, pixel_bytes);
- dp -= pixel_bytes;
- }
- sptr -= pixel_bytes;
- }
- }
- else if (pixel_bytes == 2)
- {
- for (i = row_info->width; i; i--)
- {
- png_byte v[8];
- int j;
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- png_memcpy(dp, v, pixel_bytes);
- dp -= pixel_bytes;
- }
- sptr -= pixel_bytes;
- }
- }
- else if (pixel_bytes == 4)
- {
- for (i = row_info->width; i; i--)
- {
- png_byte v[8];
- int j;
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- png_memcpy(dp, v, pixel_bytes);
- dp -= pixel_bytes;
- }
- sptr -= pixel_bytes;
- }
- }
- else if (pixel_bytes == 6)
- {
- for (i = row_info->width; i; i--)
- {
- png_byte v[8];
- int j;
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- png_memcpy(dp, v, pixel_bytes);
- dp -= pixel_bytes;
- }
- sptr -= pixel_bytes;
- }
- }
- else
- {
- for (i = row_info->width; i; i--)
- {
- png_byte v[8];
- int j;
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
+ if (pixel_bytes == 1)
{
- png_memcpy(dp, v, pixel_bytes);
- dp -= pixel_bytes;
- }
- sptr -= pixel_bytes;
- }
- }
+ for (i = row_info->width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ png_memcpy(dp, v, pixel_bytes);
+ dp -= pixel_bytes;
+ }
+ sptr -= pixel_bytes;
+ }
+ }
+ else if (pixel_bytes == 3)
+ {
+ for (i = row_info->width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ png_memcpy(dp, v, pixel_bytes);
+ dp -= pixel_bytes;
+ }
+ sptr -= pixel_bytes;
+ }
+ }
+ else if (pixel_bytes == 2)
+ {
+ for (i = row_info->width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ png_memcpy(dp, v, pixel_bytes);
+ dp -= pixel_bytes;
+ }
+ sptr -= pixel_bytes;
+ }
+ }
+ else if (pixel_bytes == 4)
+ {
+ for (i = row_info->width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ png_memcpy(dp, v, pixel_bytes);
+ dp -= pixel_bytes;
+ }
+ sptr -= pixel_bytes;
+ }
+ }
+ else if (pixel_bytes == 6)
+ {
+ for (i = row_info->width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ png_memcpy(dp, v, pixel_bytes);
+ dp -= pixel_bytes;
+ }
+ sptr -= pixel_bytes;
+ }
+ }
+ else
+ {
+ for (i = row_info->width; i; i--)
+ {
+ png_byte v[8];
+ int j;
+ png_memcpy(v, sptr, pixel_bytes);
+ for (j = 0; j < png_pass_inc[pass]; j++)
+ {
+ png_memcpy(dp, v, pixel_bytes);
+ dp -= pixel_bytes;
+ }
+ sptr -= pixel_bytes;
+ }
+ }
+
+ } /* end of MMX not supported */
+ break;
+ }
+ } /* end switch (row_info->pixel_depth) */
- } /* end of MMX not supported */
- break;
- }
- }
- row_info->width = final_width;
+ row_info->width = final_width;
row_info->rowbytes = ((final_width *
- (png_uint_32)row_info->pixel_depth + 7) >> 3);
+ (png_uint_32)row_info->pixel_depth + 7) >> 3);
}
+ mmx_supported = save_mmx_supported;
}
-#endif
-
+#endif /* PNG_READ_INTERLACING_SUPPORTED */
// 64-bit constants and scratch operands shared by the MMX filter routines
// below.  They are declared globally here to ensure alignment on 8-byte
// boundaries (the `align` member is presumably present only for that purpose).
// The routines assign the `use` member from C and then name the variables
// directly as memory operands of movq/psllq/psrlq in the _asm blocks.
+
union uAll {
__int64 use;
double align;
-} LBCarryMask = {0x0101010101010101}, HBClearMask = {0x7f7f7f7f7f7f7f7f},
- ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
+} LBCarryMask = {0x0101010101010101},
+ HBClearMask = {0x7f7f7f7f7f7f7f7f},
+ ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
+
// Optimized code for PNG Average filter decoder
void
png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
, png_bytep prev_row)
{
- int bpp;
- png_uint_32 FullLength;
- png_uint_32 MMXLength;
- //png_uint_32 len;
- int diff;
- bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
- FullLength = row_info->rowbytes; // # of bytes to filter
- _asm {
+ int bpp;
+ png_uint_32 FullLength;
+ png_uint_32 MMXLength;
+ //png_uint_32 len;
+ int diff;
+
+ bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
+ FullLength = row_info->rowbytes; // # of bytes to filter
+ _asm {
// Init address pointers and offset
mov edi, row // edi ==> Avg(x)
xor ebx, ebx // ebx ==> x
mov edx, edi
- mov esi, prev_row // esi ==> Prior(x)
+ mov esi, prev_row // esi ==> Prior(x)
sub edx, bpp // edx ==> Raw(x-bpp)
xor eax, eax
@@ -2027,12 +1891,12 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
// Raw(x) = Avg(x) + (Prior(x)/2)
davgrlp:
mov al, [esi + ebx] // Load al with Prior(x)
- inc ebx
+ inc ebx
shr al, 1 // divide by 2
add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
cmp ebx, bpp
- mov [edi+ebx-1], al // Write back Raw(x);
- // mov does not affect flags; -1 to offset inc ebx
+ mov [edi+ebx-1], al // Write back Raw(x);
+ // mov does not affect flags; -1 to offset inc ebx
jb davgrlp
// get # of bytes to alignment
mov diff, edi // take start of row
@@ -2047,27 +1911,27 @@ davgrlp:
xor ecx, ecx
davglp1:
xor eax, eax
- mov cl, [esi + ebx] // load cl with Prior(x)
+ mov cl, [esi + ebx] // load cl with Prior(x)
mov al, [edx + ebx] // load al with Raw(x-bpp)
add ax, cx
- inc ebx
+ inc ebx
shr ax, 1 // divide by 2
add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
- cmp ebx, diff // Check if at alignment boundary
- mov [edi+ebx-1], al // Write back Raw(x);
+ cmp ebx, diff // Check if at alignment boundary
+ mov [edi+ebx-1], al // Write back Raw(x);
// mov does not affect flags; -1 to offset inc ebx
- jb davglp1 // Repeat until at alignment boundary
+ jb davglp1 // Repeat until at alignment boundary
davggo:
- mov eax, FullLength
+ mov eax, FullLength
mov ecx, eax
sub eax, ebx // subtract alignment fix
and eax, 0x00000007 // calc bytes over mult of 8
sub ecx, eax // drop over bytes from original length
mov MMXLength, ecx
- } // end _asm block
- // Now do the math for the rest of the row
- switch ( bpp )
- {
+ } // end _asm block
+ // Now do the math for the rest of the row
+ switch ( bpp )
+ {
case 3:
{
ActiveMask.use = 0x0000000000ffffff;
@@ -2080,21 +1944,21 @@ davggo:
movq mm5, LBCarryMask
mov edi, row // edi ==> Avg(x)
movq mm4, HBClearMask
- mov esi, prev_row // esi ==> Prior(x)
+ mov esi, prev_row // esi ==> Prior(x)
// PRIME the pump (load the first Raw(x-bpp) data set
- movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
+ movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
// (we correct position in loop below)
davg3lp:
- movq mm0, [edi + ebx] // Load mm0 with Avg(x)
+ movq mm0, [edi + ebx] // Load mm0 with Avg(x)
// Add (Prev_row/2) to Average
movq mm3, mm5
psrlq mm2, ShiftRem // Correct position Raw(x-bpp) data
- movq mm1, [esi + ebx] // Load mm1 with Prior(x)
+ movq mm1, [esi + ebx] // Load mm1 with Prior(x)
movq mm6, mm7
pand mm3, mm1 // get lsb for each prev_row byte
psrlq mm1, 1 // divide prev_row bytes by 2
pand mm1, mm4 // clear invalid bit 7 of each byte
- paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
+ paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
// Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
movq mm1, mm3 // now use mm1 for getting LBCarrys
pand mm1, mm2 // get LBCarrys for each byte where both
@@ -2103,173 +1967,180 @@ davg3lp:
pand mm2, mm4 // clear invalid bit 7 of each byte
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
- paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
+ paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
+ // byte
// Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 3-5
movq mm2, mm0 // mov updated Raws to mm2
psllq mm2, ShiftBpp // shift data to position correctly
movq mm1, mm3 // now use mm1 for getting LBCarrys
- pand mm1, mm2 // get LBCarrys for each byte where both
- // lsb's were == 1 (Only valid for active group)
- psrlq mm2, 1 // divide raw bytes by 2
- pand mm2, mm4 // clear invalid bit 7 of each byte
- paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
- pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
- paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
+ pand mm1, mm2 // get LBCarrys for each byte where both
+ // lsb's were == 1 (Only valid for active group)
+ psrlq mm2, 1 // divide raw bytes by 2
+ pand mm2, mm4 // clear invalid bit 7 of each byte
+ paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
+ pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
+ paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
+ // byte
// Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
- psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two bytes
+ psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two
+ // bytes
movq mm2, mm0 // mov updated Raws to mm2
psllq mm2, ShiftBpp // shift data to position correctly
- // Data only needs to be shifted once here to
- // get the correct x-bpp offset.
- movq mm1, mm3 // now use mm1 for getting LBCarrys
- pand mm1, mm2 // get LBCarrys for each byte where both
- // lsb's were == 1 (Only valid for active group)
- psrlq mm2, 1 // divide raw bytes by 2
- pand mm2, mm4 // clear invalid bit 7 of each byte
- paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
- pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
- add ebx, 8
- paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
+ // Data only needs to be shifted once here to
+ // get the correct x-bpp offset.
+ movq mm1, mm3 // now use mm1 for getting LBCarrys
+ pand mm1, mm2 // get LBCarrys for each byte where both
+ // lsb's were == 1 (Only valid for active group)
+ psrlq mm2, 1 // divide raw bytes by 2
+ pand mm2, mm4 // clear invalid bit 7 of each byte
+ paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
+ pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
+ add ebx, 8
+ paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
+ // byte
// Now ready to write back to memory
- movq [edi + ebx - 8], mm0
+ movq [edi + ebx - 8], mm0
// Move updated Raw(x) to use as Raw(x-bpp) for next loop
- cmp ebx, MMXLength
+ cmp ebx, MMXLength
movq mm2, mm0 // mov updated Raw(x) to mm2
- jb davg3lp
- } // end _asm block
+ jb davg3lp
+ } // end _asm block
}
break;
+
case 6:
case 4:
case 7:
case 5:
{
ActiveMask.use = 0xffffffffffffffff; // use shift below to clear
- // appropriate inactive bytes
+ // appropriate inactive bytes
ShiftBpp.use = bpp << 3;
ShiftRem.use = 64 - ShiftBpp.use;
- _asm {
+ _asm {
movq mm4, HBClearMask
// Re-init address pointers and offset
mov ebx, diff // ebx ==> x = offset to alignment boundary
// Load ActiveMask and clear all bytes except for 1st active group
movq mm7, ActiveMask
- mov edi, row // edi ==> Avg(x)
+ mov edi, row // edi ==> Avg(x)
psrlq mm7, ShiftRem
- mov esi, prev_row // esi ==> Prior(x)
+ mov esi, prev_row // esi ==> Prior(x)
movq mm6, mm7
movq mm5, LBCarryMask
- psllq mm6, ShiftBpp // Create mask for 2nd active group
+ psllq mm6, ShiftBpp // Create mask for 2nd active group
// PRIME the pump (load the first Raw(x-bpp) data set
- movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
+ movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
// (we correct position in loop below)
davg4lp:
- movq mm0, [edi + ebx]
+ movq mm0, [edi + ebx]
psrlq mm2, ShiftRem // shift data to position correctly
- movq mm1, [esi + ebx]
+ movq mm1, [esi + ebx]
// Add (Prev_row/2) to Average
movq mm3, mm5
- pand mm3, mm1 // get lsb for each prev_row byte
- psrlq mm1, 1 // divide prev_row bytes by 2
- pand mm1, mm4 // clear invalid bit 7 of each byte
- paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
+ pand mm3, mm1 // get lsb for each prev_row byte
+ psrlq mm1, 1 // divide prev_row bytes by 2
+ pand mm1, mm4 // clear invalid bit 7 of each byte
+ paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
// Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
- movq mm1, mm3 // now use mm1 for getting LBCarrys
- pand mm1, mm2 // get LBCarrys for each byte where both
- // lsb's were == 1 (Only valid for active group)
- psrlq mm2, 1 // divide raw bytes by 2
- pand mm2, mm4 // clear invalid bit 7 of each byte
- paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
- pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg
- paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
+ movq mm1, mm3 // now use mm1 for getting LBCarrys
+ pand mm1, mm2 // get LBCarrys for each byte where both
+ // lsb's were == 1 (Only valid for active group)
+ psrlq mm2, 1 // divide raw bytes by 2
+ pand mm2, mm4 // clear invalid bit 7 of each byte
+ paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
+ pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg
+ paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
+ // byte
// Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
- movq mm2, mm0 // mov updated Raws to mm2
- psllq mm2, ShiftBpp // shift data to position correctly
- add ebx, 8
- movq mm1, mm3 // now use mm1 for getting LBCarrys
- pand mm1, mm2 // get LBCarrys for each byte where both
- // lsb's were == 1 (Only valid for active group)
- psrlq mm2, 1 // divide raw bytes by 2
- pand mm2, mm4 // clear invalid bit 7 of each byte
- paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
- pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
- paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
- cmp ebx, MMXLength
+ movq mm2, mm0 // mov updated Raws to mm2
+ psllq mm2, ShiftBpp // shift data to position correctly
+ add ebx, 8
+ movq mm1, mm3 // now use mm1 for getting LBCarrys
+ pand mm1, mm2 // get LBCarrys for each byte where both
+ // lsb's were == 1 (Only valid for active group)
+ psrlq mm2, 1 // divide raw bytes by 2
+ pand mm2, mm4 // clear invalid bit 7 of each byte
+ paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
+ pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
+ paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
+ // byte
+ cmp ebx, MMXLength
// Now ready to write back to memory
- movq [edi + ebx - 8], mm0
+ movq [edi + ebx - 8], mm0
// Prep Raw(x-bpp) for next loop
- movq mm2, mm0 // mov updated Raws to mm2
- jb davg4lp
- } // end _asm block
+ movq mm2, mm0 // mov updated Raws to mm2
+ jb davg4lp
+ } // end _asm block
}
break;
case 2:
{
ActiveMask.use = 0x000000000000ffff;
- ShiftBpp.use = 24; // == 3 * 8
- ShiftRem.use = 40; // == 64 - 24
- _asm {
+ ShiftBpp.use = 24; // == 3 * 8
+ ShiftRem.use = 40; // == 64 - 24
+ _asm {
// Load ActiveMask
movq mm7, ActiveMask
// Re-init address pointers and offset
- mov ebx, diff // ebx ==> x = offset to alignment boundary
+ mov ebx, diff // ebx ==> x = offset to alignment boundary
movq mm5, LBCarryMask
- mov edi, row // edi ==> Avg(x)
+ mov edi, row // edi ==> Avg(x)
movq mm4, HBClearMask
- mov esi, prev_row // esi ==> Prior(x)
+ mov esi, prev_row // esi ==> Prior(x)
// PRIME the pump (load the first Raw(x-bpp) data set
- movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
- // (we correct position in loop below)
+ movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
+ // (we correct position in loop below)
davg2lp:
- movq mm0, [edi + ebx]
+ movq mm0, [edi + ebx]
psllq mm2, ShiftRem // shift data to position correctly
- movq mm1, [esi + ebx]
+ movq mm1, [esi + ebx]
// Add (Prev_row/2) to Average
movq mm3, mm5
- pand mm3, mm1 // get lsb for each prev_row byte
- psrlq mm1, 1 // divide prev_row bytes by 2
- pand mm1, mm4 // clear invalid bit 7 of each byte
+ pand mm3, mm1 // get lsb for each prev_row byte
+ psrlq mm1, 1 // divide prev_row bytes by 2
+ pand mm1, mm4 // clear invalid bit 7 of each byte
movq mm6, mm7
- paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
+ paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
// Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
- movq mm1, mm3 // now use mm1 for getting LBCarrys
- pand mm1, mm2 // get LBCarrys for each byte where both
- // lsb's were == 1 (Only valid for active group)
- psrlq mm2, 1 // divide raw bytes by 2
- pand mm2, mm4 // clear invalid bit 7 of each byte
- paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
- pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
- paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
+ movq mm1, mm3 // now use mm1 for getting LBCarrys
+ pand mm1, mm2 // get LBCarrys for each byte where both
+ // lsb's were == 1 (Only valid for active group)
+ psrlq mm2, 1 // divide raw bytes by 2
+ pand mm2, mm4 // clear invalid bit 7 of each byte
+ paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
+ pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
+ paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
// Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
- psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
- movq mm2, mm0 // mov updated Raws to mm2
- psllq mm2, ShiftBpp // shift data to position correctly
- movq mm1, mm3 // now use mm1 for getting LBCarrys
- pand mm1, mm2 // get LBCarrys for each byte where both
- // lsb's were == 1 (Only valid for active group)
- psrlq mm2, 1 // divide raw bytes by 2
- pand mm2, mm4 // clear invalid bit 7 of each byte
- paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
- pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
- paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
+ psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
+ movq mm2, mm0 // mov updated Raws to mm2
+ psllq mm2, ShiftBpp // shift data to position correctly
+ movq mm1, mm3 // now use mm1 for getting LBCarrys
+ pand mm1, mm2 // get LBCarrys for each byte where both
+ // lsb's were == 1 (Only valid for active group)
+ psrlq mm2, 1 // divide raw bytes by 2
+ pand mm2, mm4 // clear invalid bit 7 of each byte
+ paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
+ pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
+ paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
// Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
- psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
- movq mm2, mm0 // mov updated Raws to mm2
- psllq mm2, ShiftBpp // shift data to position correctly
- // Data only needs to be shifted once here to
- // get the correct x-bpp offset.
- movq mm1, mm3 // now use mm1 for getting LBCarrys
- pand mm1, mm2 // get LBCarrys for each byte where both
- // lsb's were == 1 (Only valid for active group)
- psrlq mm2, 1 // divide raw bytes by 2
- pand mm2, mm4 // clear invalid bit 7 of each byte
- paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
- pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
- paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
+ psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
+ movq mm2, mm0 // mov updated Raws to mm2
+ psllq mm2, ShiftBpp // shift data to position correctly
+ // Data only needs to be shifted once here to
+ // get the correct x-bpp offset.
+ movq mm1, mm3 // now use mm1 for getting LBCarrys
+ pand mm1, mm2 // get LBCarrys for each byte where both
+ // lsb's were == 1 (Only valid for active group)
+ psrlq mm2, 1 // divide raw bytes by 2
+ pand mm2, mm4 // clear invalid bit 7 of each byte
+ paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
+ pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
+ paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
// Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7
@@ -2278,72 +2149,73 @@ davg2lp:
// Data only needs to be shifted once here to
// get the correct x-bpp offset.
add ebx, 8
- movq mm1, mm3 // now use mm1 for getting LBCarrys
- pand mm1, mm2 // get LBCarrys for each byte where both
- // lsb's were == 1 (Only valid for active group)
- psrlq mm2, 1 // divide raw bytes by 2
- pand mm2, mm4 // clear invalid bit 7 of each byte
- paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
- pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
- paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
-
- cmp ebx, MMXLength
+ movq mm1, mm3 // now use mm1 for getting LBCarrys
+ pand mm1, mm2 // get LBCarrys for each byte where both
+ // lsb's were == 1 (Only valid for active group)
+ psrlq mm2, 1 // divide raw bytes by 2
+ pand mm2, mm4 // clear invalid bit 7 of each byte
+ paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
+ pand mm2, mm6 // Leave only Active Group 4 bytes to add to Avg
+ paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
+
+ cmp ebx, MMXLength
// Now ready to write back to memory
- movq [edi + ebx - 8], mm0
+ movq [edi + ebx - 8], mm0
// Prep Raw(x-bpp) for next loop
- movq mm2, mm0 // mov updated Raws to mm2
- jb davg2lp
- } // end _asm block
+ movq mm2, mm0 // mov updated Raws to mm2
+ jb davg2lp
+ } // end _asm block
}
break;
- case 1: // bpp == 1
+
+ case 1: // bpp == 1
{
_asm {
// Re-init address pointers and offset
- mov ebx, diff // ebx ==> x = offset to alignment boundary
- mov edi, row // edi ==> Avg(x)
+ mov ebx, diff // ebx ==> x = offset to alignment boundary
+ mov edi, row // edi ==> Avg(x)
cmp ebx, FullLength // Test if offset at end of array
- jnb davg1end
+ jnb davg1end
// Do Avg decode for remaining bytes
- mov esi, prev_row // esi ==> Prior(x)
+ mov esi, prev_row // esi ==> Prior(x)
mov edx, edi
xor ecx, ecx // zero ecx before using cl & cx in loop below
sub edx, bpp // edx ==> Raw(x-bpp)
davg1lp:
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
xor eax, eax
- mov cl, [esi + ebx] // load cl with Prior(x)
+ mov cl, [esi + ebx] // load cl with Prior(x)
mov al, [edx + ebx] // load al with Raw(x-bpp)
add ax, cx
- inc ebx
+ inc ebx
shr ax, 1 // divide by 2
add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
- cmp ebx, FullLength // Check if at end of array
- mov [edi+ebx-1], al // Write back Raw(x);
+ cmp ebx, FullLength // Check if at end of array
+ mov [edi+ebx-1], al // Write back Raw(x);
// mov does not affect flags; -1 to offset inc ebx
- jb davg1lp
+ jb davg1lp
davg1end:
- } // end _asm block
+ } // end _asm block
}
return;
case 8: // bpp == 8
{
- _asm {
+ _asm {
// Re-init address pointers and offset
mov ebx, diff // ebx ==> x = offset to alignment boundary
movq mm5, LBCarryMask
mov edi, row // edi ==> Avg(x)
movq mm4, HBClearMask
- mov esi, prev_row // esi ==> Prior(x)
+ mov esi, prev_row // esi ==> Prior(x)
// PRIME the pump (load the first Raw(x-bpp) data set
- movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
+ movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
// (NO NEED to correct position in loop below)
davg8lp:
- movq mm0, [edi + ebx]
+ movq mm0, [edi + ebx]
movq mm3, mm5
- movq mm1, [esi + ebx]
- add ebx, 8
+ movq mm1, [esi + ebx]
+ add ebx, 8
pand mm3, mm1 // get lsb for each prev_row byte
psrlq mm1, 1 // divide prev_row bytes by 2
pand mm3, mm2 // get LBCarrys for each byte where both
@@ -2353,31 +2225,31 @@ davg8lp:
paddb mm0, mm3 // add LBCarrys to Avg for each byte
pand mm2, mm4 // clear invalid bit 7 of each byte
paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
- paddb mm0, mm2 // add (Raw/2) to Avg for each byte
- cmp ebx, MMXLength
- movq [edi + ebx - 8], mm0
+ paddb mm0, mm2 // add (Raw/2) to Avg for each byte
+ cmp ebx, MMXLength
+ movq [edi + ebx - 8], mm0
movq mm2, mm0 // reuse as Raw(x-bpp)
- jb davg8lp
- } // end _asm block
+ jb davg8lp
+ } // end _asm block
}
break;
default: // bpp greater than 8
{
- _asm {
+ _asm {
movq mm5, LBCarryMask
// Re-init address pointers and offset
mov ebx, diff // ebx ==> x = offset to alignment boundary
mov edi, row // edi ==> Avg(x)
movq mm4, HBClearMask
mov edx, edi
- mov esi, prev_row // esi ==> Prior(x)
+ mov esi, prev_row // esi ==> Prior(x)
sub edx, bpp // edx ==> Raw(x-bpp)
davgAlp:
- movq mm0, [edi + ebx]
+ movq mm0, [edi + ebx]
movq mm3, mm5
- movq mm1, [esi + ebx]
+ movq mm1, [esi + ebx]
pand mm3, mm1 // get lsb for each prev_row byte
- movq mm2, [edx + ebx]
+ movq mm2, [edx + ebx]
psrlq mm1, 1 // divide prev_row bytes by 2
pand mm3, mm2 // get LBCarrys for each byte where both
// lsb's were == 1
@@ -2386,70 +2258,72 @@ davgAlp:
paddb mm0, mm3 // add LBCarrys to Avg for each byte
pand mm2, mm4 // clear invalid bit 7 of each byte
paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
- add ebx, 8
- paddb mm0, mm2 // add (Raw/2) to Avg for each byte
- cmp ebx, MMXLength
- movq [edi + ebx - 8], mm0
- jb davgAlp
- } // end _asm block
+ add ebx, 8
+ paddb mm0, mm2 // add (Raw/2) to Avg for each byte
+ cmp ebx, MMXLength
+ movq [edi + ebx - 8], mm0
+ jb davgAlp
+ } // end _asm block
}
break;
- } // end switch ( bpp )
+ } // end switch ( bpp )
- _asm {
+ _asm {
// MMX acceleration complete now do clean-up
// Check if any remaining bytes left to decode
- mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
- mov edi, row // edi ==> Avg(x)
- cmp ebx, FullLength // Test if offset at end of array
- jnb davgend
+ mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
+ mov edi, row // edi ==> Avg(x)
+ cmp ebx, FullLength // Test if offset at end of array
+ jnb davgend
// Do Avg decode for remaining bytes
- mov esi, prev_row // esi ==> Prior(x)
+ mov esi, prev_row // esi ==> Prior(x)
mov edx, edi
- xor ecx, ecx // zero ecx before using cl & cx in loop below
- sub edx, bpp // edx ==> Raw(x-bpp)
+ xor ecx, ecx // zero ecx before using cl & cx in loop below
+ sub edx, bpp // edx ==> Raw(x-bpp)
davglp2:
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
xor eax, eax
- mov cl, [esi + ebx] // load cl with Prior(x)
- mov al, [edx + ebx] // load al with Raw(x-bpp)
+ mov cl, [esi + ebx] // load cl with Prior(x)
+ mov al, [edx + ebx] // load al with Raw(x-bpp)
add ax, cx
- inc ebx
+ inc ebx
shr ax, 1 // divide by 2
add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
- cmp ebx, FullLength // Check if at end of array
- mov [edi+ebx-1], al // Write back Raw(x);
+ cmp ebx, FullLength // Check if at end of array
+ mov [edi+ebx-1], al // Write back Raw(x);
// mov does not affect flags; -1 to offset inc ebx
- jb davglp2
+ jb davglp2
davgend:
- emms // End MMX instructions; prep for possible FP instrs.
+ emms // End MMX instructions; prep for possible FP instrs.
} // end _asm block
}
// Optimized code for PNG Paeth filter decoder
void
-png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row
- , png_bytep prev_row)
+png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
+ png_bytep prev_row)
{
- png_uint_32 FullLength;
- png_uint_32 MMXLength;
- //png_uint_32 len;
- int bpp;
- int diff;
- //int ptemp;
- int patemp, pbtemp, pctemp;
- bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
- FullLength = row_info->rowbytes; // # of bytes to filter
- _asm {
- xor ebx, ebx // ebx ==> x offset
- mov edi, row
- xor edx, edx // edx ==> x-bpp offset
- mov esi, prev_row
+ png_uint_32 FullLength;
+ png_uint_32 MMXLength;
+ //png_uint_32 len;
+ int bpp;
+ int diff;
+ //int ptemp;
+ int patemp, pbtemp, pctemp;
+
+ bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
+ FullLength = row_info->rowbytes; // # of bytes to filter
+ _asm
+ {
+ xor ebx, ebx // ebx ==> x offset
+ mov edi, row
+ xor edx, edx // edx ==> x-bpp offset
+ mov esi, prev_row
xor eax, eax
- // Compute the Raw value for the first bpp bytes
- // Note: the formula works out to always be Paeth(x) = Raw(x) + Prior(x)
- // where x < bpp
+ // Compute the Raw value for the first bpp bytes
+ // Note: the formula works out to be always
+ // Raw(x) = Paeth(x) + Prior(x) where x < bpp
dpthrlp:
mov al, [edi + ebx]
add al, [esi + ebx]
@@ -2460,7 +2334,7 @@ dpthrlp:
// get # of bytes to alignment
mov diff, edi // take start of row
add diff, ebx // add bpp
- xor ecx, ecx
+ xor ecx, ecx
add diff, 0xf // add 7 + 8 to incr past alignment boundary
and diff, 0xfffffff8 // mask to alignment boundary
sub diff, edi // subtract from start ==> value ebx at alignment
@@ -2523,33 +2397,34 @@ dpthabc:
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
mov cl, [edi + edx] // load Raw(x-bpp) into cl
dpthpaeth:
- inc ebx
- inc edx
+ inc ebx
+ inc edx
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
add [edi + ebx - 1], cl
- cmp ebx, diff
- jb dpthlp1
+ cmp ebx, diff
+ jb dpthlp1
dpthgo:
- mov ecx, FullLength
+ mov ecx, FullLength
mov eax, ecx
sub eax, ebx // subtract alignment fix
and eax, 0x00000007 // calc bytes over mult of 8
sub ecx, eax // drop over bytes from original length
mov MMXLength, ecx
- } // end _asm block
- // Now do the math for the rest of the row
- switch ( bpp )
- {
+ } // end _asm block
+ // Now do the math for the rest of the row
+ switch ( bpp )
+ {
case 3:
{
ActiveMask.use = 0x0000000000ffffff;
ActiveMaskEnd.use = 0xffff000000000000;
ShiftBpp.use = 24; // == bpp(3) * 8
ShiftRem.use = 40; // == 64 - 24
- _asm {
+ _asm
+ {
mov ebx, diff
- mov edi, row
- mov esi, prev_row
+ mov edi, row
+ mov esi, prev_row
pxor mm0, mm0
// PRIME the pump (load the first Raw(x-bpp) data set
movq mm1, [edi+ebx-8]
@@ -2574,23 +2449,23 @@ dpth3lp:
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
- pcmpgtw mm0, mm4 // Create mask pav bytes < 0
+ pcmpgtw mm0, mm4 // Create mask pav bytes < 0
paddw mm6, mm5
- pand mm0, mm4 // Only pav bytes < 0 in mm7
- pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
+ pand mm0, mm4 // Only pav bytes < 0 in mm0
+ pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
psubw mm4, mm0
- pand mm7, mm5 // Only pbv bytes < 0 in mm0
+ pand mm7, mm5 // Only pbv bytes < 0 in mm7
psubw mm4, mm0
psubw mm5, mm7
pxor mm0, mm0
- pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
- pand mm0, mm6 // Only pav bytes < 0 in mm7
+ pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
+ pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm5, mm7
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
- pcmpgtw mm7, mm5 // pa > pb?
+ pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm7 mask to merge pa & pb
pand mm5, mm7
@@ -2601,7 +2476,7 @@ dpth3lp:
paddw mm7, mm5
paddw mm0, mm2
// test ((pa <= pb)? pa:pb) <= pc
- pcmpgtw mm7, mm6 // pab > pc?
+ pcmpgtw mm7, mm6 // pab > pc?
pxor mm1, mm1
pand mm3, mm7
pandn mm7, mm0
@@ -2634,22 +2509,22 @@ dpth3lp:
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
- pcmpgtw mm0, mm5 // Create mask pbv bytes < 0
- pcmpgtw mm7, mm4 // Create mask pav bytes < 0
- pand mm0, mm5 // Only pbv bytes < 0 in mm0
- pand mm7, mm4 // Only pav bytes < 0 in mm7
+ pcmpgtw mm0, mm5 // Create mask pbv bytes < 0
+ pcmpgtw mm7, mm4 // Create mask pav bytes < 0
+ pand mm0, mm5 // Only pbv bytes < 0 in mm0
+ pand mm7, mm4 // Only pav bytes < 0 in mm7
psubw mm5, mm0
psubw mm4, mm7
psubw mm5, mm0
psubw mm4, mm7
pxor mm0, mm0
- pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
- pand mm0, mm6 // Only pav bytes < 0 in mm7
+ pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
+ pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
- pcmpgtw mm7, mm5 // pa > pb?
+ pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm7 mask to merge pa & pb
pand mm5, mm7
@@ -2660,8 +2535,8 @@ dpth3lp:
paddw mm7, mm5
paddw mm0, mm2
// test ((pa <= pb)? pa:pb) <= pc
- pcmpgtw mm7, mm6 // pab > pc?
- movq mm2, [esi + ebx] // load b=Prior(x)
+ pcmpgtw mm7, mm6 // pab > pc?
+ movq mm2, [esi + ebx] // load b=Prior(x)
pand mm3, mm7
pandn mm7, mm0
pxor mm1, mm1
@@ -2696,22 +2571,22 @@ dpth3lp:
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
- pcmpgtw mm0, mm4 // Create mask pav bytes < 0
- pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
- pand mm0, mm4 // Only pav bytes < 0 in mm7
- pand mm7, mm5 // Only pbv bytes < 0 in mm0
+ pcmpgtw mm0, mm4 // Create mask pav bytes < 0
+ pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
+ pand mm0, mm4 // Only pav bytes < 0 in mm0
+ pand mm7, mm5 // Only pbv bytes < 0 in mm7
psubw mm4, mm0
psubw mm5, mm7
psubw mm4, mm0
psubw mm5, mm7
pxor mm0, mm0
- pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
- pand mm0, mm6 // Only pav bytes < 0 in mm7
+ pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
+ pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
- pcmpgtw mm7, mm5 // pa > pb?
+ pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm0 mask copy to merge a & b
pand mm2, mm0
@@ -2722,26 +2597,27 @@ dpth3lp:
paddw mm0, mm2
paddw mm7, mm5
// test ((pa <= pb)? pa:pb) <= pc
- pcmpgtw mm7, mm6 // pab > pc?
+ pcmpgtw mm7, mm6 // pab > pc?
pand mm3, mm7
pandn mm7, mm0
paddw mm7, mm3
pxor mm1, mm1
packuswb mm1, mm7
// Step ebx to next set of 8 bytes and repeat loop til done
- add ebx, 8
+ add ebx, 8
pand mm1, ActiveMaskEnd
paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
- cmp ebx, MMXLength
+ cmp ebx, MMXLength
pxor mm0, mm0 // pxor does not affect flags
movq [edi + ebx - 8], mm1 // write back updated value
// mm1 will be used as Raw(x-bpp) next loop
// mm3 ready to be used as Prior(x-bpp) next loop
- jb dpth3lp
- } // end _asm block
+ jb dpth3lp
+ } // end _asm block
}
break;
+
case 6:
case 7:
case 5:
@@ -2750,18 +2626,19 @@ dpth3lp:
ActiveMask2.use = 0xffffffff00000000;
ShiftBpp.use = bpp << 3; // == bpp * 8
ShiftRem.use = 64 - ShiftBpp.use;
- _asm {
+ _asm
+ {
mov ebx, diff
- mov edi, row //
- mov esi, prev_row
+ mov edi, row
+ mov esi, prev_row
// PRIME the pump (load the first Raw(x-bpp) data set
- movq mm1, [edi+ebx-8]
+ movq mm1, [edi+ebx-8]
pxor mm0, mm0
dpth6lp:
// Must shift to position Raw(x-bpp) data
psrlq mm1, ShiftRem
// Do first set of 4 bytes
- movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
+ movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
punpcklbw mm1, mm0 // Unpack Low bytes of a
movq mm2, [esi + ebx] // load b=Prior(x)
punpcklbw mm2, mm0 // Unpack Low bytes of b
@@ -2780,23 +2657,23 @@ dpth6lp:
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
- pcmpgtw mm0, mm4 // Create mask pav bytes < 0
+ pcmpgtw mm0, mm4 // Create mask pav bytes < 0
paddw mm6, mm5
- pand mm0, mm4 // Only pav bytes < 0 in mm7
- pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
+ pand mm0, mm4 // Only pav bytes < 0 in mm0
+ pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
psubw mm4, mm0
- pand mm7, mm5 // Only pbv bytes < 0 in mm0
+ pand mm7, mm5 // Only pbv bytes < 0 in mm7
psubw mm4, mm0
psubw mm5, mm7
pxor mm0, mm0
- pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
- pand mm0, mm6 // Only pav bytes < 0 in mm7
+ pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
+ pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm5, mm7
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
- pcmpgtw mm7, mm5 // pa > pb?
+ pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm7 mask to merge pa & pb
pand mm5, mm7
@@ -2807,7 +2684,7 @@ dpth6lp:
paddw mm7, mm5
paddw mm0, mm2
// test ((pa <= pb)? pa:pb) <= pc
- pcmpgtw mm7, mm6 // pab > pc?
+ pcmpgtw mm7, mm6 // pab > pc?
pxor mm1, mm1
pand mm3, mm7
pandn mm7, mm0
@@ -2821,7 +2698,7 @@ dpth6lp:
paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
movq mm6, mm2
movq [edi + ebx], mm7 // write back updated value
- movq mm1, [edi+ebx-8]
+ movq mm1, [edi+ebx-8]
psllq mm6, ShiftBpp
movq mm5, mm7
psrlq mm1, ShiftRem
@@ -2844,23 +2721,23 @@ dpth6lp:
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
- pcmpgtw mm0, mm4 // Create mask pav bytes < 0
+ pcmpgtw mm0, mm4 // Create mask pav bytes < 0
paddw mm6, mm5
- pand mm0, mm4 // Only pav bytes < 0 in mm7
- pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
+ pand mm0, mm4 // Only pav bytes < 0 in mm0
+ pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
psubw mm4, mm0
- pand mm7, mm5 // Only pbv bytes < 0 in mm0
+ pand mm7, mm5 // Only pbv bytes < 0 in mm7
psubw mm4, mm0
psubw mm5, mm7
pxor mm0, mm0
- pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
- pand mm0, mm6 // Only pav bytes < 0 in mm7
+ pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
+ pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm5, mm7
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
- pcmpgtw mm7, mm5 // pa > pb?
+ pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm7 mask to merge pa & pb
pand mm5, mm7
@@ -2879,29 +2756,31 @@ dpth6lp:
paddw mm7, mm3
pxor mm0, mm0
// Step ebx to next set of 8 bytes and repeat loop til done
- add ebx, 8
+ add ebx, 8
packuswb mm1, mm7
paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
- cmp ebx, MMXLength
+ cmp ebx, MMXLength
movq [edi + ebx - 8], mm1 // write back updated value
// mm1 will be used as Raw(x-bpp) next loop
- jb dpth6lp
- } // end _asm block
+ jb dpth6lp
+ } // end _asm block
}
break;
+
case 4:
{
ActiveMask.use = 0x00000000ffffffff;
- _asm {
+ _asm {
mov ebx, diff
- mov edi, row //
- mov esi, prev_row
+ mov edi, row
+ mov esi, prev_row
pxor mm0, mm0
// PRIME the pump (load the first Raw(x-bpp) data set
- movq mm1, [edi+ebx-8] // Only time should need to read a=Raw(x-bpp) bytes
+ movq mm1, [edi+ebx-8] // Only time should need to read
+ // a=Raw(x-bpp) bytes
dpth4lp:
// Do first set of 4 bytes
- movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
+ movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
punpckhbw mm1, mm0 // Unpack High bytes of a
movq mm2, [esi + ebx] // load b=Prior(x)
punpcklbw mm2, mm0 // Unpack Low bytes of b
@@ -2918,23 +2797,23 @@ dpth4lp:
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
- pcmpgtw mm0, mm4 // Create mask pav bytes < 0
+ pcmpgtw mm0, mm4 // Create mask pav bytes < 0
paddw mm6, mm5
- pand mm0, mm4 // Only pav bytes < 0 in mm7
- pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
+ pand mm0, mm4 // Only pav bytes < 0 in mm0
+ pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
psubw mm4, mm0
- pand mm7, mm5 // Only pbv bytes < 0 in mm0
+ pand mm7, mm5 // Only pbv bytes < 0 in mm7
psubw mm4, mm0
psubw mm5, mm7
pxor mm0, mm0
- pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
- pand mm0, mm6 // Only pav bytes < 0 in mm7
+ pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
+ pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm5, mm7
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
- pcmpgtw mm7, mm5 // pa > pb?
+ pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm7 mask to merge pa & pb
pand mm5, mm7
@@ -2945,7 +2824,7 @@ dpth4lp:
paddw mm7, mm5
paddw mm0, mm2
// test ((pa <= pb)? pa:pb) <= pc
- pcmpgtw mm7, mm6 // pab > pc?
+ pcmpgtw mm7, mm6 // pab > pc?
pxor mm1, mm1
pand mm3, mm7
pandn mm7, mm0
@@ -2974,23 +2853,23 @@ dpth4lp:
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
- pcmpgtw mm0, mm4 // Create mask pav bytes < 0
+ pcmpgtw mm0, mm4 // Create mask pav bytes < 0
paddw mm6, mm5
- pand mm0, mm4 // Only pav bytes < 0 in mm7
- pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
+ pand mm0, mm4 // Only pav bytes < 0 in mm0
+ pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
psubw mm4, mm0
- pand mm7, mm5 // Only pbv bytes < 0 in mm0
+ pand mm7, mm5 // Only pbv bytes < 0 in mm7
psubw mm4, mm0
psubw mm5, mm7
pxor mm0, mm0
- pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
- pand mm0, mm6 // Only pav bytes < 0 in mm7
+ pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
+ pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm5, mm7
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
- pcmpgtw mm7, mm5 // pa > pb?
+ pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm7 mask to merge pa & pb
pand mm5, mm7
@@ -3001,7 +2880,7 @@ dpth4lp:
paddw mm7, mm5
paddw mm0, mm2
// test ((pa <= pb)? pa:pb) <= pc
- pcmpgtw mm7, mm6 // pab > pc?
+ pcmpgtw mm7, mm6 // pab > pc?
pxor mm1, mm1
pand mm3, mm7
pandn mm7, mm0
@@ -3009,29 +2888,30 @@ dpth4lp:
paddw mm7, mm3
pxor mm0, mm0
// Step ebx to next set of 8 bytes and repeat loop til done
- add ebx, 8
+ add ebx, 8
packuswb mm1, mm7
paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
- cmp ebx, MMXLength
+ cmp ebx, MMXLength
movq [edi + ebx - 8], mm1 // write back updated value
// mm1 will be used as Raw(x-bpp) next loop
- jb dpth4lp
- } // end _asm block
+ jb dpth4lp
+ } // end _asm block
}
break;
case 8: // bpp == 8
{
ActiveMask.use = 0x00000000ffffffff;
- _asm {
+ _asm {
mov ebx, diff
- mov edi, row //
- mov esi, prev_row
+ mov edi, row
+ mov esi, prev_row
pxor mm0, mm0
// PRIME the pump (load the first Raw(x-bpp) data set
- movq mm1, [edi+ebx-8] // Only time should need to read a=Raw(x-bpp) bytes
+ movq mm1, [edi+ebx-8] // Only time should need to read
+ // a=Raw(x-bpp) bytes
dpth8lp:
// Do first set of 4 bytes
- movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
+ movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
punpcklbw mm1, mm0 // Unpack Low bytes of a
movq mm2, [esi + ebx] // load b=Prior(x)
punpcklbw mm2, mm0 // Unpack Low bytes of b
@@ -3048,23 +2928,23 @@ dpth8lp:
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
- pcmpgtw mm0, mm4 // Create mask pav bytes < 0
+ pcmpgtw mm0, mm4 // Create mask pav bytes < 0
paddw mm6, mm5
- pand mm0, mm4 // Only pav bytes < 0 in mm7
- pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
+ pand mm0, mm4 // Only pav bytes < 0 in mm0
+ pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
psubw mm4, mm0
- pand mm7, mm5 // Only pbv bytes < 0 in mm0
+ pand mm7, mm5 // Only pbv bytes < 0 in mm7
psubw mm4, mm0
psubw mm5, mm7
pxor mm0, mm0
- pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
- pand mm0, mm6 // Only pav bytes < 0 in mm7
+ pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
+ pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm5, mm7
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
- pcmpgtw mm7, mm5 // pa > pb?
+ pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm7 mask to merge pa & pb
pand mm5, mm7
@@ -3075,24 +2955,24 @@ dpth8lp:
paddw mm7, mm5
paddw mm0, mm2
// test ((pa <= pb)? pa:pb) <= pc
- pcmpgtw mm7, mm6 // pab > pc?
+ pcmpgtw mm7, mm6 // pab > pc?
pxor mm1, mm1
pand mm3, mm7
pandn mm7, mm0
paddw mm7, mm3
pxor mm0, mm0
packuswb mm7, mm1
- movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
+ movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
pand mm7, ActiveMask
- movq mm2, [esi + ebx] // load b=Prior(x)
- paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
- punpckhbw mm3, mm0 // Unpack High bytes of c
- movq [edi + ebx], mm7 // write back updated value
- movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes
+ movq mm2, [esi + ebx] // load b=Prior(x)
+ paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
+ punpckhbw mm3, mm0 // Unpack High bytes of c
+ movq [edi + ebx], mm7 // write back updated value
+ movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes
// Do second set of 4 bytes
- punpckhbw mm2, mm0 // Unpack High bytes of b
- punpckhbw mm1, mm0 // Unpack High bytes of a
+ punpckhbw mm2, mm0 // Unpack High bytes of b
+ punpckhbw mm1, mm0 // Unpack High bytes of a
// pav = p - a = (a + b - c) - a = b - c
movq mm4, mm2
// pbv = p - b = (a + b - c) - b = a - c
@@ -3105,23 +2985,23 @@ dpth8lp:
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
- pcmpgtw mm0, mm4 // Create mask pav bytes < 0
+ pcmpgtw mm0, mm4 // Create mask pav bytes < 0
paddw mm6, mm5
- pand mm0, mm4 // Only pav bytes < 0 in mm7
- pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
+ pand mm0, mm4 // Only pav bytes < 0 in mm0
+ pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
psubw mm4, mm0
- pand mm7, mm5 // Only pbv bytes < 0 in mm0
+ pand mm7, mm5 // Only pbv bytes < 0 in mm7
psubw mm4, mm0
psubw mm5, mm7
pxor mm0, mm0
- pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
- pand mm0, mm6 // Only pav bytes < 0 in mm7
+ pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
+ pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm5, mm7
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
- pcmpgtw mm7, mm5 // pa > pb?
+ pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm7 mask to merge pa & pb
pand mm5, mm7
@@ -3132,7 +3012,7 @@ dpth8lp:
paddw mm7, mm5
paddw mm0, mm2
// test ((pa <= pb)? pa:pb) <= pc
- pcmpgtw mm7, mm6 // pab > pc?
+ pcmpgtw mm7, mm6 // pab > pc?
pxor mm1, mm1
pand mm3, mm7
pandn mm7, mm0
@@ -3140,26 +3020,27 @@ dpth8lp:
paddw mm7, mm3
pxor mm0, mm0
// Step ebx to next set of 8 bytes and repeat loop til done
- add ebx, 8
+ add ebx, 8
packuswb mm1, mm7
paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
- cmp ebx, MMXLength
+ cmp ebx, MMXLength
movq [edi + ebx - 8], mm1 // write back updated value
// mm1 will be used as Raw(x-bpp) next loop
- jb dpth8lp
- } // end _asm block
+ jb dpth8lp
+ } // end _asm block
}
break;
- case 1: // bpp = 1
- case 2: // bpp = 2
- default: // bpp > 8
+
+ case 1: // bpp = 1
+ case 2: // bpp = 2
+ default: // bpp > 8
{
- _asm {
- mov ebx, diff
- cmp ebx, FullLength
- jnb dpthdend
- mov edi, row //
- mov esi, prev_row
+ _asm {
+ mov ebx, diff
+ cmp ebx, FullLength
+ jnb dpthdend
+ mov edi, row
+ mov esi, prev_row
// Do Paeth decode for remaining bytes
mov edx, ebx
xor ecx, ecx // zero ecx before using cl & cx in loop below
@@ -3221,25 +3102,26 @@ dpthdabc:
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
mov cl, [edi + edx] // load Raw(x-bpp) into cl
dpthdpaeth:
- inc ebx
- inc edx
+ inc ebx
+ inc edx
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
add [edi + ebx - 1], cl
- cmp ebx, FullLength
- jb dpthdlp
+ cmp ebx, FullLength
+ jb dpthdlp
dpthdend:
- } // end _asm block
+ } // end _asm block
}
return; // No need to go further with this one
- } // end switch ( bpp )
- _asm {
+ } // end switch ( bpp )
+ _asm
+ {
// MMX acceleration complete now do clean-up
// Check if any remaining bytes left to decode
- mov ebx, MMXLength
- cmp ebx, FullLength
- jnb dpthend
- mov edi, row
- mov esi, prev_row
+ mov ebx, MMXLength
+ cmp ebx, FullLength
+ jnb dpthend
+ mov edi, row
+ mov esi, prev_row
// Do Paeth decode for remaining bytes
mov edx, ebx
xor ecx, ecx // zero ecx before using cl & cx in loop below
@@ -3301,69 +3183,71 @@ dpthabc2:
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
mov cl, [edi + edx] // load Raw(x-bpp) into cl
dpthpaeth2:
- inc ebx
- inc edx
+ inc ebx
+ inc edx
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
add [edi + ebx - 1], cl
- cmp ebx, FullLength
- jb dpthlp2
+ cmp ebx, FullLength
+ jb dpthlp2
dpthend:
- emms // End MMX instructions; prep for possible FP instrs.
- } // end _asm block
+ emms // End MMX instructions; prep for possible FP instrs.
+ } // end _asm block
}
// Optimized code for PNG Sub filter decoder
void
png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
{
- //int test;
- int bpp;
- png_uint_32 FullLength;
- png_uint_32 MMXLength;
- int diff;
- bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
- FullLength = row_info->rowbytes - bpp; // # of bytes to filter
- _asm {
+ //int test;
+ int bpp;
+ png_uint_32 FullLength;
+ png_uint_32 MMXLength;
+ int diff;
+
+ bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
+ FullLength = row_info->rowbytes - bpp; // # of bytes to filter
+ _asm {
mov edi, row
mov esi, edi // lp = row
- add edi, bpp // rp = row + bpp
- xor eax, eax
- // get # of bytes to alignment
- mov diff, edi // take start of row
- add diff, 0xf // add 7 + 8 to incr past
+ add edi, bpp // rp = row + bpp
+ xor eax, eax
+ // get # of bytes to alignment
+ mov diff, edi // take start of row
+ add diff, 0xf // add 7 + 8 to incr past
// alignment boundary
- xor ebx, ebx
- and diff, 0xfffffff8 // mask to alignment boundary
- sub diff, edi // subtract from start ==> value
+ xor ebx, ebx
+ and diff, 0xfffffff8 // mask to alignment boundary
+ sub diff, edi // subtract from start ==> value
// ebx at alignment
- jz dsubgo
- // fix alignment
+ jz dsubgo
+ // fix alignment
dsublp1:
- mov al, [esi+ebx]
- add [edi+ebx], al
- inc ebx
- cmp ebx, diff
- jb dsublp1
+ mov al, [esi+ebx]
+ add [edi+ebx], al
+ inc ebx
+ cmp ebx, diff
+ jb dsublp1
dsubgo:
- mov ecx, FullLength
- mov edx, ecx
- sub edx, ebx // subtract alignment fix
- and edx, 0x00000007 // calc bytes over mult of 8
- sub ecx, edx // drop over bytes from length
- mov MMXLength, ecx
- } // end _asm block
- // Now do the math for the rest of the row
- switch ( bpp )
- {
- case 3:
- {
+ mov ecx, FullLength
+ mov edx, ecx
+ sub edx, ebx // subtract alignment fix
+ and edx, 0x00000007 // calc bytes over mult of 8
+ sub ecx, edx // drop over bytes from length
+ mov MMXLength, ecx
+ } // end _asm block
+
+ // Now do the math for the rest of the row
+ switch ( bpp )
+ {
+ case 3:
+ {
ActiveMask.use = 0x0000ffffff000000;
ShiftBpp.use = 24; // == 3 * 8
ShiftRem.use = 40; // == 64 - 24
- _asm {
+ _asm {
mov edi, row
movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
- mov esi, edi // lp = row
+ mov esi, edi // lp = row
add edi, bpp // rp = row + bpp
movq mm6, mm7
mov ebx, diff
@@ -3376,234 +3260,242 @@ dsub3lp:
// no need for mask; shift clears inactive bytes
// Add 1st active group
movq mm0, [edi+ebx]
- paddb mm0, mm1
+ paddb mm0, mm1
// Add 2nd active group
movq mm1, mm0 // mov updated Raws to mm1
psllq mm1, ShiftBpp // shift data to position correctly
pand mm1, mm7 // mask to use only 2nd active group
- paddb mm0, mm1
+ paddb mm0, mm1
// Add 3rd active group
movq mm1, mm0 // mov updated Raws to mm1
psllq mm1, ShiftBpp // shift data to position correctly
pand mm1, mm6 // mask to use only 3rd active group
- add ebx, 8
- paddb mm0, mm1
- cmp ebx, MMXLength
- movq [edi+ebx-8], mm0 // Write updated Raws back to array
+ add ebx, 8
+ paddb mm0, mm1
+ cmp ebx, MMXLength
+ movq [edi+ebx-8], mm0 // Write updated Raws back to array
// Prep for doing 1st add at top of loop
movq mm1, mm0
- jb dsub3lp
- } // end _asm block
+ jb dsub3lp
+ } // end _asm block
}
break;
+
case 1:
- {
- /* Placed here just in case this is a duplicate of the
- non-MMX code for the SUB filter in png_read_filter_row
- above
- */
-// png_bytep rp;
-// png_bytep lp;
-// png_uint_32 i;
-// bpp = (row_info->pixel_depth + 7) >> 3;
-// for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
-// i < row_info->rowbytes; i++, rp++, lp++)
-// {
-// *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
-// }
- _asm {
+ {
+ // Placed here just in case this is a duplicate of the
+ // non-MMX code for the SUB filter in png_read_filter_row above
+ //
+ // png_bytep rp;
+ // png_bytep lp;
+ // png_uint_32 i;
+ // bpp = (row_info->pixel_depth + 7) >> 3;
+ // for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
+ // i < row_info->rowbytes; i++, rp++, lp++)
+ // {
+ // *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
+ // }
+ _asm {
mov ebx, diff
mov edi, row
- cmp ebx, FullLength
- jnb dsub1end
- mov esi, edi // lp = row
- xor eax, eax
+ cmp ebx, FullLength
+ jnb dsub1end
+ mov esi, edi // lp = row
+ xor eax, eax
add edi, bpp // rp = row + bpp
dsub1lp:
- mov al, [esi+ebx]
- add [edi+ebx], al
- inc ebx
- cmp ebx, FullLength
- jb dsub1lp
+ mov al, [esi+ebx]
+ add [edi+ebx], al
+ inc ebx
+ cmp ebx, FullLength
+ jb dsub1lp
dsub1end:
- } // end _asm block
- }
+ } // end _asm block
+ }
return;
+
case 6:
case 7:
case 4:
case 5:
- {
+ {
ShiftBpp.use = bpp << 3;
ShiftRem.use = 64 - ShiftBpp.use;
- _asm {
+ _asm {
mov edi, row
mov ebx, diff
- mov esi, edi // lp = row
+ mov esi, edi // lp = row
add edi, bpp // rp = row + bpp
// PRIME the pump (load the first Raw(x-bpp) data set
movq mm1, [edi+ebx-8]
dsub4lp:
psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
// no need for mask; shift clears inactive bytes
- movq mm0, [edi+ebx]
- paddb mm0, mm1
+ movq mm0, [edi+ebx]
+ paddb mm0, mm1
// Add 2nd active group
movq mm1, mm0 // mov updated Raws to mm1
psllq mm1, ShiftBpp // shift data to position correctly
// there is no need for any mask
// since shift clears inactive bits/bytes
- add ebx, 8
- paddb mm0, mm1
- cmp ebx, MMXLength
- movq [edi+ebx-8], mm0
+ add ebx, 8
+ paddb mm0, mm1
+ cmp ebx, MMXLength
+ movq [edi+ebx-8], mm0
movq mm1, mm0 // Prep for doing 1st add at top of loop
- jb dsub4lp
- } // end _asm block
+ jb dsub4lp
+ } // end _asm block
}
break;
+
case 2:
- {
+ {
ActiveMask.use = 0x00000000ffff0000;
ShiftBpp.use = 16; // == 2 * 8
ShiftRem.use = 48; // == 64 - 16
- _asm {
+ _asm {
movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
mov ebx, diff
movq mm6, mm7
- mov edi, row
- psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active byte group
- mov esi, edi // lp = row
+ mov edi, row
+ psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
+ // byte group
+ mov esi, edi // lp = row
movq mm5, mm6
- add edi, bpp // rp = row + bpp
- psllq mm5, ShiftBpp // Move mask in mm5 to cover 4th active byte group
+ add edi, bpp // rp = row + bpp
+ psllq mm5, ShiftBpp // Move mask in mm5 to cover 4th active
+ // byte group
// PRIME the pump (load the first Raw(x-bpp) data set
movq mm1, [edi+ebx-8]
dsub2lp:
// Add 1st active group
- psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
- // no need for mask; shift clears inactive bytes
+ psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
+ // no need for mask; shift clears inactive
+ // bytes
movq mm0, [edi+ebx]
- paddb mm0, mm1
+ paddb mm0, mm1
// Add 2nd active group
- movq mm1, mm0 // mov updated Raws to mm1
- psllq mm1, ShiftBpp // shift data to position correctly
- pand mm1, mm7 // mask to use only 2nd active group
- paddb mm0, mm1
+ movq mm1, mm0 // mov updated Raws to mm1
+ psllq mm1, ShiftBpp // shift data to position correctly
+ pand mm1, mm7 // mask to use only 2nd active group
+ paddb mm0, mm1
// Add 3rd active group
- movq mm1, mm0 // mov updated Raws to mm1
- psllq mm1, ShiftBpp // shift data to position correctly
- pand mm1, mm6 // mask to use only 3rd active group
- paddb mm0, mm1
+ movq mm1, mm0 // mov updated Raws to mm1
+ psllq mm1, ShiftBpp // shift data to position correctly
+ pand mm1, mm6 // mask to use only 3rd active group
+ paddb mm0, mm1
// Add 4th active group
- movq mm1, mm0 // mov updated Raws to mm1
- psllq mm1, ShiftBpp // shift data to position correctly
- pand mm1, mm5 // mask to use only 4th active group
- add ebx, 8
- paddb mm0, mm1
- cmp ebx, MMXLength
- movq [edi+ebx-8], mm0 // Write updated Raws back to array
- movq mm1, mm0 // Prep for doing 1st add at top of loop
- jb dsub2lp
- } // end _asm block
+ movq mm1, mm0 // mov updated Raws to mm1
+ psllq mm1, ShiftBpp // shift data to position correctly
+ pand mm1, mm5 // mask to use only 4th active group
+ add ebx, 8
+ paddb mm0, mm1
+ cmp ebx, MMXLength
+ movq [edi+ebx-8], mm0 // Write updated Raws back to array
+ movq mm1, mm0 // Prep for doing 1st add at top of loop
+ jb dsub2lp
+ } // end _asm block
}
break;
case 8:
- {
- _asm {
- mov edi, row
+ {
+ _asm {
+ mov edi, row
mov ebx, diff
- mov esi, edi // lp = row
- add edi, bpp // rp = row + bpp
- mov ecx, MMXLength
+ mov esi, edi // lp = row
+ add edi, bpp // rp = row + bpp
+ mov ecx, MMXLength
movq mm7, [edi+ebx-8] // PRIME the pump (load the first
// Raw(x-bpp) data set
and ecx, 0x0000003f // calc bytes over mult of 64
dsub8lp:
- movq mm0, [edi+ebx] // Load Sub(x) for 1st 8 bytes
- paddb mm0, mm7
- movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes
- movq [edi+ebx], mm0 // Write Raw(x) for 1st 8 bytes
+ movq mm0, [edi+ebx] // Load Sub(x) for 1st 8 bytes
+ paddb mm0, mm7
+ movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes
+ movq [edi+ebx], mm0 // Write Raw(x) for 1st 8 bytes
// Now mm0 will be used as Raw(x-bpp) for
// the 2nd group of 8 bytes. This will be
// repeated for each group of 8 bytes with
// the 8th group being used as the Raw(x-bpp)
// for the 1st group of the next loop.
- paddb mm1, mm0
- movq mm2, [edi+ebx+16] // Load Sub(x) for 3rd 8 bytes
- movq [edi+ebx+8], mm1 // Write Raw(x) for 2nd 8 bytes
- paddb mm2, mm1
- movq mm3, [edi+ebx+24] // Load Sub(x) for 4th 8 bytes
- movq [edi+ebx+16], mm2 // Write Raw(x) for 3rd 8 bytes
- paddb mm3, mm2
- movq mm4, [edi+ebx+32] // Load Sub(x) for 5th 8 bytes
- movq [edi+ebx+24], mm3 // Write Raw(x) for 4th 8 bytes
- paddb mm4, mm3
- movq mm5, [edi+ebx+40] // Load Sub(x) for 6th 8 bytes
- movq [edi+ebx+32], mm4 // Write Raw(x) for 5th 8 bytes
- paddb mm5, mm4
- movq mm6, [edi+ebx+48] // Load Sub(x) for 7th 8 bytes
- movq [edi+ebx+40], mm5 // Write Raw(x) for 6th 8 bytes
- paddb mm6, mm5
- movq mm7, [edi+ebx+56] // Load Sub(x) for 8th 8 bytes
- movq [edi+ebx+48], mm6 // Write Raw(x) for 7th 8 bytes
- add ebx, 64
- paddb mm7, mm6
- cmp ebx, ecx
- movq [edi+ebx-8], mm7 // Write Raw(x) for 8th 8 bytes
- jb dsub8lp
- cmp ebx, MMXLength
- jnb dsub8lt8
+ paddb mm1, mm0
+ movq mm2, [edi+ebx+16] // Load Sub(x) for 3rd 8 bytes
+ movq [edi+ebx+8], mm1 // Write Raw(x) for 2nd 8 bytes
+ paddb mm2, mm1
+ movq mm3, [edi+ebx+24] // Load Sub(x) for 4th 8 bytes
+ movq [edi+ebx+16], mm2 // Write Raw(x) for 3rd 8 bytes
+ paddb mm3, mm2
+ movq mm4, [edi+ebx+32] // Load Sub(x) for 5th 8 bytes
+ movq [edi+ebx+24], mm3 // Write Raw(x) for 4th 8 bytes
+ paddb mm4, mm3
+ movq mm5, [edi+ebx+40] // Load Sub(x) for 6th 8 bytes
+ movq [edi+ebx+32], mm4 // Write Raw(x) for 5th 8 bytes
+ paddb mm5, mm4
+ movq mm6, [edi+ebx+48] // Load Sub(x) for 7th 8 bytes
+ movq [edi+ebx+40], mm5 // Write Raw(x) for 6th 8 bytes
+ paddb mm6, mm5
+ movq mm7, [edi+ebx+56] // Load Sub(x) for 8th 8 bytes
+ movq [edi+ebx+48], mm6 // Write Raw(x) for 7th 8 bytes
+ add ebx, 64
+ paddb mm7, mm6
+ cmp ebx, ecx
+ movq [edi+ebx-8], mm7 // Write Raw(x) for 8th 8 bytes
+ jb dsub8lp
+ cmp ebx, MMXLength
+ jnb dsub8lt8
dsub8lpA:
movq mm0, [edi+ebx]
- add ebx, 8
- paddb mm0, mm7
- cmp ebx, MMXLength
- movq [edi+ebx-8], mm0 // use -8 to offset early add to ebx
- movq mm7, mm0 // Move calculated Raw(x) data to mm1 to
- // be the new Raw(x-bpp) for the next loop
- jb dsub8lpA
+ add ebx, 8
+ paddb mm0, mm7
+ cmp ebx, MMXLength
+ movq [edi+ebx-8], mm0 // use -8 to offset early add to ebx
+ movq mm7, mm0 // Move calculated Raw(x) data to mm7 to
+ // be the new Raw(x-bpp) for the next loop
+ jb dsub8lpA
dsub8lt8:
- } // end _asm block
+ } // end _asm block
}
break;
+
default: // bpp greater than 8 bytes
- {
- _asm {
+ {
+ _asm {
mov ebx, diff
- mov edi, row
- mov esi, edi // lp = row
+ mov edi, row
+ mov esi, edi // lp = row
add edi, bpp // rp = row + bpp
dsubAlp:
- movq mm0, [edi+ebx]
- movq mm1, [esi+ebx]
- add ebx, 8
- paddb mm0, mm1
- cmp ebx, MMXLength
- movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset add ebx
- jb dsubAlp
- } // end _asm block
+ movq mm0, [edi+ebx]
+ movq mm1, [esi+ebx]
+ add ebx, 8
+ paddb mm0, mm1
+ cmp ebx, MMXLength
+ movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset
+ // add ebx
+ jb dsubAlp
+ } // end _asm block
}
break;
- } // end switch ( bpp )
- _asm {
- mov ebx, MMXLength
- mov edi, row
+ } // end switch ( bpp )
+
+ _asm {
+ mov ebx, MMXLength
+ mov edi, row
cmp ebx, FullLength
jnb dsubend
mov esi, edi // lp = row
xor eax, eax
- add edi, bpp // rp = row + bpp
+ add edi, bpp // rp = row + bpp
dsublp2:
mov al, [esi+ebx]
add [edi+ebx], al
- inc ebx
- cmp ebx, FullLength
+ inc ebx
+ cmp ebx, FullLength
jb dsublp2
dsubend:
- emms // End MMX instructions; prep for possible FP instrs.
- } // end _asm block
+ emms // End MMX instructions; prep for possible FP instrs.
+ } // end _asm block
}
// Optimized code for PNG Up filter decoder
@@ -3611,20 +3503,20 @@ void
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
png_bytep prev_row)
{
- png_uint_32 len;
- len = row_info->rowbytes; // # of bytes to filter
- _asm {
+ png_uint_32 len;
+ len = row_info->rowbytes; // # of bytes to filter
+ _asm {
mov edi, row
- // get # of bytes to alignment
- mov ecx, edi
- xor ebx, ebx
- add ecx, 0x7
- xor eax, eax
- and ecx, 0xfffffff8
+ // get # of bytes to alignment
+ mov ecx, edi
+ xor ebx, ebx
+ add ecx, 0x7
+ xor eax, eax
+ and ecx, 0xfffffff8
mov esi, prev_row
- sub ecx, edi
- jz dupgo
- // fix alignment
+ sub ecx, edi
+ jz dupgo
+ // fix alignment
duplp1:
mov al, [edi+ebx]
add al, [esi+ebx]
@@ -3634,47 +3526,47 @@ duplp1:
jb duplp1
dupgo:
mov ecx, len
- mov edx, ecx
- sub edx, ebx // subtract alignment fix
- and edx, 0x0000003f // calc bytes over mult of 64
- sub ecx, edx // drop over bytes from length
- // Unrolled loop - use all MMX registers and interleave to reduce
- // number of branch instructions (loops) and reduce partial stalls
+ mov edx, ecx
+ sub edx, ebx // subtract alignment fix
+ and edx, 0x0000003f // calc bytes over mult of 64
+ sub ecx, edx // drop over bytes from length
+ // Unrolled loop - use all MMX registers and interleave to reduce
+ // number of branch instructions (loops) and reduce partial stalls
duploop:
movq mm1, [esi+ebx]
movq mm0, [edi+ebx]
- movq mm3, [esi+ebx+8]
+ movq mm3, [esi+ebx+8]
paddb mm0, mm1
- movq mm2, [edi+ebx+8]
+ movq mm2, [edi+ebx+8]
movq [edi+ebx], mm0
- paddb mm2, mm3
- movq mm5, [esi+ebx+16]
- movq [edi+ebx+8], mm2
- movq mm4, [edi+ebx+16]
- movq mm7, [esi+ebx+24]
- paddb mm4, mm5
- movq mm6, [edi+ebx+24]
- movq [edi+ebx+16], mm4
- paddb mm6, mm7
+ paddb mm2, mm3
+ movq mm5, [esi+ebx+16]
+ movq [edi+ebx+8], mm2
+ movq mm4, [edi+ebx+16]
+ movq mm7, [esi+ebx+24]
+ paddb mm4, mm5
+ movq mm6, [edi+ebx+24]
+ movq [edi+ebx+16], mm4
+ paddb mm6, mm7
movq mm1, [esi+ebx+32]
- movq [edi+ebx+24], mm6
+ movq [edi+ebx+24], mm6
movq mm0, [edi+ebx+32]
- movq mm3, [esi+ebx+40]
+ movq mm3, [esi+ebx+40]
paddb mm0, mm1
- movq mm2, [edi+ebx+40]
+ movq mm2, [edi+ebx+40]
movq [edi+ebx+32], mm0
- paddb mm2, mm3
- movq mm5, [esi+ebx+48]
- movq [edi+ebx+40], mm2
- movq mm4, [edi+ebx+48]
- movq mm7, [esi+ebx+56]
- paddb mm4, mm5
- movq mm6, [edi+ebx+56]
- movq [edi+ebx+48], mm4
- add ebx, 64
- paddb mm6, mm7
+ paddb mm2, mm3
+ movq mm5, [esi+ebx+48]
+ movq [edi+ebx+40], mm2
+ movq mm4, [edi+ebx+48]
+ movq mm7, [esi+ebx+56]
+ paddb mm4, mm5
+ movq mm6, [edi+ebx+56]
+ movq [edi+ebx+48], mm4
+ add ebx, 64
+ paddb mm6, mm7
cmp ebx, ecx
- movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
+ movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
// -8 to offset add ebx
jb duploop
@@ -3682,17 +3574,17 @@ duploop:
jz dupend
- // 2 lines added by lcreeve@netins.net
- // (mail 11 Jul 98 in png-implement list)
- cmp edx, 8 //test for less than 8 bytes
- jb duplt8
+ // 2 lines added by lcreeve@netins.net
+ // (mail 11 Jul 98 in png-implement list)
+ cmp edx, 8 //test for less than 8 bytes
+ jb duplt8
- add ecx, edx
- and edx, 0x00000007 // calc bytes over mult of 8
- sub ecx, edx // drop over bytes from length
+ add ecx, edx
+ and edx, 0x00000007 // calc bytes over mult of 8
+ sub ecx, edx // drop over bytes from length
jz duplt8
- // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
+ // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
duplpA:
movq mm1, [esi+ebx]
movq mm0, [edi+ebx]
@@ -3704,9 +3596,9 @@ duplpA:
cmp edx, 0 // Test for bytes over mult of 8
jz dupend
duplt8:
- xor eax, eax
+ xor eax, eax
add ecx, edx // move over byte count into counter
- // Loop using x86 registers to update remaining bytes
+ // Loop using x86 registers to update remaining bytes
duplp2:
mov al, [edi + ebx]
add al, [esi + ebx]
@@ -3715,52 +3607,54 @@ duplp2:
mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
jb duplp2
dupend:
- // Conversion of filtered row completed
+ // Conversion of filtered row completed
emms // End MMX instructions; prep for possible FP instrs.
- } // end _asm block
+ } // end _asm block
}
-
// Optimized png_read_filter_row routines
void
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
row, png_bytep prev_row, int filter)
{
+#ifdef PNG_DEBUG
char filnm[6];
+#endif
#define UseMMX (1)
+ if (mmx_supported == 2)
+ mmx_supported = mmxsupport();
- if (mmx_supported==2)
- mmx_supported=mmxsupport();
- //if (!mmx_supported)
+ if (!mmx_supported)
{
png_read_filter_row_c(png_ptr, row_info, row, prev_row, filter);
return ;
}
-
+#ifdef PNG_DEBUG
png_debug(1, "in png_read_filter_row\n");
png_debug1(0,"%s, ", (UseMMX?"MMX":"x86"));
switch (filter)
{
- case 0: sprintf(filnm, "None ");
- break;
- case 1: sprintf(filnm, "Sub ");
- break;
- case 2: sprintf(filnm, "Up ");
- break;
- case 3: sprintf(filnm, "Avg ");
- break;
- case 4: sprintf(filnm, "Paeth");
- break;
- default: sprintf(filnm, "Unknw");
- break;
+ case 0: sprintf(filnm, "None ");
+ break;
+ case 1: sprintf(filnm, "Sub ");
+ break;
+ case 2: sprintf(filnm, "Up ");
+ break;
+ case 3: sprintf(filnm, "Avg ");
+ break;
+ case 4: sprintf(filnm, "Paeth");
+ break;
+ default: sprintf(filnm, "Unknw");
+ break;
}
png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
(int)((row_info->pixel_depth + 7) >> 3));
png_debug1(0,"len=%8d, ", row_info->rowbytes);
+#endif
switch (filter)
{
@@ -3775,16 +3669,17 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
} //end if UseMMX
else
{
- int bpp;
- png_bytep rp;
- png_bytep lp;
png_uint_32 i;
- bpp = (row_info->pixel_depth + 7) >> 3;
- for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
- i < row_info->rowbytes; i++, rp++, lp++)
+ png_uint_32 istop = row_info->rowbytes;
+ png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
+ png_bytep rp = row + bpp;
+ png_bytep lp = row;
+
+ for (i = bpp; i < istop; i++)
{
- *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
- }
+ *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
+ rp++;
+ }
} //end !UseMMX
break;
}
@@ -3817,23 +3712,26 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
} //end if UseMMX
else
{
- png_uint_32 i;
- int bpp;
- png_bytep rp;
- png_bytep pp;
- png_bytep lp;
- bpp = (row_info->pixel_depth + 7) >> 3;
- for (i = 0, rp = row, pp = prev_row;
- i < (png_uint_32)bpp; i++, rp++, pp++)
- {
+ png_uint_32 i;
+ png_bytep rp = row;
+ png_bytep pp = prev_row;
+ png_bytep lp = row;
+ png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
+ png_uint_32 istop = row_info->rowbytes - bpp;
+
+ for (i = 0; i < bpp; i++)
+ {
*rp = (png_byte)(((int)(*rp) +
- ((int)(*pp) / 2)) & 0xff);
- }
- for (lp = row; i < row_info->rowbytes; i++, rp++, lp++, pp++)
- {
+ ((int)(*pp++) >> 1)) & 0xff);
+ rp++;
+ }
+
+ for (i = 0; i < istop; i++)
+ {
*rp = (png_byte)(((int)(*rp) +
- (int)(*pp + *lp) / 2) & 0xff);
- }
+ ((int)(*pp++ + *lp++) >> 1)) & 0xff);
+ rp++;
+ }
} //end !UseMMX
break;
}
@@ -3846,36 +3744,54 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
} //end if UseMMX
else
{
- int bpp;
png_uint_32 i;
- png_bytep rp;
- png_bytep pp;
- png_bytep lp;
- png_bytep cp;
- bpp = (row_info->pixel_depth + 7) >> 3;
- for (i = 0, rp = row, pp = prev_row;
- i < (png_uint_32)bpp; i++, rp++, pp++)
+ png_bytep rp = row;
+ png_bytep pp = prev_row;
+ png_bytep lp = row;
+ png_bytep cp = prev_row;
+ png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
+ png_uint_32 istop=row_info->rowbytes - bpp;
+
+ for (i = 0; i < bpp; i++)
{
- *rp = (png_byte)(((int)(*rp) + (int)(*pp)) & 0xff);
+ *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
+ rp++;
}
- for (lp = rp - bpp, cp = pp - bpp;
- i < row_info->rowbytes; i++, rp++, pp++, lp++, cp++)
+
+ for (i = 0; i < istop; i++) // use leftover rp,pp
{
int a, b, c, pa, pb, pc, p;
- b = *pp;
- c = *cp;
- a = *lp;
- p = a + b - c;
- pa = abs(p - a);
- pb = abs(p - b);
- pc = abs(p - c);
- if (pa <= pb && pa <= pc)
- p = a;
- else if (pb <= pc)
- p = b;
- else
- p = c;
+
+ a = *lp++;
+ b = *pp++;
+ c = *cp++;
+
+ p = b - c;
+ pc = a - c;
+
+#ifdef PNG_USE_ABS
+ pa = abs(p);
+ pb = abs(pc);
+ pc = abs(p + pc);
+#else
+ pa = p < 0 ? -p : p;
+ pb = pc < 0 ? -pc : pc;
+ pc = (p + pc) < 0 ? -(p + pc) : p + pc;
+#endif
+
+ /*
+ if (pa <= pb && pa <= pc)
+ p = a;
+ else if (pb <= pc)
+ p = b;
+ else
+ p = c;
+ */
+
+ p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
+
*rp = (png_byte)(((int)(*rp) + p) & 0xff);
+ rp++;
}
} //end !UseMMX
break;
diff --git a/pngwio.c b/pngwio.c
index 3831acfb3..d5444a0ab 100644
--- a/pngwio.c
+++ b/pngwio.c
@@ -1,7 +1,7 @@
/* pngwio.c - functions for data output
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngwrite.c b/pngwrite.c
index 9a3c928ed..9830ca121 100644
--- a/pngwrite.c
+++ b/pngwrite.c
@@ -1,7 +1,7 @@
/* pngwrite.c - general routines to write a PNG file
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngwtran.c b/pngwtran.c
index 10f50c0fa..cd32a62d1 100644
--- a/pngwtran.c
+++ b/pngwtran.c
@@ -1,7 +1,7 @@
/* pngwtran.c - transforms the data in a row for PNG writers
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/pngwutil.c b/pngwutil.c
index 446c4daf6..b7a104be9 100644
--- a/pngwutil.c
+++ b/pngwutil.c
@@ -1,7 +1,7 @@
/* pngwutil.c - utilities to write a PNG file
*
- * libpng 1.0.4 - September 19, 1999
+ * libpng 1.0.4c - October 1, 1999
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
* Copyright (c) 1996, 1997 Andreas Dilger
diff --git a/scripts/makefile.beos b/scripts/makefile.beos
index bc7be4af3..0a8915133 100644
--- a/scripts/makefile.beos
+++ b/scripts/makefile.beos
@@ -1,5 +1,5 @@
# makefile for libpng on BeOS x86 ELF with gcc
-# modified from makefile.lnx by Sander Stoks
+# modified from makefile.linux by Sander Stoks
# Copyright (C) 1996, 1997 Andreas Dilger
# Copyright (C) 1999 Greg Roelofs
# For conditions of distribution and use, see copyright notice in png.h
@@ -31,7 +31,7 @@ RANLIB=ranlib
# read libpng.txt or png.h to see why PNGMAJ is 2. You should not
# have to change it.
PNGMAJ = 2
-PNGMIN = 1.0.4
+PNGMIN = 1.0.4c
PNGVER = $(PNGMAJ).$(PNGMIN)
# where make install puts libpng.a, libpng.so*, and png.h
diff --git a/scripts/makefile.borland b/scripts/makefile.borland
index 57f374f60..2d3fe5b26 100644
--- a/scripts/makefile.borland
+++ b/scripts/makefile.borland
@@ -2,8 +2,8 @@
# Borland C++ 4.5 (Note: All modules are compiled in C mode)
# Will work with C++ 4.02 also
# To build the library, do:
-# "make -fmakefile.bor -DMODEL=m"
-# or: "make -fmakefile.bor -DMODEL=l"
+# "make -fmakefile.borland -DMODEL=m"
+# or: "make -fmakefile.borland -DMODEL=l"
#
# ------------- Borland C++ 4.5 -------------
diff --git a/scripts/makefile.dec b/scripts/makefile.dec
index 6f252e45e..51403caa4 100644
--- a/scripts/makefile.dec
+++ b/scripts/makefile.dec
@@ -14,7 +14,7 @@ ZLIBINC=../zlib
# read libpng.txt or png.h to see why PNGMAJ is 2. You should not
# have to change it.
PNGMAJ = 2
-PNGMIN = 1.0.4
+PNGMIN = 1.0.4c
PNGVER = $(PNGMAJ).$(PNGMIN)
CC=cc
diff --git a/scripts/makefile.linux b/scripts/makefile.linux
index 42012015c..c84b6cea9 100644
--- a/scripts/makefile.linux
+++ b/scripts/makefile.linux
@@ -22,6 +22,8 @@ WARNMORE=-Wwrite-strings -Wpointer-arith -Wshadow \
-Wmissing-declarations -Wtraditional -Wcast-align \
-Wstrict-prototypes -Wmissing-prototypes #-Wconversion
+# for pgcc version 2.95.1, -O3 is buggy; don't use it.
+
CFLAGS=-I$(ZLIBINC) -Wall -O3 -funroll-loops \
$(ALIGN) # $(WARNMORE) -g -DPNG_DEBUG=5
LDFLAGS=-L. -Wl,-rpath,. -L$(ZLIBLIB) -Wl,-rpath,$(ZLIBLIB) -lpng -lz -lm
@@ -32,7 +34,7 @@ RANLIB=ranlib
# read libpng.txt or png.h to see why PNGMAJ is 2. You should not
# have to change it.
PNGMAJ = 2
-PNGMIN = 1.0.4
+PNGMIN = 1.0.4c
PNGVER = $(PNGMAJ).$(PNGMIN)
INCPATH=$(prefix)/include
diff --git a/scripts/makefile.msc b/scripts/makefile.msc
index 6356218ae..96b2cfc62 100644
--- a/scripts/makefile.msc
+++ b/scripts/makefile.msc
@@ -3,7 +3,7 @@
# For conditions of distribution and use, see copyright notice in png.h
# Assumes that zlib.lib, zconf.h, and zlib.h have been copied to ..\zlib
-# ------------- Microsoft C 5.1 and later -------------
+# -------- Microsoft C 5.1 and later, does not use assembler code -----
MODEL=-AL
CFLAGS=-Oait -Gs -nologo -W3 $(MODEL) -I..\zlib
#-Ox generates bad code with MSC 5.1
diff --git a/scripts/makefile.sco b/scripts/makefile.sco
index 9eee4a2d5..1e5100449 100644
--- a/scripts/makefile.sco
+++ b/scripts/makefile.sco
@@ -25,7 +25,7 @@ RANLIB=echo
# read libpng.txt or png.h to see why PNGMAJ is 2. You should not
# have to change it.
PNGMAJ = 2
-PNGMIN = 1.0.4
+PNGMIN = 1.0.4c
PNGVER = $(PNGMAJ).$(PNGMIN)
INCPATH=$(prefix)/include
diff --git a/scripts/makefile.solaris b/scripts/makefile.solaris
index 5f3a412c9..fcc307800 100644
--- a/scripts/makefile.solaris
+++ b/scripts/makefile.solaris
@@ -1,5 +1,5 @@
# makefile for libpng on Solaris 2.x with gcc
-# Contributed by William L. Sebok, based on makefile.lnx
+# Contributed by William L. Sebok, based on makefile.linux
# Copyright (C) 1996, 1997 Andreas Dilger
# Copyright (C) 1998 Greg Roelofs
# For conditions of distribution and use, see copyright notice in png.h
@@ -36,7 +36,7 @@ RANLIB=echo
# read libpng.txt or png.h to see why PNGMAJ is 2. You should not
# have to change it.
PNGMAJ = 2
-PNGMIN = 1.0.4
+PNGMIN = 1.0.4c
PNGVER = $(PNGMAJ).$(PNGMIN)
INCPATH=$(prefix)/include
diff --git a/scripts/makefile.turboc3 b/scripts/makefile.turboc3
index c925831d4..f9a2269d2 100644
--- a/scripts/makefile.turboc3
+++ b/scripts/makefile.turboc3
@@ -1,7 +1,7 @@
# Makefile for libpng
# TurboC++ 3.0 (Note: All modules are compiled in C mode)
-# To use, do "make -fmakefile.tc3"
+# To use, do "make -fmakefile.turboc3"
# ------------- Turbo C++ 3.0 -------------
MODEL=-ml
diff --git a/scripts/makefile.win32vc b/scripts/makefile.vcawin32
index 52934c34a..be7fcc8a2 100644
--- a/scripts/makefile.win32vc
+++ b/scripts/makefile.vcawin32
@@ -2,9 +2,15 @@
# Copyright (C) 1998 Tim Wegner
# For conditions of distribution and use, see copyright notice in png.h
# Assumes that zlib.lib, zconf.h, and zlib.h have been copied to ..\zlib
-# To use, do "nmake /f scripts\makefile.w32"
+# To use, do "nmake /f scripts\makefile.vcawin32"
+
+# ---------- Microsoft Visual C++ 5.0 and later, uses assembler code------
+
+# Caution: the assembler code was introduced at libpng version 1.0.4 and has
+# not yet been thoroughly tested.
+
+# If you don't want to use assembler code, use makefile.vcwin32 instead.
-# ------------- Microsoft Visual C++ 4.0 and later -------------
MODEL=-
CFLAGS=-DPNG_USE_PNGVCRD -Ox -GA3s -nologo -W3 -I..\zlib
diff --git a/scripts/makefile.vcwin32 b/scripts/makefile.vcwin32
new file mode 100644
index 000000000..5b62fc316
--- /dev/null
+++ b/scripts/makefile.vcwin32
@@ -0,0 +1,87 @@
+# makefile for libpng
+# Copyright (C) 1998 Tim Wegner
+# For conditions of distribution and use, see copyright notice in png.h
+# Assumes that zlib.lib, zconf.h, and zlib.h have been copied to ..\zlib
+# To use, do "nmake /f scripts\makefile.vcwin32"
+
+# ---------- Microsoft Visual C++ 4.0 and later, no assembler code------
+# If you want to use assembler code, use makefile.vcawin32 instead.
+
+MODEL=-
+CFLAGS= -Ox -GA3s -nologo -W3 -I..\zlib
+
+CC=cl
+LD=link
+LDFLAGS=
+O=.obj
+
+#uncomment the next line to append error messages to the file pngerrs
+#ERRFILE= >> pngerrs
+
+# variables
+OBJS1 = png$(O) pngset$(O) pngget$(O) pngrutil$(O) pngtrans$(O) pngwutil$(O)
+OBJS2 = pngmem$(O) pngpread$(O) pngread$(O) pngerror$(O) pngwrite$(O)
+OBJS3 = pngrtran$(O) pngwtran$(O) pngrio$(O) pngwio$(O)
+
+all: libpng.lib
+
+png$(O): png.h pngconf.h
+ $(CC) -c $(CFLAGS) $*.c $(ERRFILE)
+
+pngset$(O): png.h pngconf.h
+ $(CC) -c $(CFLAGS) $*.c $(ERRFILE)
+
+pngget$(O): png.h pngconf.h
+ $(CC) -c $(CFLAGS) $*.c $(ERRFILE)
+
+pngread$(O): png.h pngconf.h
+ $(CC) -c $(CFLAGS) $*.c $(ERRFILE)
+
+pngpread$(O): png.h pngconf.h
+ $(CC) -c $(CFLAGS) $*.c $(ERRFILE)
+
+pngrtran$(O): png.h pngconf.h
+ $(CC) -c $(CFLAGS) $*.c $(ERRFILE)
+
+pngrutil$(O): png.h pngconf.h pngasmrd.h
+ $(CC) -c $(CFLAGS) $*.c $(ERRFILE)
+
+pngerror$(O): png.h pngconf.h
+ $(CC) -c $(CFLAGS) $*.c $(ERRFILE)
+
+pngmem$(O): png.h pngconf.h
+ $(CC) -c $(CFLAGS) $*.c $(ERRFILE)
+
+pngrio$(O): png.h pngconf.h
+ $(CC) -c $(CFLAGS) $*.c $(ERRFILE)
+
+pngwio$(O): png.h pngconf.h
+ $(CC) -c $(CFLAGS) $*.c $(ERRFILE)
+
+pngtest$(O): png.h pngconf.h
+ $(CC) -c $(CFLAGS) $*.c $(ERRFILE)
+
+pngtrans$(O): png.h pngconf.h
+ $(CC) -c $(CFLAGS) $*.c $(ERRFILE)
+
+pngwrite$(O): png.h pngconf.h
+ $(CC) -c $(CFLAGS) $*.c $(ERRFILE)
+
+pngwtran$(O): png.h pngconf.h
+ $(CC) -c $(CFLAGS) $*.c $(ERRFILE)
+
+pngwutil$(O): png.h pngconf.h
+ $(CC) -c $(CFLAGS) $*.c $(ERRFILE)
+
+libpng.lib: $(OBJS1) $(OBJS2) $(OBJS3)
+ del libpng.lib
+ lib /OUT:libpng.lib $(OBJS1) $(OBJS2) $(OBJS3)
+
+pngtest.exe: pngtest.obj libpng.lib
+ $(LD) $(LDFLAGS) pngtest.obj libpng.lib ..\zlib\zlib.lib /OUT:pngtest.exe /SUBSYSTEM:CONSOLE
+
+test: pngtest.exe
+ pngtest
+
+# End of makefile for libpng
+
diff --git a/scripts/makefile.watcom b/scripts/makefile.watcom
index a7d99c224..e14f162ac 100644
--- a/scripts/makefile.watcom
+++ b/scripts/makefile.watcom
@@ -5,7 +5,7 @@
# For conditions of distribution and use, see copyright notice in png.h
# Assumes that zlib.lib, zconf.h, and zlib.h have been copied to ..\zlib
-# To use, do "wmake /f scripts\makefile.wat"
+# To use, do "wmake /f scripts\makefile.watcom"
# ------------- Watcom 10.0 and later -------------
MODEL=-mf
diff --git a/scripts/pngdef.pas b/scripts/pngdef.pas
index 94e859acf..1441808a1 100644
--- a/scripts/pngdef.pas
+++ b/scripts/pngdef.pas
@@ -3,8 +3,8 @@ unit pngdef;
interface
const
- PNG_LIBPNG_VER_STRING = '1.0.4';
- PNG_LIBPNG_VER = 10004;
+ PNG_LIBPNG_VER_STRING = '1.0.4c';
+ PNG_LIBPNG_VER = 10005;
type
png_uint_32 = Cardinal;