summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Eggert <eggert@cs.ucla.edu>2017-02-14 15:02:59 -0800
committerPaul Eggert <eggert@cs.ucla.edu>2017-02-16 08:54:41 -0800
commit5ac3d87a69122d45a837840914c2f9e66b5bb8d4 (patch)
treedc4b278491a96fa312afadf093bf44b3a2b54348
parent5398777784b4455aa8466c1aef48b5738b8cd626 (diff)
downloadgrep-5ac3d87a69122d45a837840914c2f9e66b5bb8d4.tar.gz
Simplify -U on MS-Windows by removing guesswork
Suggested by Eric Blake (Bug#25707#11). * NEWS, doc/grep.texi: Document this. * src/dosbuf.c: Remove. * bootstrap.conf (gnulib_modules): Add xfreopen. * src/grep.c: Include xfreopen.h, not dosbuf.c. (fillbuf, print_line_head): Do not undossify input. (binary): New static var. (grepdesc): Apply BINARY to input file. (usage): Remove -u help. (main): Set BINARY if -U, and apply it to stdout. Do nothing if -u. With -f, apply BINARY to input file.
-rw-r--r--NEWS7
-rw-r--r--bootstrap.conf1
-rw-r--r--doc/grep.texi54
-rw-r--r--src/dosbuf.c222
-rw-r--r--src/grep.c34
5 files changed, 46 insertions, 272 deletions
diff --git a/NEWS b/NEWS
index 89674f30..c387aef8 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,13 @@ GNU grep NEWS -*- outline -*-
* Noteworthy changes in release ?.? (????-??-??) [?]
+** Changes in behavior
+
+ The following changes affect only MS-Windows platforms. First, the
+ --binary (-U) option now governs whether binary I/O is used, instead
+ of a heuristic that was sometimes incorrect. Second, the
+ --unix-byte-offsets (-u) option now has no effect on MS-Windows too.
+
* Noteworthy changes in release 3.0 (2017-02-09) [stable]
diff --git a/bootstrap.conf b/bootstrap.conf
index cd2d756b..4115f35b 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -95,6 +95,7 @@ wcrtomb
wctob
wctype-h
xalloc
+xfreopen
xstrtoimax
'
gnulib_name=libgreputils
diff --git a/doc/grep.texi b/doc/grep.texi
index 7c051e05..94a06210 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -412,10 +412,6 @@ Print the 0-based byte offset within the input file
before each line of output.
If @option{-o} (@option{--only-matching}) is specified,
print the offset of the matching part itself.
-When @command{grep} runs on MS-DOS or MS-Windows,
-the printed byte offsets depend on whether
-the @option{-u} (@option{--unix-byte-offsets}) option is used;
-see below.
@item -H
@itemx --with-filename
@@ -466,21 +462,6 @@ This is useful with options that prefix their output to the actual content:
This may also prepend spaces to output line numbers and byte offsets
so that lines from a single file all start at the same column.
-@item -u
-@itemx --unix-byte-offsets
-@opindex -u
-@opindex --unix-byte-offsets
-@cindex MS-DOS/MS-Windows byte offsets
-@cindex byte offsets, on MS-DOS/MS-Windows
-Report Unix-style byte offsets.
-This option causes @command{grep} to report byte offsets
-as if the file were a Unix-style text file,
-i.e., the byte offsets ignore carriage returns that were stripped.
-This will produce results identical
-to running @command{grep} on a Unix machine.
-This option has no effect unless the @option{-b} option is also used;
-it has no effect on platforms other than MS-DOS and MS-Windows.
-
@item -Z
@itemx --null
@opindex -Z
@@ -759,21 +740,26 @@ This can cause a performance penalty.
@itemx --binary
@opindex -U
@opindex --binary
-@cindex MS-DOS/MS-Windows binary files
-@cindex binary files, MS-DOS/MS-Windows
-Treat the file(s) as binary.
-By default, under MS-DOS and MS-Windows,
-@command{grep} guesses whether a file is text or binary
-as described for the @option{--binary-files} option.
-If @command{grep} decides the file is a text file,
-it strips carriage returns from the original file contents
-(to make regular expressions with @code{^} and @code{$} work correctly).
-Specifying @option{-U} overrules this guesswork,
-causing all files to be read and passed to the matching mechanism verbatim;
-if the file is a text file with @code{CR/LF} pairs at the end of each line,
-this will cause some regular expressions to fail.
-This option has no effect
-on platforms other than MS-DOS and MS-Windows.
+@cindex MS-Windows binary I/O
+@cindex binary I/O, /MS-Windows
+Under MS-Windows, use binary I/O when reading and writing files other
+than the user's terminal, so that all input bytes are read and written
+as-is. This overrides the default behavior where @command{grep}
+follows the operating system's advice whether to use binary or text
+I/O. When @command{grep} uses text I/O it reads a carriage
+return--newline pair as a newline and a Control-Z as end-of-file, and
+it writes a newline as a carriage return--newline pair. When this
+option is combined with @option{--byte-offset} (@option{-b}), byte
+counts treat each text I/O carriage return--newline as a single byte.
+
+When using text I/O, @option{--binary-files} heuristics are applied to
+input data after text-I/O processing. These heuristics need not agree
+with the @option{--binary} option; that is, they may treat the data as
+text even if @option{--binary} is given, or vice versa.
+@xref{File and Directory Selection}.
+
+This option has no effect on platforms other than MS-Windows.
+
@item -z
@itemx --null-data
diff --git a/src/dosbuf.c b/src/dosbuf.c
deleted file mode 100644
index 9892755c..00000000
--- a/src/dosbuf.c
+++ /dev/null
@@ -1,222 +0,0 @@
-/* dosbuf.c
- Copyright (C) 1992, 1997-2002, 2004-2017 Free Software Foundation, Inc.
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 3, or (at your option)
- any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
- 02110-1301, USA. */
-
-/* Messy DOS-specific code for correctly treating binary, Unix text
- and DOS text files.
-
- This has several aspects:
-
- * Guessing the file type (unless the user tells us);
- * Stripping CR characters from DOS text files (otherwise regex
- functions won't work correctly);
- * Reporting correct byte count with -b for any kind of file.
-
-*/
-
-#include <config.h>
-
-typedef enum {
- UNKNOWN, DOS_BINARY, DOS_TEXT, UNIX_TEXT
-} File_type;
-
-struct dos_map {
- off_t pos; /* position in buffer passed to matcher */
- off_t add; /* how much to add when reporting char position */
-};
-
-static int dos_report_unix_offset = 0;
-
-static File_type dos_file_type = UNKNOWN;
-static File_type dos_use_file_type = UNKNOWN;
-static off_t dos_stripped_crs = 0;
-static struct dos_map *dos_pos_map;
-static int dos_pos_map_size = 0;
-static int dos_pos_map_used = 0;
-static int inp_map_idx = 0, out_map_idx = 1;
-
-/* Set default DOS file type to binary. */
-static void
-dos_binary (void)
-{
- if (O_BINARY)
- dos_use_file_type = DOS_BINARY;
-}
-
-/* Tell DOS routines to report Unix offset. */
-static void
-dos_unix_byte_offsets (void)
-{
- if (O_BINARY)
- dos_report_unix_offset = 1;
-}
-
-/* Guess DOS file type by looking at its contents. */
-static File_type
-guess_type (char *buf, size_t buflen)
-{
- int crlf_seen = 0;
- char *bp = buf;
-
- while (buflen--)
- {
- /* Treat a file as binary if it has a NUL character. */
- if (!*bp)
- return DOS_BINARY;
-
- /* CR before LF means DOS text file (unless we later see
- binary characters). */
- else if (*bp == '\r' && buflen && bp[1] == '\n')
- crlf_seen = 1;
-
- bp++;
- }
-
- return crlf_seen ? DOS_TEXT : UNIX_TEXT;
-}
-
-/* Convert external DOS file representation to internal.
- Return the count of bytes left in the buffer.
- Build table to map character positions when reporting byte counts. */
-static size_t
-undossify_input (char *buf, size_t buflen)
-{
- if (! O_BINARY)
- return buflen;
-
- size_t bytes_left = 0;
-
- if (totalcc == 0)
- {
- /* New file: forget everything we knew about character
- position mapping table and file type. */
- inp_map_idx = 0;
- out_map_idx = 1;
- dos_pos_map_used = 0;
- dos_stripped_crs = 0;
- dos_file_type = dos_use_file_type;
- }
-
- /* Guess if this file is binary, unless we already know that. */
- if (dos_file_type == UNKNOWN)
- dos_file_type = guess_type (buf, buflen);
-
- /* If this file is to be treated as DOS Text, strip the CR characters
- and maybe build the table for character position mapping on output. */
- if (dos_file_type == DOS_TEXT)
- {
- char *destp = buf;
-
- while (buflen--)
- {
- if (*buf != '\r')
- {
- *destp++ = *buf++;
- bytes_left++;
- }
- else
- {
- buf++;
- if (out_byte && !dos_report_unix_offset)
- {
- dos_stripped_crs++;
- while (buflen && *buf == '\r')
- {
- dos_stripped_crs++;
- buflen--;
- buf++;
- }
- if (inp_map_idx >= dos_pos_map_size - 1)
- {
- dos_pos_map_size = inp_map_idx ? inp_map_idx * 2 : 1000;
- dos_pos_map = xrealloc (dos_pos_map,
- (dos_pos_map_size
- * sizeof (struct dos_map)));
- }
-
- if (!inp_map_idx)
- {
- /* Add sentinel entry. */
- dos_pos_map[inp_map_idx].pos = 0;
- dos_pos_map[inp_map_idx++].add = 0;
-
- /* Initialize first real entry. */
- dos_pos_map[inp_map_idx].add = 0;
- }
-
- /* Put the new entry. If the stripped CR characters
- precede a Newline (the usual case), pretend that
- they were found *after* the Newline. This makes
- displayed byte offsets more reasonable in some
- cases, and fits better the intuitive notion that
- the line ends *before* the CR, not *after* it. */
- inp_map_idx++;
- dos_pos_map[inp_map_idx-1].pos =
- (*buf == '\n' ? destp + 1 : destp ) - bufbeg + totalcc;
- dos_pos_map[inp_map_idx].add = dos_stripped_crs;
- dos_pos_map_used = inp_map_idx;
-
- /* The following will be updated on the next pass. */
- dos_pos_map[inp_map_idx].pos = destp - bufbeg + totalcc + 1;
- }
- }
- }
-
- return bytes_left;
- }
-
- return buflen;
-}
-
-/* Convert internal byte count into external. */
-static off_t
-dossified_pos (off_t byteno)
-{
- if (! O_BINARY)
- return byteno;
-
- off_t pos_lo;
- off_t pos_hi;
-
- if (dos_file_type != DOS_TEXT || dos_report_unix_offset)
- return byteno;
-
- /* Optimization: usually the file will be scanned sequentially.
- So in most cases, this byte position will be found in the
- table near the previous one, as recorded in 'out_map_idx'. */
- pos_lo = dos_pos_map[out_map_idx-1].pos;
- pos_hi = dos_pos_map[out_map_idx].pos;
-
- /* If the initial guess failed, search up or down, as
- appropriate, beginning with the previous place. */
- if (byteno >= pos_hi)
- {
- out_map_idx++;
- while (out_map_idx < dos_pos_map_used
- && byteno >= dos_pos_map[out_map_idx].pos)
- out_map_idx++;
- }
-
- else if (byteno < pos_lo)
- {
- out_map_idx--;
- while (out_map_idx > 1 && byteno < dos_pos_map[out_map_idx-1].pos)
- out_map_idx--;
- }
-
- return byteno + dos_pos_map[out_map_idx].add;
-}
diff --git a/src/grep.c b/src/grep.c
index 437ae0a5..32cb6429 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -48,6 +48,7 @@
#include "search.h"
#include "version-etc.h"
#include "xalloc.h"
+#include "xfreopen.h"
#include "xstrtol.h"
enum { SEP_CHAR_SELECTED = ':' };
@@ -536,10 +537,6 @@ static enum
static bool grepfile (int, char const *, bool, bool);
static bool grepdesc (int, bool);
-static void dos_binary (void);
-static void dos_unix_byte_offsets (void);
-static size_t undossify_input (char *, size_t);
-
static bool
is_device_mode (mode_t m)
{
@@ -973,7 +970,6 @@ fillbuf (size_t save, struct stat const *st)
}
}
- fillsize = undossify_input (readbuf, fillsize);
buflim = readbuf + fillsize;
/* Initialize the following word, because skip_easy_bytes and some
@@ -1033,8 +1029,7 @@ static intmax_t pending; /* Pending lines of output.
static bool done_on_match; /* Stop scanning file on first match. */
static bool exit_on_match; /* Exit on first match. */
static bool dev_null_output; /* Stdout is known to be /dev/null. */
-
-#include "dosbuf.c"
+static bool binary; /* Use binary rather than text I/O. */
static void
nlscan (char const *lim)
@@ -1127,7 +1122,6 @@ print_line_head (char *beg, size_t len, char const *lim, char sep)
if (out_byte)
{
uintmax_t pos = add_count (totalcc, beg - bufbeg);
- pos = dossified_pos (pos);
print_offset (pos, byte_num_color);
print_sep (sep);
}
@@ -1862,7 +1856,7 @@ grepdesc (int desc, bool command_line)
/* Set input to binary mode. Pipes are simulated with files
on DOS, so this includes the case of "foo | grep bar". */
- if (O_BINARY && !isatty (desc))
+ if (binary && !isatty (desc))
set_binary_mode (desc, O_BINARY);
count = grep (desc, &st, &ineof);
@@ -2006,8 +2000,6 @@ Context control:\n\
--colour[=WHEN] use markers to highlight the matching strings;\n\
WHEN is 'always', 'never', or 'auto'\n\
-U, --binary do not strip CR characters at EOL (MSDOS/Windows)\n\
- -u, --unix-byte-offsets report offsets as if CRs were not there\n\
- (MSDOS/Windows)\n\
\n"));
printf (_("\
When FILE is '-', read standard input. With no FILE, read '.' if\n\
@@ -2537,11 +2529,13 @@ main (int argc, char **argv)
break;
case 'U':
- dos_binary ();
+ if (O_BINARY)
+ binary = true;
break;
case 'u':
- dos_unix_byte_offsets ();
+ /* Obsolete option; it has no effect. FIXME: Diagnose use of
+ this option starting in (say) the year 2020. */
break;
case 'V':
@@ -2582,7 +2576,15 @@ main (int argc, char **argv)
break;
case 'f':
- fp = STREQ (optarg, "-") ? stdin : fopen (optarg, "r");
+ if (STREQ (optarg, "-"))
+ {
+ if (binary)
+ xfreopen (NULL, "rb", stdin);
+ fp = stdin;
+ }
+ else
+ fp = fopen (optarg, binary ? "rb" : "r");
+
if (!fp)
die (EXIT_TROUBLE, errno, "%s", optarg);
oldcc = keycc;
@@ -2897,8 +2899,8 @@ main (int argc, char **argv)
/* Output is set to binary mode because we shouldn't convert
NL to CR-LF pairs, especially when grepping binary files. */
- if (O_BINARY && !isatty (STDOUT_FILENO))
- set_binary_mode (STDOUT_FILENO, O_BINARY);
+ if (binary && !isatty (STDOUT_FILENO))
+ xfreopen (NULL, "wb", stdout);
/* Prefer sysconf for page size, as getpagesize typically returns int. */
#ifdef _SC_PAGESIZE