diff options
author | Paolo Bonzini <bonzini@gnu.org> | 2006-08-03 13:03:24 +0000 |
---|---|---|
committer | Paolo Bonzini <bonzini@gnu.org> | 2008-01-09 16:12:10 +0100 |
commit | 3ca529fbe25706387200425b2a99012d6008f26c (patch) | |
tree | 02500670e8e554ea26bdad2b4156107c6452aa84 /sed | |
parent | 4ca34ac568881f50c9d5ecd2750938396686cf25 (diff) | |
download | sed-3ca529fbe25706387200425b2a99012d6008f26c.tar.gz |
hack to speed up UTF-8 processing
2006-08-03 Paolo Bonzini <bonzini@gnu.org>
* sed/execute.c (str_append): Use is_utf8 to skip useless work.
* sed/mbcs.c (initialize_mbcs): Look for a UTF-8 locale.
(is_utf8): New.
* sed/sed.h (is_utf8): New.
git-archimport-id: bonzini@gnu.org--2004b/sed--stable--4.1--patch-70
Diffstat (limited to 'sed')
-rw-r--r-- | sed/execute.c | 35 | ||||
-rw-r--r-- | sed/mbcs.c | 25 | ||||
-rw-r--r-- | sed/sed.h | 1 |
3 files changed, 44 insertions, 17 deletions
```diff
diff --git a/sed/execute.c b/sed/execute.c
index ab2cc5c..1401674 100644
--- a/sed/execute.c
+++ b/sed/execute.c
@@ -235,25 +235,26 @@ str_append(to, string, length)
   to->length = new_length;
 
 #ifdef HAVE_MBRTOWC
-  if (mb_cur_max == 1)
-    return;
-
-  while (length)
-    {
-      int n = MBRLEN (string, length, &to->mbstate);
+  if (mb_cur_max > 1 && !is_utf8)
+    while (length)
+      {
+        size_t n = MBRLEN (string, length, &to->mbstate);
 
-      /* An invalid sequence is treated like a singlebyte character. */
-      if (n == -1)
-        {
-          memset (&to->mbstate, 0, sizeof (to->mbstate));
-          n = 1;
-        }
+        /* An invalid sequence is treated like a singlebyte character. */
+        if (n == (size_t) -1)
+          {
+            memset (&to->mbstate, 0, sizeof (to->mbstate));
+            n = 1;
+          }
 
-      if (n > 0)
-        length -= n;
-      else
-        break;
-    }
+        if (n > 0)
+          {
+            string += n;
+            length -= n;
+          }
+        else
+          break;
+      }
 #endif
 }
 
diff --git a/sed/mbcs.c b/sed/mbcs.c
--- a/sed/mbcs.c
+++ b/sed/mbcs.c
@@ -18,7 +18,12 @@
 #include "sed.h"
 #include <stdlib.h>
 
+#ifdef HAVE_LANGINFO_CODESET
+#include <langinfo.h>
+#endif
+
 int mb_cur_max;
+bool is_utf8;
 
 #ifdef HAVE_MBRTOWC
 /* Add a byte to the multibyte character represented by the state
@@ -47,6 +52,26 @@ int brlen (ch, cur_stat)
 void
 initialize_mbcs ()
 {
+  /* For UTF-8, we know that the encoding is stateless. */
+  const char *codeset_name;
+
+#ifdef HAVE_LANGINFO_CODESET
+  codeset_name = nl_langinfo (CODESET);
+#else
+  codeset_name = getenv ("LC_ALL");
+  if (codeset_name == NULL || codeset_name[0] == '\0')
+    codeset_name = getenv ("LC_CTYPE");
+  if (codeset_name == NULL || codeset_name[0] == '\0')
+    codeset_name = getenv ("LANG");
+  if (codeset_name == NULL)
+    codeset_name = "";
+  else if (strchr (codeset_name, '.') != NULL)
+    codeset_name = strchr (codeset_name, '.') + 1;
+#endif
+
+  is_utf8 = (strcasecmp (codeset_name, "UTF-8") == 0
+             || strcasecmp (codeset_name, "UTF8") == 0);
+
 #ifdef HAVE_MBRTOWC
   mb_cur_max = MB_CUR_MAX;
 #else
diff --git a/sed/sed.h b/sed/sed.h
--- a/sed/sed.h
+++ b/sed/sed.h
@@ -233,6 +233,7 @@ extern bool use_extended_syntax_p;
 /* Declarations for multibyte character sets.
  */
 extern int mb_cur_max;
+extern bool is_utf8;
 
 #ifdef HAVE_MBRTOWC
 #ifdef HAVE_BTOWC
```