summary | refs | log | tree | commit | diff
path: root/sed
diff options
context:
space:
mode:
author: Paolo Bonzini <bonzini@gnu.org> 2006-08-03 13:03:24 +0000
committer: Paolo Bonzini <bonzini@gnu.org> 2008-01-09 16:12:10 +0100
commit: 3ca529fbe25706387200425b2a99012d6008f26c (patch)
tree: 02500670e8e554ea26bdad2b4156107c6452aa84 /sed
parent: 4ca34ac568881f50c9d5ecd2750938396686cf25 (diff)
download: sed-3ca529fbe25706387200425b2a99012d6008f26c.tar.gz
hack to speed up UTF-8 processing
2006-08-03  Paolo Bonzini  <bonzini@gnu.org>

	* sed/execute.c (str_append): Use is_utf8 to skip useless work.
	* sed/mbcs.c (initialize_mbcs): Look for a UTF-8 locale.
	(is_utf8): New.
	* sed/sed.h (is_utf8): New.

git-archimport-id: bonzini@gnu.org--2004b/sed--stable--4.1--patch-70
Diffstat (limited to 'sed')
-rw-r--r-- sed/execute.c | 35
-rw-r--r-- sed/mbcs.c    | 25
-rw-r--r-- sed/sed.h     |  1
3 files changed, 44 insertions, 17 deletions
diff --git a/sed/execute.c b/sed/execute.c
index ab2cc5c..1401674 100644
--- a/sed/execute.c
+++ b/sed/execute.c
@@ -235,25 +235,26 @@ str_append(to, string, length)
to->length = new_length;
#ifdef HAVE_MBRTOWC
- if (mb_cur_max == 1)
- return;
-
- while (length)
- {
- int n = MBRLEN (string, length, &to->mbstate);
+ if (mb_cur_max > 1 && !is_utf8)
+ while (length)
+ {
+ size_t n = MBRLEN (string, length, &to->mbstate);
- /* An invalid sequence is treated like a singlebyte character. */
- if (n == -1)
- {
- memset (&to->mbstate, 0, sizeof (to->mbstate));
- n = 1;
- }
+ /* An invalid sequence is treated like a singlebyte character. */
+ if (n == (size_t) -1)
+ {
+ memset (&to->mbstate, 0, sizeof (to->mbstate));
+ n = 1;
+ }
- if (n > 0)
- length -= n;
- else
- break;
- }
+ if (n > 0)
+ {
+ string += n;
+ length -= n;
+ }
+ else
+ break;
+ }
#endif
}
diff --git a/sed/mbcs.c b/sed/mbcs.c
index 3756547..e325b9d 100644
--- a/sed/mbcs.c
+++ b/sed/mbcs.c
@@ -18,7 +18,12 @@
#include "sed.h"
#include <stdlib.h>
+#ifdef HAVE_LANGINFO_CODESET
+#include <langinfo.h>
+#endif
+
int mb_cur_max;
+bool is_utf8;
#ifdef HAVE_MBRTOWC
/* Add a byte to the multibyte character represented by the state
@@ -47,6 +52,26 @@ int brlen (ch, cur_stat)
void
initialize_mbcs ()
{
+ /* For UTF-8, we know that the encoding is stateless. */
+ const char *codeset_name;
+
+#ifdef HAVE_LANGINFO_CODESET
+ codeset_name = nl_langinfo (CODESET);
+#else
+ codeset_name = getenv ("LC_ALL");
+ if (codeset_name == NULL || codeset_name[0] == '\0')
+ codeset_name = getenv ("LC_CTYPE");
+ if (codeset_name == NULL || codeset_name[0] == '\0')
+ codeset_name = getenv ("LANG");
+ if (codeset_name == NULL)
+ codeset_name = "";
+ else if (strchr (codeset_name, '.') != NULL)
+ codeset_name = strchr (codeset_name, '.') + 1;
+#endif
+
+ is_utf8 = (strcasecmp (codeset_name, "UTF-8") == 0
+ || strcasecmp (codeset_name, "UTF8") == 0);
+
#ifdef HAVE_MBRTOWC
mb_cur_max = MB_CUR_MAX;
#else
diff --git a/sed/sed.h b/sed/sed.h
index ef125db..b46f10f 100644
--- a/sed/sed.h
+++ b/sed/sed.h
@@ -233,6 +233,7 @@ extern bool use_extended_syntax_p;
/* Declarations for multibyte character sets. */
extern int mb_cur_max;
+extern bool is_utf8;
#ifdef HAVE_MBRTOWC
#ifdef HAVE_BTOWC