diff options
author | tromey <tromey@138bc75d-0d04-0410-961f-82ee72b054a4> | 2008-04-21 14:02:00 +0000 |
---|---|---|
committer | tromey <tromey@138bc75d-0d04-0410-961f-82ee72b054a4> | 2008-04-21 14:02:00 +0000 |
commit | d656d07a0ea99c1a5c9c8273f2fe486381e12c15 (patch) | |
tree | 95c70d6534e91125519d9a8f2e7c2cf84de0c82b /libcpp/charset.c | |
parent | 7f0c1cb278a40dd00f83f7253eedf3d227c3937f (diff) | |
download | gcc-d656d07a0ea99c1a5c9c8273f2fe486381e12c15.tar.gz |
libcpp
PR libcpp/33415:
* charset.c (_cpp_convert_input): Add buffer_start argument.
Ignore UTF-8 BOM if seen.
* internal.h (_cpp_convert_input): Add argument.
* files.c (struct _cpp_file) <buffer_start>: New field.
(destroy_cpp_file): Free buffer_start, not buffer.
(_cpp_pop_file_buffer): Likewise.
(read_file_guts): Update.
gcc/testsuite
PR libcpp/33415:
* gcc.dg/cpp/pr33415.c: New file.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@134507 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libcpp/charset.c')
-rw-r--r-- | libcpp/charset.c | 40 |
1 files changed, 31 insertions, 9 deletions
diff --git a/libcpp/charset.c b/libcpp/charset.c index 225cdb4915e..d70d05cc020 100644 --- a/libcpp/charset.c +++ b/libcpp/charset.c @@ -1,5 +1,5 @@ /* CPP Library - charsets - Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006 + Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, 2008 Free Software Foundation, Inc. Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges. @@ -1637,18 +1637,24 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) source file) from INPUT_CHARSET to the source character set. INPUT points to the input buffer, SIZE is its allocated size, and LEN is the length of the meaningful data within the buffer. The - translated buffer is returned, and *ST_SIZE is set to the length of - the meaningful data within the translated buffer. - - INPUT is expected to have been allocated with xmalloc. This function - will either return INPUT, or free it and return a pointer to another - xmalloc-allocated block of memory. */ + translated buffer is returned, *ST_SIZE is set to the length of + the meaningful data within the translated buffer, and *BUFFER_START + is set to the start of the returned buffer. *BUFFER_START may + differ from the return value in the case of a BOM or other ignored + marker information. + + INPUT is expected to have been allocated with xmalloc. This + function will either set *BUFFER_START to INPUT, or free it and set + *BUFFER_START to a pointer to another xmalloc-allocated block of + memory. */ uchar * _cpp_convert_input (cpp_reader *pfile, const char *input_charset, - uchar *input, size_t size, size_t len, off_t *st_size) + uchar *input, size_t size, size_t len, + const unsigned char **buffer_start, off_t *st_size) { struct cset_converter input_cset; struct _cpp_strbuf to; + unsigned char *buffer; input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset); if (input_cset.func == convert_no_conversion) @@ -1689,8 +1695,24 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset, else to.text[to.len] = '\n'; + buffer = to.text; *st_size = to.len; - return to.text; +#if HOST_CHARSET == HOST_CHARSET_ASCII + /* The HOST_CHARSET test just above ensures that the source charset + is UTF-8. So, ignore a UTF-8 BOM if we see one. Note that + glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a + BOM -- however, even if it did, we would still need this code due + to the 'convert_no_conversion' case. */ + if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb + && to.text[2] == 0xbf) + { + *st_size -= 3; + buffer += 3; + } +#endif + + *buffer_start = to.text; + return buffer; } /* Decide on the default encoding to assume for input files. */ |