summaryrefslogtreecommitdiff
path: root/src/coding.c
diff options
context:
space:
mode:
authorKenichi Handa <handa@m17n.org>2008-01-19 05:55:50 +0000
committerKenichi Handa <handa@m17n.org>2008-01-19 05:55:50 +0000
commit5a7a1dde2c0cc502c3db47bb867b14534f741867 (patch)
tree7196c1c2fc59402b7ba2063794d2308e30aabacb /src/coding.c
parent69c43ff730922b9a55920ac1c0601e057eac9766 (diff)
downloademacs-5a7a1dde2c0cc502c3db47bb867b14534f741867.tar.gz
(detect_coding_iso2022): New arg
latin_extra_code_state. Allow Latin extra codes only when *latin_extra_code_state is nonzero. (detect_coding_mask): If there is a NULL byte, detect the encoding as UTF-16 or binary. If Latin extra codes exist, detect the encoding as ISO-2022 only when there's no other proper encoding is found.
Diffstat (limited to 'src/coding.c')
-rw-r--r--src/coding.c117
1 files changed, 93 insertions, 24 deletions
diff --git a/src/coding.c b/src/coding.c
index 716eb98a87f..dd201ae61d6 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -1410,12 +1410,17 @@ enum iso_code_class_type iso_code_class[256];
CODING_CATEGORY_MASK_ISO_7_ELSE
CODING_CATEGORY_MASK_ISO_8_ELSE
are set. If a code which should never appear in ISO2022 is found,
- returns 0. */
+ returns 0.
+
+ If *latin_extra_code_state is zero and Latin extra codes are found,
+ set *latin_extra_code_state to 1 and return 0. If it is nonzero,
+ accept Latin extra codes. */
static int
-detect_coding_iso2022 (src, src_end, multibytep)
+detect_coding_iso2022 (src, src_end, multibytep, latin_extra_code_state)
unsigned char *src, *src_end;
int multibytep;
+ int *latin_extra_code_state;
{
int mask = CODING_CATEGORY_MASK_ISO;
int mask_found = 0;
@@ -1578,6 +1583,11 @@ detect_coding_iso2022 (src, src_end, multibytep)
if (VECTORP (Vlatin_extra_code_table)
&& !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
{
+ if (! *latin_extra_code_state)
+ {
+ *latin_extra_code_state = 1;
+ return 0;
+ }
if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
& CODING_FLAG_ISO_LATIN_EXTRA)
newmask |= CODING_CATEGORY_MASK_ISO_8_1;
@@ -1604,6 +1614,11 @@ detect_coding_iso2022 (src, src_end, multibytep)
{
int newmask = 0;
+ if (! *latin_extra_code_state)
+ {
+ *latin_extra_code_state = 1;
+ return 0;
+ }
if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
& CODING_FLAG_ISO_LATIN_EXTRA)
newmask |= CODING_CATEGORY_MASK_ISO_8_1;
@@ -4131,6 +4146,8 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
unsigned char *src = source, *src_end = source + src_bytes;
unsigned int mask, utf16_examined_p, iso2022_examined_p;
int i;
+ int null_byte_found;
+ int latin_extra_code_state = 1;
/* At first, skip all ASCII characters and control characters except
for three ISO2022 specific control characters. */
@@ -4139,21 +4156,36 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
ascii_skip_code[ISO_CODE_ESC] = 0;
label_loop_detect_coding:
- while (src < src_end && ascii_skip_code[*src]) src++;
+ null_byte_found = 0;
+ /* We stop this loop before the last byte because it may be a NULL
+ anchor byte. */
+ while (src < src_end - 1 && ascii_skip_code[*src])
+ null_byte_found |= (! *src++);
+ if (ascii_skip_code[*src])
+ src++;
+ else if (! null_byte_found)
+ {
+ unsigned char *p = src + 1;
+ while (p < src_end - 1)
+ null_byte_found |= (! *p++);
+ }
*skip = src - source;
if (src >= src_end)
- /* We found nothing other than ASCII. There's nothing to do. */
+ /* We found nothing other than ASCII (and NULL byte). There's
+ nothing to do. */
return 0;
c = *src;
/* The text seems to be encoded in some multilingual coding system.
Now, try to find in which coding system the text is encoded. */
- if (c < 0x80)
+ if (! null_byte_found && c < 0x80)
{
/* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
/* C is an ISO2022 specific control code of C0. */
- mask = detect_coding_iso2022 (src, src_end, multibytep);
+ latin_extra_code_state = 1;
+ mask = detect_coding_iso2022 (src, src_end, multibytep,
+ &latin_extra_code_state);
if (mask == 0)
{
/* No valid ISO2022 code follows C. Try again. */
@@ -4181,21 +4213,27 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
c = src[1] - 0x20;
- if (c < 0xA0)
+ if (null_byte_found)
+ {
+ try = (CODING_CATEGORY_MASK_UTF_16_BE
+ | CODING_CATEGORY_MASK_UTF_16_LE);
+ }
+ else if (c < 0xA0)
{
/* C is the first byte of SJIS character code,
or a leading-code of Emacs' internal format (emacs-mule),
or the first byte of UTF-16. */
try = (CODING_CATEGORY_MASK_SJIS
- | CODING_CATEGORY_MASK_EMACS_MULE
- | CODING_CATEGORY_MASK_UTF_16_BE
- | CODING_CATEGORY_MASK_UTF_16_LE);
+ | CODING_CATEGORY_MASK_EMACS_MULE
+ | CODING_CATEGORY_MASK_UTF_16_BE
+ | CODING_CATEGORY_MASK_UTF_16_LE);
/* Or, if C is a special latin extra code,
or is an ISO2022 specific control code of C1 (SS2 or SS3),
or is an ISO2022 control-sequence-introducer (CSI),
we should also consider the possibility of ISO2022 codings. */
- if ((VECTORP (Vlatin_extra_code_table)
+ if ((latin_extra_code_state
+ && VECTORP (Vlatin_extra_code_table)
&& !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
|| (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
|| (c == ISO_CODE_CSI
@@ -4205,7 +4243,7 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
&& src + 1 < src_end
&& src[1] == ']')))))
try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
- | CODING_CATEGORY_MASK_ISO_8BIT);
+ | CODING_CATEGORY_MASK_ISO_8BIT);
}
else
/* C is a character of ISO2022 in graphic plane right,
@@ -4213,29 +4251,36 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
or the first byte of BIG5's 2-byte code,
or the first byte of UTF-8/16. */
try = (CODING_CATEGORY_MASK_ISO_8_ELSE
- | CODING_CATEGORY_MASK_ISO_8BIT
- | CODING_CATEGORY_MASK_SJIS
- | CODING_CATEGORY_MASK_BIG5
- | CODING_CATEGORY_MASK_UTF_8
- | CODING_CATEGORY_MASK_UTF_16_BE
- | CODING_CATEGORY_MASK_UTF_16_LE);
+ | CODING_CATEGORY_MASK_ISO_8BIT
+ | CODING_CATEGORY_MASK_SJIS
+ | CODING_CATEGORY_MASK_BIG5
+ | CODING_CATEGORY_MASK_UTF_8
+ | CODING_CATEGORY_MASK_UTF_16_BE
+ | CODING_CATEGORY_MASK_UTF_16_LE);
/* Or, we may have to consider the possibility of CCL. */
- if (coding_system_table[CODING_CATEGORY_IDX_CCL]
+ if (! null_byte_found
+ && coding_system_table[CODING_CATEGORY_IDX_CCL]
&& (coding_system_table[CODING_CATEGORY_IDX_CCL]
->spec.ccl.valid_codes)[c])
try |= CODING_CATEGORY_MASK_CCL;
mask = 0;
- utf16_examined_p = iso2022_examined_p = 0;
if (priorities)
{
+ /* At first try detection with Latin extra codes not-allowed.
+ If no proper coding system is found because of Latin extra
+ codes, try detection with Latin extra codes allowed. */
+ latin_extra_code_state = 0;
+ label_retry:
+ utf16_examined_p = iso2022_examined_p = 0;
for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
{
if (!iso2022_examined_p
&& (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
{
- mask |= detect_coding_iso2022 (src, src_end, multibytep);
+ mask |= detect_coding_iso2022 (src, src_end, multibytep,
+ &latin_extra_code_state);
iso2022_examined_p = 1;
}
else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
@@ -4256,16 +4301,40 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
mask |= detect_coding_ccl (src, src_end, multibytep);
else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
- mask |= CODING_CATEGORY_MASK_RAW_TEXT;
+ {
+ if (latin_extra_code_state == 1)
+ {
+ /* Detection of ISO-2022 based coding system
+ failed because of Latin extra codes. Before
+ falling back to raw-text, try again with
+ Latin extra codes allowed. */
+ latin_extra_code_state = 2;
+ try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
+ | CODING_CATEGORY_MASK_ISO_8BIT);
+ goto label_retry;
+ }
+ mask |= CODING_CATEGORY_MASK_RAW_TEXT;
+ }
else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
- mask |= CODING_CATEGORY_MASK_BINARY;
+ {
+ if (latin_extra_code_state == 1)
+ {
+ /* See the above comment. */
+ latin_extra_code_state = 2;
+ try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
+ | CODING_CATEGORY_MASK_ISO_8BIT);
+ goto label_retry;
+ }
+ mask |= CODING_CATEGORY_MASK_BINARY;
+ }
if (mask & priorities[i])
return priorities[i];
}
return CODING_CATEGORY_MASK_RAW_TEXT;
}
if (try & CODING_CATEGORY_MASK_ISO)
- mask |= detect_coding_iso2022 (src, src_end, multibytep);
+ mask |= detect_coding_iso2022 (src, src_end, multibytep,
+ &latin_extra_code_state);
if (try & CODING_CATEGORY_MASK_SJIS)
mask |= detect_coding_sjis (src, src_end, multibytep);
if (try & CODING_CATEGORY_MASK_BIG5)