diff options
Diffstat (limited to 'src/coding.c')
| -rw-r--r-- | src/coding.c | 409 |
1 files changed, 312 insertions, 97 deletions
diff --git a/src/coding.c b/src/coding.c index 6cfcec905a1..42fd81b6322 100644 --- a/src/coding.c +++ b/src/coding.c @@ -1125,6 +1125,14 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes, *buf++ = id; \ } while (0) + +/* Bitmasks for coding->eol_seen. */ + +#define EOL_SEEN_NONE 0 +#define EOL_SEEN_LF 1 +#define EOL_SEEN_CR 2 +#define EOL_SEEN_CRLF 4 + /*** 2. Emacs' internal format (emacs-utf-8) ***/ @@ -1147,6 +1155,9 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes, #define UTF_8_BOM_2 0xBB #define UTF_8_BOM_3 0xBF +/* Unlike the other detect_coding_XXX, this function counts number of + characters and check EOL format. */ + static bool detect_coding_utf_8 (struct coding_system *coding, struct coding_detection_info *detect_info) @@ -1156,11 +1167,23 @@ detect_coding_utf_8 (struct coding_system *coding, bool multibytep = coding->src_multibyte; ptrdiff_t consumed_chars = 0; bool bom_found = 0; - bool found = 0; + int nchars = coding->head_ascii; + int eol_seen = coding->eol_seen; detect_info->checked |= CATEGORY_MASK_UTF_8; /* A coding system of this category is always ASCII compatible. */ - src += coding->head_ascii; + src += nchars; + + if (src == coding->source /* BOM should be at the head. */ + && src + 3 < src_end /* BOM is 3-byte long. */ + && src[0] == UTF_8_BOM_1 + && src[1] == UTF_8_BOM_2 + && src[2] == UTF_8_BOM_3) + { + bom_found = 1; + src += 3; + nchars++; + } while (1) { @@ -1169,13 +1192,29 @@ detect_coding_utf_8 (struct coding_system *coding, src_base = src; ONE_MORE_BYTE (c); if (c < 0 || UTF_8_1_OCTET_P (c)) - continue; + { + nchars++; + if (c == '\r') + { + if (src < src_end && *src == '\n') + { + eol_seen |= EOL_SEEN_CRLF; + src++; + nchars++; + } + else + eol_seen |= EOL_SEEN_CR; + } + else if (c == '\n') + eol_seen |= EOL_SEEN_LF; + continue; + } ONE_MORE_BYTE (c1); if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1)) break; if (UTF_8_2_OCTET_LEADING_P (c)) { - found = 1; + nchars++; continue; } ONE_MORE_BYTE (c2); @@ -1183,10 +1222,7 @@ detect_coding_utf_8 (struct coding_system *coding, break; if (UTF_8_3_OCTET_LEADING_P (c)) { - found = 1; - if (src_base == coding->source - && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3) - bom_found = 1; + nchars++; continue; } ONE_MORE_BYTE (c3); @@ -1194,7 +1230,7 @@ detect_coding_utf_8 (struct coding_system *coding, break; if (UTF_8_4_OCTET_LEADING_P (c)) { - found = 1; + nchars++; continue; } ONE_MORE_BYTE (c4); @@ -1202,7 +1238,7 @@ detect_coding_utf_8 (struct coding_system *coding, break; if (UTF_8_5_OCTET_LEADING_P (c)) { - found = 1; + nchars++; continue; } break; @@ -1219,14 +1255,17 @@ detect_coding_utf_8 (struct coding_system *coding, if (bom_found) { /* The first character 0xFFFE doesn't necessarily mean a BOM. */ - detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG; + detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG; } else { detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG; - if (found) - detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG; + if (nchars < src_end - coding->source) + /* The found characters are less than source bytes, which + means that we found a valid non-ASCII characters. */ + detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG; } + coding->detected_utf8_chars = nchars; return 1; } @@ -3887,6 +3926,14 @@ decode_coding_iso_2022 (struct coding_system *coding) *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); char_offset++; coding->errors++; + /* Reset the invocation and designation status to the safest + one; i.e. designate ASCII to the graphic register 0, and + invoke that register to the graphic plane 0. This typically + helps the case that an designation sequence for ASCII "ESC ( + B" is somehow broken (e.g. broken by a newline). */ + CODING_ISO_INVOCATION (coding, 0) = 0; + CODING_ISO_DESIGNATION (coding, 0) = charset_ascii; + charset_id_0 = charset_ascii; continue; break_loop: @@ -5614,7 +5661,6 @@ setup_coding_system (Lisp_Object coding_system, struct coding_system *coding) eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id); coding->mode = 0; - coding->head_ascii = -1; if (VECTORP (eol_type)) coding->common_flags = (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_DETECTION_MASK); @@ -6066,51 +6112,40 @@ complement_process_encoding_system (Lisp_Object coding_system) */ -#define EOL_SEEN_NONE 0 -#define EOL_SEEN_LF 1 -#define EOL_SEEN_CR 2 -#define EOL_SEEN_CRLF 4 +static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, + int eol_seen); -static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen); +/* Return the number of ASCII characters at the head of the source. + By side effects, set coding->head_ascii and update + coding->eol_seen. The value of coding->eol_seen is "logical or" of + EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is + reliable only when all the source bytes are ASCII. */ - -/* Return 1 if all the source bytes are ASCII, and return 0 otherwize. - By side effects, set coding->head_ascii and coding->eol_seen. The - value of coding->eol_seen is "logical or" of EOL_SEEN_LF, - EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when - all the source bytes are ASCII. */ - -static bool -detect_ascii (struct coding_system *coding) +static int +check_ascii (struct coding_system *coding) { const unsigned char *src, *end; Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id); - int eol_seen; + int eol_seen = coding->eol_seen; - eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE - : EQ (eol_type, Qunix) ? EOL_SEEN_LF - : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF - : EOL_SEEN_CR); coding_set_source (coding); src = coding->source; end = src + coding->src_bytes; - if (inhibit_eol_conversion) + if (inhibit_eol_conversion + || SYMBOLP (eol_type)) { /* We don't have to check EOL format. */ - while (src < end && !( *src & 0x80)) src++; - eol_seen = EOL_SEEN_LF; - adjust_coding_eol_type (coding, eol_seen); - } - else if (eol_seen != EOL_SEEN_NONE) - { - /* We don't have to check EOL format either. */ - while (src < end && !(*src & 0x80)) src++; + while (src < end && !( *src & 0x80)) + { + if (*src++ == '\n') + eol_seen |= EOL_SEEN_LF; + } } else { - end--; /* We look ahead one byte. */ + end--; /* We look ahead one byte for "CR LF". */ while (src < end) { int c = *src; @@ -6118,6 +6153,69 @@ detect_ascii (struct coding_system *coding) if (c & 0x80) break; src++; + if (c == '\r') + { + if (*src == '\n') + { + eol_seen |= EOL_SEEN_CRLF; + src++; + } + else + eol_seen |= EOL_SEEN_CR; + } + else if (c == '\n') + eol_seen |= EOL_SEEN_LF; + } + if (src == end) + { + int c = *src; + + /* All bytes but the last one C are ASCII. */ + if (! (c & 0x80)) + { + if (c == '\r') + eol_seen |= EOL_SEEN_CR; + else if (c == '\n') + eol_seen |= EOL_SEEN_LF; + src++; + } + } + } + coding->head_ascii = src - coding->source; + coding->eol_seen = eol_seen; + return (coding->head_ascii); +} + + +/* Return the number of characters at the source if all the bytes are + valid UTF-8 (of Unicode range). Otherwise, return -1. By side + effects, update coding->eol_seen. The value of coding->eol_seen is + "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but + the value is reliable only when all the source bytes are valid + UTF-8. */ + +static int +check_utf_8 (struct coding_system *coding) +{ + const unsigned char *src, *end; + int eol_seen; + int nchars = coding->head_ascii; + + if (coding->head_ascii < 0) + check_ascii (coding); + else + coding_set_source (coding); + src = coding->source + coding->head_ascii; + /* We look ahead one byte for CR LF. */ + end = coding->source + coding->src_bytes - 1; + eol_seen = coding->eol_seen; + while (src < end) + { + int c = *src; + + if (UTF_8_1_OCTET_P (*src)) + { + src++; if (c < 0x20) { if (c == '\r') @@ -6126,6 +6224,7 @@ detect_ascii (struct coding_system *coding) { eol_seen |= EOL_SEEN_CRLF; src++; + nchars++; } else eol_seen |= EOL_SEEN_CR; @@ -6134,27 +6233,58 @@ detect_ascii (struct coding_system *coding) eol_seen |= EOL_SEEN_LF; } } - if (src > end) - /* The last two bytes are CR LF, which means that we have - scanned all bytes. */ - end++; - else if (src == end) + else if (UTF_8_2_OCTET_LEADING_P (c)) { - end++; - if (! (*src & 0x80)) - { - if (*src == '\r') - eol_seen |= EOL_SEEN_CR; - else if (*src == '\n') - eol_seen |= EOL_SEEN_LF; - src++; - } + if (c < 0xC2 /* overlong sequence */ + || src + 1 >= end + || ! UTF_8_EXTRA_OCTET_P (src[1])) + return -1; + src += 2; } - adjust_coding_eol_type (coding, eol_seen); + else if (UTF_8_3_OCTET_LEADING_P (c)) + { + if (src + 2 >= end + || ! (UTF_8_EXTRA_OCTET_P (src[1]) + && UTF_8_EXTRA_OCTET_P (src[2]))) + return -1; + c = (((c & 0xF) << 12) + | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); + if (c < 0x800 /* overlong sequence */ + || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */ + return -1; + src += 3; + } + else if (UTF_8_4_OCTET_LEADING_P (c)) + { + if (src + 3 >= end + || ! (UTF_8_EXTRA_OCTET_P (src[1]) + && UTF_8_EXTRA_OCTET_P (src[2]) + && UTF_8_EXTRA_OCTET_P (src[3]))) + return -1; + c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12) + | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); + if (c < 0x10000 /* overlong sequence */ + || c >= 0x110000) /* non-Unicode character */ + return -1; + src += 4; + } + else + return -1; + nchars++; + } + + if (src == end) + { + if (! UTF_8_1_OCTET_P (*src)) + return -1; + nchars++; + if (*src == '\r') + eol_seen |= EOL_SEEN_CR; + else if (*src == '\n') + eol_seen |= EOL_SEEN_LF; } - coding->head_ascii = src - coding->source; coding->eol_seen = eol_seen; - return (src == end); + return nchars; } @@ -6269,6 +6399,9 @@ adjust_coding_eol_type (struct coding_system *coding, int eol_seen) Lisp_Object eol_type; eol_type = CODING_ID_EOL_TYPE (coding->id); + if (! VECTORP (eol_type)) + /* Already adjusted. */ + return eol_type; if (eol_seen & EOL_SEEN_LF) { coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0)); @@ -6296,6 +6429,8 @@ detect_coding (struct coding_system *coding) { const unsigned char *src, *src_end; unsigned int saved_mode = coding->mode; + Lisp_Object found = Qnil; + Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id); coding->consumed = coding->consumed_char = 0; coding->produced = coding->produced_char = 0; @@ -6303,6 +6438,7 @@ detect_coding (struct coding_system *coding) src_end = coding->source + coding->src_bytes; + coding->eol_seen = EOL_SEEN_NONE; /* If we have not yet decided the text encoding type, detect it now. */ if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) @@ -6312,7 +6448,6 @@ detect_coding (struct coding_system *coding) bool null_byte_found = 0, eight_bit_found = 0; coding->head_ascii = 0; - coding->eol_seen = EOL_SEEN_NONE; detect_info.checked = detect_info.found = detect_info.rejected = 0; for (src = coding->source; src < src_end; src++) { @@ -6360,7 +6495,8 @@ detect_coding (struct coding_system *coding) { coding->eol_seen |= EOL_SEEN_CRLF; src++; - coding->head_ascii++; + if (! eight_bit_found) + coding->head_ascii++; } else coding->eol_seen |= EOL_SEEN_CR; @@ -6422,32 +6558,58 @@ detect_coding (struct coding_system *coding) } else if ((*(this->detector)) (coding, &detect_info) && detect_info.found & (1 << category)) - { - if (category == coding_category_utf_16_auto) - { - if (detect_info.found & CATEGORY_MASK_UTF_16_LE) - category = coding_category_utf_16_le; - else - category = coding_category_utf_16_be; - } - break; - } + break; } } if (i < coding_category_raw_text) - setup_coding_system (CODING_ID_NAME (this->id), coding); + { + if (category == coding_category_utf_8_auto) + { + Lisp_Object coding_systems; + + coding_systems = AREF (CODING_ID_ATTRS (this->id), + coding_attr_utf_bom); + if (CONSP (coding_systems)) + { + if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) + found = XCAR (coding_systems); + else + found = XCDR (coding_systems); + } + else + found = CODING_ID_NAME (this->id); + } + else if (category == coding_category_utf_16_auto) + { + Lisp_Object coding_systems; + + coding_systems = AREF (CODING_ID_ATTRS (this->id), + coding_attr_utf_bom); + if (CONSP (coding_systems)) + { + if (detect_info.found & CATEGORY_MASK_UTF_16_LE) + found = XCAR (coding_systems); + else if (detect_info.found & CATEGORY_MASK_UTF_16_BE) + found = XCDR (coding_systems); + } + else + found = CODING_ID_NAME (this->id); + } + else + found = CODING_ID_NAME (this->id); + } else if (null_byte_found) - setup_coding_system (Qno_conversion, coding); + found = Qno_conversion; else if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY) - setup_coding_system (Qraw_text, coding); + found = Qraw_text; else if (detect_info.rejected) for (i = 0; i < coding_category_raw_text; i++) if (! (detect_info.rejected & (1 << coding_priorities[i]))) { this = coding_categories + coding_priorities[i]; - setup_coding_system (CODING_ID_NAME (this->id), coding); + found = CODING_ID_NAME (this->id); break; } } @@ -6461,9 +6623,10 @@ detect_coding (struct coding_system *coding) coding_systems = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); detect_info.found = detect_info.rejected = 0; - if (detect_ascii (coding)) + if (check_ascii (coding) == coding->src_bytes) { - setup_coding_system (XCDR (coding_systems), coding); + if (CONSP (coding_systems)) + found = XCDR (coding_systems); } else { @@ -6471,9 +6634,9 @@ detect_coding (struct coding_system *coding) && detect_coding_utf_8 (coding, &detect_info)) { if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) - setup_coding_system (XCAR (coding_systems), coding); + found = XCAR (coding_systems); else - setup_coding_system (XCDR (coding_systems), coding); + found = XCDR (coding_systems); } } } @@ -6487,16 +6650,28 @@ detect_coding (struct coding_system *coding) = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); detect_info.found = detect_info.rejected = 0; coding->head_ascii = 0; - coding->eol_seen = EOL_SEEN_NONE; if (CONSP (coding_systems) && detect_coding_utf_16 (coding, &detect_info)) { if (detect_info.found & CATEGORY_MASK_UTF_16_LE) - setup_coding_system (XCAR (coding_systems), coding); + found = XCAR (coding_systems); else if (detect_info.found & CATEGORY_MASK_UTF_16_BE) - setup_coding_system (XCDR (coding_systems), coding); + found = XCDR (coding_systems); } } + + if (! NILP (found)) + { + int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE + : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF + : EQ (eol_type, Qmac) ? EOL_SEEN_CR + : EOL_SEEN_LF); + + setup_coding_system (found, coding); + if (specified_eol != EOL_SEEN_NONE) + adjust_coding_eol_type (coding, specified_eol); + } + coding->mode = saved_mode; } @@ -7617,19 +7792,55 @@ decode_coding_gap (struct coding_system *coding, coding->dst_pos_byte = PT_BYTE; coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters)); + coding->head_ascii = -1; + coding->detected_utf8_chars = -1; + coding->eol_seen = EOL_SEEN_NONE; if (CODING_REQUIRE_DETECTION (coding)) detect_coding (coding); attrs = CODING_ID_ATTRS (coding->id); - if (! disable_ascii_optimization) - { - if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) - && NILP (CODING_ATTR_POST_READ (attrs)) - && NILP (get_translation_table (attrs, 0, NULL)) - && (coding->head_ascii >= 0 /* We've already called detect_coding */ - ? coding->head_ascii == bytes - : detect_ascii (coding))) + if (! disable_ascii_optimization + && ! coding->src_multibyte + && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) + && NILP (CODING_ATTR_POST_READ (attrs)) + && NILP (get_translation_table (attrs, 0, NULL))) + { + chars = coding->head_ascii; + if (chars < 0) + chars = check_ascii (coding); + if (chars != bytes) + { + /* There exists a non-ASCII byte. */ + if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)) + { + if (coding->detected_utf8_chars >= 0) + chars = coding->detected_utf8_chars; + else + chars = check_utf_8 (coding); + if (CODING_UTF_8_BOM (coding) != utf_without_bom + && coding->head_ascii == 0 + && coding->source[0] == UTF_8_BOM_1 + && coding->source[1] == UTF_8_BOM_2 + && coding->source[2] == UTF_8_BOM_3) + { + chars--; + bytes -= 3; + coding->src_bytes -= 3; + } + } + else + chars = -1; + } + if (chars >= 0) { - if (coding->eol_seen == EOL_SEEN_CR) + Lisp_Object eol_type; + + eol_type = CODING_ID_EOL_TYPE (coding->id); + if (VECTORP (eol_type)) + { + if (coding->eol_seen != EOL_SEEN_NONE) + eol_type = adjust_coding_eol_type (coding, coding->eol_seen); + } + if (EQ (eol_type, Qmac)) { unsigned char *src_end = GAP_END_ADDR; unsigned char *src = src_end - coding->src_bytes; @@ -7640,22 +7851,26 @@ decode_coding_gap (struct coding_system *coding, src[-1] = '\n'; } } - else if (coding->eol_seen == EOL_SEEN_CRLF) + else if (EQ (eol_type, Qdos)) { unsigned char *src = GAP_END_ADDR; unsigned char *src_beg = src - coding->src_bytes; unsigned char *dst = src; + ptrdiff_t diff; while (src_beg < src) { *--dst = *--src; - if (*src == '\n') + if (*src == '\n' && src > src_beg && src[-1] == '\r') src--; } - bytes -= dst - src; + diff = dst - src; + bytes -= diff; + chars -= diff; } - coding->produced_char = coding->produced = bytes; - insert_from_gap (bytes, bytes, 1); + coding->produced = bytes; + coding->produced_char = chars; + insert_from_gap (chars, bytes, 1); return; } } |
