diff options
Diffstat (limited to 'src/coding.c')
| -rw-r--r-- | src/coding.c | 197 |
1 files changed, 159 insertions, 38 deletions
diff --git a/src/coding.c b/src/coding.c index c18632f301b..5047e1149bc 100644 --- a/src/coding.c +++ b/src/coding.c @@ -6071,6 +6071,93 @@ complement_process_encoding_system (Lisp_Object coding_system) #define EOL_SEEN_CR 2 #define EOL_SEEN_CRLF 4 + +static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen); + + +/* Return 1 if all the source bytes are ASCII, and return 0 otherwize. + By side effects, set coding->head_ascii and coding->eol_seen. The + value of coding->eol_seen is "logical or" of EOL_SEEN_LF, + EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when + all the source bytes are ASCII. */ + +static bool +detect_ascii (struct coding_system *coding) +{ + const unsigned char *src, *end; + Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id); + int eol_seen; + + eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE + : EQ (eol_type, Qunix) ? EOL_SEEN_LF + : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF + : EOL_SEEN_CR); + coding_set_source (coding); + src = coding->source; + end = src + coding->src_bytes; + + if (inhibit_eol_conversion) + { + /* We don't have to check EOL format. */ + while (src < end && !( *src & 0x80)) src++; + eol_seen = EOL_SEEN_LF; + adjust_coding_eol_type (coding, eol_seen); + } + else if (eol_seen != EOL_SEEN_NONE) + { + /* We don't have to check EOL format either. */ + while (src < end && !(*src & 0x80)) src++; + } + else + { + end--; /* We look ahead one byte. */ + while (src < end) + { + int c = *src; + + if (c & 0x80) + break; + src++; + if (c < 0x20) + { + if (c == '\r') + { + if (*src == '\n') + { + eol_seen |= EOL_SEEN_CRLF; + src++; + } + else + eol_seen |= EOL_SEEN_CR; + } + else if (c == '\n') + eol_seen |= EOL_SEEN_LF; + } + } + if (src > end) + /* The last two bytes are CR LF, which means that we have + scanned all bytes. */ + end++; + else if (src == end) + { + end++; + if (! (*src & 0x80)) + { + if (*src == '\r') + eol_seen |= EOL_SEEN_CR; + else if (*src == '\n') + eol_seen |= EOL_SEEN_LF; + src++; + } + } + adjust_coding_eol_type (coding, eol_seen); + } + coding->head_ascii = src - coding->source; + coding->eol_seen = eol_seen; + return (src == end); +} + + /* Detect how end-of-line of a text of length SRC_BYTES pointed by SOURCE is encoded. If CATEGORY is one of coding_category_utf_16_XXXX, assume that CR and LF are encoded by @@ -6215,7 +6302,6 @@ detect_coding (struct coding_system *coding) coding_set_source (coding); src_end = coding->source + coding->src_bytes; - coding->head_ascii = 0; /* If we have not yet decided the text encoding type, detect it now. */ @@ -6225,6 +6311,8 @@ detect_coding (struct coding_system *coding) struct coding_detection_info detect_info; bool null_byte_found = 0, eight_bit_found = 0; + coding->head_ascii = 0; + coding->eol_seen = EOL_SEEN_NONE; detect_info.checked = detect_info.found = detect_info.rejected = 0; for (src = coding->source; src < src_end; src++) { @@ -6263,6 +6351,26 @@ detect_coding (struct coding_system *coding) if (eight_bit_found) break; } + else if (! disable_ascii_optimization + && ! inhibit_eol_conversion) + { + if (c == '\r') + { + if (src < src_end && src[1] == '\n') + { + coding->eol_seen |= EOL_SEEN_CRLF; + src++; + coding->head_ascii++; + } + else + coding->eol_seen |= EOL_SEEN_CR; + } + else if (c == '\n') + { + coding->eol_seen |= EOL_SEEN_LF; + } + } + if (! eight_bit_found) coding->head_ascii++; } @@ -6353,19 +6461,20 @@ detect_coding (struct coding_system *coding) coding_systems = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); detect_info.found = detect_info.rejected = 0; - for (src = coding->source; src < src_end; src++) + if (detect_ascii (coding)) { - if (*src & 0x80) - break; + setup_coding_system (XCDR (coding_systems), coding); } - coding->head_ascii = src - coding->source; - if (CONSP (coding_systems) - && detect_coding_utf_8 (coding, &detect_info)) + else { - if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) - setup_coding_system (XCAR (coding_systems), coding); - else - setup_coding_system (XCDR (coding_systems), coding); + if (CONSP (coding_systems) + && detect_coding_utf_8 (coding, &detect_info)) + { + if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) + setup_coding_system (XCAR (coding_systems), coding); + else + setup_coding_system (XCDR (coding_systems), coding); + } } } else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) @@ -6378,6 +6487,7 @@ detect_coding (struct coding_system *coding) = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); detect_info.found = detect_info.rejected = 0; coding->head_ascii = 0; + coding->eol_seen = EOL_SEEN_NONE; if (CONSP (coding_systems) && detect_coding_utf_16 (coding, &detect_info)) { @@ -6815,7 +6925,7 @@ produce_chars (struct coding_system *coding, Lisp_Object translation_table, produced = dst - (coding->destination + coding->produced); if (BUFFERP (coding->dst_object) && produced_chars > 0) - insert_from_gap (produced_chars, produced); + insert_from_gap (produced_chars, produced, 0); coding->produced += produced; coding->produced_char += produced_chars; return carryover; @@ -7400,7 +7510,7 @@ encode_coding (struct coding_system *coding) } while (coding->consumed_char < coding->src_chars); if (BUFFERP (coding->dst_object) && coding->produced_char > 0) - insert_from_gap (coding->produced_char, coding->produced); + insert_from_gap (coding->produced_char, coding->produced, 0); SAFE_FREE (); } @@ -7510,39 +7620,45 @@ decode_coding_gap (struct coding_system *coding, if (CODING_REQUIRE_DETECTION (coding)) detect_coding (coding); attrs = CODING_ID_ATTRS (coding->id); -#ifndef CODING_DISABLE_ASCII_OPTIMIZATION - if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) - && NILP (CODING_ATTR_POST_READ (attrs)) - && NILP (get_translation_table (attrs, 0, NULL)) - && (inhibit_eol_conversion - || EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))) + if (! disable_ascii_optimization) { - /* We can skip the conversion if all source bytes are ASCII. */ - if (coding->head_ascii < 0) + if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) + && NILP (CODING_ATTR_POST_READ (attrs)) + && NILP (get_translation_table (attrs, 0, NULL)) + && (coding->head_ascii >= 0 /* We've already called detect_coding */ + ? coding->head_ascii == bytes + : detect_ascii (coding))) { - /* We have not yet counted the number of ASCII bytes at the - head of the source. Do it now. */ - const unsigned char *src, *src_end; + if (coding->eol_seen == EOL_SEEN_CR) + { + unsigned char *src_end = GAP_END_ADDR; + unsigned char *src = src - coding->src_bytes; - coding_set_source (coding); - src_end = coding->source + coding->src_bytes; - for (src = coding->source; src < src_end; src++) + while (src < src_end) + { + if (*src++ == '\r') + src[-1] = '\n'; + } + } + else if (coding->eol_seen == EOL_SEEN_CRLF) { - if (*src & 0x80) - break; + unsigned char *src = GAP_END_ADDR; + unsigned char *src_beg = src - coding->src_bytes; + unsigned char *dst = src; + + while (src_beg < src) + { + *--dst = *--src; + if (*src == '\n') + src--; + } + bytes -= dst - src; } - coding->head_ascii = src - coding->source; - } - if (coding->src_bytes == coding->head_ascii) - { - /* No need of conversion. Use the data in the gap as is. */ - coding->produced_char = chars; - coding->produced = bytes; - adjust_after_replace (PT, PT_BYTE, Qnil, chars, bytes, 1); + coding->produced_char = coding->produced = bytes; + insert_from_gap (bytes, bytes, 1); return; } } -#endif /* not CODING_DISABLE_ASCII_OPTIMIZATION */ code_conversion_save (0, 0); coding->mode |= CODING_MODE_LAST_BLOCK; @@ -10758,6 +10874,11 @@ from GNU Find and GNU Grep. Emacs will then ignore the null bytes and decode text as usual. */); inhibit_null_byte_detection = 0; + DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization, + doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files. +Internal use only. Removed after the experimental optimizer gets stable. */); + disable_ascii_optimization = 0; + DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input, doc: /* Char table for translating self-inserting characters. This is applied to the result of input methods, not their input. |
