summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLars Ingebrigtsen <larsi@gnus.org>2022-07-03 14:05:01 +0200
committerLars Ingebrigtsen <larsi@gnus.org>2022-07-03 14:08:14 +0200
commitb72e4b149329797b8f2c947953251f92615ee73e (patch)
tree3232e47d9ff3c7e682b0bb086de36b73dae4cbb8
parentcfee07d4dd6317bc235046b99542fa76dc676dde (diff)
downloademacs-b72e4b149329797b8f2c947953251f92615ee73e.tar.gz
Make string-limit with encoding return complete glyphs
* lisp/emacs-lisp/subr-x.el (string-limit): Return more correct results in the CODING-SYSTEM case for coding systems with BOM and charset designations (bug#48324). Also amend the algorithm to return complete glyphs, not just complete code points.
-rw-r--r--lisp/emacs-lisp/subr-x.el83
-rw-r--r--test/lisp/emacs-lisp/subr-x-tests.el23
2 files changed, 71 insertions, 35 deletions
diff --git a/lisp/emacs-lisp/subr-x.el b/lisp/emacs-lisp/subr-x.el
index 390e505f009..56e8c2aa862 100644
--- a/lisp/emacs-lisp/subr-x.el
+++ b/lisp/emacs-lisp/subr-x.el
@@ -167,9 +167,9 @@ non-nil, return the last LENGTH characters instead.
If CODING-SYSTEM is non-nil, STRING will be encoded before
limiting, and LENGTH is interpreted as the number of bytes to
limit the string to. The result will be a unibyte string that is
-shorter than LENGTH, but will not contain \"partial\" characters,
-even if CODING-SYSTEM encodes characters with several bytes per
-character.
+shorter than LENGTH, but will not contain \"partial\"
+characters (or glyphs), even if CODING-SYSTEM encodes characters
+with several bytes per character.
When shortening strings for display purposes,
`truncate-string-to-width' is almost always a better alternative
@@ -177,34 +177,55 @@ than this function."
(unless (natnump length)
(signal 'wrong-type-argument (list 'natnump length)))
(if coding-system
- (let ((result nil)
- (result-length 0)
- (index (if end (1- (length string)) 0)))
- ;; FIXME: This implementation, which uses encode-coding-char
- ;; to encode the string one character at a time, is in general
- ;; incorrect: coding-systems that produce prefix or suffix
- ;; bytes, such as ISO-2022-based or UTF-8/16 with BOM, will
- ;; produce those bytes for each character, instead of just
- ;; once for the entire string. encode-coding-char attempts to
- ;; remove those extra bytes at least in some situations, but
- ;; it cannot do that in all cases. And in any case, producing
- ;; what is supposed to be a UTF-16 or ISO-2022-CN encoded
- ;; string which lacks the BOM bytes at the beginning and the
- ;; charset designation sequences at the head and tail of the
- ;; result will definitely surprise the callers in some cases.
- (while (let ((encoded (encode-coding-char
- (aref string index) coding-system)))
- (and (<= (+ (length encoded) result-length) length)
- (progn
- (push encoded result)
- (cl-incf result-length (length encoded))
- (setq index (if end (1- index)
- (1+ index))))
- (if end (> index -1)
- (< index (length string)))))
- ;; No body.
- )
- (apply #'concat (if end result (nreverse result))))
+ ;; The previous implementation here tried to encode char by
+ ;; char, and then adding up the length of the encoded octets,
+ ;; but that's not reliably in the presence of BOM marks and
+ ;; ISO-2022-CN which may add charset designations at the
+ ;; start/end of each encoded char (which we don't want). So
+ ;; iterate (with a binary search) instead to find the desired
+ ;; length.
+ (let* ((glyphs (string-glyph-split string))
+ (nglyphs (length glyphs))
+ (too-long (1+ nglyphs))
+ (stop (max (/ nglyphs 2) 1))
+ (gap stop)
+ candidate encoded found candidate-stop)
+ ;; We're returning the end of the string.
+ (when end
+ (setq glyphs (nreverse glyphs)))
+ (while (and (not found)
+ (< stop too-long))
+ (setq encoded
+ (encode-coding-string (string-join (seq-take glyphs stop))
+ coding-system))
+ (cond
+ ((= (length encoded) length)
+ (setq found encoded
+ candidate-stop stop))
+ ;; Too long; try shortening.
+ ((> (length encoded) length)
+ (setq too-long stop
+ stop (max (- stop gap) 1)))
+ ;; Too short; try lengthening.
+ (t
+ (setq candidate encoded
+ candidate-stop stop)
+ (setq stop
+ (if (>= stop nglyphs)
+ too-long
+ (min (+ stop gap) nglyphs)))))
+ (setq gap (max (/ gap 2) 1)))
+ (cond
+ ((not (or found candidate))
+ "")
+ ;; We're returning the end, so redo the encoding.
+ (end
+ (encode-coding-string
+ (string-join (nreverse (seq-take glyphs candidate-stop)))
+ coding-system))
+ (t
+ (or found candidate))))
+ ;; Char-based version.
(cond
((<= (length string) length) string)
(end (substring string (- (length string) length)))
diff --git a/test/lisp/emacs-lisp/subr-x-tests.el b/test/lisp/emacs-lisp/subr-x-tests.el
index 0bec9db36e9..99c0e822155 100644
--- a/test/lisp/emacs-lisp/subr-x-tests.el
+++ b/test/lisp/emacs-lisp/subr-x-tests.el
@@ -607,21 +607,36 @@
(should (equal (string-limit "foó" 4 nil 'utf-8) "fo\303\263"))
(should (equal (string-limit "foóa" 4 nil 'utf-8) "fo\303\263"))
(should (equal (string-limit "foóá" 4 nil 'utf-8) "fo\303\263"))
+ (should (equal (string-limit "foóá" 2 nil 'utf-8-with-signature)
+ ""))
(should (equal (string-limit "foóá" 4 nil 'utf-8-with-signature)
- "fo\303\263"))
+ "\357\273\277f"))
(should (equal (string-limit "foóa" 4 nil 'iso-8859-1) "fo\363a"))
(should (equal (string-limit "foóá" 4 nil 'iso-8859-1) "fo\363\341"))
- (should (equal (string-limit "foóá" 4 nil 'utf-16) "\000f\000o"))
+ (should (equal (string-limit "foóá" 3 nil 'utf-16) ""))
+ (should (equal (string-limit "foóá" 6 nil 'utf-16) "\376\377\000f\000o"))
(should (equal (string-limit "foó" 10 t 'utf-8) "fo\303\263"))
(should (equal (string-limit "foó" 3 t 'utf-8) "o\303\263"))
(should (equal (string-limit "foó" 4 t 'utf-8) "fo\303\263"))
(should (equal (string-limit "foóa" 4 t 'utf-8) "o\303\263a"))
(should (equal (string-limit "foóá" 4 t 'utf-8) "\303\263\303\241"))
- (should (equal (string-limit "foóá" 2 t 'utf-8-with-signature) "\303\241"))
+ (should (equal (string-limit "foóá" 2 t 'utf-8-with-signature)
+ ""))
(should (equal (string-limit "foóa" 4 t 'iso-8859-1) "fo\363a"))
(should (equal (string-limit "foóá" 4 t 'iso-8859-1) "fo\363\341"))
- (should (equal (string-limit "foóá" 4 t 'utf-16) "\000\363\000\341")))
+ (should (equal (string-limit "foóá" 6 t 'utf-16) "\376\377\000\363\000\341")))
+
+(ert-deftest subr-string-limit-glyphs ()
+ (should (equal (encode-coding-string "Hello, 👼🏻🧑🏼‍🤝‍🧑🏻" 'utf-8)
+ "Hello, \360\237\221\274\360\237\217\273\360\237\247\221\360\237\217\274\342\200\215\360\237\244\235\342\200\215\360\237\247\221\360\237\217\273"))
+ (should (= (length (encode-coding-string "Hello, 👼🏻🧑🏼‍🤝‍🧑🏻" 'utf-8)) 41))
+ (should (equal (string-limit "Hello, 👼🏻🧑🏼‍🤝‍🧑🏻" 100 nil 'utf-8)
+ "Hello, \360\237\221\274\360\237\217\273\360\237\247\221\360\237\217\274\342\200\215\360\237\244\235\342\200\215\360\237\247\221\360\237\217\273"))
+ (should (equal (string-limit "Hello, 👼🏻🧑🏼‍🤝‍🧑🏻" 15 nil 'utf-8)
+ "Hello, \360\237\221\274\360\237\217\273"))
+ (should (equal (string-limit "Hello, 👼🏻🧑🏼‍🤝‍🧑🏻" 10 nil 'utf-8)
+ "Hello, ")))
(ert-deftest subr-string-lines ()
(should (equal (string-lines "foo") '("foo")))