summaryrefslogtreecommitdiff
path: root/lisp/international/mule-util.el
diff options
context:
space:
mode:
Diffstat (limited to 'lisp/international/mule-util.el')
-rw-r--r--lisp/international/mule-util.el272
1 files changed, 226 insertions, 46 deletions
diff --git a/lisp/international/mule-util.el b/lisp/international/mule-util.el
index 21e09593187..b575c2b7db3 100644
--- a/lisp/international/mule-util.el
+++ b/lisp/international/mule-util.el
@@ -1,6 +1,6 @@
-;;; mule-util.el --- utility functions for multilingual environment (mule)
+;;; mule-util.el --- utility functions for multilingual environment (mule) -*- lexical-binding:t -*-
-;; Copyright (C) 1997-1998, 2000-2013 Free Software Foundation, Inc.
+;; Copyright (C) 1997-1998, 2000-2015 Free Software Foundation, Inc.
;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
;; 2005, 2006, 2007, 2008, 2009, 2010, 2011
;; National Institute of Advanced Industrial Science and Technology (AIST)
@@ -30,8 +30,7 @@
;;; Code:
-;;; String manipulations while paying attention to multibyte
-;;; characters.
+;;; String manipulations while paying attention to multibyte characters.
;;;###autoload
(defsubst string-to-list (string)
@@ -49,13 +48,15 @@
(if (integerp obj)
(aset string idx obj)
(let ((len1 (length obj))
- (len2 (length string))
(i 0))
(while (< i len1)
(aset string (+ idx i) (aref obj i))
(setq i (1+ i)))))
string)
+(defvar truncate-string-ellipsis "..." ;"…"
+ "String to use to indicate truncation.")
+
;;;###autoload
(defun truncate-string-to-width (str end-column
&optional start-column padding ellipsis)
@@ -80,14 +81,13 @@ If ELLIPSIS is non-nil, it should be a string which will replace the
end of STR (including any padding) if it extends beyond END-COLUMN,
unless the display width of STR is equal to or less than the display
width of ELLIPSIS. If it is non-nil and not a string, then ELLIPSIS
-defaults to \"...\"."
+defaults to `truncate-string-ellipsis'."
(or start-column
(setq start-column 0))
(when (and ellipsis (not (stringp ellipsis)))
- (setq ellipsis "..."))
+ (setq ellipsis truncate-string-ellipsis))
(let ((str-len (length str))
(str-width (string-width str))
- (ellipsis-len (if ellipsis (length ellipsis) 0))
(ellipsis-width (if ellipsis (string-width ellipsis) 0))
(idx 0)
(column 0)
@@ -126,8 +126,8 @@ defaults to \"...\"."
tail-padding ellipsis))))
-;;; Nested alist handler. Nested alist is alist whose elements are
-;;; also nested alist.
+;;; Nested alist handler.
+;; Nested alist is alist whose elements are also nested alist.
;;;###autoload
(defsubst nested-alist-p (obj)
@@ -273,43 +273,223 @@ per-character basis, this may not be accurate."
((not enable-multibyte-characters)
;; Maybe there's a font for it, but we can't put it in the buffer.
nil)
- ((display-multi-font-p)
- ;; On a window system, a character is displayable if we have
- ;; a font for that character in the default face of the
- ;; currently selected frame.
- (car (internal-char-font nil char)))
(t
- ;; On a terminal, a character is displayable if the coding
- ;; system for the terminal can encode it.
- (let ((coding (terminal-coding-system)))
- (when coding
- (let ((cs-list (coding-system-get coding :charset-list)))
- (cond
- ((listp cs-list)
- (catch 'tag
- (mapc #'(lambda (charset)
- (if (encode-char char charset)
- (throw 'tag charset)))
- cs-list)
- nil))
- ((eq cs-list 'iso-2022)
- (catch 'tag2
- (mapc #'(lambda (charset)
- (if (and (plist-get (charset-plist charset)
- :iso-final-char)
- (encode-char char charset))
- (throw 'tag2 charset)))
- charset-list)
- nil))
- ((eq cs-list 'emacs-mule)
- (catch 'tag3
- (mapc #'(lambda (charset)
- (if (and (plist-get (charset-plist charset)
- :emacs-mule-id)
- (encode-char char charset))
- (throw 'tag3 charset)))
- charset-list)
- nil)))))))))
+ (let ((font-glyph (internal-char-font nil char)))
+ (if font-glyph
+ (if (consp font-glyph)
+ ;; On a window system, a character is displayable
+ ;; if a font for that character is in the default
+ ;; face of the currently selected frame.
+ (car font-glyph)
+ ;; On a text terminal supporting glyph codes, CHAR is
+ ;; displayable if its glyph code is nonnegative.
+ (<= 0 font-glyph))
+ ;; On a text terminal without glyph codes, CHAR is displayable
+ ;; if the coding system for the terminal can encode it.
+ (let ((coding (terminal-coding-system)))
+ (when coding
+ (let ((cs-list (coding-system-get coding :charset-list)))
+ (cond
+ ((listp cs-list)
+ (catch 'tag
+ (mapc #'(lambda (charset)
+ (if (encode-char char charset)
+ (throw 'tag charset)))
+ cs-list)
+ nil))
+ ((eq cs-list 'iso-2022)
+ (catch 'tag2
+ (mapc #'(lambda (charset)
+ (if (and (plist-get (charset-plist charset)
+ :iso-final-char)
+ (encode-char char charset))
+ (throw 'tag2 charset)))
+ charset-list)
+ nil))
+ ((eq cs-list 'emacs-mule)
+ (catch 'tag3
+ (mapc #'(lambda (charset)
+ (if (and (plist-get (charset-plist charset)
+ :emacs-mule-id)
+ (encode-char char charset))
+ (throw 'tag3 charset)))
+ charset-list)
+ nil)))))))))))
+
+(defun filepos-to-bufferpos--dos (byte f)
+ (let ((eol-offset 0)
+ ;; Make sure we terminate, even if BYTE falls right in the middle
+ ;; of a CRLF or some other weird corner case.
+ (omin 0) (omax most-positive-fixnum)
+ pos lines)
+ (while
+ (progn
+ (setq pos (funcall f (- byte eol-offset)))
+ ;; Protect against accidental values of BYTE outside of the
+ ;; valid region.
+ (when (null pos)
+ (if (<= byte eol-offset)
+ (setq pos (point-min))
+ (setq pos (point-max))))
+ ;; Adjust POS for DOS EOL format.
+ (setq lines (1- (line-number-at-pos pos)))
+ (and (not (= lines eol-offset)) (> omax omin)))
+ (if (> lines eol-offset)
+ (setq omax (min (1- omax) lines)
+ eol-offset omax)
+ (setq omin (max (1+ omin) lines)
+ eol-offset omin)))
+ pos))
+
+;;;###autoload
+(defun filepos-to-bufferpos (byte &optional quality coding-system)
+ "Try to return the buffer position corresponding to a particular file position.
+The file position is given as a (0-based) BYTE count.
+The function presumes the file is encoded with CODING-SYSTEM, which defaults
+to `buffer-file-coding-system'.
+QUALITY can be:
+ `approximate', in which case we may cut some corners to avoid
+ excessive work.
+ `exact', in which case we may end up re-(en/de)coding a large
+ part of the file/buffer.
+ nil, in which case we may return nil rather than an approximation."
+ (unless coding-system (setq coding-system buffer-file-coding-system))
+ (let ((eol (coding-system-eol-type coding-system))
+ (type (coding-system-type coding-system))
+ (base (coding-system-base coding-system))
+ (pm (save-restriction (widen) (point-min))))
+ (and (eq type 'utf-8)
+ ;; Any post-read/pre-write conversions mean it's not really UTF-8.
+ (not (null (coding-system-get coding-system :post-read-conversion)))
+ (setq type 'not-utf-8))
+ (and (memq type '(charset raw-text undecided))
+ ;; The following are all of type 'charset', but they are
+ ;; actually variable-width encodings.
+ (not (memq base '(chinese-gbk chinese-gb18030 euc-tw euc-jis-2004
+ korean-iso-8bit chinese-iso-8bit
+ japanese-iso-8bit chinese-big5-hkscs
+ japanese-cp932 korean-cp949)))
+ (setq type 'single-byte))
+ (pcase type
+ (`utf-8
+ (when (coding-system-get coding-system :bom)
+ (setq byte (max 0 (- byte 3))))
+ (if (= eol 1)
+ (filepos-to-bufferpos--dos (+ pm byte) #'byte-to-position)
+ (byte-to-position (+ pm byte))))
+ (`single-byte
+ (if (= eol 1)
+ (filepos-to-bufferpos--dos (+ pm byte) #'identity)
+ (+ pm byte)))
+ ((and `utf-16
+ ;; FIXME: For utf-16, we could use the same approach as used for
+ ;; dos EOLs (counting the number of non-BMP chars instead of the
+ ;; number of lines).
+ (guard (not (eq quality 'exact))))
+ ;; Account for BOM, which is always 2 bytes in UTF-16.
+ (when (coding-system-get coding-system :bom)
+ (setq byte (max 0 (- byte 2))))
+ ;; In approximate mode, assume all characters are within the
+ ;; BMP, i.e. take up 2 bytes.
+ (setq byte (/ byte 2))
+ (if (= eol 1)
+ (filepos-to-bufferpos--dos (+ pm byte) #'identity)
+ (+ pm byte)))
+ (_
+ (pcase quality
+ (`approximate (byte-to-position (+ pm byte)))
+ (`exact
+ ;; Rather than assume that the file exists and still holds the right
+ ;; data, we reconstruct it based on the buffer's content.
+ (let ((buf (current-buffer)))
+ (with-temp-buffer
+ (set-buffer-multibyte nil)
+ (let ((tmp-buf (current-buffer)))
+ (with-current-buffer buf
+ (save-restriction
+ (widen)
+ ;; Since encoding should always return more bytes than
+ ;; there were chars, encoding all chars up to (+ byte pm)
+ ;; guarantees the encoded result has at least `byte' bytes.
+ (encode-coding-region pm (min (point-max) (+ pm byte))
+ coding-system tmp-buf)))
+ (+ pm (length
+ (decode-coding-region (point-min)
+ (min (point-max) (+ pm byte))
+ coding-system t))))))))))))
+;;;###autoload
+(defun bufferpos-to-filepos (position &optional quality coding-system)
+ "Try to return the file byte corresponding to a particular buffer POSITION.
+Value is the file position given as a (0-based) byte count.
+The function presumes the file is encoded with CODING-SYSTEM, which defaults
+to `buffer-file-coding-system'.
+QUALITY can be:
+ `approximate', in which case we may cut some corners to avoid
+ excessive work.
+ `exact', in which case we may end up re-(en/de)coding a large
+ part of the file/buffer.
+ nil, in which case we may return nil rather than an approximation."
+ (unless coding-system (setq coding-system buffer-file-coding-system))
+ (let* ((eol (coding-system-eol-type coding-system))
+ (lineno (if (= eol 1) (1- (line-number-at-pos position)) 0))
+ (type (coding-system-type coding-system))
+ (base (coding-system-base coding-system))
+ byte)
+ (and (eq type 'utf-8)
+ ;; Any post-read/pre-write conversions mean it's not really UTF-8.
+ (not (null (coding-system-get coding-system :post-read-conversion)))
+ (setq type 'not-utf-8))
+ (and (memq type '(charset raw-text undecided))
+ ;; The following are all of type 'charset', but they are
+ ;; actually variable-width encodings.
+ (not (memq base '(chinese-gbk chinese-gb18030 euc-tw euc-jis-2004
+ korean-iso-8bit chinese-iso-8bit
+ japanese-iso-8bit chinese-big5-hkscs
+ japanese-cp932 korean-cp949)))
+ (setq type 'single-byte))
+ (pcase type
+ (`utf-8
+ (setq byte (position-bytes position))
+ (when (null byte)
+ (if (<= position 0)
+ (setq byte 1)
+ (setq byte (position-bytes (point-max)))))
+ (setq byte (1- byte))
+ (+ byte
+ ;; Account for BOM, if any.
+ (if (coding-system-get coding-system :bom) 3 0)
+ ;; Account for CR in CRLF pairs.
+ lineno))
+ (`single-byte
+ (+ position -1 lineno))
+ ((and `utf-16
+ ;; FIXME: For utf-16, we could use the same approach as used for
+ ;; dos EOLs (counting the number of non-BMP chars instead of the
+ ;; number of lines).
+ (guard (not (eq quality 'exact))))
+ ;; In approximate mode, assume all characters are within the
+ ;; BMP, i.e. each one takes up 2 bytes.
+ (+ (* (1- position) 2)
+ ;; Account for BOM, if any.
+ (if (coding-system-get coding-system :bom) 2 0)
+ ;; Account for CR in CRLF pairs.
+ lineno))
+ (_
+ (pcase quality
+ (`approximate (+ (position-bytes position) -1 lineno))
+ (`exact
+ ;; Rather than assume that the file exists and still holds the right
+ ;; data, we reconstruct its relevant portion.
+ (let ((buf (current-buffer)))
+ (with-temp-buffer
+ (set-buffer-multibyte nil)
+ (let ((tmp-buf (current-buffer)))
+ (with-current-buffer buf
+ (save-restriction
+ (widen)
+ (encode-coding-region (point-min) (min (point-max) position)
+ coding-system tmp-buf)))
+ (1- (point-max)))))))))))
(provide 'mule-util)