summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2011-05-04 20:51:59 +0000
committer: milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2011-05-04 20:51:59 +0000
commit: 99da4e158382bb0ac361d6dcd89b9595c7bac674 (patch)
tree: 6815e56a9e64eb07738d9820b0a6c9f9ff5a801a
parent: 2018351fcc57a4be20a5a1065a6d4bd5b0b8c72b (diff)
download: docutils-99da4e158382bb0ac361d6dcd89b9595c7bac674.tar.gz
More robust guess of input/output encoding.
The locale encoding is stored as `frontend.locale_encoding` and used in command line parsing and `default_error_encoding` before falling back to "ascii".

git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk@7023 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
-rw-r--r-- docutils/docs/dev/todo.txt 39
-rw-r--r-- docutils/docutils/core.py 9
-rw-r--r-- docutils/docutils/frontend.py 21
-rw-r--r-- docutils/docutils/utils.py 8
4 files changed, 48 insertions, 29 deletions
diff --git a/docutils/docs/dev/todo.txt b/docutils/docs/dev/todo.txt
index 331a53ff3..dbc66b710 100644
--- a/docutils/docs/dev/todo.txt
+++ b/docutils/docs/dev/todo.txt
@@ -72,9 +72,26 @@ for inclusion in the Python standard library.
General
=======
-* We don't have consistent (or no) encoding handling for command line
- arguments. See
- <http://thread.gmane.org/gmane.text.docutils.user/2890/focus=2957>.
+* Encoding of command line arguments can only be guessed:
+
+ * try UTF-8/strict first, then try the locale's encoding with
+ strict error handling, then ASCII/replace?
+
+ UTF-8 is almost 100% safe to try first; false positives are rare.
+ The locale's encoding with strict error handling may be a
+ reasonable compromise, but any error would indicate that the
+ locale's encoding is inappropriate. The only safe fallback is
+ ASCII/replace.
+
+ * Do not decode argv before option parsing but individual string
+ values?
+
+ +1 Allows for separate command-line vs. filesystem encodings,
+ respectively to keep file names encoded.
+ +1 Allows to configure command-line encoding in a config file,
+ -1 More complicated.
+
+ Cf. <http://thread.gmane.org/gmane.text.docutils.user/2890/focus=2957>.
* Improve handling on Windows:
@@ -1232,14 +1249,14 @@ Which equation environments should be supported by the math directive?
+ numbered: `equation`
+ unnumbered: `equation*`
-* multiline (test for ``\\`` outside of a nested environment
+* multiline (test for ``\\`` outside of a nested environment
(e.g. `array` or `cases`)
+ numbered: `align` (number every line)
-
+
(To give one common number to all lines, put them in a `split`
environment. Docutils then places it in an `equation` environment.)
-
+
+ unnumbered: `align*`
+ Sphinx math also supports `gather` (checking for blank lines in
@@ -1284,10 +1301,10 @@ MathML_
latex_math_ is the base for the current latex2mathml_ module used
with ``--math-output=MathML``.
-
+
* Write a new converter based on:
-
- * a generic tokenizer (see e.g. a `latex-codec recipe`_,
+
+ * a generic tokenizer (see e.g. a `latex-codec recipe`_,
`updated latex-codec`_, )
* the Unicode-Char <-> LaTeX mappings database unimathsymbols_
@@ -1303,9 +1320,9 @@ MathML_
.. _ttm: http://hutchinson.belmont.ma.us/tth/mml/
.. _Steve’s LATEX-to-MathML translator:
http://www.gold-saucer.org/mathml/greasemonkey/dist/display-latex
- .. _latex-codec recipe:
+ .. _latex-codec recipe:
http://code.activestate.com/recipes/252124-latex-codec/
- .. _updated latex-codec:
+ .. _updated latex-codec:
http://mirror.ctan.org/biblio/bibtex/utils/mab2bib/latex.py
.. _unimathsymbols: http://milde.users.sourceforge.net/LUCR/Math/
diff --git a/docutils/docutils/core.py b/docutils/docutils/core.py
index 21f8d54be..0484a153f 100644
--- a/docutils/docutils/core.py
+++ b/docutils/docutils/core.py
@@ -22,13 +22,6 @@ from docutils.frontend import OptionParser
from docutils.transforms import Transformer
import docutils.readers.doctree
-try:
- import locale
- argv_encoding = locale.getpreferredencoding()
-except:
- argv_encoding = 'ascii'
-
-
class Publisher:
"""
@@ -156,6 +149,8 @@ class Publisher:
option_parser = self.setup_option_parser(
usage, description, settings_spec, config_section, **defaults)
if argv is None:
+ argv_encoding = (sys.stdin.encoding or frontend.locale_encoding
+ or 'ascii')
argv = [a.decode(argv_encoding) for a in sys.argv[1:]]
self.settings = option_parser.parse_args(argv)
diff --git a/docutils/docutils/frontend.py b/docutils/docutils/frontend.py
index 00c340a87..819a168e3 100644
--- a/docutils/docutils/frontend.py
+++ b/docutils/docutils/frontend.py
@@ -39,6 +39,21 @@ import docutils.nodes
import optparse
from optparse import SUPPRESS_HELP
+# Guess the locale's encoding.
+# If no valid guess can be made, locale_encoding is set to `None`:
+try:
+ import locale # module missing in Jython
+except ImportError:
+ locale_encoding = None
+else:
+ locale_encoding = locale.getlocale()[1] or locale.getdefaultlocale()[1]
+ # locale.getpreferredencoding([do_setlocale=True|False])
+ # has side-effects | might return a wrong guess.
+ # (cf. Update 1 in http://stackoverflow.com/questions/4082645/using-python-2-xs-locale-module-to-format-numbers-and-currency)
+ try:
+ codecs.lookup(locale_encoding)
+ except LookupError:
+ locale_encoding = None
def store_multiple(option, opt, value, parser, *args, **kwargs):
"""
@@ -313,10 +328,8 @@ class OptionParser(optparse.OptionParser, docutils.SettingsSpec):
'0': 0, 'off': 0, 'no': 0, 'false': 0, '': 0}
"""Lookup table for boolean configuration file settings."""
- try:
- default_error_encoding = sys.stderr.encoding or 'ascii'
- except AttributeError:
- default_error_encoding = 'ascii'
+ default_error_encoding = getattr(sys.stderr, 'encoding',
+ None) or locale_encoding or 'ascii'
default_error_encoding_error_handler = 'backslashreplace'
diff --git a/docutils/docutils/utils.py b/docutils/docutils/utils.py
index f1320868c..a5d75734c 100644
--- a/docutils/docutils/utils.py
+++ b/docutils/docutils/utils.py
@@ -116,13 +116,7 @@ class Reporter:
self.stream = stream
"""Where warning output is sent."""
- if encoding is None:
- try:
- encoding = stream.encoding
- except AttributeError:
- pass
-
- self.encoding = encoding or 'ascii'
+ self.encoding = encoding or getattr(stream, 'encoding', 'ascii')
"""The output character encoding."""
self.observers = []