diff options
Diffstat (limited to 'i18n/polib.py')
-rw-r--r-- | i18n/polib.py | 1501 |
1 files changed, 771 insertions, 730 deletions
diff --git a/i18n/polib.py b/i18n/polib.py index c09aebc..88428ce 100644 --- a/i18n/polib.py +++ b/i18n/polib.py @@ -5,48 +5,113 @@ # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: """ -**polib** allows you to manipulate, create, modify gettext files (pot, po and -mo files). You can load existing files, iterate through it's entries, add, -modify entries, comments or metadata, etc. or create new po files from scratch. - -**polib** provides a simple and pythonic API via the :func:`~polib.pofile` and -:func:`~polib.mofile` convenience functions. +**polib** allows you to manipulate, create, modify gettext files (pot, po +and mo files). You can load existing files, iterate through it's entries, +add, modify entries, comments or metadata, etc... or create new po files +from scratch. + +**polib** provides a simple and pythonic API, exporting only three +convenience functions (*pofile*, *mofile* and *detect_encoding*), and the +four core classes, *POFile*, *MOFile*, *POEntry* and *MOEntry* for creating +new files/entries. + +**Basic example**: + +>>> import polib +>>> # load an existing po file +>>> po = polib.pofile('tests/test_utf8.po') +>>> for entry in po: +... # do something with entry... +... pass +>>> # add an entry +>>> entry = polib.POEntry(msgid='Welcome', msgstr='Bienvenue') +>>> entry.occurrences = [('welcome.py', '12'), ('anotherfile.py', '34')] +>>> po.append(entry) +>>> # to save our modified po file: +>>> # po.save() +>>> # or you may want to compile the po file +>>> # po.save_as_mofile('tests/test_utf8.mo') """ -__author__ = 'David Jean Louis <izimobil@gmail.com>' -__version__ = '0.6.4' +__author__ = 'David JEAN LOUIS <izimobil@gmail.com>' +__version__ = '0.5.2' __all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry', 'detect_encoding', 'escape', 'unescape', 'detect_encoding',] -import array import codecs -import os -import re import struct -import sys import textwrap import types +import re - -# the default encoding to use when encoding cannot be detected default_encoding = 'utf-8' -# _pofile_or_mofile {{{ +# function pofile() {{{ -def _pofile_or_mofile(f, type, **kwargs): +def pofile(fpath, **kwargs): """ - Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to - honor the DRY concept. + Convenience function that parse the po/pot file *fpath* and return + a POFile instance. + + **Keyword arguments**: + - *fpath*: string, full or relative path to the po/pot file to parse + - *wrapwidth*: integer, the wrap width, only useful when -w option was + passed to xgettext (optional, default to 78) + - *autodetect_encoding*: boolean, if set to False the function will + not try to detect the po file encoding (optional, default to True) + - *encoding*: string, an encoding, only relevant if autodetect_encoding + is set to False + - *check_for_duplicates*: whether to check for duplicate entries when + adding entries to the file, default: False (optional) + + **Example**: + + >>> import polib + >>> po = polib.pofile('tests/test_weird_occurrences.po', + ... check_for_duplicates=True) + >>> po #doctest: +ELLIPSIS + <POFile instance at ...> + >>> import os, tempfile + >>> all_attrs = ('msgctxt', 'msgid', 'msgstr', 'msgid_plural', + ... 'msgstr_plural', 'obsolete', 'comment', 'tcomment', + ... 'occurrences', 'flags', 'previous_msgctxt', + ... 'previous_msgid', 'previous_msgid_plural') + >>> for fname in ['test_iso-8859-15.po', 'test_utf8.po']: + ... orig_po = polib.pofile('tests/'+fname) + ... tmpf = tempfile.NamedTemporaryFile().name + ... orig_po.save(tmpf) + ... try: + ... new_po = polib.pofile(tmpf) + ... for old, new in zip(orig_po, new_po): + ... for attr in all_attrs: + ... if getattr(old, attr) != getattr(new, attr): + ... getattr(old, attr) + ... getattr(new, attr) + ... finally: + ... os.unlink(tmpf) + >>> po_file = polib.pofile('tests/test_save_as_mofile.po') + >>> tmpf = tempfile.NamedTemporaryFile().name + >>> po_file.save_as_mofile(tmpf) + >>> try: + ... mo_file = polib.mofile(tmpf) + ... for old, new in zip(po_file, mo_file): + ... if po_file._encode(old.msgid) != mo_file._encode(new.msgid): + ... 'OLD: ', po_file._encode(old.msgid) + ... 'NEW: ', mo_file._encode(new.msgid) + ... if po_file._encode(old.msgstr) != mo_file._encode(new.msgstr): + ... 'OLD: ', po_file._encode(old.msgstr) + ... 'NEW: ', mo_file._encode(new.msgstr) + ... print new.msgstr + ... finally: + ... os.unlink(tmpf) """ - # get the file encoding - enc = kwargs.get('encoding') - if enc is None: - enc = detect_encoding(f, type == 'mofile') - - # parse the file - kls = type == 'pofile' and _POFileParser or _MOFileParser - parser = kls( - f, + if kwargs.get('autodetect_encoding', True): + enc = detect_encoding(fpath) + else: + enc = kwargs.get('encoding', default_encoding) + check_for_duplicates = kwargs.get('check_for_duplicates', False) + parser = _POFileParser( + fpath, encoding=enc, check_for_duplicates=kwargs.get('check_for_duplicates', False) ) @@ -55,108 +120,95 @@ def _pofile_or_mofile(f, type, **kwargs): return instance # }}} -# function pofile() {{{ - -def pofile(pofile, **kwargs): - """ - Convenience function that parses the po or pot file ``pofile`` and returns - a :class:`~polib.POFile` instance. - - Arguments: - - ``pofile`` - string, full or relative path to the po/pot file or its content (data). - - ``wrapwidth`` - integer, the wrap width, only useful when the ``-w`` option was passed - to xgettext (optional, default: ``78``). - - ``encoding`` - string, the encoding to use (e.g. "utf-8") (default: ``None``, the - encoding will be auto-detected). - - ``check_for_duplicates`` - whether to check for duplicate entries when adding entries to the - file (optional, default: ``False``). - """ - return _pofile_or_mofile(pofile, 'pofile', **kwargs) - -# }}} # function mofile() {{{ -def mofile(mofile, **kwargs): +def mofile(fpath, **kwargs): """ - Convenience function that parses the mo file ``mofile`` and returns a - :class:`~polib.MOFile` instance. - - Arguments: - - ``mofile`` - string, full or relative path to the mo file or its content (data). - - ``wrapwidth`` - integer, the wrap width, only useful when the ``-w`` option was passed - to xgettext to generate the po file that was used to format the mo file - (optional, default: ``78``). - - ``encoding`` - string, the encoding to use (e.g. "utf-8") (default: ``None``, the - encoding will be auto-detected). - - ``check_for_duplicates`` - whether to check for duplicate entries when adding entries to the - file (optional, default: ``False``). + Convenience function that parse the mo file *fpath* and return + a MOFile instance. + + **Keyword arguments**: + - *fpath*: string, full or relative path to the mo file to parse + - *wrapwidth*: integer, the wrap width, only useful when -w option was + passed to xgettext to generate the po file that was used to format + the mo file (optional, default to 78) + - *autodetect_encoding*: boolean, if set to False the function will + not try to detect the po file encoding (optional, default to True) + - *encoding*: string, an encoding, only relevant if autodetect_encoding + is set to False + - *check_for_duplicates*: whether to check for duplicate entries when + adding entries to the file, default: False (optional) + + **Example**: + + >>> import polib + >>> mo = polib.mofile('tests/test_utf8.mo', check_for_duplicates=True) + >>> mo #doctest: +ELLIPSIS + <MOFile instance at ...> + >>> import os, tempfile + >>> for fname in ['test_iso-8859-15.mo', 'test_utf8.mo']: + ... orig_mo = polib.mofile('tests/'+fname) + ... tmpf = tempfile.NamedTemporaryFile().name + ... orig_mo.save(tmpf) + ... try: + ... new_mo = polib.mofile(tmpf) + ... for old, new in zip(orig_mo, new_mo): + ... if old.msgid != new.msgid: + ... old.msgstr + ... new.msgstr + ... finally: + ... os.unlink(tmpf) """ - return _pofile_or_mofile(mofile, 'mofile', **kwargs) + if kwargs.get('autodetect_encoding', True): + enc = detect_encoding(fpath, True) + else: + enc = kwargs.get('encoding', default_encoding) + parser = _MOFileParser( + fpath, + encoding=enc, + check_for_duplicates=kwargs.get('check_for_duplicates', False) + ) + instance = parser.parse() + instance.wrapwidth = kwargs.get('wrapwidth', 78) + return instance # }}} # function detect_encoding() {{{ -def detect_encoding(file, binary_mode=False): +def detect_encoding(fpath, binary_mode=False): """ - Try to detect the encoding used by the ``file``. The ``file`` argument can - be a PO or MO file path or a string containing the contents of the file. - If the encoding cannot be detected, the function will return the value of - ``default_encoding``. - - Arguments: - - ``file`` - string, full or relative path to the po/mo file or its content. - - ``binary_mode`` - boolean, set this to True if ``file`` is a mo file. + Try to detect the encoding used by the file *fpath*. The function will + return polib default *encoding* if it's unable to detect it. + + **Keyword argument**: + - *fpath*: string, full or relative path to the mo file to parse. + + **Examples**: + + >>> print(detect_encoding('tests/test_noencoding.po')) + utf-8 + >>> print(detect_encoding('tests/test_utf8.po')) + UTF-8 + >>> print(detect_encoding('tests/test_utf8.mo', True)) + UTF-8 + >>> print(detect_encoding('tests/test_iso-8859-15.po')) + ISO_8859-15 + >>> print(detect_encoding('tests/test_iso-8859-15.mo', True)) + ISO_8859-15 """ + import re rx = re.compile(r'"?Content-Type:.+? charset=([\w_\-:\.]+)') - - def charset_exists(charset): - """Check whether ``charset`` is valid or not.""" - try: - codecs.lookup(charset) - except LookupError: - return False - return True - - if not os.path.exists(file): - match = rx.search(file) - if match: - enc = match.group(1).strip() - if charset_exists(enc): - return enc + if binary_mode: + mode = 'rb' else: - if binary_mode: - mode = 'rb' - else: - mode = 'r' - f = open(file, mode) - for l in f.readlines(): - match = rx.search(l) - if match: - f.close() - enc = match.group(1).strip() - if charset_exists(enc): - return enc - f.close() + mode = 'r' + f = open(fpath, mode) + for l in f.readlines(): + match = rx.search(l) + if match: + f.close() + return match.group(1).strip() + f.close() return default_encoding # }}} @@ -164,8 +216,12 @@ def detect_encoding(file, binary_mode=False): def escape(st): """ - Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in - the given string ``st`` and returns it. + Escape special chars and return the given string *st*. + + **Examples**: + + >>> escape('\\t and \\n and \\r and " and \\\\') + '\\\\t and \\\\n and \\\\r and \\\\" and \\\\\\\\' """ return st.replace('\\', r'\\')\ .replace('\t', r'\t')\ @@ -178,8 +234,18 @@ def escape(st): def unescape(st): """ - Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in - the given string ``st`` and returns it. + Unescape special chars and return the given string *st*. + + **Examples**: + + >>> unescape('\\\\t and \\\\n and \\\\r and \\\\" and \\\\\\\\') + '\\t and \\n and \\r and " and \\\\' + >>> unescape(r'\\n') + '\\n' + >>> unescape(r'\\\\n') + '\\\\n' + >>> unescape(r'\\\\n\\n') + '\\\\n\\n' """ def unescape_repl(m): m = m.group(1) @@ -199,36 +265,27 @@ def unescape(st): class _BaseFile(list): """ - Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile` - classes. This class should **not** be instanciated directly. + Common parent class for POFile and MOFile classes. + This class must **not** be instanciated directly. """ def __init__(self, *args, **kwargs): """ - Constructor, accepts the following keyword arguments: - - ``pofile`` - string, the path to the po or mo file, or its content as a string. - - ``wrapwidth`` - integer, the wrap width, only useful when the ``-w`` option was - passed to xgettext (optional, default: ``78``). - - ``encoding`` - string, the encoding to use, defaults to ``default_encoding`` - global variable (optional). + Constructor. - ``check_for_duplicates`` - whether to check for duplicate entries when adding entries to the - file, (optional, default: ``False``). + **Keyword arguments**: + - *fpath*: string, path to po or mo file + - *wrapwidth*: integer, the wrap width, only useful when -w option + was passed to xgettext to generate the po file that was used to + format the mo file, default to 78 (optional), + - *encoding*: string, the encoding to use, defaults to + "default_encoding" global variable (optional), + - *check_for_duplicates*: whether to check for duplicate entries + when adding entries to the file, default: False (optional). """ list.__init__(self) # the opened file handle - pofile = kwargs.get('pofile', None) - if pofile and os.path.exists(pofile): - self.fpath = pofile - else: - self.fpath = kwargs.get('fpath') + self.fpath = kwargs.get('fpath') # the width at which lines should be wrapped self.wrapwidth = kwargs.get('wrapwidth', 78) # the file encoding @@ -241,56 +298,66 @@ class _BaseFile(list): self.metadata = {} self.metadata_is_fuzzy = 0 - def __unicode__(self): + def __str__(self): """ - Returns the unicode representation of the file. + String representation of the file. """ ret = [] entries = [self.metadata_as_entry()] + \ [e for e in self if not e.obsolete] for entry in entries: - ret.append(entry.__unicode__(self.wrapwidth)) + ret.append(entry.__str__(self.wrapwidth)) for entry in self.obsolete_entries(): - ret.append(entry.__unicode__(self.wrapwidth)) - ret = '\n'.join(ret) - - if type(ret) != types.UnicodeType: - return unicode(ret, self.encoding) - return ret - - def __str__(self): - """ - Returns the string representation of the file. - """ - return unicode(self).encode(self.encoding) + ret.append(entry.__str__(self.wrapwidth)) + return '\n'.join(ret) def __contains__(self, entry): """ - Overriden ``list`` method to implement the membership test (in and - not in). - The method considers that an entry is in the file if it finds an entry - that has the same msgid (the test is **case sensitive**). - - Argument: - - ``entry`` - an instance of :class:`~polib._BaseEntry`. + Overriden method to implement the membership test (in and not in). + The method considers that an entry is in the file if it finds an + entry that has the same msgid (case sensitive). + + **Keyword argument**: + - *entry*: an instance of polib._BaseEntry + + **Tests**: + >>> po = POFile() + >>> e1 = POEntry(msgid='foobar', msgstr='spam') + >>> e2 = POEntry(msgid='barfoo', msgstr='spam') + >>> e3 = POEntry(msgid='foobar', msgstr='eggs') + >>> e4 = POEntry(msgid='spameggs', msgstr='eggs') + >>> po.append(e1) + >>> po.append(e2) + >>> e1 in po + True + >>> e2 not in po + False + >>> e3 in po + True + >>> e4 in po + False """ return self.find(entry.msgid, by='msgid') is not None - - def __eq__(self, other): - return unicode(self) == unicode(other) def append(self, entry): """ Overriden method to check for duplicates entries, if a user tries to - add an entry that is already in the file, the method will raise a - ``ValueError`` exception. - - Argument: - - ``entry`` - an instance of :class:`~polib._BaseEntry`. + add an entry that already exists, the method will raise a ValueError + exception. + + **Keyword argument**: + - *entry*: an instance of polib._BaseEntry + + **Tests**: + >>> e1 = POEntry(msgid='foobar', msgstr='spam') + >>> e2 = POEntry(msgid='foobar', msgstr='eggs') + >>> po = POFile(check_for_duplicates=True) + >>> po.append(e1) + >>> try: + ... po.append(e2) + ... except ValueError, e: + ... unicode(e) + u'Entry "foobar" already exists' """ if self.check_for_duplicates and entry in self: raise ValueError('Entry "%s" already exists' % entry.msgid) @@ -299,50 +366,70 @@ class _BaseFile(list): def insert(self, index, entry): """ Overriden method to check for duplicates entries, if a user tries to - add an entry that is already in the file, the method will raise a - ``ValueError`` exception. - - Arguments: - - ``index`` - index at which the entry should be inserted. - - ``entry`` - an instance of :class:`~polib._BaseEntry`. + insert an entry that already exists, the method will raise a ValueError + exception. + + **Keyword arguments**: + - *index*: index at which the entry should be inserted + - *entry*: an instance of polib._BaseEntry + + **Tests**: + >>> import polib + >>> polib.check_for_duplicates = True + >>> e1 = POEntry(msgid='foobar', msgstr='spam') + >>> e2 = POEntry(msgid='barfoo', msgstr='eggs') + >>> e3 = POEntry(msgid='foobar', msgstr='eggs') + >>> po = POFile(check_for_duplicates=True) + >>> po.insert(0, e1) + >>> po.insert(1, e2) + >>> try: + ... po.insert(0, e3) + ... except ValueError, e: + ... unicode(e) + u'Entry "foobar" already exists' """ if self.check_for_duplicates and entry in self: raise ValueError('Entry "%s" already exists' % entry.msgid) super(_BaseFile, self).insert(index, entry) + def __repr__(self): + """Return the official string representation of the object.""" + return '<%s instance at %x>' % (self.__class__.__name__, id(self)) + def metadata_as_entry(self): """ - Returns the file metadata as a :class:`~polib.POFile` instance. + Return the metadata as an entry: + + >>> import polib + >>> po = polib.pofile('tests/test_fuzzy_header.po') + >>> unicode(po) == unicode(open('tests/test_fuzzy_header.po').read()) + True """ e = POEntry(msgid='') mdata = self.ordered_metadata() if mdata: strs = [] + e._multiline_str['msgstr'] = '' for name, value in mdata: # Strip whitespace off each line in a multi-line entry strs.append('%s: %s' % (name, value)) e.msgstr = '\n'.join(strs) + '\n' + e._multiline_str['msgstr'] = '__POLIB__NL__'.join( + [s + '\n' for s in strs]) if self.metadata_is_fuzzy: e.flags.append('fuzzy') return e def save(self, fpath=None, repr_method='__str__'): """ - Saves the po file to ``fpath``. - If it is an existing file and no ``fpath`` is provided, then the - existing file is rewritten with the modified data. - - Keyword arguments: + Save the po file to file *fpath* if no file handle exists for + the object. If there's already an open file and no fpath is + provided, then the existing file is rewritten with the modified + data. - ``fpath`` - string, full or relative path to the file. - - ``repr_method`` - string, the method to use for output. + **Keyword arguments**: + - *fpath*: string, full or relative path to the file. + - *repr_method*: string, the method to use for output. """ if self.fpath is None and fpath is None: raise IOError('You must provide a file path to save() method') @@ -357,47 +444,38 @@ class _BaseFile(list): contents = contents.decode(self.encoding) fhandle.write(contents) fhandle.close() - # set the file path if not set - if self.fpath is None and fpath: - self.fpath = fpath - def find(self, st, by='msgid', include_obsolete_entries=False, - msgctxt=False): + def find(self, st, by='msgid'): """ - Find the entry which msgid (or property identified by the ``by`` - argument) matches the string ``st``. - - Keyword arguments: + Find entry which msgid (or property identified by the *by* + attribute) matches the string *st*. - ``st`` - string, the string to search for. + **Keyword arguments**: + - *st*: string, the string to search for + - *by*: string, the comparison attribute - ``by`` - string, the property to use for comparison (default: ``msgid``). + **Examples**: - ``include_obsolete_entries`` - boolean, whether to also search in entries that are obsolete. - - ``msgctxt`` - string, allows to specify a specific message context for the - search. + >>> po = pofile('tests/test_utf8.po') + >>> entry = po.find('Thursday') + >>> entry.msgstr + u'Jueves' + >>> entry = po.find('Some unexistant msgid') + >>> entry is None + True + >>> entry = po.find('Jueves', 'msgstr') + >>> entry.msgid + u'Thursday' """ - if include_obsolete_entries: - entries = self[:] - else: - entries = [e for e in self if not e.obsolete] - for e in entries: + for e in self: if getattr(e, by) == st: - if msgctxt and e.msgctxt != msgctxt: - continue return e return None def ordered_metadata(self): """ - Convenience method that returns an ordered version of the metadata - dictionnary. The return value is list of tuples (metadata name, - metadata_value). + Convenience method that return the metadata ordered. The return + value is list of tuples (metadata name, metadata_value). """ # copy the dict first metadata = self.metadata.copy() @@ -419,10 +497,9 @@ class _BaseFile(list): ordered_data.append((data, value)) except KeyError: pass - # the rest of the metadata will be alphabetically ordered since there - # are no specs for this AFAIK + # the rest of the metadata won't be ordered there are no specs for this keys = metadata.keys() - keys.sort() + list(keys).sort() for data in keys: value = metadata[data] ordered_data.append((data, value)) @@ -430,51 +507,45 @@ class _BaseFile(list): def to_binary(self): """ - Return the binary representation of the file. + Return the mofile binary representation. """ + import array + import struct + import types offsets = [] entries = self.translated_entries() # the keys are sorted in the .mo file def cmp(_self, other): - # msgfmt compares entries with msgctxt if it exists - self_msgid = _self.msgctxt and _self.msgctxt or _self.msgid - other_msgid = other.msgctxt and other.msgctxt or other.msgid - if self_msgid > other_msgid: + if _self.msgid > other.msgid: return 1 - elif self_msgid < other_msgid: + elif _self.msgid < other.msgid: return -1 else: return 0 # add metadata entry entries.sort(cmp) mentry = self.metadata_as_entry() - #mentry.msgstr = mentry.msgstr.replace('\\n', '').lstrip() + mentry.msgstr = mentry.msgstr.replace('\\n', '').lstrip() entries = [mentry] + entries entries_len = len(entries) ids, strs = '', '' for e in entries: # For each string, we need size and file offset. Each string is # NUL terminated; the NUL does not count into the size. - msgid = '' - if e.msgctxt: - # Contexts are stored by storing the concatenation of the - # context, a <EOT> byte, and the original string - msgid = self._encode(e.msgctxt + '\4') if e.msgid_plural: indexes = e.msgstr_plural.keys() indexes.sort() msgstr = [] for index in indexes: msgstr.append(e.msgstr_plural[index]) - msgid += self._encode(e.msgid + '\0' + e.msgid_plural) + msgid = self._encode(e.msgid + '\0' + e.msgid_plural) msgstr = self._encode('\0'.join(msgstr)) else: - msgid += self._encode(e.msgid) + msgid = self._encode(e.msgid) msgstr = self._encode(e.msgstr) offsets.append((len(ids), len(msgid), len(strs), len(msgstr))) ids += msgid + '\0' strs += msgstr + '\0' - # The header is 7 32-bit unsigned integers. keystart = 7*4+16*entries_len # and the values start after the keys @@ -487,31 +558,22 @@ class _BaseFile(list): koffsets += [l1, o1+keystart] voffsets += [l2, o2+valuestart] offsets = koffsets + voffsets - # check endianness for magic number - if struct.pack('@h', 1) == struct.pack('<h', 1): - magic_number = MOFile.LITTLE_ENDIAN - else: - magic_number = MOFile.BIG_ENDIAN - - output = struct.pack( - "Iiiiiii", - magic_number, # Magic number - 0, # Version - entries_len, # # of entries - 7*4, # start of key index - 7*4+entries_len*8, # start of value index - 0, keystart # size and offset of hash table - # Important: we don't use hash tables - ) - output += array.array("i", offsets).tostring() + output = struct.pack("IIIIIII", + 0x950412de, # Magic number + 0, # Version + entries_len, # # of entries + 7*4, # start of key index + 7*4+entries_len*8, # start of value index + 0, 0) # size and offset of hash table + output += array.array("I", offsets).tostring() output += ids output += strs return output def _encode(self, mixed): """ - Encodes the given ``mixed`` argument with the file encoding if and - only if it's an unicode string and returns the encoded string. + Encode the given argument with the file encoding if the type is unicode + and return the encoded string. """ if type(mixed) == types.UnicodeType: return mixed.encode(self.encoding) @@ -521,43 +583,88 @@ class _BaseFile(list): # class POFile {{{ class POFile(_BaseFile): - """ + ''' Po (or Pot) file reader/writer. - This class inherits the :class:`~polib._BaseFile` class and, by extension, - the python ``list`` type. - """ + POFile objects inherit the list objects methods. + + **Example**: + + >>> po = POFile() + >>> entry1 = POEntry( + ... msgid="Some english text", + ... msgstr="Un texte en anglais" + ... ) + >>> entry1.occurrences = [('testfile', 12),('another_file', 1)] + >>> entry1.comment = "Some useful comment" + >>> entry2 = POEntry( + ... msgid="Peace in some languages", + ... msgstr="Pace سلام שלום Hasîtî 和平" + ... ) + >>> entry2.occurrences = [('testfile', 15),('another_file', 5)] + >>> entry2.comment = "Another useful comment" + >>> entry3 = POEntry( + ... msgid='Some entry with quotes " \\"', + ... msgstr='Un message unicode avec des quotes " \\"' + ... ) + >>> entry3.comment = "Test string quoting" + >>> po.append(entry1) + >>> po.append(entry2) + >>> po.append(entry3) + >>> po.header = "Some Header" + >>> print(po) + # Some Header + msgid "" + msgstr "" + <BLANKLINE> + #. Some useful comment + #: testfile:12 another_file:1 + msgid "Some english text" + msgstr "Un texte en anglais" + <BLANKLINE> + #. Another useful comment + #: testfile:15 another_file:5 + msgid "Peace in some languages" + msgstr "Pace سلام שלום Hasîtî 和平" + <BLANKLINE> + #. Test string quoting + msgid "Some entry with quotes \\" \\"" + msgstr "Un message unicode avec des quotes \\" \\"" + <BLANKLINE> + ''' - def __unicode__(self): - """ - Returns the unicode representation of the po file. - """ + def __str__(self): + """Return the string representation of the po file""" ret, headers = '', self.header.split('\n') for header in headers: if header[:1] in [',', ':']: ret += '#%s\n' % header else: ret += '# %s\n' % header - - if type(ret) != types.UnicodeType: - ret = unicode(ret, self.encoding) - - return ret + _BaseFile.__unicode__(self) + return ret + _BaseFile.__str__(self) def save_as_mofile(self, fpath): """ - Saves the binary representation of the file to given ``fpath``. - - Keyword argument: + Save the binary representation of the file to *fpath*. - ``fpath`` - string, full or relative path to the mo file. + **Keyword arguments**: + - *fpath*: string, full or relative path to the file. """ _BaseFile.save(self, fpath, 'to_binary') def percent_translated(self): """ - Convenience method that returns the percentage of translated + Convenience method that return the percentage of translated messages. + + **Example**: + + >>> import polib + >>> po = polib.pofile('tests/test_pofile_helpers.po') + >>> po.percent_translated() + 50 + >>> po = POFile() + >>> po.percent_translated() + 100 """ total = len([e for e in self if not e.obsolete]) if total == 0: @@ -567,52 +674,91 @@ class POFile(_BaseFile): def translated_entries(self): """ - Convenience method that returns the list of translated entries. + Convenience method that return a list of translated entries. + + **Example**: + + >>> import polib + >>> po = polib.pofile('tests/test_pofile_helpers.po') + >>> len(po.translated_entries()) + 6 """ return [e for e in self if e.translated()] def untranslated_entries(self): """ - Convenience method that returns the list of untranslated entries. + Convenience method that return a list of untranslated entries. + + **Example**: + + >>> import polib + >>> po = polib.pofile('tests/test_pofile_helpers.po') + >>> len(po.untranslated_entries()) + 4 """ return [e for e in self if not e.translated() and not e.obsolete \ and not 'fuzzy' in e.flags] def fuzzy_entries(self): """ - Convenience method that returns the list of fuzzy entries. + Convenience method that return the list of 'fuzzy' entries. + + **Example**: + + >>> import polib + >>> po = polib.pofile('tests/test_pofile_helpers.po') + >>> len(po.fuzzy_entries()) + 2 """ return [e for e in self if 'fuzzy' in e.flags] def obsolete_entries(self): """ - Convenience method that returns the list of obsolete entries. + Convenience method that return the list of obsolete entries. + + **Example**: + + >>> import polib + >>> po = polib.pofile('tests/test_pofile_helpers.po') + >>> len(po.obsolete_entries()) + 4 """ return [e for e in self if e.obsolete] def merge(self, refpot): """ - Convenience method that merges the current pofile with the pot file + XXX this could not work if encodings are different, needs thinking + and general refactoring of how polib handles encoding... + + Convenience method that merge the current pofile with the pot file provided. It behaves exactly as the gettext msgmerge utility: - * comments of this file will be preserved, but extracted comments and - occurrences will be discarded; - * any translations or comments in the file will be discarded, however, - dot comments and file positions will be preserved; - * the fuzzy flags are preserved. + - comments of this file will be preserved, but extracted comments + and occurrences will be discarded + - any translations or comments in the file will be discarded, + however dot comments and file positions will be preserved - Keyword argument: + **Keyword argument**: + - *refpot*: object POFile, the reference catalog. - ``refpot`` - object POFile, the reference catalog. + **Example**: + + >>> import polib + >>> refpot = polib.pofile('tests/test_merge.pot') + >>> po = polib.pofile('tests/test_merge_before.po') + >>> po.merge(refpot) + >>> expected_po = polib.pofile('tests/test_merge_after.po') + >>> unicode(po) == unicode(expected_po) + True """ for entry in refpot: - e = self.find(entry.msgid, include_obsolete_entries=True) + e = self.find(entry.msgid) if e is None: e = POEntry() self.append(e) e.merge(entry) - # ok, now we must "obsolete" entries that are not in the refpot anymore + # ok, now we must "obsolete" entries that are not in the refpot + # anymore for entry in self: if refpot.find(entry.msgid) is None: entry.obsolete = True @@ -621,18 +767,48 @@ class POFile(_BaseFile): # class MOFile {{{ class MOFile(_BaseFile): - """ + ''' Mo file reader/writer. - This class inherits the :class:`~polib._BaseFile` class and, by - extension, the python ``list`` type. - """ - BIG_ENDIAN = 0xde120495 - LITTLE_ENDIAN = 0x950412de + MOFile objects inherit the list objects methods. + + **Example**: + + >>> mo = MOFile() + >>> entry1 = POEntry( + ... msgid="Some english text", + ... msgstr="Un texte en anglais" + ... ) + >>> entry2 = POEntry( + ... msgid="I need my dirty cheese", + ... msgstr="Je veux mon sale fromage" + ... ) + >>> entry3 = MOEntry( + ... msgid='Some entry with quotes " \\"', + ... msgstr='Un message unicode avec des quotes " \\"' + ... ) + >>> mo.append(entry1) + >>> mo.append(entry2) + >>> mo.append(entry3) + >>> print(mo) + msgid "" + msgstr "" + <BLANKLINE> + msgid "Some english text" + msgstr "Un texte en anglais" + <BLANKLINE> + msgid "I need my dirty cheese" + msgstr "Je veux mon sale fromage" + <BLANKLINE> + msgid "Some entry with quotes \\" \\"" + msgstr "Un message unicode avec des quotes \\" \\"" + <BLANKLINE> + ''' def __init__(self, *args, **kwargs): """ - Constructor, accepts all keywords arguments accepted by - :class:`~polib._BaseFile` class. + MOFile constructor. Mo files have two other properties: + - magic_number: the magic_number of the binary file, + - version: the version of the mo spec. """ _BaseFile.__init__(self, *args, **kwargs) self.magic_number = None @@ -640,23 +816,19 @@ class MOFile(_BaseFile): def save_as_pofile(self, fpath): """ - Saves the mofile as a pofile to ``fpath``. + Save the string representation of the file to *fpath*. - Keyword argument: - - ``fpath`` - string, full or relative path to the file. + **Keyword argument**: + - *fpath*: string, full or relative path to the file. """ _BaseFile.save(self, fpath) - def save(self, fpath=None): + def save(self, fpath): """ - Saves the mofile to ``fpath``. - - Keyword argument: + Save the binary representation of the file to *fpath*. - ``fpath`` - string, full or relative path to the file. + **Keyword argument**: + - *fpath*: string, full or relative path to the file. """ _BaseFile.save(self, fpath, 'to_binary') @@ -695,47 +867,29 @@ class MOFile(_BaseFile): class _BaseEntry(object): """ - Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes. - This class should **not** be instanciated directly. + Base class for POEntry or MOEntry objects. + This class must *not* be instanciated directly. """ def __init__(self, *args, **kwargs): - """ - Constructor, accepts the following keyword arguments: - - ``msgid`` - string, the entry msgid. - - ``msgstr`` - string, the entry msgstr. - - ``msgid_plural`` - string, the entry msgid_plural. - - ``msgstr_plural`` - list, the entry msgstr_plural lines. - - ``msgctxt`` - string, the entry context (msgctxt). - - ``obsolete`` - bool, whether the entry is "obsolete" or not. - - ``encoding`` - string, the encoding to use, defaults to ``default_encoding`` - global variable (optional). - """ + """Base Entry constructor.""" self.msgid = kwargs.get('msgid', '') self.msgstr = kwargs.get('msgstr', '') self.msgid_plural = kwargs.get('msgid_plural', '') self.msgstr_plural = kwargs.get('msgstr_plural', {}) - self.msgctxt = kwargs.get('msgctxt', None) self.obsolete = kwargs.get('obsolete', False) self.encoding = kwargs.get('encoding', default_encoding) + self.msgctxt = kwargs.get('msgctxt', None) + self._multiline_str = {} - def __unicode__(self, wrapwidth=78): + def __repr__(self): + """Return the official string representation of the object.""" + return '<%s instance at %x>' % (self.__class__.__name__, id(self)) + + def __str__(self, wrapwidth=78): """ - Returns the unicode representation of the entry. + Common string representation of the POEntry and MOEntry + objects. """ if self.obsolete: delflag = '#~ ' @@ -744,12 +898,12 @@ class _BaseEntry(object): ret = [] # write the msgctxt if any if self.msgctxt is not None: - ret += self._str_field("msgctxt", delflag, "", self.msgctxt, wrapwidth) + ret += self._str_field("msgctxt", delflag, "", self.msgctxt) # write the msgid - ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth) + ret += self._str_field("msgid", delflag, "", self.msgid) # write the msgid_plural if any if self.msgid_plural: - ret += self._str_field("msgid_plural", delflag, "", self.msgid_plural, wrapwidth) + ret += self._str_field("msgid_plural", delflag, "", self.msgid_plural) if self.msgstr_plural: # write the msgstr_plural if any msgstrs = self.msgstr_plural @@ -758,51 +912,23 @@ class _BaseEntry(object): for index in keys: msgstr = msgstrs[index] plural_index = '[%s]' % index - ret += self._str_field("msgstr", delflag, plural_index, msgstr, wrapwidth) + ret += self._str_field("msgstr", delflag, plural_index, msgstr) else: # otherwise write the msgstr - ret += self._str_field("msgstr", delflag, "", self.msgstr, wrapwidth) + ret += self._str_field("msgstr", delflag, "", self.msgstr) ret.append('') - ret = '\n'.join(ret) - - if type(ret) != types.UnicodeType: - return unicode(ret, self.encoding) - return ret - - def __str__(self): - """ - Returns the string representation of the entry. - """ - return unicode(self).encode(self.encoding) - - def __eq__(self, other): - return unicode(self) == unicode(other) + return '\n'.join(ret) - def _str_field(self, fieldname, delflag, plural_index, field, wrapwidth=78): - lines = field.splitlines(True) - if len(lines) > 1: - lines = [''] + lines # start with initial empty line + def _str_field(self, fieldname, delflag, plural_index, field): + if (fieldname + plural_index) in self._multiline_str: + field = self._multiline_str[fieldname + plural_index] + lines = [''] + field.split('__POLIB__NL__') else: - escaped_field = escape(field) - specialchars_count = 0 - for c in ['\\', '\n', '\r', '\t', '"']: - specialchars_count += field.count(c) - # comparison must take into account fieldname length + one space - # + 2 quotes (eg. msgid "<string>") - flength = len(fieldname) + 3 - if plural_index: - flength += len(plural_index) - real_wrapwidth = wrapwidth - flength + specialchars_count - if wrapwidth > 0 and len(field) > real_wrapwidth: - # Wrap the line but take field name into account - lines = [''] + [unescape(item) for item in wrap( - escaped_field, - wrapwidth - 2, # 2 for quotes "" - drop_whitespace=False, - break_long_words=False - )] + lines = field.splitlines(True) + if len(lines) > 1: + lines = ['']+lines # start with initial empty line else: - lines = [field] + lines = [field] # needed for the empty string case if fieldname.startswith('previous_'): # quick and dirty trick to get the real field name fieldname = fieldname[9:] @@ -819,33 +945,50 @@ class _BaseEntry(object): class POEntry(_BaseEntry): """ Represents a po file entry. + + **Examples**: + + >>> entry = POEntry(msgid='Welcome', msgstr='Bienvenue') + >>> entry.occurrences = [('welcome.py', 12), ('anotherfile.py', 34)] + >>> print(entry) + #: welcome.py:12 anotherfile.py:34 + msgid "Welcome" + msgstr "Bienvenue" + <BLANKLINE> + >>> entry = POEntry() + >>> entry.occurrences = [('src/some-very-long-filename-that-should-not-be-wrapped-even-if-it-is-larger-than-the-wrap-limit.c', 32), ('src/eggs.c', 45)] + >>> entry.comment = 'A plural translation. This is a very very very long line please do not wrap, this is just for testing comment wrapping...' + >>> entry.tcomment = 'A plural translation. This is a very very very long line please do not wrap, this is just for testing comment wrapping...' + >>> entry.flags.append('c-format') + >>> entry.previous_msgctxt = '@somecontext' + >>> entry.previous_msgid = 'I had eggs but no spam !' + >>> entry.previous_msgid_plural = 'I had eggs and %d spam !' + >>> entry.msgctxt = '@somenewcontext' + >>> entry.msgid = 'I have spam but no egg !' + >>> entry.msgid_plural = 'I have spam and %d eggs !' + >>> entry.msgstr_plural[0] = "J'ai du jambon mais aucun oeuf !" + >>> entry.msgstr_plural[1] = "J'ai du jambon et %d oeufs !" + >>> print(entry) + #. A plural translation. This is a very very very long line please do not + #. wrap, this is just for testing comment wrapping... + # A plural translation. This is a very very very long line please do not wrap, + # this is just for testing comment wrapping... + #: src/some-very-long-filename-that-should-not-be-wrapped-even-if-it-is-larger-than-the-wrap-limit.c:32 + #: src/eggs.c:45 + #, c-format + #| msgctxt "@somecontext" + #| msgid "I had eggs but no spam !" + #| msgid_plural "I had eggs and %d spam !" + msgctxt "@somenewcontext" + msgid "I have spam but no egg !" + msgid_plural "I have spam and %d eggs !" + msgstr[0] "J'ai du jambon mais aucun oeuf !" + msgstr[1] "J'ai du jambon et %d oeufs !" + <BLANKLINE> """ def __init__(self, *args, **kwargs): - """ - Constructor, accepts the following keyword arguments: - - ``comment`` - string, the entry comment. - - ``tcomment`` - string, the entry translator comment. - - ``occurrences`` - list, the entry occurrences. - - ``flags`` - list, the entry flags. - - ``previous_msgctxt`` - string, the entry previous context. - - ``previous_msgid`` - string, the entry previous msgid. - - ``previous_msgid_plural`` - string, the entry previous msgid_plural. - """ + """POEntry constructor.""" _BaseEntry.__init__(self, *args, **kwargs) self.comment = kwargs.get('comment', '') self.tcomment = kwargs.get('tcomment', '') @@ -855,31 +998,33 @@ class POEntry(_BaseEntry): self.previous_msgid = kwargs.get('previous_msgid', None) self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None) - def __unicode__(self, wrapwidth=78): + def __str__(self, wrapwidth=78): """ - Returns the unicode representation of the entry. + Return the string representation of the entry. """ if self.obsolete: - return _BaseEntry.__unicode__(self, wrapwidth) - + return _BaseEntry.__str__(self) ret = [] - # comments first, if any (with text wrapping as xgettext does) - comments = [('comment', '#. '), ('tcomment', '# ')] - for c in comments: - val = getattr(self, c[0]) - if val: - for comment in val.split('\n'): - if wrapwidth > 0 and len(comment) + len(c[1]) > wrapwidth: - ret += wrap( - comment, - wrapwidth, - initial_indent=c[1], - subsequent_indent=c[1], - break_long_words=False - ) - else: - ret.append('%s%s' % (c[1], comment)) - + # comment first, if any (with text wrapping as xgettext does) + if self.comment != '': + for comment in self.comment.split('\n'): + if wrapwidth > 0 and len(comment) > wrapwidth-3: + ret += textwrap.wrap(comment, wrapwidth, + initial_indent='#. ', + subsequent_indent='#. ', + break_long_words=False) + else: + ret.append('#. %s' % comment) + # translator comment, if any (with text wrapping as xgettext does) + if self.tcomment != '': + for tcomment in self.tcomment.split('\n'): + if wrapwidth > 0 and len(tcomment) > wrapwidth-2: + ret += textwrap.wrap(tcomment, wrapwidth, + initial_indent='# ', + subsequent_indent='# ', + break_long_words=False) + else: + ret.append('# %s' % tcomment) # occurrences (with text wrapping as xgettext does) if self.occurrences: filelist = [] @@ -889,43 +1034,79 @@ class POEntry(_BaseEntry): else: filelist.append(fpath) filestr = ' '.join(filelist) - if wrapwidth > 0 and len(filestr) + 3 > wrapwidth: - # textwrap split words that contain hyphen, this is not + if wrapwidth > 0 and len(filestr)+3 > wrapwidth: + # XXX textwrap split words that contain hyphen, this is not # what we want for filenames, so the dirty hack is to # temporally replace hyphens with a char that a file cannot # contain, like "*" - ret += [l.replace('*', '-') for l in wrap( - filestr.replace('-', '*'), - wrapwidth, - initial_indent='#: ', - subsequent_indent='#: ', - break_long_words=False - )] + lines = textwrap.wrap(filestr.replace('-', '*'), + wrapwidth, + initial_indent='#: ', + subsequent_indent='#: ', + break_long_words=False) + # end of the replace hack + for line in lines: + ret.append(line.replace('*', '-')) else: - ret.append('#: ' + filestr) - - # flags (TODO: wrapping ?) + ret.append('#: '+filestr) + # flags if self.flags: - ret.append('#, %s' % ', '.join(self.flags)) + flags = [] + for flag in self.flags: + flags.append(flag) + ret.append('#, %s' % ', '.join(flags)) # previous context and previous msgid/msgid_plural - fields = ['previous_msgctxt', 'previous_msgid', 'previous_msgid_plural'] - for f in fields: - val = getattr(self, f) - if val: - ret += self._str_field(f, "#| ", "", val, wrapwidth) - - ret.append(_BaseEntry.__unicode__(self, wrapwidth)) - ret = '\n'.join(ret) - - if type(ret) != types.UnicodeType: - return unicode(ret, self.encoding) - return ret + if self.previous_msgctxt: + ret += self._str_field("previous_msgctxt", "#| ", "", + self.previous_msgctxt) + if self.previous_msgid: + ret += self._str_field("previous_msgid", "#| ", "", + self.previous_msgid) + if self.previous_msgid_plural: + ret += self._str_field("previous_msgid_plural", "#| ", "", + self.previous_msgid_plural) + + ret.append(_BaseEntry.__str__(self)) + return '\n'.join(ret) def __cmp__(self, other): - """ + ''' Called by comparison operations if rich comparison is not defined. - """ + + **Tests**: + >>> a = POEntry(msgid='a', occurrences=[('b.py', 1), ('b.py', 3)]) + >>> b = POEntry(msgid='b', occurrences=[('b.py', 1), ('b.py', 3)]) + >>> c1 = POEntry(msgid='c1', occurrences=[('a.py', 1), ('b.py', 1)]) + >>> c2 = POEntry(msgid='c2', occurrences=[('a.py', 1), ('a.py', 3)]) + >>> po = POFile() + >>> po.append(a) + >>> po.append(b) + >>> po.append(c1) + >>> po.append(c2) + >>> po.sort() + >>> print(po) + # + msgid "" + msgstr "" + <BLANKLINE> + #: a.py:1 a.py:3 + msgid "c2" + msgstr "" + <BLANKLINE> + #: a.py:1 b.py:1 + msgid "c1" + msgstr "" + <BLANKLINE> + #: b.py:1 b.py:3 + msgid "a" + msgstr "" + <BLANKLINE> + #: b.py:1 b.py:3 + msgid "b" + msgstr "" + <BLANKLINE> + ''' def compare_occurrences(a, b): """ Compare an entry occurrence with another one. @@ -972,8 +1153,7 @@ class POEntry(_BaseEntry): def translated(self): """ - Returns ``True`` if the entry has been translated or ``False`` - otherwise. + Return True if the entry has been translated or False. """ if self.obsolete or 'fuzzy' in self.flags: return False @@ -990,19 +1170,11 @@ class POEntry(_BaseEntry): """ Merge the current entry with the given pot entry. """ - self.msgid = other.msgid - self.msgctxt = other.msgctxt - self.occurrences = other.occurrences - self.comment = other.comment - fuzzy = 'fuzzy' in self.flags - self.flags = other.flags[:] # clone flags - if fuzzy: - self.flags.append('fuzzy') + self.msgid = other.msgid + self.occurrences = other.occurrences + self.comment = other.comment + self.flags = other.flags self.msgid_plural = other.msgid_plural - self.obsolete = other.obsolete - self.previous_msgctxt = other.previous_msgctxt - self.previous_msgid = other.previous_msgid - self.previous_msgid_plural = other.previous_msgid_plural if other.msgstr_plural: for pos in other.msgstr_plural: try: @@ -1017,8 +1189,23 @@ class POEntry(_BaseEntry): class MOEntry(_BaseEntry): """ Represents a mo file entry. + + **Examples**: + + >>> entry = MOEntry() + >>> entry.msgid = 'translate me !' + >>> entry.msgstr = 'traduisez moi !' + >>> print(entry) + msgid "translate me !" + msgstr "traduisez moi !" + <BLANKLINE> """ - pass + + def __str__(self, wrapwidth=78): + """ + Return the string representation of the entry. + """ + return _BaseEntry.__str__(self, wrapwidth) # }}} # class _POFileParser {{{ @@ -1029,37 +1216,28 @@ class _POFileParser(object): file format. """ - def __init__(self, pofile, *args, **kwargs): + def __init__(self, fpath, *args, **kwargs): """ Constructor. - Keyword arguments: - - ``pofile`` - string, path to the po file or its content - - ``encoding`` - string, the encoding to use, defaults to ``default_encoding`` - global variable (optional). - - ``check_for_duplicates`` - whether to check for duplicate entries when adding entries to the - file (optional, default: ``False``). + **Arguments**: + - *fpath*: string, path to the po file + - *encoding*: string, the encoding to use, defaults to + "default_encoding" global variable (optional), + - *check_for_duplicates*: whether to check for duplicate entries + when adding entries to the file, default: False (optional). """ enc = kwargs.get('encoding', default_encoding) - if os.path.exists(pofile): - try: - self.fhandle = codecs.open(pofile, 'rU', enc) - except LookupError: - enc = default_encoding - self.fhandle = codecs.open(pofile, 'rU', enc) - else: - self.fhandle = pofile.splitlines() - + check_dup = kwargs.get('check_for_duplicates', False) + try: + self.fhandle = codecs.open(fpath, 'rU', enc) + except LookupError: + enc = default_encoding + self.fhandle = codecs.open(fpath, 'rU', enc) self.instance = POFile( - pofile=pofile, + fpath=fpath, encoding=enc, - check_for_duplicates=kwargs.get('check_for_duplicates', False) + check_for_duplicates=check_dup ) self.transitions = {} self.current_entry = POEntry() @@ -1111,103 +1289,59 @@ class _POFileParser(object): Run the state machine, parse the file line by line and call process() with the current matched symbol. """ - i = 0 - - keywords = { - 'msgctxt': 'CT', - 'msgid': 'MI', - 'msgstr': 'MS', - 'msgid_plural': 'MP', - } - prev_keywords = { - 'msgid_plural': 'PP', - 'msgid': 'PM', - 'msgctxt': 'PC', - } - + i, lastlen = 1, 0 for line in self.fhandle: - i += 1 line = line.strip() if line == '': + i = i+1 continue - - tokens = line.split(None, 2) - nb_tokens = len(tokens) - - if tokens[0] == '#~' and nb_tokens > 1: - line = line[3:].strip() - tokens = tokens[1:] - nb_tokens -= 1 + if line[:3] == '#~ ': + line = line[3:] self.entry_obsolete = 1 else: self.entry_obsolete = 0 - - # Take care of keywords like - # msgid, msgid_plural, msgctxt & msgstr. - if tokens[0] in keywords and nb_tokens > 1: - line = line[len(tokens[0]):].lstrip() - self.current_token = line - self.process(keywords[tokens[0]], i) - continue - self.current_token = line - - if tokens[0] == '#:' and nb_tokens > 1: + if line[:2] == '#:': # we are on a occurrences line self.process('OC', i) - - elif line[:1] == '"': - # we are on a continuation line + elif line[:9] == 'msgctxt "': + # we are on a msgctxt + self.process('CT', i) + elif line[:7] == 'msgid "': + # we are on a msgid + self.process('MI', i) + elif line[:8] == 'msgstr "': + # we are on a msgstr + self.process('MS', i) + elif line[:1] == '"' or line[:4] == '#| "': + # we are on a continuation line or some metadata self.process('MC', i) - + elif line[:14] == 'msgid_plural "': + # we are on a msgid plural + self.process('MP', i) elif line[:7] == 'msgstr[': # we are on a msgstr plural self.process('MX', i) - - elif tokens[0] == '#,' and nb_tokens > 1: + elif line[:3] == '#, ': # we are on a flags line self.process('FL', i) - - elif tokens[0] == '#': - if line == '#': line += ' ' + elif line[:2] == '# ' or line == '#': + if line == '#': line = line + ' ' # we are on a translator comment line self.process('TC', i) - - elif tokens[0] == '#.' and nb_tokens > 1: + elif line[:2] == '#.': # we are on a generated comment line self.process('GC', i) - - elif tokens[0] == '#|': - if nb_tokens < 2: - self.process('??', i) - continue - - # Remove the marker and any whitespace right after that. - line = line[2:].lstrip() - self.current_token = line - - if tokens[1].startswith('"'): - # Continuation of previous metadata. - self.process('MC', i) - continue - - if nb_tokens == 2: - # Invalid continuation line. - self.process('??', i) - - # we are on a "previous translation" comment line, - if tokens[1] not in prev_keywords: - # Unknown keyword in previous translation comment. - self.process('??', i) - - # Remove the keyword and any whitespace - # between it and the starting quote. - line = line[len(tokens[1]):].lstrip() - self.current_token = line - self.process(prev_keywords[tokens[1]], i) - - else: - self.process('??', i) + elif line[:15] == '#| msgid_plural': + # we are on a previous msgid_plural + self.process('PP', i) + elif line[:8] == '#| msgid': + self.process('PM', i) + # we are on a previous msgid + elif line[:10] == '#| msgctxt': + # we are on a previous msgctxt + self.process('PC', i) + i = i+1 if self.current_entry: # since entries are added when another entry is found, we must add @@ -1229,24 +1363,17 @@ class _POFileParser(object): if key is not None: self.instance.metadata[key] += '\n'+ msg.strip() # close opened file - if isinstance(self.fhandle, file): - self.fhandle.close() + self.fhandle.close() return self.instance def add(self, symbol, states, next_state): """ Add a transition to the state machine. - Keywords arguments: - ``symbol`` - string, the matched token (two chars symbol). - - ``states`` - list, a list of states (two chars symbols). - - ``next_state`` - the next state the fsm will have after the action. + symbol -- string, the matched token (two chars symbol) + states -- list, a list of states (two chars symbols) + next_state -- the next state the fsm will have after the action """ for state in states: action = getattr(self, 'handle_%s' % next_state.lower()) @@ -1258,12 +1385,8 @@ class _POFileParser(object): symbol provided. Keywords arguments: - - ``symbol`` - string, the matched token (two chars symbol). - - ``linenum`` - integer, the current line number of the parsed file. + symbol -- string, the matched token (two chars symbol) + linenum -- integer, the current line number of the parsed file """ try: (action, state) = self.transitions[(symbol, self.current_state)] @@ -1333,7 +1456,7 @@ class _POFileParser(object): self.instance.append(self.current_entry) self.current_entry = POEntry() self.current_entry.previous_msgid_plural = \ - unescape(self.current_token[1:-1]) + unescape(self.current_token[17:-1]) return True def handle_pm(self): @@ -1342,7 +1465,7 @@ class _POFileParser(object): self.instance.append(self.current_entry) self.current_entry = POEntry() self.current_entry.previous_msgid = \ - unescape(self.current_token[1:-1]) + unescape(self.current_token[10:-1]) return True def handle_pc(self): @@ -1351,7 +1474,7 @@ class _POFileParser(object): self.instance.append(self.current_entry) self.current_entry = POEntry() self.current_entry.previous_msgctxt = \ - unescape(self.current_token[1:-1]) + unescape(self.current_token[12:-1]) return True def handle_ct(self): @@ -1359,7 +1482,7 @@ class _POFileParser(object): if self.current_state in ['MC', 'MS', 'MX']: self.instance.append(self.current_entry) self.current_entry = POEntry() - self.current_entry.msgctxt = unescape(self.current_token[1:-1]) + self.current_entry.msgctxt = unescape(self.current_token[9:-1]) return True def handle_mi(self): @@ -1368,17 +1491,17 @@ class _POFileParser(object): self.instance.append(self.current_entry) self.current_entry = POEntry() self.current_entry.obsolete = self.entry_obsolete - self.current_entry.msgid = unescape(self.current_token[1:-1]) + self.current_entry.msgid = unescape(self.current_token[7:-1]) return True def handle_mp(self): """Handle a msgid plural.""" - self.current_entry.msgid_plural = unescape(self.current_token[1:-1]) + self.current_entry.msgid_plural = unescape(self.current_token[14:-1]) return True def handle_ms(self): """Handle a msgstr.""" - self.current_entry.msgstr = unescape(self.current_token[1:-1]) + self.current_entry.msgstr = unescape(self.current_token[8:-1]) return True def handle_mx(self): @@ -1418,6 +1541,10 @@ class _POFileParser(object): typ = 'previous_msgctxt' token = token[3:] self.current_entry.previous_msgctxt += token + if typ not in self.current_entry._multiline_str: + self.current_entry._multiline_str[typ] = token + else: + self.current_entry._multiline_str[typ] += "__POLIB__NL__" + token # don't change the current state return False @@ -1428,41 +1555,43 @@ class _MOFileParser(object): """ A class to parse binary mo files. """ + BIG_ENDIAN = 0xde120495 + LITTLE_ENDIAN = 0x950412de - def __init__(self, mofile, *args, **kwargs): + def __init__(self, fpath, *args, **kwargs): """ Constructor. - Keyword arguments: - - ``mofile`` - string, path to the mo file or its content - - ``encoding`` - string, the encoding to use, defaults to ``default_encoding`` - global variable (optional). - - ``check_for_duplicates`` - whether to check for duplicate entries when adding entries to the - file (optional, default: ``False``). + **Arguments**: + - *fpath*: string, path to the po file + - *encoding*: string, the encoding to use, defaults to + "default_encoding" global variable (optional), + - *check_for_duplicates*: whether to check for duplicate entries + when adding entries to the file, default: False (optional). """ - self.fhandle = open(mofile, 'rb') + enc = kwargs.get('encoding', default_encoding) + check_dup = kwargs.get('check_for_duplicates', False) + self.fhandle = open(fpath, 'rb') self.instance = MOFile( - fpath=mofile, - encoding=kwargs.get('encoding', default_encoding), - check_for_duplicates=kwargs.get('check_for_duplicates', False) + fpath=fpath, + encoding=enc, + check_for_duplicates=check_dup ) + def parse_magicnumber(self): + """ + Parse the magic number and raise an exception if not valid. + """ + def parse(self): """ Build the instance with the file handle provided in the constructor. """ - # parse magic number magic_number = self._readbinary('<I', 4) - if magic_number == MOFile.LITTLE_ENDIAN: + if magic_number == self.LITTLE_ENDIAN: ii = '<II' - elif magic_number == MOFile.BIG_ENDIAN: + elif magic_number == self.BIG_ENDIAN: ii = '>II' else: raise IOError('Invalid mo file, magic number is incorrect !') @@ -1501,35 +1630,18 @@ class _MOFileParser(object): # test if we have a plural entry msgid_tokens = msgid.split('\0') if len(msgid_tokens) > 1: - entry = self._build_entry( + entry = MOEntry( msgid=msgid_tokens[0], msgid_plural=msgid_tokens[1], - msgstr_plural=dict((k,v) for k,v in enumerate(msgstr.split('\0'))) + msgstr_plural=dict((k,v) for k,v in \ + enumerate(msgstr.split('\0'))) ) else: - entry = self._build_entry(msgid=msgid, msgstr=msgstr) + entry = MOEntry(msgid=msgid, msgstr=msgstr) self.instance.append(entry) # close opened file self.fhandle.close() return self.instance - - def _build_entry(self, msgid, msgstr=None, msgid_plural=None, - msgstr_plural=None): - msgctxt_msgid = msgid.split('\x04') - if len(msgctxt_msgid) > 1: - kwargs = { - 'msgctxt': msgctxt_msgid[0], - 'msgid' : msgctxt_msgid[1], - } - else: - kwargs = {'msgid': msgid} - if msgstr: - kwargs['msgstr'] = msgstr - if msgid_plural: - kwargs['msgid_plural'] = msgid_plural - if msgstr_plural: - kwargs['msgstr_plural'] = msgstr_plural - return MOEntry(**kwargs) def _readbinary(self, fmt, numbytes): """ @@ -1543,97 +1655,26 @@ class _MOFileParser(object): return tup # }}} -# class TextWrapper {{{ +# __main__ {{{ -class TextWrapper(textwrap.TextWrapper): +if __name__ == '__main__': """ - Subclass of textwrap.TextWrapper that backport the - drop_whitespace option. + **Main function**:: + - to **test** the module just run: *python polib.py [-v]* + - to **profile** the module: *python polib.py -p <some_pofile.po>* """ - def __init__(self, *args, **kwargs): - drop_whitespace = kwargs.pop('drop_whitespace', True) - textwrap.TextWrapper.__init__(self, *args, **kwargs) - self.drop_whitespace = drop_whitespace - - def _wrap_chunks(self, chunks): - """_wrap_chunks(chunks : [string]) -> [string] - - Wrap a sequence of text chunks and return a list of lines of - length 'self.width' or less. (If 'break_long_words' is false, - some lines may be longer than this.) Chunks correspond roughly - to words and the whitespace between them: each chunk is - indivisible (modulo 'break_long_words'), but a line break can - come between any two chunks. Chunks should not have internal - whitespace; ie. a chunk is either all whitespace or a "word". - Whitespace chunks will be removed from the beginning and end of - lines, but apart from that whitespace is preserved. - """ - lines = [] - if self.width <= 0: - raise ValueError("invalid width %r (must be > 0)" % self.width) - - # Arrange in reverse order so items can be efficiently popped - # from a stack of chucks. - chunks.reverse() - - while chunks: - - # Start the list of chunks that will make up the current line. - # cur_len is just the length of all the chunks in cur_line. - cur_line = [] - cur_len = 0 - - # Figure out which static string will prefix this line. - if lines: - indent = self.subsequent_indent + import sys + if len(sys.argv) > 2 and sys.argv[1] == '-p': + def test(f): + if f.endswith('po'): + p = pofile(f) else: - indent = self.initial_indent - - # Maximum width for this line. - width = self.width - len(indent) - - # First chunk on line is whitespace -- drop it, unless this - # is the very beginning of the text (ie. no lines started yet). - if self.drop_whitespace and chunks[-1].strip() == '' and lines: - del chunks[-1] - - while chunks: - l = len(chunks[-1]) - - # Can at least squeeze this chunk onto the current line. - if cur_len + l <= width: - cur_line.append(chunks.pop()) - cur_len += l - - # Nope, this line is full. - else: - break - - # The current line is full, and the next chunk is too big to - # fit on *any* line (not just this one). - if chunks and len(chunks[-1]) > width: - self._handle_long_word(chunks, cur_line, cur_len, width) - - # If the last chunk on this line is all whitespace, drop it. - if self.drop_whitespace and cur_line and cur_line[-1].strip() == '': - del cur_line[-1] - - # Convert current line back to a string and store it in list - # of all lines (return value). - if cur_line: - lines.append(indent + ''.join(cur_line)) - - return lines + p = mofile(f) + s = unicode(p) + import profile + profile.run('test("'+sys.argv[2]+'")') + else: + import doctest + doctest.testmod() # }}} -# function wrap() {{{ - -def wrap(text, width=70, **kwargs): - """ - Wrap a single paragraph of text, returning a list of wrapped lines. - """ - if sys.version_info < (2, 6): - return TextWrapper(width=width, **kwargs).wrap(text) - return textwrap.wrap(text, width=width, **kwargs) - -#}}} |