#!/usr/bin/python -u
# -*- encoding: utf-8 -*-
# Copyright (c) 2004, 2005, 2006 Danilo Šegan.
#
# This file is part of xml2po.
#
# xml2po is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# xml2po is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with xml2po; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#

# xml2po -- translate XML documents
VERSION = "1.0.5"

# Versioning system (I use this for a long time, so let's explain it to
# those Linux-versioning-scheme addicts):
#   1.0.* are unstable, development versions
#   1.1 will be first stable release (release 1), and 1.1.* bugfix releases
#   2.0.* will be unstable-feature-development stage (milestone 1)
#   2.1.* unstable development betas (milestone 2)
#   2.2 second stable release (release 2), and 2.2.* bugfix releases
#   ...
#
import fileinput
import getopt
import gettext
import os
import re
import sys
import time

import libxml2

# Command-line option tables.  They are shared by the main option parser and
# by tryToUpdate() so that both always accept exactly the same options.
SHORT_OPTIONS = 'avhkem:t:o:p:u:r:l:'
LONG_OPTIONS = ['automatic-tags', 'version', 'help', 'keep-entities',
                'expand-all-entities', 'mode=', 'translation=', 'output=',
                'po-file=', 'update-translation=', 'reuse=', 'language=',
                'mark-untranslated']

# Stand-in written into a message for the N-th (1-based) child that is
# extracted as a message of its own; processElementTag() substitutes the
# child's translated markup back for it.
# NOTE(review): the tag text was missing from the format strings in this file
# (they read ''); '<placeholder-%d/>' is a reconstruction -- confirm that it
# matches the msgids in existing PO files.
PLACEHOLDER_TAG = '<placeholder-%d/>'


class NoneTranslations:
    """Fallback catalog whose lookups all return None.

    It is added as the fallback of the loaded GNUTranslations object so that
    a missing translation can be told apart from a translation which happens
    to be identical to the original text.
    """

    def gettext(self, message):
        return None

    def lgettext(self, message):
        return None

    def ngettext(self, msgid1, msgid2, n):
        return None

    def lngettext(self, msgid1, msgid2, n):
        return None

    def ugettext(self, message):
        return None

    def ungettext(self, msgid1, msgid2, n):
        return None


class MessageOutput:
    """Collects extracted messages and writes them out in PO format."""

    def __init__(self, with_translations = 0):
        """Create an empty collection.

        with_translations -- true when a translated XML file will be fed in
        after the originals (see translationsFollow()); every message is then
        kept, duplicates included, so that messages and translations can be
        paired by position.
        """
        self.messages = []      # escaped msgids, in order of appearance
        self.comments = {}      # msgid -> extracted comment
        self.linenos = {}       # msgid -> list of (filename, tag, lineno)
        self.nowrap = {}        # msgid -> 1 for space-preserving messages
        # Always defined, so that outputMessage() cannot fail with an
        # AttributeError when translationsFollow() was called on an object
        # created without with_translations.
        self.translations = []
        self.filename = ''      # set through setFilename()
        self.do_translations = with_translations
        self.output_msgstr = 0 # this is msgid mode for outputMessage; 1 is for msgstr mode

    def translationsFollow(self):
        """Indicate that what follows are translations."""
        self.output_msgstr = 1

    def setFilename(self, filename):
        """Set the file name recorded in the references of new messages."""
        self.filename = filename

    def outputMessage(self, text, lineno = 0, comment = None, spacepreserve = 0, tag = None):
        """Adds a string to the list of messages.

        text          -- message text; ignored when it is only whitespace
        lineno, tag   -- stored with the current file name as a reference
        comment       -- extracted comment, kept for the first occurrence only
        spacepreserve -- when true, whitespace is not normalized and the
                         message is flagged "no-wrap"

        In msgstr mode (after translationsFollow()) the text is appended to
        the list of translations instead.
        """
        if text.strip() == '':
            return

        t = escapePoString(normalizeString(text, not spacepreserve))
        if self.output_msgstr:
            self.translations.append(t)
            return

        if self.do_translations or (t not in self.messages):
            self.messages.append(t)
            if spacepreserve:
                self.nowrap[t] = 1
        self.linenos.setdefault(t, []).append((self.filename, tag, lineno))
        # Comments are never recorded while pairing with translations.
        if (not self.do_translations) and comment and t not in self.comments:
            self.comments[t] = comment

    def outputHeader(self, out):
        """Write the PO header entry, stamped with the current time, to out."""
        # The stripped "<EMAIL@ADDRESS>" and "<LL@li.org>" placeholders of the
        # standard gettext header template are restored here.
        out.write("""msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=UTF-8\\n"
"Content-Transfer-Encoding: 8bit\\n"

""" % (time.strftime("%Y-%m-%d %H:%M%z")))

    def outputAll(self, out):
        """Write the header and every collected message to out.

        When translations were collected they are consumed in order, one per
        message; a translation equal to its msgid is written as empty.
        """
        self.outputHeader(out)

        for k in self.messages:
            if k in self.comments:
                out.write("#. %s\n" % (self.comments[k].replace("\n","\n#. ")))
            references = "".join(["%s:%d(%s) " % (reference[0], reference[2], reference[1])
                                  for reference in self.linenos[k]])
            out.write("#: %s\n" % (references))
            if k in self.nowrap and self.nowrap[k]:
                out.write("#, no-wrap\n")
            out.write("msgid \"%s\"\n" % (k))
            translation = ""
            if self.do_translations:
                if len(self.translations)>0:
                    translation = self.translations.pop(0)
            if translation == k:
                translation = ""
            out.write("msgstr \"%s\"\n\n" % (translation))


def normalizeNode(node):
    """Collapse whitespace in the text nodes below node, in place.

    Space-preserving nodes are left alone.  A blank text node is emptied,
    except when entities are kept (expand_entities is off) and it sits
    between two non-blank siblings; any other text node gets every run of
    whitespace replaced by a single space.
    """
    #print >>sys.stderr, "<%s> (%s) [%s]" % (node.name, node.type, node.serialize('utf-8'))
    if not node:
        return
    elif isSpacePreserveNode(node):
        return
    elif node.isText():
        if node.isBlankNode():
            if expand_entities or ( not (node.prev and not node.prev.isBlankNode()
                                         and node.next and not node.next.isBlankNode()) ):
                #print >>sys.stderr, "BLANK"
                node.setContent('')
        else:
            node.setContent(re.sub(r'\s+',' ', node.content))
    elif node.children and node.type == 'element':
        child = node.children
        while child:
            normalizeNode(child)
            child = child.next


def normalizeString(text, ignorewhitespace = 1):
    """Normalizes string to be used as key for gettext lookup.

    Removes all unnecessary whitespace.  The text is parsed as XML (with the
    internal DTD subset of the current document prepended when there is one),
    normalized with normalizeNode() and serialized again.  The text is
    returned unchanged when ignorewhitespace is false or when it cannot be
    parsed.
    """
    if not ignorewhitespace:
        return text
    # NOTE(review): the wrapper element was missing from the format string
    # (it read '%s'); only its children are serialized below, so its name
    # never reaches the output -- '<norm>' is a reconstruction.
    wrapped = '<norm>%s</norm>' % text
    try:
        # Lets add document DTD so entities are resolved
        dtd = doc.intSubset()
        tmp = dtd.serialize('utf-8')
        tmp = tmp + wrapped
    except Exception:
        tmp = wrapped

    try:
        ctxt = libxml2.createDocParserCtxt(tmp)
        if expand_entities:
            ctxt.replaceEntities(1)
        ctxt.parseDocument()
        tree = ctxt.doc()
        newnode = tree.getRootElement()
    except Exception:
        sys.stderr.write('Error while normalizing string as XML:\n"%s"\n\n' % (text))
        return text

    normalizeNode(newnode)

    result = ''
    child = newnode.children
    while child:
        result += child.serialize('utf-8')
        child = child.next

    result = re.sub('^ ','', result)
    result = re.sub(' $','', result)

    return result


def stringForEntity(node):
    """Replaces entities in the node.

    The serialized node is re-parsed (entities are substituted when
    expand_entities is set) and the resulting content is returned as UTF-8.
    """
    text = node.serialize('utf-8')
    # NOTE(review): wrapper element reconstructed, see normalizeString().
    wrapped = '<norm>%s</norm>' % text
    try:
        # Lets add document DTD so entities are resolved
        dtd = node.doc.intSubset()
        tmp = dtd.serialize('utf-8') + wrapped
        has_dtd = 1
    except Exception:
        tmp = wrapped
        has_dtd = 0

    ctxt = libxml2.createDocParserCtxt(tmp)
    if expand_entities:
        ctxt.replaceEntities(1)
    ctxt.parseDocument()
    tree = ctxt.doc()
    if has_dtd:
        # The first child of the document is the DTD; the wrapper follows it.
        newnode = tree.children.next
    else:
        newnode = tree.children

    result = ''
    child = newnode.children
    while child:
        result += child.serialize('utf-8')
        child = child.next

    return result


def escapePoString(text):
    """Escape backslashes, double quotes, newlines and tabs for a PO string."""
    return text.replace('\\','\\\\').replace('"', "\\\"").replace("\n","\\n").replace("\t","\\t")


def unEscapePoString(text):
    """Undo the escaping of double quotes and backslashes in a PO string."""
    return text.replace('\\"', '"').replace('\\\\','\\')


def getTranslation(text, spacepreserve = 0):
    """Returns a translation via gettext for specified snippet.

    text should be a string to look for, spacepreserve set to 1
    when spaces should be preserved.

    The result is None when a catalog is loaded but has no translation for
    the text.  The normalized text itself is returned when it is blank or
    when no catalog is loaded.
    """
    #print >>sys.stderr,"getTranslation('%s')" % (text.encode('utf-8'))
    text = normalizeString(text, not spacepreserve)
    if (text.strip() == ''):
        return text
    if gt:
        return gt.ugettext(text.decode('utf-8'))

    return text


def myAttributeSerialize(node):
    """Serialize the value of an attribute node.

    Text children are entity-encoded; entity references are written as
    "&name;" when entities are kept and as their content otherwise.
    """
    result = ''
    if node.children:
        child = node.children
        while child:
            if child.type=='text':
                result += doc.encodeEntitiesReentrant(child.content)
            elif child.type=='entity_ref':
                if not expand_entities:
                    result += '&' + child.name + ';'
                else:
                    result += child.content.decode('utf-8')
            else:
                result += myAttributeSerialize(child)
            child = child.next
    else:
        result = node.serialize('utf-8')
    return result


def startTagForNode(node):
    """Return the inside of the start tag of node: its name and attributes.

    Returns 0 for a false node.
    """
    if not node:
        return 0

    result = node.name
    params = ''
    if node.properties:
        for p in node.properties:
            if p.type == 'attribute':
                try:
                    nsprop = p.ns().name + ":" + p.name
                except Exception:
                    # No usable namespace: use the plain attribute name.
                    nsprop = p.name
                params += " %s=\"%s\"" % (nsprop, myAttributeSerialize(p))
    return result+params


def endTagForNode(node):
    """Return the name used in the end tag of node, or 0 for a false node."""
    if not node:
        return 0

    result = node.name
    return result


def isFinalNode(node):
    """Return a true value when node is a "final" node.

    In automatic mode a node is final when autoNodeIsFinal() holds for it and
    for none of its ancestors.  Otherwise it is final when it is an element
    listed in ultimate_tags, or when it has children and all of them (blank
    nodes and comments aside) are final.
    """
    if automatic:
        auto = autoNodeIsFinal(node)
        # Check if any of the parents is also autoNodeIsFinal,
        # and if it is, don't consider this node a final one
        parent = node.parent
        while parent and auto:
            auto = not autoNodeIsFinal(parent)
            parent = parent.parent
        return auto
    #node.type =='text' or not node.children or
    if node.type == 'element' and node.name in ultimate_tags:
        return 1
    elif node.children:
        final_children = 1
        child = node.children
        while child and final_children:
            if not child.isBlankNode() and child.type != 'comment' and not isFinalNode(child):
                final_children = 0
            child = child.next
        if final_children:
            return 1
    return 0


def ignoreNode(node):
    """Return 1 when node must be left out of the serialized text.

    DTDs and comments are always ignored; outside automatic mode so are
    non-final nodes listed in ignored_tags.
    """
    if automatic:
        if node.type in ('dtd', 'comment'):
            return 1
        else:
            return 0
    else:
        if isFinalNode(node):
            return 0
        if node.name in ignored_tags or node.type in ('dtd', 'comment'):
            return 1
        return 0


def isSpacePreserveNode(node):
    """Return 1 when whitespace inside node has to be preserved.

    That is the case when libxml2 reports space preservation for the node or
    when the current XML mode lists its name among the space-preserving tags.
    """
    pres = node.getSpacePreserve()
    if pres == 1:
        return 1
    else:
        if CurrentXmlMode and (node.name in CurrentXmlMode.getSpacePreserveTags()):
            return 1
        else:
            return 0


def getCommentForNode(node):
    """Walk through previous siblings until a comment is found, or other element.

    Only whitespace is allowed between comment and current node."""
    prev = node.prev
    while prev and prev.type == 'text' and prev.content.strip() == '':
        prev = prev.prev
    if prev and prev.type == 'comment':
        return prev.content.strip()
    else:
        return None


def replaceAttributeContentsWithText(node,text):
    """Set the content of an attribute node to text."""
    node.setContent(text)


def replaceNodeContentsWithText(node,text):
    """Replaces all subnodes of a node with contents of text treated as XML."""

    if node.children:
        starttag = startTagForNode(node)
        endtag = endTagForNode(node)

        # Lets add document DTD so entities are resolved
        tmp = ''
        try:
            dtd = doc.intSubset()
            tmp = tmp + dtd.serialize('utf-8')
        except libxml2.treeError:
            pass

        # The closing tag was missing from this format string, which made the
        # three-argument formatting raise TypeError.
        content = '<%s>%s</%s>' % (starttag, text, endtag)
        tmp = tmp + content.encode('utf-8')

        newnode = None
        try:
            ctxt = libxml2.createDocParserCtxt(tmp)
            ctxt.replaceEntities(0)
            ctxt.parseDocument()
            newnode = ctxt.doc()
        except Exception:
            # Reported just below through the "not newnode" check.
            pass

        if not newnode:
            sys.stderr.write('Error while parsing translation as XML:\n"%s"\n\n' % (text.encode('utf-8')))
            return

        newelem = newnode.getRootElement()

        if newelem and newelem.children:
            free = node.children
            while free:
                following = free.next
                free.unlinkNode()
                free = following

            if node:
                following = node.next
                node.replaceNode(newelem.copyNodeList())
                node.next = following

        else:
            # In practice, this happens with elements that contain only
            # whitespace between their tags.
            pass
    else:
        node.setContent(text)


def autoNodeIsFinal(node):
    """Returns 1 if node is text node, contains non-whitespace text nodes or entities.

    Nodes named in ignored_tags are never final.  The result is cached on
    the node as __autofinal__.
    """
    if hasattr(node, '__autofinal__'):
        return node.__autofinal__
    if node.name in ignored_tags:
        node.__autofinal__ = 0
        return 0
    if node.isText() and node.content.strip()!='':
        node.__autofinal__ = 1
        return 1
    final = 0
    child = node.children
    while child:
        if child.type in ['text'] and child.content.strip()!='':
            final = 1
            break
        child = child.next

    node.__autofinal__ = final
    return final


def worthOutputting(node, noauto = 0):
    """Returns 1 if node is "worth outputting", otherwise 0.

    Node is "worth outputting", if none of the parents
    isFinalNode, and it contains non-blank text and entities.

    Results are cached on the node: __worth__ for noauto calls,
    __autoworth__ otherwise.
    """
    if noauto and hasattr(node, '__worth__'):
        return node.__worth__
    elif not noauto and hasattr(node, '__autoworth__'):
        return node.__autoworth__
    worth = 1
    parent = node.parent
    final = isFinalNode(node) and node.name not in ignored_tags
    while not final and parent:
        if isFinalNode(parent):
            final = 1 # reset if we've got to one final tag
        if final and (parent.name not in ignored_tags) and worthOutputting(parent):
            worth = 0
            break
        parent = parent.parent
    if not worth:
        node.__worth__ = 0
        return 0

    if noauto:
        node.__worth__ = worth
        return worth
    else:
        node.__autoworth__ = autoNodeIsFinal(node)
        return node.__autoworth__


def processAttribute(node, attr):
    """Extract the value of attribute attr of node, or translate it in place.

    In merge mode an attribute without a translation keeps its original
    value.
    """
    if not node or not attr or not worthOutputting(node=node, noauto=1):
        return

    outtxt = attr.content
    if mode=='merge':
        translation = getTranslation(outtxt, 0)
        if not translation:
            # getTranslation() gives None for untranslated text; calling
            # .encode() on it used to abort the merge with AttributeError.
            return
        if not isinstance(translation, str):
            translation = translation.encode('utf-8')
        replaceAttributeContentsWithText(attr, translation)
    else:
        msg.outputMessage(outtxt, node.lineNo(), "", 0,
                          node.name + ":" + attr.name)


def processElementTag(node, replacements, restart = 0):
    """Process node with node.type == 'element'.

    Builds the message text for the element.  Children that are messages of
    their own are processed recursively and represented by numbered
    placeholders.  Depending on the mode the message is handed to msg or the
    node contents are replaced by the translation.

    replacements -- list of child results shared with the caller; a fresh
                    list is used instead when restart is true
    Returns the tuple (starttag, outtxt, endtag, translation).
    Raises Exception when node is not an element.
    """
    if node.type == 'element':
        # Translate attributes if needed
        if node.properties and len(treated_attributes):
            for p in node.properties:
                if p.name in treated_attributes:
                    processAttribute(node, p)

        outtxt = ''
        if restart:
            myrepl = []
        else:
            myrepl = replacements

        child = node.children
        while child:
            if (isFinalNode(child)) or (child.type == 'element' and worthOutputting(child)):
                myrepl.append(processElementTag(child, myrepl, 1))
                outtxt += PLACEHOLDER_TAG % (len(myrepl))
            else:
                if child.type == 'element':
                    (starttag, content, endtag, translation) = processElementTag(child, myrepl, 0)
                    outtxt += '<%s>%s</%s>' % (starttag, content, endtag)
                else:
                    outtxt += doSerialize(child)

            child = child.next

        if mode == 'merge':
            translation = getTranslation(outtxt, isSpacePreserveNode(node))
        else:
            translation = outtxt.decode('utf-8')

        starttag = startTagForNode(node)
        endtag = endTagForNode(node)

        worth = worthOutputting(node)
        if not translation:
            # Untranslated: fall back to the original text.
            translation = outtxt.decode('utf-8')
            if worth and mark_untranslated:
                node.setLang('C')

        if restart or worth:
            # Put the (translated) markup of extracted children back in
            # place of their placeholders; placeholders are numbered from 1.
            i = 0
            while i < len(myrepl):
                replacement = '<%s>%s</%s>' % (myrepl[i][0], myrepl[i][3], myrepl[i][2])
                i += 1
                translation = translation.replace(PLACEHOLDER_TAG % (i), replacement)

            if worth:
                if mode == 'merge':
                    replaceNodeContentsWithText(node, translation)
                else:
                    msg.outputMessage(outtxt, node.lineNo(), getCommentForNode(node), isSpacePreserveNode(node), tag = node.name)

        return (starttag, outtxt, endtag, translation)
    else:
        raise Exception("You must pass node with node.type=='element'.")


def isExternalGeneralParsedEntity(node):
    """Return 1 when node refers to an external general parsed entity.

    The kind of entity is looked up in libxml2's debug dump of the node,
    which is written to the scratch file ".xml2po-entitychecking" in the
    current directory.  Returns 0 when that file cannot be used.
    """
    if (node and node.type=='entity_ref'):
        dump_name = ".xml2po-entitychecking"
        try:
            # it would be nice if debugDumpNode could use StringIO, but it apparently cannot
            tmp = open(dump_name, "w+")
            try:
                node.debugDumpNode(tmp,0)
                tmp.seek(0)
                tmpstr = tmp.read()
            finally:
                tmp.close()
                os.remove(dump_name)
        except Exception:
            # We fail silently, and replace all entities if we cannot
            # write .xml2po-entitychecking
            # !!! This is not very nice thing to do, but I don't know if
            #     raising an exception is any better
            return 0
        if tmpstr.find('EXTERNAL_GENERAL_PARSED_ENTITY') != -1:
            return 1
        else:
            return 0
    else:
        return 0


def doSerialize(node):
    """Serializes a node and its children, emitting PO messages along the way.

    node is the node to serialize.  Returns the serialized text; ignored
    nodes give an empty string.
    """

    if ignoreNode(node):
        return ''
    elif not node.children:
        return node.serialize("utf-8")
    elif node.type == 'entity_ref':
        if isExternalGeneralParsedEntity(node):
            return node.serialize('utf-8')
        else:
            return stringForEntity(node)
    elif node.type == 'entity_decl':
        return node.serialize('utf-8')
    elif node.type == 'text':
        return node.serialize('utf-8')
    elif node.type == 'element':
        repl = []
        (starttag, content, endtag, translation) = processElementTag(node, repl, 1)
        return '<%s>%s</%s>' % (starttag, content, endtag)
    else:
        child = node.children
        outtxt = ''
        while child:
            outtxt += doSerialize(child)
            child = child.next
        return outtxt


def read_finaltags(filelist):
    """Return the names of final tags: from the XML mode, or built-in defaults.

    filelist is not used.
    """
    if CurrentXmlMode:
        return CurrentXmlMode.getFinalTags()
    else:
        defaults = ['para', 'title', 'releaseinfo', 'revnumber',
                    'date', 'itemizedlist', 'orderedlist',
                    'variablelist', 'varlistentry', 'term' ]
        return defaults


def read_ignoredtags(filelist):
    """Return the names of ignored tags: from the XML mode, or built-in defaults.

    filelist is not used.
    """
    if CurrentXmlMode:
        return CurrentXmlMode.getIgnoredTags()
    else:
        defaults = ['itemizedlist', 'orderedlist', 'variablelist',
                    'varlistentry' ]
        return defaults


def read_treatedattributes(filelist):
    """Return the names of translatable attributes given by the XML mode.

    Without an XML mode the list is empty.  filelist is not used.
    """
    if CurrentXmlMode:
        return CurrentXmlMode.getTreatedAttributes()
    else:
        return []


def tryToUpdate(allargs, lang):
    """Update the PO file lang in place and exit.

    allargs is the complete argument vector of this program.  The program is
    run again without "-u" to produce a fresh template, which is merged into
    lang with msgmerge; the result replaces lang and is checked with msgfmt.
    This function never returns: it always ends in sys.exit().
    """
    # Remove "-u" and "--update-translation"
    command = allargs[0]
    args = allargs[1:]
    # The same option tables as in the main program are used.  The private
    # tables used here before knew 'merge' and 'extract-all-entities' instead
    # of 'mode=' and 'expand-all-entities', and lacked -r and -l, so options
    # accepted by the main parser made getopt raise an uncaught GetoptError.
    opts, args = getopt.getopt(args, SHORT_OPTIONS, LONG_OPTIONS)
    for opt, arg in opts:
        if opt in ('-a', '--automatic-tags'):
            command += " -a"
        elif opt in ('-k', '--keep-entities'):
            command += " -k"
        elif opt in ('-e', '--expand-all-entities'):
            command += " -e"
        elif opt in ('-m', '--mode'):
            command += " -m %s" % arg
        elif opt in ('-o', '--output'):
            sys.stderr.write("Error: Option '-o' is not yet supported when updating translations directly.\n")
            sys.exit(8)
        elif opt in ('-v', '--version'):
            sys.stdout.write("%s\n" % (VERSION))
            sys.exit(0)
        elif opt in ('-h', '--help'):
            sys.stderr.write("Error: If you want help, please use `%s --help' without '-u' option.\n" % (allargs[0]))
            sys.exit(9)
        elif opt in ('-u', '--update-translation'):
            pass
        else:
            sys.stderr.write("Error: Option `%s' is not supported with option `-u'.\n" % (opt))
            sys.exit(9)

    while args:
        command += " " + args.pop()

    pofile = lang

    # NOTE(review): the commands below are assembled as shell strings, so
    # file names containing spaces or shell metacharacters will break them.
    sys.stderr.write("Merging translations for %s: " % (lang))
    result = os.system("%s | msgmerge -o .tmp.%s.po %s -" % (command, lang, pofile))
    if result:
        sys.exit(10)
    else:
        result = os.system("mv .tmp.%s.po %s" % (lang, pofile))
        if result:
            sys.stderr.write("Error: cannot rename file.\n")
            sys.exit(11)
        else:
            os.system("msgfmt -cv -o %s %s" % (NULL_STRING, pofile))
            sys.exit(0)


def load_mode(modename):
    """Return the class <modename>XmlMode from module modename, or None.

    submodes_path is appended to sys.path before the import is attempted.
    """
    #import imp
    #found = imp.find_module(modename, submodes_path)
    #module = imp.load_module(modename, found[0], found[1], found[2])
    try:
        sys.path.append(submodes_path)
        module = __import__(modename)
        modeModule = '%sXmlMode' % modename
        return getattr(module, modeModule)
    except Exception:
        return None


def xml_error_handler(arg, ctxt):
    """Error handler registered with libxml2; it does nothing."""
    pass

libxml2.registerErrorHandler(xml_error_handler, None)


# Main program start
if __name__ != '__main__':
    raise NotImplementedError

# Parameters
# NOTE(review): this is an absolute path into a developer's checkout --
# confirm where the mode modules are installed.
submodes_path = "/home/danilo/cvs/gnom/gnome-doc-utils/xml2po/modes"
default_mode = 'docbook'

filename = ''
origxml = ''
mofile = ''
gt = None
ultimate = [ ]
ignored = [ ]
filenames = [ ]
translationlanguage = ''

mode = 'pot' # 'pot' or 'merge'
automatic = 0
expand_entities = 1
mark_untranslated = 0
expand_all_entities = 0

output = '-' # this means to stdout

NULL_STRING = '/dev/null'
if not os.path.exists('/dev/null'):
    NULL_STRING = 'NUL'


def usage (with_help = False):
    """Print the usage line (and the option summary if with_help), then exit."""
    sys.stderr.write("Usage: %s [OPTIONS] [XMLFILE]...\n" % (sys.argv[0]))
    if (with_help):
        sys.stderr.write("""
OPTIONS may be some of:
    -a    --automatic-tags     Automatically decides if tags are to be considered
                                 "final" or not
    -k    --keep-entities      Don't expand entities
    -e    --expand-all-entities  Expand ALL entities (including SYSTEM ones)
    -m    --mode=TYPE          Treat tags as type TYPE (default: docbook)
    -o    --output=FILE        Print resulting text (XML or POT) to FILE
    -p    --po-file=FILE       Specify PO file containing translation, and merge
                                 Overwrites temporary file .xml2po.mo.
    -r    --reuse=FILE         Specify translated XML file with the same structure
    -t    --translation=FILE   Specify MO file containing translation, and merge
    -u    --update-translation=LANG.po   Updates a PO file using msgmerge program

    -l    --language=LANG      Set language of the translation to LANG
          --mark-untranslated  Set 'xml:lang="C"' on untranslated tags

    -v    --version            Output version of the xml2po program

    -h    --help               Output this message

EXAMPLES:
    To create a POTemplate book.pot from input files chapter1.xml and
    chapter2.xml, run the following:
        %s -o book.pot chapter1.xml chapter2.xml

    After translating book.pot into de.po, merge the translations back,
    using -p option for each XML file:
        %s -p de.po chapter1.xml > chapter1.de.xml
        %s -p de.po chapter2.xml > chapter2.de.xml

""" % (sys.argv[0], sys.argv[0], sys.argv[0]))
    sys.exit(0)


if len(sys.argv) < 2:
    usage()

args = sys.argv[1:]
try:
    opts, args = getopt.getopt(args, SHORT_OPTIONS, LONG_OPTIONS)
except getopt.GetoptError:
    usage(True)

for opt, arg in opts:
    if opt in ('-m', '--mode'):
        default_mode = arg
    elif opt in ('-a', '--automatic-tags'):
        automatic = 1
    elif opt in ('-k', '--keep-entities'):
        expand_entities = 0
    elif opt in ('--mark-untranslated',):
        mark_untranslated = 1
    elif opt in ('-e', '--expand-all-entities'):
        expand_all_entities = 1
    elif opt in ('-l', '--language'):
        translationlanguage = arg
    elif opt in ('-t', '--translation'):
        mofile = arg
        mode = 'merge'
        if translationlanguage == '':
            # Default language: the MO file name without directory and extension.
            translationlanguage = os.path.split(os.path.splitext(mofile)[0])[1]
    elif opt in ('-r', '--reuse'):
        origxml = arg
    elif opt in ('-u', '--update-translation'):
        tryToUpdate(sys.argv, arg)
    elif opt in ('-p', '--po-file'):
        mofile = ".xml2po.mo"
        pofile = arg
        if translationlanguage == '':
            translationlanguage = os.path.split(os.path.splitext(pofile)[0])[1]
        if os.system("msgfmt -o %s %s >%s" % (mofile, pofile, NULL_STRING)):
            sys.exit(7)
        mode = 'merge'
    elif opt in ('-o', '--output'):
        output = arg
    elif opt in ('-v', '--version'):
        sys.stdout.write("%s\n" % (VERSION))
        sys.exit(0)
    elif opt in ('-h', '--help'):
        usage(True)

# Treat remaining arguments as XML files
while args:
    filenames.append(args.pop())

if len(filenames) > 1 and mode=='merge':
    sys.stderr.write("Error: You can merge translations with only one XML file at a time.\n")
    sys.exit(2)

try:
    CurrentXmlMode = load_mode(default_mode)()
except Exception:
    CurrentXmlMode = None
    sys.stderr.write("Warning: cannot load module '%s', using automatic detection (-a).\n" % (default_mode))
    automatic = 1

if mode=='merge' and mofile=='':
    sys.stderr.write("Error: You must specify MO file when merging translations.\n")
    sys.exit(3)

if mofile:
    try:
        mfile = open(mofile, "rb")
        try:
            gt = gettext.GNUTranslations(mfile)
        finally:
            mfile.close()
        gt.add_fallback(NoneTranslations())
    except Exception:
        sys.stderr.write("Can't open MO file '%s'.\n" % (mofile))

ultimate_tags = read_finaltags(ultimate)
ignored_tags = read_ignoredtags(ignored)
treated_attributes = read_treatedattributes(ignored)

# I'm not particularly happy about making any of these global,
# but I don't want to bother too much with it right now
semitrans = {}
PlaceHolder = 0
if origxml == '':
    msg = MessageOutput()
else:
    filenames.append(origxml)
    msg = MessageOutput(1)

for filename in filenames:
    try:
        if filename == origxml:
            msg.translationsFollow()
        ctxt = libxml2.createFileParserCtxt(filename)
        ctxt.lineNumbers(1)
        if expand_all_entities:
            ctxt.replaceEntities(1)
        ctxt.parseDocument()
        doc = ctxt.doc()
        if doc.name != filename:
            sys.stderr.write("Error: I tried to open '%s' but got '%s' -- how did that happen?\n" % (filename, doc.name))
            sys.exit(4)
    except SystemExit:
        # Keep exit status 4; a bare "except:" used to turn it into 1.
        raise
    except Exception:
        sys.stderr.write("Error: cannot open file '%s'.\n" % (filename))
        sys.exit(1)

    msg.setFilename(filename)
    if CurrentXmlMode and origxml=='':
        CurrentXmlMode.preProcessXml(doc,msg)
    doSerialize(doc)

if output == '-':
    out = sys.stdout
else:
    try:
        out = open(output, 'w')
    except Exception:
        sys.stderr.write("Error: cannot open file %s for writing.\n" % (output))
        sys.exit(5)

if mode != 'merge':
    if CurrentXmlMode:
        tcmsg = CurrentXmlMode.getStringForTranslators()
        tccom = CurrentXmlMode.getCommentForTranslators()
        if tcmsg:
            msg.outputMessage(tcmsg, 0, tccom)

    msg.outputAll(out)
else:
    if CurrentXmlMode:
        tcmsg = CurrentXmlMode.getStringForTranslators()
        if tcmsg:
            outtxt = getTranslation(tcmsg)
        else:
            outtxt = ''
        CurrentXmlMode.postProcessXmlTranslation(doc, translationlanguage, outtxt)
    out.write(doc.serialize('utf-8', 1))

if out is not sys.stdout:
    out.close()