Diffstat (limited to 'sphinx/search/__init__.py')
-rw-r--r-- | sphinx/search/__init__.py | 87 |
1 files changed, 67 insertions, 20 deletions
diff --git a/sphinx/search/__init__.py b/sphinx/search/__init__.py
index d57efd9e..58b611a3 100644
--- a/sphinx/search/__init__.py
+++ b/sphinx/search/__init__.py
@@ -8,10 +8,10 @@
     :copyright: Copyright 2007-2014 by the Sphinx team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
-from __future__ import with_statement
 import re
-import cPickle as pickle
+from six import iteritems, itervalues, text_type, string_types
+from six.moves import cPickle as pickle
 
 from docutils.nodes import raw, comment, title, Text, NodeVisitor, SkipNode
 
 from sphinx.util import jsdump, rpartition
@@ -40,6 +40,7 @@ class SearchLanguage(object):
        type, before searching index. Default implementation does nothing.
     """
     lang = None
+    language_name = None
     stopwords = set()
     js_stemmer_code = """
 /**
@@ -89,16 +90,49 @@ var Stemmer = function() {
         Return true if the target word should be registered in the search index.
         This method is called after stemming.
         """
-        return not (((len(word) < 3) and (12353 < ord(word[0]) < 12436)) or
-                    (ord(word[0]) < 256 and (len(word) < 3 or word in self.stopwords or
-                     word.isdigit())))
+        return (
+            len(word) == 0 or not (
+                ((len(word) < 3) and (12353 < ord(word[0]) < 12436)) or
+                (ord(word[0]) < 256 and (
+                    len(word) < 3 or word in self.stopwords or word.isdigit()
+                ))))
 
 
-from sphinx.search import en, ja
+# SearchEnglish imported after SearchLanguage is defined due to circular import
+from sphinx.search.en import SearchEnglish
 
+
+def parse_stop_word(source):
+    """
+    parse snowball style word list like this:
+
+    * http://snowball.tartarus.org/algorithms/finnish/stop.txt
+    """
+    result = set()
+    for line in source.splitlines():
+        line = line.split('|')[0]  # remove comment
+        result.update(line.split())
+    return result
+
+
+# maps language name to module.class or directly a class
 languages = {
-    'en': en.SearchEnglish,
-    'ja': ja.SearchJapanese,
+    'da': 'sphinx.search.da.SearchDanish',
+    'de': 'sphinx.search.de.SearchGerman',
+    'en': SearchEnglish,
+    'es': 'sphinx.search.es.SearchSpanish',
+    'fi': 'sphinx.search.fi.SearchFinnish',
+    'fr': 'sphinx.search.fr.SearchFrench',
+    'hu': 'sphinx.search.hu.SearchHungarian',
+    'it': 'sphinx.search.it.SearchItalian',
+    'ja': 'sphinx.search.ja.SearchJapanese',
+    'nl': 'sphinx.search.nl.SearchDutch',
+    'no': 'sphinx.search.no.SearchNorwegian',
+    'pt': 'sphinx.search.pt.SearchPortuguese',
+    'ro': 'sphinx.search.ro.SearchRomanian',
+    'ru': 'sphinx.search.ru.SearchRussian',
+    'sv': 'sphinx.search.sv.SearchSwedish',
+    'tr': 'sphinx.search.tr.SearchTurkish',
 }
 
 
@@ -186,7 +220,17 @@ class IndexBuilder(object):
         # objtype index -> (domain, type, objname (localized))
         self._objnames = {}
         # add language-specific SearchLanguage instance
-        self.lang = languages[lang](options)
+        lang_class = languages.get(lang)
+        if lang_class is None:
+            self.lang = SearchEnglish(options)
+        elif isinstance(lang_class, str):
+            module, classname = lang_class.rsplit('.', 1)
+            lang_class = getattr(__import__(module, None, None, [classname]),
+                                 classname)
+            self.lang = lang_class(options)
+        else:
+            # it's directly a class (e.g. added by app.add_search_language)
+            self.lang = lang_class(options)
 
         if scoring:
             with open(scoring, 'rb') as fp:
@@ -196,7 +240,7 @@ class IndexBuilder(object):
 
     def load(self, stream, format):
         """Reconstruct from frozen data."""
-        if isinstance(format, basestring):
+        if isinstance(format, string_types):
             format = self.formats[format]
         frozen = format.load(stream)
         # if an old index is present, we treat it as not existing.
@@ -208,7 +252,7 @@ class IndexBuilder(object):
 
         def load_terms(mapping):
             rv = {}
-            for k, v in mapping.iteritems():
+            for k, v in iteritems(mapping):
                 if isinstance(v, int):
                     rv[k] = set([index2fn[v]])
                 else:
@@ -221,7 +265,7 @@ class IndexBuilder(object):
 
    def dump(self, stream, format):
         """Dump the frozen index to a stream."""
-        if isinstance(format, basestring):
+        if isinstance(format, string_types):
             format = self.formats[format]
         format.dump(self.freeze(), stream)
 
@@ -229,7 +273,7 @@ class IndexBuilder(object):
         rv = {}
         otypes = self._objtypes
         onames = self._objnames
-        for domainname, domain in self.env.domains.iteritems():
+        for domainname, domain in iteritems(self.env.domains):
             for fullname, dispname, type, docname, anchor, prio in \
                     domain.get_objects():
                 # XXX use dispname?
@@ -248,7 +292,7 @@ class IndexBuilder(object):
                 if otype:
                     # use unicode() to fire translation proxies
                     onames[typeindex] = (domainname, type,
-                                         unicode(domain.get_type_name(otype)))
+                                         text_type(domain.get_type_name(otype)))
                 else:
                     onames[typeindex] = (domainname, type, type)
                 if anchor == fullname:
@@ -263,7 +307,7 @@ class IndexBuilder(object):
     def get_terms(self, fn2index):
         rvs = {}, {}
         for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)):
-            for k, v in mapping.iteritems():
+            for k, v in iteritems(mapping):
                 if len(v) == 1:
                     fn, = v
                     if fn in fn2index:
@@ -274,19 +318,22 @@ class IndexBuilder(object):
 
     def freeze(self):
         """Create a usable data structure for serializing."""
-        filenames = self._titles.keys()
-        titles = self._titles.values()
+        filenames = list(self._titles.keys())
+        titles = list(self._titles.values())
         fn2index = dict((f, i) for (i, f) in enumerate(filenames))
         terms, title_terms = self.get_terms(fn2index)
 
         objects = self.get_objects(fn2index)  # populates _objtypes
         objtypes = dict((v, k[0] + ':' + k[1])
-                        for (k, v) in self._objtypes.iteritems())
+                        for (k, v) in iteritems(self._objtypes))
         objnames = self._objnames
         return dict(filenames=filenames, titles=titles, terms=terms,
                     objects=objects, objtypes=objtypes, objnames=objnames,
                     titleterms=title_terms, envversion=self.env.version)
 
+    def label(self):
+        return "%s (code: %s)" % (self.lang.language_name, self.lang.lang)
+
     def prune(self, filenames):
         """Remove data for all filenames not in the list."""
         new_titles = {}
@@ -294,9 +341,9 @@ class IndexBuilder(object):
         if filename in self._titles:
             new_titles[filename] = self._titles[filename]
         self._titles = new_titles
-        for wordnames in self._mapping.itervalues():
+        for wordnames in itervalues(self._mapping):
             wordnames.intersection_update(filenames)
-        for wordnames in self._title_mapping.itervalues():
+        for wordnames in itervalues(self._title_mapping):
            wordnames.intersection_update(filenames)
 
     def feed(self, filename, title, doctree):
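The new "languages" table mixes classes with dotted "module.Class" strings so that a language's stemmer module is imported only when that language is actually selected, and unknown codes fall back to SearchEnglish. The sketch below restates that lookup as a standalone helper for illustration only: resolve_search_language is a hypothetical name, not part of the Sphinx API, and it mirrors the behaviour of the patched IndexBuilder.__init__ under that assumption.

# Illustrative sketch only: resolve_search_language is a hypothetical helper,
# not part of Sphinx; it mirrors the lazy lookup added to IndexBuilder.__init__.
from six import string_types

from sphinx.search import languages
from sphinx.search.en import SearchEnglish


def resolve_search_language(lang, options=None):
    """Return a SearchLanguage instance for the given language code."""
    options = options or {}
    entry = languages.get(lang)
    if entry is None:
        # unknown code: fall back to English, as the patched constructor does
        return SearchEnglish(options)
    if isinstance(entry, string_types):
        # dotted path: import the module only now, then fetch the class from it
        module, classname = entry.rsplit('.', 1)
        entry = getattr(__import__(module, None, None, [classname]), classname)
    return entry(options)


# e.g. resolve_search_language('fi') imports sphinx.search.fi on first use,
# while resolve_search_language('xx') silently falls back to SearchEnglish.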