summary refs log tree commit diff
path: root/sphinx/search/__init__.py
diff options
context:
space:
mode:
Diffstat (limited to 'sphinx/search/__init__.py')
-rw-r--r--  sphinx/search/__init__.py | 87
1 file changed, 67 insertions(+), 20 deletions(-)
diff --git a/sphinx/search/__init__.py b/sphinx/search/__init__.py
index d57efd9e..58b611a3 100644
--- a/sphinx/search/__init__.py
+++ b/sphinx/search/__init__.py
@@ -8,10 +8,10 @@
:copyright: Copyright 2007-2014 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
-from __future__ import with_statement
import re
-import cPickle as pickle
+from six import iteritems, itervalues, text_type, string_types
+from six.moves import cPickle as pickle
from docutils.nodes import raw, comment, title, Text, NodeVisitor, SkipNode
from sphinx.util import jsdump, rpartition
@@ -40,6 +40,7 @@ class SearchLanguage(object):
type, before searching index. Default implementation does nothing.
"""
lang = None
+ language_name = None
stopwords = set()
js_stemmer_code = """
/**
@@ -89,16 +90,49 @@ var Stemmer = function() {
Return true if the target word should be registered in the search index.
This method is called after stemming.
"""
- return not (((len(word) < 3) and (12353 < ord(word[0]) < 12436)) or
- (ord(word[0]) < 256 and (len(word) < 3 or word in self.stopwords or
- word.isdigit())))
+ return (
+ len(word) == 0 or not (
+ ((len(word) < 3) and (12353 < ord(word[0]) < 12436)) or
+ (ord(word[0]) < 256 and (
+ len(word) < 3 or word in self.stopwords or word.isdigit()
+ ))))
-from sphinx.search import en, ja
+# SearchEnglish imported after SearchLanguage is defined due to circular import
+from sphinx.search.en import SearchEnglish
+
+def parse_stop_word(source):
+ """
+ parse snowball style word list like this:
+
+ * http://snowball.tartarus.org/algorithms/finnish/stop.txt
+ """
+ result = set()
+ for line in source.splitlines():
+ line = line.split('|')[0] # remove comment
+ result.update(line.split())
+ return result
+
+
+# maps language name to module.class or directly a class
languages = {
- 'en': en.SearchEnglish,
- 'ja': ja.SearchJapanese,
+ 'da': 'sphinx.search.da.SearchDanish',
+ 'de': 'sphinx.search.de.SearchGerman',
+ 'en': SearchEnglish,
+ 'es': 'sphinx.search.es.SearchSpanish',
+ 'fi': 'sphinx.search.fi.SearchFinnish',
+ 'fr': 'sphinx.search.fr.SearchFrench',
+ 'hu': 'sphinx.search.hu.SearchHungarian',
+ 'it': 'sphinx.search.it.SearchItalian',
+ 'ja': 'sphinx.search.ja.SearchJapanese',
+ 'nl': 'sphinx.search.nl.SearchDutch',
+ 'no': 'sphinx.search.no.SearchNorwegian',
+ 'pt': 'sphinx.search.pt.SearchPortuguese',
+ 'ro': 'sphinx.search.ro.SearchRomanian',
+ 'ru': 'sphinx.search.ru.SearchRussian',
+ 'sv': 'sphinx.search.sv.SearchSwedish',
+ 'tr': 'sphinx.search.tr.SearchTurkish',
}
@@ -186,7 +220,17 @@ class IndexBuilder(object):
# objtype index -> (domain, type, objname (localized))
self._objnames = {}
# add language-specific SearchLanguage instance
- self.lang = languages[lang](options)
+ lang_class = languages.get(lang)
+ if lang_class is None:
+ self.lang = SearchEnglish(options)
+ elif isinstance(lang_class, str):
+ module, classname = lang_class.rsplit('.', 1)
+ lang_class = getattr(__import__(module, None, None, [classname]),
+ classname)
+ self.lang = lang_class(options)
+ else:
+ # it's directly a class (e.g. added by app.add_search_language)
+ self.lang = lang_class(options)
if scoring:
with open(scoring, 'rb') as fp:
@@ -196,7 +240,7 @@ class IndexBuilder(object):
def load(self, stream, format):
"""Reconstruct from frozen data."""
- if isinstance(format, basestring):
+ if isinstance(format, string_types):
format = self.formats[format]
frozen = format.load(stream)
# if an old index is present, we treat it as not existing.
@@ -208,7 +252,7 @@ class IndexBuilder(object):
def load_terms(mapping):
rv = {}
- for k, v in mapping.iteritems():
+ for k, v in iteritems(mapping):
if isinstance(v, int):
rv[k] = set([index2fn[v]])
else:
@@ -221,7 +265,7 @@ class IndexBuilder(object):
def dump(self, stream, format):
"""Dump the frozen index to a stream."""
- if isinstance(format, basestring):
+ if isinstance(format, string_types):
format = self.formats[format]
format.dump(self.freeze(), stream)
@@ -229,7 +273,7 @@ class IndexBuilder(object):
rv = {}
otypes = self._objtypes
onames = self._objnames
- for domainname, domain in self.env.domains.iteritems():
+ for domainname, domain in iteritems(self.env.domains):
for fullname, dispname, type, docname, anchor, prio in \
domain.get_objects():
# XXX use dispname?
@@ -248,7 +292,7 @@ class IndexBuilder(object):
if otype:
# use unicode() to fire translation proxies
onames[typeindex] = (domainname, type,
- unicode(domain.get_type_name(otype)))
+ text_type(domain.get_type_name(otype)))
else:
onames[typeindex] = (domainname, type, type)
if anchor == fullname:
@@ -263,7 +307,7 @@ class IndexBuilder(object):
def get_terms(self, fn2index):
rvs = {}, {}
for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)):
- for k, v in mapping.iteritems():
+ for k, v in iteritems(mapping):
if len(v) == 1:
fn, = v
if fn in fn2index:
@@ -274,19 +318,22 @@ class IndexBuilder(object):
def freeze(self):
"""Create a usable data structure for serializing."""
- filenames = self._titles.keys()
- titles = self._titles.values()
+ filenames = list(self._titles.keys())
+ titles = list(self._titles.values())
fn2index = dict((f, i) for (i, f) in enumerate(filenames))
terms, title_terms = self.get_terms(fn2index)
objects = self.get_objects(fn2index) # populates _objtypes
objtypes = dict((v, k[0] + ':' + k[1])
- for (k, v) in self._objtypes.iteritems())
+ for (k, v) in iteritems(self._objtypes))
objnames = self._objnames
return dict(filenames=filenames, titles=titles, terms=terms,
objects=objects, objtypes=objtypes, objnames=objnames,
titleterms=title_terms, envversion=self.env.version)
+ def label(self):
+ return "%s (code: %s)" % (self.lang.language_name, self.lang.lang)
+
def prune(self, filenames):
"""Remove data for all filenames not in the list."""
new_titles = {}
@@ -294,9 +341,9 @@ class IndexBuilder(object):
if filename in self._titles:
new_titles[filename] = self._titles[filename]
self._titles = new_titles
- for wordnames in self._mapping.itervalues():
+ for wordnames in itervalues(self._mapping):
wordnames.intersection_update(filenames)
- for wordnames in self._title_mapping.itervalues():
+ for wordnames in itervalues(self._title_mapping):
wordnames.intersection_update(filenames)
def feed(self, filename, title, doctree):