diff options
Diffstat (limited to 'sphinx/search/__init__.py')
-rw-r--r-- | sphinx/search/__init__.py | 88 |
1 files changed, 59 insertions, 29 deletions
diff --git a/sphinx/search/__init__.py b/sphinx/search/__init__.py index 6e9610a4..764c9208 100644 --- a/sphinx/search/__init__.py +++ b/sphinx/search/__init__.py @@ -5,13 +5,15 @@ Create a full-text search index for offline search. - :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. + :copyright: Copyright 2007-2013 by the Sphinx team, see AUTHORS. :license: BSD, see LICENSE for details. """ +from __future__ import with_statement import re +import itertools import cPickle as pickle -from docutils.nodes import comment, Text, NodeVisitor, SkipNode +from docutils.nodes import comment, title, Text, NodeVisitor, SkipNode from sphinx.util import jsdump, rpartition @@ -92,6 +94,7 @@ var Stemmer = function() { (ord(word[0]) < 256 and (len(word) < 3 or word in self.stopwords or word.isdigit()))) + from sphinx.search import en, ja languages = { @@ -137,13 +140,16 @@ class WordCollector(NodeVisitor): def __init__(self, document, lang): NodeVisitor.__init__(self, document) self.found_words = [] + self.found_title_words = [] self.lang = lang def dispatch_visit(self, node): if node.__class__ is comment: raise SkipNode - if node.__class__ is Text: + elif node.__class__ is Text: self.found_words.extend(self.lang.split(node.astext())) + elif node.__class__ is title: + self.found_title_words.extend(self.lang.split(node.astext())) class IndexBuilder(object): @@ -156,12 +162,14 @@ class IndexBuilder(object): 'pickle': pickle } - def __init__(self, env, lang, options): + def __init__(self, env, lang, options, scoring): self.env = env # filename -> title self._titles = {} # stemmed word -> set(filenames) self._mapping = {} + # stemmed words in titles -> set(filenames) + self._title_mapping = {} # objtype -> index self._objtypes = {} # objtype index -> (domain, type, objname (localized)) @@ -169,6 +177,12 @@ class IndexBuilder(object): # add language-specific SearchLanguage instance self.lang = languages[lang](options) + if scoring: + with open(scoring, 'rb') as fp: + self.js_scorer_code = fp.read().decode('utf-8') + else: + self.js_scorer_code = u'' + def load(self, stream, format): """Reconstruct from frozen data.""" if isinstance(format, basestring): @@ -179,12 +193,18 @@ class IndexBuilder(object): raise ValueError('old format') index2fn = frozen['filenames'] self._titles = dict(zip(index2fn, frozen['titles'])) - self._mapping = {} - for k, v in frozen['terms'].iteritems(): - if isinstance(v, int): - self._mapping[k] = set([index2fn[v]]) - else: - self._mapping[k] = set(index2fn[i] for i in v) + + def load_terms(mapping): + rv = {} + for k, v in mapping.iteritems(): + if isinstance(v, int): + rv[k] = set([index2fn[v]]) + else: + rv[k] = set(index2fn[i] for i in v) + return rv + + self._mapping = load_terms(frozen['terms']) + self._title_mapping = load_terms(frozen['titleterms']) # no need to load keywords/objtypes def dump(self, stream, format): @@ -229,28 +249,31 @@ class IndexBuilder(object): return rv def get_terms(self, fn2index): - rv = {} - for k, v in self._mapping.iteritems(): - if len(v) == 1: - fn, = v - if fn in fn2index: - rv[k] = fn2index[fn] - else: - rv[k] = [fn2index[fn] for fn in v if fn in fn2index] - return rv + rvs = {}, {} + for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)): + for k, v in mapping.iteritems(): + if len(v) == 1: + fn, = v + if fn in fn2index: + rv[k] = fn2index[fn] + else: + rv[k] = [fn2index[fn] for fn in v if fn in fn2index] + return rvs def freeze(self): """Create a usable data structure for serializing.""" filenames = self._titles.keys() titles = self._titles.values() fn2index = dict((f, i) for (i, f) in enumerate(filenames)) - terms = self.get_terms(fn2index) + terms, title_terms = self.get_terms(fn2index) + objects = self.get_objects(fn2index) # populates _objtypes objtypes = dict((v, k[0] + ':' + k[1]) for (k, v) in self._objtypes.iteritems()) objnames = self._objnames return dict(filenames=filenames, titles=titles, terms=terms, - objects=objects, objtypes=objtypes, objnames=objnames) + objects=objects, objtypes=objtypes, objnames=objnames, + titleterms=title_terms) def prune(self, filenames): """Remove data for all filenames not in the list.""" @@ -261,6 +284,8 @@ class IndexBuilder(object): self._titles = new_titles for wordnames in self._mapping.itervalues(): wordnames.intersection_update(filenames) + for wordnames in self._title_mapping.itervalues(): + wordnames.intersection_update(filenames) def feed(self, filename, title, doctree): """Feed a doctree to the index.""" @@ -269,19 +294,24 @@ class IndexBuilder(object): visitor = WordCollector(doctree, self.lang) doctree.walk(visitor) - def add_term(word, stem=self.lang.stem): - word = stem(word) - if self.lang.word_filter(word): - self._mapping.setdefault(word, set()).add(filename) + stem = self.lang.stem + _filter = self.lang.word_filter - for word in self.lang.split(title): - add_term(word) + for word in itertools.chain(visitor.found_title_words, + self.lang.split(title)): + word = stem(word) + if _filter(word): + self._title_mapping.setdefault(word, set()).add(filename) for word in visitor.found_words: - add_term(word) + word = stem(word) + if word not in self._title_mapping and _filter(word): + self._mapping.setdefault(word, set()).add(filename) def context_for_searchtool(self): return dict( search_language_stemming_code = self.lang.js_stemmer_code, - search_language_stop_words = jsdump.dumps(self.lang.stopwords), + search_language_stop_words = + jsdump.dumps(sorted(self.lang.stopwords)), + search_scorer_tool = self.js_scorer_code, ) |