diff options
author | Brian Beck <exogen@gmail.com> | 2006-08-21 19:02:19 +0000 |
---|---|---|
committer | Brian Beck <exogen@gmail.com> | 2006-08-21 19:02:19 +0000 |
commit | 91790e27cd00b88b0891cb815a36d21813246c5d (patch) | |
tree | a0ae13c5dba42547f03ccb86cd88409eec4e07d7 | |
parent | 682aed446b16c1dd6c12d727849a715033878b34 (diff) | |
download | django-91790e27cd00b88b0891cb815a36d21813246c5d.tar.gz |
[search-api] Initial commit, Lucene working, Xapian and Hype almost working, needs polish.
git-svn-id: http://code.djangoproject.com/svn/django/branches/search-api@3636 bcc190cf-cafb-0310-a4f2-bffc1f526a37
-rw-r--r-- | django/contrib/search/__init__.py | 0 | ||||
-rw-r--r-- | django/contrib/search/backends.py | 19 | ||||
-rw-r--r-- | django/contrib/search/base.py | 214 | ||||
-rw-r--r-- | django/contrib/search/default.py | 9 | ||||
-rw-r--r-- | django/contrib/search/hype.py | 35 | ||||
-rw-r--r-- | django/contrib/search/lucene.py | 162 | ||||
-rw-r--r-- | django/contrib/search/models.py | 27 | ||||
-rw-r--r-- | django/contrib/search/query.py | 36 | ||||
-rw-r--r-- | django/contrib/search/views.py | 1 | ||||
-rw-r--r-- | django/contrib/search/xapian.py | 64 |
10 files changed, 567 insertions, 0 deletions
# ---- django/contrib/search/__init__.py (new, empty file) ----

# ---- django/contrib/search/backends.py ----
from default import DefaultIndexer

# Every optional backend is imported on a best-effort basis: a missing
# third-party library only disables that backend and prints a notice.
try:
    from xapian import XapianIndexer
except ImportError:
    print("Xapian backend will not be available due to an ImportError. "
          "Do you have Xapian and Xapwrap installed?")

try:
    from lucene import LuceneIndexer
except ImportError:
    print("Lucene backend will not be available due to an ImportError. "
          "Do you have Lucene and PyLucene installed?")

try:
    from hype import HypeIndexer
except ImportError:
    print("Hyper Estraier backend will not be available due to an ImportError. "
          "Do you have Hyper Estraier and Hype installed?")


# ---- django/contrib/search/base.py ----
from django.db import models
from django.core.exceptions import ObjectDoesNotExist
import sys

# For Python 2.3, which has no builtin `set`.
# Probe the name directly: `__builtins__` is a plain dict in imported modules,
# so `hasattr(__builtins__, 'set')` is False there even when `set` exists.
try:
    set
except NameError:
    from sets import Set as set

# FIXME: Methods that accept a field parameter claim to accept Field instances
# or strings giving the object path. However, since there is no Field
# attribute giving the Model it is bound to, these methods only work for
# strings at the moment. This doesn't really affect the ease of use of the
# library, as strings are actually easier to use.


def str_to_field(string, namespace=None):
    """Gets the column attribute from the model as indicated
    by `string`, following ForeignKey attributes, etc.

    Example: 'Person.first_name' -> Person._meta.get_field('first_name')

    `namespace` is the dict-like object in which the object path will be
    searched. If None, the caller's global namespace will be used, thanks
    to the sys._getframe hack. This is important so that, for example,
    if `string` is 'models.Person.first_name', the caller's models module
    is used instead of the django.db.models module imported here.

    If the path ends on a Model rather than a Field, that model's primary
    key field is returned. Raises ValueError if the path does not end on a
    Field; lookup errors from the individual steps (KeyError,
    AttributeError, FieldDoesNotExist) propagate unchanged.
    """
    # FIXME: This whole function is either silly or clever...
    model = None

    if namespace is None:
        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
        obj = sys._getframe(1).f_globals
    else:
        obj = namespace
    getter = obj.__getitem__

    for objName in string.split('.'):
        # This might be better in a try/except block, but the respective
        # exceptions for the getters (KeyError, AttributeError,
        # FieldDoesNotExist) are already pretty descriptive...
        obj = getter(objName)

        # Choose how the *next* path component will be looked up.
        if isinstance(obj, models.base.ModelBase):
            model = obj
            getter = model._meta.get_field
        elif isinstance(obj, models.fields.related.ForeignKey):
            model = obj.rel.to
            getter = model._meta.get_field
        # TODO: The rest of these could be more type-smart...
        elif hasattr(obj, '__getitem__'):
            getter = obj.__getitem__
        elif hasattr(obj, '__getattribute__'):
            getter = obj.__getattribute__
        else:
            getter = obj.__getattr__

    if isinstance(obj, models.base.ModelBase):
        # A bare model path means "that model's primary key".
        model = obj
        obj = obj._meta.pk

    if not isinstance(obj, models.Field):
        raise ValueError("%r is not a Field object! (%r -> %r)" %
                         (objName, string, obj))
    # FIXME: I don't think there is a way to get back to a field's Model
    # from the Field object. This makes sense from a hierarchical viewpoint,
    # but sure makes things like this harder. Hopefully setting this attribute
    # won't mess anything up...
    obj._model = model
    return obj


class Indexer(object):
    """Base class for search backends.

    Tracks which model fields are indexed: `text_fields` is a set of Fields
    searched by default, `attr_fields` maps a name to a Field. Backends
    override `search`, `index`, `update` and optionally `_prepare_path`.
    """

    def __init__(self, path, model, fields=None, attributes=None, namespace=None, **kwargs):
        """Initialize an Indexer whose index data is stored at `path`.
        `model` is the Model class whose instances will be used as
        documents (its `_meta` is read directly, so a string name is not
        resolved here). Note that fields from other models can still be
        used in the index, but this model will be the one returned from
        search results.
        `fields` may be optionally initialized as an iterable of unnamed Fields.
        `attributes` may be optionally initialized as a mapping of field names
        to Fields. Extra keyword arguments are treated as further named
        fields; on a name clash `attributes` wins.
        `namespace` is the dict-like object in which fields passed as object
        paths will be searched. If None, the caller's global namespace will be
        used, thanks to the sys._getframe hack.

        Example: If `fields` is ['models.Person.first_name'], it is important
        that namespace['models'] refers to the intended module and NOT the
        django.db.models module imported here.
        """
        if fields is None:
            fields = []
        if attributes is None:
            attributes = kwargs
        else:
            # `attributes` should take precedence to `kwargs`.
            kwargs.update(attributes)
            attributes = kwargs

        if namespace is None:
            # FIXME: This uses the sys._getframe hack to get the caller's namespace.
            namespace = sys._getframe(1).f_globals

        # Backend hook; runs before any attribute is set on self.
        self._prepare_path(path)

        self.path = path
        self.model = model
        self.text_fields = set([])
        self.attr_fields = {}

        for field in fields:
            self.add_field(field, namespace=namespace)

        for name, field in attributes.items():
            self.add_field(field, name, namespace=namespace)

        # Always index the primary key so hits can be mapped back to rows.
        pk = self.model._meta.pk
        pk._model = self.model
        if pk not in self.text_fields and pk not in set(self.attr_fields.values()):
            self.add_field(pk, 'pk', namespace=namespace)

    def add_field(self, field, name=None, namespace=None):
        """Add the given field to the Indexer, where `field` is either
        an object path string or a Field instance. If `name` is None,
        the field will be added to self.text_fields, otherwise it will be
        added to self.attr_fields with the given name.
        `namespace` has the same meaning as in __init__.
        """
        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
        if namespace is None:
            namespace = sys._getframe(1).f_globals

        # FIXME: Detect duplicates, or user-knows-best?
        if isinstance(field, basestring):
            field = str_to_field(field, namespace)

        if name:
            self.attr_fields[name] = field
        else:
            self.text_fields.add(field)

    def remove_field(self, field=None, name=None, find_name=True, namespace=None):
        """Remove the given field from the Indexer, where `field` is either
        an object path string or a Field instance. If `name` is given,
        the field with that name is removed. If both `field` and `name`
        are given, both are removed if they refer to different fields.
        If `find_name` is True, the named fields in self.attr_fields are
        searched for `field`, otherwise only self.text_fields is searched.
        `namespace` has the same meaning as in __init__.
        """
        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
        if namespace is None:
            namespace = sys._getframe(1).f_globals

        # No early return here: when `field` is also given it must still be
        # processed below, as the docstring promises.
        if name and name in self.attr_fields:
            del self.attr_fields[name]

        if field is not None:
            if isinstance(field, basestring):
                field = str_to_field(field, namespace)

            self.text_fields.discard(field)

            if find_name:
                # Iterate over a snapshot because entries are deleted.
                for attr_name, attr_field in list(self.attr_fields.items()):
                    # TODO: Make sure identity is correct here
                    if attr_field is field:
                        del self.attr_fields[attr_name]

    def search(self, query_string, sortBy=None):
        """Query the index for `query_string` and return a ResultSet.
        `sortBy` selects the result ordering; its accepted values are
        defined by each backend. Must be overridden by backends.
        """
        raise NotImplementedError

    def index(self, document):
        """Add a single model instance to the index (backend-specific)."""
        raise NotImplementedError

    def update(self, force=False):
        """Bring the index up to date (backend-specific)."""
        raise NotImplementedError

    def _prepare_path(self, path):
        """Hook called from __init__ to set up storage at `path`; no-op here."""
        pass


def test_indexer():
    """Smoke-test field registration and removal on the base Indexer."""
    # Note: I'm not very good at writing tests.

    class Person(models.Model):
        first_name = models.CharField(maxlength=30)
        last_name = models.CharField(maxlength=30)
        description = models.TextField()

    i = Indexer('', Person, ['Person.description'], {'first': 'Person.first_name'},
                last='Person.last_name', namespace=locals())

    assert Person._meta.get_field('description') in i.text_fields
    assert set([Person._meta.get_field('first_name'),
                Person._meta.get_field('last_name')]) == \
           set(i.attr_fields.values())
    assert 'first' in i.attr_fields and 'last' in i.attr_fields

    i.remove_field('Person.description', namespace=locals())
    assert not i.text_fields

    i.remove_field(name='last')
    assert 'last' not in i.attr_fields
    print("Test succeeded.")
    return i


# ---- django/contrib/search/default.py ----
from base import Indexer

# This is the future home of a pure-Python text indexer.

# Alec Thomas has created a built-in indexer for his library here:
# http://swapoff.org/wiki/pyndexter


class DefaultIndexer(Indexer):
    """Placeholder backend; inherits the unimplemented base behaviour."""
    pass
# ---- django/contrib/search/hype.py ----
from base import Indexer
from query import ResultSet, Hit

import sys

# NOTE(review): this module is itself named hype.py, so under Python 2's
# implicit relative imports `import hype` may resolve to this file rather
# than to the Hyper Estraier binding -- confirm, and rename if so.
import hype

# TODO: This is very incomplete.


class HypeIndexer(Indexer):
    """Hyper Estraier backend (work in progress)."""

    def __init__(self, *args, **kwargs):
        """Set up the Indexer, then open the database stored at `self.path`."""
        # Capture the caller's globals here: Indexer.__init__ would otherwise
        # see this module's globals when it looks one frame up.
        if kwargs.get('namespace') is None and len(args) < 5:
            kwargs['namespace'] = sys._getframe(1).f_globals
        # super() must start from HypeIndexer; starting from Indexer skips
        # Indexer.__init__, leaving self.path (used just below) unset.
        super(HypeIndexer, self).__init__(*args, **kwargs)
        self.db = hype.Database(self.path, hype.ESTDBWRITER | hype.ESTDBCREAT)

    def index(self, row):
        """Build a document for `row`, keyed by its primary key.

        NOTE(review): the document gets no text and is never handed to
        self.db, so nothing is indexed yet -- part of the TODO above.
        """
        document = hype.Document()
        document['@pk'] = row._get_pk_val()
        document.add_text()

    def search(self, query_string, sortBy=None):
        """Run `query_string` against the database and wrap the result.

        `sortBy` is accepted for interface compatibility and is unused.
        """
        searcher = self.db.search(query_string)
        return HypeResultSet(searcher, self)

    def close(self):
        """Close the underlying database."""
        self.db.close()


class HypeResultSet(ResultSet):
    """Sized, iterable view over the hits of one search."""

    def __init__(self, hits, indexer=None):
        # Stored so that __len__ and __iter__ have something to read.
        self._hits = hits
        self._indexer = indexer

    def __len__(self):
        return len(self._hits)

    def __iter__(self):
        for hit in self._hits:
            yield HypeHit(hit, self._indexer)


class HypeHit(Hit):
    """A single hit; no backend-specific accessors are implemented yet."""
    pass
# ---- django/contrib/search/lucene.py ----
from base import Indexer
from query import ResultSet, Hit
from itertools import imap
import os, sys

import PyLucene

# WARNING!*
# PyLucene wants you to use PyLucene.PythonThread for threading.
# Look at samples/ThreadIndexFiles.py bundled with PyLucene.
# * I'm not sure how important this is.

# TODO: Make Lucene aware of field types.

# Here's how to use me:
#
# class Person(models.Model):
#     first_name = models.CharField(maxlength=30)
#     last_name = models.CharField(maxlength=30)
#     biography = models.TextField()
#
# indexer = LuceneIndexer('/tmp/lucene-index', Person, ['Person.biography'],
#                         {'first': 'Person.first_name',
#                          'last': 'Person.last_name'})
# indexer.update() # Note, calling this multiple times without clearing old
#                  # entries will cause duplicates in the index.
# indexer.search("brian -last:beck")


class LuceneIndexer(Indexer):
    """Indexer backend that stores documents in a PyLucene index."""

    def __init__(self, *args, **kwargs):
        """Set up the Indexer; the writer starts out closed."""
        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
        # Only fill in a default: an explicitly supplied namespace (keyword,
        # or fifth positional argument) must not be overwritten.
        if kwargs.get('namespace') is None and len(args) < 5:
            kwargs['namespace'] = sys._getframe(1).f_globals
        super(LuceneIndexer, self).__init__(*args, **kwargs)
        self.writer_closed = True

    def _prepare_path(self, path):
        """Open the Lucene directory object for `path`."""
        # Lucene wants an abstraction of the directory.
        # Should look into storage in a Model-compatible database in the future...
        # NOTE(review): the second argument looks like Lucene's "create"
        # flag, which would reset an existing index -- confirm.
        self._store = PyLucene.FSDirectory.getDirectory(path, True)

    def update(self, documents=None):
        """Re-index `documents`, or every row of the model when None.

        Each row is deleted from the index and then added again. If the
        writer was closed on entry it is opened for the batch and closed
        again afterwards, even if indexing fails.
        """
        close = False
        if self.writer_closed:
            close = True
            self.open_writer()

        try:
            if documents is None:
                update_queue = self.model.objects.all()
            else:
                update_queue = documents

            for document in update_queue:
                self.delete(document)
                self.index(document)
        finally:
            if close:
                self.close_writer()

    def clear(self):
        """Delete every document, addressed by document number."""
        close = False
        if self.writer_closed:
            close = True
            self.open_writer()
        try:
            for i in xrange(self._writer.docCount()):
                self._writer.deleteDocument(i)
        finally:
            if close:
                self.close_writer()

    def delete(self, row):
        """Delete the documents whose 'pk' term equals the row's primary key."""
        reader = PyLucene.IndexReader.open(self.path)
        try:
            reader.deleteDocuments(PyLucene.Term('pk', str(row._get_pk_val())))
        finally:
            # Release the reader even when the delete fails.
            reader.close()

    def open_writer(self):
        """Create the IndexWriter and mark it open."""
        self.writer_closed = False
        # NOTE(review): the trailing True looks like Lucene's "create" flag,
        # which would start a fresh index on every open -- confirm.
        self._writer = PyLucene.IndexWriter(self._store, PyLucene.StandardAnalyzer(), True)
        self._writer.setMaxFieldLength(1048576) # Max number of tokens stored per field?

    def close_writer(self):
        """Optimize the index, close the writer and mark it closed."""
        self._writer.optimize()
        self._writer.close()
        self.writer_closed = True

    def index(self, row):
        """Add one model instance to the index.

        Every entry of attr_fields becomes a stored, tokenized Lucene field
        under its name; all text_fields are joined into one 'contents' field.
        """
        close = False
        if self.writer_closed:
            close = True
            self.open_writer()

        try:
            document = PyLucene.Document()

            for name, field in self.attr_fields.items():
                # FIXME: Assumes no Foreign Keys! Lame!
                value = getattr(row, field.name)
                document.add(PyLucene.Field(name, str(value),
                                            PyLucene.Field.Store.YES,
                                            PyLucene.Field.Index.TOKENIZED))
            # Lucene only seems to support one 'default' field.
            # However, we might want multiple fields to be searched
            # by default. Hopefully just joining their contents with
            # newlines solves this.
            contents = '\n'.join([str(getattr(row, field.name))
                                  for field in self.text_fields])
            # FIXME: Hardcoded 'contents' field.
            document.add(PyLucene.Field('contents', contents,
                                        PyLucene.Field.Store.YES,
                                        PyLucene.Field.Index.TOKENIZED))
            self._writer.addDocument(document)
        finally:
            if close:
                self.close_writer()

    def search(self, query_string, default_field='contents', order_by='RELEVANCE'):
        """Parse and run `query_string`; return a LuceneResultSet.

        `default_field` is the field searched by unqualified terms.
        `order_by` is 'SCORE', 'INDEX', 'RELEVANCE', or a field name with an
        optional leading '-' for descending (or '+' for ascending) order.
        """
        searcher = PyLucene.IndexSearcher(self._store)
        analyzer = PyLucene.StandardAnalyzer()
        query = PyLucene.QueryParser(default_field, analyzer).parse(query_string)

        if order_by == 'SCORE':
            sort_field = PyLucene.SortField.FIELD_SCORE
            sort = PyLucene.Sort(sort_field)
        elif order_by == 'INDEX':
            sort = PyLucene.Sort.INDEXORDER
        elif order_by == 'RELEVANCE':
            sort = PyLucene.Sort.RELEVANCE
        else:
            reverse = order_by.startswith('-')
            # lstrip cannot run off the end of the string the way the
            # order_by[0] loop did for '', '-' or '+'.
            order_by = order_by.lstrip('+-')
            sort_field = PyLucene.SortField(order_by, reverse)
            sort = PyLucene.Sort(sort_field)
        hits = searcher.search(query, sort)
        return LuceneResultSet(hits, self)


class LuceneResultSet(ResultSet):
    """Wraps the PyLucene hits of one search and yields LuceneHit objects."""

    def __init__(self, hits, indexer):
        self._hits = hits
        self._indexer = indexer

    def __len__(self):
        return self._hits.length()

    def __iter__(self):
        for hit in self._hits:
            yield LuceneHit(hit, self._indexer)

    def __getitem__(self, item):
        # Hit.__init__ needs the indexer as well, exactly as in __iter__.
        return LuceneHit(self._hits.__getitem__(item), self._indexer)


class LuceneHit(Hit):
    """A single Lucene hit; `data` is the underlying PyLucene hit object."""

    def get_pk(self):
        # FIXME: Hardcoded 'pk' field.
        return self.data.get('pk')

    def __getitem__(self, item):
        return self.data.__getitem__(item)

    def get_score(self):
        return self.data.getScore()

    score = property(get_score)


# ---- django/contrib/search/models.py ----
from django.db import models

# Note: These aren't used yet, but they probably will be in the future.
# This is because the only thing that really needs to be remembered
# (the path to the index) is going to go in SETTINGS anyway.
# But persistent info such as outdated rows, search statistics, etc.
# could still be useful.


class Index(models.Model):
    model_name = models.CharField(maxlength=255)


class IndexedField(models.Model):
    object_path = models.CharField(maxlength=255)
    model = models.ForeignKey('Index')


class QueryLog(models.Model):
    """This is not a full log, but merely counts queries."""
    query = models.CharField(maxlength=255, unique=True)
    query_count = models.IntegerField(default=1)
    # Must be qualified: only the `models` module is imported here.
    last_date = models.DateTimeField()
    last_source = models.CharField("Some identifier for who sent the query", maxlength=255)


class Person(models.Model):
    """This is for testing."""
    first_name = models.CharField(maxlength=30)
    last_name = models.CharField(maxlength=30)
    description = models.TextField()
# ---- django/contrib/search/query.py ----
class QueryParser(object):
    """Placeholder for a query language shared by all backends."""
    # TODO: Make a common query language for all the backends.
    pass


class ResultSet(object):
    """Abstract container of search hits; backends implement every method."""

    def __iter__(self):
        raise NotImplementedError

    def __len__(self):
        raise NotImplementedError

    def __getitem__(self, item):
        # Takes the index argument so that `rs[i]` reaches this method and
        # raises NotImplementedError rather than a TypeError.
        raise NotImplementedError


class Hit(object):
    """One search result, tied to the indexer (and model) that produced it.

    `data` is the backend's raw hit object. Subclasses must implement
    `get_pk` and may override `get_score`.
    """

    def __init__(self, data, indexer):
        self.indexer = indexer
        self.model = indexer.model
        self.data = data

    def get_instance(self):
        """Fetch the model instance whose primary key this hit refers to."""
        name = self.model._meta.pk.name
        # The backend's pk value is converted by the pk field's to_python.
        pk = self.model._meta.pk.to_python(self.get_pk())
        return self.model.objects.get(**{name: pk})

    instance = property(get_instance)

    def get_pk(self):
        """Return the primary key stored with this hit (backend-specific)."""
        raise NotImplementedError

    def get_score(self):
        """Return the backend's score for this hit, or None if unknown."""
        return None

    # Dispatches through self so a subclass only has to override get_score.
    score = property(lambda self: self.get_score())

    def __repr__(self):
        return "<%s: %s %s, Score: %s>" % (self.__class__.__name__,
                                           self.model._meta,
                                           self.get_pk(), self.score)
# ---- django/contrib/search/views.py ----
# Create your views here.


# ---- django/contrib/search/xapian.py ----
from django.db import models
from datetime import datetime
import xapwrap.index
import xapwrap.document
from itertools import imap

from base import Indexer
# ResultSet and Hit are defined in query.py, not base.py.
from query import ResultSet, Hit

# TODO: This is incomplete.


class XapianIndexer(Indexer):
    """Indexer backend built on Xapian through the xapwrap wrapper."""

    def update(self, documents=None):
        """Index `documents`, or every row of the model when None.

        attr_fields become xapwrap sort keys, text_fields become text
        fields, and the row's primary key is used as the document uid.
        """
        idx = xapwrap.index.Index(self.path, True)
        try:
            if documents is None:
                update_queue = self.model.objects.all()
            else:
                update_queue = documents

            # Iterate the resolved queue; `documents` may be None.
            for row in update_queue:
                # Values come from the row, not from the model class.
                keys = []
                for name, field in self.attr_fields.items():
                    keys.append(xapwrap.document.SortKey(name, getattr(row, field.name)))

                # NOTE(review): assumes xapwrap.document.TextField(name, text)
                # -- confirm against the installed xapwrap version.
                fields = [xapwrap.document.TextField(field.name,
                                                     str(getattr(row, field.name)))
                          for field in self.text_fields]

                d = xapwrap.document.Document(textFields=fields, sortFields=keys,
                                              uid=row._get_pk_val())
                idx.index(d)
        finally:
            # Close the index even when indexing a row fails.
            idx.close()

    def search(self, query, order_by='RELEVANCE'):
        """Run `query` and return a XapianResultSet.

        `order_by` is 'RELEVANCE' or a sort-key name with an optional
        leading '-' for descending (or '+' for ascending) order.
        """
        idx = xapwrap.index.Index(self.path)
        if order_by == 'RELEVANCE':
            # `sortByRelevence` (sic) is passed through to xapwrap unchanged.
            results = idx.search(query, sortByRelevence=True)
        else:
            ascending = True
            if isinstance(order_by, basestring):
                ascending = not order_by.startswith('-')
                # lstrip cannot run off the end of the string the way the
                # order_by[0] loop did for '', '-' or '+'.
                order_by = order_by.lstrip('+-')
            results = idx.search(query, order_by, sortAscending=ascending)
        return XapianResultSet(results, self)


class XapianResultSet(ResultSet):
    """Wraps the result sequence returned by xapwrap's search."""

    def __init__(self, hits, indexer):
        self._hits = hits
        self._indexer = indexer

    def __len__(self):
        return len(self._hits)

    def __iter__(self):
        for hit in self._hits:
            yield XapianHit(hit, self._indexer)


class XapianHit(Hit):
    """A single xapwrap result; `data` is indexed by key.

    NOTE(review): documents are stored with `uid=`, so the 'pk' key read
    below may need to be 'uid' -- confirm against xapwrap's result format.
    """

    def get_pk(self):
        return self.data['pk']

    def get_score(self):
        return self.data['score']

    score = property(get_score)