[search-api] Initial commit, Lucene working, Xapian and Hype almost working, needs polish.

git-svn-id: http://code.djangoproject.com/svn/django/branches/search-api@3636 bcc190cf-cafb-0310-a4f2-bffc1f526a37
author: Brian Beck <exogen@gmail.com> 2006-08-21 19:02:19 +0000
committer: Brian Beck <exogen@gmail.com> 2006-08-21 19:02:19 +0000
commit: 91790e27cd00b88b0891cb815a36d21813246c5d (patch)
tree: a0ae13c5dba42547f03ccb86cd88409eec4e07d7
parent: 682aed446b16c1dd6c12d727849a715033878b34 (diff)
download: django-91790e27cd00b88b0891cb815a36d21813246c5d.tar.gz
10 files changed, 567 insertions, 0 deletions
diff --git a/django/contrib/search/__init__.py b/django/contrib/search/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/django/contrib/search/__init__.py
diff --git a/django/contrib/search/backends.py b/django/contrib/search/backends.py
new file mode 100644
index 0000000000..d377fd2789
--- /dev/null
+++ b/django/contrib/search/backends.py
@@ -0,0 +1,19 @@
+from default import DefaultIndexer
+
+try:
+    from xapian import XapianIndexer
+except ImportError:
+    print "Xapian backend will not be available due to an ImportError. " \
+          "Do you have Xapian and Xapwrap installed?"
+
+try:
+    from lucene import LuceneIndexer
+except ImportError:
+    print "Lucene backend will not be available due to an ImportError. " \
+          "Do you have Lucene and PyLucene installed?"
+
+try:
+    from hype import HypeIndexer
+except ImportError:
+    print "Hyper Estraier backend will not be available due to an importError. " \
+          "Do you have Hyper Estraier and Hype installed?"
diff --git a/django/contrib/search/base.py b/django/contrib/search/base.py
new file mode 100644
index 0000000000..e63137a103
--- /dev/null
+++ b/django/contrib/search/base.py
@@ -0,0 +1,214 @@
+from django.db import models
+from django.core.exceptions import ObjectDoesNotExist
+import sys
+
+# For Python 2.3
+if not hasattr(__builtins__, 'set'):
+    from sets import Set as set
+
+# FIXME: Methods that accept a field parameter claim to accept Field instances
+# or strings giving the object path. However, since there is no Field
+# attribute giving the Model it is bound to, these methods only work for
+# strings at the moment. This doesn't really affect the ease of use of the
+# library, as strings are actually easier to use.
+
+def str_to_field(string, namespace=None):
+    """Gets the column attribute from the model as indicated
+    by `string`, following ForeignKey attributes, etc.
+
+    Example: 'Person.first_name' -> Person._meta.get_field('first_name')
+
+    `namespace` is the dict-like object in which the object path will be
+    searched. If None, the caller's global namespace will be used, thanks
+    to the sys._getframe hack. This is important so that, for example,
+    if `string` is 'models.Person.first_name', the caller's models module
+    is used instead of the django.db.models module imported here.
+    """
+    # FIXME: This whole function is either silly or clever...
+    objPath = string.split('.')
+    model = None
+
+    if namespace is None:
+        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
+        obj = sys._getframe(1).f_globals
+    else:
+        obj = namespace
+    getter = obj.__getitem__
+
+    while objPath:
+        objName = objPath.pop(0)
+
+        # This might be better in a try/except block, but the respective
+        # exceptions for the getters (KeyError, AttributeError,
+        # FieldDoesNotExist) are already pretty descriptive...
+        obj = getter(objName)
+
+        if isinstance(obj, models.base.ModelBase):
+            model = obj
+            getter = model._meta.get_field
+        elif isinstance(obj, models.fields.related.ForeignKey):
+            model = obj.rel.to
+            getter = model._meta.get_field
+
+        # TODO: The rest of these could be more type-smart...
+        elif hasattr(obj, '__getitem__'):
+            getter = obj.__getitem__
+        elif hasattr(obj, '__getattribute__'):
+            getter = obj.__getattribute__
+        else:
+            getter = obj.__getattr__
+
+    if isinstance(obj, models.base.ModelBase):
+        model = obj
+        obj = obj._meta.pk
+
+    if not isinstance(obj, models.Field):
+        raise ValueError("%r is not a Field object! (%r -> %r)" % \
+                         (objName, string, obj))
+    # FIXME: I don't think there is a way to get back to a field's Model
+    # from the Field object. This makes sense from a hierarchical viewpoint,
+    # but sure makes things like this harder. Hopefully setting this attribute
+    # won't mess anything up...
+    obj._model = model
+    return obj
+
+
+class Indexer(object):
+    def __init__(self, path, model, fields=None, attributes=None, namespace=None, **kwargs):
+        """Initialize an Indexer whose index data is stored at `path`.
+        `model` is the Model (or string name of the model) whose instances will
+        be used as documents. Note that fields from other models can still be
+        used in the index, but this model will be the one returned from search
+        results.
+        `fields` may be optionally initialized as an iterable of unnamed Fields.
+        `attributes` may be optionally initialized as a mapping of field names
+        to Fields.
+        `namespace` is the dict-like object in which fields passed as object
+        paths will be searched. If None, the caller's global namespace will be
+        used, thanks to the sys._getframe hack.
+
+        Example: If `fields` is ['models.Person.first_name'], it is important
+        that namespace['models'] refers to the intended module and NOT the
+        django.db.models module imported here.
+        """
+        if fields is None:
+            fields = []
+        if attributes is None:
+            attributes = kwargs
+        else:
+            # `attributes` should take precedence to `kwargs`.
+            kwargs.update(attributes)
+            attributes = kwargs
+
+        if namespace is None:
+            # FIXME: This uses the sys._getframe hack to get the caller's namespace.
+            namespace = sys._getframe(1).f_globals
+
+        self._prepare_path(path)
+
+        self.path = path
+        self.model = model
+        self.text_fields = set([])
+        self.attr_fields = {}
+
+        for field in fields:
+            self.add_field(field, namespace=namespace)
+
+        for name, field in attributes.iteritems():
+            self.add_field(field, name, namespace=namespace)
+
+        pk = self.model._meta.pk
+        pk._model = self.model
+        if pk not in self.text_fields and pk not in set(self.attr_fields.values()):
+            self.add_field(pk, 'pk', namespace=namespace)
+
+    def add_field(self, field, name=None, namespace=None):
+        """Add the given field to the Indexer, where `field` is either
+        an object path string or a Field instance. If `name` is None,
+        the field will be added to self.text_fields, otherwise it will be
+        added to self.attr_fields with the given name.
+        `namespace` has the same meaning as in __init__.
+        """
+        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
+        if namespace is None:
+            namespace = sys._getframe(1).f_globals
+
+        # FIXME: Detect duplicates, or user-knows-best?
+        if isinstance(field, basestring):
+            field = str_to_field(field,  namespace)
+
+        if name:
+            self.attr_fields[name] = field
+        else:
+            self.text_fields.add(field)
+
+    def remove_field(self, field=None, name=None, find_name=True, namespace=None):
+        """Remove the given field from the Indexer, where `field` is either
+        an object path string or a Field instance. If `name` is given,
+        the field with that name is removed. If both `field` and `name`
+        are given, both are removed if they refer to different fields.
+        If `find_name` is True, the named fields in self.attr_fields are
+        searched for `field`, otherwise only self.text_fields is searched.
+        `namespace` has the same meaning as in __init__.
+        """
+        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
+        if namespace is None:
+            namespace = sys._getframe(1).f_globals
+
+        if name:
+            if name in self.attr_fields:
+                # No early return here: `field`, if also given, is removed below.
+                del self.attr_fields[name]
+
+        if field:
+            if isinstance(field, basestring):
+                field = str_to_field(field, namespace)
+
+            self.text_fields.discard(field)
+
+            if find_name:
+                for name, f in self.attr_fields.items():
+                    # TODO: Make sure identity is correct here
+                    if f is field:
+                        del self.attr_fields[name]
+
+    def search(self, query_string, sortBy=None):
+        """Query the index for `query_string` and return a ResultSet instance.
+        `sortBy` selects the ordering of the results; the accepted values are
+        defined by the backend. Subclasses must override this method.
+        """
+        raise NotImplementedError
+
+    def index(self, document):
+        raise NotImplementedError
+
+    def update(self, force=False):
+        raise NotImplementedError
+
+    def _prepare_path(self, path):
+        pass
+
+def test_indexer():
+    # Note: I'm not very good at writing tests.
+
+    class Person(models.Model):
+        first_name = models.CharField(maxlength=30)
+        last_name = models.CharField(maxlength=30)
+        description = models.TextField()
+
+    i = Indexer('', Person, ['Person.description'], {'first': 'Person.first_name'},
+                last='Person.last_name', namespace=locals())
+
+    assert Person._meta.get_field('description') in i.text_fields
+    assert set([Person._meta.get_field('first_name'),
+                Person._meta.get_field('last_name')]) == \
+           set(i.attr_fields.values())
+    assert 'first' in i.attr_fields and 'last' in i.attr_fields
+
+    i.remove_field('Person.description', namespace=locals())
+    assert not i.text_fields
+
+    i.remove_field(name='last')
+    assert 'last' not in i.attr_fields
+    print "Test succeeded."
+    return i
diff --git a/django/contrib/search/default.py b/django/contrib/search/default.py
new file mode 100644
index 0000000000..705f5a8f15
--- /dev/null
+++ b/django/contrib/search/default.py
@@ -0,0 +1,9 @@
+from base import Indexer
+
+# This is the future home of a pure-Python text indexer.
+
+# Alec Thomas has created a built-in indexer for his library here:
+#   http://swapoff.org/wiki/pyndexter
+
+class DefaultIndexer(Indexer):
+    pass
+\ No newline at end of file
diff --git a/django/contrib/search/hype.py b/django/contrib/search/hype.py
new file mode 100644
index 0000000000..3ec20a98c5
--- /dev/null
+++ b/django/contrib/search/hype.py
@@ -0,0 +1,35 @@
+from base import Indexer
+from query import ResultSet, Hit
+
+import hype
+
+# TODO: This is very incomplete.
+
+class HypeIndexer(Indexer):
+    def __init__(self, *args, **kwargs):
+        super(HypeIndexer, self).__init__(*args, **kwargs)  # runs Indexer.__init__, which sets self.path
+        self.db = hype.Database(self.path, hype.ESTDBWRITER | hype.ESTDBCREAT)
+
+    def index(self, row):
+        document = hype.Document()
+        document['@pk'] = row._get_pk_val()
+        document.add_text()
+
+    def search(self, query_string, sortBy=None):
+        searcher = self.db.search(query_string)
+        return HypeResultSet(searcher)
+
+    def close(self):
+        self.db.close()
+
+
+class HypeResultSet(ResultSet):
+    def __len__(self):
+        return len(self._hits)
+
+    def __iter__(self):
+        for hit in self._hits:
+            yield HypeHit(hit, self._indexer)
+
+class HypeHit(Hit):
+    pass
+\ No newline at end of file
diff --git a/django/contrib/search/lucene.py b/django/contrib/search/lucene.py
new file mode 100644
index 0000000000..76f2ad4402
--- /dev/null
+++ b/django/contrib/search/lucene.py
@@ -0,0 +1,162 @@
+from base import Indexer
+from query import ResultSet, Hit
+from itertools import imap
+import os, sys
+
+import PyLucene
+
+# WARNING!*
+# PyLucene wants you to use PyLucene.PythonThread for threading.
+# Look at samples/ThreadIndexFiles.py bundled with PyLucene.
+# * I'm not sure how important this is.
+
+# TODO: Make Lucene aware of field types.
+
+# Here's how to use me:
+#
+# class Person(models.Model):
+#     first_name = models.CharField(maxlength=30)
+#     last_name = models.CharField(maxlength=30)
+#     biography = models.TextField()
+#
+# indexer = LuceneIndexer('/tmp/lucene-index', Person, ['Person.biography'],
+#                         {'first': 'Person.first_name',
+#                          'last': 'Person.last_name'})
+# indexer.update() # Note, calling this multiple times without clearing old
+#                  # entries will cause duplicates in the index.
+# indexer.search("brian -last:beck")
+
+class LuceneIndexer(Indexer):
+    def __init__(self, *args, **kwargs):
+        # FIXME: sys._getframe hack; only used when no `namespace` was passed.
+        if kwargs.get('namespace') is None:
+            kwargs['namespace'] = sys._getframe(1).f_globals
+        super(LuceneIndexer, self).__init__(*args, **kwargs)
+        self.writer_closed = True
+
+    def _prepare_path(self, path):
+        # Lucene wants an abstraction of the directory.
+        # Should look into storage in a Model-compatible database in the future...
+        self._store = PyLucene.FSDirectory.getDirectory(path, True)
+
+    def update(self, documents=None):
+        close = False
+        if self.writer_closed:
+            close = True
+            self.open_writer()
+
+        if documents is None:
+            update_queue = self.model.objects.all()
+        else:
+            update_queue = documents
+
+        for document in update_queue:
+            self.delete(document)
+            self.index(document)
+
+        if close:
+            self.close_writer()
+
+    def clear(self):
+        close = False
+        if self.writer_closed:
+            close = True
+            self.open_writer()
+        for i in xrange(self._writer.docCount()):
+            self._writer.deleteDocument(i)
+        if close:
+            self.close_writer()
+
+    def delete(self, row):
+        reader = PyLucene.IndexReader.open(self.path)
+        reader.deleteDocuments(PyLucene.Term('pk', str(row._get_pk_val())))
+        reader.close()
+
+    def open_writer(self):
+        self.writer_closed = False
+        self._writer = PyLucene.IndexWriter(self._store, PyLucene.StandardAnalyzer(), True)
+        self._writer.setMaxFieldLength(1048576) # Max number of tokens stored per field?
+
+    def close_writer(self):
+        self._writer.optimize()
+        self._writer.close()
+        self.writer_closed = True
+
+    def index(self, row):
+        close = False
+        if self.writer_closed:
+            close = True
+            self.open_writer()
+
+        document = PyLucene.Document()
+
+        for name, field in self.attr_fields.iteritems():
+            # FIXME: Assumes no Foreign Keys! Lame!
+            value = getattr(row, field.name)
+            document.add(PyLucene.Field(name, str(value),
+                                        PyLucene.Field.Store.YES,
+                                        PyLucene.Field.Index.TOKENIZED))
+        # Lucene only seems to support one 'default' field.
+        # However, we might want multiple fields to be searched
+        # by default. Hopefully just joining their contents with
+        # newlines solves this.
+        contents = '\n'.join([str(getattr(row, field.name)) for field in \
+                              self.text_fields])
+        # FIXME: Hardcoded 'contents' field.
+        document.add(PyLucene.Field('contents', contents,
+                                    PyLucene.Field.Store.YES,
+                                    PyLucene.Field.Index.TOKENIZED))
+        self._writer.addDocument(document)
+        if close:
+            self.close_writer()
+
+    def search(self, query_string, default_field='contents', order_by='RELEVANCE'):
+        searcher = PyLucene.IndexSearcher(self._store)
+        analyzer = PyLucene.StandardAnalyzer()
+        query = PyLucene.QueryParser(default_field, analyzer).parse(query_string)
+
+        if order_by == 'SCORE':
+            sort_field = PyLucene.SortField.FIELD_SCORE
+            sort = PyLucene.Sort(sort_field)
+        elif order_by == 'INDEX':
+            sort = PyLucene.Sort.INDEXORDER
+        elif order_by == 'RELEVANCE':
+            sort = PyLucene.Sort.RELEVANCE
+        else:
+            reverse = order_by.startswith('-')
+            while order_by[0] in '+-':
+                order_by = order_by[1:]
+            sort_field = PyLucene.SortField(order_by, reverse)
+            sort = PyLucene.Sort(sort_field)
+        hits = searcher.search(query, sort)
+        return LuceneResultSet(hits, self)
+
+
+class LuceneResultSet(ResultSet):
+    def __init__(self, hits, indexer):
+        self._hits = hits
+        self._indexer = indexer
+
+    def __len__(self):
+        return self._hits.length()
+
+    def __iter__(self):
+        for hit in self._hits:
+            yield LuceneHit(hit, self._indexer)
+
+    def __getitem__(self, item):
+        return LuceneHit(self._hits.__getitem__(item), self._indexer)  # Hit.__init__ needs the indexer
+
+
+class LuceneHit(Hit):
+    def get_pk(self):
+        # FIXME: Hardcoded 'pk' field.
+        return self.data.get('pk')
+
+    def __getitem__(self, item):
+        return self.data.__getitem__(item)
+
+    def get_score(self):
+        return self.data.getScore()
+
+    score = property(get_score)
diff --git a/django/contrib/search/models.py b/django/contrib/search/models.py
new file mode 100644
index 0000000000..756bf01bad
--- /dev/null
+++ b/django/contrib/search/models.py
@@ -0,0 +1,27 @@
+from django.db import models
+
+# Note: These aren't used yet, but they probably will be in the future.
+# This is because the only thing that really needs to be remembered
+# (the path to the index) is going to go in SETTINGS anyway.
+# But persistent info such as outdated rows, search statistics, etc.
+# could still be useful.
+
+class Index(models.Model):
+    model_name = models.CharField(maxlength=255)
+
+class IndexedField(models.Model):
+    object_path = models.CharField(maxlength=255)
+    model = models.ForeignKey('Index')
+
+class QueryLog(models.Model):
+    """This is not a full log, but merely counts queries."""
+    query = models.CharField(maxlength=255, unique=True)
+    query_count = models.IntegerField(default=1)
+    last_date = models.DateTimeField()  # was a bare DateTimeField(), which is not in scope
+    last_source = models.CharField("Some identifier for who sent the query", maxlength=255)
+
+class Person(models.Model):
+    """This is for testing."""
+    first_name = models.CharField(maxlength=30)
+    last_name = models.CharField(maxlength=30)
+    description = models.TextField()
+\ No newline at end of file
diff --git a/django/contrib/search/query.py b/django/contrib/search/query.py
new file mode 100644
index 0000000000..3b90c43c70
--- /dev/null
+++ b/django/contrib/search/query.py
@@ -0,0 +1,36 @@
+class QueryParser(object):
+    # TODO: Make a common query language for all the backends.
+    pass
+
+
+class ResultSet(object):
+    def __iter__(self):
+        raise NotImplementedError
+
+    def __len__(self):
+        raise NotImplementedError
+
+    def __getitem__(self, item):  # `item` is required so rs[i] reaches the NotImplementedError
+        raise NotImplementedError
+
+
+class Hit(object):
+    def __init__(self, data, indexer):
+        self.indexer = indexer
+        self.model = indexer.model
+        self.data = data
+
+    def get_instance(self):
+        name = self.model._meta.pk.name
+        pk = self.model._meta.pk.to_python(self.get_pk())
+        return self.model.objects.get(**{name: pk})
+
+    instance = property(get_instance)
+
+    def get_pk(self):
+        raise NotImplementedError
+
+    def __repr__(self):
+        return "<%s: %s %s, Score: %s>" % (self.__class__.__name__,
+                                           self.model._meta,
+                                           self.get_pk(), self.score)
+\ No newline at end of file
diff --git a/django/contrib/search/views.py b/django/contrib/search/views.py
new file mode 100644
index 0000000000..60f00ef0ef
--- /dev/null
+++ b/django/contrib/search/views.py
@@ -0,0 +1 @@
+# Create your views here.
diff --git a/django/contrib/search/xapian.py b/django/contrib/search/xapian.py
new file mode 100644
index 0000000000..a1dbdbb8e9
--- /dev/null
+++ b/django/contrib/search/xapian.py
@@ -0,0 +1,64 @@
+from django.db import models
+from datetime import datetime
+import xapwrap.index
+import xapwrap.document
+from itertools import imap
+
+from base import Indexer
+from query import ResultSet, Hit
+# TODO: This is incomplete.
+
+class XapianIndexer(Indexer):
+    def update(self, documents=None):
+        """Index `documents` (default: all rows of self.model) at self.path."""
+        idx = xapwrap.index.Index(self.path, True)
+
+        if documents is None:
+            documents = self.model.objects.all()
+
+        for row in documents:
+            keys = [xapwrap.document.SortKey(name, getattr(row, field.name))
+                    for name, field in self.attr_fields.iteritems()]
+            # NOTE(review): assumes TextField accepts the text as its sole argument -- confirm.
+            fields = [xapwrap.document.TextField(str(getattr(row, f.name)))
+                      for f in self.text_fields]
+            d = xapwrap.document.Document(textFields=fields, sortFields=keys, uid=row._get_pk_val())
+            idx.index(d)
+        idx.close()
+
+    def search(self, query, order_by='RELEVANCE'):
+        idx = xapwrap.index.Index(self.path)  # bare `Index` is not imported in this module
+        if order_by == 'RELEVANCE':
+            results = idx.search(query, sortByRelevence=True)
+        else:
+            ascending = True
+            if isinstance(order_by, basestring) and order_by.startswith('-'):
+                ascending = False
+            while order_by[0] in '+-':
+                order_by = order_by[1:]
+            results = idx.search(query, order_by, sortAscending=ascending)
+        return XapianResultSet(results, self)  # the result set needs its indexer
+
+
+class XapianResultSet(ResultSet):
+    def __init__(self, hits, indexer):
+        self._hits = hits
+        self._indexer = indexer
+
+    def __len__(self):
+        return len(self._hits)
+
+    def __iter__(self):
+        for hit in self._hits:
+            yield XapianHit(hit, self._indexer)
+
+
+class XapianHit(Hit):
+    def get_pk(self):
+        return self.data['pk']
+
+    def get_score(self):
+        return self.data['score']
+
+    score = property(get_score)
+
+
author	Brian Beck <exogen@gmail.com>	2006-08-21 19:02:19 +0000
committer	Brian Beck <exogen@gmail.com>	2006-08-21 19:02:19 +0000
commit	91790e27cd00b88b0891cb815a36d21813246c5d (patch)
tree	a0ae13c5dba42547f03ccb86cd88409eec4e07d7
parent	682aed446b16c1dd6c12d727849a715033878b34 (diff)
download	django-91790e27cd00b88b0891cb815a36d21813246c5d.tar.gz