summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Brian Beck <exogen@gmail.com>, 2006-08-21 19:02:19 +0000
committer: Brian Beck <exogen@gmail.com>, 2006-08-21 19:02:19 +0000
commit: 91790e27cd00b88b0891cb815a36d21813246c5d (patch)
tree: a0ae13c5dba42547f03ccb86cd88409eec4e07d7
parent: 682aed446b16c1dd6c12d727849a715033878b34 (diff)
download: django-91790e27cd00b88b0891cb815a36d21813246c5d.tar.gz
[search-api] Initial commit, Lucene working, Xapian and Hype almost working, needs polish.
git-svn-id: http://code.djangoproject.com/svn/django/branches/search-api@3636 bcc190cf-cafb-0310-a4f2-bffc1f526a37
-rw-r--r-- django/contrib/search/__init__.py 0
-rw-r--r-- django/contrib/search/backends.py 19
-rw-r--r-- django/contrib/search/base.py 214
-rw-r--r-- django/contrib/search/default.py 9
-rw-r--r-- django/contrib/search/hype.py 35
-rw-r--r-- django/contrib/search/lucene.py 162
-rw-r--r-- django/contrib/search/models.py 27
-rw-r--r-- django/contrib/search/query.py 36
-rw-r--r-- django/contrib/search/views.py 1
-rw-r--r-- django/contrib/search/xapian.py 64
10 files changed, 567 insertions, 0 deletions
diff --git a/django/contrib/search/__init__.py b/django/contrib/search/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/django/contrib/search/__init__.py
diff --git a/django/contrib/search/backends.py b/django/contrib/search/backends.py
new file mode 100644
index 0000000000..d377fd2789
--- /dev/null
+++ b/django/contrib/search/backends.py
@@ -0,0 +1,19 @@
+from default import DefaultIndexer
+
+# The optional backends are imported on a best-effort basis: when a backend
+# module cannot be imported, a notice is printed and the corresponding
+# *Indexer name is simply left undefined in this module.
+
+try:
+    from xapian import XapianIndexer
+except ImportError:
+    print "Xapian backend will not be available due to an ImportError. " \
+          "Do you have Xapian and Xapwrap installed?"
+
+try:
+    from lucene import LuceneIndexer
+except ImportError:
+    print "Lucene backend will not be available due to an ImportError. " \
+          "Do you have Lucene and PyLucene installed?"
+
+try:
+    from hype import HypeIndexer
+except ImportError:
+    # Message spelling fixed ("ImportError") to match the other two notices.
+    print "Hyper Estraier backend will not be available due to an ImportError. " \
+          "Do you have Hyper Estraier and Hype installed?"
diff --git a/django/contrib/search/base.py b/django/contrib/search/base.py
new file mode 100644
index 0000000000..e63137a103
--- /dev/null
+++ b/django/contrib/search/base.py
@@ -0,0 +1,214 @@
+from django.db import models
+from django.core.exceptions import ObjectDoesNotExist
+import sys
+
+# For Python 2.3, which has no builtin `set`.
+# Probe the name directly rather than using hasattr(__builtins__, 'set'):
+# inside an imported module `__builtins__` may be the builtins *dict*, for
+# which hasattr(..., 'set') is False, so the sets.Set fallback would be
+# taken even on interpreters that do have a builtin set.
+try:
+    set
+except NameError:
+    from sets import Set as set
+
+# FIXME: Methods that accept a field parameter claim to accept Field instances
+# or strings giving the object path. However, since there is no Field
+# attribute giving the Model it is bound to, these methods only work for
+# strings at the moment. This doesn't really affect the ease of use of the
+# library, as strings are actually easier to use.
+
+def str_to_field(string, namespace=None):
+    """Resolve the dotted object path `string` to a model Field.
+
+    The path is followed one component at a time, going through models and
+    ForeignKey attributes as needed.
+
+    Example: 'Person.first_name' -> Person._meta.get_field('first_name')
+
+    `namespace` is the dict-like object in which the first path component
+    is looked up. If None, the caller's global namespace is used, thanks
+    to the sys._getframe hack. This matters so that, for example, if
+    `string` is 'models.Person.first_name', the caller's models module is
+    used instead of the django.db.models module imported here.
+
+    If the path ends on a model, that model's primary key field is
+    returned. The returned Field gets a `_model` attribute set to the model
+    it was looked up on (None if no model was involved).
+
+    Raises ValueError if the path does not end on a Field. Lookup errors
+    from the individual getters (KeyError, AttributeError,
+    FieldDoesNotExist) are deliberately left to propagate, since they are
+    already descriptive.
+    """
+    # FIXME: This whole function is either silly or clever...
+    if namespace is None:
+        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
+        obj = sys._getframe(1).f_globals
+    else:
+        obj = namespace
+    getter = obj.__getitem__
+
+    model = None  # model whose fields the *next* component is looked up on
+    owner = None  # model the *current* `obj` was looked up on
+
+    for objName in string.split('.'):
+        # Remember where `obj` comes from before `model` is advanced below;
+        # for a ForeignKey, `model` moves on to the related model while the
+        # field itself still belongs to the previous one.
+        owner = model
+        obj = getter(objName)
+
+        if isinstance(obj, models.base.ModelBase):
+            model = obj
+            getter = model._meta.get_field
+        elif isinstance(obj, models.fields.related.ForeignKey):
+            model = obj.rel.to
+            getter = model._meta.get_field
+
+        # TODO: The rest of these could be more type-smart...
+        elif hasattr(obj, '__getitem__'):
+            getter = obj.__getitem__
+        elif hasattr(obj, '__getattribute__'):
+            getter = obj.__getattribute__
+        else:
+            getter = obj.__getattr__
+
+    if isinstance(obj, models.base.ModelBase):
+        # A bare model path stands for that model's primary key.
+        owner = obj
+        obj = obj._meta.pk
+
+    if not isinstance(obj, models.Field):
+        raise ValueError("%r is not a Field object! (%r -> %r)" % \
+                         (objName, string, obj))
+    # FIXME: I don't think there is a way to get back to a field's Model
+    # from the Field object. This makes sense from a hierarchical viewpoint,
+    # but sure makes things like this harder. Hopefully setting this attribute
+    # won't mess anything up...
+    obj._model = owner
+    return obj
+
+
+class Indexer(object):
+    """Base class for search-index backends.
+
+    Tracks which model fields feed the index: unnamed fields are collected
+    in `self.text_fields` (a set) and named fields in `self.attr_fields`
+    (a dict mapping name -> Field). search(), index() and update() raise
+    NotImplementedError here and are meant to be provided by subclasses.
+    """
+
+    def __init__(self, path, model, fields=None, attributes=None, namespace=None, **kwargs):
+        """Initialize an Indexer whose index data is stored at `path`.
+
+        `model` is the Model whose instances will be used as documents.
+        Note that fields from other models can still be used in the index,
+        but this model will be the one returned from search results.
+        NOTE(review): the original documentation also allowed a string
+        model name, but `model._meta` is read below, so only Model classes
+        work at the moment.
+        `fields` may be optionally initialized as an iterable of unnamed Fields.
+        `attributes` may be optionally initialized as a mapping of field names
+        to Fields. Extra keyword arguments are treated as additional named
+        fields; on a name clash, `attributes` wins.
+        `namespace` is the dict-like object in which fields passed as object
+        paths will be searched. If None, the caller's global namespace will be
+        used, thanks to the sys._getframe hack.
+
+        Example: If `fields` is ['models.Person.first_name'], it is important
+        that namespace['models'] refers to the intended module and NOT the
+        django.db.models module imported here.
+
+        The model's primary key is always indexed: if it was not passed in
+        explicitly, it is added as the named field 'pk'.
+        """
+        if fields is None:
+            fields = []
+        if attributes is None:
+            attributes = kwargs
+        else:
+            # `attributes` should take precedence to `kwargs`.
+            kwargs.update(attributes)
+            attributes = kwargs
+
+        if namespace is None:
+            # FIXME: This uses the sys._getframe hack to get the caller's namespace.
+            namespace = sys._getframe(1).f_globals
+
+        # Hook for backends that need to set up storage before use.
+        self._prepare_path(path)
+
+        self.path = path
+        self.model = model
+        self.text_fields = set()
+        self.attr_fields = {}
+
+        for field in fields:
+            self.add_field(field, namespace=namespace)
+
+        for name, field in attributes.iteritems():
+            self.add_field(field, name, namespace=namespace)
+
+        pk = self.model._meta.pk
+        pk._model = self.model
+        if pk not in self.text_fields and pk not in set(self.attr_fields.values()):
+            self.add_field(pk, 'pk', namespace=namespace)
+
+    def add_field(self, field, name=None, namespace=None):
+        """Add the given field to the Indexer, where `field` is either
+        an object path string or a Field instance. If `name` is None,
+        the field will be added to self.text_fields, otherwise it will be
+        added to self.attr_fields with the given name.
+        `namespace` has the same meaning as in __init__.
+        """
+        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
+        if namespace is None:
+            namespace = sys._getframe(1).f_globals
+
+        # FIXME: Detect duplicates, or user-knows-best?
+        if isinstance(field, basestring):
+            field = str_to_field(field, namespace)
+
+        if name:
+            self.attr_fields[name] = field
+        else:
+            self.text_fields.add(field)
+
+    def remove_field(self, field=None, name=None, find_name=True, namespace=None):
+        """Remove the given field from the Indexer, where `field` is either
+        an object path string or a Field instance. If `name` is given,
+        the field with that name is removed. If both `field` and `name`
+        are given, both are removed if they refer to different fields.
+        If `find_name` is True, the named fields in self.attr_fields are
+        searched for `field`, otherwise only self.text_fields is searched.
+        `namespace` has the same meaning as in __init__.
+        """
+        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
+        if namespace is None:
+            namespace = sys._getframe(1).f_globals
+
+        # No early return after removing by name: when `field` is given as
+        # well it must also be removed, as documented above.
+        if name and name in self.attr_fields:
+            del self.attr_fields[name]
+
+        if field:
+            if isinstance(field, basestring):
+                field = str_to_field(field, namespace)
+
+            self.text_fields.discard(field)
+
+            if find_name:
+                # items() returns a list copy in Python 2, so deleting
+                # entries while looping is safe.
+                for attr_name, attr_field in self.attr_fields.items():
+                    # TODO: Make sure identity is correct here
+                    if attr_field is field:
+                        del self.attr_fields[attr_name]
+
+    def search(self, query_string, sortBy=None):
+        """Query the index for `query_string` and return the results
+        (the backends in this package return a ResultSet).
+        `sortBy` is meant to accept the same values as
+        Model.objects.order_by, plus 'SCORE'; what None means is left to
+        the backend.
+        Must be implemented by subclasses.
+        """
+        raise NotImplementedError
+
+    def index(self, document):
+        """Add a single `document` to the index. Must be implemented by
+        subclasses."""
+        raise NotImplementedError
+
+    def update(self, force=False):
+        """Bring the index up to date. Must be implemented by subclasses."""
+        raise NotImplementedError
+
+    def _prepare_path(self, path):
+        """Backend hook called from __init__ before any field is added;
+        does nothing by default."""
+        pass
+
+def test_indexer():
+    """Smoke-test Indexer field bookkeeping with a throwaway Person model.
+
+    Prints a success message and returns the Indexer that was exercised.
+    """
+    # Note: I'm not very good at writing tests.
+
+    class Person(models.Model):
+        first_name = models.CharField(maxlength=30)
+        last_name = models.CharField(maxlength=30)
+        description = models.TextField()
+
+    # locals() is passed so that the 'Person.*' paths resolve to the model
+    # defined just above.
+    indexer = Indexer('', Person, ['Person.description'],
+                      {'first': 'Person.first_name'},
+                      last='Person.last_name', namespace=locals())
+
+    get_field = Person._meta.get_field
+
+    assert get_field('description') in indexer.text_fields
+    expected_named = set([get_field('first_name'), get_field('last_name')])
+    assert expected_named == set(indexer.attr_fields.values())
+    assert 'first' in indexer.attr_fields
+    assert 'last' in indexer.attr_fields
+
+    indexer.remove_field('Person.description', namespace=locals())
+    assert not indexer.text_fields
+
+    indexer.remove_field(name='last')
+    assert 'last' not in indexer.attr_fields
+    print "Test succeeded."
+    return indexer
diff --git a/django/contrib/search/default.py b/django/contrib/search/default.py
new file mode 100644
index 0000000000..705f5a8f15
--- /dev/null
+++ b/django/contrib/search/default.py
@@ -0,0 +1,9 @@
+from base import Indexer
+
+# This is the future home of a pure-Python text indexer.
+
+# Alec Thomas has created a built-in indexer for his library here:
+# http://swapoff.org/wiki/pyndexter
+
+class DefaultIndexer(Indexer):
+    """Placeholder for the planned pure-Python indexer.
+
+    Adds nothing to Indexer yet.
+    """
\ No newline at end of file
diff --git a/django/contrib/search/hype.py b/django/contrib/search/hype.py
new file mode 100644
index 0000000000..3ec20a98c5
--- /dev/null
+++ b/django/contrib/search/hype.py
@@ -0,0 +1,35 @@
+from base import Indexer
+from query import ResultSet, Hit
+
+import hype
+
+# TODO: This is very incomplete.
+
+class HypeIndexer(Indexer):
+    """Indexer backend for Hyper Estraier via the `hype` bindings.
+
+    Still incomplete (see the TODO above): index() does not store anything
+    yet.
+    NOTE(review): this module is itself named `hype`; confirm that
+    `import hype` above picks up the third-party package and not this file.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Set up the Indexer state, then open (creating if needed) the
+        database at `self.path` for writing.
+
+        Accepts the same arguments as Indexer.__init__.
+        """
+        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
+        # Without it, Indexer.__init__ would see *this* module's globals as
+        # "the caller" and fail to resolve field paths. An explicitly passed
+        # namespace (5th positional argument or keyword) is left alone.
+        if len(args) < 5 and kwargs.get('namespace') is None:
+            import sys
+            kwargs['namespace'] = sys._getframe(1).f_globals
+        # super() must name this class; super(Indexer, self) skipped
+        # Indexer.__init__ entirely, so self.path was never set.
+        super(HypeIndexer, self).__init__(*args, **kwargs)
+        self.db = hype.Database(self.path, hype.ESTDBWRITER | hype.ESTDBCREAT)
+
+    def index(self, row):
+        """Build a document for `row`, tagged with its primary key.
+
+        NOTE(review): incomplete -- add_text() is called without any text
+        and the document is never stored in self.db.
+        """
+        document = hype.Document()
+        document['@pk'] = row._get_pk_val()
+        document.add_text()
+
+    def search(self, query_string, sortBy=None):
+        """Search the database for `query_string` and return a
+        HypeResultSet. `sortBy` is currently ignored."""
+        searcher = self.db.search(query_string)
+        return HypeResultSet(searcher, self)
+
+    def close(self):
+        """Close the underlying database."""
+        self.db.close()
+
+
+class HypeResultSet(ResultSet):
+    """Wraps the hits of a hype search, yielding HypeHit objects."""
+
+    def __init__(self, hits, indexer):
+        # __len__ and __iter__ below rely on both attributes.
+        self._hits = hits
+        self._indexer = indexer
+
+    def __len__(self):
+        return len(self._hits)
+
+    def __iter__(self):
+        for hit in self._hits:
+            yield HypeHit(hit, self._indexer)
+
+class HypeHit(Hit):
+    """A single hype search hit.
+
+    NOTE(review): get_pk() is not implemented yet, so `instance` and
+    repr() raise NotImplementedError.
+    """
+    pass
\ No newline at end of file
diff --git a/django/contrib/search/lucene.py b/django/contrib/search/lucene.py
new file mode 100644
index 0000000000..76f2ad4402
--- /dev/null
+++ b/django/contrib/search/lucene.py
@@ -0,0 +1,162 @@
+from base import Indexer
+from query import ResultSet, Hit
+from itertools import imap
+import os, sys
+
+import PyLucene
+
+# WARNING!*
+# PyLucene wants you to use PyLucene.PythonThread for threading.
+# Look at samples/ThreadIndexFiles.py bundled with PyLucene.
+# * I'm not sure how important this is.
+
+# TODO: Make Lucene aware of field types.
+
+# Here's how to use me:
+#
+# class Person(models.Model):
+# first_name = models.CharField(maxlength=30)
+# last_name = models.CharField(maxlength=30)
+# biography = models.TextField()
+#
+# indexer = LuceneIndexer('/tmp/lucene-index', Person, ['Person.biography'],
+# {'first': 'Person.first_name',
+# 'last': 'Person.last_name'})
+# indexer.update() # Note, calling this multiple times without clearing old
+# # entries will cause duplicates in the index.
+# indexer.search("brian -last:beck")
+
+class LuceneIndexer(Indexer):
+    """Indexer backend storing its index in a Lucene directory via PyLucene.
+
+    Named fields become Lucene fields of the same name; all unnamed (text)
+    fields are joined into a single 'contents' field.
+
+    NOTE(review): open_writer() and _prepare_path() both pass True as the
+    last argument; in Lucene's API that position is a `create` flag, which
+    presumably starts a fresh index each time -- confirm this is intended.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Accepts the same arguments as Indexer.__init__."""
+        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
+        # Only fill in the namespace when the caller did not supply one
+        # (as 5th positional argument or as keyword); it used to be
+        # overwritten unconditionally.
+        if len(args) < 5 and kwargs.get('namespace') is None:
+            kwargs['namespace'] = sys._getframe(1).f_globals
+        super(LuceneIndexer, self).__init__(*args, **kwargs)
+        self.writer_closed = True
+
+    def _prepare_path(self, path):
+        """Open the Lucene directory abstraction for `path`."""
+        # Lucene wants an abstraction of the directory.
+        # Should look into storage in a Model-compatible database in the future...
+        self._store = PyLucene.FSDirectory.getDirectory(path, True)
+
+    def update(self, documents=None):
+        """Re-index `documents`, or every instance of self.model if None.
+
+        Each document is deleted from the index and then indexed again.
+        If the writer was closed on entry it is opened for the duration of
+        the call and closed again afterwards, even on error.
+        """
+        close = self.writer_closed
+        if close:
+            self.open_writer()
+        try:
+            if documents is None:
+                update_queue = self.model.objects.all()
+            else:
+                update_queue = documents
+
+            for document in update_queue:
+                self.delete(document)
+                self.index(document)
+        finally:
+            if close:
+                self.close_writer()
+
+    def clear(self):
+        """Delete every document from the index.
+
+        NOTE(review): relies on the writer exposing deleteDocument(int);
+        confirm against the PyLucene version in use.
+        """
+        close = self.writer_closed
+        if close:
+            self.open_writer()
+        try:
+            for i in xrange(self._writer.docCount()):
+                self._writer.deleteDocument(i)
+        finally:
+            if close:
+                self.close_writer()
+
+    def delete(self, row):
+        """Delete the documents whose 'pk' term matches `row`'s primary key."""
+        reader = PyLucene.IndexReader.open(self.path)
+        try:
+            reader.deleteDocuments(PyLucene.Term('pk', str(row._get_pk_val())))
+        finally:
+            reader.close()
+
+    def open_writer(self):
+        """Create self._writer and mark the writer as open."""
+        self._writer = PyLucene.IndexWriter(self._store, PyLucene.StandardAnalyzer(), True)
+        self._writer.setMaxFieldLength(1048576) # Max number of tokens stored per field?
+        # Flag flipped only once the writer really exists, so a failed open
+        # does not leave the indexer claiming an open writer.
+        self.writer_closed = False
+
+    def close_writer(self):
+        """Optimize the index, close the writer and mark it as closed."""
+        self._writer.optimize()
+        self._writer.close()
+        self.writer_closed = True
+
+    def index(self, row):
+        """Add model instance `row` to the index as one Lucene document.
+
+        If the writer was closed on entry it is opened for the duration of
+        the call and closed again afterwards, even on error.
+        """
+        close = self.writer_closed
+        if close:
+            self.open_writer()
+        try:
+            document = PyLucene.Document()
+
+            for name, field in self.attr_fields.iteritems():
+                # FIXME: Assumes no Foreign Keys! Lame!
+                value = getattr(row, field.name)
+                document.add(PyLucene.Field(name, str(value),
+                                            PyLucene.Field.Store.YES,
+                                            PyLucene.Field.Index.TOKENIZED))
+            # Lucene only seems to support one 'default' field.
+            # However, we might want multiple fields to be searched
+            # by default. Hopefully just joining their contents with
+            # newlines solves this.
+            contents = '\n'.join([str(getattr(row, field.name))
+                                  for field in self.text_fields])
+            # FIXME: Hardcoded 'contents' field.
+            document.add(PyLucene.Field('contents', contents,
+                                        PyLucene.Field.Store.YES,
+                                        PyLucene.Field.Index.TOKENIZED))
+            self._writer.addDocument(document)
+        finally:
+            if close:
+                self.close_writer()
+
+    def search(self, query_string, default_field='contents', order_by='RELEVANCE'):
+        """Parse `query_string` and return the matches as a LuceneResultSet.
+
+        `default_field` is the field searched by terms that name no field.
+        `order_by` is 'SCORE', 'INDEX', 'RELEVANCE', or a field name with an
+        optional '+'/'-' prefix ('-' reverses the sort).
+        """
+        searcher = PyLucene.IndexSearcher(self._store)
+        analyzer = PyLucene.StandardAnalyzer()
+        query = PyLucene.QueryParser(default_field, analyzer).parse(query_string)
+
+        if order_by == 'SCORE':
+            sort_field = PyLucene.SortField.FIELD_SCORE
+            sort = PyLucene.Sort(sort_field)
+        elif order_by == 'INDEX':
+            sort = PyLucene.Sort.INDEXORDER
+        elif order_by == 'RELEVANCE':
+            sort = PyLucene.Sort.RELEVANCE
+        else:
+            reverse = order_by.startswith('-')
+            # lstrip() instead of an order_by[0] loop, which raised
+            # IndexError once the string was reduced to '' (e.g. for '-').
+            order_by = order_by.lstrip('+-')
+            sort_field = PyLucene.SortField(order_by, reverse)
+            sort = PyLucene.Sort(sort_field)
+        hits = searcher.search(query, sort)
+        return LuceneResultSet(hits, self)
+
+
+class LuceneResultSet(ResultSet):
+    """Wraps a PyLucene hits object, handing out LuceneHit instances."""
+
+    def __init__(self, hits, indexer):
+        """`hits` is the PyLucene search result; `indexer` is the
+        LuceneIndexer that produced it."""
+        self._hits = hits
+        self._indexer = indexer
+
+    def __len__(self):
+        return self._hits.length()
+
+    def __iter__(self):
+        for hit in self._hits:
+            yield LuceneHit(hit, self._indexer)
+
+    def __getitem__(self, item):
+        # Hit.__init__ requires the indexer as well; it used to be omitted
+        # here, so indexing a result set raised TypeError.
+        return LuceneHit(self._hits.__getitem__(item), self._indexer)
+
+
+class LuceneHit(Hit):
+    """A single Lucene search hit; `self.data` is the PyLucene hit object."""
+
+    def get_pk(self):
+        """Return the value stored in the hit's 'pk' field."""
+        # FIXME: Hardcoded 'pk' field.
+        return self.data.get('pk')
+
+    def __getitem__(self, item):
+        """Delegate item access to the underlying hit."""
+        return self.data[item]
+
+    def get_score(self):
+        """Return the score reported by the underlying hit."""
+        return self.data.getScore()
+
+    score = property(get_score)
diff --git a/django/contrib/search/models.py b/django/contrib/search/models.py
new file mode 100644
index 0000000000..756bf01bad
--- /dev/null
+++ b/django/contrib/search/models.py
@@ -0,0 +1,27 @@
+from django.db import models
+
+# Note: These aren't used yet, but they probably will be in the future.
+# This is because the only thing that really needs to be remembered
+# (the path to the index) is going to go in SETTINGS anyway.
+# But persistent info such as outdated rows, search statistics, etc.
+# could still be useful.
+
+class Index(models.Model):
+    """Record of an index, identified by the name of its model."""
+    model_name = models.CharField(maxlength=255)
+
+class IndexedField(models.Model):
+    """An object path belonging to an Index (via the `model` foreign key)."""
+    object_path = models.CharField(maxlength=255)
+    model = models.ForeignKey('Index')
+
+class QueryLog(models.Model):
+    """This is not a full log, but merely counts queries."""
+    query = models.CharField(maxlength=255, unique=True)
+    query_count = models.IntegerField(default=1)
+    # DateTimeField lives in django.db.models; the bare name was a NameError
+    # as soon as this module was imported.
+    last_date = models.DateTimeField()
+    last_source = models.CharField("Some identifier for who sent the query", maxlength=255)
+
+class Person(models.Model):
+    """Sample model with two name fields and a free-text description.
+
+    This is for testing.
+    """
+    first_name = models.CharField(maxlength=30)
+    last_name = models.CharField(maxlength=30)
+    description = models.TextField()
\ No newline at end of file
diff --git a/django/contrib/search/query.py b/django/contrib/search/query.py
new file mode 100644
index 0000000000..3b90c43c70
--- /dev/null
+++ b/django/contrib/search/query.py
@@ -0,0 +1,36 @@
+class QueryParser(object):
+    """Placeholder; defines no behavior yet."""
+    # TODO: Make a common query language for all the backends.
+
+
+class ResultSet(object):
+    """Abstract container of search hits.
+
+    Every method raises NotImplementedError here; backends override the
+    ones they support.
+    """
+
+    def __iter__(self):
+        raise NotImplementedError
+
+    def __len__(self):
+        raise NotImplementedError
+
+    def __getitem__(self, item):
+        # Must accept the index/key: without the parameter, result_set[0]
+        # raised TypeError instead of the intended NotImplementedError.
+        raise NotImplementedError
+
+
+class Hit(object):
+    """A single search result.
+
+    `data` is the backend-specific hit object and `indexer` the Indexer
+    that produced it. Subclasses must implement get_pk() and get_score().
+    """
+
+    def __init__(self, data, indexer):
+        self.indexer = indexer
+        self.model = indexer.model
+        self.data = data
+
+    def get_instance(self):
+        """Fetch the model instance this hit refers to, looking it up by
+        primary key."""
+        name = self.model._meta.pk.name
+        pk = self.model._meta.pk.to_python(self.get_pk())
+        return self.model.objects.get(**{name: pk})
+
+    instance = property(get_instance)
+
+    def get_pk(self):
+        """Return the primary key value stored with the hit."""
+        raise NotImplementedError
+
+    def get_score(self):
+        """Return the backend's score for the hit."""
+        raise NotImplementedError
+
+    # __repr__ below reads self.score, which this class used not to define.
+    # The lambda looks get_score up at call time, so overriding get_score
+    # in a subclass is enough.
+    score = property(lambda self: self.get_score())
+
+    def __repr__(self):
+        return "<%s: %s %s, Score: %s>" % (self.__class__.__name__,
+                                           self.model._meta,
+                                           self.get_pk(), self.score)
\ No newline at end of file
diff --git a/django/contrib/search/views.py b/django/contrib/search/views.py
new file mode 100644
index 0000000000..60f00ef0ef
--- /dev/null
+++ b/django/contrib/search/views.py
@@ -0,0 +1 @@
+# Create your views here.
diff --git a/django/contrib/search/xapian.py b/django/contrib/search/xapian.py
new file mode 100644
index 0000000000..a1dbdbb8e9
--- /dev/null
+++ b/django/contrib/search/xapian.py
@@ -0,0 +1,64 @@
+from datetime import datetime
+from itertools import imap
+
+from django.db import models
+import xapwrap.index
+import xapwrap.document
+
+from base import Indexer
+from query import ResultSet, Hit
+
+# TODO: This is incomplete.
+
+class XapianIndexer(Indexer):
+    """Indexer backend for Xapian via the xapwrap wrapper (incomplete)."""
+
+    def update(self, documents=None):
+        """Index `documents`, or every instance of self.model if None.
+
+        Named fields are stored as xapwrap sort keys, unnamed fields as
+        text fields, and the row's primary key as the document uid.
+        """
+        idx = xapwrap.index.Index(self.path, True)
+        try:
+            if documents is None:
+                update_queue = self.model.objects.all()
+            else:
+                update_queue = documents
+
+            # Iterate the resolved queue; looping over `documents` failed
+            # whenever the default (None) was used.
+            for row in update_queue:
+                # Values are read from the row being indexed, not from the
+                # model class.
+                keys = [xapwrap.document.SortKey(name, getattr(row, field.name))
+                        for name, field in self.attr_fields.iteritems()]
+                # `fields` was previously undefined.
+                # NOTE(review): assumes xapwrap.document.TextField accepts
+                # the text as its only argument -- confirm against xapwrap.
+                fields = [xapwrap.document.TextField(str(getattr(row, field.name)))
+                          for field in self.text_fields]
+
+                d = xapwrap.document.Document(textFields=fields, sortFields=keys,
+                                              uid=row._get_pk_val())
+                idx.index(d)
+        finally:
+            idx.close()
+
+    def search(self, query, order_by='RELEVANCE'):
+        """Search the index for `query` and return a XapianResultSet.
+
+        `order_by` is 'RELEVANCE' or a sort key name with an optional
+        '+'/'-' prefix ('-' sorts descending).
+        """
+        # Index is only reachable through the xapwrap.index module here.
+        idx = xapwrap.index.Index(self.path)
+        if order_by == 'RELEVANCE':
+            # 'sortByRelevence' is spelled as in the original call.
+            results = idx.search(query, sortByRelevence=True)
+        else:
+            ascending = True
+            if isinstance(order_by, basestring):
+                ascending = not order_by.startswith('-')
+                # lstrip() cannot run off the end of the string the way the
+                # order_by[0] loop did for '-' or '+'.
+                order_by = order_by.lstrip('+-')
+            results = idx.search(query, order_by, sortAscending=ascending)
+        # XapianResultSet requires the indexer as second argument.
+        return XapianResultSet(results, self)
+
+
+class XapianResultSet(ResultSet):
+    """Wraps the results of a xapwrap search, yielding XapianHit objects."""
+
+    def __init__(self, hits, indexer):
+        """`hits` is the xapwrap search result; `indexer` is the
+        XapianIndexer that produced it."""
+        self._hits = hits
+        self._indexer = indexer
+
+    def __len__(self):
+        return len(self._hits)
+
+    def __iter__(self):
+        # A stray ')' on this line used to make the module unparseable.
+        for hit in self._hits:
+            yield XapianHit(hit, self._indexer)
+
+
+class XapianHit(Hit):
+    """A single Xapian search hit; `self.data` is one xapwrap result.
+
+    Derives from Hit (rather than object) so that it can be constructed
+    with (data, indexer) and so that `self.data` exists.
+    """
+
+    def get_pk(self):
+        """Return the primary key stored with the hit."""
+        # NOTE(review): update() stores the primary key as the document
+        # uid -- confirm that results expose it under 'pk'.
+        return self.data['pk']
+
+    def get_score(self):
+        """Return the score stored with the hit."""
+        return self.data['score']
+
+    score = property(get_score)
+