diff options
Diffstat (limited to 'src/assistant/3rdparty/clucene/src/CLucene/document')
6 files changed, 1095 insertions, 0 deletions
diff --git a/src/assistant/3rdparty/clucene/src/CLucene/document/DateField.cpp b/src/assistant/3rdparty/clucene/src/CLucene/document/DateField.cpp new file mode 100644 index 000000000..ff72b12bb --- /dev/null +++ b/src/assistant/3rdparty/clucene/src/CLucene/document/DateField.cpp @@ -0,0 +1,60 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" + +#include "DateField.h" +#include "CLucene/util/Misc.h" +CL_NS_USE(util) +CL_NS_DEF(document) + +DateField::~DateField(){ +} + +TCHAR* DateField::timeToString(const int64_t time) { + TCHAR* buf = _CL_NEWARRAY(TCHAR,DATEFIELD_DATE_LEN + 1); + timeToString(time,buf); + return buf; +} +void DateField::timeToString(const int64_t time, TCHAR* buf) { + CND_PRECONDITION (buf, "buf == NULL"); + *buf = '\0'; + if (time < 0) + _CLTHROWA (CL_ERR_IllegalArgument,"time too early"); //todo: make richer error + + if (time > DATEFIELD_DATE_MAX) + _CLTHROWA (CL_ERR_IllegalArgument, "time too late (past DATEFIELD_DATE_MAX"); //todo: make richer error + + _i64tot(time, buf, 36); + int32_t bufLen = _tcslen(buf); + + CND_PRECONDITION (bufLen <= DATEFIELD_DATE_LEN, "timeToString length is greater than 9"); + + /* Supply leading zeroes if necessary. */ + if (bufLen < DATEFIELD_DATE_LEN) { + const int32_t nMissingZeroes = DATEFIELD_DATE_LEN - bufLen; + /* Move buffer contents forward to make room for leading zeroes. */ + for (int32_t i = DATEFIELD_DATE_LEN - 1; i >= nMissingZeroes; i--) + buf[i] = buf[i - nMissingZeroes]; + + /* Insert leading zeroes. */ + {// MSVC6 scoping fix + for (int32_t i = 0; i < nMissingZeroes; i++) + buf[i] = '0'; + } + + buf[DATEFIELD_DATE_LEN] = 0; + } + + CND_PRECONDITION (_tcslen(buf) == DATEFIELD_DATE_LEN, "timeToString return is not equal to DATEFIELD_DATE_LEN"); +} + +int64_t DateField::stringToTime(const TCHAR* time) { + TCHAR* end; + return _tcstoi64(time, &end, 36); +} + +CL_NS_END diff --git a/src/assistant/3rdparty/clucene/src/CLucene/document/DateField.h b/src/assistant/3rdparty/clucene/src/CLucene/document/DateField.h new file mode 100644 index 000000000..712fe9b62 --- /dev/null +++ b/src/assistant/3rdparty/clucene/src/CLucene/document/DateField.h @@ -0,0 +1,64 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_document_DateField_ +#define _lucene_document_DateField_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +CL_NS_DEF(document) + +//here are some constants used throughout clucene +//make date strings long enough to last a millenium +#define DATEFIELD_DATE_MAX _ILONGLONG(31536000000000) //1000L*365*24*60*60*1000 + +#define DATEFIELD_DATE_LEN 9 ////Long.toString(DATEFIELD_DATE_MAX, Character.MAX_RADIX).length() + +/** +* Provides support for converting dates to strings and vice-versa. +* The strings are structured so that lexicographic sorting orders by date, +* which makes them suitable for use as field values and search terms. +* +* <P>Note that this class saves dates with millisecond granularity, +* which is bad for {@link RangeQuery} and {@link PrefixQuery}, as those +* queries are expanded to a BooleanQuery with a potentially large number +* of terms when searching. Thus you might want to use +* {@link DateTools} instead. +* +* <P> +* Note: dates before 1970 cannot be used, and therefore cannot be +* indexed when using this class. See {@link DateTools} for an +* alternative without such a limitation. +* +* @deprecated If you build a new index, use {@link DateTools} instead. This class is included for use with existing +* indices and will be removed in a future release. +*/ +class DateField :LUCENE_BASE { +public: + ~DateField(); + + /** + * Converts a millisecond time to a string suitable for indexing. + * @throws RuntimeException if the time specified in the + * method argument is negative, that is, before 1970 + */ + static TCHAR* timeToString(const int64_t time); + + /** + * Converts a millisecond time to a string suitable for indexing. + * @throws CL_ERR_IllegalArgument if the time specified in the + * method argument is negative, that is, before 1970 + * @param str must be a character array DATEFIELD_DATE_LEN+1 or longer + */ + static void timeToString(const int64_t time, TCHAR* str); + + /** Converts a string-encoded date into a millisecond time. */ + static int64_t stringToTime(const TCHAR* s); +}; +CL_NS_END +#endif diff --git a/src/assistant/3rdparty/clucene/src/CLucene/document/Document.cpp b/src/assistant/3rdparty/clucene/src/CLucene/document/Document.cpp new file mode 100644 index 000000000..a0ce03942 --- /dev/null +++ b/src/assistant/3rdparty/clucene/src/CLucene/document/Document.cpp @@ -0,0 +1,237 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "Document.h" +#include "Field.h" +#include "CLucene/util/StringBuffer.h" + +CL_NS_USE(util) +CL_NS_DEF(document) + + DocumentFieldEnumeration::DocumentFieldList::DocumentFieldList(Field* f, DocumentFieldList* n ) { + //Func - Constructor + //Pre - f != NULL + // n may be NULL + //Post - Instance has been created + CND_PRECONDITION(f != NULL, "f is NULL"); + + field = f; + next = n; + } + DocumentFieldEnumeration::DocumentFieldList::~DocumentFieldList(){ + //Func - Destructor + //Pre - true + //Post - Instance has been destroyed + + // Instead of recursively deleting the field list we do + // it iteratively to avoid stack overflows when + // dealing with several thousands of fields. + + if (!field) { + return; // nothing to do; deleted by different invocation of dtor + } + + DocumentFieldList* cur = next; + while (cur != NULL) + { + DocumentFieldList* temp = cur->next; + cur->next = NULL; + + _CLDELETE(cur); + cur = temp; + } + _CLDELETE(field); + } + + + DocumentFieldEnumeration::DocumentFieldEnumeration(const DocumentFieldList* fl){ + //Func - Constructor + //Pre - fl may be NULL + //Post - Instance has been created + + fields = fl; + } + + DocumentFieldEnumeration::~DocumentFieldEnumeration(){ + //Func - Destructor + //Pre - true + //Post - Instance has been destroyed + } + + bool DocumentFieldEnumeration::hasMoreElements() const { + return fields == NULL ? false : true; + } + + Field* DocumentFieldEnumeration::nextElement() { + //Func - Return the next element in the enumeration + //Pre - true + //Post - The next element is returned or NULL + + + Field* result = NULL; + //Check if fields is still valid + if (fields){ + result = fields->field; + fields = fields->next; + } + return result; + } + + /** Constructs a new document with no fields. */ + Document::Document(){ + //Func - Constructor + //Pre - true + //Post - Instance has been created + boost = 1.0f; + fieldList = NULL; + } + + Document::~Document(){ + //Func - Destructor + //Pre - true + //Post - Instance has been destroyed + boost = 1.0f; + _CLDELETE(fieldList); + } + + void Document::clear(){ + _CLDELETE(fieldList); + } + + void Document::add(Field& field) { + fieldList = _CLNEW DocumentFieldEnumeration::DocumentFieldList(&field, fieldList); + } + + void Document::setBoost(qreal boost) { + this->boost = boost; + } + + qreal Document::getBoost() const { + return boost; + } + + + Field* Document::getField(const TCHAR* name) const{ + CND_PRECONDITION(name != NULL, "name is NULL"); + + for (DocumentFieldEnumeration::DocumentFieldList* list = fieldList; list != NULL; list = list->next) + //cannot use interning here, because name is probably not interned + if ( _tcscmp(list->field->name(), name) == 0 ){ + return list->field; + } + + return NULL; + } + + const TCHAR* Document::get(const TCHAR* field) const { + CND_PRECONDITION(field != NULL, "field is NULL"); + Field *f = getField(field); + if (f!=NULL) + return f->stringValue(); //this returns null it is a binary(reader) + else + return NULL; + } + + DocumentFieldEnumeration* Document::fields() const { + return _CLNEW DocumentFieldEnumeration(fieldList); + } + + + TCHAR* Document::toString() const { + StringBuffer ret(_T("Document<")); + for (DocumentFieldEnumeration::DocumentFieldList* list = fieldList; list != NULL; list = list->next) { + TCHAR* tmp = list->field->toString(); + ret.append( tmp ); + if (list->next != NULL) + ret.append(_T(" ")); + _CLDELETE_ARRAY( tmp ); + } + ret.append(_T(">")); + return ret.toString(); + } + + + + void Document::removeField(const TCHAR* name) { + CND_PRECONDITION(name != NULL, "name is NULL"); + + DocumentFieldEnumeration::DocumentFieldList* previous = NULL; + DocumentFieldEnumeration::DocumentFieldList* current = fieldList; + while (current != NULL) { + //cannot use interning here, because name is probably not interned + if ( _tcscmp(current->field->name(),name) == 0 ){ + if (previous){ + previous->next = current->next; + }else + fieldList = current->next; + current->next=NULL; //ensure fieldlist destructor doesnt delete it + _CLDELETE(current); + return; + } + previous = current; + current = current->next; + } + } + + void Document::removeFields(const TCHAR* name) { + CND_PRECONDITION(name != NULL, "name is NULL"); + + DocumentFieldEnumeration::DocumentFieldList* previous = NULL; + DocumentFieldEnumeration::DocumentFieldList* current = fieldList; + while (current != NULL) { + //cannot use interning here, because name is probably not interned + if ( _tcscmp(current->field->name(),name) == 0 ){ + if (previous){ + previous->next = current->next; + }else + fieldList = current->next; + + current->next=NULL; //ensure fieldlist destructor doesnt delete it + _CLDELETE(current); + + if ( previous ) + current = previous->next; + else + current = fieldList; + }else{ + previous = current; + current = current->next; + } + } + } + + TCHAR** Document::getValues(const TCHAR* name) { + DocumentFieldEnumeration* it = fields(); + int32_t count = 0; + while ( it->hasMoreElements() ){ + Field* f = it->nextElement(); + //cannot use interning here, because name is probably not interned + if ( _tcscmp(f->name(),name) == 0 && f->stringValue() != NULL ) + count++; + } + _CLDELETE(it); + it = fields(); + + //todo: there must be a better way of doing this, we are doing two iterations of the fields + TCHAR** ret = NULL; + if ( count > 0 ){ + //start again + ret = _CL_NEWARRAY(TCHAR*,count+1); + int32_t i=0; + while ( it->hasMoreElements() ){ + Field* fld=it->nextElement(); + if ( _tcscmp(fld->name(),name)== 0 && fld->stringValue() != NULL ){ + ret[i] = stringDuplicate(fld->stringValue()); + i++; + } + } + ret[count]=NULL; + } + _CLDELETE(it); + return ret; + } +CL_NS_END diff --git a/src/assistant/3rdparty/clucene/src/CLucene/document/Document.h b/src/assistant/3rdparty/clucene/src/CLucene/document/Document.h new file mode 100644 index 000000000..ba7a283f7 --- /dev/null +++ b/src/assistant/3rdparty/clucene/src/CLucene/document/Document.h @@ -0,0 +1,158 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_document_Document_ +#define _lucene_document_Document_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "Field.h" + +///todo: jlucene has change from using DocumentFieldList/Enumeration +///to using a java List... do we want to do this too? +CL_NS_DEF(document) + +class Document; //predefine +class DocumentFieldEnumeration :LUCENE_BASE{ + class DocumentFieldList :LUCENE_BASE{ + public: + DocumentFieldList(Field* f, DocumentFieldList* n); + ~DocumentFieldList(); + Field* field; + DocumentFieldList* next; + }; + friend class Document; +private: + const DocumentFieldList* fields; +public: + DocumentFieldEnumeration(const DocumentFieldList* fl); + ~DocumentFieldEnumeration(); + bool hasMoreElements() const; + Field* nextElement(); +}; + +/** Documents are the unit of indexing and search. +* +* A Document is a set of fields. Each field has a name and a textual value. +* A field may be {@link Field#isStored() stored} with the document, in which +* case it is returned with search hits on the document. Thus each document +* should typically contain one or more stored fields which uniquely identify +* it. +* +* <p>Note that fields which are <i>not</i> {@link Field#isStored() stored} are +* <i>not</i> available in documents retrieved from the index, e.g. with {@link +* Hits#doc(int32_t, Document*)}, {@link Searcher#doc(int32_t, Document*)} or {@link +* IndexReader#document(int32_t, Document*)}. +*/ +class Document:LUCENE_BASE { +private: + DocumentFieldEnumeration::DocumentFieldList* fieldList; + qreal boost; +public: + Document(); + ~Document(); + + /** + * <p>Adds a field to a document. Several fields may be added with + * the same name. In this case, if the fields are indexed, their text is + * treated as though appended for the purposes of search.</p> + * <p> Note that add like the removeField(s) methods only makes sense + * prior to adding a document to an index. These methods cannot + * be used to change the content of an existing index! In order to achieve this, + * a document has to be deleted from an index and a new changed version of that + * document has to be added.</p> + * + */ + void add(Field& field); + /** Returns a field with the given name if any exist in this document, or + * null. If multiple fields exists with this name, this method returns the + * first value added. + * Note: name is case sensitive + */ + Field* getField(const TCHAR* name) const; + + /** Returns the string value of the field with the given name if any exist in + * this document, or null. If multiple fields exist with this name, this + * method returns the first value added. If only binary fields with this name + * exist, returns null. + * Note: name is case sensitive + */ + const TCHAR* get(const TCHAR* field) const; + + /** Returns an Enumeration of all the fields in a document. */ + DocumentFieldEnumeration* fields() const; + /** Prints the fields of a document for human consumption. */ + TCHAR* toString() const; + + /** Sets a boost factor for hits on any field of this document. This value + * will be multiplied into the score of all hits on this document. + * + * <p>Values are multiplied into the value of {@link Field#getBoost()} of + * each field in this document. Thus, this method in effect sets a default + * boost for the fields of this document. + * + * @see Field#setBoost(qreal) + */ + void setBoost(qreal boost); + + /** Returns the boost factor for hits on any field of this document. + * + * <p>The default value is 1.0. + * + * <p>Note: This value is not stored directly with the document in the index. + * Documents returned from {@link IndexReader#document(int32_t, Document*)} and + * {@link Hits#doc(int32_t, Document*)} may thus not have the same value present as when + * this document was indexed. + * + * @see #setBoost(qreal) + */ + qreal getBoost() const; + + + /** + * <p>Removes field with the specified name from the document. + * If multiple fields exist with this name, this method removes the first field that has been added. + * If there is no field with the specified name, the document remains unchanged.</p> + * <p> Note that the removeField(s) methods like the add method only make sense + * prior to adding a document to an index. These methods cannot + * be used to change the content of an existing index! In order to achieve this, + * a document has to be deleted from an index and a new changed version of that + * document has to be added.</p> + * Note: name is case sensitive + */ + void removeField(const TCHAR* name); + + /** + * <p>Removes all fields with the given name from the document. + * If there is no field with the specified name, the document remains unchanged.</p> + * <p> Note that the removeField(s) methods like the add method only make sense + * prior to adding a document to an index. These methods cannot + * be used to change the content of an existing index! In order to achieve this, + * a document has to be deleted from an index and a new changed version of that + * document has to be added.</p> + * Note: name is case sensitive + */ + void removeFields(const TCHAR* name); + + /** + * Returns an array of values of the field specified as the method parameter. + * This method can return <code>null</code>. + * Note: name is case sensitive + * + * @param name the name of the field + * @return a <code>String[]</code> of field values + */ + TCHAR** getValues(const TCHAR* name); + + /** + * Empties out the document so that it can be reused + */ + void clear(); +}; +CL_NS_END +#endif diff --git a/src/assistant/3rdparty/clucene/src/CLucene/document/Field.cpp b/src/assistant/3rdparty/clucene/src/CLucene/document/Field.cpp new file mode 100644 index 000000000..8cd88a36b --- /dev/null +++ b/src/assistant/3rdparty/clucene/src/CLucene/document/Field.cpp @@ -0,0 +1,315 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "CLucene/util/Reader.h" +#include "Field.h" +#include "CLucene/util/Misc.h" +#include "CLucene/util/StringIntern.h" +#include "CLucene/util/StringBuffer.h" + +CL_NS_USE(util) +CL_NS_DEF(document) + +Field::Field(const TCHAR* Name, const TCHAR* String, bool store, bool index, bool token, const bool storeTermVector) +{ +//Func - Constructor +//Pre - Name != NULL and contains the name of the field +// String != NULL and contains the value of the field +// store indicates if the field must be stored +// index indicates if the field must be indexed +// token indicates if the field must be tokenized +//Post - The instance has been created + + CND_PRECONDITION(Name != NULL, "Name is NULL"); + CND_PRECONDITION(String != NULL,"String is NULL"); + CND_PRECONDITION(!(!index && storeTermVector),"cannot store a term vector for fields that are not indexed."); + + _name = CLStringIntern::intern( Name CL_FILELINE); + _stringValue = stringDuplicate( String ); + _readerValue = NULL; + _streamValue = NULL; + boost=1.0f; + omitNorms = false; + + int cfg = 0; + if ( store ) + cfg |= STORE_YES; + if ( index && token ) + cfg |= INDEX_TOKENIZED; + else if ( index && !token ) + cfg |= INDEX_UNTOKENIZED; + + if ( storeTermVector ) + _CLTHROWA(CL_ERR_IllegalArgument,"Stored term vector is deprecated with using this constructor"); + + setConfig(cfg); +} + +Field::Field(const TCHAR* Name, Reader* reader, bool store, bool index, bool token, const bool storeTermVector) +{ +//Func - Constructor +//Pre - Name != NULL and contains the name of the field +// reader != NULL and contains a Reader +// store indicates if the field must be stored +// index indicates if the field must be indexed +// token indicates if the field must be tokenized +//Post - The instance has been created + + CND_PRECONDITION(Name != NULL, "Name is NULL"); + CND_PRECONDITION(reader != NULL, "reader is NULL"); + + _name = CLStringIntern::intern( Name CL_FILELINE); + _stringValue = NULL; + _readerValue = reader; + _streamValue = NULL; + boost=1.0f; + omitNorms = false; + + int cfg = 0; + if ( store ) + cfg |= STORE_YES; + if ( index && token ) + cfg |= INDEX_TOKENIZED; + else if ( index && !token ) + cfg |= INDEX_UNTOKENIZED; + + if ( storeTermVector ) + _CLTHROWA(CL_ERR_IllegalArgument,"Stored term vector is deprecated with using this constructor"); + + setConfig(cfg); +} + +Field::Field(const TCHAR* Name, Reader* reader, int config) +{ + CND_PRECONDITION(Name != NULL, "Name is NULL"); + CND_PRECONDITION(reader != NULL, "reader is NULL"); + + _name = CLStringIntern::intern( Name CL_FILELINE); + _stringValue = NULL; + _readerValue = reader; + _streamValue = NULL; + boost=1.0f; + omitNorms = false; + + setConfig(config); +} + + +Field::Field(const TCHAR* Name, const TCHAR* Value, int config) +{ + CND_PRECONDITION(Name != NULL, "Name is NULL"); + CND_PRECONDITION(Value != NULL, "value is NULL"); + + _name = CLStringIntern::intern( Name CL_FILELINE); + _stringValue = stringDuplicate( Value ); + _readerValue = NULL; + _streamValue = NULL; + boost=1.0f; + omitNorms = false; + + setConfig(config); +} + +Field::Field(const TCHAR* Name, jstreams::StreamBase<char>* Value, int config) +{ + CND_PRECONDITION(Name != NULL, "Name is NULL"); + CND_PRECONDITION(Value != NULL, "value is NULL"); + + _name = CLStringIntern::intern( Name CL_FILELINE); + _stringValue = NULL; + _readerValue = NULL; + _streamValue = Value; + boost=1.0f; + omitNorms = false; + + setConfig(config); +} + +Field::~Field(){ +//Func - Destructor +//Pre - true +//Post - Instance has been destroyed + + CLStringIntern::unintern(_name); + _CLDELETE_CARRAY(_stringValue); + _CLDELETE(_readerValue); + _CLVDELETE( _streamValue ); +} + + +/*===============FIELDS=======================*/ +const TCHAR* Field::name() { return _name; } ///<returns reference +TCHAR* Field::stringValue() { return _stringValue; } ///<returns reference +Reader* Field::readerValue() { return _readerValue; } ///<returns reference +jstreams::StreamBase<char>* Field::streamValue() { return _streamValue; } ///<returns reference + +bool Field::isStored() { return (config & STORE_YES) != 0; } +bool Field::isIndexed() { return (config & INDEX_TOKENIZED)!=0 || (config & INDEX_UNTOKENIZED)!=0; } +bool Field::isTokenized() { return (config & INDEX_TOKENIZED) != 0; } +bool Field::isCompressed() { return (config & STORE_COMPRESS) != 0; } +bool Field::isBinary() { return _streamValue!=NULL; } + +bool Field::isTermVectorStored() { return (config & TERMVECTOR_YES) != 0; } +bool Field::isStoreOffsetWithTermVector() { return (config & TERMVECTOR_YES) != 0 && (config & TERMVECTOR_WITH_OFFSETS) != 0; } +bool Field::isStorePositionWithTermVector() { return (config & TERMVECTOR_YES) != 0 && (config & TERMVECTOR_WITH_POSITIONS) != 0; } + +bool Field::getOmitNorms() { return omitNorms; } +void Field::setOmitNorms(bool omitNorms) { this->omitNorms=omitNorms; } + +void Field::setBoost(qreal boost) { this->boost = boost; } +qreal Field::getBoost() { return boost; } + +void Field::setConfig(int x){ + int newConfig=0; + + //set storage settings + if ( (x & STORE_YES) || (x & STORE_COMPRESS) ){ + newConfig |= STORE_YES; + if ( x & STORE_COMPRESS ) + newConfig |= STORE_COMPRESS; + }else + newConfig |= STORE_NO; + + if ( (x & INDEX_NO)==0 ){ + bool index=false; + + if ( x & INDEX_NONORMS ){ + newConfig |= INDEX_NONORMS; + index = true; + } + + if ( x & INDEX_TOKENIZED && x & INDEX_UNTOKENIZED ) + _CLTHROWA(CL_ERR_IllegalArgument,"it doesn't make sense to have an untokenised and tokenised field"); + if ( x & INDEX_TOKENIZED ){ + newConfig |= INDEX_TOKENIZED; + index = true; + } + if ( x & INDEX_UNTOKENIZED ){ + newConfig |= INDEX_UNTOKENIZED; + index = true; + } + if ( !index ) + newConfig |= INDEX_NO; + }else + newConfig |= INDEX_NO; + + if ( newConfig & INDEX_NO && newConfig & STORE_NO ) + _CLTHROWA(CL_ERR_IllegalArgument,"it doesn't make sense to have a field that is neither indexed nor stored"); + + //set termvector settings + if ( (x & TERMVECTOR_NO) == 0 ){ + bool termVector=false; + if ( x & TERMVECTOR_YES ){ + termVector=true; + } + if ( x & TERMVECTOR_WITH_OFFSETS ){ + newConfig |= TERMVECTOR_WITH_OFFSETS; + termVector=true; + } + if ( x & TERMVECTOR_WITH_POSITIONS ){ + newConfig |= TERMVECTOR_WITH_POSITIONS; + termVector=true; + } + if ( termVector ){ + if ( newConfig & INDEX_NO ) + _CLTHROWA(CL_ERR_IllegalArgument,"cannot store a term vector for fields that are not indexed."); + + newConfig |= TERMVECTOR_YES; + }else + newConfig |= TERMVECTOR_NO; + }else + newConfig |= TERMVECTOR_NO; + + config = newConfig; +} + +TCHAR* Field::toString() { + CL_NS(util)::StringBuffer result; + if (isStored()) { + result.append( _T("stored") ); + } + if (isIndexed()) { + if (result.length() > 0) + result.append( _T(",") ); + result.append( _T("indexed") ); + } + if (isTokenized()) { + if (result.length() > 0) + result.append( _T(",") ); + result.append( _T("tokenized") ); + } + if (isTermVectorStored()) { + if (result.length() > 0) + result.append( _T(",") ); + result.append( _T("termVector") ); + } + if (isStoreOffsetWithTermVector()) { + if (result.length() > 0) + result.appendChar( ',' ); + result.append( _T("termVectorOffsets") ); + } + if (isStorePositionWithTermVector()) { + if (result.length() > 0) + result.appendChar( ',' ); + result.append( _T("termVectorPosition") ); + } + if (isBinary()) { + if (result.length() > 0) + result.appendChar( ',' ); + result.append( _T("binary") ); + } + if (getOmitNorms()) { + result.append( _T(",omitNorms") ); + } + result.appendChar('<'); + result.append(name()); + result.appendChar(':'); + + if (_stringValue != NULL) + result.append(_stringValue); + else if ( _readerValue != NULL ) + result.append( _T("Reader") ); + else if ( _streamValue != NULL ) + result.append( _T("Stream") ); + else + result.append( _T("NULL") ); + + result.appendChar('>'); + return result.toString(); +} + + +Field* Field::Keyword(const TCHAR* Name, const TCHAR* Value) { + return _CLNEW Field(Name,Value,Field::STORE_YES | Field::INDEX_UNTOKENIZED); +} + +Field* Field::UnIndexed(const TCHAR* Name, const TCHAR* Value) { + return _CLNEW Field(Name,Value,Field::STORE_YES | Field::INDEX_NO); +} + +Field* Field::Text(const TCHAR* Name, const TCHAR* Value, const bool storeTermVector) { + if ( storeTermVector ) + return _CLNEW Field(Name,Value,Field::STORE_YES | Field::INDEX_TOKENIZED | Field::TERMVECTOR_YES); + else + return _CLNEW Field(Name,Value,Field::STORE_YES | Field::INDEX_TOKENIZED); +} + +Field* Field::UnStored(const TCHAR* Name, const TCHAR* Value, const bool storeTermVector) { + if ( storeTermVector ) + return _CLNEW Field(Name,Value,Field::STORE_NO | Field::INDEX_TOKENIZED | Field::TERMVECTOR_YES); + else + return _CLNEW Field(Name,Value,Field::STORE_NO | Field::INDEX_TOKENIZED); +} + +Field* Field::Text(const TCHAR* Name, Reader* Value, const bool storeTermVector) { + if ( storeTermVector ) + return _CLNEW Field(Name,Value,Field::INDEX_TOKENIZED | Field::TERMVECTOR_YES); + else + return _CLNEW Field(Name,Value,Field::INDEX_TOKENIZED); +} + +CL_NS_END diff --git a/src/assistant/3rdparty/clucene/src/CLucene/document/Field.h b/src/assistant/3rdparty/clucene/src/CLucene/document/Field.h new file mode 100644 index 000000000..771a1382b --- /dev/null +++ b/src/assistant/3rdparty/clucene/src/CLucene/document/Field.h @@ -0,0 +1,261 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_document_Field_ +#define _lucene_document_Field_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "CLucene/util/Reader.h" +#include "CLucene/util/streambase.h" + +CL_NS_DEF(document) +/** +A field is a section of a Document. Each field has two parts, a name and a +value. Values may be free text, provided as a String or as a Reader, or they +may be atomic keywords, which are not further processed. Such keywords may +be used to represent dates, urls, etc. Fields are optionally stored in the +index, so that they may be returned with hits on the document. + +PORTING: CLucene doesn't directly support compressed fields. However, it is easy +to reproduce this functionality by using the GZip streams in the contrib package. +Also note that binary fields are not read immediately in CLucene, a substream +is pointed directly to the field's data, in affect creating a lazy load ability. +This means that large fields are best saved in binary format (even if they are +text), so that they can be loaded lazily. +*/ +class Field :LUCENE_BASE{ +private: + const TCHAR* _name; + TCHAR* _stringValue; + CL_NS(util)::Reader* _readerValue; + jstreams::StreamBase<char>* _streamValue; + + int config; + qreal boost; + bool omitNorms; +public: + enum Store{ + /** Store the original field value in the index. This is useful for short texts + * like a document's title which should be displayed with the results. The + * value is stored in its original form, i.e. no analyzer is used before it is + * stored. + */ + STORE_YES=1, + /** Do not store the field value in the index. */ + STORE_NO=2, + + /** Store the original field value in the index in a compressed form. This is + * useful for long documents and for binary valued fields. + * NOTE: CLucene does not directly support compressed fields, to store a + * compressed field. + * //TODO: need better documentation on how to add a compressed field + * //because actually we still need to write a GZipOutputStream... + */ + STORE_COMPRESS=4 + }; + + enum Index{ + /** Do not index the field value. This field can thus not be searched, + * but one can still access its contents provided it is + * {@link Field::Store stored}. */ + INDEX_NO=16, + /** Index the field's value so it can be searched. An Analyzer will be used + * to tokenize and possibly further normalize the text before its + * terms will be stored in the index. This is useful for common text. + */ + INDEX_TOKENIZED=32, + /** Index the field's value without using an Analyzer, so it can be searched. + * As no analyzer is used the value will be stored as a single term. This is + * useful for unique Ids like product numbers. + */ + INDEX_UNTOKENIZED=64, + /** Index the field's value without an Analyzer, and disable + * the storing of norms. No norms means that index-time boosting + * and field length normalization will be disabled. The benefit is + * less memory usage as norms take up one byte per indexed field + * for every document in the index. + */ + INDEX_NONORMS=128 + }; + + enum TermVector{ + /** Do not store term vectors. */ + TERMVECTOR_NO=256, + /** Store the term vectors of each document. A term vector is a list + * of the document's terms and their number of occurences in that document. */ + TERMVECTOR_YES=512, + /** + * Store the term vector + token position information + * + * @see #YES + */ + TERMVECTOR_WITH_POSITIONS=1024, + /** + * Store the term vector + Token offset information + * + * @see #YES + */ + TERMVECTOR_WITH_OFFSETS=2048 + }; + + _CL_DEPRECATED( another overload ) Field(const TCHAR* name, const TCHAR* value, bool store, bool index, bool token, const bool storeTermVector=false); + _CL_DEPRECATED( another overload ) Field(const TCHAR* name, CL_NS(util)::Reader* reader, bool store, bool index, bool token, const bool storeTermVector=false); + + Field(const TCHAR* name, const TCHAR* value, int configs); + Field(const TCHAR* name, CL_NS(util)::Reader* reader, int configs); + Field(const TCHAR* name, jstreams::StreamBase<char>* stream, int configs); + ~Field(); + + /** Constructs a String-valued Field that is not tokenized, but is indexed + * and stored. Useful for non-text fields, e.g. date or url. + * @deprecated Use new Field(name,value,Field::STORE_YES | Field::INDEX_UNTOKENIZED) + */ + _CL_DEPRECATED( new Field(*) ) static Field* Keyword(const TCHAR* name, const TCHAR* value); + + /** Constructs a String-valued Field that is not tokenized nor indexed, + * but is stored in the index, for return with hits. + * @deprecated Use new Field(name,value,Field::STORE_YES | Field::INDEX_NO) + */ + _CL_DEPRECATED( new Field(*) ) static Field* UnIndexed(const TCHAR* name, const TCHAR* value); + + /** Constructs a String-valued Field that is tokenized and indexed, + * and is stored in the index, for return with hits. Useful for short text + * fields, like "title" or "subject". + * @deprecated Use new Field(name,value,Field::STORE_YES | Field::INDEX_TOKENIZED) + */ + _CL_DEPRECATED( new Field(*) ) static Field* Text(const TCHAR* name, const TCHAR* value, const bool storeTermVector=false); + + /** Constructs a String-valued Field that is tokenized and indexed, + * but that is not stored in the index. + * @deprecated Use new Field(name,value,Field::STORE_NO | Field::INDEX_TOKENIZED) + */ + _CL_DEPRECATED( new Field(*) ) static Field* UnStored(const TCHAR* name, const TCHAR* value, const bool storeTermVector=false); + + /** Constructs a Reader-valued Field that is tokenized and indexed, but is + * *not* stored in the index verbatim. Useful for longer text fields, like + * "body". + * @deprecated Use new Field(name,value, Field::INDEX_TOKENIZED) + */ + _CL_DEPRECATED( new Field(*) ) static Field* Text(const TCHAR* name, CL_NS(util)::Reader* value, const bool storeTermVector=false); + + /** The name of the field (e.g., "date", "subject", "title", "body", etc.) + * as an interned string. */ + const TCHAR* name(); ///<returns reference + + /** The value of the field as a String, or null. If null, the Reader value + * or binary value is used. Exactly one of stringValue(), readerValue() and + * streamValue() must be set. */ + TCHAR* stringValue(); ///<returns reference + + /** The value of the field as a reader, or null. If null, the String value + * or stream value is used. Exactly one of stringValue(), readerValue() and + * streamValue() must be set. */ + CL_NS(util)::Reader* readerValue(); + + /** The value of the field as a String, or null. If null, the String value + * or Reader value is used. Exactly one of stringValue(), readerValue() and + * streamValue() must be set. */ + jstreams::StreamBase<char>* streamValue(); + + // True iff the value of the field is to be stored in the index for return + // with search hits. It is an error for this to be true if a field is + // Reader-valued. + bool isStored(); + + // True iff the value of the field is to be indexed, so that it may be + // searched on. + bool isIndexed(); + + // True iff the value of the field should be tokenized as text prior to + // indexing. Un-tokenized fields are indexed as a single word and may not be + // Reader-valued. + bool isTokenized(); + + /** True if the value of the field is stored and compressed within the index + * NOTE: CLucene does not actually support compressed fields, Instead, a reader + * will be returned with a pointer to a SubIndexInputStream. A GZipInputStream + * and a UTF8 reader must be used to actually read the content. This flag + * will only be set if the index was created by another lucene implementation. + */ + bool isCompressed(); + + //Set configs using XOR. This resets all the settings + //For example, to use term vectors with positions and offsets do: + //object->setConfig(TERMVECTOR_WITH_POSITIONS | TERMVECTOR_WITH_OFFSETS); + void setConfig(int termVector); + + /** True iff the term or terms used to index this field are stored as a term + * vector, available from {@link IndexReader#getTermFreqVector(int32_t,TCHAR*)}. + * These methods do not provide access to the original content of the field, + * only to terms used to index it. If the original content must be + * preserved, use the <code>stored</code> attribute instead. + * + * @see IndexReader#getTermFreqVector(int32_t, String) + */ + bool isTermVectorStored(); + + /** + * True iff terms are stored as term vector together with their offsets + * (start and end positon in source text). + */ + bool isStoreOffsetWithTermVector(); + + /** + * True iff terms are stored as term vector together with their token positions. + */ + bool isStorePositionWithTermVector(); + + /** Returns the boost factor for hits for this field. + * + * <p>The default value is 1.0. + * + * <p>Note: this value is not stored directly with the document in the index. + * Documents returned from {@link IndexReader#document(int)} and + * {@link Hits#doc(int)} may thus not have the same value present as when + * this field was indexed. + * + * @see #setBoost(float) + */ + qreal getBoost(); + + /** Sets the boost factor hits on this field. This value will be + * multiplied into the score of all hits on this field of this document. + * + * <p>The boost is multiplied by {@link Document#getBoost()} of the document + * containing this field. If a document has multiple fields with the same + * name, all such values are multiplied together. This product is then + * multipled by the value {@link Similarity#lengthNorm(String,int)}, and + * rounded by {@link Similarity#encodeNorm(float)} before it is stored in the + * index. One should attempt to ensure that this product does not overflow + * the range of that encoding. + * + * @see Document#setBoost(float) + * @see Similarity#lengthNorm(String, int) + * @see Similarity#encodeNorm(float) + */ + void setBoost(qreal value); + + /** True iff the value of the filed is stored as binary */ + bool isBinary(); + + /** True if norms are omitted for this indexed field */ + bool getOmitNorms(); + + /** Expert: + * + * If set, omit normalization factors associated with this indexed field. + * This effectively disables indexing boosts and length normalization for this field. + */ + void setOmitNorms(bool omitNorms); + + // Prints a Field for human consumption. + TCHAR* toString(); +}; +CL_NS_END +#endif |