diff options
Diffstat (limited to 'src/assistant/3rdparty/clucene/src/CLucene/analysis/standard')
7 files changed, 752 insertions, 0 deletions
diff --git a/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardAnalyzer.cpp b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardAnalyzer.cpp new file mode 100644 index 000000000..e0994c41a --- /dev/null +++ b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardAnalyzer.cpp @@ -0,0 +1,46 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "StandardAnalyzer.h" + +#include "CLucene/util/VoidMap.h" +#include "CLucene/util/Reader.h" +#include "CLucene/analysis/AnalysisHeader.h" +#include "CLucene/analysis/Analyzers.h" +#include "StandardFilter.h" +#include "StandardTokenizer.h" + +CL_NS_USE(util) +CL_NS_USE(analysis) + +CL_NS_DEF2(analysis,standard) + + StandardAnalyzer::StandardAnalyzer(): + stopSet(false) + { + StopFilter::fillStopTable( &stopSet,CL_NS(analysis)::StopAnalyzer::ENGLISH_STOP_WORDS); + } + + StandardAnalyzer::StandardAnalyzer( const TCHAR** stopWords): + stopSet(false) + { + StopFilter::fillStopTable( &stopSet,stopWords ); + } + + StandardAnalyzer::~StandardAnalyzer(){ + } + + + TokenStream* StandardAnalyzer::tokenStream(const TCHAR* fieldName, Reader* reader) + { + TokenStream* ret = _CLNEW StandardTokenizer(reader); + ret = _CLNEW StandardFilter(ret,true); + ret = _CLNEW LowerCaseFilter(ret,true); + ret = _CLNEW StopFilter(ret,true, &stopSet); + return ret; + } +CL_NS_END2 diff --git a/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardAnalyzer.h b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardAnalyzer.h new file mode 100644 index 000000000..9cce041df --- /dev/null +++ b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardAnalyzer.h @@ -0,0 +1,47 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_analysis_standard_StandardAnalyzer +#define _lucene_analysis_standard_StandardAnalyzer + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "CLucene/util/VoidMap.h" +#include "CLucene/util/Reader.h" +#include "CLucene/analysis/AnalysisHeader.h" +#include "CLucene/analysis/Analyzers.h" +#include "StandardFilter.h" +#include "StandardTokenizer.h" + + +CL_NS_DEF2(analysis,standard) + + /** Represents a standard analyzer. */ + class StandardAnalyzer : public Analyzer + { + private: + CL_NS(util)::CLSetList<const TCHAR*> stopSet; + public: + /** Builds an analyzer.*/ + StandardAnalyzer(); + + /** Builds an analyzer with the given stop words. */ + StandardAnalyzer( const TCHAR** stopWords); + + ~StandardAnalyzer(); + + + /** + * Constructs a StandardTokenizer filtered by a + * StandardFilter, a LowerCaseFilter and a StopFilter. + */ + TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) + ; + }; +CL_NS_END2 +#endif diff --git a/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardFilter.cpp b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardFilter.cpp new file mode 100644 index 000000000..9869d2592 --- /dev/null +++ b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardFilter.cpp @@ -0,0 +1,58 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "StandardFilter.h" + +#include "../AnalysisHeader.h" +#include "../Analyzers.h" +#include "StandardTokenizerConstants.h" +#include "CLucene/util/StringBuffer.h" + +CL_NS_USE(analysis) +CL_NS_USE(util) +CL_NS_DEF2(analysis,standard) + + StandardFilter::StandardFilter(TokenStream* in, bool deleteTokenStream): + TokenFilter(in, deleteTokenStream) + { + } + + StandardFilter::~StandardFilter(){ + } + + bool StandardFilter::next(Token* t) { + if (!input->next(t)) + return false; + + TCHAR* text = t->_termText; + const int32_t textLength = t->termTextLength(); + const TCHAR* type = t->type(); + + if ( type == tokenImage[APOSTROPHE] && //we can compare the type directy since the type should always come from the tokenImage + ( textLength >= 2 && _tcsicmp(text+textLength-2, _T("'s"))==0 ) ) + { + // remove 's + text[textLength-2]=0; + t->resetTermTextLen(); + + return true; + + } else if ( type == tokenImage[ACRONYM] ) { // remove dots + int32_t j = 0; + for ( int32_t i=0;i<textLength;i++ ){ + if ( text[i] != '.' ) + text[j++]=text[i]; + } + text[j]=0; + return true; + + } else { + return true; + } + } + +CL_NS_END2 diff --git a/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardFilter.h b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardFilter.h new file mode 100644 index 000000000..59657fdb3 --- /dev/null +++ b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardFilter.h @@ -0,0 +1,37 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_analysis_standard_StandardFilter +#define _lucene_analysis_standard_StandardFilter + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "../AnalysisHeader.h" +#include "../Analyzers.h" +#include "StandardTokenizerConstants.h" +#include "CLucene/util/StringBuffer.h" + +CL_NS_DEF2(analysis,standard) + + /** Normalizes tokens extracted with {@link StandardTokenizer}. */ + class StandardFilter: public TokenFilter{ + public: + // Construct filtering <i>in</i>. + StandardFilter(TokenStream* in, bool deleteTokenStream); + + ~StandardFilter(); + + + /** Returns the next token in the stream, or NULL at EOS. + * <p>Removes <tt>'s</tt> from the end of words. + * <p>Removes dots from acronyms. + */ + bool next(Token* token); + }; +CL_NS_END2 +#endif diff --git a/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.cpp b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.cpp new file mode 100644 index 000000000..60f9a449c --- /dev/null +++ b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.cpp @@ -0,0 +1,446 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "StandardTokenizer.h" + +CL_NS_USE(analysis) +CL_NS_USE(util) +CL_NS_DEF2(analysis,standard) + + const static TCHAR* tokenImageArray[] = { + _T("<EOF>"), + _T("<UNKNOWN>"), + _T("<ALPHANUM>"), + _T("<APOSTROPHE>"), + _T("<ACRONYM>"), + _T("<COMPANY>"), + _T("<EMAIL>"), + _T("<HOST>"), + _T("<NUM>"), + _T("<CJK>") + }; + const TCHAR** tokenImage = tokenImageArray; + + /* A bunch of shortcut macros, many of which make assumptions about variable + ** names. These macros enhance readability, not just convenience! */ + #define EOS (ch==-1 || rd->Eos()) + #define SPACE (_istspace((TCHAR)ch) != 0) + #define ALPHA (_istalpha((TCHAR)ch) != 0) + #define ALNUM (_istalnum(ch) != 0) + #define DIGIT (_istdigit(ch) != 0) + #define UNDERSCORE (ch == '_') + + #define _CJK ( (ch>=0x3040 && ch<=0x318f) || \ + (ch>=0x3300 && ch<=0x337f) || \ + (ch>=0x3400 && ch<=0x3d2d) || \ + (ch>=0x4e00 && ch<=0x9fff) || \ + (ch>=0xf900 && ch<=0xfaff) || \ + (ch>=0xac00 && ch<=0xd7af) ) //korean + + + #define DASH (ch == '-') + #define NEGATIVE_SIGN_ DASH + //#define POSITIVE_SIGN_ (ch == '+') + //#define SIGN (NEGATIVE_SIGN_ || POSITIVE_SIGN_) + + #define DOT (ch == '.') + #define DECIMAL DOT + + + //freebsd seems to have a problem with defines over multiple lines, so this has to be one long line + #define _CONSUME_AS_LONG_AS(conditionFails) while (true) { ch = readChar(); if (ch==-1 || (!(conditionFails) || str.len >= LUCENE_MAX_WORD_LEN)) { break; } str.appendChar(ch);} + + #define CONSUME_ALPHAS _CONSUME_AS_LONG_AS(ALPHA) + + #define CONSUME_DIGITS _CONSUME_AS_LONG_AS(DIGIT) + + /* otherMatches is a condition (possibly compound) under which a character + ** that's not an ALNUM or UNDERSCORE can be considered not to break the + ** span. Callers should pass false if only ALNUM/UNDERSCORE are acceptable. */ + #define CONSUME_WORD _CONSUME_AS_LONG_AS(ALNUM || UNDERSCORE) + + /* + ** Consume CJK characters + */ + #define CONSUME_CJK _CONSUME_AS_LONG_AS(_CJK) + + + /* It is considered that "nothing of value" has been read if: + ** a) The "read head" hasn't moved since specialCharPos was established. + ** or + ** b) The "read head" has moved by one character, but that character was + ** either whitespace or not among the characters found in the body of + ** a token (deliberately doesn't include the likes of '@'/'&'). */ + #define CONSUMED_NOTHING_OF_VALUE (rdPos == specialCharPos || (rdPos == specialCharPos+1 && ( SPACE || !(ALNUM || DOT || DASH || UNDERSCORE) ))) + + #define RIGHTMOST(sb) (sb.getBuffer()[sb.len-1]) + #define RIGHTMOST_IS(sb, c) (RIGHTMOST(sb) == c) + /* To discard the last character in a StringBuffer, we decrement the buffer's + ** length indicator and move the terminator back by one character. */ + #define SHAVE_RIGHTMOST(sb) (sb.getBuffer()[--sb.len] = '\0') + + //#define REMOVE_TRAILING_CHARS(sb, charMatchesCondition) { TCHAR* sbBuf = sb.getBuffer(); for (int32_t i = sb.len-1; i >= 0; i--) { TCHAR c = sbBuf[i]; if (charMatchesCondition) { sbBuf[--sb.len] = '\0'; } else {break;}}} + + /* Does StringBuffer sb contain any of the characters in string ofThese? */ + #define CONTAINS_ANY(sb, ofThese) (_tcscspn(sb.getBuffer(), _T(ofThese)) != static_cast<size_t>(sb.len)) + + + StandardTokenizer::StandardTokenizer(Reader* reader): + rd(_CLNEW FastCharStream(reader)), + /* rdPos is zero-based. It starts at -1, and will advance to the first + ** position when readChar() is first called. */ + rdPos(-1), + tokenStart(-1) + { + } + + StandardTokenizer::~StandardTokenizer() { + _CLDELETE(rd); + } + + int StandardTokenizer::readChar() { + /* Increment by 1 because we're speaking in terms of characters, not + ** necessarily bytes: */ + rdPos++; + return rd->GetNext(); + } + + void StandardTokenizer::unReadChar() { + rd->UnGet(); + rdPos--; + } + + inline bool StandardTokenizer::setToken(Token* t, StringBuffer* sb, TokenTypes tokenCode) { + t->setStartOffset(tokenStart); + t->setEndOffset(tokenStart+sb->length()); + t->setType(tokenImage[tokenCode]); + sb->getBuffer(); //null terminates the buffer + t->resetTermTextLen(); + return true; + } + + bool StandardTokenizer::next(Token* t) { + int ch=0; + while (!EOS) { + ch = readChar(); + + if ( ch == 0 || ch == -1 ){ + continue; + } else if (SPACE) { + continue; + } else if (ALPHA || UNDERSCORE) { + tokenStart = rdPos; + return ReadAlphaNum(ch,t); + } else if (DIGIT || NEGATIVE_SIGN_ || DECIMAL) { + tokenStart = rdPos; + /* ReadNumber returns NULL if it fails to extract a valid number; in + ** that case, we just continue. */ + if (ReadNumber(NULL, ch,t)) + return true; + } else if ( _CJK ){ + if ( ReadCJK(ch,t) ) + return true; + } + } + return false; + } + + bool StandardTokenizer::ReadNumber(const TCHAR* previousNumber, const TCHAR prev,Token* t) { + /* previousNumber is only non-NULL if this function already read a complete + ** number in a previous recursion, yet has been asked to read additional + ** numeric segments. For example, in the HOST "192.168.1.3", "192.168" is + ** a complete number, but this function will recurse to read the "1.3", + ** generating a single HOST token "192.168.1.3". */ + t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word + StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText + TokenTypes tokenType; + bool decExhausted; + if (previousNumber != NULL) { + str.prepend(previousNumber); + tokenType = CL_NS2(analysis,standard)::HOST; + decExhausted = false; + } else { + tokenType = CL_NS2(analysis,standard)::NUM; + decExhausted = (prev == '.'); + } + if ( str.len >= LUCENE_MAX_WORD_LEN ){ + //if a number is too long, i would say there is no point + //storing it, because its going to be the wrong number anyway? + //what do people think? + return false; + } + str.appendChar(prev); + + const bool signExhausted = (prev == '-'); + int ch = prev; + + CONSUME_DIGITS; + + if (str.len < 2 /* CONSUME_DIGITS didn't find any digits. */ + && ( + (signExhausted && !DECIMAL) + || (decExhausted /* && !DIGIT is implied, since CONSUME_DIGITS stopped on a non-digit. */) + ) + ) + { + /* We have either: + ** a) a negative sign that's not followed by either digit(s) or a decimal + ** b) a decimal that's not followed by digit(s) + ** so this is not a valid number. */ + if (!EOS) { + /* Unread the character that stopped CONSUME_DIGITS: */ + unReadChar(); + } + return false; + } + + /* We just read a group of digits. Is it followed by a decimal symbol, + ** implying that there might be another group of digits available? */ + if (!EOS) { + if (DECIMAL) { + if ( str.len >= LUCENE_MAX_WORD_LEN ) + return false; //read above for rationale + str.appendChar(ch); + } else { + unReadChar(); + goto SUCCESSFULLY_EXTRACTED_NUMBER; + } + + CONSUME_DIGITS; + if (!DIGIT && !DECIMAL) { + unReadChar(); + } else if (!EOS && DECIMAL && _istdigit(rd->Peek())) { + /* We just read the fractional digit group, but it's also followed by + ** a decimal symbol and at least one more digit, so this must be a + ** HOST rather than a real number. */ + return ReadNumber(str.getBuffer(), '.',t); + } + } + + SUCCESSFULLY_EXTRACTED_NUMBER: + TCHAR rightmost = RIGHTMOST(str); + /* Don't including a trailing decimal point. */ + if (rightmost == '.') { + SHAVE_RIGHTMOST(str); + unReadChar(); + rightmost = RIGHTMOST(str); + } + /* If all we have left is a negative sign, it's not a valid number. */ + if (rightmost == '-') { + CND_PRECONDITION (str.len == 1, "Number is invalid"); + return false; + } + + return setToken(t,&str,tokenType); + } + + bool StandardTokenizer::ReadAlphaNum(const TCHAR prev, Token* t) { + t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word + StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText + if ( str.len < LUCENE_MAX_WORD_LEN ){ + str.appendChar(prev); + int ch = prev; + + CONSUME_WORD; + if (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) { //still have space for 1 more character? + switch(ch) { /* What follows the first alphanum segment? */ + case '.': + str.appendChar('.'); + return ReadDotted(&str, CL_NS2(analysis,standard)::UNKNOWN,t); + case '\'': + str.appendChar('\''); + return ReadApostrophe(&str,t); + case '@': + str.appendChar('@'); + return ReadAt(&str,t); + case '&': + str.appendChar('&'); + return ReadCompany(&str,t); + /* default: fall through to end of this function. */ + } + } + } + return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM); + } + + bool StandardTokenizer::ReadCJK(const TCHAR prev, Token* t) { + t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word + StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText + if ( str.len < LUCENE_MAX_WORD_LEN ){ + str.appendChar(prev); + int ch = prev; + + CONSUME_CJK; + } + return setToken(t,&str,CL_NS2(analysis,standard)::CJK); + } + + + bool StandardTokenizer::ReadDotted(StringBuffer* _str, TokenTypes forcedType, Token* t) { + const int32_t specialCharPos = rdPos; + StringBuffer& str=*_str; + + /* A segment of a "dotted" is not allowed to begin with another dot or a dash. + ** Even though hosts, e-mail addresses, etc., could have a dotted-segment + ** that begins with a dot or a dash, it's far more common in source text + ** for a pattern like "abc.--def" to be intended as two tokens. */ + int ch = rd->Peek(); + if (!(DOT || DASH)) { + bool prevWasDot; + bool prevWasDash; + if (str.len == 0) { + prevWasDot = false; + prevWasDash = false; + } else { + prevWasDot = RIGHTMOST(str) == '.'; + prevWasDash = RIGHTMOST(str) == '-'; + } + while (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) { + ch = readChar(); + const bool dot = ch == '.'; + const bool dash = ch == '-'; + + if (!(ALNUM || UNDERSCORE || dot || dash)) { + break; + } + /* Multiple dots or dashes in succession end the token. + ** Consider the following inputs: + ** "Visit windowsupdate.microsoft.com--update today!" + ** "In the U.S.A.--yes, even there!" */ + if ((dot || dash) && (prevWasDot || prevWasDash)) { + /* We're not going to append the character we just read, in any case. + ** As to the character before it (which is currently RIGHTMOST(str)): + ** Unless RIGHTMOST(str) is a dot, in which we need to save it so the + ** acronym-versus-host detection can work, we want to get rid of it. */ + if (!prevWasDot) { + SHAVE_RIGHTMOST(str); + } + break; + } + + str.appendChar(ch); + + prevWasDot = dot; + prevWasDash = dash; + } + } + + /* There's a potential StringBuffer.append call in the code above, which + ** could cause str to reallocate its internal buffer. We must wait to + ** obtain the optimization-oriented strBuf pointer until after the initial + ** potentially realloc-triggering operations on str. + ** Because there can be other such ops much later in this function, strBuf + ** is guarded within a block to prevent its use during or after the calls + ** that would potentially invalidate it. */ + { /* Begin block-guard of strBuf */ + TCHAR* strBuf = str.getBuffer(); + + bool rightmostIsDot = RIGHTMOST_IS(str, '.'); + if (CONSUMED_NOTHING_OF_VALUE) { + /* No more alphanums available for this token; shave trailing dot, if any. */ + if (rightmostIsDot) { + SHAVE_RIGHTMOST(str); + } + /* If there are no dots remaining, this is a generic ALPHANUM. */ + if (_tcschr(strBuf, '.') == NULL) { + forcedType = CL_NS2(analysis,standard)::ALPHANUM; + } + + /* Check the token to see if it's an acronym. An acronym must have a + ** letter in every even slot and a dot in every odd slot, including the + ** last slot (for example, "U.S.A."). */ + } else if (rightmostIsDot) { + bool isAcronym = true; + const int32_t upperCheckLimit = str.len - 1; /* -1 b/c we already checked the last slot. */ + + for (int32_t i = 0; i < upperCheckLimit; i++) { + const bool even = (i % 2 == 0); + ch = strBuf[i]; + if ( (even && !ALPHA) || (!even && !DOT) ) { + isAcronym = false; + break; + } + } + if (isAcronym) { + forcedType = CL_NS2(analysis,standard)::ACRONYM; + } else { + /* If it's not an acronym, we don't want the trailing dot. */ + SHAVE_RIGHTMOST(str); + /* If there are no dots remaining, this is a generic ALPHANUM. */ + if (_tcschr(strBuf, '.') == NULL) { + forcedType = CL_NS2(analysis,standard)::ALPHANUM; + } + } + } + } /* End block-guard of strBuf */ + + if (!EOS) { + if (ch == '@' && str.len < LUCENE_MAX_WORD_LEN-1) { + str.appendChar('@'); + return ReadAt(&str,t); + } else { + unReadChar(); + } + } + + return setToken(t,&str,CL_NS2(analysis,standard)::UNKNOWN + ? forcedType : CL_NS2(analysis,standard)::HOST); + } + + bool StandardTokenizer::ReadApostrophe(StringBuffer* _str, Token* t) { + StringBuffer& str=*_str; + + TokenTypes tokenType = CL_NS2(analysis,standard)::APOSTROPHE; + const int32_t specialCharPos = rdPos; + int ch=0; + + CONSUME_ALPHAS; + if (RIGHTMOST_IS(str, '\'') || CONSUMED_NOTHING_OF_VALUE) { + /* After the apostrophe, no more alphanums were available within this + ** token; shave trailing apostrophe and revert to generic ALPHANUM. */ + SHAVE_RIGHTMOST(str); + tokenType = CL_NS2(analysis,standard)::ALPHANUM; + } + if (!EOS) { + unReadChar(); + } + + return setToken(t,&str,tokenType); + } + + bool StandardTokenizer::ReadAt(StringBuffer* str, Token* t) { + ReadDotted(str, CL_NS2(analysis,standard)::EMAIL,t); + /* JLucene grammar indicates dots/digits not allowed in company name: */ + if (!CONTAINS_ANY((*str), ".0123456789")) { + setToken(t,str,CL_NS2(analysis,standard)::COMPANY); + } + return true; + } + + bool StandardTokenizer::ReadCompany(StringBuffer* _str, Token* t) { + StringBuffer& str = *_str; + const int32_t specialCharPos = rdPos; + int ch=0; + + CONSUME_WORD; + if (CONSUMED_NOTHING_OF_VALUE) { + /* After the ampersand, no more alphanums were available within this + ** token; shave trailing ampersand and revert to ALPHANUM. */ + CND_PRECONDITION(RIGHTMOST_IS(str, '&'),"ReadCompany failed"); + SHAVE_RIGHTMOST(str); + + + return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM); + } + if (!EOS) { + unReadChar(); + } + + return setToken(t,&str,CL_NS2(analysis,standard)::COMPANY); + } + +CL_NS_END2 diff --git a/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.h b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.h new file mode 100644 index 000000000..d4195be81 --- /dev/null +++ b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.h @@ -0,0 +1,88 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_analysis_standard_StandardTokenizer +#define _lucene_analysis_standard_StandardTokenizer + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "../AnalysisHeader.h" +#include "../Analyzers.h" +#include "StandardTokenizerConstants.h" +#include "CLucene/util/StringBuffer.h" +#include "CLucene/util/FastCharStream.h" +#include "CLucene/util/Reader.h" + + +CL_NS_DEF2(analysis,standard) + +/** A grammar-based tokenizer constructed with JavaCC. + * + * <p> This should be a good tokenizer for most European-language documents: + * + * <ul> + * <li>Splits words at punctuation characters, removing punctuation. However, a + * dot that's not followed by whitespace is considered part of a token. + * <li>Splits words at hyphens, unless there's a number in the token, in which case + * the whole token is interpreted as a product number and is not split. + * <li>Recognizes email addresses and internet hostnames as one token. + * </ul> + * + * <p>Many applications have specific tokenizer needs. If this tokenizer does + * not suit your application, please consider copying this source code + * directory to your project and maintaining your own grammar-based tokenizer. + */ + class StandardTokenizer: public Tokenizer { + private: + int32_t rdPos; + int32_t tokenStart; + + // Advance by one character, incrementing rdPos and returning the character. + int readChar(); + // Retreat by one character, decrementing rdPos. + void unReadChar(); + + // createToken centralizes token creation for auditing purposes. + //Token* createToken(CL_NS(util)::StringBuffer* sb, TokenTypes tokenCode); + inline bool setToken(Token* t, CL_NS(util)::StringBuffer* sb, TokenTypes tokenCode); + + bool ReadDotted(CL_NS(util)::StringBuffer* str, TokenTypes forcedType,Token* t); + + public: + CL_NS(util)::FastCharStream* rd; + + // Constructs a tokenizer for this Reader. + StandardTokenizer(CL_NS(util)::Reader* reader); + + ~StandardTokenizer(); + + /** Returns the next token in the stream, or false at end-of-stream. + * The returned token's type is set to an element of + * StandardTokenizerConstants::tokenImage. */ + bool next(Token* token); + + // Reads for number like "1"/"1234.567", or IP address like "192.168.1.2". + bool ReadNumber(const TCHAR* previousNumber, const TCHAR prev, Token* t); + + bool ReadAlphaNum(const TCHAR prev, Token* t); + + // Reads for apostrophe-containing word. + bool ReadApostrophe(CL_NS(util)::StringBuffer* str, Token* t); + + // Reads for something@... it may be a COMPANY name or a EMAIL address + bool ReadAt(CL_NS(util)::StringBuffer* str, Token* t); + + // Reads for COMPANY name like AT&T. + bool ReadCompany(CL_NS(util)::StringBuffer* str, Token* t); + + // Reads CJK characters + bool ReadCJK(const TCHAR prev, Token* t); + }; + +CL_NS_END2 +#endif diff --git a/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizerConstants.h b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizerConstants.h new file mode 100644 index 000000000..3c95af45a --- /dev/null +++ b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizerConstants.h @@ -0,0 +1,30 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_analysis_standard_StandardTokenizerConstants +#define _lucene_analysis_standard_StandardTokenizerConstants + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +CL_NS_DEF2(analysis,standard) + enum TokenTypes { + _EOF, + UNKNOWN, + ALPHANUM, + APOSTROPHE, + ACRONYM, + COMPANY, + EMAIL, + HOST, + NUM, + CJK + }; + extern const TCHAR** tokenImage; + + CL_NS_END2 +#endif |