7 files changed, 752 insertions, 0 deletions
diff --git a/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardAnalyzer.cpp b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardAnalyzer.cpp
new file mode 100644
index 000000000..e0994c41a
--- /dev/null
+++ b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardAnalyzer.cpp
@@ -0,0 +1,46 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+* 
+* Distributable under the terms of either the Apache License (Version 2.0) or 
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/StdHeader.h"
+#include "StandardAnalyzer.h"
+
+#include "CLucene/util/VoidMap.h"
+#include "CLucene/util/Reader.h"
+#include "CLucene/analysis/AnalysisHeader.h"
+#include "CLucene/analysis/Analyzers.h"
+#include "StandardFilter.h"
+#include "StandardTokenizer.h"
+
+CL_NS_USE(util)
+CL_NS_USE(analysis)
+
+CL_NS_DEF2(analysis,standard)
+
+	StandardAnalyzer::StandardAnalyzer():
+		stopSet(false)
+	{
+      StopFilter::fillStopTable( &stopSet,CL_NS(analysis)::StopAnalyzer::ENGLISH_STOP_WORDS);
+	}
+
+	StandardAnalyzer::StandardAnalyzer( const TCHAR** stopWords):
+		stopSet(false)
+	{
+		StopFilter::fillStopTable( &stopSet,stopWords );
+	}
+
+	StandardAnalyzer::~StandardAnalyzer(){
+	}
+
+
+	TokenStream* StandardAnalyzer::tokenStream(const TCHAR* fieldName, Reader* reader) 
+	{
+		TokenStream* ret = _CLNEW StandardTokenizer(reader);
+		ret = _CLNEW StandardFilter(ret,true);
+		ret = _CLNEW LowerCaseFilter(ret,true);
+		ret = _CLNEW StopFilter(ret,true, &stopSet);
+		return ret;
+	}
+CL_NS_END2
diff --git a/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardAnalyzer.h b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardAnalyzer.h
new file mode 100644
index 000000000..9cce041df
--- /dev/null
+++ b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardAnalyzer.h
@@ -0,0 +1,47 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+* 
+* Distributable under the terms of either the Apache License (Version 2.0) or 
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_analysis_standard_StandardAnalyzer
+#define _lucene_analysis_standard_StandardAnalyzer
+
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+
+#include "CLucene/util/VoidMap.h"
+#include "CLucene/util/Reader.h"
+#include "CLucene/analysis/AnalysisHeader.h"
+#include "CLucene/analysis/Analyzers.h"
+#include "StandardFilter.h"
+#include "StandardTokenizer.h"
+
+
+CL_NS_DEF2(analysis,standard)
+
+	/** Represents a standard analyzer. */
+	class StandardAnalyzer : public Analyzer 
+	{
+	private:
+		CL_NS(util)::CLSetList<const TCHAR*> stopSet;
+	public:
+		/** Builds an analyzer.*/
+		StandardAnalyzer();
+
+		/** Builds an analyzer with the given stop words. */
+		StandardAnalyzer( const TCHAR** stopWords);
+
+		~StandardAnalyzer();
+
+
+		/**
+		* Constructs a StandardTokenizer filtered by a 
+		* StandardFilter, a LowerCaseFilter and a StopFilter.
+		*/
+		TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) 
+		;
+	};
+CL_NS_END2
+#endif
diff --git a/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardFilter.cpp b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardFilter.cpp
new file mode 100644
index 000000000..9869d2592
--- /dev/null
+++ b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardFilter.cpp
@@ -0,0 +1,58 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+* 
+* Distributable under the terms of either the Apache License (Version 2.0) or 
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/StdHeader.h"
+#include "StandardFilter.h"
+
+#include "../AnalysisHeader.h"
+#include "../Analyzers.h"
+#include "StandardTokenizerConstants.h"
+#include "CLucene/util/StringBuffer.h"
+
+CL_NS_USE(analysis)
+CL_NS_USE(util)
+CL_NS_DEF2(analysis,standard)
+
+  StandardFilter::StandardFilter(TokenStream* in, bool deleteTokenStream):
+    TokenFilter(in, deleteTokenStream)
+  {
+  }
+
+  StandardFilter::~StandardFilter(){
+  }
+
+  bool StandardFilter::next(Token* t) {
+    if (!input->next(t))
+      return false;
+
+    TCHAR* text = t->_termText;
+    const int32_t textLength = t->termTextLength();
+    const TCHAR* type = t->type();
+
+    if ( type == tokenImage[APOSTROPHE] && //we can compare the type directy since the type should always come from the tokenImage
+		( textLength >= 2 && _tcsicmp(text+textLength-2, _T("'s"))==0  ) )
+    {
+      // remove 's
+      text[textLength-2]=0; 
+	  t->resetTermTextLen();
+
+      return true;
+
+    } else if ( type == tokenImage[ACRONYM] ) {		  // remove dots
+		int32_t j = 0;
+		for ( int32_t i=0;i<textLength;i++ ){
+			if ( text[i] != '.' )
+				text[j++]=text[i];
+		}
+		text[j]=0;
+      return true;
+
+    } else {
+      return true;
+    }
+  }
+
+CL_NS_END2
diff --git a/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardFilter.h b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardFilter.h
new file mode 100644
index 000000000..59657fdb3
--- /dev/null
+++ b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardFilter.h
@@ -0,0 +1,37 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+* 
+* Distributable under the terms of either the Apache License (Version 2.0) or 
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_analysis_standard_StandardFilter
+#define _lucene_analysis_standard_StandardFilter
+
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+
+#include "../AnalysisHeader.h"
+#include "../Analyzers.h"
+#include "StandardTokenizerConstants.h"
+#include "CLucene/util/StringBuffer.h"
+
+CL_NS_DEF2(analysis,standard)
+
+	/** Normalizes tokens extracted with {@link StandardTokenizer}. */
+	class StandardFilter: public TokenFilter{
+	public:
+		// Construct filtering <i>in</i>. 
+		StandardFilter(TokenStream* in, bool deleteTokenStream);
+
+		~StandardFilter();
+
+		
+	  /** Returns the next token in the stream, or NULL at EOS.
+	  * <p>Removes <tt>'s</tt> from the end of words.
+	  * <p>Removes dots from acronyms.
+	  */
+		bool next(Token* token);
+	};
+CL_NS_END2
+#endif
diff --git a/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.cpp b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.cpp
new file mode 100644
index 000000000..60f9a449c
--- /dev/null
+++ b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.cpp
@@ -0,0 +1,446 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+* 
+* Distributable under the terms of either the Apache License (Version 2.0) or 
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/StdHeader.h"
+#include "StandardTokenizer.h"
+
+CL_NS_USE(analysis)
+CL_NS_USE(util)
+CL_NS_DEF2(analysis,standard)
+
+  const static TCHAR* tokenImageArray[] = {
+    _T("<EOF>"),
+    _T("<UNKNOWN>"),
+    _T("<ALPHANUM>"),
+    _T("<APOSTROPHE>"),
+    _T("<ACRONYM>"),
+    _T("<COMPANY>"),
+    _T("<EMAIL>"),
+    _T("<HOST>"),
+    _T("<NUM>"),
+    _T("<CJK>")
+  };
+  const TCHAR** tokenImage = tokenImageArray;
+
+  /* A bunch of shortcut macros, many of which make assumptions about variable
+  ** names.  These macros enhance readability, not just convenience! */
+  #define EOS           (ch==-1 || rd->Eos())
+  #define SPACE         (_istspace((TCHAR)ch) != 0)
+  #define ALPHA         (_istalpha((TCHAR)ch) != 0)
+  #define ALNUM         (_istalnum(ch) != 0)
+  #define DIGIT         (_istdigit(ch) != 0)
+  #define UNDERSCORE    (ch == '_')
+  
+  #define _CJK			(  (ch>=0x3040 && ch<=0x318f) || \
+  						   (ch>=0x3300 && ch<=0x337f) || \
+  						   (ch>=0x3400 && ch<=0x3d2d) || \
+  						   (ch>=0x4e00 && ch<=0x9fff) || \
+  						   (ch>=0xf900 && ch<=0xfaff) || \
+  						   (ch>=0xac00 && ch<=0xd7af) ) //korean
+
+  
+  #define DASH          (ch == '-')
+  #define NEGATIVE_SIGN_ DASH
+  //#define POSITIVE_SIGN_ (ch == '+')
+  //#define SIGN          (NEGATIVE_SIGN_ || POSITIVE_SIGN_)
+
+  #define DOT             (ch == '.')
+  #define DECIMAL         DOT
+
+
+  //freebsd seems to have a problem with defines over multiple lines, so this has to be one long line
+  #define _CONSUME_AS_LONG_AS(conditionFails) while (true) { ch = readChar(); if (ch==-1 || (!(conditionFails) || str.len >= LUCENE_MAX_WORD_LEN)) { break; } str.appendChar(ch);}
+
+  #define CONSUME_ALPHAS _CONSUME_AS_LONG_AS(ALPHA)
+
+  #define CONSUME_DIGITS _CONSUME_AS_LONG_AS(DIGIT)
+
+  /* otherMatches is a condition (possibly compound) under which a character
+  ** that's not an ALNUM or UNDERSCORE can be considered not to break the
+  ** span.  Callers should pass false if only ALNUM/UNDERSCORE are acceptable. */
+  #define CONSUME_WORD                  _CONSUME_AS_LONG_AS(ALNUM || UNDERSCORE)
+  
+  /*
+  ** Consume CJK characters
+  */
+  #define CONSUME_CJK                   _CONSUME_AS_LONG_AS(_CJK)
+
+
+  /* It is considered that "nothing of value" has been read if:
+  ** a) The "read head" hasn't moved since specialCharPos was established.
+  ** or
+  ** b) The "read head" has moved by one character, but that character was
+  **    either whitespace or not among the characters found in the body of
+  **    a token (deliberately doesn't include the likes of '@'/'&'). */
+  #define CONSUMED_NOTHING_OF_VALUE (rdPos == specialCharPos || (rdPos == specialCharPos+1 && ( SPACE || !(ALNUM || DOT || DASH || UNDERSCORE) )))
+
+  #define RIGHTMOST(sb) (sb.getBuffer()[sb.len-1])
+  #define RIGHTMOST_IS(sb, c) (RIGHTMOST(sb) == c)
+  /* To discard the last character in a StringBuffer, we decrement the buffer's
+  ** length indicator and move the terminator back by one character. */
+  #define SHAVE_RIGHTMOST(sb) (sb.getBuffer()[--sb.len] = '\0')
+
+  //#define REMOVE_TRAILING_CHARS(sb, charMatchesCondition) { TCHAR* sbBuf = sb.getBuffer(); for (int32_t i = sb.len-1; i >= 0; i--) { TCHAR c = sbBuf[i]; if (charMatchesCondition) { sbBuf[--sb.len] = '\0'; } else {break;}}}
+
+  /* Does StringBuffer sb contain any of the characters in string ofThese? */
+  #define CONTAINS_ANY(sb, ofThese) (_tcscspn(sb.getBuffer(), _T(ofThese)) != static_cast<size_t>(sb.len))
+
+
+  StandardTokenizer::StandardTokenizer(Reader* reader):
+    rd(_CLNEW FastCharStream(reader)),
+    /* rdPos is zero-based.  It starts at -1, and will advance to the first
+    ** position when readChar() is first called. */
+    rdPos(-1),
+    tokenStart(-1)
+  {
+  }
+
+  StandardTokenizer::~StandardTokenizer() {
+    _CLDELETE(rd);
+  }
+
+  int StandardTokenizer::readChar() {
+    /* Increment by 1 because we're speaking in terms of characters, not
+    ** necessarily bytes: */
+    rdPos++;
+    return rd->GetNext();
+  }
+
+  void StandardTokenizer::unReadChar() {
+    rd->UnGet();
+    rdPos--;
+  }
+
+  inline bool StandardTokenizer::setToken(Token* t, StringBuffer* sb, TokenTypes tokenCode) {
+    t->setStartOffset(tokenStart);
+	t->setEndOffset(tokenStart+sb->length());
+	t->setType(tokenImage[tokenCode]);
+	sb->getBuffer(); //null terminates the buffer
+	t->resetTermTextLen();
+	return true;
+  }
+
+  bool StandardTokenizer::next(Token* t) {
+    int ch=0;
+	while (!EOS) {
+      ch = readChar();
+
+	  if ( ch == 0 || ch == -1 ){
+		continue;
+	  } else if (SPACE) {
+        continue;
+      } else if (ALPHA || UNDERSCORE) {
+        tokenStart = rdPos;
+        return ReadAlphaNum(ch,t);
+      } else if (DIGIT || NEGATIVE_SIGN_ || DECIMAL) {
+        tokenStart = rdPos;
+        /* ReadNumber returns NULL if it fails to extract a valid number; in
+        ** that case, we just continue. */
+        if (ReadNumber(NULL, ch,t))
+          return true;
+	  } else if ( _CJK ){
+      	if ( ReadCJK(ch,t) )
+      		return true;
+      }
+    }
+    return false;
+  }
+
+  bool StandardTokenizer::ReadNumber(const TCHAR* previousNumber, const TCHAR prev,Token* t) {
+    /* previousNumber is only non-NULL if this function already read a complete
+    ** number in a previous recursion, yet has been asked to read additional
+    ** numeric segments.  For example, in the HOST "192.168.1.3", "192.168" is
+    ** a complete number, but this function will recurse to read the "1.3",
+    ** generating a single HOST token "192.168.1.3". */
+    t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
+    StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
+    TokenTypes tokenType;
+    bool decExhausted;
+    if (previousNumber != NULL) {
+      str.prepend(previousNumber);
+      tokenType = CL_NS2(analysis,standard)::HOST;
+      decExhausted = false;
+    } else {
+      tokenType = CL_NS2(analysis,standard)::NUM;
+      decExhausted = (prev == '.');
+    }
+	if (  str.len >= LUCENE_MAX_WORD_LEN ){
+		//if a number is too long, i would say there is no point
+		//storing it, because its going to be the wrong number anyway?
+		//what do people think?
+		return false; 
+	}
+    str.appendChar(prev);
+
+    const bool signExhausted = (prev == '-');
+    int ch = prev;
+
+    CONSUME_DIGITS;
+
+    if (str.len < 2 /* CONSUME_DIGITS didn't find any digits. */
+        && (
+                (signExhausted && !DECIMAL)
+             || (decExhausted /* && !DIGIT is implied, since CONSUME_DIGITS stopped on a non-digit. */)
+           )
+       )
+    {
+      /* We have either:
+      **   a) a negative sign that's not followed by either digit(s) or a decimal
+      **   b) a decimal that's not followed by digit(s)
+      ** so this is not a valid number. */
+      if (!EOS) {
+        /* Unread the character that stopped CONSUME_DIGITS: */
+        unReadChar();
+      }
+      return false;
+    }
+
+    /* We just read a group of digits.  Is it followed by a decimal symbol,
+    ** implying that there might be another group of digits available? */
+    if (!EOS) {
+      if (DECIMAL) {
+		if (  str.len >= LUCENE_MAX_WORD_LEN )
+			return false; //read above for rationale
+        str.appendChar(ch);
+      } else {
+        unReadChar();
+        goto SUCCESSFULLY_EXTRACTED_NUMBER;
+      }
+
+      CONSUME_DIGITS;
+      if (!DIGIT && !DECIMAL) {
+        unReadChar();
+      } else if (!EOS && DECIMAL && _istdigit(rd->Peek())) {
+        /* We just read the fractional digit group, but it's also followed by
+        ** a decimal symbol and at least one more digit, so this must be a
+        ** HOST rather than a real number. */
+        return ReadNumber(str.getBuffer(), '.',t);
+      }
+    }
+
+    SUCCESSFULLY_EXTRACTED_NUMBER:
+    TCHAR rightmost = RIGHTMOST(str);
+    /* Don't including a trailing decimal point. */
+    if (rightmost == '.') {
+      SHAVE_RIGHTMOST(str);
+      unReadChar();
+      rightmost = RIGHTMOST(str);
+    }
+    /* If all we have left is a negative sign, it's not a valid number. */
+    if (rightmost == '-') {
+      CND_PRECONDITION (str.len == 1, "Number is invalid");
+      return false;
+    }
+
+	return setToken(t,&str,tokenType);
+  }
+
+  bool StandardTokenizer::ReadAlphaNum(const TCHAR prev, Token* t) {
+    t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
+    StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
+	if (  str.len < LUCENE_MAX_WORD_LEN ){
+		str.appendChar(prev);
+		int ch = prev;
+
+		CONSUME_WORD;
+		if (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) { //still have space for 1 more character?
+			switch(ch) { /* What follows the first alphanum segment? */
+				case '.':
+					str.appendChar('.');
+					return ReadDotted(&str, CL_NS2(analysis,standard)::UNKNOWN,t);
+				case '\'':
+					str.appendChar('\'');
+					return ReadApostrophe(&str,t);
+				case '@':
+					str.appendChar('@');
+					return ReadAt(&str,t);
+				case '&':
+					str.appendChar('&');
+					return ReadCompany(&str,t);
+				/* default: fall through to end of this function. */
+			}
+		}
+	}
+	return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
+  }
+  
+  bool StandardTokenizer::ReadCJK(const TCHAR prev, Token* t) {
+    t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
+    StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
+	if ( str.len < LUCENE_MAX_WORD_LEN ){
+		str.appendChar(prev);
+		int ch = prev;
+
+		CONSUME_CJK;
+	}
+	return setToken(t,&str,CL_NS2(analysis,standard)::CJK);
+  }
+  
+
+  bool StandardTokenizer::ReadDotted(StringBuffer* _str, TokenTypes forcedType, Token* t) {
+    const int32_t specialCharPos = rdPos;
+	StringBuffer& str=*_str;
+	
+    /* A segment of a "dotted" is not allowed to begin with another dot or a dash.
+    ** Even though hosts, e-mail addresses, etc., could have a dotted-segment
+    ** that begins with a dot or a dash, it's far more common in source text
+    ** for a pattern like "abc.--def" to be intended as two tokens. */
+    int ch = rd->Peek();
+    if (!(DOT || DASH)) {
+      bool prevWasDot;
+      bool prevWasDash;
+      if (str.len == 0) {
+        prevWasDot = false;
+        prevWasDash = false;
+      } else {
+        prevWasDot = RIGHTMOST(str) == '.';
+        prevWasDash = RIGHTMOST(str) == '-';
+      }
+      while (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) {
+        ch = readChar();
+        const bool dot = ch == '.';
+        const bool dash = ch == '-';
+
+        if (!(ALNUM || UNDERSCORE || dot || dash)) {
+          break;
+        }
+        /* Multiple dots or dashes in succession end the token.
+        ** Consider the following inputs:
+        **   "Visit windowsupdate.microsoft.com--update today!"
+        **   "In the U.S.A.--yes, even there!"                 */
+        if ((dot || dash) && (prevWasDot || prevWasDash)) {
+          /* We're not going to append the character we just read, in any case.
+          ** As to the character before it (which is currently RIGHTMOST(str)):
+          ** Unless RIGHTMOST(str) is a dot, in which we need to save it so the
+          ** acronym-versus-host detection can work, we want to get rid of it. */
+          if (!prevWasDot) {
+            SHAVE_RIGHTMOST(str);
+          }
+          break;
+        }
+
+        str.appendChar(ch);
+
+        prevWasDot = dot;
+        prevWasDash = dash;
+      }
+    }
+
+    /* There's a potential StringBuffer.append call in the code above, which
+    ** could cause str to reallocate its internal buffer.  We must wait to
+    ** obtain the optimization-oriented strBuf pointer until after the initial
+    ** potentially realloc-triggering operations on str.
+    ** Because there can be other such ops much later in this function, strBuf
+    ** is guarded within a block to prevent its use during or after the calls
+    ** that would potentially invalidate it. */
+    { /* Begin block-guard of strBuf */
+    TCHAR* strBuf = str.getBuffer();
+
+    bool rightmostIsDot = RIGHTMOST_IS(str, '.');
+    if (CONSUMED_NOTHING_OF_VALUE) {
+      /* No more alphanums available for this token; shave trailing dot, if any. */
+      if (rightmostIsDot) {
+        SHAVE_RIGHTMOST(str);
+      }
+      /* If there are no dots remaining, this is a generic ALPHANUM. */
+      if (_tcschr(strBuf, '.') == NULL) {
+        forcedType = CL_NS2(analysis,standard)::ALPHANUM;
+      }
+
+    /* Check the token to see if it's an acronym.  An acronym must have a
+    ** letter in every even slot and a dot in every odd slot, including the
+    ** last slot (for example, "U.S.A."). */
+    } else if (rightmostIsDot) {
+      bool isAcronym = true;
+      const int32_t upperCheckLimit = str.len - 1; /* -1 b/c we already checked the last slot. */
+
+      for (int32_t i = 0; i < upperCheckLimit; i++) {
+        const bool even = (i % 2 == 0);
+        ch = strBuf[i];
+        if ( (even && !ALPHA) || (!even && !DOT) ) {
+          isAcronym = false;
+          break;
+        }
+      }
+      if (isAcronym) {
+        forcedType = CL_NS2(analysis,standard)::ACRONYM;
+      } else {
+        /* If it's not an acronym, we don't want the trailing dot. */
+        SHAVE_RIGHTMOST(str);
+        /* If there are no dots remaining, this is a generic ALPHANUM. */
+        if (_tcschr(strBuf, '.') == NULL) {
+          forcedType = CL_NS2(analysis,standard)::ALPHANUM;
+        }
+      }
+    }
+    } /* End block-guard of strBuf */
+
+    if (!EOS) {
+      if (ch == '@' && str.len < LUCENE_MAX_WORD_LEN-1) {
+        str.appendChar('@');
+        return ReadAt(&str,t);
+      } else {
+        unReadChar();
+      }
+    }
+
+	return setToken(t,&str,CL_NS2(analysis,standard)::UNKNOWN
+			? forcedType : CL_NS2(analysis,standard)::HOST);
+  }
+
+  bool StandardTokenizer::ReadApostrophe(StringBuffer* _str, Token* t) {
+    StringBuffer& str=*_str;
+
+    TokenTypes tokenType = CL_NS2(analysis,standard)::APOSTROPHE;
+    const int32_t specialCharPos = rdPos;
+    int ch=0;
+
+    CONSUME_ALPHAS;
+    if (RIGHTMOST_IS(str, '\'') || CONSUMED_NOTHING_OF_VALUE) {
+      /* After the apostrophe, no more alphanums were available within this
+      ** token; shave trailing apostrophe and revert to generic ALPHANUM. */
+      SHAVE_RIGHTMOST(str);
+      tokenType = CL_NS2(analysis,standard)::ALPHANUM;
+    }
+    if (!EOS) {
+      unReadChar();
+    }
+
+	return setToken(t,&str,tokenType);
+  }
+
+  bool StandardTokenizer::ReadAt(StringBuffer* str, Token* t) {
+    ReadDotted(str, CL_NS2(analysis,standard)::EMAIL,t);
+    /* JLucene grammar indicates dots/digits not allowed in company name: */
+    if (!CONTAINS_ANY((*str), ".0123456789")) {
+		setToken(t,str,CL_NS2(analysis,standard)::COMPANY);
+    }
+    return true;
+  }
+
+  bool StandardTokenizer::ReadCompany(StringBuffer* _str, Token* t) {
+    StringBuffer& str = *_str;
+    const int32_t specialCharPos = rdPos;
+    int ch=0;
+
+    CONSUME_WORD;
+    if (CONSUMED_NOTHING_OF_VALUE) {
+      /* After the ampersand, no more alphanums were available within this
+      ** token; shave trailing ampersand and revert to ALPHANUM. */
+      CND_PRECONDITION(RIGHTMOST_IS(str, '&'),"ReadCompany failed");
+      SHAVE_RIGHTMOST(str);
+
+
+	  return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
+    }
+    if (!EOS) {
+      unReadChar();
+    }
+
+	return setToken(t,&str,CL_NS2(analysis,standard)::COMPANY);
+  }
+
+CL_NS_END2
diff --git a/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.h b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.h
new file mode 100644
index 000000000..d4195be81
--- /dev/null
+++ b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.h
@@ -0,0 +1,88 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+* 
+* Distributable under the terms of either the Apache License (Version 2.0) or 
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_analysis_standard_StandardTokenizer
+#define _lucene_analysis_standard_StandardTokenizer
+
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+
+#include "../AnalysisHeader.h"
+#include "../Analyzers.h"
+#include "StandardTokenizerConstants.h"
+#include "CLucene/util/StringBuffer.h"
+#include "CLucene/util/FastCharStream.h"
+#include "CLucene/util/Reader.h"
+
+
+CL_NS_DEF2(analysis,standard)
+
+/** A grammar-based tokenizer constructed with JavaCC.
+ *
+ * <p> This should be a good tokenizer for most European-language documents:
+ *
+ * <ul>
+ *   <li>Splits words at punctuation characters, removing punctuation. However, a 
+ *     dot that's not followed by whitespace is considered part of a token.
+ *   <li>Splits words at hyphens, unless there's a number in the token, in which case
+ *     the whole token is interpreted as a product number and is not split.
+ *   <li>Recognizes email addresses and internet hostnames as one token.
+ * </ul>
+ *
+ * <p>Many applications have specific tokenizer needs.  If this tokenizer does
+ * not suit your application, please consider copying this source code
+ * directory to your project and maintaining your own grammar-based tokenizer.
+ */
+  class StandardTokenizer: public Tokenizer {
+  private:
+    int32_t rdPos;
+    int32_t tokenStart;
+
+    // Advance by one character, incrementing rdPos and returning the character.
+    int readChar();
+    // Retreat by one character, decrementing rdPos.
+    void unReadChar();
+
+    // createToken centralizes token creation for auditing purposes.
+	//Token* createToken(CL_NS(util)::StringBuffer* sb, TokenTypes tokenCode);
+    inline bool setToken(Token* t, CL_NS(util)::StringBuffer* sb, TokenTypes tokenCode);
+
+    bool ReadDotted(CL_NS(util)::StringBuffer* str, TokenTypes forcedType,Token* t);
+
+  public:
+	CL_NS(util)::FastCharStream* rd;
+
+    // Constructs a tokenizer for this Reader.
+    StandardTokenizer(CL_NS(util)::Reader* reader);
+
+    ~StandardTokenizer();
+
+    /** Returns the next token in the stream, or false at end-of-stream.
+    * The returned token's type is set to an element of
+    * StandardTokenizerConstants::tokenImage. */
+    bool next(Token* token);
+
+    // Reads for number like "1"/"1234.567", or IP address like "192.168.1.2".
+    bool ReadNumber(const TCHAR* previousNumber, const TCHAR prev, Token* t);
+
+    bool ReadAlphaNum(const TCHAR prev, Token* t);
+
+    // Reads for apostrophe-containing word.
+    bool ReadApostrophe(CL_NS(util)::StringBuffer* str, Token* t);
+
+    // Reads for something@... it may be a COMPANY name or a EMAIL address
+    bool ReadAt(CL_NS(util)::StringBuffer* str, Token* t);
+
+    // Reads for COMPANY name like AT&T.
+    bool ReadCompany(CL_NS(util)::StringBuffer* str, Token* t);
+    
+    // Reads CJK characters
+    bool ReadCJK(const TCHAR prev, Token* t);
+  };
+
+CL_NS_END2
+#endif
diff --git a/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizerConstants.h b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizerConstants.h
new file mode 100644
index 000000000..3c95af45a
--- /dev/null
+++ b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizerConstants.h
@@ -0,0 +1,30 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+* 
+* Distributable under the terms of either the Apache License (Version 2.0) or 
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_analysis_standard_StandardTokenizerConstants
+#define _lucene_analysis_standard_StandardTokenizerConstants
+
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+
+CL_NS_DEF2(analysis,standard)
+  enum TokenTypes {
+    _EOF,
+    UNKNOWN,
+    ALPHANUM,
+    APOSTROPHE,
+    ACRONYM,
+    COMPANY,
+    EMAIL,
+    HOST,
+    NUM,
+    CJK
+  };
+  extern const TCHAR** tokenImage;
+
+  CL_NS_END2
+#endif