A TokenStream enumerates the sequence of tokens, either from Fields of a Document or from query text.

This is an abstract class; its concrete subclasses are:

  • Tokenizer, a TokenStream whose input is a Reader; and
  • TokenFilter, a TokenStream whose input is another TokenStream.
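
For illustration, here is a minimal sketch of how these two subclasses compose into an analysis chain (assuming the Lucene.Net 3.x StandardTokenizer, LowerCaseFilter and StopFilter types; the names and version are illustrative, not prescriptive):

 using System.IO;
 using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.Standard;

 public static class AnalysisChainSketch
 {
     // A Tokenizer consumes a Reader; each TokenFilter decorates another TokenStream.
     public static TokenStream Build(TextReader reader)
     {
         TokenStream ts = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, reader);
         ts = new LowerCaseFilter(ts);
         ts = new StopFilter(true, ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
         return ts;
     }
 }
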
A new TokenStream API was introduced with Lucene 2.9. This API has moved from being Token-based to Attribute-based. While Token still exists in 2.9 as a convenience class, the preferred way to store a token's information is to use AttributeImpls.

TokenStream now extends AttributeSource, which provides access to all of the token Attributes for the TokenStream. Note that only one instance per AttributeImpl is created and reused for every token. This approach reduces object creation and allows local caching of references to the AttributeImpls. See #IncrementToken() for further details.

The workflow of the new TokenStream API is as follows:

  1. Instantiation of TokenStream/TokenFilters which add/get attributes to/from the AttributeSource.
  2. The consumer calls TokenStream#reset().
  3. The consumer retrieves attributes from the stream and stores local references to all attributes it wants to access.
  4. The consumer calls #IncrementToken() until it returns false, consuming the attributes after each call.
  5. The consumer calls #end() so that any end-of-stream operations can be performed.
  6. The consumer calls #close() to release any resource when finished using the TokenStream.
To make sure that filters and consumers know which attributes are available, the attributes must be added during instantiation. Filters and consumers are not required to check for availability of attributes in #IncrementToken().
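
As a minimal sketch of this workflow against the Lucene.Net 3.x attribute API (the analyzer, field name and attribute choices here are purely illustrative):

 using System;
 using System.IO;
 using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.Standard;
 using Lucene.Net.Analysis.Tokenattributes;

 public static class ConsumerSketch
 {
     public static void PrintTerms(string text)
     {
         var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
         TokenStream ts = analyzer.TokenStream("field", new StringReader(text));

         // Steps 1/3: add the attributes and keep local references before consuming.
         ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();
         IOffsetAttribute offsetAtt = ts.AddAttribute<IOffsetAttribute>();

         ts.Reset();                     // step 2
         while (ts.IncrementToken())     // step 4: consume the attributes after each call
         {
             Console.WriteLine("{0} [{1}-{2}]", termAtt.Term, offsetAtt.StartOffset, offsetAtt.EndOffset);
         }
         ts.End();                       // step 5: end-of-stream operations
         ts.Close();                     // step 6: release resources
     }
 }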

You can find some example code for the new API in the analysis package level Javadoc.

Sometimes it is desirable to capture the current state of a TokenStream, e.g., for buffering purposes (see CachingTokenFilter, TeeSinkTokenFilter). For this use case AttributeSource#captureState and AttributeSource#restoreState can be used.
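
A minimal, hypothetical sketch of such buffering (assuming the Lucene.Net 3.x CaptureState()/RestoreState() members of AttributeSource and the protected input field of TokenFilter): a filter that emits every token twice by capturing the attribute state and replaying it:

 using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.Tokenattributes;

 public sealed class RepeatTokenFilter : TokenFilter
 {
     private readonly IPositionIncrementAttribute posIncAtt;
     private State savedState; // snapshot of all attribute values for the buffered token

     public RepeatTokenFilter(TokenStream input)
         : base(input)
     {
         posIncAtt = AddAttribute<IPositionIncrementAttribute>();
     }

     public override bool IncrementToken()
     {
         if (savedState != null)
         {
             RestoreState(savedState);        // replay the buffered token
             savedState = null;
             posIncAtt.PositionIncrement = 0; // emit it at the same position
             return true;
         }
         if (!input.IncrementToken())
             return false;
         savedState = CaptureState();         // remember the current attribute values
         return true;
     }

     public override void Reset()
     {
         base.Reset();
         savedState = null;
     }
 }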

The TokenStream API in Lucene is based on the decorator pattern. Therefore all non-abstract subclasses must be final or at least have a final implementation of #IncrementToken()! This is checked when Java assertions are enabled.
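
In C# terms, "final" corresponds to sealed; a minimal sketch of the two acceptable shapes (the PassThrough names are purely illustrative):

 using Lucene.Net.Analysis;

 // Option 1: seal the whole filter class.
 public sealed class PassThroughFilter : TokenFilter
 {
     public PassThroughFilter(TokenStream input) : base(input) { }

     public override bool IncrementToken()
     {
         return input.IncrementToken();
     }
 }

 // Option 2: keep the class open, but make IncrementToken itself non-overridable.
 public class PassThroughFilterBase : TokenFilter
 {
     public PassThroughFilterBase(TokenStream input) : base(input) { }

     public sealed override bool IncrementToken()
     {
         return input.IncrementToken();
     }
 }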

Inheritance: Lucene.Net.Util.AttributeSource, IDisposable
Example #1
 public SimplePayloadFilter(TokenStream input)
     : base(input)
 {
     Pos = 0;
     PayloadAttr = input.AddAttribute<IPayloadAttribute>();
     TermAttr = input.AddAttribute<ICharTermAttribute>();
 }
Example #2
 /// <summary> Build a filter that removes words that are too long or too
 /// short from the text.
 /// </summary>
 public LengthFilter(TokenStream in_Renamed, int min, int max)
     : base(in_Renamed)
 {
     this.min = min;
     this.max = max;
     termAtt = AddAttribute<ITermAttribute>();
 }
Example #3
 /// <summary> Build a filter that removes words that are too long or too
 /// short from the text.
 /// </summary>
 public LengthFilter(TokenStream in_Renamed, int min, int max)
     : base(in_Renamed)
 {
     this.min = min;
     this.max = max;
     termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute));
 }
Example #4
        private string GetTokenizerText(string text)
        {
            StringBuilder result = new StringBuilder();

            MyAnalyzer ma = new MyAnalyzer(AnalyzerFactory.stopWords);

            Lucene.Net.Analysis.TokenStream ts = ma.TokenStream("", new System.IO.StringReader(text));

            Lucene.Net.Analysis.Token token;
            while ((token = ts.Next()) != null)
            {
                int    len  = token.TermLength();
                char[] buff = token.TermBuffer();
                if (len == 1)
                {
                    if (buff[0] != MyFilter.Separator)
                    {
                        result.Append(buff, 0, 1);
                    }
                }
                else
                {
                    result.Append(buff, 0, len);
                }
            }
            ts.Close();

            return(result.ToString());
        }
Example #5
        public string GetTokenView(TokenStream tokenStream, out int numberOfTokens)
        {
            var sb = new StringBuilder();
            numberOfTokens = 0;

            var termAttr = tokenStream.GetAttribute<ITermAttribute>();
            var startOffset = tokenStream.GetAttribute<Lucene.Net.Analysis.Tokenattributes.IOffsetAttribute>();
            while (tokenStream.IncrementToken())
            {

                sb.Append(termAttr.Term + "   Start: " + startOffset.StartOffset.ToString().PadLeft(5) + "  End: " + startOffset.EndOffset.ToString().PadLeft(5) + "\r\n");

                //var view = "[" + termAttr.Term + "]   ";
                //sb.Append(view);
                numberOfTokens++;
            }

            return sb.ToString();

            //StringBuilder sb = new StringBuilder();

            //Token token = tokenStream.Next();

            //numberOfTokens = 0;

            //while (token != null)
            //{
            //    numberOfTokens++;
            //    sb.Append(token.TermText() + "   Start: " + token.StartOffset().ToString().PadLeft(5) + "  End: " + token.EndOffset().ToString().PadLeft(5) + "\r\n");
            //    token = tokenStream.Next();
            //}

            //return sb.ToString();
        }
Example #6
        public static Token NextToken(TokenStream input, Token reusableToken)
        {
            if (input == null) 
                return null;
            if (!input.IncrementToken()) 
                return null;

            ITermAttribute termAtt = input.GetAttribute<ITermAttribute>();
            IOffsetAttribute offsetAtt = input.GetAttribute<IOffsetAttribute>();
            ITypeAttribute typeAtt = input.GetAttribute<ITypeAttribute>();

            if (reusableToken == null)
            {
                reusableToken = new Token();
            }
            reusableToken.Clear();

            if (termAtt != null)
                reusableToken.SetTermBuffer(termAtt.TermBuffer(), 0, termAtt.TermLength());

            if (offsetAtt != null)
            {
                reusableToken.StartOffset = offsetAtt.StartOffset;
                reusableToken.EndOffset = offsetAtt.EndOffset;
            }

            if (typeAtt != null)
                reusableToken.Type = typeAtt.Type;

            return reusableToken;
        }
 public MockHoleInjectingTokenFilter(Random random, TokenStream @in)
     : base(@in)
 {
     RandomSeed = random.Next();
     PosIncAtt = AddAttribute<IPositionIncrementAttribute>();
     PosLenAtt = AddAttribute<IPositionLengthAttribute>();
 }
 public MockVariableLengthPayloadFilter(Random random, TokenStream @in)
     : base(@in)
 {
     this.Random = random;
     this.Payload = new BytesRef(Bytes);
     this.PayloadAtt = AddAttribute<IPayloadAttribute>();
 }
Example #9
 public CamelCaseFilter(TokenStream stream)
     : base(stream)
 {
     _termAttribute = AddAttribute<ITermAttribute>();
     _offsetAttribute = AddAttribute<IOffsetAttribute>();
     _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>();
 }
Example #10
        public void v()
        {
            //Analyzer analyzer = new CJKAnalyzer();
            //TokenStream tokenStream = analyzer.TokenStream("", new StringReader("我爱你中国China中华人名共和国"));
            //Lucene.Net.Analysis.Token token = null;
            //while ((token = tokenStream.Next()) != null)
            //{
            //    Response.Write(token.TermText() + "<br/>");
            //}

            Lucene.Net.Analysis.Standard.StandardAnalyzer a = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
            string s = "我日中华人民共和国";

            System.IO.StringReader          reader = new System.IO.StringReader(s);
            Lucene.Net.Analysis.TokenStream ts     = a.TokenStream(s, reader);
            bool hasnext = ts.IncrementToken();

            Lucene.Net.Analysis.Tokenattributes.ITermAttribute ita;
            while (hasnext)
            {
                ita = ts.GetAttribute <Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
                Console.WriteLine(ita.Term);
                hasnext = ts.IncrementToken();
            }
            ts.CloneAttributes();
            reader.Close();
            a.Close();
            Console.ReadKey();
        }
Example #11
        private int GetTokenizerLength(string text)
        {
            int result = 0;

            MyAnalyzer ma = new MyAnalyzer(AnalyzerFactory.stopWords);

            Lucene.Net.Analysis.TokenStream ts = ma.TokenStream("", new System.IO.StringReader(text));

            Lucene.Net.Analysis.Token token;
            while ((token = ts.Next()) != null)
            {
                int len = token.TermLength();
                if (len == 1)
                {
                    char[] buff = token.TermBuffer();
                    if (buff[0] != MyFilter.Separator)
                    {
                        result++;
                    }
                }
                else
                {
                    result += len;
                }
            }
            ts.Close();

            return(result);
        }
Example #12
        public FrenchStemFilter(TokenStream _in)
            : base(_in)
        {

            stemmer = new FrenchStemmer();
            termAtt = AddAttribute<ITermAttribute>();
        }
Example #13
        public void TestMethod1()
        {
            Lucene.Net.Analysis.Standard.StandardAnalyzer a = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
            string s = "我日中华人民共和国";

            System.IO.StringReader          reader = new System.IO.StringReader(s);
            Lucene.Net.Analysis.TokenStream ts     = a.TokenStream(s, reader);
            bool hasnext = ts.IncrementToken();

            Lucene.Net.Analysis.Tokenattributes.ITermAttribute ita;

            while (hasnext)
            {
                ita = ts.GetAttribute <Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
                Console.WriteLine(ita.Term);
                hasnext = ts.IncrementToken();
            }

            Console.WriteLine("over");


            ts.CloneAttributes();
            reader.Close();
            a.Close();
        }
Example #14
 /// <summary>
 /// Create a new MockTokenFilter.
 /// </summary>
 /// <param name="input"> TokenStream to filter </param>
 /// <param name="filter"> DFA representing the terms that should be removed. </param>
 public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter)
     : base(input)
 {
     this.Filter = filter;
     TermAtt = AddAttribute<ICharTermAttribute>();
     PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
 }
Example #15
        public AddSuffixFilter(TokenStream input, Dictionary<string, char[]> _suffixByTokenType)
            : base(input)
        {
            termAtt = AddAttribute<ITermAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
            this.suffixByTokenType = _suffixByTokenType;
        }
Example #16
		public NoiseEmailHostFilter (TokenStream input, bool tokenize_email_hostname, LinkCallback link_call_back)
			: base (input)
		{
			this.token_stream = input;
			this.tokenize_email_hostname = tokenize_email_hostname;
			this.link_call_back = link_call_back;
		}
Example #17
 public SingleCharTokenizer(TokenStream input): base(input)
 {
     _input = input;
     _termAttribute = (TermAttribute)AddAttribute(typeof(TermAttribute));
     _offsetAttribute = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
     _positionIncrementAttribute = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
 }
Example #18
        /// <summary> Adds term frequencies found by tokenizing text from reader into the Map words</summary>
        /// <param name="r">a source of text to be tokenized
        /// </param>
        /// <param name="termFreqMap">a Map of terms and their frequencies
        /// </param>
        /// <param name="fieldName">Used by analyzer for any special per-field analysis
        /// </param>
        protected void AddTermFrequencies(System.IO.TextReader r, System.Collections.IDictionary termFreqMap, System.String fieldName)
        {
            TokenStream ts         = analyzer.TokenStream(fieldName, r);
            var         termAtt    = ts.AddAttribute <ITermAttribute>();
            int         tokenCount = 0;

            while (ts.IncrementToken())
            {
                // for every token
                System.String word = termAtt.Term;
                tokenCount++;
                if (tokenCount > maxNumTokensParsed)
                {
                    break;
                }
                if (IsNoiseWord(word))
                {
                    continue;
                }

                // increment frequency
                Int cnt = (Int)termFreqMap[word];
                if (cnt == null)
                {
                    termFreqMap[word] = new Int();
                }
                else
                {
                    cnt.x++;
                }
            }
        }
        /// <summary> Adds term frequencies found by tokenizing text from reader into the Map words</summary>
        /// <param name="r">a source of text to be tokenized
        /// </param>
        /// <param name="termFreqMap">a Map of terms and their frequencies
        /// </param>
        /// <param name="fieldName">Used by analyzer for any special per-field analysis
        /// </param>
        private void AddTermFrequencies(StreamReader r, IDictionary termFreqMap, String fieldName)
        {
            Lucene.Net.Analysis.TokenStream ts = analyzer.TokenStream(fieldName, r);
            Lucene.Net.Analysis.Token       token;
            int tokenCount = 0;

            while ((token = ts.Next()) != null)
            {
                // for every token
                String word = token.TermText();
                tokenCount++;
                if (tokenCount > maxNumTokensParsed)
                {
                    break;
                }
                if (IsNoiseWord(word))
                {
                    continue;
                }

                // increment frequency
                Int cnt = (Int)termFreqMap[word];
                if (cnt == null)
                {
                    termFreqMap[word] = new Int();
                }
                else
                {
                    cnt.x++;
                }
            }
        }
        public static void AssertTokenStreamContents(TokenStream ts, System.String[] output, int[] startOffsets, int[] endOffsets, System.String[] types, int[] posIncrements, int? finalOffset)
        {
            Assert.IsNotNull(output);
            ICheckClearAttributesAttribute checkClearAtt = ts.AddAttribute<ICheckClearAttributesAttribute>();

            Assert.IsTrue(ts.HasAttribute<ITermAttribute>(), "has no TermAttribute");
            ITermAttribute termAtt = ts.GetAttribute<ITermAttribute>();

            IOffsetAttribute offsetAtt = null;
            if (startOffsets != null || endOffsets != null || finalOffset != null)
            {
                Assert.IsTrue(ts.HasAttribute<IOffsetAttribute>(), "has no OffsetAttribute");
                offsetAtt = ts.GetAttribute<IOffsetAttribute>();
            }
    
            ITypeAttribute typeAtt = null;
            if (types != null)
            {
                Assert.IsTrue(ts.HasAttribute<ITypeAttribute>(), "has no TypeAttribute");
                typeAtt = ts.GetAttribute<ITypeAttribute>();
            }
            
            IPositionIncrementAttribute posIncrAtt = null;
            if (posIncrements != null)
            {
                Assert.IsTrue(ts.HasAttribute<IPositionIncrementAttribute>(), "has no PositionIncrementAttribute");
                posIncrAtt = ts.GetAttribute<IPositionIncrementAttribute>();
            }

            ts.Reset();
            for (int i = 0; i < output.Length; i++)
            {
                // extra safety to enforce, that the state is not preserved and also assign bogus values
                ts.ClearAttributes();
                termAtt.SetTermBuffer("bogusTerm");
                if (offsetAtt != null) offsetAtt.SetOffset(14584724, 24683243);
                if (typeAtt != null) typeAtt.Type = "bogusType";
                if (posIncrAtt != null) posIncrAtt.PositionIncrement = 45987657;

                checkClearAtt.GetAndResetClearCalled(); // reset it, because we called clearAttribute() before
                Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
                Assert.IsTrue(checkClearAtt.GetAndResetClearCalled(), "clearAttributes() was not called correctly in TokenStream chain");

                Assert.AreEqual(output[i], termAtt.Term, "term " + i);
                if (startOffsets != null)
                    Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset, "startOffset " + i);
                if (endOffsets != null)
                    Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset, "endOffset " + i);
                if (types != null)
                    Assert.AreEqual(types[i], typeAtt.Type, "type " + i);
                if (posIncrements != null)
                    Assert.AreEqual(posIncrements[i], posIncrAtt.PositionIncrement, "posIncrement " + i);
            }
            Assert.IsFalse(ts.IncrementToken(), "end of stream");
            ts.End();
            if (finalOffset.HasValue)
                Assert.AreEqual(finalOffset, offsetAtt.EndOffset, "finalOffset ");
            ts.Close();
        }
Example #21
 public SynonymFilter(TokenStream input, ISynonymEngine engine)
     : base(input)
 {
     synonymStack = new Stack<string>();
     this.engine = engine;
     this.termAttr = AddAttribute(typeof(TermAttribute)) as TermAttribute;
     this.posIncrAttr = AddAttribute(typeof(PositionIncrementAttribute)) as PositionIncrementAttribute;
 }
 public CutLeterDigitFilter(TokenStream input)
     : base(input)
 {
     reusableToken = new Token();
     termAtt = AddAttribute<ITermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
 }
Example #23
 private static void DisplayTokens(TokenStream stream)
 {
     TermAttribute term = (TermAttribute) stream.AddAttribute(typeof(TermAttribute));
     while (stream.IncrementToken())
     {
         Console.WriteLine("[{0}]  ", term.Term());
     }
 }
Example #24
 /// <summary>
 /// 
 /// </summary>
 /// <remarks></remarks>
 /// <seealso cref=""/>
 /// <param name="input"></param>
 /// <param name="synonymEngine"></param>
 /// <return></return>
 public SynonymFilter(TokenStream input, ISynonymEngine synonymEngine)
     : base(input)
 {
     synonymStack = new Stack<String>();
     this.engine = synonymEngine;
     this.termAtt = AddAttribute<ITermAttribute>();
     this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
 }
            public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
            {
                TokenStream ts = a.TokenStream(fieldName, reader);

                return(new StopFilter(enablePositionIncrements, ts, new CharArraySet(new List <string> {
                    "stop"
                }, true)));
            }
 public SectionTokenStream(TokenStream tokenStream, int sectionId)
     : base(tokenStream)
 {
     // NOTE: Calling the AddAttribute<T> method failed, so 
     // switched to using AddAttributeImpl.
     _payloadAtt = new PayloadAttribute();
     AddAttributeImpl(_payloadAtt);
     _payload = EncodeIntPayload(sectionId);
 }
 public PayloadFilter(TokenStream input, System.String fieldName) : base(input)
 {
     this.fieldName = fieldName;
     pos            = 0;
     i           = 0;
     posIncrAttr = input.AddAttribute <IPositionIncrementAttribute>();
     payloadAttr = input.AddAttribute <IPayloadAttribute>();
     termAttr    = input.AddAttribute <ITermAttribute>();
 }
    public SynonymFilter(TokenStream in_Renamed, ISynonymEngine engine)
        : base(in_Renamed)
    {
        synonymStack = new Stack<string>();
        this.engine = engine;

        termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
        posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
    }
Example #29
 /// <summary>
 /// Sole constructor. </summary>
 public SuggestStopFilter(TokenStream input, CharArraySet stopWords)
     : base(input)
 {
     this.stopWords = stopWords;
     this.termAtt = AddAttribute<ICharTermAttribute>();
     this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
     this.keywordAtt = AddAttribute<IKeywordAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
 }
Example #30
 public MockPayloadFilter(TokenStream input, string fieldName)
     : base(input)
 {
     this.FieldName = fieldName;
     Pos = 0;
     i = 0;
     PosIncrAttr = input.AddAttribute<IPositionIncrementAttribute>();
     PayloadAttr = input.AddAttribute<IPayloadAttribute>();
     TermAttr = input.AddAttribute<ICharTermAttribute>();
 }
        public ExpandAcronymsFilter(TokenStream input, IAcronymExpansionProvider acronymExpansionProvider)
            : base(input)
        {
            _acronymExpansionProvider = acronymExpansionProvider;

            _termAttribute = AddAttribute<ITermAttribute>();
            _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>();
            _tokenSet = new Queue<string>();
            _recognizedTokens = new HashSet<string>();
        }
Example #32
 public static void DisplayTokens(TokenStream stream)
 {
     // error in Lucene.Net? should work, look in source code why not
     // source: Lucene in Action, page ??
     var term = stream.AddAttribute<TermAttribute>();
     while (stream.IncrementToken()) {
     #if LuceneV303
         Trace.WriteLine("[" + term.Term + "] ");
     #endif
     }
 }
 public MockFixedLengthPayloadFilter(Random random, TokenStream @in, int length)
     : base(@in)
 {
     if (length < 0)
     {
         throw new System.ArgumentException("length must be >= 0");
     }
     this.Random = random;
     this.Bytes = new byte[length];
     this.Payload = new BytesRef(Bytes);
     this.PayloadAtt = AddAttribute<IPayloadAttribute>();
 }
Example #34
        public ExpanderFilter(TokenStream input, [NotNull] Func<String, IEnumerable<Expansion>> expander, Boolean emitSource = true)
            : base(input)
        {
            if (expander == null)
                throw new ArgumentNullException("expander");

            _expander = expander;
            _emitSource = emitSource;
            _termAttr = AddAttribute<ITermAttribute>();
            _posAttr = AddAttribute<IPositionIncrementAttribute>();
            _typeAttr = AddAttribute<ITypeAttribute>();
        }
Example #35
        public SynonymFilter (TokenStream input, SynonymEngine engine) : base(input) {
            if (engine == null)
                throw new ArgumentNullException("synonymEngine");
            synonymStack = new Stack<string>();
            this.engine = engine;

            this.termAtt = (TermAttribute)AddAttribute<ITermAttribute>();
            this.posIncrAtt = (PositionIncrementAttribute)AddAttribute<IPositionIncrementAttribute>();

            //this.termAtt = this.AddAttribute<string>();
            //this.posIncrAtt = this.AddAttribute<string>();
        }
 public PrefixAndSuffixAwareTokenFilter(TokenStream prefix, TokenStream input, TokenStream suffix) : base(suffix)
 {
     _suffix =
         new InjectablePrefixAwareTokenFilter(
             new InjectablePrefixAwareTokenFilter(prefix, input)
                 {
                     UpdateAction = UpdateInputToken
                 },
             suffix)
             {
                 UpdateAction = UpdateSuffixToken
             };
 }
Example #37
		internal virtual void  VerifyPayload(TokenStream ts)
		{
            IPayloadAttribute payloadAtt = ts.GetAttribute<IPayloadAttribute>();
			for (byte b = 1; ; b++)
			{
				bool hasNext = ts.IncrementToken();
				if (!hasNext)
					break;
				// System.out.println("id="+System.identityHashCode(nextToken) + " " + t);
				// System.out.println("payload=" + (int)nextToken.getPayload().toByteArray()[0]);
				Assert.AreEqual(b, payloadAtt.Payload.ToByteArray()[0]);
			}
		}
Example #38
		internal virtual void  VerifyPayload(TokenStream ts)
		{
			Token t = new Token();
			for (byte b = 1; ; b++)
			{
				t.Clear();
				t = ts.Next(t);
				if (t == null)
					break;
				// System.out.println("id="+System.identityHashCode(t) + " " + t);
				// System.out.println("payload=" + (int)t.getPayload().toByteArray()[0]);
				Assert.AreEqual(b, t.GetPayload().ToByteArray()[0]);
			}
		}
Example #39
        /// <summary>
        /// Tokenization test
        /// </summary>
        /// <param name="keyword"></param>
        /// <returns></returns>
        public string Token(string keyword)
        {
            string ret = "";

            System.IO.StringReader          reader = new System.IO.StringReader(keyword);
            Lucene.Net.Analysis.TokenStream ts     = analyzer.TokenStream(keyword, reader);
            Lucene.Net.Analysis.Token       token  = ts.Next();
            while (token != null)
            {
                ret  += " " + token.TermText();
                token = ts.Next();
            }
            ts.CloneAttributes();
            reader.Close();
            analyzer.Close();
            return(ret);
        }
Example #40
        /// <summary>
        /// Tokenization test
        /// </summary>
        /// <param name="keyword"></param>
        /// <returns></returns>
        public string Token(string keyword)
        {
            string ret = "";

            System.IO.StringReader          reader = new System.IO.StringReader(keyword);
            Lucene.Net.Analysis.TokenStream ts     = analyzer.TokenStream(keyword, reader);
            bool hasNext = ts.IncrementToken();

            Lucene.Net.Analysis.Tokenattributes.ITermAttribute ita;
            while (hasNext)
            {
                ita     = ts.GetAttribute <Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
                ret    += ita.Term + "|";
                hasNext = ts.IncrementToken();
            }
            ts.CloneAttributes();
            reader.Close();
            analyzer.Close();
            return(ret);
        }
Example #41
        /// <summary>
        /// Tokenization method
        /// </summary>
        /// <param name="words">Text to be tokenized</param>
        /// <param name="analyzer"></param>
        /// <returns></returns>
        private string cutWords(string words, Analyzer analyzer)
        {
            string resultStr = "";

            System.IO.StringReader          reader = new System.IO.StringReader(words);
            Lucene.Net.Analysis.TokenStream ts     = analyzer.TokenStream(words, reader);
            bool hasNext = ts.IncrementToken();

            Lucene.Net.Analysis.Tokenattributes.ITermAttribute ita;
            while (hasNext)
            {
                ita        = ts.GetAttribute <Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
                resultStr += ita.Term + "|";
                hasNext    = ts.IncrementToken();
            }
            ts.CloneAttributes();
            reader.Close();
            analyzer.Close();
            return(resultStr);
        }
Example #42
        public static List <string> SplitWords(string content)
        {
            List <string> strList = new List <string>();

            using (Analyzer analyzer = new PanGuAnalyzer()) // use the PanGu (PanGuAnalyzer) word-segmentation algorithm
            {
                using (System.IO.StringReader reader = new System.IO.StringReader(content))
                {
                    Lucene.Net.Analysis.TokenStream ts = analyzer.TokenStream(content, reader);

                    while (ts.IncrementToken())
                    {
                        var ita = ts.GetAttribute <Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
                        strList.Add(ita.Term);
                    }
                    ts.CloneAttributes();
                }
            }

            return(strList);
        }
Example #43
 public LowerCaseFilter(TokenStream in_Renamed) : base(in_Renamed)
 {
 }
Example #44
 public StopFilter(TokenStream input, ICollection <string> stopWords, bool ignoreCase) : this(ENABLE_POSITION_INCREMENTS_DEFAULT, input, stopWords, ignoreCase)
 {
 }
Example #45
 public StopFilter(TokenStream in_Renamed, ICollection <string> stopWords) : this(ENABLE_POSITION_INCREMENTS_DEFAULT, in_Renamed, stopWords, false)
 {
 }
Example #46
 public MockGraphTokenFilter(Random random, TokenStream input)
     : base(input)
 {
     Seed    = random.Next();
     TermAtt = AddAttribute <ICharTermAttribute>();
 }
Example #47
            internal readonly IPositionIncrementAttribute PosIncAtt; // = addAttribute(typeof(PositionIncrementAttribute));

            public RemoveATokens(TokenStream @in)
                : base(@in)
            {
                TermAtt   = AddAttribute <ICharTermAttribute>();
                PosIncAtt = AddAttribute <IPositionIncrementAttribute>();
            }
Example #48
 /// <summary>
 /// Creates a new <see cref="TokenStreamComponents"/> instance.
 /// </summary>
 /// <param name="source">
 ///          the analyzer's tokenizer </param>
 /// <param name="result">
 ///          the analyzer's resulting token stream </param>
 public TokenStreamComponents(Tokenizer source, TokenStream result)
 {
     this.m_source = source;
     this.m_sink   = result;
 }
Example #49
 public StopFilter(bool enablePositionIncrements, TokenStream input, System.String[] stopWords) : this(enablePositionIncrements, input, stopWords, false)
 {
 }
Example #50
 public StopFilter(TokenStream in_Renamed, System.String[] stopWords, bool ignoreCase) : this(ENABLE_POSITION_INCREMENTS_DEFAULT, in_Renamed, stopWords, ignoreCase)
 {
 }
Example #51
 public CachingTokenFilter(TokenStream input) : base(input)
 {
 }
 public MockRandomLookaheadTokenFilter(Random random, TokenStream @in)
     : base(@in)
 {
     this.Seed   = random.Next();
     this.random = new Random((int)Seed);
 }
Example #53
 public NeverPeeksLookaheadTokenFilter(TokenStream input)
     : base(input)
 {
 }
Example #54
 /// <summary> Build a filter that removes words that are too long or too
 /// short from the text.
 /// </summary>
 public LengthFilter(TokenStream in_Renamed, int min, int max) : base(in_Renamed)
 {
     this.min = min;
     this.max = max;
     termAtt  = (TermAttribute)AddAttribute(typeof(TermAttribute));
 }
Example #55
 /// <summary>
 /// Creates a new <see cref="TokenStreamComponents"/> instance.
 /// </summary>
 /// <param name="source">
 ///          the analyzer's tokenizer </param>
 public TokenStreamComponents(Tokenizer source)
 {
     this.m_source = source;
     this.m_sink   = source;
 }
Example #56
 public StopFilter(TokenStream input, System.String[] stopWords) : this(ENABLE_POSITION_INCREMENTS_DEFAULT, input, stopWords, false)
 {
 }
Example #57
 /// <summary>Construct a token stream filtering the given input. </summary>
 protected internal TokenFilter(TokenStream input)
 {
     this.input = input;
 }
Example #58
 /// <summary> Constructs a filter which removes words from the input
 /// TokenStream that are named in the Set.
 ///
 /// </summary>
 /// <param name="enablePositionIncrements">true if token positions should record the removed stop words
 /// </param>
 /// <param name="in">Input stream
 /// </param>
 /// <param name="stopWords">The set of Stop Words.
 /// </param>
 /// <seealso cref="MakeStopSet(java.lang.String[])">
 /// </seealso>
 public StopFilter(bool enablePositionIncrements, TokenStream in_Renamed, ICollection <string> stopWords) : this(enablePositionIncrements, in_Renamed, stopWords, false)
 {
 }
Example #59
 public StopFilter(bool enablePositionIncrements, TokenStream in_Renamed, string[] stopWords, bool ignoreCase) : base(in_Renamed)
 {
     this.stopWords = (CharArraySet)MakeStopSet(stopWords, ignoreCase);
     this.enablePositionIncrements = enablePositionIncrements;
     Init();
 }
Example #60
 public TeeTokenFilter(TokenStream input, SinkTokenizer sink) : base(input)
 {
     this.sink = sink;
 }