A Tokenizer is a TokenStream whose input is a Reader.

This is an abstract class; subclasses must override IncrementToken().

NOTE: Subclasses overriding IncrementToken() must call AttributeSource.ClearAttributes() before setting attributes.

Inheritance: TokenStream
Beispiel #1
0
		/// <summary>
		/// Tokenizes HTML-stripped input and checks that, once mapped back through
		/// <c>CorrectOffset</c>, each token's offset points at the token's position in the
		/// original (unstripped) input, and that <c>LengthInSource</c> matches the token length.
		/// </summary>
		public void IncrementsOffsetCorrectlyWithAnotherReader2()
		{
			// Positions in the raw input: "test1" at 0, "testlink" at 20 (after the
			// 14-character opening tag), "test2" at 33 and "test3" at 39.
			const string input = @"test1 <a href=""foo"">testlink</a> test2 test3";

			CharFilter filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)));
			Tokenizer t = new Tokenizer(filter);

			// Only the tokenizer's offset/length state is inspected; the token text is not asserted.
			string token;

			t.NextToken(out token);
			Assert.Equal(0, filter.CorrectOffset(t.Offset));
			Assert.Equal(5, t.LengthInSource);

			t.NextToken(out token);
			Assert.Equal(20, filter.CorrectOffset(t.Offset));
			Assert.Equal(8, t.LengthInSource);

			t.NextToken(out token);
			Assert.Equal(33, filter.CorrectOffset(t.Offset));
			Assert.Equal(5, t.LengthInSource);

			t.NextToken(out token);
			Assert.Equal(39, filter.CorrectOffset(t.Offset));
			Assert.Equal(5, t.LengthInSource);
		}
Beispiel #2
0
		/// <summary>
		/// Tokenizes HTML-stripped input and checks that every token reports the expected
		/// (uncorrected) offset and a source length of 4, and that exactly as many tokens
		/// are produced as there are expected offsets.
		/// </summary>
		public void IncrementsOffsetCorrectlyWithAnotherReader()
		{
			int[] expectedOffsets = { 0, 5, 10, 15 };
			int curPos = 0;

			string token;
			Tokenizer t =
				new Tokenizer(
					new HTMLStripCharFilter(CharReader.Get(new System.IO.StringReader(@"test<a href=""foo"">test</a>test test"))));

			while (true)
			{
				Tokenizer.TokenType tokenType = t.NextToken(out token);

				// A token type of 0 is treated as the end of input.
				if (tokenType == 0)
				{
					break;
				}

				Assert.Equal(expectedOffsets[curPos++], t.Offset);
				Assert.Equal(4, t.LengthInSource);
			}

			// Without this check the test would pass vacuously if the tokenizer
			// stopped early (or produced no tokens at all).
			Assert.Equal(expectedOffsets.Length, curPos);
		}
Beispiel #3
0
 /// <summary>
 /// Initializes a new <see cref="TokenStreamComponents"/> in which the given
 /// tokenizer is used as both the source and the sink.
 /// </summary>
 /// <param name="source">
 ///          the analyzer's tokenizer </param>
 public TokenStreamComponents(Tokenizer source)
 {
     Source = source;
     Sink = source;
 }
Beispiel #4
0
 /// <summary>
 /// Initializes a new <see cref="TokenStreamComponents"/> from a source tokenizer
 /// and a separate resulting token stream.
 /// </summary>
 /// <param name="source">
 ///          the analyzer's tokenizer; stored as the source </param>
 /// <param name="result">
 ///          the analyzer's resulting token stream; stored as the sink </param>
 public TokenStreamComponents(Tokenizer source, TokenStream result)
 {
     Source = source;
     Sink = result;
 }
        /// <summary>
        /// Advances this stream by one token. Each token read from <c>realStream</c> is
        /// emitted first; if <c>synonyms</c> holds a comma-separated expansion list for
        /// its term, the expansions are then emitted one per call with the same offsets
        /// and a position increment of 0.
        /// </summary>
        /// <returns>
        /// <c>false</c> when <c>realStream</c> has no more tokens and no expansions are
        /// pending; otherwise <c>true</c>.
        /// </returns>
        public override bool IncrementToken()
        {
            // Pending expansions of the previously emitted real token come first.
            if (currentRealToken != null)
            {
                String expansion = st.NextToken();
                ClearAttributes();
                termAtt.SetTermBuffer(expansion);
                offsetAtt.SetOffset(currentRealToken.StartOffset, currentRealToken.EndOffset);
                // Position increment 0 stacks the expansion on the real token's position.
                posIncrAtt.PositionIncrement = 0;

                if (!st.HasMoreTokens())
                {
                    // Expansion list exhausted: the next call reads from the real stream again.
                    currentRealToken = null;
                    st = null;
                }

                return true;
            }

            if (!realStream.IncrementToken())
            {
                return false;
            }

            // Copy the real token's term, offsets and position increment into this stream's attributes.
            ClearAttributes();
            termAtt.SetTermBuffer(realTermAtt.Term);
            offsetAtt.SetOffset(realOffsetAtt.StartOffset, realOffsetAtt.EndOffset);
            posIncrAtt.PositionIncrement = realPosIncrAtt.PositionIncrement;

            String expansions = synonyms[realTermAtt.Term];
            if (expansions != null)
            {
                st = new Tokenizer(expansions, ",");
                if (st.HasMoreTokens())
                {
                    // Remember the real token so its offsets can be reused for each expansion.
                    currentRealToken = new Token(realOffsetAtt.StartOffset, realOffsetAtt.EndOffset);
                    currentRealToken.SetTermBuffer(realTermAtt.Term);
                }
            }

            return true;
        }
 /// <summary>
 /// Creates the filter over <paramref name="tokenizer"/>, stores the supplied outer
 /// instance, sets the <c>first</c> flag, and registers the term, payload and
 /// position-increment attributes.
 /// </summary>
 /// <param name="outerInstance"> the outer instance this filter keeps a reference to </param>
 /// <param name="tokenizer"> the tokenizer passed to the base constructor </param>
 public TokenFilterAnonymousInnerClassHelper(AnalyzerAnonymousInnerClassHelper2 outerInstance, Tokenizer tokenizer)
     : base(tokenizer)
 {
     OuterInstance = outerInstance;
     first = true;

     // Attributes are registered in the same order as before.
     termAtt = AddAttribute<ICharTermAttribute>();
     payloadAtt = AddAttribute<IPayloadAttribute>();
     posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
 }
Beispiel #7
0
 /// <summary>
 /// Initializes a new <see cref="TokenStreamComponents"/> in which the given
 /// tokenizer is used as both the source and the sink.
 /// </summary>
 /// <param name="source">
 ///          the analyzer's tokenizer </param>
 public TokenStreamComponents(Tokenizer source)
 {
     m_source = source;
     m_sink = source;
 }
Beispiel #8
0
 /// <summary>
 /// Initializes a new <see cref="TokenStreamComponents"/> from a source tokenizer
 /// and a separate resulting token stream.
 /// </summary>
 /// <param name="source">
 ///          the analyzer's tokenizer; stored as the source </param>
 /// <param name="result">
 ///          the analyzer's resulting token stream; stored as the sink </param>
 public TokenStreamComponents(Tokenizer source, TokenStream result)
 {
     m_source = source;
     m_sink = result;
 }