// Example #1
        /// <summary>
        /// Decides whether the given token should begin a new fragment.
        /// A new fragment starts either when the token's end offset has
        /// passed the midpoint of the current fragment AND one of the few
        /// characters just before the token is a "critical" character
        /// (per <c>isCriticalChar</c>), or unconditionally once the end
        /// offset passes the full fragment boundary.
        /// </summary>
        /// <param name="token">the token being considered for a fragment break</param>
        /// <returns>true when a new fragment starts at this token</returns>
        public virtual bool IsNewFragment(Token token)
        {
            int start = token.StartOffset();

            // Bounds-checked lookback: the original code indexed
            // text[start - 2 .. start - 4] unconditionally and threw
            // IndexOutOfRangeException for tokens starting before offset 4.
            // Out-of-range positions are simply treated as non-critical.
            bool hasCriticalChar =
                (start >= 2 && isCriticalChar(this.text[start - 2])) ||
                (start >= 3 && isCriticalChar(this.text[start - 3])) ||
                (start >= 4 && isCriticalChar(this.text[start - 4]));

            bool isNewFrag =
                (token.EndOffset() >= (fragmentSize * (currentNumFrags - 1) + (fragmentSize / 2))
                    && hasCriticalChar)
                || (token.EndOffset() >= (fragmentSize * currentNumFrags));

            if (isNewFrag)
            {
                currentNumFrags++;
            }
            return isNewFrag;
        }
        /// <summary>
        /// Returns the next token with its term text run through
        /// <c>RemoveAccents</c>, or null at end of stream.  Offsets and
        /// type are carried over from the source token unchanged.
        /// </summary>
        public override Token Next()
        {
            Token source = input.Next();
            if (source == null)
            {
                return null;
            }

            // Emit a replacement token holding the filtered term text.
            return new Token(
                RemoveAccents(source.TermText()),
                source.StartOffset(),
                source.EndOffset(),
                source.Type());
        }
// Example #3
		/// <summary>
		/// Advances the underlying stream and applies the stemmer to the
		/// current term, allocating a new token only when stemming actually
		/// changed the text.
		/// </summary>
		/// <returns>Returns the next token in the stream, or null at EOS</returns>
		public override Token Next() 
		{
			token = input.Next();
			if (token == null)
			{
				return null;
			}

			String stemmed = stemmer.Stem(token.TermText());

			// Unchanged term: reuse the input token as-is.
			if (stemmed.Equals(token.TermText()))
			{
				return token;
			}

			return new Token(stemmed, token.StartOffset(), token.EndOffset(), token.Type());
		}
// Example #4
 /// <summary>
 /// Copies the next buffered token's attributes (term text, position
 /// increment/length, offsets, payload) into this stream's attribute set.
 /// Returns false once every buffered token has been emitted.
 /// </summary>
 public override bool IncrementToken()
 {
     if (Upto >= Tokens.Length)
     {
         return false;
     }

     Token token = Tokens[Upto++];
     // TODO: can we just capture/restoreState so
     // we get all attrs...?
     ClearAttributes();
     TermAtt.SetEmpty();
     TermAtt.Append(token.ToString());
     PosIncrAtt.PositionIncrement = token.PositionIncrement;
     PosLengthAtt.PositionLength = token.PositionLength;
     OffsetAtt.SetOffset(token.StartOffset(), token.EndOffset());
     PayloadAtt.Payload = token.Payload;
     return true;
 }
// Example #5
        /// <summary>
        /// Verifies each Token constructor: buffer-copy semantics of
        /// CopyBuffer, start/end offsets, the default type ("word"),
        /// default flags, and the flags/type constructor overloads.
        /// (Removed the copy-pasted duplicate ToString() assertions that
        /// appeared in the offset-constructor sections.)
        /// </summary>
        public virtual void TestCtor()
        {
            Token t = new Token();

            char[] content = "hello".ToCharArray();
            t.CopyBuffer(content, 0, content.Length);
            // CopyBuffer must copy, not alias, the caller's array.
            Assert.AreNotSame(t.Buffer(), content);
            Assert.AreEqual(0, t.StartOffset());
            Assert.AreEqual(0, t.EndOffset());
            Assert.AreEqual("hello", t.ToString());
            Assert.AreEqual("word", t.Type);
            Assert.AreEqual(0, t.Flags);

            // (start, end) constructor.
            t = new Token(6, 22);
            t.CopyBuffer(content, 0, content.Length);
            Assert.AreEqual("hello", t.ToString());
            Assert.AreEqual(6, t.StartOffset());
            Assert.AreEqual(22, t.EndOffset());
            Assert.AreEqual("word", t.Type);
            Assert.AreEqual(0, t.Flags);

            // (start, end, flags) constructor.
            t = new Token(6, 22, 7);
            t.CopyBuffer(content, 0, content.Length);
            Assert.AreEqual("hello", t.ToString());
            Assert.AreEqual(6, t.StartOffset());
            Assert.AreEqual(22, t.EndOffset());
            Assert.AreEqual("word", t.Type);
            Assert.AreEqual(7, t.Flags);

            // (start, end, type) constructor.
            t = new Token(6, 22, "junk");
            t.CopyBuffer(content, 0, content.Length);
            Assert.AreEqual("hello", t.ToString());
            Assert.AreEqual(6, t.StartOffset());
            Assert.AreEqual(22, t.EndOffset());
            Assert.AreEqual("junk", t.Type);
            Assert.AreEqual(0, t.Flags);
        }
// Example #6
            /// <summary>
            /// Lazily materializes the token list on the first call, then
            /// emits one buffered token per invocation until exhausted.
            /// </summary>
            public override bool IncrementToken()
            {
                if (Tokens == null)
                {
                    FillTokens();
                }

                // All buffered tokens have been emitted.
                if (Upto == Tokens.Count)
                {
                    return false;
                }

                Token current = Tokens[Upto++];

                ClearAttributes();
                TermAtt.Append(current.ToString());
                OffsetAtt.SetOffset(current.StartOffset(), current.EndOffset());
                PosIncrAtt.PositionIncrement = current.PositionIncrement;
                PosLengthAtt.PositionLength = current.PositionLength;
                return true;
            }
// Example #7
        /// <summary>Returns the next token in the stream, or null at EOS.
        /// <p>Removes <tt>'s</tt> from the end of words.
        /// <p>Removes dots from acronyms.
        /// </summary>
        public override Lucene.Net.Analysis.Token Next()
        {
            Lucene.Net.Analysis.Token t = input.Next();
            if (t == null)
            {
                return null;
            }

            System.String text = t.TermText();
            System.String type = t.Type();

            bool isPossessive = type == APOSTROPHE_TYPE
                && (text.EndsWith("'s") || text.EndsWith("'S"));
            if (isPossessive)
            {
                // Drop the trailing apostrophe-s; offsets are unchanged.
                System.String stripped = text.Substring(0, text.Length - 2);
                return new Lucene.Net.Analysis.Token(stripped, t.StartOffset(), t.EndOffset(), type);
            }

            if (type == ACRONYM_TYPE)
            {
                // Strip every '.' from the acronym, keeping all other chars.
                System.Text.StringBuilder trimmed = new System.Text.StringBuilder(text.Length);
                foreach (char c in text)
                {
                    if (c != '.')
                    {
                        trimmed.Append(c);
                    }
                }
                return new Lucene.Net.Analysis.Token(trimmed.ToString(), t.StartOffset(), t.EndOffset(), type);
            }

            // Neither case applies: pass the token through untouched.
            return t;
        }
 /// <summary>
 /// Shifts the suffix token's offsets to come after the last input
 /// token, adding that token's end offset to both start and end.
 /// </summary>
 /// <param name="suffixToken">token whose offsets are shifted in place</param>
 /// <param name="lastInputToken">token supplying the offset shift</param>
 /// <returns>the (mutated) suffix token</returns>
 public virtual Token updateSuffixToken(Token suffixToken, Token lastInputToken)
 {
     int shift = lastInputToken.EndOffset();
     suffixToken.SetOffset(shift + suffixToken.StartOffset(), shift + suffixToken.EndOffset());
     return suffixToken;
 }
 /// <summary>
 /// Shifts the input token's offsets to come after the last prefix
 /// token, adding that token's end offset to both start and end.
 /// </summary>
 /// <param name="inputToken">token whose offsets are shifted in place</param>
 /// <param name="lastPrefixToken">token supplying the offset shift</param>
 /// <returns>the (mutated) input token</returns>
 public virtual Token updateInputToken(Token inputToken, Token lastPrefixToken)
 {
     int shift = lastPrefixToken.EndOffset();
     inputToken.SetOffset(shift + inputToken.StartOffset(), shift + inputToken.EndOffset());
     return inputToken;
 }
// Example #10
 /// <summary>
 /// Forwards to the wrapped token's end offset.
 /// </summary>
 public int EndOffset()
 {
     return delegate_Renamed.EndOffset();
 }
// Example #11
        /// <summary>
        /// Filters a single token by its type.  Returns false when the token
        /// should be dropped (too-long number, or noise), true to keep it.
        /// All-digit alphanumeric tokens have their leading zeros stripped,
        /// in which case <paramref name="token"/> is replaced.
        /// </summary>
        /// <param name="token">token to inspect; replaced when leading zeros
        /// are stripped from an all-digit alphanumeric token</param>
        /// <returns>true to keep the token, false to discard it</returns>
        private bool ProcessToken(ref Lucene.Net.Analysis.Token token)
        {
            string type = token.Type();

            if (type == tokentype_number)
            {
                // nobody will remember more than 20 digits
                return token.TermText().Length <= 20;
            }
            else if (type == tokentype_alphanum)
            {
                string text  = token.TermText();
                int    begin = 0;
                bool   found = false;
                // If the token is all digits, count its leading zeros so we
                // can strip them; any non-digit cancels the stripping.
                foreach (char c in text)
                {
                    if (!Char.IsDigit(c))
                    {
                        begin = 0;
                        break;
                    }
                    else if (!found)
                    {
                        if (c == '0')
                        {
                            begin++;
                        }
                        else
                        {
                            found = true;
                        }
                    }
                }

                if (begin == 0)
                {
                    return !IsNoise(text);
                }

                // Re-emit the token without its leading zeros.  The start
                // offset must remain relative to the original stream, so
                // shift the token's own start offset by the stripped count
                // (the original code passed the bare zero count, which
                // mis-positioned every token not starting at offset 0).
                token = new Lucene.Net.Analysis.Token(
                    text.Remove(0, begin),
                    token.StartOffset() + begin,
                    token.EndOffset(),
                    type);
                return true;
            }
            else if (type == tokentype_email)
            {
                if (tokenize_email_hostname)
                {
                    ProcessEmailToken(token);
                }
                return true;
            }
            else if (type == tokentype_host)
            {
                if (tokenize_email_hostname)
                {
                    ProcessURLToken(token);
                }
                return true;
            }
            else
            {
                // FIXME: Noise should be only tested on token type alphanum
                return !IsNoise(token.TermText());
            }
        }
 /// <summary>
 /// The default implementation adds last prefix token end offset to the suffix token start and end offsets.
 /// </summary>
 /// <param name="suffixToken">a token from the suffix stream</param>
 /// <param name="lastPrefixToken">the last token from the prefix stream</param>
 /// <returns>consumer token</returns>
 public virtual Token UpdateSuffixToken(Token suffixToken, Token lastPrefixToken)
 {
     int offsetShift = lastPrefixToken.EndOffset();
     suffixToken.SetStartOffset(offsetShift + suffixToken.StartOffset());
     suffixToken.SetEndOffset(offsetShift + suffixToken.EndOffset());
     return suffixToken;
 }
 /// <summary>
 /// Mirrors the given token's term buffer, position increment, flags,
 /// offsets, type and payload into this stream's attributes.
 /// A null token is silently ignored.
 /// </summary>
 private void SetCurrentToken(Token token)
 {
     if (token == null)
     {
         return;
     }

     ClearAttributes();
     _termAtt.SetTermBuffer(token.TermBuffer(), 0, token.TermLength());
     _posIncrAtt.SetPositionIncrement(token.GetPositionIncrement());
     _flagsAtt.SetFlags(token.GetFlags());
     _offsetAtt.SetOffset(token.StartOffset(), token.EndOffset());
     _typeAtt.SetType(token.Type());
     _payloadAtt.SetPayload(token.GetPayload());
 }
 /// <summary>
 /// Shifts the suffix token's start and end offsets forward by the
 /// last input token's end offset.
 /// </summary>
 /// <param name="suffixToken">token whose offsets are shifted in place</param>
 /// <param name="lastInputToken">token supplying the offset shift</param>
 /// <returns>the (mutated) suffix token</returns>
 public Token UpdateSuffixToken(Token suffixToken, Token lastInputToken)
 {
     int offsetShift = lastInputToken.EndOffset();
     suffixToken.SetStartOffset(offsetShift + suffixToken.StartOffset());
     suffixToken.SetEndOffset(offsetShift + suffixToken.EndOffset());
     return suffixToken;
 }
 /// <summary>
 /// Shifts the input token's start and end offsets forward by the
 /// last prefix token's end offset.
 /// </summary>
 /// <param name="inputToken">token whose offsets are shifted in place</param>
 /// <param name="lastPrefixToken">token supplying the offset shift</param>
 /// <returns>the (mutated) input token</returns>
 public Token UpdateInputToken(Token inputToken, Token lastPrefixToken)
 {
     int offsetShift = lastPrefixToken.EndOffset();
     inputToken.SetStartOffset(offsetShift + inputToken.StartOffset());
     inputToken.SetEndOffset(offsetShift + inputToken.EndOffset());
     return inputToken;
 }
 /// <summary>
 /// Renders a token as its term text followed by its start and end
 /// offsets (each right-padded to 5 columns) and a CRLF.
 /// </summary>
 protected override string GetTokenView(Token token)
 {
     string start = token.StartOffset().ToString().PadLeft(5);
     string end = token.EndOffset().ToString().PadLeft(5);
     return token.TermText() + "   Start: " + start + "  End: " + end + "\r\n";
 }