public virtual bool IsNewFragment(Token token)
{
    // Look at the characters two to four positions before the token's start offset.
    char kar1 = this.text[token.StartOffset() - 2];
    char kar2 = this.text[token.StartOffset() - 3];
    char kar3 = this.text[token.StartOffset() - 4];
    // Start a new fragment once the token ends past the midpoint of the current fragment
    // and a critical character precedes it, or unconditionally once the token ends past
    // the full fragment size.
    bool isNewFrag =
        (token.EndOffset() >= (fragmentSize * (currentNumFrags - 1) + (fragmentSize / 2))
            && (isCriticalChar(kar1) || isCriticalChar(kar2) || isCriticalChar(kar3)))
        || (token.EndOffset() >= (fragmentSize * currentNumFrags));
    if (isNewFrag)
    {
        currentNumFrags++;
    }
    return isNewFrag;
}
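A quick numeric sketch of the boundary test above, under assumed values (fragmentSize = 100, currentNumFrags = 1) and assuming isCriticalChar flags sentence punctuation such as '.', '!' and '?':

// Assumed values, for illustration only: fragmentSize = 100, currentNumFrags = 1.
int fragmentSize = 100, currentNumFrags = 1;
// Midpoint rule: a token ending at 60 is past 100 * (1 - 1) + 100 / 2 = 50, so it starts
// a new fragment only if one of the three inspected characters is critical.
bool pastMidpoint = 60 >= (fragmentSize * (currentNumFrags - 1) + (fragmentSize / 2));   // true
// Hard cap: a token ending at 105 is past 100 * 1 = 100, so it starts a new fragment
// regardless of the surrounding characters.
bool pastFullSize = 105 >= (fragmentSize * currentNumFrags);                             // true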
public override Token Next()
{
    Token t = input.Next();
    if (t == null)
    {
        return null;
    }
    // Return a token with filtered characters.
    return new Token(RemoveAccents(t.TermText()), t.StartOffset(), t.EndOffset(), t.Type());
}
/// <summary>
/// Stems the current token, reusing it when stemming does not change the term text.
/// </summary>
/// <returns>Returns the next token in the stream, or null at EOS</returns>
public override Token Next()
{
    if ((token = input.Next()) == null)
    {
        return null;
    }
    else
    {
        String s = stemmer.Stem(token.TermText());
        if (!s.Equals(token.TermText()))
        {
            // Stemming changed the term: emit a new token that keeps the original offsets and type.
            return new Token(s, token.StartOffset(), token.EndOffset(), token.Type());
        }
        return token;
    }
}
public override bool IncrementToken()
{
    if (Upto < Tokens.Length)
    {
        Token token = Tokens[Upto++];
        // TODO: can we just capture/restoreState so
        // we get all attrs...?
        ClearAttributes();
        TermAtt.SetEmpty();
        TermAtt.Append(token.ToString());
        PosIncrAtt.PositionIncrement = token.PositionIncrement;
        PosLengthAtt.PositionLength = token.PositionLength;
        OffsetAtt.SetOffset(token.StartOffset(), token.EndOffset());
        PayloadAtt.Payload = token.Payload;
        return true;
    }
    else
    {
        return false;
    }
}
public virtual void TestCtor()
{
    Token t = new Token();
    char[] content = "hello".ToCharArray();
    t.CopyBuffer(content, 0, content.Length);
    Assert.AreNotSame(t.Buffer(), content);
    Assert.AreEqual(0, t.StartOffset());
    Assert.AreEqual(0, t.EndOffset());
    Assert.AreEqual("hello", t.ToString());
    Assert.AreEqual("word", t.Type);
    Assert.AreEqual(0, t.Flags);

    t = new Token(6, 22);
    t.CopyBuffer(content, 0, content.Length);
    Assert.AreEqual("hello", t.ToString());
    Assert.AreEqual("hello", t.ToString());
    Assert.AreEqual(6, t.StartOffset());
    Assert.AreEqual(22, t.EndOffset());
    Assert.AreEqual("word", t.Type);
    Assert.AreEqual(0, t.Flags);

    t = new Token(6, 22, 7);
    t.CopyBuffer(content, 0, content.Length);
    Assert.AreEqual("hello", t.ToString());
    Assert.AreEqual("hello", t.ToString());
    Assert.AreEqual(6, t.StartOffset());
    Assert.AreEqual(22, t.EndOffset());
    Assert.AreEqual("word", t.Type);
    Assert.AreEqual(7, t.Flags);

    t = new Token(6, 22, "junk");
    t.CopyBuffer(content, 0, content.Length);
    Assert.AreEqual("hello", t.ToString());
    Assert.AreEqual("hello", t.ToString());
    Assert.AreEqual(6, t.StartOffset());
    Assert.AreEqual(22, t.EndOffset());
    Assert.AreEqual("junk", t.Type);
    Assert.AreEqual(0, t.Flags);
}
public override bool IncrementToken()
{
    if (Tokens == null)
    {
        FillTokens();
    }
    //System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size());
    if (Upto == Tokens.Count)
    {
        //System.out.println(" END @ " + tokens.size());
        return false;
    }
    Token t = Tokens[Upto++];
    //System.out.println(" return token=" + t);
    ClearAttributes();
    TermAtt.Append(t.ToString());
    OffsetAtt.SetOffset(t.StartOffset(), t.EndOffset());
    PosIncrAtt.PositionIncrement = t.PositionIncrement;
    PosLengthAtt.PositionLength = t.PositionLength;
    return true;
}
/// <summary>Returns the next token in the stream, or null at EOS.
/// <p>Removes <tt>'s</tt> from the end of words.
/// <p>Removes dots from acronyms.
/// </summary>
public override Lucene.Net.Analysis.Token Next()
{
    Lucene.Net.Analysis.Token t = input.Next();
    if (t == null)
    {
        return null;
    }
    System.String text = t.TermText();
    System.String type = t.Type();
    if (type == APOSTROPHE_TYPE && (text.EndsWith("'s") || text.EndsWith("'S")))
    {
        // Strip the trailing 's while keeping the original offsets and type.
        return new Lucene.Net.Analysis.Token(text.Substring(0, text.Length - 2), t.StartOffset(), t.EndOffset(), type);
    }
    else if (type == ACRONYM_TYPE)
    {
        // Remove dots from the acronym.
        System.Text.StringBuilder trimmed = new System.Text.StringBuilder();
        for (int i = 0; i < text.Length; i++)
        {
            char c = text[i];
            if (c != '.')
            {
                trimmed.Append(c);
            }
        }
        return new Lucene.Net.Analysis.Token(trimmed.ToString(), t.StartOffset(), t.EndOffset(), type);
    }
    else
    {
        return t;
    }
}
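As a usage sketch (not from the original source), assuming the Lucene.Net 2.x-era StandardTokenizer/StandardFilter and Token APIs used above:

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;

// A minimal sketch: run the filter after StandardTokenizer, which tags tokens such as
// "Bob's" as <APOSTROPHE> and "I.B.M." as <ACRONYM>.
TokenStream ts = new StandardFilter(new StandardTokenizer(new StringReader("Bob's at I.B.M.")));
for (Token t = ts.Next(); t != null; t = ts.Next())
{
    // Expected terms are roughly: Bob, at, IBM -- the trailing 's and the acronym dots are
    // stripped, while each token keeps its original start/end offsets into the input string.
    Console.WriteLine(t.TermText() + " [" + t.StartOffset() + ", " + t.EndOffset() + ")");
}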
public virtual Token updateSuffixToken(Token suffixToken, Token lastInputToken)
{
    suffixToken.SetOffset(
        lastInputToken.EndOffset() + suffixToken.StartOffset(),
        lastInputToken.EndOffset() + suffixToken.EndOffset());
    return suffixToken;
}
public virtual Token updateInputToken(Token inputToken, Token lastPrefixToken)
{
    inputToken.SetOffset(
        lastPrefixToken.EndOffset() + inputToken.StartOffset(),
        lastPrefixToken.EndOffset() + inputToken.EndOffset());
    return inputToken;
}
public int EndOffset()
{
    return delegate_Renamed.EndOffset();
}
private bool ProcessToken(ref Lucene.Net.Analysis.Token token)
{
    string type = token.Type();

    if (type == tokentype_number)
    {
        // nobody will remember more than 20 digits
        return token.TermText().Length <= 20;
    }
    else if (type == tokentype_alphanum)
    {
        string text = token.TermText();
        int begin = 0;
        bool found = false;

        // Check if number, in that case strip 0's from beginning
        foreach (char c in text)
        {
            if (!Char.IsDigit(c))
            {
                begin = 0;
                break;
            }
            else if (!found)
            {
                if (c == '0')
                {
                    begin++;
                }
                else
                {
                    found = true;
                }
            }
        }

        if (begin == 0)
        {
            return !IsNoise(text);
        }

        token = new Lucene.Net.Analysis.Token(
            text.Remove(0, begin),
            begin,
            token.EndOffset(),
            type);
        return true;
    }
    else if (type == tokentype_email)
    {
        if (tokenize_email_hostname)
        {
            ProcessEmailToken(token);
        }
        return true;
    }
    else if (type == tokentype_host)
    {
        if (tokenize_email_hostname)
        {
            ProcessURLToken(token);
        }
        return true;
    }
    else
    {
        // FIXME: Noise should be only tested on token type alphanum
        return !IsNoise(token.TermText());
    }
}
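A short trace of the leading-zero stripping above, with assumed alphanum inputs (illustration only):

// "00742" -> all digits, two leading zeros -> begin = 2, token rewritten to "742"
// "0x7f"  -> 'x' is not a digit            -> begin reset to 0, original token kept (noise check applies)
// "12345" -> all digits, no leading zero   -> begin stays 0, original token kept (noise check applies)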
/// <summary>
/// The default implementation adds last prefix token end offset to the suffix token start and end offsets.
/// </summary>
/// <param name="suffixToken">a token from the suffix stream</param>
/// <param name="lastPrefixToken">the last token from the prefix stream</param>
/// <returns>consumer token</returns>
public virtual Token UpdateSuffixToken(Token suffixToken, Token lastPrefixToken)
{
    suffixToken.SetStartOffset(lastPrefixToken.EndOffset() + suffixToken.StartOffset());
    suffixToken.SetEndOffset(lastPrefixToken.EndOffset() + suffixToken.EndOffset());
    return suffixToken;
}
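A worked example of the offset arithmetic above, assuming the three-argument Token(text, start, end) constructor from the same API generation:

// Hypothetical values, for illustration only: the last prefix token ends at offset 3,
// and the suffix token spans [0, 5) within its own stream.
Token lastPrefixToken = new Token("foo", 0, 3);
Token suffixToken = new Token("hello", 0, 5);
suffixToken = UpdateSuffixToken(suffixToken, lastPrefixToken);
// The suffix token is shifted into the concatenated stream: StartOffset() == 3, EndOffset() == 8.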
private void SetCurrentToken(Token token)
{
    if (token == null)
    {
        return;
    }
    ClearAttributes();
    _termAtt.SetTermBuffer(token.TermBuffer(), 0, token.TermLength());
    _posIncrAtt.SetPositionIncrement(token.GetPositionIncrement());
    _flagsAtt.SetFlags(token.GetFlags());
    _offsetAtt.SetOffset(token.StartOffset(), token.EndOffset());
    _typeAtt.SetType(token.Type());
    _payloadAtt.SetPayload(token.GetPayload());
}
public Token UpdateSuffixToken(Token suffixToken, Token lastInputToken)
{
    suffixToken.SetStartOffset(lastInputToken.EndOffset() + suffixToken.StartOffset());
    suffixToken.SetEndOffset(lastInputToken.EndOffset() + suffixToken.EndOffset());
    return suffixToken;
}
public Token UpdateInputToken(Token inputToken, Token lastPrefixToken)
{
    inputToken.SetStartOffset(lastPrefixToken.EndOffset() + inputToken.StartOffset());
    inputToken.SetEndOffset(lastPrefixToken.EndOffset() + inputToken.EndOffset());
    return inputToken;
}
protected override string GetTokenView(Token token)
{
    return token.TermText() +
           " Start: " + token.StartOffset().ToString().PadLeft(5) +
           " End: " + token.EndOffset().ToString().PadLeft(5) +
           "\r\n";
}