// Returns the next token. If a previous token was split into `parts`
// (e.g. an e-mail address or hostname broken at separators), emits those
// sub-tokens one per call before pulling a fresh token from token_stream.
public override Lucene.Net.Analysis.Token Next() {
    // Drain any pending sub-tokens from the previously split token first.
    if (parts != null) {
        if (++parts_index < parts.Length) {
            string part = parts [parts_index];
            Lucene.Net.Analysis.Token part_token;
            // FIXME: Searching for google.com will not match www.google.com.
            // If we decide to allow google-style "abcd.1234" which means
            // "abcd 1234" as a consequtive phrase, then adjusting
            // the startOffset and endOffset would enable matching
            // google.com to www.google.com
            // First part of an e-mail token restarts at offset 0; otherwise
            // continue one past where the previous part ended.
            int start_offset = (parts_index == 0 && token_type == tokentype_email ? 0 : last_end_offset + 1); // assuming only one separator
            int end_offset = start_offset + part.Length;
            part_token = new Lucene.Net.Analysis.Token(part, start_offset, end_offset, token_type);
            // Position increment 0: sub-tokens occupy the same position as
            // the original compound token.
            part_token.SetPositionIncrement(0);
            last_end_offset = (parts_index == 0 && token_type == tokentype_email ? -1 : end_offset); // assuming only one separator
            return(part_token);
        }
        else {
            // All parts emitted: clear the split state before reading ahead.
            parts = null;
            parts_index = -1;
            last_end_offset = -1;
            token_type = null;
        }
    }
    // No buffered parts: pull tokens until ProcessToken accepts one
    // (ProcessToken may also refill `parts` for the next calls).
    Token token;
    while ((token = token_stream.Next()) != null) {
        //Console.WriteLine ("Found token: [{0}]", token.TermText ());
        if (ProcessToken(ref token)) {
            return(token);
        }
    }
    // Underlying stream exhausted.
    return(null);
}
/// <summary>
/// Returns the next token from the input stream with its term text replaced
/// by the stemmed form; all other token attributes (offsets, type, position
/// increment) are copied through unchanged. Returns null at end of stream.
/// </summary>
public override Token Next()
{
    Token current = input.Next();
    if (current == null)
    {
        return null;
    }

    string stemmed;
    try
    {
        stemmed = stemmer.Stem(current.TermText());
    }
    catch (Exception e)
    {
        // Surface stemmer failures as SystemException, preserving the cause.
        throw new System.SystemException(e.Message, e);
    }

    Token stemmedToken = new Token(stemmed, current.StartOffset(), current.EndOffset(), current.Type());
    stemmedToken.SetPositionIncrement(current.GetPositionIncrement());
    return stemmedToken;
}
/// <summary> Returns the next input Token whose termText() is not a stop word.</summary>
/// <param name="result">Reusable token passed through to the input stream.</param>
/// <returns>The next non-stop-word token, or null at end of stream.</returns>
public override Token Next(Token result)
{
    // Count positions skipped over stop words so phrase queries stay
    // accurate when position increments are enabled.
    int skippedPositions = 0;
    while ((result = input.Next(result)) != null)
    {
        // Use the TermLength() accessor rather than the raw termLength
        // field, matching the reusable-token overload of this method.
        // TermBuffer() has already initialized the buffer, so the value
        // is identical.
        if (!stopWords.Contains(result.TermBuffer(), 0, result.TermLength()))
        {
            if (enablePositionIncrements)
            {
                result.SetPositionIncrement(result.GetPositionIncrement() + skippedPositions);
            }
            return (result);
        }
        skippedPositions += result.GetPositionIncrement();
    }
    // reached EOS -- return null
    return (null);
}
/// <summary> Returns the next input Token whose term() is not a stop word.</summary>
/// <param name="reusableToken">Token instance reused by the input stream.</param>
/// <returns>The next non-stop-word token, or null at end of stream.</returns>
public override Token Next(/* in */ Token reusableToken)
{
    System.Diagnostics.Debug.Assert(reusableToken != null);

    // Accumulate the increments of any skipped stop words so position
    // information stays correct when enablePositionIncrements is set.
    int skippedPositions = 0;
    Token nextToken;
    while ((nextToken = input.Next(reusableToken)) != null)
    {
        if (stopWords.Contains(nextToken.TermBuffer(), 0, nextToken.TermLength()))
        {
            // Stop word: remember its position and keep scanning.
            skippedPositions += nextToken.GetPositionIncrement();
            continue;
        }
        if (enablePositionIncrements)
        {
            nextToken.SetPositionIncrement(nextToken.GetPositionIncrement() + skippedPositions);
        }
        return (nextToken);
    }

    // reached EOS -- return null
    return (null);
}
/// <summary>
/// Final touch of a shingle token before it is passed on to the consumer from method {@link #next(org.apache.lucene.analysis.Token)}.
///
/// Calculates and sets type, flags, position increment, start/end offsets and weight.
/// </summary>
/// <param name="token">Shingle Token</param>
/// <param name="shingle">Tokens used to produce the shingle token.</param>
/// <param name="currentPermutationStartOffset">Start offset in parameter currentPermutationTokens</param>
/// <param name="currentPermutationRows">index to Matrix.Column.Row from the position of tokens in parameter currentPermutationTokens</param>
/// <param name="currentPermuationTokens">tokens of the current permutation of rows in the matrix. </param>
public void UpdateToken(Token token, List<Token> shingle, int currentPermutationStartOffset, List<Row> currentPermutationRows, List<Token> currentPermuationTokens)
{
    // The shingle spans from the first constituent token to the last one.
    Token firstInShingle = shingle[0];
    Token lastInShingle = shingle[shingle.Count - 1];

    token.SetType(typeof(ShingleMatrixFilter).Name);
    token.SetFlags(0);
    token.SetPositionIncrement(1);
    token.SetStartOffset(firstInShingle.StartOffset());
    token.SetEndOffset(lastInShingle.EndOffset());

    // The weight is derived from the constituent tokens and stored via the codec.
    var weight = CalculateShingleWeight(token, shingle, currentPermutationStartOffset, currentPermutationRows, currentPermuationTokens);
    _settingsCodec.SetWeight(token, weight);
}
/// <summary>
/// Advances the wrapped input stream one position and copies all of its
/// current attributes into <paramref name="token"/>.
/// </summary>
/// <param name="token">Reusable token to populate.</param>
/// <returns>The populated token, or null when the input is exhausted.</returns>
private Token GetNextInputToken(Token token)
{
    if (!_input.IncrementToken())
    {
        return null;
    }

    // Mirror every attribute of the input's current position onto the token.
    token.SetTermBuffer(_inTermAtt.TermBuffer(), 0, _inTermAtt.TermLength());
    token.SetOffset(_inOffsetAtt.StartOffset(), _inOffsetAtt.EndOffset());
    token.SetType(_inTypeAtt.Type());
    token.SetFlags(_inFlagsAtt.GetFlags());
    token.SetPositionIncrement(_inPosIncrAtt.GetPositionIncrement());
    token.SetPayload(_inPayloadAtt.GetPayload());
    return token;
}
// Returns the next token. If a previous token was split into `parts`
// (e.g. an e-mail address or hostname broken at separators), emits those
// sub-tokens one per call before pulling a fresh token from token_stream.
public override Lucene.Net.Analysis.Token Next () {
    // Drain any pending sub-tokens from the previously split token first.
    if (parts != null) {
        if (++parts_index < parts.Length) {
            string part = parts [parts_index];
            Lucene.Net.Analysis.Token part_token;
            // FIXME: Searching for google.com will not match www.google.com.
            // If we decide to allow google-style "abcd.1234" which means
            // "abcd 1234" as a consequtive phrase, then adjusting
            // the startOffset and endOffset would enable matching
            // google.com to www.google.com
            // First part of an e-mail token restarts at offset 0; otherwise
            // continue one past where the previous part ended.
            int start_offset = (parts_index == 0 && token_type == tokentype_email ? 0 : last_end_offset + 1); // assuming only one separator
            int end_offset = start_offset + part.Length;
            part_token = new Lucene.Net.Analysis.Token (part, start_offset, end_offset, token_type);
            // Position increment 0: sub-tokens occupy the same position as
            // the original compound token.
            part_token.SetPositionIncrement (0);
            last_end_offset = (parts_index == 0 && token_type == tokentype_email ? -1 : end_offset); // assuming only one separator
            return part_token;
        } else {
            // All parts emitted: clear the split state before reading ahead.
            parts = null;
            parts_index = -1;
            last_end_offset = -1;
            token_type = null;
        }
    }
    // No buffered parts: pull tokens until ProcessToken accepts one
    // (ProcessToken may also refill `parts` for the next calls).
    Token token;
    while ( (token = token_stream.Next ()) != null) {
        //Console.WriteLine ("Found token: [{0}]", token.TermText ());
        if (ProcessToken (ref token))
            return token;
    }
    // Underlying stream exhausted.
    return null;
}
/// <summary>
/// Verifies that SnowballFilter stems the term text ("accents" -> "accent")
/// while preserving offsets, type and position increment.
/// </summary>
public virtual void TestFilterTokens()
{
    // Arrange: a token with non-default offsets, type and position increment.
    Token tok = new Token("accents", 2, 7, "wrd");
    tok.SetPositionIncrement(3);
    SnowballFilter filter = new SnowballFilter(new AnonymousClassTokenStream(tok, this), "English");

    // Act.
    Token newtok = filter.Next();

    // Assert. Guard against null first: dereferencing a null token would
    // raise a NullReferenceException and mask the real failure. Each
    // Trace.Assert carries a message — without one, a failure reports
    // nothing useful about which expectation broke.
    System.Diagnostics.Trace.Assert(newtok != null, "filter must emit a token");
    System.Diagnostics.Trace.Assert("accent" == newtok.TermText(), "term should be stemmed to 'accent'");
    System.Diagnostics.Trace.Assert(2 == newtok.StartOffset(), "start offset must be preserved");
    System.Diagnostics.Trace.Assert(7 == newtok.EndOffset(), "end offset must be preserved");
    System.Diagnostics.Trace.Assert("wrd" == newtok.Type(), "token type must be preserved");
    System.Diagnostics.Trace.Assert(3 == newtok.GetPositionIncrement(), "position increment must be preserved");
}
/// <summary>
/// Forwards the position increment to the wrapped (delegate) token.
/// </summary>
/// <param name="positionIncrement">Position increment to set on the delegate.</param>
public void SetPositionIncrement(int positionIncrement)
{
    delegate_Renamed.SetPositionIncrement(positionIncrement);
}
/// <summary>
/// Creates a token with the given text, offsets and position increment, and
/// attaches the weight and positioner via the default settings codec.
/// </summary>
private static Token TokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset, TokenPositioner positioner)
{
    var result = new Token(startOffset, endOffset);
    result.SetTermBuffer(text);
    result.SetPositionIncrement(posIncr);

    var codec = ShingleMatrixFilter.DefaultSettingsCodec;
    codec.SetWeight(result, weight);
    codec.SetTokenPositioner(result, positioner);
    return result;
}
/// <summary>
/// Creates a token spanning the given offsets with the given term text and
/// position increment.
/// </summary>
private static Token TokenFactory(String text, int posIncr, int startOffset, int endOffset)
{
    var result = new Token(startOffset, endOffset);
    result.SetTermBuffer(text);
    result.SetPositionIncrement(posIncr);
    return result;
}
/// <summary>
/// Verifies that SnowballFilter stems the term text ("accents" -> "accent")
/// while leaving offsets, type and position increment untouched.
/// </summary>
public virtual void TestFilterTokens()
{
    // Arrange: a token carrying non-default offsets, type and position increment.
    Token source = new Token("accents", 2, 7, "wrd");
    source.SetPositionIncrement(3);
    SnowballFilter filter = new SnowballFilter(new AnonymousClassTokenStream(source, this), "English");

    // Act: pull the single stemmed token through the filter.
    Token stemmed = filter.Next();

    // Assert: only the term changes; every other attribute passes through.
    Assert.AreEqual("accent", stemmed.TermText());
    Assert.AreEqual(2, stemmed.StartOffset());
    Assert.AreEqual(7, stemmed.EndOffset());
    Assert.AreEqual("wrd", stemmed.Type());
    Assert.AreEqual(3, stemmed.GetPositionIncrement());
}
/// <summary>
/// Advances the suffix stream one position and copies all of its current
/// attributes into <paramref name="token"/>.
/// </summary>
/// <param name="token">Reusable token to populate.</param>
/// <returns>The populated token, or null when the suffix stream is exhausted.</returns>
private Token GetNextSuffixInputToken(Token token)
{
    if (!Suffix.IncrementToken())
    {
        return null;
    }

    // Mirror every attribute of the suffix stream's current position.
    token.SetTermBuffer(_termAtt.TermBuffer(), 0, _termAtt.TermLength());
    token.SetOffset(_offsetAtt.StartOffset(), _offsetAtt.EndOffset());
    token.SetType(_typeAtt.Type());
    token.SetFlags(_flagsAtt.GetFlags());
    token.SetPositionIncrement(_posIncrAtt.GetPositionIncrement());
    token.SetPayload(_payloadAtt.GetPayload());
    return token;
}
/// <summary> Returns the next input Token whose termText() is not a stop word.</summary>
/// <param name="result">Reusable token passed through to the input stream.</param>
/// <returns>The next non-stop-word token, or null at end of stream.</returns>
public override Token Next(Token result)
{
    // Count positions skipped over stop words so phrase queries stay
    // accurate when position increments are enabled.
    int skippedPositions = 0;
    while ((result = input.Next(result)) != null)
    {
        // Use the TermLength() accessor rather than the raw termLength
        // field, matching the reusable-token overload of this method.
        // TermBuffer() has already initialized the buffer, so the value
        // is identical.
        if (!stopWords.Contains(result.TermBuffer(), 0, result.TermLength()))
        {
            if (enablePositionIncrements)
            {
                result.SetPositionIncrement(result.GetPositionIncrement() + skippedPositions);
            }
            return result;
        }
        skippedPositions += result.GetPositionIncrement();
    }
    // reached EOS -- return null
    return null;
}
// Test token stream: for every real token it also buffers a zero-increment
// "synonym" token (term "b") that is returned on the following call.
// A leading digit in the term sets that token's position increment, and the
// very first token gets a payload ({100}).
public override Token Next(Token result) {
    // Emit the buffered synonym from the previous call, if any.
    if (buffered != null) {
        Token t = buffered;
        buffered = null;
        return t;
    }
    Token t2 = input.Next(result);
    if (t2 == null) return null;
    // A digit as the first character encodes the desired position increment.
    if (System.Char.IsDigit(t2.TermBuffer()[0])) {
        t2.SetPositionIncrement(t2.TermBuffer()[0] - '0');
    }
    if (first) {
        // set payload on first position only
        t2.SetPayload(new Payload(new byte[]{100}));
        first = false;
    }
    // index a "synonym" for every token: clone it, strip the payload, put it
    // at the same position (increment 0) and replace the term with "b".
    buffered = (Token) t2.Clone();
    buffered.SetPayload(null);
    buffered.SetPositionIncrement(0);
    buffered.SetTermBuffer(new char[]{'b'}, 0, 1);
    return t2;
}
/// <summary>
/// Test helper: builds a Token with the given text, offsets and position
/// increment.
/// </summary>
public static Token t(String text, int startOffset, int endOffset, int positionIncrement)
{
    var result = new Token(text, startOffset, endOffset);
    result.SetPositionIncrement(positionIncrement);
    return result;
}