public string GetTokenView(TokenStream tokenStream, out int numberOfTokens)
{
    var sb = new StringBuilder();
    numberOfTokens = 0;

    var termAttr = tokenStream.GetAttribute<ITermAttribute>();
    var offsetAttr = tokenStream.GetAttribute<Lucene.Net.Analysis.Tokenattributes.IOffsetAttribute>();

    while (tokenStream.IncrementToken())
    {
        sb.Append(termAttr.Term + " Start: " + offsetAttr.StartOffset.ToString().PadLeft(5) + " End: " + offsetAttr.EndOffset.ToString().PadLeft(5) + "\r\n");
        numberOfTokens++;
    }

    return sb.ToString();
}
public static Token NextToken(TokenStream input, Token reusableToken)
{
    if (input == null)
        return null;

    if (!input.IncrementToken())
        return null;

    ITermAttribute termAtt = input.GetAttribute<ITermAttribute>();
    IOffsetAttribute offsetAtt = input.GetAttribute<IOffsetAttribute>();
    ITypeAttribute typeAtt = input.GetAttribute<ITypeAttribute>();

    if (reusableToken == null)
    {
        reusableToken = new Token();
    }

    reusableToken.Clear();

    if (termAtt != null)
        reusableToken.SetTermBuffer(termAtt.TermBuffer(), 0, termAtt.TermLength());

    if (offsetAtt != null)
    {
        reusableToken.StartOffset = offsetAtt.StartOffset;
        reusableToken.EndOffset = offsetAtt.EndOffset;
    }

    if (typeAtt != null)
        reusableToken.Type = typeAtt.Type;

    return reusableToken;
}
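// A minimal usage sketch (not from the original source): drains a TokenStream
// through the NextToken helper above, reusing a single Token instance.
// Assumes a Lucene.Net 3.0.x Analyzer; the field name "content" is arbitrary.
public static void PrintTokens(Analyzer analyzer, string text)
{
    TokenStream stream = analyzer.TokenStream("content", new System.IO.StringReader(text));
    Token reusable = new Token();
    Token token;
    while ((token = NextToken(stream, reusable)) != null)
    {
        Console.WriteLine(token.Term + " [" + token.StartOffset + "-" + token.EndOffset + "]");
    }
    stream.Close();
}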
public void v()
{
    //Analyzer analyzer = new CJKAnalyzer();
    //TokenStream tokenStream = analyzer.TokenStream("", new StringReader("我爱你中国China中华人名共和国"));
    //Lucene.Net.Analysis.Token token = null;
    //while ((token = tokenStream.Next()) != null)
    //{
    //    Response.Write(token.TermText() + "<br/>");
    //}

    Lucene.Net.Analysis.Standard.StandardAnalyzer a = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
    string s = "我日中华人民共和国";
    System.IO.StringReader reader = new System.IO.StringReader(s);
    Lucene.Net.Analysis.TokenStream ts = a.TokenStream(s, reader);

    bool hasnext = ts.IncrementToken();
    Lucene.Net.Analysis.Tokenattributes.ITermAttribute ita;
    while (hasnext)
    {
        ita = ts.GetAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
        Console.WriteLine(ita.Term);
        hasnext = ts.IncrementToken();
    }

    ts.CloneAttributes();
    reader.Close();
    a.Close();
    Console.ReadKey();
}
public void TestMethod1()
{
    Lucene.Net.Analysis.Standard.StandardAnalyzer a = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
    string s = "我日中华人民共和国";
    System.IO.StringReader reader = new System.IO.StringReader(s);
    Lucene.Net.Analysis.TokenStream ts = a.TokenStream(s, reader);

    bool hasnext = ts.IncrementToken();
    Lucene.Net.Analysis.Tokenattributes.ITermAttribute ita;
    while (hasnext)
    {
        ita = ts.GetAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
        Console.WriteLine(ita.Term);
        hasnext = ts.IncrementToken();
    }

    Console.WriteLine("over");
    ts.CloneAttributes();
    reader.Close();
    a.Close();
}
internal virtual void VerifyPayload(TokenStream ts)
{
    IPayloadAttribute payloadAtt = ts.GetAttribute<IPayloadAttribute>();
    for (byte b = 1; ; b++)
    {
        bool hasNext = ts.IncrementToken();
        if (!hasNext)
            break;

        // System.out.println("id=" + System.identityHashCode(nextToken) + " " + t);
        // System.out.println("payload=" + (int)nextToken.getPayload().toByteArray()[0]);
        Assert.AreEqual(b, payloadAtt.Payload.ToByteArray()[0]);
    }
}
/// <summary>
/// Tokenization test.
/// </summary>
/// <param name="keyword">Text to tokenize.</param>
/// <returns>The terms joined by "|".</returns>
public string Token(string keyword)
{
    string ret = "";
    System.IO.StringReader reader = new System.IO.StringReader(keyword);
    Lucene.Net.Analysis.TokenStream ts = analyzer.TokenStream(keyword, reader);

    bool hasNext = ts.IncrementToken();
    Lucene.Net.Analysis.Tokenattributes.ITermAttribute ita;
    while (hasNext)
    {
        ita = ts.GetAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
        ret += ita.Term + "|";
        hasNext = ts.IncrementToken();
    }

    ts.CloneAttributes();
    reader.Close();
    analyzer.Close();
    return ret;
}
/// <summary>
/// Tokenization method.
/// </summary>
/// <param name="words">The content to tokenize.</param>
/// <param name="analyzer">The analyzer to use.</param>
/// <returns>The terms joined by "|".</returns>
private string cutWords(string words, Analyzer analyzer)
{
    string resultStr = "";
    System.IO.StringReader reader = new System.IO.StringReader(words);
    Lucene.Net.Analysis.TokenStream ts = analyzer.TokenStream(words, reader);

    bool hasNext = ts.IncrementToken();
    Lucene.Net.Analysis.Tokenattributes.ITermAttribute ita;
    while (hasNext)
    {
        ita = ts.GetAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
        resultStr += ita.Term + "|";
        hasNext = ts.IncrementToken();
    }

    ts.CloneAttributes();
    reader.Close();
    analyzer.Close();
    return resultStr;
}
public static List<string> SplitWords(string content)
{
    List<string> strList = new List<string>();
    using (Analyzer analyzer = new PanGuAnalyzer()) // use the PanGu (PanGuAnalyzer) word-segmentation algorithm
    {
        using (System.IO.StringReader reader = new System.IO.StringReader(content))
        {
            Lucene.Net.Analysis.TokenStream ts = analyzer.TokenStream(content, reader);
            while (ts.IncrementToken())
            {
                var ita = ts.GetAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
                strList.Add(ita.Term);
            }
            ts.CloneAttributes();
        }
    }
    return strList;
}
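// A minimal usage sketch (not part of the original source): segments a Chinese
// sentence with the SplitWords helper above. Assumes the PanGu analyzer package
// and its dictionary files are available at runtime; the demo text is arbitrary.
public static void SplitWordsDemo()
{
    List<string> terms = SplitWords("我爱北京天安门");
    Console.WriteLine(string.Join(" | ", terms));
}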
public virtual void TestStopList()
{
    var stopWordsSet = Support.Compatibility.SetFactory.CreateHashSet<string>();
    stopWordsSet.Add("good");
    stopWordsSet.Add("test");
    stopWordsSet.Add("analyzer");

    StopAnalyzer newStop = new StopAnalyzer(Version.LUCENE_24, stopWordsSet);
    System.IO.StringReader reader = new System.IO.StringReader("This is a good test of the english stop analyzer");
    TokenStream stream = newStop.TokenStream("test", reader);
    Assert.IsNotNull(stream);

    ITermAttribute termAtt = stream.GetAttribute<ITermAttribute>();
    IPositionIncrementAttribute posIncrAtt = stream.AddAttribute<IPositionIncrementAttribute>();

    while (stream.IncrementToken())
    {
        System.String text = termAtt.Term;
        Assert.IsFalse(stopWordsSet.Contains(text));
        Assert.AreEqual(1, posIncrAtt.PositionIncrement); // in 2.4 stop tokenizer does not apply increments.
    }
}
public static void AssertTokenStreamContents(TokenStream ts, System.String[] output, int[] startOffsets, int[] endOffsets, System.String[] types, int[] posIncrements, int? finalOffset)
{
    Assert.IsNotNull(output);
    ICheckClearAttributesAttribute checkClearAtt = ts.AddAttribute<ICheckClearAttributesAttribute>();

    Assert.IsTrue(ts.HasAttribute<ITermAttribute>(), "has no TermAttribute");
    ITermAttribute termAtt = ts.GetAttribute<ITermAttribute>();

    IOffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null)
    {
        Assert.IsTrue(ts.HasAttribute<IOffsetAttribute>(), "has no OffsetAttribute");
        offsetAtt = ts.GetAttribute<IOffsetAttribute>();
    }

    ITypeAttribute typeAtt = null;
    if (types != null)
    {
        Assert.IsTrue(ts.HasAttribute<ITypeAttribute>(), "has no TypeAttribute");
        typeAtt = ts.GetAttribute<ITypeAttribute>();
    }

    IPositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null)
    {
        Assert.IsTrue(ts.HasAttribute<IPositionIncrementAttribute>(), "has no PositionIncrementAttribute");
        posIncrAtt = ts.GetAttribute<IPositionIncrementAttribute>();
    }

    ts.Reset();
    for (int i = 0; i < output.Length; i++)
    {
        // extra safety to enforce, that the state is not preserved and also assign bogus values
        ts.ClearAttributes();
        termAtt.SetTermBuffer("bogusTerm");
        if (offsetAtt != null) offsetAtt.SetOffset(14584724, 24683243);
        if (typeAtt != null) typeAtt.Type = "bogusType";
        if (posIncrAtt != null) posIncrAtt.PositionIncrement = 45987657;

        checkClearAtt.GetAndResetClearCalled(); // reset it, because we called clearAttribute() before
        Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
        Assert.IsTrue(checkClearAtt.GetAndResetClearCalled(), "clearAttributes() was not called correctly in TokenStream chain");

        Assert.AreEqual(output[i], termAtt.Term, "term " + i);
        if (startOffsets != null)
            Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset, "startOffset " + i);
        if (endOffsets != null)
            Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset, "endOffset " + i);
        if (types != null)
            Assert.AreEqual(types[i], typeAtt.Type, "type " + i);
        if (posIncrements != null)
            Assert.AreEqual(posIncrements[i], posIncrAtt.PositionIncrement, "posIncrement " + i);
    }

    Assert.IsFalse(ts.IncrementToken(), "end of stream");
    ts.End();
    if (finalOffset.HasValue)
        Assert.AreEqual(finalOffset, offsetAtt.EndOffset, "finalOffset ");
    ts.Close();
}
public static void AssertTokenStreamContents(TokenStream ts, System.String[] output, int[] startOffsets, int[] endOffsets, System.String[] types, int[] posIncrements, int? finalOffset)
{
    Assert.IsNotNull(output);
    CheckClearAttributesAttribute checkClearAtt = (CheckClearAttributesAttribute)ts.AddAttribute(typeof(CheckClearAttributesAttribute));

    Assert.IsTrue(ts.HasAttribute(typeof(TermAttribute)), "has no TermAttribute");
    TermAttribute termAtt = (TermAttribute)ts.GetAttribute(typeof(TermAttribute));

    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null)
    {
        Assert.IsTrue(ts.HasAttribute(typeof(OffsetAttribute)), "has no OffsetAttribute");
        offsetAtt = (OffsetAttribute)ts.GetAttribute(typeof(OffsetAttribute));
    }

    TypeAttribute typeAtt = null;
    if (types != null)
    {
        Assert.IsTrue(ts.HasAttribute(typeof(TypeAttribute)), "has no TypeAttribute");
        typeAtt = (TypeAttribute)ts.GetAttribute(typeof(TypeAttribute));
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null)
    {
        Assert.IsTrue(ts.HasAttribute(typeof(PositionIncrementAttribute)), "has no PositionIncrementAttribute");
        posIncrAtt = (PositionIncrementAttribute)ts.GetAttribute(typeof(PositionIncrementAttribute));
    }

    ts.Reset();
    for (int i = 0; i < output.Length; i++)
    {
        // extra safety to enforce, that the state is not preserved and also assign bogus values
        ts.ClearAttributes();
        termAtt.SetTermBuffer("bogusTerm");
        if (offsetAtt != null)
        {
            offsetAtt.SetOffset(14584724, 24683243);
        }
        if (typeAtt != null)
        {
            typeAtt.SetType("bogusType");
        }
        if (posIncrAtt != null)
        {
            posIncrAtt.SetPositionIncrement(45987657);
        }

        checkClearAtt.GetAndResetClearCalled(); // reset it, because we called clearAttribute() before
        Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
        Assert.IsTrue(checkClearAtt.GetAndResetClearCalled(), "clearAttributes() was not called correctly in TokenStream chain");

        Assert.AreEqual(output[i], termAtt.Term(), "term " + i);
        if (startOffsets != null)
        {
            Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset(), "startOffset " + i);
        }
        if (endOffsets != null)
        {
            Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset(), "endOffset " + i);
        }
        if (types != null)
        {
            Assert.AreEqual(types[i], typeAtt.Type(), "type " + i);
        }
        if (posIncrements != null)
        {
            Assert.AreEqual(posIncrements[i], posIncrAtt.GetPositionIncrement(), "posIncrement " + i);
        }
    }

    Assert.IsFalse(ts.IncrementToken(), "end of stream");
    ts.End();
    if (finalOffset.HasValue)
    {
        Assert.AreEqual(finalOffset, offsetAtt.EndOffset(), "finalOffset ");
    }
    ts.Close();
}
private void CheckTokens(TokenStream stream)
{
    int count = 0;
    ICharTermAttribute termAtt = stream.GetAttribute<ICharTermAttribute>();
    while (stream.IncrementToken())
    {
        Assert.IsTrue(count < Tokens.Length);
        Assert.AreEqual(Tokens[count], termAtt.ToString());
        count++;
    }
    Assert.AreEqual(Tokens.Length, count);
}
// offsetsAreCorrect also validates:
//   - graph offsets are correct (all tokens leaving from
//     pos X have the same startOffset; all tokens
//     arriving to pos Y have the same endOffset)
//   - offsets only move forwards (startOffset >= lastStartOffset)
public static void AssertTokenStreamContents(TokenStream ts, string[] output, int[] startOffsets, int[] endOffsets, string[] types, int[] posIncrements, int[] posLengths, int? finalOffset, int? finalPosInc, bool[] keywordAtts, bool offsetsAreCorrect)
{
    // LUCENENET: Bug fix: NUnit throws an exception when something fails.
    // This causes Dispose() to be skipped and it pollutes other tests indicating false negatives.
    // Added this try-finally block to fix this.
    try
    {
        Assert.IsNotNull(output);
        var checkClearAtt = ts.AddAttribute<ICheckClearAttributesAttribute>();

        ICharTermAttribute termAtt = null;
        if (output.Length > 0)
        {
            Assert.IsTrue(ts.HasAttribute<ICharTermAttribute>(), "has no CharTermAttribute");
            termAtt = ts.GetAttribute<ICharTermAttribute>();
        }

        IOffsetAttribute offsetAtt = null;
        if (startOffsets != null || endOffsets != null || finalOffset != null)
        {
            Assert.IsTrue(ts.HasAttribute<IOffsetAttribute>(), "has no OffsetAttribute");
            offsetAtt = ts.GetAttribute<IOffsetAttribute>();
        }

        ITypeAttribute typeAtt = null;
        if (types != null)
        {
            Assert.IsTrue(ts.HasAttribute<ITypeAttribute>(), "has no TypeAttribute");
            typeAtt = ts.GetAttribute<ITypeAttribute>();
        }

        IPositionIncrementAttribute posIncrAtt = null;
        if (posIncrements != null || finalPosInc != null)
        {
            Assert.IsTrue(ts.HasAttribute<IPositionIncrementAttribute>(), "has no PositionIncrementAttribute");
            posIncrAtt = ts.GetAttribute<IPositionIncrementAttribute>();
        }

        IPositionLengthAttribute posLengthAtt = null;
        if (posLengths != null)
        {
            Assert.IsTrue(ts.HasAttribute<IPositionLengthAttribute>(), "has no PositionLengthAttribute");
            posLengthAtt = ts.GetAttribute<IPositionLengthAttribute>();
        }

        IKeywordAttribute keywordAtt = null;
        if (keywordAtts != null)
        {
            Assert.IsTrue(ts.HasAttribute<IKeywordAttribute>(), "has no KeywordAttribute");
            keywordAtt = ts.GetAttribute<IKeywordAttribute>();
        }

        // Maps position to the start/end offset:
        IDictionary<int?, int?> posToStartOffset = new Dictionary<int?, int?>();
        IDictionary<int?, int?> posToEndOffset = new Dictionary<int?, int?>();

        ts.Reset();
        int pos = -1;
        int lastStartOffset = 0;
        for (int i = 0; i < output.Length; i++)
        {
            // extra safety to enforce, that the state is not preserved and also assign bogus values
            ts.ClearAttributes();
            termAtt.SetEmpty().Append("bogusTerm");
            if (offsetAtt != null)
            {
                offsetAtt.SetOffset(14584724, 24683243);
            }
            if (typeAtt != null)
            {
                typeAtt.Type = "bogusType";
            }
            if (posIncrAtt != null)
            {
                posIncrAtt.PositionIncrement = 45987657;
            }
            if (posLengthAtt != null)
            {
                posLengthAtt.PositionLength = 45987653;
            }
            if (keywordAtt != null)
            {
                keywordAtt.Keyword = (i & 1) == 0;
            }

            bool reset = checkClearAtt.AndResetClearCalled; // reset it, because we called clearAttribute() before
            Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
            Assert.IsTrue(reset, "ClearAttributes() was not called correctly in TokenStream chain");

            Assert.AreEqual(output[i], termAtt.ToString(), "term " + i + ", output[i] = " + output[i] + ", termAtt = " + termAtt.ToString());
            if (startOffsets != null)
            {
                Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset(), "startOffset " + i);
            }
            if (endOffsets != null)
            {
                Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset(), "endOffset " + i);
            }
            if (types != null)
            {
                Assert.AreEqual(types[i], typeAtt.Type, "type " + i);
            }
            if (posIncrements != null)
            {
                Assert.AreEqual(posIncrements[i], posIncrAtt.PositionIncrement, "posIncrement " + i);
            }
            if (posLengths != null)
            {
                Assert.AreEqual(posLengths[i], posLengthAtt.PositionLength, "posLength " + i);
            }
            if (keywordAtts != null)
            {
                Assert.AreEqual(keywordAtts[i], keywordAtt.Keyword, "keywordAtt " + i);
            }

            // we can enforce some basic things about a few attributes even if the caller doesn't check:
            if (offsetAtt != null)
            {
                int startOffset = offsetAtt.StartOffset();
                int endOffset = offsetAtt.EndOffset();
                if (finalOffset != null)
                {
                    Assert.IsTrue(startOffset <= (int)finalOffset, "startOffset must be <= finalOffset");
                    Assert.IsTrue(endOffset <= (int)finalOffset, "endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + (int)finalOffset);
                }

                if (offsetsAreCorrect)
                {
                    Assert.IsTrue(offsetAtt.StartOffset() >= lastStartOffset, "offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
                    lastStartOffset = offsetAtt.StartOffset();
                }

                if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null)
                {
                    // Validate offset consistency in the graph, ie
                    // all tokens leaving from a certain pos have the
                    // same startOffset, and all tokens arriving to a
                    // certain pos have the same endOffset:
                    int posInc = posIncrAtt.PositionIncrement;
                    pos += posInc;

                    int posLength = posLengthAtt.PositionLength;

                    if (!posToStartOffset.ContainsKey(pos))
                    {
                        // First time we've seen a token leaving from this position:
                        posToStartOffset[pos] = startOffset;
                        //System.out.println("  + s " + pos + " -> " + startOffset);
                    }
                    else
                    {
                        // We've seen a token leaving from this position
                        // before; verify the startOffset is the same:
                        //System.out.println("  + vs " + pos + " -> " + startOffset);
                        Assert.AreEqual((int)posToStartOffset[pos], startOffset, "pos=" + pos + " posLen=" + posLength + " token=" + termAtt);
                    }

                    int endPos = pos + posLength;

                    if (!posToEndOffset.ContainsKey(endPos))
                    {
                        // First time we've seen a token arriving to this position:
                        posToEndOffset[endPos] = endOffset;
                        //System.out.println("  + e " + endPos + " -> " + endOffset);
                    }
                    else
                    {
                        // We've seen a token arriving to this position
                        // before; verify the endOffset is the same:
                        //System.out.println("  + ve " + endPos + " -> " + endOffset);
                        Assert.AreEqual((int)posToEndOffset[endPos], endOffset, "pos=" + pos + " posLen=" + posLength + " token=" + termAtt);
                    }
                }
            }

            if (posIncrAtt != null)
            {
                if (i == 0)
                {
                    Assert.IsTrue(posIncrAtt.PositionIncrement >= 1, "first posIncrement must be >= 1");
                }
                else
                {
                    Assert.IsTrue(posIncrAtt.PositionIncrement >= 0, "posIncrement must be >= 0");
                }
            }
            if (posLengthAtt != null)
            {
                Assert.IsTrue(posLengthAtt.PositionLength >= 1, "posLength must be >= 1");
            }
        }

        if (ts.IncrementToken())
        {
            Assert.Fail("TokenStream has more tokens than expected (expected count=" + output.Length + "); extra token=" + termAtt);
        }

        // repeat our extra safety checks for End()
        ts.ClearAttributes();
        if (termAtt != null)
        {
            termAtt.SetEmpty().Append("bogusTerm");
        }
        if (offsetAtt != null)
        {
            offsetAtt.SetOffset(14584724, 24683243);
        }
        if (typeAtt != null)
        {
            typeAtt.Type = "bogusType";
        }
        if (posIncrAtt != null)
        {
            posIncrAtt.PositionIncrement = 45987657;
        }
        if (posLengthAtt != null)
        {
            posLengthAtt.PositionLength = 45987653;
        }

        var reset_ = checkClearAtt.AndResetClearCalled; // reset it, because we called clearAttribute() before

        ts.End();
        Assert.IsTrue(checkClearAtt.AndResetClearCalled, "super.End()/ClearAttributes() was not called correctly in End()");

        if (finalOffset != null)
        {
            Assert.AreEqual((int)finalOffset, offsetAtt.EndOffset(), "finalOffset");
        }
        if (offsetAtt != null)
        {
            Assert.IsTrue(offsetAtt.EndOffset() >= 0, "finalOffset must be >= 0");
        }
        if (finalPosInc != null)
        {
            Assert.AreEqual((int)finalPosInc, posIncrAtt.PositionIncrement, "finalPosInc");
        }

        ts.Dispose();
    }
    catch (Exception)
    {
        //ts.Reset();
        ts.ClearAttributes();
        ts.End();
        ts.Dispose();
        throw;
    }
}
public string GetView(TokenStream tokenStream, out int numberOfTokens)
{
    var sb = new StringBuilder();
    var termDictionary = new Dictionary<string, int>();
    var termAttr = tokenStream.GetAttribute<ITermAttribute>();

    while (tokenStream.IncrementToken())
    {
        if (termDictionary.ContainsKey(termAttr.Term))
            termDictionary[termAttr.Term] = termDictionary[termAttr.Term] + 1;
        else
            termDictionary.Add(termAttr.Term, 1);
    }

    foreach (var item in termDictionary.OrderBy(x => x.Key))
    {
        sb.Append(item.Key + " [" + item.Value + "] ");
    }

    numberOfTokens = termDictionary.Count;
    return sb.ToString();
}
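// A minimal usage sketch (not from the original source): prints the distinct-term
// view produced by GetView above. Assumes a Lucene.Net 3.0.x StandardAnalyzer;
// the field name "body" and the sample text are arbitrary.
public void GetViewDemo()
{
    var analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
    TokenStream stream = analyzer.TokenStream("body", new System.IO.StringReader("the quick brown fox jumps over the lazy dog"));
    int distinctTerms;
    Console.WriteLine(GetView(stream, out distinctTerms));
    Console.WriteLine("distinct terms: " + distinctTerms);
}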
private void checkTokens(TokenStream stream)
{
    int count = 0;
    TermAttribute termAtt = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));
    Assert.IsNotNull(termAtt);
    while (stream.IncrementToken())
    {
        Assert.IsTrue(count < tokens.Length);
        Assert.AreEqual(tokens[count], termAtt.Term());
        count++;
    }
    Assert.AreEqual(tokens.Length, count);
}
/// <summary>
/// Not an explicit test, just useful to print out some info on performance
/// </summary>
/// <throws>Exception</throws>
public virtual void Performance()
{
    int[] tokCount = new int[] { 100, 500, 1000, 2000, 5000, 10000 };
    int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 };

    for (int k = 0; k < tokCount.Length; k++)
    {
        System.Text.StringBuilder buffer = new System.Text.StringBuilder();
        System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----");
        for (int i = 0; i < tokCount[k]; i++)
        {
            buffer.Append(English.IntToEnglish(i).ToUpper()).Append(' ');
        }

        // make sure we produce the same tokens
        TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))));
        TokenStream sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, 100));
        teeStream.ConsumeAllTokens();

        TokenStream stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))), 100);
        ITermAttribute tfTok = stream.AddAttribute<ITermAttribute>();
        ITermAttribute sinkTok = sink.AddAttribute<ITermAttribute>();
        for (int i = 0; stream.IncrementToken(); i++)
        {
            Assert.IsTrue(sink.IncrementToken());
            Assert.IsTrue(tfTok.Equals(sinkTok) == true, tfTok + " is not equal to " + sinkTok + " at token: " + i);
        }

        // simulate two fields, each being analyzed once, for 20 documents
        for (int j = 0; j < modCounts.Length; j++)
        {
            int tfPos = 0;
            long start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            for (int i = 0; i < 20; i++)
            {
                stream = new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString())));
                IPositionIncrementAttribute posIncrAtt = stream.GetAttribute<IPositionIncrementAttribute>();
                while (stream.IncrementToken())
                {
                    tfPos += posIncrAtt.PositionIncrement;
                }
                stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))), modCounts[j]);
                posIncrAtt = stream.GetAttribute<IPositionIncrementAttribute>();
                while (stream.IncrementToken())
                {
                    tfPos += posIncrAtt.PositionIncrement;
                }
            }
            long finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");

            int sinkPos = 0;
            // simulate one field with one sink
            start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            for (int i = 0; i < 20; i++)
            {
                teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))));
                sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, modCounts[j]));
                IPositionIncrementAttribute posIncrAtt = teeStream.GetAttribute<IPositionIncrementAttribute>();
                while (teeStream.IncrementToken())
                {
                    sinkPos += posIncrAtt.PositionIncrement;
                }
                //System.out.println("Modulo--------");
                posIncrAtt = sink.GetAttribute<IPositionIncrementAttribute>();
                while (sink.IncrementToken())
                {
                    sinkPos += posIncrAtt.PositionIncrement;
                }
            }
            finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
            Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos);
        }
        System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----");
    }
}
void AssertTermEquals(String expected, TokenStream stream, byte[] expectPay)
{
    ITermAttribute termAtt = stream.GetAttribute<ITermAttribute>();
    IPayloadAttribute payloadAtt = stream.GetAttribute<IPayloadAttribute>();
    Assert.True(stream.IncrementToken());
    Assert.AreEqual(expected, termAtt.Term);

    Payload payload = payloadAtt.Payload;
    if (payload != null)
    {
        Assert.True(payload.Length == expectPay.Length, payload.Length + " does not equal: " + expectPay.Length);
        for (int i = 0; i < expectPay.Length; i++)
        {
            Assert.True(expectPay[i] == payload.ByteAt(i), expectPay[i] + " does not equal: " + payload.ByteAt(i));
        }
    }
    else
    {
        Assert.True(expectPay == null, "expectPay is not null and it should be");
    }
}