public virtual void TestCaching()
{
    Directory dir = new RAMDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir);
    Document doc = new Document();
    TokenStream stream = new TokenStreamAnonymousInnerClassHelper(this);
    stream = new CachingTokenFilter(stream);

    doc.Add(new TextField("preanalyzed", stream));

    // 1) we consume all tokens twice before we add the doc to the index
    CheckTokens(stream);
    stream.Reset();
    CheckTokens(stream);

    // 2) now add the document to the index and verify if all tokens are indexed
    //    don't reset the stream here, the DocumentWriter should do that implicitly
    writer.AddDocument(doc);

    IndexReader reader = writer.GetReader();
    DocsAndPositionsEnum termPositions = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "preanalyzed", new BytesRef("term1"));
    Assert.IsTrue(termPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(1, termPositions.Freq);
    Assert.AreEqual(0, termPositions.NextPosition());

    termPositions = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "preanalyzed", new BytesRef("term2"));
    Assert.IsTrue(termPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(2, termPositions.Freq);
    Assert.AreEqual(1, termPositions.NextPosition());
    Assert.AreEqual(3, termPositions.NextPosition());

    termPositions = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "preanalyzed", new BytesRef("term3"));
    Assert.IsTrue(termPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(1, termPositions.Freq);
    Assert.AreEqual(2, termPositions.NextPosition());

    reader.Dispose();
    writer.Dispose();

    // 3) reset stream and consume tokens again
    stream.Reset();
    CheckTokens(stream);

    dir.Dispose();
}
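// Hypothetical sketch of the two helpers the test above relies on but which sit
// outside this excerpt (TokenStreamAnonymousInnerClassHelper and CheckTokens).
// The token sequence is inferred from the position assertions above (term1 at
// position 0, term2 at 1 and 3, term3 at 2), i.e. "term1 term2 term3 term2";
// the names SketchTokenStream/CheckTokensSketch are illustrative, not the real ones.
private sealed class SketchTokenStream : TokenStream
{
    private readonly string[] tokens = { "term1", "term2", "term3", "term2" };
    private int index = 0;
    private readonly ICharTermAttribute termAtt;

    public SketchTokenStream()
    {
        termAtt = AddAttribute<ICharTermAttribute>();
    }

    public override bool IncrementToken()
    {
        if (index == tokens.Length)
            return false;
        ClearAttributes();
        termAtt.Append(tokens[index++]);
        return true;
    }
}

// A consumer in the spirit of CheckTokens: verifies the (cached) stream replays
// exactly the expected sequence each time it is consumed.
private void CheckTokensSketch(TokenStream stream)
{
    string[] expected = { "term1", "term2", "term3", "term2" };
    ICharTermAttribute termAtt = stream.GetAttribute<ICharTermAttribute>();
    int count = 0;
    while (stream.IncrementToken())
    {
        Assert.IsTrue(count < expected.Length);
        Assert.AreEqual(expected[count], termAtt.ToString());
        count++;
    }
    Assert.AreEqual(expected.Length, count);
}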
public virtual void TestMultipleSources()
{
    TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())));
    TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.NewSinkTokenStream(dogFilter);
    TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.NewSinkTokenStream(theFilter);
    TokenStream source1 = new CachingTokenFilter(tee1);

    tee1.AddAttribute<ICheckClearAttributesAttribute>();
    dogDetector.AddAttribute<ICheckClearAttributesAttribute>();
    theDetector.AddAttribute<ICheckClearAttributesAttribute>();

    TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer2.ToString())));
    tee2.AddSinkTokenStream(dogDetector);
    tee2.AddSinkTokenStream(theDetector);
    TokenStream source2 = tee2;

    AssertTokenStreamContents(source1, tokens1);
    AssertTokenStreamContents(source2, tokens2);

    AssertTokenStreamContents(theDetector, new String[] { "The", "the", "The", "the" });

    source1.Reset();
    TokenStream lowerCasing = new LowerCaseFilter(source1);
    String[] lowerCaseTokens = new String[tokens1.Length];
    for (int i = 0; i < tokens1.Length; i++)
    {
        lowerCaseTokens[i] = tokens1[i].ToLower();
    }
    AssertTokenStreamContents(lowerCasing, lowerCaseTokens);
}
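// Hypothetical sketch of the theFilter/dogFilter sink filters referenced above
// (their definitions sit outside this excerpt). A SinkFilter inspects the tee's
// current attribute state and decides whether the sink keeps the token; the
// class names and the 3.x-era ITermAttribute usage here are assumptions.
internal sealed class TheFilter : TeeSinkTokenFilter.SinkFilter
{
    public override bool Accept(AttributeSource source)
    {
        // keep only tokens whose term text is "The"/"the"
        ITermAttribute termAtt = source.GetAttribute<ITermAttribute>();
        return termAtt.Term.Equals("The", StringComparison.OrdinalIgnoreCase);
    }
}

internal sealed class DogFilter : TeeSinkTokenFilter.SinkFilter
{
    public override bool Accept(AttributeSource source)
    {
        // keep only tokens whose term text is "Dogs"/"dogs"
        ITermAttribute termAtt = source.GetAttribute<ITermAttribute>();
        return termAtt.Term.Equals("Dogs", StringComparison.OrdinalIgnoreCase);
    }
}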
public virtual void TestCaching()
{
    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    TokenStream stream = new AnonymousClassTokenStream(this);
    stream = new CachingTokenFilter(stream);

    doc.Add(new Field("preanalyzed", stream, TermVector.NO));

    // 1) we consume all tokens twice before we add the doc to the index
    checkTokens(stream);
    stream.Reset();
    checkTokens(stream);

    // 2) now add the document to the index and verify if all tokens are indexed
    //    don't reset the stream here, the DocumentWriter should do that implicitly
    writer.AddDocument(doc);
    writer.Close();

    IndexReader reader = IndexReader.Open(dir);
    TermPositions termPositions = reader.TermPositions(new Term("preanalyzed", "term1"));
    Assert.IsTrue(termPositions.Next());
    Assert.AreEqual(1, termPositions.Freq());
    Assert.AreEqual(0, termPositions.NextPosition());

    termPositions.Seek(new Term("preanalyzed", "term2"));
    Assert.IsTrue(termPositions.Next());
    Assert.AreEqual(2, termPositions.Freq());
    Assert.AreEqual(1, termPositions.NextPosition());
    Assert.AreEqual(3, termPositions.NextPosition());

    termPositions.Seek(new Term("preanalyzed", "term3"));
    Assert.IsTrue(termPositions.Next());
    Assert.AreEqual(1, termPositions.Freq());
    Assert.AreEqual(2, termPositions.NextPosition());

    reader.Close();

    // 3) reset stream and consume tokens again
    stream.Reset();
    checkTokens(stream);
}
public virtual void TestCaching()
{
    Directory dir = new RAMDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, Similarity, TimeZone);
    Document doc = new Document();
    TokenStream stream = new TokenStreamAnonymousInnerClassHelper(this);
    stream = new CachingTokenFilter(stream);

    doc.Add(new TextField("preanalyzed", stream));

    // 1) we consume all tokens twice before we add the doc to the index
    CheckTokens(stream);
    stream.Reset();
    CheckTokens(stream);

    // 2) now add the document to the index and verify if all tokens are indexed
    //    don't reset the stream here, the DocumentWriter should do that implicitly
    writer.AddDocument(doc);

    IndexReader reader = writer.Reader;
    DocsAndPositionsEnum termPositions = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "preanalyzed", new BytesRef("term1"));
    Assert.IsTrue(termPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(1, termPositions.Freq());
    Assert.AreEqual(0, termPositions.NextPosition());

    termPositions = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "preanalyzed", new BytesRef("term2"));
    Assert.IsTrue(termPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(2, termPositions.Freq());
    Assert.AreEqual(1, termPositions.NextPosition());
    Assert.AreEqual(3, termPositions.NextPosition());

    termPositions = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "preanalyzed", new BytesRef("term3"));
    Assert.IsTrue(termPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(1, termPositions.Freq());
    Assert.AreEqual(2, termPositions.NextPosition());

    reader.Dispose();
    writer.Dispose();

    // 3) reset stream and consume tokens again
    stream.Reset();
    CheckTokens(stream);

    dir.Dispose();
}
public virtual void TestMultipleSources()
{
    SinkTokenizer theDetector = new AnonymousClassSinkTokenizer1(this, null);
    SinkTokenizer dogDetector = new AnonymousClassSinkTokenizer2(this, null);

    TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())), theDetector), dogDetector));
    TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer2.ToString())), theDetector), dogDetector);

    int i = 0;
    Token reusableToken = new Token();
    for (Token nextToken = source1.Next(reusableToken); nextToken != null; nextToken = source1.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().Equals(tokens1[i]), nextToken.Term() + " is not equal to " + tokens1[i]);
        i++;
    }
    Assert.IsTrue(i == tokens1.Length, i + " does not equal: " + tokens1.Length);
    Assert.IsTrue(theDetector.GetTokens().Count == 2, "theDetector Size: " + theDetector.GetTokens().Count + " is not: " + 2);
    Assert.IsTrue(dogDetector.GetTokens().Count == 1, "dogDetector Size: " + dogDetector.GetTokens().Count + " is not: " + 1);

    i = 0;
    for (Token nextToken = source2.Next(reusableToken); nextToken != null; nextToken = source2.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().Equals(tokens2[i]), nextToken.Term() + " is not equal to " + tokens2[i]);
        i++;
    }
    Assert.IsTrue(i == tokens2.Length, i + " does not equal: " + tokens2.Length);
    Assert.IsTrue(theDetector.GetTokens().Count == 4, "theDetector Size: " + theDetector.GetTokens().Count + " is not: " + 4);
    Assert.IsTrue(dogDetector.GetTokens().Count == 2, "dogDetector Size: " + dogDetector.GetTokens().Count + " is not: " + 2);

    i = 0;
    for (Token nextToken = theDetector.Next(reusableToken); nextToken != null; nextToken = theDetector.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().ToUpper().Equals("The".ToUpper()), nextToken.Term() + " is not equal to " + "The");
        i++;
    }
    Assert.IsTrue(i == theDetector.GetTokens().Count, i + " does not equal: " + theDetector.GetTokens().Count);

    i = 0;
    for (Token nextToken = dogDetector.Next(reusableToken); nextToken != null; nextToken = dogDetector.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().ToUpper().Equals("Dogs".ToUpper()), nextToken.Term() + " is not equal to " + "Dogs");
        i++;
    }
    Assert.IsTrue(i == dogDetector.GetTokens().Count, i + " does not equal: " + dogDetector.GetTokens().Count);

    source1.Reset();
    TokenStream lowerCasing = new LowerCaseFilter(source1);
    i = 0;
    for (Token nextToken = lowerCasing.Next(reusableToken); nextToken != null; nextToken = lowerCasing.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().Equals(tokens1[i].ToLower()), nextToken.Term() + " is not equal to " + tokens1[i].ToLower());
        i++;
    }
    Assert.IsTrue(i == tokens1.Length, i + " does not equal: " + tokens1.Length);
}
// This is a simplified query builder which works for single Terms and single Phrases.
// Returns an empty BooleanQuery (no tokens), a TermQuery (one token), or a PhraseQuery.
public static Lucene.Net.Search.Query GetFieldQuery(Analyzer analyzer, string field, string queryText)
{
    TokenStream stream = analyzer.TokenStream(field, new StringReader(queryText));
    TokenFilter filter = new CachingTokenFilter(stream);
    filter.Reset();

    // This attribute way of getting token properties isn't very good, but it's the non-obsolete one.
    var attr1 = filter.GetAttribute<ITermAttribute>();
    Func<string> getText = () => attr1 != null ? attr1.Term : null;

    Func<int> getPositionIncrement;
    if (filter.HasAttribute<IPositionIncrementAttribute>())
    {
        var attr = filter.GetAttribute<IPositionIncrementAttribute>();
        getPositionIncrement = () => attr.PositionIncrement;
    }
    else
    {
        getPositionIncrement = () => 1;
    }

    // 0 tokens
    if (!filter.IncrementToken())
    {
        return new BooleanQuery();
    }

    // 1 token?
    string token1 = getText();
    int position = 0;
    if (!filter.IncrementToken())
    {
        return new TermQuery(new Term(field, token1));
    }

    // many tokens - handle first token
    PhraseQuery ret = new PhraseQuery();
    ret.Add(new Term(field, token1));

    do
    {
        // handle rest of tokens
        string tokenNext = getText();
        position += getPositionIncrement();
        ret.Add(new Term(field, tokenNext), position);
    } while (filter.IncrementToken());

    return ret;
}
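// Example usage of the builder above (a sketch; the StandardAnalyzer choice and
// version enum are illustrative assumptions, not part of the original snippet):
Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);

// One token in, TermQuery out:
Lucene.Net.Search.Query single = GetFieldQuery(analyzer, "body", "hello");

// Several tokens in, PhraseQuery with positions out:
Lucene.Net.Search.Query phrase = GetFieldQuery(analyzer, "body", "hello world");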
protected override IQueryNode PostProcessNode(IQueryNode node)
{
    if (node is ITextableQueryNode
        && !(node is WildcardQueryNode)
        && !(node is FuzzyQueryNode)
        && !(node is RegexpQueryNode)
        && !(node.Parent is IRangeQueryNode))
    {
        FieldQueryNode fieldNode = ((FieldQueryNode)node);
        string text = fieldNode.GetTextAsString();
        string field = fieldNode.GetFieldAsString();

        CachingTokenFilter buffer = null;
        IPositionIncrementAttribute posIncrAtt = null;
        int numTokens = 0;
        int positionCount = 0;
        bool severalTokensAtSamePosition = false;

        TokenStream source = null;
        try
        {
            source = this.analyzer.TokenStream(field, text);
            source.Reset();
            buffer = new CachingTokenFilter(source);

            if (buffer.HasAttribute<IPositionIncrementAttribute>())
            {
                posIncrAtt = buffer.GetAttribute<IPositionIncrementAttribute>();
            }

            try
            {
                while (buffer.IncrementToken())
                {
                    numTokens++;
                    int positionIncrement = (posIncrAtt != null) ? posIncrAtt.PositionIncrement : 1;
                    if (positionIncrement != 0)
                    {
                        positionCount += positionIncrement;
                    }
                    else
                    {
                        severalTokensAtSamePosition = true;
                    }
                }
            }
            catch (IOException)
            {
                // ignore
            }
        }
        catch (IOException e)
        {
            throw new Exception(e.Message, e);
        }
        finally
        {
            IOUtils.CloseWhileHandlingException(source);
        }

        // rewind the buffer stream
        buffer.Reset();

        if (!buffer.HasAttribute<ICharTermAttribute>())
        {
            return new NoTokenFoundQueryNode();
        }
        ICharTermAttribute termAtt = buffer.GetAttribute<ICharTermAttribute>();

        if (numTokens == 0)
        {
            return new NoTokenFoundQueryNode();
        }
        else if (numTokens == 1)
        {
            string term = null;
            try
            {
                bool hasNext = buffer.IncrementToken();
                Debug.Assert(hasNext);
                term = termAtt.ToString();
            }
            catch (IOException)
            {
                // safe to ignore, because we know the number of tokens
            }
            fieldNode.Text = term.ToCharSequence();
            return fieldNode;
        }
        else if (severalTokensAtSamePosition || !(node is QuotedFieldQueryNode))
        {
            if (positionCount == 1 || !(node is QuotedFieldQueryNode))
            {
                // no phrase query:
                if (positionCount == 1)
                {
                    // simple case: only one position, with synonyms
                    List<IQueryNode> children = new List<IQueryNode>();
                    for (int i = 0; i < numTokens; i++)
                    {
                        string term = null;
                        try
                        {
                            bool hasNext = buffer.IncrementToken();
                            Debug.Assert(hasNext);
                            term = termAtt.ToString();
                        }
                        catch (IOException)
                        {
                            // safe to ignore, because we know the number of tokens
                        }
                        children.Add(new FieldQueryNode(field, term, -1, -1));
                    }
                    return new GroupQueryNode(new StandardBooleanQueryNode(children, positionCount == 1));
                }
                else
                {
                    // multiple positions
                    IQueryNode q = new StandardBooleanQueryNode(new List<IQueryNode>(), false);
                    IQueryNode currentQuery = null;
                    for (int i = 0; i < numTokens; i++)
                    {
                        string term = null;
                        try
                        {
                            bool hasNext = buffer.IncrementToken();
                            Debug.Assert(hasNext);
                            term = termAtt.ToString();
                        }
                        catch (IOException)
                        {
                            // safe to ignore, because we know the number of tokens
                        }
                        if (posIncrAtt != null && posIncrAtt.PositionIncrement == 0)
                        {
                            if (!(currentQuery is BooleanQueryNode))
                            {
                                IQueryNode t = currentQuery;
                                currentQuery = new StandardBooleanQueryNode(new List<IQueryNode>(), true);
                                ((BooleanQueryNode)currentQuery).Add(t);
                            }
                            ((BooleanQueryNode)currentQuery).Add(new FieldQueryNode(field, term, -1, -1));
                        }
                        else
                        {
                            if (currentQuery != null)
                            {
                                if (this.defaultOperator == Operator.OR)
                                {
                                    q.Add(currentQuery);
                                }
                                else
                                {
                                    q.Add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                                }
                            }
                            currentQuery = new FieldQueryNode(field, term, -1, -1);
                        }
                    }
                    if (this.defaultOperator == Operator.OR)
                    {
                        q.Add(currentQuery);
                    }
                    else
                    {
                        q.Add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                    }

                    if (q is BooleanQueryNode)
                    {
                        q = new GroupQueryNode(q);
                    }
                    return q;
                }
            }
            else
            {
                // phrase query:
                MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();
                List<FieldQueryNode> multiTerms = new List<FieldQueryNode>();
                int position = -1;
                int i = 0;
                int termGroupCount = 0;
                for (; i < numTokens; i++)
                {
                    string term = null;
                    int positionIncrement = 1;
                    try
                    {
                        bool hasNext = buffer.IncrementToken();
                        Debug.Assert(hasNext);
                        term = termAtt.ToString();
                        if (posIncrAtt != null)
                        {
                            positionIncrement = posIncrAtt.PositionIncrement;
                        }
                    }
                    catch (IOException)
                    {
                        // safe to ignore, because we know the number of tokens
                    }

                    if (positionIncrement > 0 && multiTerms.Count > 0)
                    {
                        foreach (FieldQueryNode termNode in multiTerms)
                        {
                            if (this.positionIncrementsEnabled)
                            {
                                termNode.PositionIncrement = position;
                            }
                            else
                            {
                                termNode.PositionIncrement = termGroupCount;
                            }
                            mpq.Add(termNode);
                        }

                        // Only increment once for each "group" of
                        // terms that were in the same position:
                        termGroupCount++;
                        multiTerms.Clear();
                    }

                    position += positionIncrement;
                    multiTerms.Add(new FieldQueryNode(field, term, -1, -1));
                }

                foreach (FieldQueryNode termNode in multiTerms)
                {
                    if (this.positionIncrementsEnabled)
                    {
                        termNode.PositionIncrement = position;
                    }
                    else
                    {
                        termNode.PositionIncrement = termGroupCount;
                    }
                    mpq.Add(termNode);
                }

                return mpq;
            }
        }
        else
        {
            TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();
            int position = -1;
            for (int i = 0; i < numTokens; i++)
            {
                string term = null;
                int positionIncrement = 1;
                try
                {
                    bool hasNext = buffer.IncrementToken();
                    Debug.Assert(hasNext);
                    term = termAtt.ToString();
                    if (posIncrAtt != null)
                    {
                        positionIncrement = posIncrAtt.PositionIncrement;
                    }
                }
                catch (IOException)
                {
                    // safe to ignore, because we know the number of tokens
                }

                FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);
                if (this.positionIncrementsEnabled)
                {
                    position += positionIncrement;
                    newFieldNode.PositionIncrement = position;
                }
                else
                {
                    newFieldNode.PositionIncrement = i;
                }
                pq.Add(newFieldNode);
            }
            return pq;
        }
    }
    return node;
}
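// For context, a sketch of how this processor is normally reached: it runs as
// part of StandardQueryParser's processor pipeline, so parsing any field text
// exercises PostProcessNode above. The analyzer choice and version enum here
// are illustrative assumptions:
var parser = new StandardQueryParser(new StandardAnalyzer(LuceneVersion.LUCENE_48));
Query query = parser.Parse("hello world", "body");   // text is analyzed via the pipeline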
/// <exception cref="ParseException">throw in overridden method to disallow /// </exception> protected internal virtual Query GetFieldQuery(String field, String queryText) { // Use the analyzer to get all the tokens, and then build a TermQuery, // PhraseQuery, or nothing based on the term count TokenStream source; try { source = analyzer.ReusableTokenStream(field, new StringReader(queryText)); source.Reset(); } catch (IOException) { source = analyzer.TokenStream(field, new StringReader(queryText)); } CachingTokenFilter buffer = new CachingTokenFilter(source); ITermAttribute termAtt = null; IPositionIncrementAttribute posIncrAtt = null; int numTokens = 0; bool success = false; try { buffer.Reset(); success = true; } catch (IOException) { // success==false if we hit an exception } if (success) { if (buffer.HasAttribute<ITermAttribute>()) { termAtt = buffer.GetAttribute<ITermAttribute>(); } if (buffer.HasAttribute<IPositionIncrementAttribute>()) { posIncrAtt = buffer.GetAttribute<IPositionIncrementAttribute>(); } } int positionCount = 0; bool severalTokensAtSamePosition = false; bool hasMoreTokens = false; if (termAtt != null) { try { hasMoreTokens = buffer.IncrementToken(); while (hasMoreTokens) { numTokens++; int positionIncrement = (posIncrAtt != null) ? posIncrAtt.PositionIncrement : 1; if (positionIncrement != 0) { positionCount += positionIncrement; } else { severalTokensAtSamePosition = true; } hasMoreTokens = buffer.IncrementToken(); } } catch (IOException) { // ignore } } try { // rewind the buffer stream buffer.Reset(); // close original stream - all tokens buffered source.Close(); } catch (IOException) { // ignore } if (numTokens == 0) return null; else if (numTokens == 1) { String term = null; try { bool hasNext = buffer.IncrementToken(); Debug.Assert(hasNext); term = termAtt.Term; } catch (IOException) { // safe to ignore, because we know the number of tokens } return NewTermQuery(new Term(field, term)); } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { // no phrase query: BooleanQuery q = NewBooleanQuery(true); for (int i = 0; i < numTokens; i++) { String term = null; try { bool hasNext = buffer.IncrementToken(); Debug.Assert(hasNext); term = termAtt.Term; } catch (IOException) { // safe to ignore, because we know the number of tokens } Query currentQuery = NewTermQuery( new Term(field, term)); q.Add(currentQuery, Occur.SHOULD); } return q; } else { // phrase query: MultiPhraseQuery mpq = NewMultiPhraseQuery(); mpq.Slop = phraseSlop; List<Term> multiTerms = new List<Term>(); int position = -1; for (int i = 0; i < numTokens; i++) { String term = null; int positionIncrement = 1; try { bool hasNext = buffer.IncrementToken(); Debug.Assert(hasNext == true); term = termAtt.Term; if (posIncrAtt != null) { positionIncrement = posIncrAtt.PositionIncrement; } } catch (IOException) { // safe to ignore, because we know the number of tokens } if (positionIncrement > 0 && multiTerms.Count > 0) { if (enablePositionIncrements) { mpq.Add(multiTerms.ToArray(), position); } else { mpq.Add(multiTerms.ToArray()); } multiTerms.Clear(); } position += positionIncrement; multiTerms.Add(new Term(field, term)); } if (enablePositionIncrements) { mpq.Add(multiTerms.ToArray(), position); } else { mpq.Add(multiTerms.ToArray()); } return mpq; } } else { PhraseQuery pq = NewPhraseQuery(); pq.Slop = phraseSlop; int position = -1; for (int i = 0; i < numTokens; i++) { String term = null; int positionIncrement = 1; try { bool hasNext = buffer.IncrementToken(); Debug.Assert(hasNext == true); 
term = termAtt.Term; if (posIncrAtt != null) { positionIncrement = posIncrAtt.PositionIncrement; } } catch (IOException) { // safe to ignore, because we know the number of tokens } if (enablePositionIncrements) { position += positionIncrement; pq.Add(new Term(field, term), position); } else { pq.Add(new Term(field, term)); } } return pq; } } }
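// The reason for CachingTokenFilter in GetFieldQuery above: the token stream
// must be walked twice (first to count tokens and positions, then to build the
// query), and a plain TokenStream cannot be rewound. A minimal sketch of that
// two-pass pattern, using only calls that appear in this section; the field
// name and text are illustrative:
TokenStream ts = analyzer.TokenStream("f", new StringReader("quick brown fox"));
CachingTokenFilter cache = new CachingTokenFilter(ts);
ITermAttribute termAttr = cache.GetAttribute<ITermAttribute>();

int n = 0;
while (cache.IncrementToken())   // first pass consumes the source and fills the cache
    n++;

cache.Reset();                   // rewinds to replay from the cache, not the source

while (cache.IncrementToken())   // second pass sees the same tokens again
    System.Console.WriteLine(termAttr.Term);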
public void TestTokenStream()
{
    ShingleMatrixFilter.DefaultSettingsCodec = null; //new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();

    // test a plain old token stream with synonyms translated to rows.
    var tokens = new LinkedList<Token>();
    tokens.AddLast(TokenFactory("hello", 1, 0, 4));
    tokens.AddLast(TokenFactory("greetings", 0, 0, 4));
    tokens.AddLast(TokenFactory("world", 1, 5, 10));
    tokens.AddLast(TokenFactory("earth", 0, 5, 10));
    tokens.AddLast(TokenFactory("tellus", 0, 5, 10));

    TokenStream tls = new TokenListStream(tokens);

    // bi-grams
    TokenStream ts = new ShingleMatrixFilter(tls, 2, 2, '_', false, new TwoDimensionalNonWeightedSynonymTokenSettingsCodec());

    AssertNext(ts, "hello_world");
    AssertNext(ts, "greetings_world");
    AssertNext(ts, "hello_earth");
    AssertNext(ts, "greetings_earth");
    AssertNext(ts, "hello_tellus");
    AssertNext(ts, "greetings_tellus");
    Assert.IsFalse(ts.IncrementToken());

    // bi-grams with no spacer character, start offset, end offset
    tls.Reset();
    ts = new ShingleMatrixFilter(tls, 2, 2, null, false, new TwoDimensionalNonWeightedSynonymTokenSettingsCodec());

    AssertNext(ts, "helloworld", 0, 10);
    AssertNext(ts, "greetingsworld", 0, 10);
    AssertNext(ts, "helloearth", 0, 10);
    AssertNext(ts, "greetingsearth", 0, 10);
    AssertNext(ts, "hellotellus", 0, 10);
    AssertNext(ts, "greetingstellus", 0, 10);
    Assert.IsFalse(ts.IncrementToken());

    // add ^_prefix_and_suffix_$
    //
    // using 3d codec as it supports weights
    ShingleMatrixFilter.DefaultSettingsCodec = new SimpleThreeDimensionalTokenSettingsCodec();

    tokens = new LinkedList<Token>();
    tokens.AddLast(TokenFactory("hello", 1, 1f, 0, 4, TokenPositioner.NewColumn));
    tokens.AddLast(TokenFactory("greetings", 0, 1f, 0, 4, TokenPositioner.NewRow));
    tokens.AddLast(TokenFactory("world", 1, 1f, 5, 10, TokenPositioner.NewColumn));
    tokens.AddLast(TokenFactory("earth", 0, 1f, 5, 10, TokenPositioner.NewRow));
    tokens.AddLast(TokenFactory("tellus", 0, 1f, 5, 10, TokenPositioner.NewRow));

    tls = new TokenListStream(tokens);

    // bi-grams, position increment, weight, start offset, end offset
    ts = new PrefixAndSuffixAwareTokenFilter(
        new SingleTokenTokenStream(TokenFactory("^", 1, 100f, 0, 0)),
        tls,
        new SingleTokenTokenStream(TokenFactory("$", 1, 50f, 0, 0)));

    tls = new CachingTokenFilter(ts);

    ts = new ShingleMatrixFilter(tls, 2, 2, '_', false);

    //for (Token token = ts.Next(new Token()); token != null; token = ts.Next(token))
    //{
    //    Console.Out.WriteLine("AssertNext(ts, \"" + token.Term() + "\", " + token.GetPositionIncrement() + ", " + (token.GetPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.GetPayload().GetData()).ToString()) + "f, " + token.StartOffset() + ", " + token.EndOffset() + ");");
    //    token.Clear();
    //}

    AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
    AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
    AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
    AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
    AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
    Assert.IsFalse(ts.IncrementToken());

    // test unlimited size and allow single boundary token as shingle
    tls.Reset();
    ts = new ShingleMatrixFilter(tls, 1, Int32.MaxValue, '_', false);

    //for (Token token = ts.Next(new Token()); token != null; token = ts.Next(token))
    //{
    //    Console.Out.WriteLine("AssertNext(ts, \"" + token.Term() + "\", " + token.GetPositionIncrement() + ", " + (token.GetPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.GetPayload().GetData()).ToString()) + "f, " + token.StartOffset() + ", " + token.EndOffset() + ");");
    //    token.Clear();
    //}

    AssertNext(ts, "^", 1, 10.0f, 0, 0);
    AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
    AssertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "hello", 1, 1.0f, 0, 4);
    AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "world", 1, 1.0f, 5, 10);
    AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
    AssertNext(ts, "$", 1, 7.071068f, 10, 10);
    AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
    AssertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "greetings", 1, 1.0f, 0, 4);
    AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "earth", 1, 1.0f, 5, 10);
    AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
    AssertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "tellus", 1, 1.0f, 5, 10);
    AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
    AssertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
    Assert.IsFalse(ts.IncrementToken());

    // test unlimited size but don't allow single boundary token as shingle
    tls.Reset();
    ts = new ShingleMatrixFilter(tls, 1, Int32.MaxValue, '_', true);

    // for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
    //     System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
    //     token.clear();
    // }

    AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
    AssertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "hello", 1, 1.0f, 0, 4);
    AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "world", 1, 1.0f, 5, 10);
    AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
    AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
    AssertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "greetings", 1, 1.0f, 0, 4);
    AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "earth", 1, 1.0f, 5, 10);
    AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
    AssertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "tellus", 1, 1.0f, 5, 10);
    AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
    AssertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
    Assert.IsFalse(ts.IncrementToken());
    //System.currentTimeMillis();

    // multi-token synonyms
    //
    // Token[][][] {
    //   {{hello}, {greetings, and, salutations},
    //   {{world}, {earth}, {tellus}}
    // }
    //
    tokens = new LinkedList<Token>();
    tokens.AddLast(TokenFactory("hello", 1, 1f, 0, 4, TokenPositioner.NewColumn));
    tokens.AddLast(TokenFactory("greetings", 1, 1f, 0, 4, TokenPositioner.NewRow));
    tokens.AddLast(TokenFactory("and", 1, 1f, 0, 4, TokenPositioner.SameRow));
    tokens.AddLast(TokenFactory("salutations", 1, 1f, 0, 4, TokenPositioner.SameRow));
    tokens.AddLast(TokenFactory("world", 1, 1f, 5, 10, TokenPositioner.NewColumn));
    tokens.AddLast(TokenFactory("earth", 1, 1f, 5, 10, TokenPositioner.NewRow));
    tokens.AddLast(TokenFactory("tellus", 1, 1f, 5, 10, TokenPositioner.NewRow));

    tls = new TokenListStream(tokens);

    // 2-3 grams
    ts = new ShingleMatrixFilter(tls, 2, 3, '_', false);

    // for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
    //     System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
    //     token.clear();
    // }

    // shingle, position increment, weight, start offset, end offset
    AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
    AssertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
    AssertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
    AssertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
    AssertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
    AssertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
    AssertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);
    Assert.IsFalse(ts.IncrementToken());
    //System.currentTimeMillis();
}
public virtual void TestMultipleSources()
{
    TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())));
    TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.NewSinkTokenStream(dogFilter);
    TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.NewSinkTokenStream(theFilter);
    TokenStream source1 = new CachingTokenFilter(tee1);

    tee1.AddAttribute(typeof(CheckClearAttributesAttribute));
    dogDetector.AddAttribute(typeof(CheckClearAttributesAttribute));
    theDetector.AddAttribute(typeof(CheckClearAttributesAttribute));

    TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer2.ToString())));
    tee2.AddSinkTokenStream(dogDetector);
    tee2.AddSinkTokenStream(theDetector);
    TokenStream source2 = tee2;

    AssertTokenStreamContents(source1, tokens1);
    AssertTokenStreamContents(source2, tokens2);

    AssertTokenStreamContents(theDetector, new String[] { "The", "the", "The", "the" });

    source1.Reset();
    TokenStream lowerCasing = new LowerCaseFilter(source1);
    String[] lowerCaseTokens = new String[tokens1.Length];
    for (int i = 0; i < tokens1.Length; i++)
        lowerCaseTokens[i] = tokens1[i].ToLower();
    AssertTokenStreamContents(lowerCasing, lowerCaseTokens);
}