public virtual void TestMultipleSources()
{
    // Tee buffer1's tokens into two sink streams ("Dogs" and "The"/"the"
    // detectors) while caching the primary stream for later replay.
    TeeSinkTokenFilter firstTee = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.ToString()), MockTokenizer.WHITESPACE, false));
    TeeSinkTokenFilter.SinkTokenStream dogSink = firstTee.NewSinkTokenStream(dogFilter);
    TeeSinkTokenFilter.SinkTokenStream theSink = firstTee.NewSinkTokenStream(theFilter);
    firstTee.Reset();
    TokenStream cachedSource = new CachingTokenFilter(firstTee);

    firstTee.AddAttribute<ICheckClearAttributesAttribute>();
    dogSink.AddAttribute<ICheckClearAttributesAttribute>();
    theSink.AddAttribute<ICheckClearAttributesAttribute>();

    // Attach the same sinks to a second tee built from buffer2 so the
    // detectors accumulate tokens from both sources.
    TeeSinkTokenFilter secondTee = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.ToString()), MockTokenizer.WHITESPACE, false));
    secondTee.AddSinkTokenStream(dogSink);
    secondTee.AddSinkTokenStream(theSink);
    TokenStream secondSource = secondTee;

    AssertTokenStreamContents(cachedSource, tokens1);
    AssertTokenStreamContents(secondSource, tokens2);
    AssertTokenStreamContents(theSink, new string[] { "The", "the", "The", "the" });
    AssertTokenStreamContents(dogSink, new string[] { "Dogs", "Dogs" });

    // The cached stream can be rewound and fed through further filters.
    cachedSource.Reset();
    TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, cachedSource);
    string[] expectedLowerCase = new string[tokens1.Length];
    for (int index = 0; index < expectedLowerCase.Length; index++)
    {
        expectedLowerCase[index] = CultureInfo.InvariantCulture.TextInfo.ToLower(tokens1[index]);
    }
    AssertTokenStreamContents(lowerCasing, expectedLowerCase);
}
// This is a simplified query builder which works for single Terms and single Phrases
// Returns null, TermQuery, or PhraseQuery
public static Lucene.Net.Search.Query GetFieldQuery(Analyzer analyzer, string field, string queryText)
{
    TokenStream stream = analyzer.TokenStream(field, new StringReader(queryText));
    TokenFilter filter = new CachingTokenFilter(stream);
    filter.Reset();

    // This attribute way of getting token properties isn't very good, but it's the non-obsolete one.
    var termAttr = filter.GetAttribute<ITermAttribute>();
    IPositionIncrementAttribute posIncrAttr = null;
    if (filter.HasAttribute<IPositionIncrementAttribute>())
    {
        posIncrAttr = filter.GetAttribute<IPositionIncrementAttribute>();
    }
    Func<string> currentTerm = () => termAttr != null ? termAttr.Term : null;
    Func<int> currentIncrement = () => posIncrAttr != null ? posIncrAttr.PositionIncrement : 1;

    // No tokens at all: return an empty BooleanQuery.
    if (!filter.IncrementToken())
    {
        return new BooleanQuery();
    }

    // Exactly one token: a plain TermQuery suffices.
    string firstTerm = currentTerm();
    if (!filter.IncrementToken())
    {
        return new TermQuery(new Term(field, firstTerm));
    }

    // Two or more tokens: build a PhraseQuery, accumulating position
    // increments for every token after the first.
    PhraseQuery phrase = new PhraseQuery();
    phrase.Add(new Term(field, firstTerm));
    int position = 0;
    for (bool hasToken = true; hasToken; hasToken = filter.IncrementToken())
    {
        position += currentIncrement();
        phrase.Add(new Term(field, currentTerm()), position);
    }
    return phrase;
}
public virtual void TestEndOffsetPositionWithCachingTokenFilter()
{
    Directory directory = NewDirectory();
    Analyzer analyzer = new MockAnalyzer(Random);
    IndexWriter writer = new IndexWriter(directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    Exception priorException = null; // LUCENENET: no need to narrow this to IOException
    TokenStream stream = analyzer.GetTokenStream("field", new StringReader("abcd "));
    try
    {
        // TODO: weird to reset before wrapping with CachingTokenFilter... correct?
        stream.Reset();
        TokenStream cachedStream = new CachingTokenFilter(stream);
        FieldType customType = new FieldType(TextField.TYPE_NOT_STORED)
        {
            StoreTermVectors = true,
            StoreTermVectorPositions = true,
            StoreTermVectorOffsets = true
        };
        Field docField = new Field("field", cachedStream, customType);
        // The same field instance is added twice, so the single term shows
        // up at two positions with distinct offsets.
        doc.Add(docField);
        doc.Add(docField);
        writer.AddDocument(doc);
    }
    catch (Exception e) when (e.IsIOException())
    {
        priorException = e;
    }
    finally
    {
        IOUtils.DisposeWhileHandlingException(priorException, stream);
    }
    writer.Dispose();

    IndexReader reader = DirectoryReader.Open(directory);
    TermsEnum termsEnum = reader.GetTermVectors(0).GetTerms("field").GetEnumerator();
    Assert.IsTrue(termsEnum.MoveNext());
    DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);
    Assert.AreEqual(2, termsEnum.TotalTermFreq);
    Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.NextPosition();
    Assert.AreEqual(0, dpEnum.StartOffset);
    Assert.AreEqual(4, dpEnum.EndOffset);
    dpEnum.NextPosition();
    Assert.AreEqual(8, dpEnum.StartOffset);
    Assert.AreEqual(12, dpEnum.EndOffset);
    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
    reader.Dispose();
    directory.Dispose();
}
// Originally Java: 'throws Exception' clauses have no .NET equivalent.
public virtual void testMultipleSources()
{
    // Tee buffer1's tokens into the two detector sinks, caching the
    // primary stream so it can be replayed later.
    TeeSinkTokenFilter firstTee = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.ToString()), MockTokenizer.WHITESPACE, false));
    TeeSinkTokenFilter.SinkTokenStream dogDetector = firstTee.newSinkTokenStream(dogFilter);
    TeeSinkTokenFilter.SinkTokenStream theDetector = firstTee.newSinkTokenStream(theFilter);
    firstTee.reset();
    TokenStream source1 = new CachingTokenFilter(firstTee);

    firstTee.addAttribute(typeof(CheckClearAttributesAttribute));
    dogDetector.addAttribute(typeof(CheckClearAttributesAttribute));
    theDetector.addAttribute(typeof(CheckClearAttributesAttribute));

    // A second tee over buffer2 feeds the same detector sinks.
    TeeSinkTokenFilter secondTee = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.ToString()), MockTokenizer.WHITESPACE, false));
    secondTee.addSinkTokenStream(dogDetector);
    secondTee.addSinkTokenStream(theDetector);
    TokenStream source2 = secondTee;

    assertTokenStreamContents(source1, tokens1);
    assertTokenStreamContents(source2, tokens2);
    assertTokenStreamContents(theDetector, new string[] { "The", "the", "The", "the" });
    assertTokenStreamContents(dogDetector, new string[] { "Dogs", "Dogs" });

    // Replay the cached stream through an additional filter.
    source1.reset();
    TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);
    string[] lowerCaseTokens = new string[tokens1.Length];
    for (int i = 0; i < lowerCaseTokens.Length; i++)
    {
        lowerCaseTokens[i] = tokens1[i].ToLower(Locale.ROOT);
    }
    assertTokenStreamContents(lowerCasing, lowerCaseTokens);
}
/// <summary>
/// Runs the text of an analyzable field node through <c>this.analyzer</c> and
/// replaces the node with query nodes built from the produced tokens:
/// a <see cref="NoTokenFoundQueryNode"/> for zero tokens, the original
/// <see cref="FieldQueryNode"/> (with analyzed text) for one token, boolean/group
/// structures for multi-token non-phrase input, a <c>MultiPhraseQueryNode</c> for
/// quoted input containing stacked (same-position) tokens, or a
/// <c>TokenizedPhraseQueryNode</c> for a plain quoted phrase.
/// Wildcard, fuzzy, regexp and range nodes are passed through untouched.
/// </summary>
/// <param name="node">The node being post-processed.</param>
/// <returns>The (possibly replaced) node.</returns>
protected override IQueryNode PostProcessNode(IQueryNode node)
{
    if (node is ITextableQueryNode
        && !(node is WildcardQueryNode)
        && !(node is FuzzyQueryNode)
        && !(node is RegexpQueryNode)
        && !(node.Parent is IRangeQueryNode))
    {
        FieldQueryNode fieldNode = ((FieldQueryNode)node);
        string text = fieldNode.GetTextAsString();
        string field = fieldNode.GetFieldAsString();

        CachingTokenFilter buffer = null;
        IPositionIncrementAttribute posIncrAtt = null;
        int numTokens = 0;
        int positionCount = 0;
        bool severalTokensAtSamePosition = false;
        TokenStream source = null;
        try
        {
            source = this.analyzer.GetTokenStream(field, text);
            source.Reset();
            buffer = new CachingTokenFilter(source);

            if (buffer.HasAttribute<IPositionIncrementAttribute>())
            {
                posIncrAtt = buffer.GetAttribute<IPositionIncrementAttribute>();
            }

            // First pass: count tokens and positions; a zero increment means
            // the token is stacked on the previous one (e.g. a synonym).
            try
            {
                while (buffer.IncrementToken())
                {
                    numTokens++;
                    int positionIncrement = (posIncrAtt != null) ? posIncrAtt.PositionIncrement : 1;
                    if (positionIncrement != 0)
                    {
                        positionCount += positionIncrement;
                    }
                    else
                    {
                        severalTokensAtSamePosition = true;
                    }
                }
            }
            catch (IOException) // LUCENENET: exception variable removed (was suppressed with pragma 168)
            {
                // ignore
            }
        }
        catch (IOException e)
        {
            throw new Exception(e.ToString(), e);
        }
        finally
        {
            IOUtils.DisposeWhileHandlingException(source);
        }

        // rewind the buffer stream so the cached tokens can be consumed again
        buffer.Reset();

        if (!buffer.HasAttribute<ICharTermAttribute>())
        {
            return new NoTokenFoundQueryNode();
        }
        ICharTermAttribute termAtt = buffer.GetAttribute<ICharTermAttribute>();

        if (numTokens == 0)
        {
            return new NoTokenFoundQueryNode();
        }
        else if (numTokens == 1)
        {
            // Single token: keep the original node but swap in the analyzed text.
            string term = null;
            try
            {
                bool hasNext = buffer.IncrementToken();
                if (Debugging.AssertsEnabled) Debugging.Assert(hasNext == true);
                term = termAtt.ToString();
            }
            catch (IOException)
            {
                // safe to ignore, because we know the number of tokens
            }
            fieldNode.Text = term.AsCharSequence();
            return fieldNode;
        }
        else if (severalTokensAtSamePosition || !(node is QuotedFieldQueryNode))
        {
            if (positionCount == 1 || !(node is QuotedFieldQueryNode))
            {
                // no phrase query:
                if (positionCount == 1)
                {
                    // simple case: only one position, with synonyms
                    List<IQueryNode> children = new List<IQueryNode>();
                    for (int i = 0; i < numTokens; i++)
                    {
                        string term = null;
                        try
                        {
                            bool hasNext = buffer.IncrementToken();
                            if (Debugging.AssertsEnabled) Debugging.Assert(hasNext == true);
                            term = termAtt.ToString();
                        }
                        catch (IOException)
                        {
                            // safe to ignore, because we know the number of tokens
                        }
                        children.Add(new FieldQueryNode(field, term, -1, -1));
                    }
                    return new GroupQueryNode(
                        new StandardBooleanQueryNode(children, positionCount == 1));
                }
                else
                {
                    // multiple positions: build a boolean of terms, grouping
                    // zero-increment (stacked) tokens under a nested boolean.
                    IQueryNode q = new StandardBooleanQueryNode(Collections.EmptyList<IQueryNode>(), false);
                    IQueryNode currentQuery = null;
                    for (int i = 0; i < numTokens; i++)
                    {
                        string term = null;
                        try
                        {
                            bool hasNext = buffer.IncrementToken();
                            if (Debugging.AssertsEnabled) Debugging.Assert(hasNext == true);
                            term = termAtt.ToString();
                        }
                        catch (IOException)
                        {
                            // safe to ignore, because we know the number of tokens
                        }
                        if (posIncrAtt != null && posIncrAtt.PositionIncrement == 0)
                        {
                            // Stacked token: fold the previous node into a nested boolean.
                            if (!(currentQuery is BooleanQueryNode))
                            {
                                IQueryNode t = currentQuery;
                                currentQuery = new StandardBooleanQueryNode(Collections.EmptyList<IQueryNode>(), true);
                                ((BooleanQueryNode)currentQuery).Add(t);
                            }
                            ((BooleanQueryNode)currentQuery).Add(new FieldQueryNode(field, term, -1, -1));
                        }
                        else
                        {
                            // New position: flush the previous node into the top-level boolean.
                            if (currentQuery != null)
                            {
                                if (this.defaultOperator == Operator.OR)
                                {
                                    q.Add(currentQuery);
                                }
                                else
                                {
                                    q.Add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                                }
                            }
                            currentQuery = new FieldQueryNode(field, term, -1, -1);
                        }
                    }
                    // Flush the final pending node.
                    if (this.defaultOperator == Operator.OR)
                    {
                        q.Add(currentQuery);
                    }
                    else
                    {
                        q.Add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                    }

                    if (q is BooleanQueryNode)
                    {
                        q = new GroupQueryNode(q);
                    }
                    return q;
                }
            }
            else
            {
                // phrase query with stacked tokens: a multi-phrase node where
                // each "column" holds all terms sharing a position.
                MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();
                List<FieldQueryNode> multiTerms = new List<FieldQueryNode>();
                int position = -1;
                int i = 0;
                int termGroupCount = 0;
                for (; i < numTokens; i++)
                {
                    string term = null;
                    int positionIncrement = 1;
                    try
                    {
                        bool hasNext = buffer.IncrementToken();
                        if (Debugging.AssertsEnabled) Debugging.Assert(hasNext == true);
                        term = termAtt.ToString();
                        if (posIncrAtt != null)
                        {
                            positionIncrement = posIncrAtt.PositionIncrement;
                        }
                    }
                    catch (IOException)
                    {
                        // safe to ignore, because we know the number of tokens
                    }

                    if (positionIncrement > 0 && multiTerms.Count > 0)
                    {
                        foreach (FieldQueryNode termNode in multiTerms)
                        {
                            if (this.positionIncrementsEnabled)
                            {
                                termNode.PositionIncrement = position;
                            }
                            else
                            {
                                termNode.PositionIncrement = termGroupCount;
                            }
                            mpq.Add(termNode);
                        }
                        // Only increment once for each "group" of
                        // terms that were in the same position:
                        termGroupCount++;
                        multiTerms.Clear();
                    }
                    position += positionIncrement;
                    multiTerms.Add(new FieldQueryNode(field, term, -1, -1));
                }
                // Flush the final group of same-position terms.
                foreach (FieldQueryNode termNode in multiTerms)
                {
                    if (this.positionIncrementsEnabled)
                    {
                        termNode.PositionIncrement = position;
                    }
                    else
                    {
                        termNode.PositionIncrement = termGroupCount;
                    }
                    mpq.Add(termNode);
                }
                return mpq;
            }
        }
        else
        {
            // Plain quoted phrase with no stacked tokens.
            TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();
            int position = -1;
            for (int i = 0; i < numTokens; i++)
            {
                string term = null;
                int positionIncrement = 1;
                try
                {
                    bool hasNext = buffer.IncrementToken();
                    if (Debugging.AssertsEnabled) Debugging.Assert(hasNext == true);
                    term = termAtt.ToString();
                    if (posIncrAtt != null)
                    {
                        positionIncrement = posIncrAtt.PositionIncrement;
                    }
                }
                catch (IOException)
                {
                    // safe to ignore, because we know the number of tokens
                }

                FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);
                if (this.positionIncrementsEnabled)
                {
                    position += positionIncrement;
                    newFieldNode.PositionIncrement = position;
                }
                else
                {
                    newFieldNode.PositionIncrement = i;
                }
                pq.Add(newFieldNode);
            }
            return pq;
        }
    }
    return node;
}
/// <summary>
/// Creates a query from the analysis chain.
/// <para/>
/// Expert: this is more useful for subclasses such as queryparsers.
/// If using this class directly, just use <see cref="CreateBooleanQuery(string, string)"/>
/// and <see cref="CreatePhraseQuery(string, string)"/>.
/// <para/>
/// Two passes are made over the token stream: the first (via the
/// <see cref="CachingTokenFilter"/>) counts tokens and positions, the second
/// replays the cached tokens to build the appropriate query shape
/// (term, boolean, phrase, or multi-phrase).
/// </summary>
/// <param name="analyzer"> Analyzer used for this query. </param>
/// <param name="operator"> Default boolean operator used for this query. </param>
/// <param name="field"> Field to create queries against. </param>
/// <param name="queryText"> Text to be passed to the analysis chain. </param>
/// <param name="quoted"> <c>true</c> if phrases should be generated when terms occur at more than one position. </param>
/// <param name="phraseSlop"> Slop factor for phrase/multiphrase queries. </param>
protected Query CreateFieldQuery(Analyzer analyzer, Occur @operator, string field, string queryText, bool quoted, int phraseSlop)
{
    Debug.Assert(@operator == Occur.SHOULD || @operator == Occur.MUST);
    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or nothing based on the term count
    CachingTokenFilter buffer = null;
    ITermToBytesRefAttribute termAtt = null;
    IPositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;
    int positionCount = 0;
    bool severalTokensAtSamePosition = false;
    bool hasMoreTokens = false;

    TokenStream source = null;
    try
    {
        source = analyzer.GetTokenStream(field, new StringReader(queryText));
        source.Reset();
        buffer = new CachingTokenFilter(source);
        buffer.Reset();

        if (buffer.HasAttribute<ITermToBytesRefAttribute>())
        {
            termAtt = buffer.GetAttribute<ITermToBytesRefAttribute>();
        }
        if (buffer.HasAttribute<IPositionIncrementAttribute>())
        {
            posIncrAtt = buffer.GetAttribute<IPositionIncrementAttribute>();
        }

        if (termAtt != null)
        {
            // First pass: count tokens and positions; a zero position
            // increment marks tokens stacked at the same position
            // (e.g. synonyms).
            try
            {
                hasMoreTokens = buffer.IncrementToken();
                while (hasMoreTokens)
                {
                    numTokens++;
                    int positionIncrement = (posIncrAtt != null) ? posIncrAtt.PositionIncrement : 1;
                    if (positionIncrement != 0)
                    {
                        positionCount += positionIncrement;
                    }
                    else
                    {
                        severalTokensAtSamePosition = true;
                    }
                    hasMoreTokens = buffer.IncrementToken();
                }
            }
            catch (System.IO.IOException)
            {
                // ignore
            }
        }
    }
    catch (System.IO.IOException e)
    {
        throw new Exception("Error analyzing query text", e);
    }
    finally
    {
        IOUtils.DisposeWhileHandlingException(source);
    }

    // rewind the buffer stream for the second (query-building) pass
    buffer.Reset();

    // NOTE: 'bytes' is the attribute's reusable BytesRef; each
    // FillBytesRef() below repopulates it for the current token, which is
    // why every Term is built from BytesRef.DeepCopyOf(bytes).
    BytesRef bytes = termAtt == null ? null : termAtt.BytesRef;

    if (numTokens == 0)
    {
        return (null);
    }
    else if (numTokens == 1)
    {
        try
        {
            bool hasNext = buffer.IncrementToken();
            Debug.Assert(hasNext == true);
            termAtt.FillBytesRef();
        }
        catch (System.IO.IOException)
        {
            // safe to ignore, because we know the number of tokens
        }
        return (NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes))));
    }
    else
    {
        if (severalTokensAtSamePosition || (!quoted))
        {
            if (positionCount == 1 || (!quoted))
            {
                // no phrase query:
                if (positionCount == 1)
                {
                    // simple case: only one position, with synonyms
                    BooleanQuery q = NewBooleanQuery(true);
                    for (int i = 0; i < numTokens; i++)
                    {
                        try
                        {
                            bool hasNext = buffer.IncrementToken();
                            Debug.Assert(hasNext == true);
                            termAtt.FillBytesRef();
                        }
                        catch (System.IO.IOException)
                        {
                            // safe to ignore, because we know the number of tokens
                        }
                        Query currentQuery = NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes)));
                        q.Add(currentQuery, Occur.SHOULD);
                    }
                    return (q);
                }
                else
                {
                    // multiple positions: stacked (zero-increment) tokens are
                    // grouped under a nested SHOULD boolean; each new position
                    // flushes the previous group with the default operator.
                    BooleanQuery q = NewBooleanQuery(false);
                    Query currentQuery = null;
                    for (int i = 0; i < numTokens; i++)
                    {
                        try
                        {
                            bool hasNext = buffer.IncrementToken();
                            Debug.Assert(hasNext == true);
                            termAtt.FillBytesRef();
                        }
                        catch (System.IO.IOException)
                        {
                            // safe to ignore, because we know the number of tokens
                        }
                        if (posIncrAtt != null && posIncrAtt.PositionIncrement == 0)
                        {
                            if (!(currentQuery is BooleanQuery))
                            {
                                Query t = currentQuery;
                                currentQuery = NewBooleanQuery(true);
                                ((BooleanQuery)currentQuery).Add(t, Occur.SHOULD);
                            }
                            ((BooleanQuery)currentQuery).Add(NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes))), Occur.SHOULD);
                        }
                        else
                        {
                            if (currentQuery != null)
                            {
                                q.Add(currentQuery, @operator);
                            }
                            currentQuery = NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes)));
                        }
                    }
                    // flush the final pending clause
                    q.Add(currentQuery, @operator);
                    return (q);
                }
            }
            else
            {
                // phrase query: terms sharing a position become one
                // multi-phrase "column"
                MultiPhraseQuery mpq = NewMultiPhraseQuery();
                mpq.Slop = phraseSlop;
                IList<Term> multiTerms = new List<Term>();
                int position = -1;
                for (int i = 0; i < numTokens; i++)
                {
                    int positionIncrement = 1;
                    try
                    {
                        bool hasNext = buffer.IncrementToken();
                        Debug.Assert(hasNext == true);
                        termAtt.FillBytesRef();
                        if (posIncrAtt != null)
                        {
                            positionIncrement = posIncrAtt.PositionIncrement;
                        }
                    }
                    catch (System.IO.IOException)
                    {
                        // safe to ignore, because we know the number of tokens
                    }

                    // a positive increment starts a new column, so flush the
                    // accumulated same-position terms first
                    if (positionIncrement > 0 && multiTerms.Count > 0)
                    {
                        if (enablePositionIncrements)
                        {
                            mpq.Add(multiTerms.ToArray(), position);
                        }
                        else
                        {
                            mpq.Add(multiTerms.ToArray());
                        }
                        multiTerms.Clear();
                    }
                    position += positionIncrement;
                    multiTerms.Add(new Term(field, BytesRef.DeepCopyOf(bytes)));
                }
                // flush the final column
                if (enablePositionIncrements)
                {
                    mpq.Add(multiTerms.ToArray(), position);
                }
                else
                {
                    mpq.Add(multiTerms.ToArray());
                }
                return (mpq);
            }
        }
        else
        {
            // plain quoted phrase with no stacked tokens
            PhraseQuery pq = NewPhraseQuery();
            pq.Slop = phraseSlop;
            int position = -1;

            for (int i = 0; i < numTokens; i++)
            {
                int positionIncrement = 1;

                try
                {
                    bool hasNext = buffer.IncrementToken();
                    Debug.Assert(hasNext == true);
                    termAtt.FillBytesRef();
                    if (posIncrAtt != null)
                    {
                        positionIncrement = posIncrAtt.PositionIncrement;
                    }
                }
                catch (System.IO.IOException)
                {
                    // safe to ignore, because we know the number of tokens
                }

                if (enablePositionIncrements)
                {
                    position += positionIncrement;
                    pq.Add(new Term(field, BytesRef.DeepCopyOf(bytes)), position);
                }
                else
                {
                    pq.Add(new Term(field, BytesRef.DeepCopyOf(bytes)));
                }
            }
            return (pq);
        }
    }
}
// Exercises ShingleMatrixFilter over hand-built token matrices: plain
// bi-grams, spacer-less bi-grams, weighted prefix/suffix boundary tokens,
// unlimited shingle sizes (with and without single boundary tokens), and
// multi-token synonyms. Expected weights/offsets are golden values.
public void TestTokenStream()
{
    ShingleMatrixFilter.DefaultSettingsCodec = null; //new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();

    // test a plain old token stream with synonyms tranlated to rows.
    var tokens = new LinkedList<Token>();
    tokens.AddLast(TokenFactory("hello", 1, 0, 4));
    tokens.AddLast(TokenFactory("greetings", 0, 0, 4));
    tokens.AddLast(TokenFactory("world", 1, 5, 10));
    tokens.AddLast(TokenFactory("earth", 0, 5, 10));
    tokens.AddLast(TokenFactory("tellus", 0, 5, 10));

    TokenStream tls = new TokenListStream(tokens);

    // bi-grams
    TokenStream ts = new ShingleMatrixFilter(tls, 2, 2, '_', false, new TwoDimensionalNonWeightedSynonymTokenSettingsCodec());

    AssertNext(ts, "hello_world");
    AssertNext(ts, "greetings_world");
    AssertNext(ts, "hello_earth");
    AssertNext(ts, "greetings_earth");
    AssertNext(ts, "hello_tellus");
    AssertNext(ts, "greetings_tellus");
    Assert.IsFalse(ts.IncrementToken());

    // bi-grams with no spacer character, start offset, end offset
    tls.Reset();
    ts = new ShingleMatrixFilter(tls, 2, 2, null, false, new TwoDimensionalNonWeightedSynonymTokenSettingsCodec());

    AssertNext(ts, "helloworld", 0, 10);
    AssertNext(ts, "greetingsworld", 0, 10);
    AssertNext(ts, "helloearth", 0, 10);
    AssertNext(ts, "greetingsearth", 0, 10);
    AssertNext(ts, "hellotellus", 0, 10);
    AssertNext(ts, "greetingstellus", 0, 10);
    Assert.IsFalse(ts.IncrementToken());

    // add ^_prefix_and_suffix_$
    //
    // using 3d codec as it supports weights
    ShingleMatrixFilter.DefaultSettingsCodec = new SimpleThreeDimensionalTokenSettingsCodec();

    tokens = new LinkedList<Token>();
    tokens.AddLast(TokenFactory("hello", 1, 1f, 0, 4, TokenPositioner.NewColumn));
    tokens.AddLast(TokenFactory("greetings", 0, 1f, 0, 4, TokenPositioner.NewRow));
    tokens.AddLast(TokenFactory("world", 1, 1f, 5, 10, TokenPositioner.NewColumn));
    tokens.AddLast(TokenFactory("earth", 0, 1f, 5, 10, TokenPositioner.NewRow));
    tokens.AddLast(TokenFactory("tellus", 0, 1f, 5, 10, TokenPositioner.NewRow));

    tls = new TokenListStream(tokens);

    // bi-grams, position incrememnt, weight, start offset, end offset
    ts = new PrefixAndSuffixAwareTokenFilter(
        new SingleTokenTokenStream(TokenFactory("^", 1, 100f, 0, 0)),
        tls,
        new SingleTokenTokenStream(TokenFactory("$", 1, 50f, 0, 0))
        );
    // cache the combined stream so later sections can Reset() and replay it
    tls = new CachingTokenFilter(ts);

    ts = new ShingleMatrixFilter(tls, 2, 2, '_', false);

    //for (Token token = ts.Next(new Token()); token != null; token = ts.Next(token)) {
    //    Console.Out.WriteLine("AssertNext(ts, \"" + token.Term() + "\", " + token.GetPositionIncrement() + ", " + (token.GetPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.GetPayload().GetData()).ToString()) + "f, " + token.StartOffset() + ", " + token.EndOffset() + ");");
    //    token.Clear();
    //}

    AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
    AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
    AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
    AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
    AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
    Assert.IsFalse(ts.IncrementToken());

    // test unlimited size and allow single boundary token as shingle
    tls.Reset();
    ts = new ShingleMatrixFilter(tls, 1, Int32.MaxValue, '_', false);

    //for (Token token = ts.Next(new Token()); token != null; token = ts.Next(token))
    //{
    //    Console.Out.WriteLine("AssertNext(ts, \"" + token.Term() + "\", " + token.GetPositionIncrement() + ", " + (token.GetPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.GetPayload().GetData()).ToString()) + "f, " + token.StartOffset() + ", " + token.EndOffset() + ");");
    //    token.Clear();
    //}

    AssertNext(ts, "^", 1, 10.0f, 0, 0);
    AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
    AssertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "hello", 1, 1.0f, 0, 4);
    AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "world", 1, 1.0f, 5, 10);
    AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
    AssertNext(ts, "$", 1, 7.071068f, 10, 10);
    AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
    AssertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "greetings", 1, 1.0f, 0, 4);
    AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "earth", 1, 1.0f, 5, 10);
    AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
    AssertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "tellus", 1, 1.0f, 5, 10);
    AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
    AssertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
    Assert.IsFalse(ts.IncrementToken());

    // test unlimited size but don't allow single boundary token as shingle
    tls.Reset();
    ts = new ShingleMatrixFilter(tls, 1, Int32.MaxValue, '_', true);

    // for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
    //     System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
    //     token.clear();
    // }

    AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
    AssertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "hello", 1, 1.0f, 0, 4);
    AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "world", 1, 1.0f, 5, 10);
    AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
    AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
    AssertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "greetings", 1, 1.0f, 0, 4);
    AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "earth", 1, 1.0f, 5, 10);
    AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
    AssertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
    AssertNext(ts, "tellus", 1, 1.0f, 5, 10);
    AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
    AssertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
    AssertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
    AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
    Assert.IsFalse(ts.IncrementToken());
    //System.currentTimeMillis();

    // multi-token synonyms
    //
    // Token[][][] {
    //     {{hello}, {greetings, and, salutations},
    //     {{world}, {earth}, {tellus}}
    // }
    //

    tokens = new LinkedList<Token>();
    tokens.AddLast(TokenFactory("hello", 1, 1f, 0, 4, TokenPositioner.NewColumn));
    tokens.AddLast(TokenFactory("greetings", 1, 1f, 0, 4, TokenPositioner.NewRow));
    tokens.AddLast(TokenFactory("and", 1, 1f, 0, 4, TokenPositioner.SameRow));
    tokens.AddLast(TokenFactory("salutations", 1, 1f, 0, 4, TokenPositioner.SameRow));
    tokens.AddLast(TokenFactory("world", 1, 1f, 5, 10, TokenPositioner.NewColumn));
    tokens.AddLast(TokenFactory("earth", 1, 1f, 5, 10, TokenPositioner.NewRow));
    tokens.AddLast(TokenFactory("tellus", 1, 1f, 5, 10, TokenPositioner.NewRow));

    tls = new TokenListStream(tokens);

    // 2-3 grams
    ts = new ShingleMatrixFilter(tls, 2, 3, '_', false);

    // for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
    //     System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
    //     token.clear();
    // }

    // shingle, position increment, weight, start offset, end offset
    AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
    AssertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
    AssertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
    AssertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
    AssertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
    AssertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
    AssertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
    AssertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);
    Assert.IsFalse(ts.IncrementToken());
    //System.currentTimeMillis();
}
public virtual void TestEndOffsetPositionWithCachingTokenFilter()
{
    Directory directory = NewDirectory();
    Analyzer analyzer = new MockAnalyzer(Random());
    IndexWriter writer = new IndexWriter(directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    IOException priorException = null;
    TokenStream stream = analyzer.TokenStream("field", new StringReader("abcd "));
    try
    {
        // TODO: weird to reset before wrapping with CachingTokenFilter... correct?
        stream.Reset();
        TokenStream cachedStream = new CachingTokenFilter(stream);
        FieldType customType = new FieldType(TextField.TYPE_NOT_STORED)
        {
            StoreTermVectors = true,
            StoreTermVectorPositions = true,
            StoreTermVectorOffsets = true
        };
        Field docField = new Field("field", cachedStream, customType);
        // Adding the same field instance twice gives the term two
        // positions with distinct offsets.
        doc.Add(docField);
        doc.Add(docField);
        writer.AddDocument(doc);
    }
    catch (IOException e)
    {
        priorException = e;
    }
    finally
    {
        IOUtils.CloseWhileHandlingException(priorException, stream);
    }
    writer.Dispose();

    IndexReader reader = DirectoryReader.Open(directory);
    TermsEnum termsEnum = reader.GetTermVectors(0).Terms("field").Iterator(null);
    Assert.IsNotNull(termsEnum.Next());
    DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);
    Assert.AreEqual(2, termsEnum.TotalTermFreq());
    Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.NextPosition();
    Assert.AreEqual(0, dpEnum.StartOffset());
    Assert.AreEqual(4, dpEnum.EndOffset());
    dpEnum.NextPosition();
    Assert.AreEqual(8, dpEnum.StartOffset());
    Assert.AreEqual(12, dpEnum.EndOffset());
    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
    reader.Dispose();
    directory.Dispose();
}
// Originally Java: 'throws Exception' clauses have no .NET equivalent.
public virtual void testMultipleSources()
{
    // Build a tee over buffer1 with two detector sinks, then cache the
    // primary stream so it can be replayed afterwards.
    TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.ToString()), MockTokenizer.WHITESPACE, false));
    TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
    TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
    tee1.reset();
    TokenStream source1 = new CachingTokenFilter(tee1);

    tee1.addAttribute(typeof(CheckClearAttributesAttribute));
    dogDetector.addAttribute(typeof(CheckClearAttributesAttribute));
    theDetector.addAttribute(typeof(CheckClearAttributesAttribute));

    // Route a second tee (over buffer2) into the very same sinks, so the
    // detectors see tokens from both sources.
    TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.ToString()), MockTokenizer.WHITESPACE, false));
    tee2.addSinkTokenStream(dogDetector);
    tee2.addSinkTokenStream(theDetector);
    TokenStream source2 = tee2;

    assertTokenStreamContents(source1, tokens1);
    assertTokenStreamContents(source2, tokens2);
    assertTokenStreamContents(theDetector, new string[] { "The", "the", "The", "the" });
    assertTokenStreamContents(dogDetector, new string[] { "Dogs", "Dogs" });

    // The cached stream supports replay through additional filters.
    source1.reset();
    TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);
    string[] lowerCaseTokens = new string[tokens1.Length];
    for (int index = 0; index < tokens1.Length; index++)
    {
        lowerCaseTokens[index] = tokens1[index].ToLower(Locale.ROOT);
    }
    assertTokenStreamContents(lowerCasing, lowerCaseTokens);
}