Ejemplo n.º 1
0
        /// <summary>
        /// Verifies that sink streams created from one tee can be re-attached to a
        /// second tee, that both source streams and both sinks yield the expected
        /// tokens, and that the cached first source can be reset and re-consumed
        /// through a <c>LowerCaseFilter</c> afterwards.
        /// </summary>
        public virtual void TestMultipleSources()
        {
            var firstTee = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.ToString()), MockTokenizer.WHITESPACE, false));

            TeeSinkTokenFilter.SinkTokenStream dogSink = firstTee.NewSinkTokenStream(dogFilter);
            TeeSinkTokenFilter.SinkTokenStream theSink = firstTee.NewSinkTokenStream(theFilter);
            firstTee.Reset();
            TokenStream cachedSource = new CachingTokenFilter(firstTee);

            firstTee.AddAttribute<ICheckClearAttributesAttribute>();
            dogSink.AddAttribute<ICheckClearAttributesAttribute>();
            theSink.AddAttribute<ICheckClearAttributesAttribute>();

            var secondTee = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.ToString()), MockTokenizer.WHITESPACE, false));

            // Re-attach the sinks produced by the first tee to the second tee.
            secondTee.AddSinkTokenStream(dogSink);
            secondTee.AddSinkTokenStream(theSink);
            TokenStream secondSource = secondTee;

            AssertTokenStreamContents(cachedSource, tokens1);
            AssertTokenStreamContents(secondSource, tokens2);

            AssertTokenStreamContents(theSink, new string[] { "The", "the", "The", "the" });
            AssertTokenStreamContents(dogSink, new string[] { "Dogs", "Dogs" });

            // The caching filter lets us replay the first source after resetting it.
            cachedSource.Reset();
            TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, cachedSource);

            string[] expectedLowerCase = new string[tokens1.Length];
            for (int i = 0; i < expectedLowerCase.Length; i++)
            {
                expectedLowerCase[i] = CultureInfo.InvariantCulture.TextInfo.ToLower(tokens1[i]);
            }
            AssertTokenStreamContents(lowerCasing, expectedLowerCase);
        }
Ejemplo n.º 2
0
        // Simplified query builder that handles single terms and single phrases only.
        // Yields an empty BooleanQuery when the analyzer produces no tokens, a
        // TermQuery for exactly one token, or a PhraseQuery for several tokens.
        public static Lucene.Net.Search.Query GetFieldQuery(Analyzer analyzer, string field, string queryText)
        {
            TokenStream stream = analyzer.TokenStream(field, new StringReader(queryText));
            TokenFilter cached = new CachingTokenFilter(stream);

            cached.Reset();

            // Attribute-based access to the token text; clunky, but it is the
            // non-obsolete API.
            var termAttr = cached.GetAttribute<ITermAttribute>();
            Func<string> readTerm = () => termAttr != null ? termAttr.Term : null;

            Func<int> readPositionIncrement;
            if (cached.HasAttribute<IPositionIncrementAttribute>())
            {
                var posAttr = cached.GetAttribute<IPositionIncrementAttribute>();
                readPositionIncrement = () => posAttr.PositionIncrement;
            }
            else
            {
                // No position information available: treat every token as advancing by 1.
                readPositionIncrement = () => 1;
            }

            // Zero tokens.
            if (!cached.IncrementToken())
            {
                return new BooleanQuery();
            }

            string firstToken = readTerm();
            int position = 0;

            // Exactly one token.
            if (!cached.IncrementToken())
            {
                return new TermQuery(new Term(field, firstToken));
            }

            // Two or more tokens: build a phrase. The second token is already current.
            PhraseQuery phrase = new PhraseQuery();
            phrase.Add(new Term(field, firstToken));

            while (true)
            {
                string nextToken = readTerm();
                position += readPositionIncrement();
                phrase.Add(new Term(field, nextToken), position);

                if (!cached.IncrementToken())
                {
                    break;
                }
            }

            return phrase;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Indexes the same field instance twice through a <c>CachingTokenFilter</c>
        /// with term vectors (positions and offsets) enabled, then checks that the
        /// second occurrence's offsets are shifted past the first occurrence's
        /// trailing whitespace (start 8 / end 12 for input "abcd   ").
        /// </summary>
        public virtual void TestEndOffsetPositionWithCachingTokenFilter()
        {
            Directory directory = NewDirectory();
            Analyzer analyzer = new MockAnalyzer(Random);
            IndexWriter writer = new IndexWriter(directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
            Document document = new Document();
            Exception pendingException = null; // LUCENENET: No need to cast to IOException
            TokenStream tokenStream = analyzer.GetTokenStream("field", new StringReader("abcd   "));

            try
            {
                tokenStream.Reset(); // TODO: weird to reset before wrapping with CachingTokenFilter... correct?
                TokenStream cached = new CachingTokenFilter(tokenStream);
                FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED)
                {
                    StoreTermVectors = true,
                    StoreTermVectorPositions = true,
                    StoreTermVectorOffsets = true
                };
                Field field = new Field("field", cached, fieldType);
                // Add the same field twice so the token stream is consumed twice.
                document.Add(field);
                document.Add(field);
                writer.AddDocument(document);
            }
            catch (Exception e) when (e.IsIOException())
            {
                pendingException = e;
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(pendingException, tokenStream);
            }
            writer.Dispose();

            IndexReader reader = DirectoryReader.Open(directory);
            TermsEnum terms = reader.GetTermVectors(0).GetTerms("field").GetEnumerator();

            Assert.IsTrue(terms.MoveNext());
            DocsAndPositionsEnum positions = terms.DocsAndPositions(null, null);

            Assert.AreEqual(2, terms.TotalTermFreq);

            Assert.IsTrue(positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            positions.NextPosition();
            Assert.AreEqual(0, positions.StartOffset);
            Assert.AreEqual(4, positions.EndOffset);

            positions.NextPosition();
            Assert.AreEqual(8, positions.StartOffset);
            Assert.AreEqual(12, positions.EndOffset);
            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, positions.NextDoc());

            reader.Dispose();
            directory.Dispose();
        }
Ejemplo n.º 4
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testMultipleSources() throws Exception
        public virtual void testMultipleSources()
        {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new java.io.StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
            TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.ToString()), MockTokenizer.WHITESPACE, false));

//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
            TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
            TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
            tee1.reset();
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final TokenStream source1 = new CachingTokenFilter(tee1);
            TokenStream source1 = new CachingTokenFilter(tee1);

            tee1.addAttribute(typeof(CheckClearAttributesAttribute));
            dogDetector.addAttribute(typeof(CheckClearAttributesAttribute));
            theDetector.addAttribute(typeof(CheckClearAttributesAttribute));

//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new java.io.StringReader(buffer2.toString()), MockTokenizer.WHITESPACE, false));
            TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.ToString()), MockTokenizer.WHITESPACE, false));

            tee2.addSinkTokenStream(dogDetector);
            tee2.addSinkTokenStream(theDetector);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final TokenStream source2 = tee2;
            TokenStream source2 = tee2;

            assertTokenStreamContents(source1, tokens1);
            assertTokenStreamContents(source2, tokens2);

            assertTokenStreamContents(theDetector, new string[] { "The", "the", "The", "the" });
            assertTokenStreamContents(dogDetector, new string[] { "Dogs", "Dogs" });

            source1.reset();
            TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);

            string[] lowerCaseTokens = new string[tokens1.Length];
            for (int i = 0; i < tokens1.Length; i++)
            {
                lowerCaseTokens[i] = tokens1[i].ToLower(Locale.ROOT);
            }
            assertTokenStreamContents(lowerCasing, lowerCaseTokens);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Runs the analyzer over the text of eligible <c>FieldQueryNode</c>s and
        /// rewrites the node according to how many tokens the analyzer produced:
        /// none (a <c>NoTokenFoundQueryNode</c>), exactly one (the field node itself
        /// with its text replaced), several tokens at one position or an unquoted
        /// multi-token value (boolean/group nodes), overlapping tokens in a quoted
        /// value (a <c>MultiPhraseQueryNode</c>), or a plain quoted phrase
        /// (a <c>TokenizedPhraseQueryNode</c>). Non-textable, wildcard, fuzzy,
        /// regexp, and range-bound nodes are returned unchanged.
        /// </summary>
        protected override IQueryNode PostProcessNode(IQueryNode node)
        {
            // Only re-analyze plain textable nodes; wildcard/fuzzy/regexp text must
            // not be tokenized, and range bounds are handled by their parent node.
            if (node is ITextableQueryNode &&
                !(node is WildcardQueryNode) &&
                !(node is FuzzyQueryNode) &&
                !(node is RegexpQueryNode) &&
                !(node.Parent is IRangeQueryNode))
            {
                FieldQueryNode fieldNode = ((FieldQueryNode)node);
                string         text      = fieldNode.GetTextAsString();
                string         field     = fieldNode.GetFieldAsString();

                CachingTokenFilter          buffer     = null;
                IPositionIncrementAttribute posIncrAtt = null;
                int  numTokens     = 0;
                int  positionCount = 0;
                bool severalTokensAtSamePosition = false;

                TokenStream source = null;
                try
                {
                    source = this.analyzer.GetTokenStream(field, text);
                    source.Reset();
                    // Cache the tokens so they can be replayed after the counting pass.
                    buffer = new CachingTokenFilter(source);

                    if (buffer.HasAttribute <IPositionIncrementAttribute>())
                    {
                        posIncrAtt = buffer.GetAttribute <IPositionIncrementAttribute>();
                    }

                    try
                    {
                        // First pass: count tokens and distinct positions. A zero
                        // position increment means this token overlaps the previous
                        // one (e.g. a synonym at the same position).
                        while (buffer.IncrementToken())
                        {
                            numTokens++;
                            int positionIncrement = (posIncrAtt != null) ? posIncrAtt
                                                    .PositionIncrement : 1;
                            if (positionIncrement != 0)
                            {
                                positionCount += positionIncrement;
                            }
                            else
                            {
                                severalTokensAtSamePosition = true;
                            }
                        }
                    }
#pragma warning disable 168
                    catch (IOException e)
#pragma warning restore 168
                    {
                        // ignore
                    }
                }
                catch (IOException e)
                {
                    // NOTE(review): rethrows as a bare Exception; a more specific
                    // exception type may be preferable — confirm against upstream.
                    throw new Exception(e.ToString(), e);
                }
                finally
                {
                    IOUtils.DisposeWhileHandlingException(source);
                }

                // rewind the buffer stream so the cached tokens can be replayed below
                buffer.Reset();

                if (!buffer.HasAttribute <ICharTermAttribute>())
                {
                    return(new NoTokenFoundQueryNode());
                }

                ICharTermAttribute termAtt = buffer.GetAttribute <ICharTermAttribute>();

                if (numTokens == 0)
                {
                    // The analyzer produced nothing (e.g. all tokens were stop words).
                    return(new NoTokenFoundQueryNode());
                }
                else if (numTokens == 1)
                {
                    // Single token: keep the original node, replacing its text with
                    // the analyzed form.
                    string term = null;
                    try
                    {
                        bool hasNext;
                        hasNext = buffer.IncrementToken();
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(hasNext == true);
                        }
                        term = termAtt.ToString();
                    }
                    catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment
                    {
                        // safe to ignore, because we know the number of tokens
                    }

                    fieldNode.Text = term.AsCharSequence();

                    return(fieldNode);
                }
                else if (severalTokensAtSamePosition || !(node is QuotedFieldQueryNode))
                {
                    if (positionCount == 1 || !(node is QuotedFieldQueryNode))
                    {
                        // no phrase query:

                        if (positionCount == 1)
                        {
                            // simple case: only one position, with synonyms
                            List <IQueryNode> children = new List <IQueryNode>();

                            for (int i = 0; i < numTokens; i++)
                            {
                                string term = null;
                                try
                                {
                                    bool hasNext = buffer.IncrementToken();
                                    if (Debugging.AssertsEnabled)
                                    {
                                        Debugging.Assert(hasNext == true);
                                    }
                                    term = termAtt.ToString();
                                }
                                catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment
                                {
                                    // safe to ignore, because we know the number of tokens
                                }

                                children.Add(new FieldQueryNode(field, term, -1, -1));
                            }
                            return(new GroupQueryNode(
                                       new StandardBooleanQueryNode(children, positionCount == 1)));
                        }
                        else
                        {
                            // multiple positions: group tokens that share a position
                            // into nested boolean nodes, joined by the default operator
                            IQueryNode q            = new StandardBooleanQueryNode(Collections.EmptyList <IQueryNode>(), false);
                            IQueryNode currentQuery = null;
                            for (int i = 0; i < numTokens; i++)
                            {
                                string term = null;
                                try
                                {
                                    bool hasNext = buffer.IncrementToken();
                                    if (Debugging.AssertsEnabled)
                                    {
                                        Debugging.Assert(hasNext == true);
                                    }
                                    term = termAtt.ToString();
                                }
                                catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment
                                {
                                    // safe to ignore, because we know the number of tokens
                                }
                                if (posIncrAtt != null && posIncrAtt.PositionIncrement == 0)
                                {
                                    // Token overlaps the previous one: fold both into a
                                    // nested boolean node at the same position.
                                    if (!(currentQuery is BooleanQueryNode))
                                    {
                                        IQueryNode t = currentQuery;
                                        currentQuery = new StandardBooleanQueryNode(Collections.EmptyList <IQueryNode>(), true);
                                        ((BooleanQueryNode)currentQuery).Add(t);
                                    }
                                    ((BooleanQueryNode)currentQuery).Add(new FieldQueryNode(field, term, -1, -1));
                                }
                                else
                                {
                                    // New position: flush the previous group, honoring
                                    // the default operator.
                                    if (currentQuery != null)
                                    {
                                        if (this.defaultOperator == Operator.OR)
                                        {
                                            q.Add(currentQuery);
                                        }
                                        else
                                        {
                                            q.Add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                                        }
                                    }
                                    currentQuery = new FieldQueryNode(field, term, -1, -1);
                                }
                            }
                            // Flush the final group.
                            if (this.defaultOperator == Operator.OR)
                            {
                                q.Add(currentQuery);
                            }
                            else
                            {
                                q.Add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                            }

                            if (q is BooleanQueryNode)
                            {
                                q = new GroupQueryNode(q);
                            }
                            return(q);
                        }
                    }
                    else
                    {
                        // phrase query: quoted input with overlapping tokens, so each
                        // phrase position may hold several alternative terms
                        MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();

                        List <FieldQueryNode> multiTerms = new List <FieldQueryNode>();
                        int position       = -1;
                        int i              = 0;
                        int termGroupCount = 0;
                        for (; i < numTokens; i++)
                        {
                            string term = null;
                            int    positionIncrement = 1;
                            try
                            {
                                bool hasNext = buffer.IncrementToken();
                                if (Debugging.AssertsEnabled)
                                {
                                    Debugging.Assert(hasNext == true);
                                }
                                term = termAtt.ToString();
                                if (posIncrAtt != null)
                                {
                                    positionIncrement = posIncrAtt.PositionIncrement;
                                }
                            }
                            catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment
                            {
                                // safe to ignore, because we know the number of tokens
                            }

                            // A positive increment closes the current group of
                            // same-position terms: emit them all at one position.
                            if (positionIncrement > 0 && multiTerms.Count > 0)
                            {
                                foreach (FieldQueryNode termNode in multiTerms)
                                {
                                    if (this.positionIncrementsEnabled)
                                    {
                                        termNode.PositionIncrement = position;
                                    }
                                    else
                                    {
                                        termNode.PositionIncrement = termGroupCount;
                                    }

                                    mpq.Add(termNode);
                                }

                                // Only increment once for each "group" of
                                // terms that were in the same position:
                                termGroupCount++;

                                multiTerms.Clear();
                            }

                            position += positionIncrement;
                            multiTerms.Add(new FieldQueryNode(field, term, -1, -1));
                        }

                        // Emit the final group of same-position terms.
                        foreach (FieldQueryNode termNode in multiTerms)
                        {
                            if (this.positionIncrementsEnabled)
                            {
                                termNode.PositionIncrement = position;
                            }
                            else
                            {
                                termNode.PositionIncrement = termGroupCount;
                            }

                            mpq.Add(termNode);
                        }

                        return(mpq);
                    }
                }
                else
                {
                    // Quoted input with no overlapping tokens: a plain phrase.
                    TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();

                    int position = -1;

                    for (int i = 0; i < numTokens; i++)
                    {
                        string term = null;
                        int    positionIncrement = 1;

                        try
                        {
                            bool hasNext = buffer.IncrementToken();
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(hasNext == true);
                            }
                            term = termAtt.ToString();

                            if (posIncrAtt != null)
                            {
                                positionIncrement = posIncrAtt.PositionIncrement;
                            }
                        }
                        catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment
                        {
                            // safe to ignore, because we know the number of tokens
                        }

                        FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);

                        // Either honor real position increments (skipping holes left
                        // by removed stop words) or just use the token index.
                        if (this.positionIncrementsEnabled)
                        {
                            position += positionIncrement;
                            newFieldNode.PositionIncrement = position;
                        }
                        else
                        {
                            newFieldNode.PositionIncrement = i;
                        }

                        pq.Add(newFieldNode);
                    }

                    return(pq);
                }
            }

            // Node is not eligible for analysis: pass it through unchanged.
            return(node);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Creates a query from the analysis chain.
        /// <para/>
        /// Expert: this is more useful for subclasses such as queryparsers.
        /// If using this class directly, just use <see cref="CreateBooleanQuery(string, string)"/>
        /// and <see cref="CreatePhraseQuery(string, string)"/>. </summary>
        /// <param name="analyzer"> Analyzer used for this query. </param>
        /// <param name="operator"> Default boolean operator used for this query. </param>
        /// <param name="field"> Field to create queries against. </param>
        /// <param name="queryText"> Text to be passed to the analysis chain. </param>
        /// <param name="quoted"> <c>true</c> if phrases should be generated when terms occur at more than one position. </param>
        /// <param name="phraseSlop"> Slop factor for phrase/multiphrase queries. </param>
        protected Query CreateFieldQuery(Analyzer analyzer, Occur @operator, string field, string queryText, bool quoted, int phraseSlop)
        {
            Debug.Assert(@operator == Occur.SHOULD || @operator == Occur.MUST);
            // Use the analyzer to get all the tokens, and then build a TermQuery,
            // PhraseQuery, or nothing based on the term count
            CachingTokenFilter          buffer     = null;
            ITermToBytesRefAttribute    termAtt    = null;
            IPositionIncrementAttribute posIncrAtt = null;
            int  numTokens     = 0;
            int  positionCount = 0;
            bool severalTokensAtSamePosition = false;
            bool hasMoreTokens = false;

            TokenStream source = null;

            try
            {
                source = analyzer.GetTokenStream(field, new StringReader(queryText));
                source.Reset();
                buffer = new CachingTokenFilter(source);
                buffer.Reset();

                if (buffer.HasAttribute <ITermToBytesRefAttribute>())
                {
                    termAtt = buffer.GetAttribute <ITermToBytesRefAttribute>();
                }
                if (buffer.HasAttribute <IPositionIncrementAttribute>())
                {
                    posIncrAtt = buffer.GetAttribute <IPositionIncrementAttribute>();
                }

                if (termAtt != null)
                {
                    try
                    {
                        hasMoreTokens = buffer.IncrementToken();
                        while (hasMoreTokens)
                        {
                            numTokens++;
                            int positionIncrement = (posIncrAtt != null) ? posIncrAtt.PositionIncrement : 1;
                            if (positionIncrement != 0)
                            {
                                positionCount += positionIncrement;
                            }
                            else
                            {
                                severalTokensAtSamePosition = true;
                            }
                            hasMoreTokens = buffer.IncrementToken();
                        }
                    }
                    catch (System.IO.IOException)
                    {
                        // ignore
                    }
                }
            }
            catch (System.IO.IOException e)
            {
                throw new Exception("Error analyzing query text", e);
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(source);
            }

            // rewind the buffer stream
            buffer.Reset();

            BytesRef bytes = termAtt == null ? null : termAtt.BytesRef;

            if (numTokens == 0)
            {
                return(null);
            }
            else if (numTokens == 1)
            {
                try
                {
                    bool hasNext = buffer.IncrementToken();
                    Debug.Assert(hasNext == true);
                    termAtt.FillBytesRef();
                }
                catch (System.IO.IOException)
                {
                    // safe to ignore, because we know the number of tokens
                }
                return(NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes))));
            }
            else
            {
                if (severalTokensAtSamePosition || (!quoted))
                {
                    if (positionCount == 1 || (!quoted))
                    {
                        // no phrase query:

                        if (positionCount == 1)
                        {
                            // simple case: only one position, with synonyms
                            BooleanQuery q = NewBooleanQuery(true);
                            for (int i = 0; i < numTokens; i++)
                            {
                                try
                                {
                                    bool hasNext = buffer.IncrementToken();
                                    Debug.Assert(hasNext == true);
                                    termAtt.FillBytesRef();
                                }
                                catch (System.IO.IOException)
                                {
                                    // safe to ignore, because we know the number of tokens
                                }
                                Query currentQuery = NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes)));
                                q.Add(currentQuery, Occur.SHOULD);
                            }
                            return(q);
                        }
                        else
                        {
                            // multiple positions
                            BooleanQuery q            = NewBooleanQuery(false);
                            Query        currentQuery = null;
                            for (int i = 0; i < numTokens; i++)
                            {
                                try
                                {
                                    bool hasNext = buffer.IncrementToken();
                                    Debug.Assert(hasNext == true);
                                    termAtt.FillBytesRef();
                                }
                                catch (System.IO.IOException)
                                {
                                    // safe to ignore, because we know the number of tokens
                                }
                                if (posIncrAtt != null && posIncrAtt.PositionIncrement == 0)
                                {
                                    if (!(currentQuery is BooleanQuery))
                                    {
                                        Query t = currentQuery;
                                        currentQuery = NewBooleanQuery(true);
                                        ((BooleanQuery)currentQuery).Add(t, Occur.SHOULD);
                                    }
                                    ((BooleanQuery)currentQuery).Add(NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes))), Occur.SHOULD);
                                }
                                else
                                {
                                    if (currentQuery != null)
                                    {
                                        q.Add(currentQuery, @operator);
                                    }
                                    currentQuery = NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes)));
                                }
                            }
                            q.Add(currentQuery, @operator);
                            return(q);
                        }
                    }
                    else
                    {
                        // phrase query:
                        MultiPhraseQuery mpq = NewMultiPhraseQuery();
                        mpq.Slop = phraseSlop;
                        IList <Term> multiTerms = new List <Term>();
                        int          position   = -1;
                        for (int i = 0; i < numTokens; i++)
                        {
                            int positionIncrement = 1;
                            try
                            {
                                bool hasNext = buffer.IncrementToken();
                                Debug.Assert(hasNext == true);
                                termAtt.FillBytesRef();
                                if (posIncrAtt != null)
                                {
                                    positionIncrement = posIncrAtt.PositionIncrement;
                                }
                            }
                            catch (System.IO.IOException)
                            {
                                // safe to ignore, because we know the number of tokens
                            }

                            if (positionIncrement > 0 && multiTerms.Count > 0)
                            {
                                if (enablePositionIncrements)
                                {
                                    mpq.Add(multiTerms.ToArray(), position);
                                }
                                else
                                {
                                    mpq.Add(multiTerms.ToArray());
                                }
                                multiTerms.Clear();
                            }
                            position += positionIncrement;
                            multiTerms.Add(new Term(field, BytesRef.DeepCopyOf(bytes)));
                        }
                        if (enablePositionIncrements)
                        {
                            mpq.Add(multiTerms.ToArray(), position);
                        }
                        else
                        {
                            mpq.Add(multiTerms.ToArray());
                        }
                        return(mpq);
                    }
                }
                else
                {
                    PhraseQuery pq = NewPhraseQuery();
                    pq.Slop = phraseSlop;
                    int position = -1;

                    for (int i = 0; i < numTokens; i++)
                    {
                        int positionIncrement = 1;

                        try
                        {
                            bool hasNext = buffer.IncrementToken();
                            Debug.Assert(hasNext == true);
                            termAtt.FillBytesRef();
                            if (posIncrAtt != null)
                            {
                                positionIncrement = posIncrAtt.PositionIncrement;
                            }
                        }
                        catch (System.IO.IOException)
                        {
                            // safe to ignore, because we know the number of tokens
                        }

                        if (enablePositionIncrements)
                        {
                            position += positionIncrement;
                            pq.Add(new Term(field, BytesRef.DeepCopyOf(bytes)), position);
                        }
                        else
                        {
                            pq.Add(new Term(field, BytesRef.DeepCopyOf(bytes)));
                        }
                    }
                    return(pq);
                }
            }
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Exercises ShingleMatrixFilter over token "matrices" built from synonym rows:
        /// plain bi-grams, bi-grams without a spacer character, prefix/suffix
        /// ("^" / "$") weighted shingles, unlimited-size shingles with and without
        /// single boundary tokens, and multi-token synonyms. The expected weights and
        /// offsets asserted below were captured from the commented-out dump loops.
        /// </summary>
        public void TestTokenStream()
        {
            // Clear any codec left over from other tests; reassigned to the
            // three-dimensional codec further down for the weighted cases.
            ShingleMatrixFilter.DefaultSettingsCodec = null;
            //new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();

            // test a plain old token stream with synonyms translated to rows.

            var tokens = new LinkedList <Token>();

            // TokenFactory(term, positionIncrement, startOffset, endOffset) —
            // inferred from usage; positionIncrement 0 stacks a synonym on the
            // previous position (TODO confirm against the TokenFactory overloads).
            tokens.AddLast(TokenFactory("hello", 1, 0, 4));
            tokens.AddLast(TokenFactory("greetings", 0, 0, 4));
            tokens.AddLast(TokenFactory("world", 1, 5, 10));
            tokens.AddLast(TokenFactory("earth", 0, 5, 10));
            tokens.AddLast(TokenFactory("tellus", 0, 5, 10));

            TokenStream tls = new TokenListStream(tokens);

            // bi-grams

            TokenStream ts = new ShingleMatrixFilter(tls, 2, 2, '_', false,
                                                     new TwoDimensionalNonWeightedSynonymTokenSettingsCodec());

            AssertNext(ts, "hello_world");
            AssertNext(ts, "greetings_world");
            AssertNext(ts, "hello_earth");
            AssertNext(ts, "greetings_earth");
            AssertNext(ts, "hello_tellus");
            AssertNext(ts, "greetings_tellus");
            Assert.IsFalse(ts.IncrementToken());

            // bi-grams with no spacer character, start offset, end offset

            tls.Reset();
            ts = new ShingleMatrixFilter(tls, 2, 2, null, false,
                                         new TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
            AssertNext(ts, "helloworld", 0, 10);
            AssertNext(ts, "greetingsworld", 0, 10);
            AssertNext(ts, "helloearth", 0, 10);
            AssertNext(ts, "greetingsearth", 0, 10);
            AssertNext(ts, "hellotellus", 0, 10);
            AssertNext(ts, "greetingstellus", 0, 10);
            Assert.IsFalse(ts.IncrementToken());


            // add ^_prefix_and_suffix_$
            //
            // using 3d codec as it supports weights

            ShingleMatrixFilter.DefaultSettingsCodec =
                new SimpleThreeDimensionalTokenSettingsCodec();

            // Six-argument factory adds a weight and an explicit TokenPositioner
            // (NewColumn starts a new matrix column, NewRow a new synonym row).
            tokens = new LinkedList <Token>();
            tokens.AddLast(TokenFactory("hello", 1, 1f, 0, 4, TokenPositioner.NewColumn));
            tokens.AddLast(TokenFactory("greetings", 0, 1f, 0, 4, TokenPositioner.NewRow));
            tokens.AddLast(TokenFactory("world", 1, 1f, 5, 10, TokenPositioner.NewColumn));
            tokens.AddLast(TokenFactory("earth", 0, 1f, 5, 10, TokenPositioner.NewRow));
            tokens.AddLast(TokenFactory("tellus", 0, 1f, 5, 10, TokenPositioner.NewRow));

            tls = new TokenListStream(tokens);

            // bi-grams, position increment, weight, start offset, end offset

            // "^" (weight 100) and "$" (weight 50) bracket the stream; caching lets
            // the same stream be replayed for the later unlimited-size cases.
            ts = new PrefixAndSuffixAwareTokenFilter(
                new SingleTokenTokenStream(TokenFactory("^", 1, 100f, 0, 0)),
                tls,
                new SingleTokenTokenStream(TokenFactory("$", 1, 50f, 0, 0))
                );
            tls = new CachingTokenFilter(ts);

            ts = new ShingleMatrixFilter(tls, 2, 2, '_', false);

            // Debug dump used to regenerate the expected values below:
            //for (Token token = ts.Next(new Token()); token != null; token = ts.Next(token)) {
            //    Console.Out.WriteLine("AssertNext(ts, \"" + token.Term() + "\", " + token.GetPositionIncrement() + ", " + (token.GetPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.GetPayload().GetData()).ToString()) + "f, " + token.StartOffset() + ", " + token.EndOffset() + ");");
            //    token.Clear();
            //}

            AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
            AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
            AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
            Assert.IsFalse(ts.IncrementToken());

            // test unlimited size and allow single boundary token as shingle
            tls.Reset();

            ts = new ShingleMatrixFilter(tls, 1, Int32.MaxValue, '_', false);


            //for (Token token = ts.Next(new Token()); token != null; token = ts.Next(token))
            //{
            //    Console.Out.WriteLine("AssertNext(ts, \"" + token.Term() + "\", " + token.GetPositionIncrement() + ", " + (token.GetPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.GetPayload().GetData()).ToString()) + "f, " + token.StartOffset() + ", " + token.EndOffset() + ");");
            //    token.Clear();
            //}

            AssertNext(ts, "^", 1, 10.0f, 0, 0);
            AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
            AssertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello", 1, 1.0f, 0, 4);
            AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "world", 1, 1.0f, 5, 10);
            AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "$", 1, 7.071068f, 10, 10);
            AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
            AssertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings", 1, 1.0f, 0, 4);
            AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "earth", 1, 1.0f, 5, 10);
            AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "tellus", 1, 1.0f, 5, 10);
            AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);

            Assert.IsFalse(ts.IncrementToken());

            // test unlimited size but don't allow single boundary token as shingle
            // (same expectations as above minus the lone "^" and "$" shingles)

            tls.Reset();
            ts = new ShingleMatrixFilter(tls, 1, Int32.MaxValue, '_', true);

            //  for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
            //      System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
            //      token.clear();
            //    }

            AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
            AssertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello", 1, 1.0f, 0, 4);
            AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "world", 1, 1.0f, 5, 10);
            AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
            AssertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings", 1, 1.0f, 0, 4);
            AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "earth", 1, 1.0f, 5, 10);
            AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "tellus", 1, 1.0f, 5, 10);
            AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);


            Assert.IsFalse(ts.IncrementToken());

            //System.currentTimeMillis();

            // multi-token synonyms
            //
            // Token[][][] {
            //    {{hello}, {greetings, and, salutations},
            //    {{world}, {earth}, {tellus}}
            // }
            //

            // "greetings and salutations" occupies one row spanning a single column:
            // NewRow starts the row, SameRow appends to it.
            tokens = new LinkedList <Token>();
            tokens.AddLast(TokenFactory("hello", 1, 1f, 0, 4, TokenPositioner.NewColumn));
            tokens.AddLast(TokenFactory("greetings", 1, 1f, 0, 4, TokenPositioner.NewRow));
            tokens.AddLast(TokenFactory("and", 1, 1f, 0, 4, TokenPositioner.SameRow));
            tokens.AddLast(TokenFactory("salutations", 1, 1f, 0, 4, TokenPositioner.SameRow));
            tokens.AddLast(TokenFactory("world", 1, 1f, 5, 10, TokenPositioner.NewColumn));
            tokens.AddLast(TokenFactory("earth", 1, 1f, 5, 10, TokenPositioner.NewRow));
            tokens.AddLast(TokenFactory("tellus", 1, 1f, 5, 10, TokenPositioner.NewRow));

            tls = new TokenListStream(tokens);

            // 2-3 grams

            ts = new ShingleMatrixFilter(tls, 2, 3, '_', false);

            //  for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
            //      System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
            //      token.clear();
            //    }

            // shingle, position increment, weight, start offset, end offset

            AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
            AssertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
            AssertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
            AssertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
            AssertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
            AssertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
            AssertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);

            Assert.IsFalse(ts.IncrementToken());

            //System.currentTimeMillis();
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Indexes one document whose "field" is added twice from the same
        /// CachingTokenFilter-wrapped stream, with term vectors + positions +
        /// offsets enabled, then verifies the term vector reports both
        /// occurrences with the expected offsets (0-4 and 8-12).
        /// </summary>
        public virtual void TestEndOffsetPositionWithCachingTokenFilter()
        {
            Directory directory = NewDirectory();
            Analyzer analyzer = new MockAnalyzer(Random());
            IndexWriter writer = new IndexWriter(directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
            Document document = new Document();
            IOException caught = null;
            TokenStream tokenStream = analyzer.TokenStream("field", new StringReader("abcd    "));
            try
            {
                // NOTE(review): resetting before wrapping with CachingTokenFilter —
                // the original carried a TODO questioning this ordering; confirm.
                tokenStream.Reset();
                TokenStream cached = new CachingTokenFilter(tokenStream);
                FieldType vectorType = new FieldType(TextField.TYPE_NOT_STORED)
                {
                    StoreTermVectors = true,
                    StoreTermVectorPositions = true,
                    StoreTermVectorOffsets = true
                };
                Field field = new Field("field", cached, vectorType);
                // The same field instance is added twice so the cached stream is
                // consumed twice for one document.
                document.Add(field);
                document.Add(field);
                writer.AddDocument(document);
            }
            catch (IOException e)
            {
                caught = e;
            }
            finally
            {
                // Close the stream even on failure, preserving any prior exception.
                IOUtils.CloseWhileHandlingException(caught, tokenStream);
            }
            writer.Dispose();

            IndexReader reader = DirectoryReader.Open(directory);
            TermsEnum terms = reader.GetTermVectors(0).Terms("field").Iterator(null);
            Assert.IsNotNull(terms.Next());
            DocsAndPositionsEnum positions = terms.DocsAndPositions(null, null);
            Assert.AreEqual(2, terms.TotalTermFreq());

            Assert.IsTrue(positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            positions.NextPosition();
            Assert.AreEqual(0, positions.StartOffset());
            Assert.AreEqual(4, positions.EndOffset());

            positions.NextPosition();
            Assert.AreEqual(8, positions.StartOffset());
            Assert.AreEqual(12, positions.EndOffset());
            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, positions.NextDoc());

            reader.Dispose();
            directory.Dispose();
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Verifies that sink streams created from one TeeSinkTokenFilter can be
        /// fed by a second tee, that each sink collects only its filtered tokens
        /// ("the"/"The" and "Dogs"), and that the cached source can be reset and
        /// re-consumed through a LowerCaseFilter.
        /// </summary>
        public virtual void testMultipleSources()
        {
            // Fixed Java-converter artifacts: Lucene.NET uses PascalCase members
            // (NewSinkTokenStream, Reset, AddSinkTokenStream, generic AddAttribute)
            // and .NET has no Locale.ROOT — the camelCase/Java forms did not compile.
            TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.ToString()), MockTokenizer.WHITESPACE, false));
            TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.NewSinkTokenStream(dogFilter);
            TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.NewSinkTokenStream(theFilter);
            tee1.Reset();
            TokenStream source1 = new CachingTokenFilter(tee1);

            // Attach the clear-check attribute so the assertions below can verify
            // that ClearAttributes is called between tokens.
            tee1.AddAttribute<ICheckClearAttributesAttribute>();
            dogDetector.AddAttribute<ICheckClearAttributesAttribute>();
            theDetector.AddAttribute<ICheckClearAttributesAttribute>();

            // A second tee feeds the SAME sink streams from a second buffer.
            TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.ToString()), MockTokenizer.WHITESPACE, false));
            tee2.AddSinkTokenStream(dogDetector);
            tee2.AddSinkTokenStream(theDetector);
            TokenStream source2 = tee2;

            AssertTokenStreamContents(source1, tokens1);
            AssertTokenStreamContents(source2, tokens2);

            // Sinks accumulate matches from both sources in consumption order.
            AssertTokenStreamContents(theDetector, new string[] { "The", "the", "The", "the" });
            AssertTokenStreamContents(dogDetector, new string[] { "Dogs", "Dogs" });

            // The cached source can be replayed through a new filter chain.
            source1.Reset();
            TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);
            string[] lowerCaseTokens = new string[tokens1.Length];
            for (int i = 0; i < tokens1.Length; i++)
            {
                // InvariantCulture is the .NET equivalent of Java's Locale.ROOT.
                lowerCaseTokens[i] = CultureInfo.InvariantCulture.TextInfo.ToLower(tokens1[i]);
            }
            AssertTokenStreamContents(lowerCasing, lowerCaseTokens);
        }