Beispiel #1
0
        private void QueryToSpanQuery(Query query, ICollection<sbyte[]> payloads)
        {
            if (query is BooleanQuery)
            {
                BooleanClause[] queryClauses = ((BooleanQuery)query).Clauses;

                for (int i = 0; i < queryClauses.Length; i++)
                {
                    if (!queryClauses[i].Prohibited)
                    {
                        QueryToSpanQuery(queryClauses[i].Query, payloads);
                    }
                }
            }
            else if (query is PhraseQuery)
            {
                Term[] phraseQueryTerms = ((PhraseQuery)query).Terms;
                SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.Length];
                for (int i = 0; i < phraseQueryTerms.Length; i++)
                {
                    clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
                }

                int slop = ((PhraseQuery)query).Slop;
                bool inorder = false;

                if (slop == 0)
                {
                    inorder = true;
                }

                SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
                sp.Boost = query.Boost;
                GetPayloads(payloads, sp);
            }
            else if (query is TermQuery)
            {
                SpanTermQuery stq = new SpanTermQuery(((TermQuery)query).Term);
                stq.Boost = query.Boost;
                GetPayloads(payloads, stq);
            }
            else if (query is SpanQuery)
            {
                GetPayloads(payloads, (SpanQuery)query);
            }
            else if (query is FilteredQuery)
            {
                QueryToSpanQuery(((FilteredQuery)query).Query, payloads);
            }
            else if (query is DisjunctionMaxQuery)
            {
                IEnumerator<Query> enumerator = ((DisjunctionMaxQuery)query).GetEnumerator();
                while (enumerator.MoveNext())
                {
                    QueryToSpanQuery(enumerator.Current, payloads);
                }
            }
            else if (query is MultiPhraseQuery)
            {
                MultiPhraseQuery mpq = (MultiPhraseQuery)query;
                IList<Term[]> termArrays = mpq.TermArrays;
                int[] positions = mpq.Positions;
                if (positions.Length > 0)
                {
                    int maxPosition = positions[positions.Length - 1];
                    for (int i = 0; i < positions.Length - 1; ++i)
                    {
                        if (positions[i] > maxPosition)
                        {
                            maxPosition = positions[i];
                        }
                    }

                    IList<Query>[] disjunctLists = new List<Query>[maxPosition + 1];
                    int distinctPositions = 0;

                    for (int i = 0; i < termArrays.Count; ++i)
                    {
                        Term[] termArray = termArrays[i];
                        IList<Query> disjuncts = disjunctLists[positions[i]];
                        if (disjuncts == null)
                        {
                            disjuncts = (disjunctLists[positions[i]] = new List<Query>(termArray.Length));
                            ++distinctPositions;
                        }
                        foreach (Term term in termArray)
                        {
                            disjuncts.Add(new SpanTermQuery(term));
                        }
                    }

                    int positionGaps = 0;
                    int position = 0;
                    SpanQuery[] clauses = new SpanQuery[distinctPositions];
                    for (int i = 0; i < disjunctLists.Length; ++i)
                    {
                        IList<Query> disjuncts = disjunctLists[i];
                        if (disjuncts != null)
                        {
                            clauses[position++] = new SpanOrQuery(disjuncts.OfType<SpanQuery>().ToArray());
                        }
                        else
                        {
                            ++positionGaps;
                        }
                    }

                    int slop = mpq.Slop;
                    bool inorder = (slop == 0);

                    SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
                    sp.Boost = query.Boost;
                    GetPayloads(payloads, sp);
                }
            }
        }
Beispiel #2
0
        private void QueryToSpanQuery(Query query, ICollection <byte[]> payloads)
        {
            if (query is BooleanQuery booleanQuery)
            {
                BooleanClause[] queryClauses = booleanQuery.GetClauses();

                for (int i = 0; i < queryClauses.Length; i++)
                {
                    if (!queryClauses[i].IsProhibited)
                    {
                        QueryToSpanQuery(queryClauses[i].Query, payloads);
                    }
                }
            }
            else if (query is PhraseQuery phraseQuery)
            {
                Term[]      phraseQueryTerms = phraseQuery.GetTerms();
                SpanQuery[] clauses          = new SpanQuery[phraseQueryTerms.Length];
                for (int i = 0; i < phraseQueryTerms.Length; i++)
                {
                    clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
                }

                int  slop    = phraseQuery.Slop;
                bool inorder = false;

                if (slop == 0)
                {
                    inorder = true;
                }

                SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder)
                {
                    Boost = query.Boost
                };
                GetPayloads(payloads, sp);
            }
            else if (query is TermQuery termQuery)
            {
                SpanTermQuery stq = new SpanTermQuery(termQuery.Term)
                {
                    Boost = query.Boost
                };
                GetPayloads(payloads, stq);
            }
            else if (query is SpanQuery spanQuery)
            {
                GetPayloads(payloads, spanQuery);
            }
            else if (query is FilteredQuery filteredQuery)
            {
                QueryToSpanQuery(filteredQuery.Query, payloads);
            }
            else if (query is DisjunctionMaxQuery disjunctionMaxQuery)
            {
                foreach (var q in disjunctionMaxQuery)
                {
                    QueryToSpanQuery(q, payloads);
                }
            }
            else if (query is MultiPhraseQuery mpq)
            {
                IList <Term[]> termArrays = mpq.GetTermArrays();
                int[]          positions  = mpq.GetPositions();
                if (positions.Length > 0)
                {
                    int maxPosition = positions[positions.Length - 1];
                    for (int i = 0; i < positions.Length - 1; ++i)
                    {
                        if (positions[i] > maxPosition)
                        {
                            maxPosition = positions[i];
                        }
                    }

                    // LUCENENET: Changed from Query to SpanQuery to eliminate the O(n) cast
                    // required to instantiate SpanOrQuery below
                    IList <SpanQuery>[] disjunctLists = new JCG.List <SpanQuery> [maxPosition + 1];
                    int distinctPositions             = 0;

                    for (int i = 0; i < termArrays.Count; ++i)
                    {
                        Term[]            termArray = termArrays[i];
                        IList <SpanQuery> disjuncts = disjunctLists[positions[i]]; // LUCENENET: Changed from Query to SpanQuery
                        if (disjuncts == null)
                        {
                            disjuncts = (disjunctLists[positions[i]] = new JCG.List <SpanQuery>(termArray.Length)); // LUCENENET: Changed from Query to SpanQuery
                            ++distinctPositions;
                        }
                        foreach (Term term in termArray)
                        {
                            disjuncts.Add(new SpanTermQuery(term));
                        }
                    }

                    int         positionGaps = 0;
                    int         position     = 0;
                    SpanQuery[] clauses      = new SpanQuery[distinctPositions];
                    for (int i = 0; i < disjunctLists.Length; ++i)
                    {
                        IList <SpanQuery> disjuncts = disjunctLists[i]; // LUCENENET: Changed from Query to SpanQuery
                        if (disjuncts != null)
                        {
                            clauses[position++] = new SpanOrQuery(disjuncts);
                        }
                        else
                        {
                            ++positionGaps;
                        }
                    }

                    int  slop    = mpq.Slop;
                    bool inorder = (slop == 0);

                    SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
                    sp.Boost = query.Boost;
                    GetPayloads(payloads, sp);
                }
            }
        }
        public virtual void TestPayloadsPos0()
        {
            Directory dir = NewDirectory();
            RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, new MockPayloadAnalyzer());
            Document doc = new Document();
            doc.Add(new TextField("content", new StringReader("a a b c d e a f g h i j a b k k")));
            writer.AddDocument(doc);

            IndexReader readerFromWriter = writer.Reader;
            AtomicReader r = SlowCompositeReaderWrapper.Wrap(readerFromWriter);

            DocsAndPositionsEnum tp = r.TermPositionsEnum(new Term("content", "a"));

            int count = 0;
            Assert.IsTrue(tp.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            // "a" occurs 4 times
            Assert.AreEqual(4, tp.Freq());
            Assert.AreEqual(0, tp.NextPosition());
            Assert.AreEqual(1, tp.NextPosition());
            Assert.AreEqual(3, tp.NextPosition());
            Assert.AreEqual(6, tp.NextPosition());

            // only one doc has "a"
            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, tp.NextDoc());

            IndexSearcher @is = NewSearcher(readerFromWriter);

            SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
            SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
            SpanQuery[] sqs = new SpanQuery[] { stq1, stq2 };
            SpanNearQuery snq = new SpanNearQuery(sqs, 30, false);

            count = 0;
            bool sawZero = false;
            if (VERBOSE)
            {
                Console.WriteLine("\ngetPayloadSpans test");
            }
            Search.Spans.Spans pspans = MultiSpansWrapper.Wrap(@is.TopReaderContext, snq);
            while (pspans.Next())
            {
                if (VERBOSE)
                {
                    Console.WriteLine("doc " + pspans.Doc() + ": span " + pspans.Start() + " to " + pspans.End());
                }
                var payloads = pspans.Payload;
                sawZero |= pspans.Start() == 0;
                foreach (var bytes in payloads)
                {
                    count++;
                    if (VERBOSE)
                    {
                        Console.WriteLine("  payload: " + Encoding.UTF8.GetString((byte[])(Array)bytes));
                    }
                }
            }
            Assert.IsTrue(sawZero);
            Assert.AreEqual(5, count);

            // System.out.println("\ngetSpans test");
            Search.Spans.Spans spans = MultiSpansWrapper.Wrap(@is.TopReaderContext, snq);
            count = 0;
            sawZero = false;
            while (spans.Next())
            {
                count++;
                sawZero |= spans.Start() == 0;
                // System.out.println(spans.Doc() + " - " + spans.Start() + " - " +
                // spans.End());
            }
            Assert.AreEqual(4, count);
            Assert.IsTrue(sawZero);

            // System.out.println("\nPayloadSpanUtil test");

            sawZero = false;
            PayloadSpanUtil psu = new PayloadSpanUtil(@is.TopReaderContext);
            var pls = psu.GetPayloadsForQuery(snq);
            count = pls.Count;
            foreach (var bytes in pls)
            {
                string s = Encoding.UTF8.GetString(bytes);
                //System.out.println(s);
                sawZero |= s.Equals("pos: 0");
            }
            Assert.AreEqual(5, count);
            Assert.IsTrue(sawZero);
            writer.Dispose();
            @is.IndexReader.Dispose();
            dir.Dispose();
        }
            public override Query Rewrite(IndexReader reader)
            {
                // ArrayList spanClauses = new ArrayList();
                if (contents is TermQuery)
                {
                    return(contents);
                }
                // Build a sequence of Span clauses arranged in a SpanNear - child
                // clauses can be complex
                // Booleans e.g. nots and ors etc
                int numNegatives = 0;

                if (!(contents is BooleanQuery))
                {
                    throw new ArgumentException("Unknown query type \""
                                                + contents.GetType().Name
                                                + "\" found in phrase query string \"" + phrasedQueryStringContents
                                                + "\"");
                }
                BooleanQuery bq = (BooleanQuery)contents;

                BooleanClause[] bclauses       = bq.GetClauses();
                SpanQuery[]     allSpanClauses = new SpanQuery[bclauses.Length];
                // For all clauses e.g. one* two~
                for (int i = 0; i < bclauses.Length; i++)
                {
                    // HashSet bclauseterms=new HashSet();
                    Query qc = bclauses[i].Query;
                    // Rewrite this clause e.g one* becomes (one OR onerous)
                    qc = qc.Rewrite(reader);
                    if (bclauses[i].Occur.Equals(Occur.MUST_NOT))
                    {
                        numNegatives++;
                    }

                    if (qc is BooleanQuery)
                    {
                        List <SpanQuery> sc = new List <SpanQuery>();
                        AddComplexPhraseClause(sc, (BooleanQuery)qc);
                        if (sc.Count > 0)
                        {
                            allSpanClauses[i] = sc.ElementAt(0);
                        }
                        else
                        {
                            // Insert fake term e.g. phrase query was for "Fred Smithe*" and
                            // there were no "Smithe*" terms - need to
                            // prevent match on just "Fred".
                            allSpanClauses[i] = new SpanTermQuery(new Term(field,
                                                                           "Dummy clause because no terms found - must match nothing"));
                        }
                    }
                    else
                    {
                        if (qc is TermQuery)
                        {
                            TermQuery tq = (TermQuery)qc;
                            allSpanClauses[i] = new SpanTermQuery(tq.Term);
                        }
                        else
                        {
                            throw new ArgumentException("Unknown query type \""
                                                        + qc.GetType().Name
                                                        + "\" found in phrase query string \""
                                                        + phrasedQueryStringContents + "\"");
                        }
                    }
                }
                if (numNegatives == 0)
                {
                    // The simple case - no negative elements in phrase
                    return(new SpanNearQuery(allSpanClauses, slopFactor, inOrder));
                }
                // Complex case - we have mixed positives and negatives in the
                // sequence.
                // Need to return a SpanNotQuery
                List <SpanQuery> positiveClauses = new List <SpanQuery>();

                for (int j = 0; j < allSpanClauses.Length; j++)
                {
                    if (!bclauses[j].Occur.Equals(Occur.MUST_NOT))
                    {
                        positiveClauses.Add(allSpanClauses[j]);
                    }
                }

                SpanQuery[] includeClauses = positiveClauses
                                             .ToArray();

                SpanQuery include = null;

                if (includeClauses.Length == 1)
                {
                    include = includeClauses[0]; // only one positive clause
                }
                else
                {
                    // need to increase slop factor based on gaps introduced by
                    // negatives
                    include = new SpanNearQuery(includeClauses, slopFactor + numNegatives,
                                                inOrder);
                }
                // Use sequence of positive and negative values as the exclude.
                SpanNearQuery exclude = new SpanNearQuery(allSpanClauses, slopFactor,
                                                          inOrder);
                SpanNotQuery snot = new SpanNotQuery(include, exclude);

                return(snot);
            }
        /// <summary>
        /// Fills a <see cref="T:IDictionary{string, WeightedSpanTerm}"/> with <see cref="WeightedSpanTerm"/>s using the terms from the supplied <paramref name="query"/>.
        /// </summary>
        /// <param name="query"><see cref="Query"/> to extract Terms from</param>
        /// <param name="terms">Map to place created <see cref="WeightedSpanTerm"/>s in</param>
        /// <exception cref="System.IO.IOException">If there is a low-level I/O error</exception>
        protected virtual void Extract(Query query, IDictionary <string, WeightedSpanTerm> terms)
        {
            if (query is BooleanQuery)
            {
                IList <BooleanClause> queryClauses = ((BooleanQuery)query).Clauses;

                for (int i = 0; i < queryClauses.Count; i++)
                {
                    if (!queryClauses[i].IsProhibited)
                    {
                        Extract(queryClauses[i].Query, terms);
                    }
                }
            }
            else if (query is PhraseQuery)
            {
                PhraseQuery phraseQuery      = (PhraseQuery)query;
                Term[]      phraseQueryTerms = phraseQuery.GetTerms();
                SpanQuery[] clauses          = new SpanQuery[phraseQueryTerms.Length];
                for (int i = 0; i < phraseQueryTerms.Length; i++)
                {
                    clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
                }
                int   slop      = phraseQuery.Slop;
                int[] positions = phraseQuery.GetPositions();
                // add largest position increment to slop
                if (positions.Length > 0)
                {
                    int lastPos    = positions[0];
                    int largestInc = 0;
                    int sz         = positions.Length;
                    for (int i = 1; i < sz; i++)
                    {
                        int pos = positions[i];
                        int inc = pos - lastPos;
                        if (inc > largestInc)
                        {
                            largestInc = inc;
                        }
                        lastPos = pos;
                    }
                    if (largestInc > 1)
                    {
                        slop += largestInc;
                    }
                }

                bool inorder = slop == 0;

                SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
                sp.Boost = query.Boost;
                ExtractWeightedSpanTerms(terms, sp);
            }
            else if (query is TermQuery)
            {
                ExtractWeightedTerms(terms, query);
            }
            else if (query is SpanQuery)
            {
                ExtractWeightedSpanTerms(terms, (SpanQuery)query);
            }
            else if (query is FilteredQuery)
            {
                Extract(((FilteredQuery)query).Query, terms);
            }
            else if (query is ConstantScoreQuery)
            {
                Query q = ((ConstantScoreQuery)query).Query;
                if (q != null)
                {
                    Extract(q, terms);
                }
            }
            else if (query is CommonTermsQuery)
            {
                // specialized since rewriting would change the result query
                // this query is TermContext sensitive.
                ExtractWeightedTerms(terms, query);
            }
            else if (query is DisjunctionMaxQuery)
            {
                foreach (var q in ((DisjunctionMaxQuery)query))
                {
                    Extract(q, terms);
                }
            }
            else if (query is MultiPhraseQuery)
            {
                MultiPhraseQuery mpq        = (MultiPhraseQuery)query;
                IList <Term[]>   termArrays = mpq.GetTermArrays();
                int[]            positions  = mpq.GetPositions();
                if (positions.Length > 0)
                {
                    int maxPosition = positions[positions.Length - 1];
                    for (int i = 0; i < positions.Length - 1; ++i)
                    {
                        if (positions[i] > maxPosition)
                        {
                            maxPosition = positions[i];
                        }
                    }

                    var disjunctLists     = new List <SpanQuery> [maxPosition + 1];
                    int distinctPositions = 0;

                    for (int i = 0; i < termArrays.Count; ++i)
                    {
                        Term[]           termArray = termArrays[i];
                        List <SpanQuery> disjuncts = disjunctLists[positions[i]];
                        if (disjuncts == null)
                        {
                            disjuncts = (disjunctLists[positions[i]] = new List <SpanQuery>(termArray.Length));
                            ++distinctPositions;
                        }
                        foreach (var term in termArray)
                        {
                            disjuncts.Add(new SpanTermQuery(term));
                        }
                    }

                    int         positionGaps = 0;
                    int         position     = 0;
                    SpanQuery[] clauses      = new SpanQuery[distinctPositions];
                    foreach (var disjuncts in disjunctLists)
                    {
                        if (disjuncts != null)
                        {
                            clauses[position++] = new SpanOrQuery(disjuncts.ToArray());
                        }
                        else
                        {
                            ++positionGaps;
                        }
                    }

                    int  slop    = mpq.Slop;
                    bool inorder = (slop == 0);

                    SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
                    sp.Boost = query.Boost;
                    ExtractWeightedSpanTerms(terms, sp);
                }
            }
            else
            {
                Query origQuery = query;
                if (query is MultiTermQuery)
                {
                    if (!expandMultiTermQuery)
                    {
                        return;
                    }
                    MultiTermQuery copy = (MultiTermQuery)query.Clone();
                    copy.MultiTermRewriteMethod = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE;
                    origQuery = copy;
                }
                IndexReader reader    = GetLeafContext().Reader;
                Query       rewritten = origQuery.Rewrite(reader);
                if (rewritten != origQuery)
                {
                    // only rewrite once and then flatten again - the rewritten query could have a speacial treatment
                    // if this method is overwritten in a subclass or above in the next recursion
                    Extract(rewritten, terms);
                }
            }
            ExtractUnknownQuery(query, terms);
        }
Beispiel #6
0
 public virtual Query VisitSpanNearQuery(SpanNearQuery spanNearq)
 {
     throw new NotImplementedException();
 }
Beispiel #7
0
        public virtual void TestBasic()
        {
            Directory         dir = NewDirectory();
            RandomIndexWriter w   = new RandomIndexWriter(Random(), dir);

            Document doc = new Document();

            doc.Add(NewStringField("id", "0", Field.Store.YES));
            doc.Add(NewTextField("field", "wizard the the the the the oz", Field.Store.NO));
            w.AddDocument(doc);
            doc = new Document();
            doc.Add(NewStringField("id", "1", Field.Store.YES));
            // 1 extra token, but wizard and oz are close;
            doc.Add(NewTextField("field", "wizard oz the the the the the the", Field.Store.NO));
            w.AddDocument(doc);
            IndexReader r = w.Reader;

            w.Dispose();

            // Do ordinary BooleanQuery:
            BooleanQuery bq = new BooleanQuery();

            bq.Add(new TermQuery(new Term("field", "wizard")), Occur.SHOULD);
            bq.Add(new TermQuery(new Term("field", "oz")), Occur.SHOULD);
            IndexSearcher searcher = GetSearcher(r);

            searcher.Similarity = new DefaultSimilarity();

            TopDocs hits = searcher.Search(bq, 10);

            Assert.AreEqual(2, hits.TotalHits);
            Assert.AreEqual("0", searcher.Doc(hits.ScoreDocs[0].Doc).Get("id"));
            Assert.AreEqual("1", searcher.Doc(hits.ScoreDocs[1].Doc).Get("id"));

            // Now, resort using PhraseQuery:
            PhraseQuery pq = new PhraseQuery();

            pq.Slop = 5;
            pq.Add(new Term("field", "wizard"));
            pq.Add(new Term("field", "oz"));

            TopDocs hits2 = QueryRescorer.Rescore(searcher, hits, pq, 2.0, 10);

            // Resorting changed the order:
            Assert.AreEqual(2, hits2.TotalHits);
            Assert.AreEqual("1", searcher.Doc(hits2.ScoreDocs[0].Doc).Get("id"));
            Assert.AreEqual("0", searcher.Doc(hits2.ScoreDocs[1].Doc).Get("id"));

            // Resort using SpanNearQuery:
            SpanTermQuery t1  = new SpanTermQuery(new Term("field", "wizard"));
            SpanTermQuery t2  = new SpanTermQuery(new Term("field", "oz"));
            SpanNearQuery snq = new SpanNearQuery(new SpanQuery[] { t1, t2 }, 0, true);

            TopDocs hits3 = QueryRescorer.Rescore(searcher, hits, snq, 2.0, 10);

            // Resorting changed the order:
            Assert.AreEqual(2, hits3.TotalHits);
            Assert.AreEqual("1", searcher.Doc(hits3.ScoreDocs[0].Doc).Get("id"));
            Assert.AreEqual("0", searcher.Doc(hits3.ScoreDocs[1].Doc).Get("id"));

            r.Dispose();
            dir.Dispose();
        }
        private void QueryToSpanQuery(Query query, ICollection <byte[]> payloads)
        {
            if (query is BooleanQuery)
            {
                BooleanClause[] queryClauses = ((BooleanQuery)query).GetClauses();

                for (int i = 0; i < queryClauses.Length; i++)
                {
                    if (!queryClauses[i].IsProhibited)
                    {
                        QueryToSpanQuery(queryClauses[i].Query, payloads);
                    }
                }
            }
            else if (query is PhraseQuery)
            {
                Term[]      phraseQueryTerms = ((PhraseQuery)query).GetTerms();
                SpanQuery[] clauses          = new SpanQuery[phraseQueryTerms.Length];
                for (int i = 0; i < phraseQueryTerms.Length; i++)
                {
                    clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
                }

                int  slop    = ((PhraseQuery)query).Slop;
                bool inorder = false;

                if (slop == 0)
                {
                    inorder = true;
                }

                SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
                sp.Boost = query.Boost;
                GetPayloads(payloads, sp);
            }
            else if (query is TermQuery)
            {
                SpanTermQuery stq = new SpanTermQuery(((TermQuery)query).Term);
                stq.Boost = query.Boost;
                GetPayloads(payloads, stq);
            }
            else if (query is SpanQuery)
            {
                GetPayloads(payloads, (SpanQuery)query);
            }
            else if (query is FilteredQuery)
            {
                QueryToSpanQuery(((FilteredQuery)query).Query, payloads);
            }
            else if (query is DisjunctionMaxQuery)
            {
                foreach (var q in ((DisjunctionMaxQuery)query))
                {
                    QueryToSpanQuery(q, payloads);
                }
            }
            else if (query is MultiPhraseQuery)
            {
                MultiPhraseQuery mpq        = (MultiPhraseQuery)query;
                IList <Term[]>   termArrays = mpq.GetTermArrays();
                int[]            positions  = mpq.GetPositions();
                if (positions.Length > 0)
                {
                    int maxPosition = positions[positions.Length - 1];
                    for (int i = 0; i < positions.Length - 1; ++i)
                    {
                        if (positions[i] > maxPosition)
                        {
                            maxPosition = positions[i];
                        }
                    }

                    IList <Query>[] disjunctLists     = new List <Query> [maxPosition + 1];
                    int             distinctPositions = 0;

                    for (int i = 0; i < termArrays.Count; ++i)
                    {
                        Term[]        termArray = termArrays[i];
                        IList <Query> disjuncts = disjunctLists[positions[i]];
                        if (disjuncts == null)
                        {
                            disjuncts = (disjunctLists[positions[i]] = new List <Query>(termArray.Length));
                            ++distinctPositions;
                        }
                        foreach (Term term in termArray)
                        {
                            disjuncts.Add(new SpanTermQuery(term));
                        }
                    }

                    int         positionGaps = 0;
                    int         position     = 0;
                    SpanQuery[] clauses      = new SpanQuery[distinctPositions];
                    for (int i = 0; i < disjunctLists.Length; ++i)
                    {
                        IList <Query> disjuncts = disjunctLists[i];
                        if (disjuncts != null)
                        {
                            clauses[position++] = new SpanOrQuery(disjuncts.OfType <SpanQuery>().ToArray());
                        }
                        else
                        {
                            ++positionGaps;
                        }
                    }

                    int  slop    = mpq.Slop;
                    bool inorder = (slop == 0);

                    SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
                    sp.Boost = query.Boost;
                    GetPayloads(payloads, sp);
                }
            }
        }
        public virtual void TestPayloadsPos0()
        {
            Directory         dir    = NewDirectory();
            RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                this,
#endif
                Random, dir, new MockPayloadAnalyzer());
            Document doc = new Document();

            doc.Add(new TextField("content", new StringReader("a a b c d e a f g h i j a b k k")));
            writer.AddDocument(doc);

            IndexReader  readerFromWriter = writer.GetReader();
            AtomicReader r = SlowCompositeReaderWrapper.Wrap(readerFromWriter);

            DocsAndPositionsEnum tp = r.GetTermPositionsEnum(new Term("content", "a"));

            int count = 0;

            Assert.IsTrue(tp.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            // "a" occurs 4 times
            Assert.AreEqual(4, tp.Freq);
            Assert.AreEqual(0, tp.NextPosition());
            Assert.AreEqual(1, tp.NextPosition());
            Assert.AreEqual(3, tp.NextPosition());
            Assert.AreEqual(6, tp.NextPosition());

            // only one doc has "a"
            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, tp.NextDoc());

            IndexSearcher @is = NewSearcher(readerFromWriter);

            SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
            SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));

            SpanQuery[]   sqs = new SpanQuery[] { stq1, stq2 };
            SpanNearQuery snq = new SpanNearQuery(sqs, 30, false);

            count = 0;
            bool sawZero = false;

            if (Verbose)
            {
                Console.WriteLine("\ngetPayloadSpans test");
            }
            Search.Spans.Spans pspans = MultiSpansWrapper.Wrap(@is.TopReaderContext, snq);
            while (pspans.MoveNext())
            {
                if (Verbose)
                {
                    Console.WriteLine("doc " + pspans.Doc + ": span " + pspans.Start + " to " + pspans.End);
                }
                var payloads = pspans.GetPayload();
                sawZero |= pspans.Start == 0;
                foreach (var bytes in payloads)
                {
                    count++;
                    if (Verbose)
                    {
                        Console.WriteLine("  payload: " + Encoding.UTF8.GetString(bytes));
                    }
                }
            }
            Assert.IsTrue(sawZero);
            Assert.AreEqual(5, count);

            // System.out.println("\ngetSpans test");
            Search.Spans.Spans spans = MultiSpansWrapper.Wrap(@is.TopReaderContext, snq);
            count   = 0;
            sawZero = false;
            while (spans.MoveNext())
            {
                count++;
                sawZero |= spans.Start == 0;
                // System.out.println(spans.Doc() + " - " + spans.Start() + " - " +
                // spans.End());
            }
            Assert.AreEqual(4, count);
            Assert.IsTrue(sawZero);

            // System.out.println("\nPayloadSpanUtil test");

            sawZero = false;
            PayloadSpanUtil psu = new PayloadSpanUtil(@is.TopReaderContext);
            var             pls = psu.GetPayloadsForQuery(snq);

            count = pls.Count;
            foreach (var bytes in pls)
            {
                string s = Encoding.UTF8.GetString(bytes);
                //System.out.println(s);
                sawZero |= s.Equals("pos: 0", StringComparison.Ordinal);
            }
            Assert.AreEqual(5, count);
            Assert.IsTrue(sawZero);
            writer.Dispose();
            @is.IndexReader.Dispose();
            dir.Dispose();
        }
Beispiel #10
0
 public virtual Query VisitSpanNearQuery(SpanNearQuery spanNearq)
 {
     throw new SnNotSupportedException();
 }
        /// <summary>
        /// Fills a <c>Map</c> with <see cref="WeightedSpanTerm"/>s using the terms from the supplied <c>Query</c>.
        /// </summary>
        /// <param name="query">Query to extract Terms from</param>
        /// <param name="terms">Map to place created WeightedSpanTerms in</param>
        private void Extract(Query query, IDictionary <String, WeightedSpanTerm> terms)
        {
            if (query is BooleanQuery)
            {
                BooleanClause[] queryClauses = ((BooleanQuery)query).GetClauses();

                for (int i = 0; i < queryClauses.Length; i++)
                {
                    if (!queryClauses[i].IsProhibited)
                    {
                        Extract(queryClauses[i].Query, terms);
                    }
                }
            }
            else if (query is PhraseQuery)
            {
                PhraseQuery phraseQuery      = ((PhraseQuery)query);
                Term[]      phraseQueryTerms = phraseQuery.GetTerms();
                SpanQuery[] clauses          = new SpanQuery[phraseQueryTerms.Length];
                for (int i = 0; i < phraseQueryTerms.Length; i++)
                {
                    clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
                }
                int   slop      = phraseQuery.Slop;
                int[] positions = phraseQuery.GetPositions();
                // add largest position increment to slop
                if (positions.Length > 0)
                {
                    int lastPos    = positions[0];
                    int largestInc = 0;
                    int sz         = positions.Length;
                    for (int i = 1; i < sz; i++)
                    {
                        int pos = positions[i];
                        int inc = pos - lastPos;
                        if (inc > largestInc)
                        {
                            largestInc = inc;
                        }
                        lastPos = pos;
                    }
                    if (largestInc > 1)
                    {
                        slop += largestInc;
                    }
                }

                bool inorder = slop == 0;

                SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
                sp.Boost = query.Boost;
                ExtractWeightedSpanTerms(terms, sp);
            }
            else if (query is TermQuery)
            {
                ExtractWeightedTerms(terms, query);
            }
            else if (query is SpanQuery)
            {
                ExtractWeightedSpanTerms(terms, (SpanQuery)query);
            }
            else if (query is FilteredQuery)
            {
                Extract(((FilteredQuery)query).Query, terms);
            }
            else if (query is DisjunctionMaxQuery)
            {
                foreach (var q in ((DisjunctionMaxQuery)query))
                {
                    Extract(q, terms);
                }
            }
            else if (query is MultiTermQuery && expandMultiTermQuery)
            {
                MultiTermQuery mtq = ((MultiTermQuery)query);
                if (mtq.RewriteMethod != MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
                {
                    mtq = (MultiTermQuery)mtq.Clone();
                    mtq.RewriteMethod = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE;
                    query             = mtq;
                }
                FakeReader fReader = new FakeReader();
                MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE.Rewrite(fReader, mtq);
                if (fReader.Field != null)
                {
                    IndexReader ir = GetReaderForField(fReader.Field);
                    Extract(query.Rewrite(ir), terms);
                }
            }
            else if (query is MultiPhraseQuery)
            {
                MultiPhraseQuery mpq        = (MultiPhraseQuery)query;
                IList <Term[]>   termArrays = mpq.GetTermArrays();
                int[]            positions  = mpq.GetPositions();
                if (positions.Length > 0)
                {
                    int maxPosition = positions[positions.Length - 1];
                    for (int i = 0; i < positions.Length - 1; ++i)
                    {
                        if (positions[i] > maxPosition)
                        {
                            maxPosition = positions[i];
                        }
                    }

                    var disjunctLists     = new List <SpanQuery> [maxPosition + 1];
                    int distinctPositions = 0;

                    for (int i = 0; i < termArrays.Count; ++i)
                    {
                        Term[]           termArray = termArrays[i];
                        List <SpanQuery> disjuncts = disjunctLists[positions[i]];
                        if (disjuncts == null)
                        {
                            disjuncts = (disjunctLists[positions[i]] = new List <SpanQuery>(termArray.Length));
                            ++distinctPositions;
                        }
                        for (int j = 0; j < termArray.Length; ++j)
                        {
                            disjuncts.Add(new SpanTermQuery(termArray[j]));
                        }
                    }

                    int         positionGaps = 0;
                    int         position     = 0;
                    SpanQuery[] clauses      = new SpanQuery[distinctPositions];
                    for (int i = 0; i < disjunctLists.Length; ++i)
                    {
                        List <SpanQuery> disjuncts = disjunctLists[i];
                        if (disjuncts != null)
                        {
                            clauses[position++] = new SpanOrQuery(disjuncts.ToArray());
                        }
                        else
                        {
                            ++positionGaps;
                        }
                    }

                    int  slop    = mpq.Slop;
                    bool inorder = (slop == 0);

                    SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
                    sp.Boost = query.Boost;
                    ExtractWeightedSpanTerms(terms, sp);
                }
            }
        }
        public static void Run(string indexDirectory, string query)
        {
            var workspace = Helpers.GetWorkspace(indexDirectory);

            Log.Logger.Information("Querying for {query}...", query);

            Log.Logger.Information("Discovering captions...");
            var captionEntries = workspace.GetCaptions();

            if (captionEntries.Count == 0)
            {
                Log.Logger.Information("No captions are present.");
                Environment.Exit(1);
            }

            var videoCaptions      = workspace.GetCaptions();
            var videosWithCaptions = workspace.GetVideos().Where(x => videoCaptions.ContainsKey(x.Id)).ToList();

            Log.Information("Searching captions for {count} videos...", videosWithCaptions.Count);

            var topic = new TopicSearch();

            topic.Topic = query;

            foreach (var video in videosWithCaptions)
            {
                var captionText = string.Join(" ", captionEntries[video.Id].Select(x => $"[@{x.Start}] {x.Value}"));

                var terms = new List <SpanQuery>();
                foreach (var term in query.Trim().Split(" "))
                {
                    terms.Add(new SpanTermQuery(new Term("content", term)));
                }
                var spanNearQuery = new SpanNearQuery(terms.ToArray(), 25, false);

                var queryScorer = new QueryScorer(spanNearQuery);
                var highlighter = new Highlighter(new MarkerFormatter(), queryScorer)
                {
                    TextFragmenter = new NullFragmenter()
                };
                var tokenStream = new StandardAnalyzer(LuceneVersion.LUCENE_48).GetTokenStream("content", captionText);

                var searchResult = highlighter.GetBestFragment(tokenStream, captionText);

                if (string.IsNullOrEmpty(searchResult))
                {
                    continue;
                }

                var model = new TopicSearch.VideoResult();
                model.Id       = video.Id;
                model.Segments = new List <TopicSearch.VideoResult.Segment>();

                foreach (Match match in Regex.Matches(searchResult, @"\[\@([0-9\.]*)\].+?(?=\[\@[0-9\.]*\]|$)"))
                {
                    var segment = match.Groups[0].Value;
                    if (!segment.Contains("!!!! "))
                    {
                        continue;
                    }

                    segment = segment.Replace("!!!! ", "");
                    segment = Regex.Replace(segment, @"\[\@[0-9\.]*\]", "");
                    segment = segment.Trim();

                    var timeStamp = (int)Math.Max(Math.Floor(decimal.Parse(match.Groups[1].Value)), 0);

                    model.Segments.Add(new TopicSearch.VideoResult.Segment
                    {
                        Text     = segment,
                        Location = timeStamp
                    });
                }

                topic.Results.Add(model);
            }

            Log.Information("Found {total} videos, with {total} segments.", topic.Results.Count, topic.Results.Sum(x => x.Segments.Count));

            Console.WriteLine(JsonConvert.SerializeObject(topic, Formatting.Indented));

            Log.Logger.Information("Done!");
        }
		public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader):this(spanNearQuery, reader, true)
		{
		}
		public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader, bool collectPayloads)
		{
			InitBlock();
			if (spanNearQuery.GetClauses().Length < 2)
			{
				throw new System.ArgumentException("Less than 2 clauses: " + spanNearQuery);
			}
			this.collectPayloads = collectPayloads;
			allowedSlop = spanNearQuery.Slop;
			SpanQuery[] clauses = spanNearQuery.GetClauses();
			subSpans = new Spans[clauses.Length];
			matchPayload = new System.Collections.Generic.List<byte[]>();
			subSpansByDoc = new Spans[clauses.Length];
			for (int i = 0; i < clauses.Length; i++)
			{
				subSpans[i] = clauses[i].GetSpans(reader);
				subSpansByDoc[i] = subSpans[i]; // used in toSameDoc()
			}
			query = spanNearQuery; // kept for toString() only.
		}