/// <summary>
/// Recursively converts <paramref name="query"/> into equivalent <see cref="SpanQuery"/>s
/// and collects the payloads of their matching spans into <paramref name="payloads"/>
/// (via <c>GetPayloads</c>). Query types with no span equivalent are silently skipped.
/// </summary>
private void QueryToSpanQuery(Query query, ICollection<sbyte[]> payloads)
{
    if (query is BooleanQuery)
    {
        // Recurse into every clause that can match; prohibited (MUST_NOT) clauses produce no spans.
        BooleanClause[] queryClauses = ((BooleanQuery)query).Clauses;
        for (int i = 0; i < queryClauses.Length; i++)
        {
            if (!queryClauses[i].Prohibited)
            {
                QueryToSpanQuery(queryClauses[i].Query, payloads);
            }
        }
    }
    else if (query is PhraseQuery)
    {
        // A phrase becomes a SpanNear over one SpanTerm per phrase term.
        Term[] phraseQueryTerms = ((PhraseQuery)query).Terms;
        SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.Length];
        for (int i = 0; i < phraseQueryTerms.Length; i++)
        {
            clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
        }
        int slop = ((PhraseQuery)query).Slop;
        bool inorder = false;
        if (slop == 0)
        {
            // Exact phrases (slop 0) must match in order.
            inorder = true;
        }
        SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
        sp.Boost = query.Boost;
        GetPayloads(payloads, sp);
    }
    else if (query is TermQuery)
    {
        SpanTermQuery stq = new SpanTermQuery(((TermQuery)query).Term);
        stq.Boost = query.Boost;
        GetPayloads(payloads, stq);
    }
    else if (query is SpanQuery)
    {
        // Already a span query - collect its payloads directly.
        GetPayloads(payloads, (SpanQuery)query);
    }
    else if (query is FilteredQuery)
    {
        // Only the wrapped query can contribute span payloads; the filter is ignored here.
        QueryToSpanQuery(((FilteredQuery)query).Query, payloads);
    }
    else if (query is DisjunctionMaxQuery)
    {
        IEnumerator<Query> enumerator = ((DisjunctionMaxQuery)query).GetEnumerator();
        while (enumerator.MoveNext())
        {
            QueryToSpanQuery(enumerator.Current, payloads);
        }
    }
    else if (query is MultiPhraseQuery)
    {
        MultiPhraseQuery mpq = (MultiPhraseQuery)query;
        IList<Term[]> termArrays = mpq.TermArrays;
        int[] positions = mpq.Positions;
        if (positions.Length > 0)
        {
            // Find the largest position so the per-position buckets can be sized.
            int maxPosition = positions[positions.Length - 1];
            for (int i = 0; i < positions.Length - 1; ++i)
            {
                if (positions[i] > maxPosition)
                {
                    maxPosition = positions[i];
                }
            }

            // Group the alternative terms by phrase position; multiple term
            // arrays may map to the same position.
            IList<Query>[] disjunctLists = new List<Query>[maxPosition + 1];
            int distinctPositions = 0;
            for (int i = 0; i < termArrays.Count; ++i)
            {
                Term[] termArray = termArrays[i];
                IList<Query> disjuncts = disjunctLists[positions[i]];
                if (disjuncts == null)
                {
                    disjuncts = (disjunctLists[positions[i]] = new List<Query>(termArray.Length));
                    ++distinctPositions;
                }
                foreach (Term term in termArray)
                {
                    disjuncts.Add(new SpanTermQuery(term));
                }
            }

            // Each occupied position becomes a SpanOr clause; unoccupied
            // positions widen the allowed slop instead.
            int positionGaps = 0;
            int position = 0;
            SpanQuery[] clauses = new SpanQuery[distinctPositions];
            for (int i = 0; i < disjunctLists.Length; ++i)
            {
                IList<Query> disjuncts = disjunctLists[i];
                if (disjuncts != null)
                {
                    clauses[position++] = new SpanOrQuery(disjuncts.OfType<SpanQuery>().ToArray());
                }
                else
                {
                    ++positionGaps;
                }
            }

            int slop = mpq.Slop;
            bool inorder = (slop == 0);
            SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
            sp.Boost = query.Boost;
            GetPayloads(payloads, sp);
        }
    }
}
/// <summary>
/// Recursively converts <paramref name="query"/> into equivalent <see cref="SpanQuery"/>s
/// and collects the payloads of their matching spans into <paramref name="payloads"/>
/// (via <c>GetPayloads</c>). Query types with no span equivalent are silently skipped.
/// </summary>
private void QueryToSpanQuery(Query query, ICollection<byte[]> payloads)
{
    if (query is BooleanQuery booleanQuery)
    {
        // Recurse into every clause that can match; prohibited (MUST_NOT) clauses produce no spans.
        BooleanClause[] queryClauses = booleanQuery.GetClauses();
        for (int i = 0; i < queryClauses.Length; i++)
        {
            if (!queryClauses[i].IsProhibited)
            {
                QueryToSpanQuery(queryClauses[i].Query, payloads);
            }
        }
    }
    else if (query is PhraseQuery phraseQuery)
    {
        // A phrase becomes a SpanNear over one SpanTerm per phrase term.
        Term[] phraseQueryTerms = phraseQuery.GetTerms();
        SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.Length];
        for (int i = 0; i < phraseQueryTerms.Length; i++)
        {
            clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
        }
        int slop = phraseQuery.Slop;
        bool inorder = false;
        if (slop == 0)
        {
            // Exact phrases (slop 0) must match in order.
            inorder = true;
        }
        SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder)
        {
            Boost = query.Boost
        };
        GetPayloads(payloads, sp);
    }
    else if (query is TermQuery termQuery)
    {
        SpanTermQuery stq = new SpanTermQuery(termQuery.Term)
        {
            Boost = query.Boost
        };
        GetPayloads(payloads, stq);
    }
    else if (query is SpanQuery spanQuery)
    {
        // Already a span query - collect its payloads directly.
        GetPayloads(payloads, spanQuery);
    }
    else if (query is FilteredQuery filteredQuery)
    {
        // Only the wrapped query can contribute span payloads; the filter is ignored here.
        QueryToSpanQuery(filteredQuery.Query, payloads);
    }
    else if (query is DisjunctionMaxQuery disjunctionMaxQuery)
    {
        foreach (var q in disjunctionMaxQuery)
        {
            QueryToSpanQuery(q, payloads);
        }
    }
    else if (query is MultiPhraseQuery mpq)
    {
        IList<Term[]> termArrays = mpq.GetTermArrays();
        int[] positions = mpq.GetPositions();
        if (positions.Length > 0)
        {
            // Find the largest position so the per-position buckets can be sized.
            int maxPosition = positions[positions.Length - 1];
            for (int i = 0; i < positions.Length - 1; ++i)
            {
                if (positions[i] > maxPosition)
                {
                    maxPosition = positions[i];
                }
            }

            // LUCENENET: Changed from Query to SpanQuery to eliminate the O(n) cast
            // required to instantiate SpanOrQuery below
            IList<SpanQuery>[] disjunctLists = new JCG.List<SpanQuery>[maxPosition + 1];
            int distinctPositions = 0;
            // Group the alternative terms by phrase position; multiple term
            // arrays may map to the same position.
            for (int i = 0; i < termArrays.Count; ++i)
            {
                Term[] termArray = termArrays[i];
                IList<SpanQuery> disjuncts = disjunctLists[positions[i]]; // LUCENENET: Changed from Query to SpanQuery
                if (disjuncts == null)
                {
                    disjuncts = (disjunctLists[positions[i]] = new JCG.List<SpanQuery>(termArray.Length)); // LUCENENET: Changed from Query to SpanQuery
                    ++distinctPositions;
                }
                foreach (Term term in termArray)
                {
                    disjuncts.Add(new SpanTermQuery(term));
                }
            }

            // Each occupied position becomes a SpanOr clause; unoccupied
            // positions widen the allowed slop instead.
            int positionGaps = 0;
            int position = 0;
            SpanQuery[] clauses = new SpanQuery[distinctPositions];
            for (int i = 0; i < disjunctLists.Length; ++i)
            {
                IList<SpanQuery> disjuncts = disjunctLists[i]; // LUCENENET: Changed from Query to SpanQuery
                if (disjuncts != null)
                {
                    clauses[position++] = new SpanOrQuery(disjuncts);
                }
                else
                {
                    ++positionGaps;
                }
            }

            int slop = mpq.Slop;
            bool inorder = (slop == 0);
            SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
            sp.Boost = query.Boost;
            GetPayloads(payloads, sp);
        }
    }
}
/// <summary>
/// Verifies that payloads are collected for spans that start at position 0:
/// first checks the raw term positions of "a", then counts payloads through a
/// payload-aware Spans, a plain Spans, and PayloadSpanUtil, asserting in each
/// pass that a match starting at position 0 was observed.
/// </summary>
public virtual void TestPayloadsPos0()
{
    Directory dir = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, new MockPayloadAnalyzer());
    Document doc = new Document();
    doc.Add(new TextField("content", new StringReader("a a b c d e a f g h i j a b k k")));
    writer.AddDocument(doc);
    IndexReader readerFromWriter = writer.Reader;
    AtomicReader r = SlowCompositeReaderWrapper.Wrap(readerFromWriter);

    DocsAndPositionsEnum tp = r.TermPositionsEnum(new Term("content", "a"));
    int count = 0;
    Assert.IsTrue(tp.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    // "a" occurs 4 times
    Assert.AreEqual(4, tp.Freq());
    Assert.AreEqual(0, tp.NextPosition());
    Assert.AreEqual(1, tp.NextPosition());
    Assert.AreEqual(3, tp.NextPosition());
    Assert.AreEqual(6, tp.NextPosition());
    // only one doc has "a"
    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, tp.NextDoc());

    IndexSearcher @is = NewSearcher(readerFromWriter);
    SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
    SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
    SpanQuery[] sqs = new SpanQuery[] { stq1, stq2 };
    SpanNearQuery snq = new SpanNearQuery(sqs, 30, false);

    count = 0;
    bool sawZero = false;
    if (VERBOSE)
    {
        Console.WriteLine("\ngetPayloadSpans test");
    }
    // Pass 1: payload-aware spans - count payloads across all matches.
    Search.Spans.Spans pspans = MultiSpansWrapper.Wrap(@is.TopReaderContext, snq);
    while (pspans.Next())
    {
        if (VERBOSE)
        {
            Console.WriteLine("doc " + pspans.Doc() + ": span " + pspans.Start() + " to " + pspans.End());
        }
        var payloads = pspans.Payload;
        sawZero |= pspans.Start() == 0;
        foreach (var bytes in payloads)
        {
            count++;
            if (VERBOSE)
            {
                // Payloads are sbyte[]; reinterpret as byte[] for UTF-8 decoding.
                Console.WriteLine(" payload: " + Encoding.UTF8.GetString((byte[])(Array)bytes));
            }
        }
    }
    Assert.IsTrue(sawZero);
    Assert.AreEqual(5, count);

    // Pass 2: plain spans - count matches only.
    // System.out.println("\ngetSpans test");
    Search.Spans.Spans spans = MultiSpansWrapper.Wrap(@is.TopReaderContext, snq);
    count = 0;
    sawZero = false;
    while (spans.Next())
    {
        count++;
        sawZero |= spans.Start() == 0;
        // System.out.println(spans.Doc() + " - " + spans.Start() + " - " + spans.End());
    }
    Assert.AreEqual(4, count);
    Assert.IsTrue(sawZero);

    // Pass 3: PayloadSpanUtil - collect payloads for the whole query.
    // System.out.println("\nPayloadSpanUtil test");
    sawZero = false;
    PayloadSpanUtil psu = new PayloadSpanUtil(@is.TopReaderContext);
    var pls = psu.GetPayloadsForQuery(snq);
    count = pls.Count;
    foreach (var bytes in pls)
    {
        string s = Encoding.UTF8.GetString(bytes);
        //System.out.println(s);
        sawZero |= s.Equals("pos: 0");
    }
    Assert.AreEqual(5, count);
    Assert.IsTrue(sawZero);

    writer.Dispose();
    @is.IndexReader.Dispose();
    dir.Dispose();
}
/// <summary>
/// Rewrites the parsed phrase contents into an equivalent span query: a plain
/// <see cref="SpanNearQuery"/> when every clause is positive, or a
/// <see cref="SpanNotQuery"/> when the phrase mixes positive and negated clauses.
/// </summary>
public override Query Rewrite(IndexReader reader)
{
    // A lone term needs no span wrapping.
    if (contents is TermQuery)
    {
        return contents;
    }

    if (!(contents is BooleanQuery))
    {
        throw new ArgumentException("Unknown query type \"" + contents.GetType().Name
            + "\" found in phrase query string \"" + phrasedQueryStringContents + "\"");
    }

    // Build a sequence of span clauses arranged in a SpanNear - child clauses
    // can be complex booleans (e.g. nots and ors). Count the negated ones.
    int numNegatives = 0;
    BooleanQuery booleanContents = (BooleanQuery)contents;
    BooleanClause[] booleanClauses = booleanContents.GetClauses();
    SpanQuery[] allSpanClauses = new SpanQuery[booleanClauses.Length];
    for (int clauseIndex = 0; clauseIndex < booleanClauses.Length; clauseIndex++)
    {
        // Rewrite each clause first, e.g. one* becomes (one OR onerous).
        Query rewrittenClause = booleanClauses[clauseIndex].Query.Rewrite(reader);
        if (booleanClauses[clauseIndex].Occur.Equals(Occur.MUST_NOT))
        {
            numNegatives++;
        }

        if (rewrittenClause is BooleanQuery)
        {
            List<SpanQuery> spanClauses = new List<SpanQuery>();
            AddComplexPhraseClause(spanClauses, (BooleanQuery)rewrittenClause);
            if (spanClauses.Count > 0)
            {
                allSpanClauses[clauseIndex] = spanClauses[0];
            }
            else
            {
                // Insert fake term e.g. phrase query was for "Fred Smithe*" and
                // there were no "Smithe*" terms - need to prevent match on just "Fred".
                allSpanClauses[clauseIndex] = new SpanTermQuery(new Term(field,
                    "Dummy clause because no terms found - must match nothing"));
            }
        }
        else if (rewrittenClause is TermQuery)
        {
            allSpanClauses[clauseIndex] = new SpanTermQuery(((TermQuery)rewrittenClause).Term);
        }
        else
        {
            throw new ArgumentException("Unknown query type \"" + rewrittenClause.GetType().Name
                + "\" found in phrase query string \"" + phrasedQueryStringContents + "\"");
        }
    }

    if (numNegatives == 0)
    {
        // The simple case - no negative elements in phrase.
        return new SpanNearQuery(allSpanClauses, slopFactor, inOrder);
    }

    // Complex case - positives and negatives mixed in the sequence: match the
    // positive terms but exclude any match of the full sequence.
    List<SpanQuery> positiveClauses = new List<SpanQuery>();
    for (int clauseIndex = 0; clauseIndex < allSpanClauses.Length; clauseIndex++)
    {
        if (!booleanClauses[clauseIndex].Occur.Equals(Occur.MUST_NOT))
        {
            positiveClauses.Add(allSpanClauses[clauseIndex]);
        }
    }

    SpanQuery[] includeClauses = positiveClauses.ToArray();
    SpanQuery include = includeClauses.Length == 1
        ? includeClauses[0] // only one positive clause
        // widen the slop to cover the gaps introduced by negatives
        : new SpanNearQuery(includeClauses, slopFactor + numNegatives, inOrder);

    // Use sequence of positive and negative values as the exclude.
    SpanNearQuery exclude = new SpanNearQuery(allSpanClauses, slopFactor, inOrder);
    return new SpanNotQuery(include, exclude);
}
/// <summary>
/// Fills a <see cref="T:IDictionary{string, WeightedSpanTerm}"/> with <see cref="WeightedSpanTerm"/>s using the terms from the supplied <paramref name="query"/>.
/// </summary>
/// <param name="query"><see cref="Query"/> to extract Terms from</param>
/// <param name="terms">Map to place created <see cref="WeightedSpanTerm"/>s in</param>
/// <exception cref="System.IO.IOException">If there is a low-level I/O error</exception>
protected virtual void Extract(Query query, IDictionary<string, WeightedSpanTerm> terms)
{
    if (query is BooleanQuery)
    {
        // Recurse into every clause that can match; prohibited (MUST_NOT) clauses contribute no terms.
        IList<BooleanClause> queryClauses = ((BooleanQuery)query).Clauses;
        for (int i = 0; i < queryClauses.Count; i++)
        {
            if (!queryClauses[i].IsProhibited)
            {
                Extract(queryClauses[i].Query, terms);
            }
        }
    }
    else if (query is PhraseQuery)
    {
        // Turn the phrase into a SpanNear over its terms so positions can be highlighted.
        PhraseQuery phraseQuery = (PhraseQuery)query;
        Term[] phraseQueryTerms = phraseQuery.GetTerms();
        SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.Length];
        for (int i = 0; i < phraseQueryTerms.Length; i++)
        {
            clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
        }
        int slop = phraseQuery.Slop;
        int[] positions = phraseQuery.GetPositions();
        // add largest position increment to slop
        if (positions.Length > 0)
        {
            int lastPos = positions[0];
            int largestInc = 0;
            int sz = positions.Length;
            for (int i = 1; i < sz; i++)
            {
                int pos = positions[i];
                int inc = pos - lastPos;
                if (inc > largestInc)
                {
                    largestInc = inc;
                }
                lastPos = pos;
            }
            if (largestInc > 1)
            {
                slop += largestInc;
            }
        }
        bool inorder = slop == 0;
        SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
        sp.Boost = query.Boost;
        ExtractWeightedSpanTerms(terms, sp);
    }
    else if (query is TermQuery)
    {
        ExtractWeightedTerms(terms, query);
    }
    else if (query is SpanQuery)
    {
        ExtractWeightedSpanTerms(terms, (SpanQuery)query);
    }
    else if (query is FilteredQuery)
    {
        Extract(((FilteredQuery)query).Query, terms);
    }
    else if (query is ConstantScoreQuery)
    {
        Query q = ((ConstantScoreQuery)query).Query;
        if (q != null)
        {
            Extract(q, terms);
        }
    }
    else if (query is CommonTermsQuery)
    {
        // specialized since rewriting would change the result query
        // this query is TermContext sensitive.
        ExtractWeightedTerms(terms, query);
    }
    else if (query is DisjunctionMaxQuery)
    {
        foreach (var q in ((DisjunctionMaxQuery)query))
        {
            Extract(q, terms);
        }
    }
    else if (query is MultiPhraseQuery)
    {
        MultiPhraseQuery mpq = (MultiPhraseQuery)query;
        IList<Term[]> termArrays = mpq.GetTermArrays();
        int[] positions = mpq.GetPositions();
        if (positions.Length > 0)
        {
            // Find the largest position so the per-position buckets can be sized.
            int maxPosition = positions[positions.Length - 1];
            for (int i = 0; i < positions.Length - 1; ++i)
            {
                if (positions[i] > maxPosition)
                {
                    maxPosition = positions[i];
                }
            }

            // Group the alternative terms by phrase position; multiple term
            // arrays may map to the same position.
            var disjunctLists = new List<SpanQuery>[maxPosition + 1];
            int distinctPositions = 0;
            for (int i = 0; i < termArrays.Count; ++i)
            {
                Term[] termArray = termArrays[i];
                List<SpanQuery> disjuncts = disjunctLists[positions[i]];
                if (disjuncts == null)
                {
                    disjuncts = (disjunctLists[positions[i]] = new List<SpanQuery>(termArray.Length));
                    ++distinctPositions;
                }
                foreach (var term in termArray)
                {
                    disjuncts.Add(new SpanTermQuery(term));
                }
            }

            // Each occupied position becomes a SpanOr clause; unoccupied
            // positions widen the allowed slop instead.
            int positionGaps = 0;
            int position = 0;
            SpanQuery[] clauses = new SpanQuery[distinctPositions];
            foreach (var disjuncts in disjunctLists)
            {
                if (disjuncts != null)
                {
                    clauses[position++] = new SpanOrQuery(disjuncts.ToArray());
                }
                else
                {
                    ++positionGaps;
                }
            }

            int slop = mpq.Slop;
            bool inorder = (slop == 0);
            SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
            sp.Boost = query.Boost;
            ExtractWeightedSpanTerms(terms, sp);
        }
    }
    else
    {
        Query origQuery = query;
        if (query is MultiTermQuery)
        {
            if (!expandMultiTermQuery)
            {
                return;
            }
            // Expand e.g. wildcard/fuzzy queries into concrete terms via a boolean rewrite,
            // working on a clone so the caller's query is untouched.
            MultiTermQuery copy = (MultiTermQuery)query.Clone();
            copy.MultiTermRewriteMethod = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE;
            origQuery = copy;
        }
        IndexReader reader = GetLeafContext().Reader;
        Query rewritten = origQuery.Rewrite(reader);
        if (rewritten != origQuery)
        {
            // only rewrite once and then flatten again - the rewritten query could have a special treatment
            // if this method is overridden in a subclass or above in the next recursion
            Extract(rewritten, terms);
        }
    }
    ExtractUnknownQuery(query, terms);
}
/// <summary>
/// Visitation of <see cref="SpanNearQuery"/> is not implemented by this visitor.
/// </summary>
public virtual Query VisitSpanNearQuery(SpanNearQuery spanNearq) => throw new NotImplementedException();
/// <summary>
/// Sanity check for QueryRescorer: an initial BooleanQuery ranking is
/// reordered by rescoring, first with a sloppy PhraseQuery and then with an
/// in-order SpanNearQuery, both of which favor the doc with adjacent terms.
/// </summary>
public virtual void TestBasic()
{
    Directory directory = NewDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(Random(), directory);

    // Doc 0: "wizard" and "oz" are far apart.
    Document document = new Document();
    document.Add(NewStringField("id", "0", Field.Store.YES));
    document.Add(NewTextField("field", "wizard the the the the the oz", Field.Store.NO));
    indexWriter.AddDocument(document);

    // Doc 1: one extra token overall, but "wizard" and "oz" are adjacent.
    document = new Document();
    document.Add(NewStringField("id", "1", Field.Store.YES));
    document.Add(NewTextField("field", "wizard oz the the the the the the", Field.Store.NO));
    indexWriter.AddDocument(document);

    IndexReader reader = indexWriter.Reader;
    indexWriter.Dispose();

    // Ordinary BooleanQuery ranks doc 0 first.
    BooleanQuery booleanQuery = new BooleanQuery();
    booleanQuery.Add(new TermQuery(new Term("field", "wizard")), Occur.SHOULD);
    booleanQuery.Add(new TermQuery(new Term("field", "oz")), Occur.SHOULD);
    IndexSearcher searcher = GetSearcher(reader);
    searcher.Similarity = new DefaultSimilarity();

    TopDocs hits = searcher.Search(booleanQuery, 10);
    Assert.AreEqual(2, hits.TotalHits);
    Assert.AreEqual("0", searcher.Doc(hits.ScoreDocs[0].Doc).Get("id"));
    Assert.AreEqual("1", searcher.Doc(hits.ScoreDocs[1].Doc).Get("id"));

    // Rescoring with a sloppy PhraseQuery flips the order.
    PhraseQuery phraseQuery = new PhraseQuery();
    phraseQuery.Slop = 5;
    phraseQuery.Add(new Term("field", "wizard"));
    phraseQuery.Add(new Term("field", "oz"));
    TopDocs rescoredByPhrase = QueryRescorer.Rescore(searcher, hits, phraseQuery, 2.0, 10);
    Assert.AreEqual(2, rescoredByPhrase.TotalHits);
    Assert.AreEqual("1", searcher.Doc(rescoredByPhrase.ScoreDocs[0].Doc).Get("id"));
    Assert.AreEqual("0", searcher.Doc(rescoredByPhrase.ScoreDocs[1].Doc).Get("id"));

    // Rescoring with an exact, in-order SpanNearQuery also flips the order.
    SpanTermQuery wizardSpan = new SpanTermQuery(new Term("field", "wizard"));
    SpanTermQuery ozSpan = new SpanTermQuery(new Term("field", "oz"));
    SpanNearQuery nearQuery = new SpanNearQuery(new SpanQuery[] { wizardSpan, ozSpan }, 0, true);
    TopDocs rescoredBySpan = QueryRescorer.Rescore(searcher, hits, nearQuery, 2.0, 10);
    Assert.AreEqual(2, rescoredBySpan.TotalHits);
    Assert.AreEqual("1", searcher.Doc(rescoredBySpan.ScoreDocs[0].Doc).Get("id"));
    Assert.AreEqual("0", searcher.Doc(rescoredBySpan.ScoreDocs[1].Doc).Get("id"));

    reader.Dispose();
    directory.Dispose();
}
/// <summary>
/// Translates <paramref name="query"/> into equivalent span queries and gathers
/// the payloads of their matching spans into <paramref name="payloads"/>
/// (via <c>GetPayloads</c>). Query types with no span equivalent are skipped.
/// </summary>
private void QueryToSpanQuery(Query query, ICollection<byte[]> payloads)
{
    if (query is BooleanQuery booleanQuery)
    {
        // Only non-prohibited clauses can produce matching spans.
        foreach (BooleanClause clause in booleanQuery.GetClauses())
        {
            if (!clause.IsProhibited)
            {
                QueryToSpanQuery(clause.Query, payloads);
            }
        }
    }
    else if (query is PhraseQuery phraseQuery)
    {
        // A phrase becomes a SpanNear over one SpanTerm per phrase term.
        Term[] phraseTerms = phraseQuery.GetTerms();
        SpanQuery[] spanClauses = new SpanQuery[phraseTerms.Length];
        for (int termIndex = 0; termIndex < phraseTerms.Length; termIndex++)
        {
            spanClauses[termIndex] = new SpanTermQuery(phraseTerms[termIndex]);
        }
        int slop = phraseQuery.Slop;
        bool inOrder = slop == 0; // exact phrases must match in order
        SpanNearQuery near = new SpanNearQuery(spanClauses, slop, inOrder);
        near.Boost = query.Boost;
        GetPayloads(payloads, near);
    }
    else if (query is TermQuery termQuery)
    {
        SpanTermQuery spanTerm = new SpanTermQuery(termQuery.Term);
        spanTerm.Boost = query.Boost;
        GetPayloads(payloads, spanTerm);
    }
    else if (query is SpanQuery spanQuery)
    {
        // Already a span query - collect its payloads directly.
        GetPayloads(payloads, spanQuery);
    }
    else if (query is FilteredQuery filteredQuery)
    {
        QueryToSpanQuery(filteredQuery.Query, payloads);
    }
    else if (query is DisjunctionMaxQuery disjunctionMax)
    {
        foreach (Query disjunct in disjunctionMax)
        {
            QueryToSpanQuery(disjunct, payloads);
        }
    }
    else if (query is MultiPhraseQuery multiPhrase)
    {
        IList<Term[]> termArrays = multiPhrase.GetTermArrays();
        int[] positions = multiPhrase.GetPositions();
        if (positions.Length > 0)
        {
            // Size the per-position buckets by the highest position used.
            int maxPosition = positions[positions.Length - 1];
            for (int posIndex = 0; posIndex < positions.Length - 1; ++posIndex)
            {
                if (positions[posIndex] > maxPosition)
                {
                    maxPosition = positions[posIndex];
                }
            }

            // Bucket the alternative terms by position; several term arrays may share one.
            IList<Query>[] disjunctLists = new List<Query>[maxPosition + 1];
            int distinctPositions = 0;
            for (int arrayIndex = 0; arrayIndex < termArrays.Count; ++arrayIndex)
            {
                Term[] termArray = termArrays[arrayIndex];
                IList<Query> disjuncts = disjunctLists[positions[arrayIndex]];
                if (disjuncts == null)
                {
                    disjuncts = (disjunctLists[positions[arrayIndex]] = new List<Query>(termArray.Length));
                    ++distinctPositions;
                }
                foreach (Term term in termArray)
                {
                    disjuncts.Add(new SpanTermQuery(term));
                }
            }

            // Each occupied position becomes a SpanOr; empty positions widen the slop.
            int positionGaps = 0;
            int position = 0;
            SpanQuery[] spanClauses = new SpanQuery[distinctPositions];
            foreach (IList<Query> disjuncts in disjunctLists)
            {
                if (disjuncts != null)
                {
                    spanClauses[position++] = new SpanOrQuery(disjuncts.OfType<SpanQuery>().ToArray());
                }
                else
                {
                    ++positionGaps;
                }
            }

            int slop = multiPhrase.Slop;
            bool inOrder = slop == 0;
            SpanNearQuery near = new SpanNearQuery(spanClauses, slop + positionGaps, inOrder);
            near.Boost = query.Boost;
            GetPayloads(payloads, near);
        }
    }
}
/// <summary>
/// Verifies that payloads are collected for spans that start at position 0:
/// first checks the raw term positions of "a", then counts payloads through a
/// payload-aware Spans, a plain Spans, and PayloadSpanUtil, asserting in each
/// pass that a match starting at position 0 was observed.
/// </summary>
public virtual void TestPayloadsPos0()
{
    Directory dir = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir, new MockPayloadAnalyzer());
    Document doc = new Document();
    doc.Add(new TextField("content", new StringReader("a a b c d e a f g h i j a b k k")));
    writer.AddDocument(doc);
    IndexReader readerFromWriter = writer.GetReader();
    AtomicReader r = SlowCompositeReaderWrapper.Wrap(readerFromWriter);

    DocsAndPositionsEnum tp = r.GetTermPositionsEnum(new Term("content", "a"));
    int count = 0;
    Assert.IsTrue(tp.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    // "a" occurs 4 times
    Assert.AreEqual(4, tp.Freq);
    Assert.AreEqual(0, tp.NextPosition());
    Assert.AreEqual(1, tp.NextPosition());
    Assert.AreEqual(3, tp.NextPosition());
    Assert.AreEqual(6, tp.NextPosition());
    // only one doc has "a"
    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, tp.NextDoc());

    IndexSearcher @is = NewSearcher(readerFromWriter);
    SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
    SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
    SpanQuery[] sqs = new SpanQuery[] { stq1, stq2 };
    SpanNearQuery snq = new SpanNearQuery(sqs, 30, false);

    count = 0;
    bool sawZero = false;
    if (Verbose)
    {
        Console.WriteLine("\ngetPayloadSpans test");
    }
    // Pass 1: payload-aware spans - count payloads across all matches.
    Search.Spans.Spans pspans = MultiSpansWrapper.Wrap(@is.TopReaderContext, snq);
    while (pspans.MoveNext())
    {
        if (Verbose)
        {
            Console.WriteLine("doc " + pspans.Doc + ": span " + pspans.Start + " to " + pspans.End);
        }
        var payloads = pspans.GetPayload();
        sawZero |= pspans.Start == 0;
        foreach (var bytes in payloads)
        {
            count++;
            if (Verbose)
            {
                Console.WriteLine(" payload: " + Encoding.UTF8.GetString(bytes));
            }
        }
    }
    Assert.IsTrue(sawZero);
    Assert.AreEqual(5, count);

    // Pass 2: plain spans - count matches only.
    // System.out.println("\ngetSpans test");
    Search.Spans.Spans spans = MultiSpansWrapper.Wrap(@is.TopReaderContext, snq);
    count = 0;
    sawZero = false;
    while (spans.MoveNext())
    {
        count++;
        sawZero |= spans.Start == 0;
        // System.out.println(spans.Doc() + " - " + spans.Start() + " - " + spans.End());
    }
    Assert.AreEqual(4, count);
    Assert.IsTrue(sawZero);

    // Pass 3: PayloadSpanUtil - collect payloads for the whole query.
    // System.out.println("\nPayloadSpanUtil test");
    sawZero = false;
    PayloadSpanUtil psu = new PayloadSpanUtil(@is.TopReaderContext);
    var pls = psu.GetPayloadsForQuery(snq);
    count = pls.Count;
    foreach (var bytes in pls)
    {
        string s = Encoding.UTF8.GetString(bytes);
        //System.out.println(s);
        sawZero |= s.Equals("pos: 0", StringComparison.Ordinal);
    }
    Assert.AreEqual(5, count);
    Assert.IsTrue(sawZero);

    writer.Dispose();
    @is.IndexReader.Dispose();
    dir.Dispose();
}
/// <summary>
/// Visitation of <see cref="SpanNearQuery"/> is not supported by this visitor.
/// </summary>
public virtual Query VisitSpanNearQuery(SpanNearQuery spanNearq) => throw new SnNotSupportedException();
/// <summary>
/// Fills a <c>Map</c> with <see cref="WeightedSpanTerm"/>s using the terms from the supplied <c>Query</c>.
/// </summary>
/// <param name="query">Query to extract Terms from</param>
/// <param name="terms">Map to place created WeightedSpanTerms in</param>
private void Extract(Query query, IDictionary<String, WeightedSpanTerm> terms)
{
    if (query is BooleanQuery)
    {
        // Recurse into every clause that can match; prohibited (MUST_NOT) clauses contribute no terms.
        BooleanClause[] queryClauses = ((BooleanQuery)query).GetClauses();
        for (int i = 0; i < queryClauses.Length; i++)
        {
            if (!queryClauses[i].IsProhibited)
            {
                Extract(queryClauses[i].Query, terms);
            }
        }
    }
    else if (query is PhraseQuery)
    {
        // Turn the phrase into a SpanNear over its terms so positions can be highlighted.
        PhraseQuery phraseQuery = ((PhraseQuery)query);
        Term[] phraseQueryTerms = phraseQuery.GetTerms();
        SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.Length];
        for (int i = 0; i < phraseQueryTerms.Length; i++)
        {
            clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
        }
        int slop = phraseQuery.Slop;
        int[] positions = phraseQuery.GetPositions();
        // add largest position increment to slop
        if (positions.Length > 0)
        {
            int lastPos = positions[0];
            int largestInc = 0;
            int sz = positions.Length;
            for (int i = 1; i < sz; i++)
            {
                int pos = positions[i];
                int inc = pos - lastPos;
                if (inc > largestInc)
                {
                    largestInc = inc;
                }
                lastPos = pos;
            }
            if (largestInc > 1)
            {
                slop += largestInc;
            }
        }
        bool inorder = slop == 0;
        SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
        sp.Boost = query.Boost;
        ExtractWeightedSpanTerms(terms, sp);
    }
    else if (query is TermQuery)
    {
        ExtractWeightedTerms(terms, query);
    }
    else if (query is SpanQuery)
    {
        ExtractWeightedSpanTerms(terms, (SpanQuery)query);
    }
    else if (query is FilteredQuery)
    {
        Extract(((FilteredQuery)query).Query, terms);
    }
    else if (query is DisjunctionMaxQuery)
    {
        foreach (var q in ((DisjunctionMaxQuery)query))
        {
            Extract(q, terms);
        }
    }
    else if (query is MultiTermQuery && expandMultiTermQuery)
    {
        // Expand e.g. wildcard/fuzzy queries into concrete terms via a boolean rewrite,
        // cloning first so the caller's query is untouched.
        MultiTermQuery mtq = ((MultiTermQuery)query);
        if (mtq.RewriteMethod != MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
        {
            mtq = (MultiTermQuery)mtq.Clone();
            mtq.RewriteMethod = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE;
            query = mtq;
        }
        // The fake reader records which field the rewrite touches so the
        // real per-field reader can be used for the actual expansion.
        FakeReader fReader = new FakeReader();
        MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE.Rewrite(fReader, mtq);
        if (fReader.Field != null)
        {
            IndexReader ir = GetReaderForField(fReader.Field);
            Extract(query.Rewrite(ir), terms);
        }
    }
    else if (query is MultiPhraseQuery)
    {
        MultiPhraseQuery mpq = (MultiPhraseQuery)query;
        IList<Term[]> termArrays = mpq.GetTermArrays();
        int[] positions = mpq.GetPositions();
        if (positions.Length > 0)
        {
            // Find the largest position so the per-position buckets can be sized.
            int maxPosition = positions[positions.Length - 1];
            for (int i = 0; i < positions.Length - 1; ++i)
            {
                if (positions[i] > maxPosition)
                {
                    maxPosition = positions[i];
                }
            }

            // Group the alternative terms by phrase position; multiple term
            // arrays may map to the same position.
            var disjunctLists = new List<SpanQuery>[maxPosition + 1];
            int distinctPositions = 0;
            for (int i = 0; i < termArrays.Count; ++i)
            {
                Term[] termArray = termArrays[i];
                List<SpanQuery> disjuncts = disjunctLists[positions[i]];
                if (disjuncts == null)
                {
                    disjuncts = (disjunctLists[positions[i]] = new List<SpanQuery>(termArray.Length));
                    ++distinctPositions;
                }
                for (int j = 0; j < termArray.Length; ++j)
                {
                    disjuncts.Add(new SpanTermQuery(termArray[j]));
                }
            }

            // Each occupied position becomes a SpanOr clause; unoccupied
            // positions widen the allowed slop instead.
            int positionGaps = 0;
            int position = 0;
            SpanQuery[] clauses = new SpanQuery[distinctPositions];
            for (int i = 0; i < disjunctLists.Length; ++i)
            {
                List<SpanQuery> disjuncts = disjunctLists[i];
                if (disjuncts != null)
                {
                    clauses[position++] = new SpanOrQuery(disjuncts.ToArray());
                }
                else
                {
                    ++positionGaps;
                }
            }

            int slop = mpq.Slop;
            bool inorder = (slop == 0);
            SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
            sp.Boost = query.Boost;
            ExtractWeightedSpanTerms(terms, sp);
        }
    }
}
/// <summary>
/// Searches the captions of every indexed video for <paramref name="query"/>,
/// using a SpanNearQuery (all terms within 25 positions, any order) plus the
/// highlighter to locate matching caption segments, and prints the results as
/// indented JSON on stdout. Exits the process with code 1 when no captions exist.
/// </summary>
/// <param name="indexDirectory">Workspace/index directory to search.</param>
/// <param name="query">Whitespace-separated terms to look for.</param>
public static void Run(string indexDirectory, string query)
{
    var workspace = Helpers.GetWorkspace(indexDirectory);
    Log.Logger.Information("Querying for {query}...", query);

    Log.Logger.Information("Discovering captions...");
    var captionEntries = workspace.GetCaptions();
    if (captionEntries.Count == 0)
    {
        Log.Logger.Information("No captions are present.");
        Environment.Exit(1);
    }

    // BUGFIX: the captions were fetched a second time into a separate variable;
    // reuse the collection already loaded above.
    var videosWithCaptions = workspace.GetVideos().Where(x => captionEntries.ContainsKey(x.Id)).ToList();
    Log.Information("Searching captions for {count} videos...", videosWithCaptions.Count);

    var topic = new TopicSearch();
    topic.Topic = query;

    // The query does not change per video, so build it (and the highlighter) once.
    // RemoveEmptyEntries guards against empty terms from repeated spaces.
    var terms = new List<SpanQuery>();
    foreach (var term in query.Trim().Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
    {
        terms.Add(new SpanTermQuery(new Term("content", term)));
    }
    var spanNearQuery = new SpanNearQuery(terms.ToArray(), 25, false);
    var queryScorer = new QueryScorer(spanNearQuery);
    var highlighter = new Highlighter(new MarkerFormatter(), queryScorer)
    {
        TextFragmenter = new NullFragmenter()
    };

    // BUGFIX: StandardAnalyzer is IDisposable and was created once per video
    // without ever being disposed; create it once and dispose it deterministically.
    using (var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48))
    {
        foreach (var video in videosWithCaptions)
        {
            // Prefix every caption entry with its start time so matches can be
            // mapped back to a timestamp after highlighting.
            var captionText = string.Join(" ", captionEntries[video.Id].Select(x => $"[@{x.Start}] {x.Value}"));
            var tokenStream = analyzer.GetTokenStream("content", captionText);
            var searchResult = highlighter.GetBestFragment(tokenStream, captionText);
            if (string.IsNullOrEmpty(searchResult))
            {
                continue;
            }

            var model = new TopicSearch.VideoResult();
            model.Id = video.Id;
            model.Segments = new List<TopicSearch.VideoResult.Segment>();

            // Each "[@time] ..." run is one candidate segment; keep only the
            // ones the highlighter marked ("!!!! " is the marker prefix).
            foreach (Match match in Regex.Matches(searchResult, @"\[\@([0-9\.]*)\].+?(?=\[\@[0-9\.]*\]|$)"))
            {
                var segment = match.Groups[0].Value;
                if (!segment.Contains("!!!! "))
                {
                    continue;
                }
                segment = segment.Replace("!!!! ", "");
                segment = Regex.Replace(segment, @"\[\@[0-9\.]*\]", "");
                segment = segment.Trim();

                // BUGFIX: parse the timestamp with the invariant culture so a
                // value like "12.5" works regardless of the machine's decimal separator.
                var timeStamp = (int)Math.Max(
                    Math.Floor(decimal.Parse(match.Groups[1].Value, System.Globalization.CultureInfo.InvariantCulture)),
                    0);
                model.Segments.Add(new TopicSearch.VideoResult.Segment { Text = segment, Location = timeStamp });
            }

            topic.Results.Add(model);
        }
    }

    // BUGFIX: the message template used the placeholder {total} twice for two
    // different values; Serilog requires unique property names per template.
    Log.Information("Found {videoCount} videos, with {segmentCount} segments.",
        topic.Results.Count, topic.Results.Sum(x => x.Segments.Count));
    Console.WriteLine(JsonConvert.SerializeObject(topic, Formatting.Indented));
    Log.Logger.Information("Done!");
}
/// <summary>
/// Creates an ordered near-spans iterator for <paramref name="spanNearQuery"/>
/// over <paramref name="reader"/>, with payload collection enabled by default.
/// </summary>
public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader) : this(spanNearQuery, reader, true)
{
}
/// <summary>
/// Creates an ordered near-spans iterator over the clauses of
/// <paramref name="spanNearQuery"/>, enforcing the query's slop.
/// </summary>
/// <param name="spanNearQuery">Query to iterate; must have at least 2 clauses.</param>
/// <param name="reader">Reader supplying the per-clause spans.</param>
/// <param name="collectPayloads">Whether payloads of matching sub-spans are gathered into <c>matchPayload</c>.</param>
/// <exception cref="System.ArgumentException">If the query has fewer than 2 clauses.</exception>
public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader, bool collectPayloads)
{
    InitBlock();
    if (spanNearQuery.GetClauses().Length < 2)
    {
        throw new System.ArgumentException("Less than 2 clauses: " + spanNearQuery);
    }
    this.collectPayloads = collectPayloads;
    allowedSlop = spanNearQuery.Slop;
    SpanQuery[] clauses = spanNearQuery.GetClauses();
    subSpans = new Spans[clauses.Length];
    matchPayload = new System.Collections.Generic.List<byte[]>();
    subSpansByDoc = new Spans[clauses.Length];
    for (int i = 0; i < clauses.Length; i++)
    {
        // One sub-spans per clause; subSpansByDoc shares the same instances.
        subSpans[i] = clauses[i].GetSpans(reader);
        subSpansByDoc[i] = subSpans[i]; // used in toSameDoc()
    }
    query = spanNearQuery; // kept for toString() only.
}