static private LNS.Query GetDateRangeQuery (QueryPart_DateRange part, out HitFilter hit_filter)
{
	string field_name;
	if (part.Key == QueryPart_DateRange.AllPropertiesKey)
		field_name = TypeToWildcardField (PropertyType.Date);
	else if (part.Key == QueryPart_DateRange.TimestampKey)
		field_name = "Timestamp";
	else
		field_name = PropertyToFieldName (PropertyType.Date, part.Key);

	// FIXME: We could optimize this and reduce the size of our range
	// queries if we actually knew the min and max dates that appear in
	// any properties in the index.  We would need to inspect the index to
	// determine that at start-up, and then track it as new documents
	// get added to the index.
	if (part.StartDate < lower_bound)
		part.StartDate = lower_bound;
	if (part.EndDate > upper_bound || part.EndDate == DateTime.MinValue)
		part.EndDate = upper_bound;

	// Swap the start and end dates if they come in reversed.
	if (part.StartDate > part.EndDate) {
		DateTime swap;
		swap = part.StartDate;
		part.StartDate = part.EndDate;
		part.EndDate = swap;
	}

	// Set up our hit filter to cull out the bad dates.
	DateRangeHitFilter drhf;
	drhf = new DateRangeHitFilter ();
	drhf.Key = part.Key;
	drhf.StartDate = part.StartDate;
	drhf.EndDate = part.EndDate;
	hit_filter = new HitFilter (drhf.HitFilter);

	Logger.Log.Debug ("Building new date range query");
	Logger.Log.Debug ("Start: {0}", part.StartDate);
	Logger.Log.Debug ("End: {0}", part.EndDate);

	int y1, m1, d1, y2, m2, d2;
	y1 = part.StartDate.Year;
	m1 = part.StartDate.Month;
	d1 = part.StartDate.Day;
	y2 = part.EndDate.Year;
	m2 = part.EndDate.Month;
	d2 = part.EndDate.Day;

	LNS.BooleanQuery top_level_query;
	top_level_query = new LNS.BooleanQuery ();

	// A special case: both the start and the end of our range fall
	// in the same month.
	if (y1 == y2 && m1 == m2) {
		LNS.Query ym_query;
		ym_query = NewYearMonthQuery (field_name, y1, m1);

		// If our range only covers a part of the month, do a range query on the days.
		if (d1 != 1 || d2 != DateTime.DaysInMonth (y2, m2)) {
			LNS.BooleanQuery sub_query;
			sub_query = new LNS.BooleanQuery ();
			sub_query.Add (ym_query, LNS.BooleanClause.Occur.MUST);
			sub_query.Add (NewDayQuery (field_name, d1, d2), LNS.BooleanClause.Occur.MUST);
			top_level_query.Add (sub_query, LNS.BooleanClause.Occur.SHOULD);
		} else {
			top_level_query.Add (ym_query, LNS.BooleanClause.Occur.SHOULD);
		}
	} else {
		// Handle a partial month at the beginning of our range.
		if (d1 > 1) {
			LNS.BooleanQuery sub_query;
			sub_query = new LNS.BooleanQuery ();
			sub_query.Add (NewYearMonthQuery (field_name, y1, m1), LNS.BooleanClause.Occur.MUST);
			sub_query.Add (NewDayQuery (field_name, d1, DateTime.DaysInMonth (y1, m1)), LNS.BooleanClause.Occur.MUST);
			top_level_query.Add (sub_query, LNS.BooleanClause.Occur.SHOULD);

			++m1;
			if (m1 == 13) {
				m1 = 1;
				++y1;
			}
		}

		// And likewise, handle a partial month at the end of our range.
		if (d2 < DateTime.DaysInMonth (y2, m2)) {
			LNS.BooleanQuery sub_query;
			sub_query = new LNS.BooleanQuery ();
			sub_query.Add (NewYearMonthQuery (field_name, y2, m2), LNS.BooleanClause.Occur.MUST);
			sub_query.Add (NewDayQuery (field_name, 1, d2), LNS.BooleanClause.Occur.MUST);
			top_level_query.Add (sub_query, LNS.BooleanClause.Occur.SHOULD);

			--m2;
			if (m2 == 0) {
				m2 = 12;
				--y2;
			}
		}

		// Generate the query for the "middle" of our period, if it is non-empty.
		if (y1 < y2 || ((y1 == y2) && m1 <= m2))
			top_level_query.Add (NewYearMonthQuery (field_name, y1, m1, y2, m2),
					     LNS.BooleanClause.Occur.SHOULD);
	}

	return top_level_query;
}
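// A worked example of the decomposition above (a hedged sketch, not part of
// the original source): the clause structure GetDateRangeQuery would build
// for the hypothetical range 5 May 2007 .. 20 Aug 2007.  It uses the same
// NewYearMonthQuery/NewDayQuery helpers the method itself calls.
static LNS.Query ExampleDateRangeDecomposition (string field_name)
{
	LNS.BooleanQuery q = new LNS.BooleanQuery ();

	// Partial month at the start: May 2007, days 5..31.
	LNS.BooleanQuery head = new LNS.BooleanQuery ();
	head.Add (NewYearMonthQuery (field_name, 2007, 5), LNS.BooleanClause.Occur.MUST);
	head.Add (NewDayQuery (field_name, 5, 31), LNS.BooleanClause.Occur.MUST);
	q.Add (head, LNS.BooleanClause.Occur.SHOULD);

	// Whole months in the middle: June and July 2007.
	q.Add (NewYearMonthQuery (field_name, 2007, 6, 2007, 7), LNS.BooleanClause.Occur.SHOULD);

	// Partial month at the end: August 2007, days 1..20.
	LNS.BooleanQuery tail = new LNS.BooleanQuery ();
	tail.Add (NewYearMonthQuery (field_name, 2007, 8), LNS.BooleanClause.Occur.MUST);
	tail.Add (NewDayQuery (field_name, 1, 20), LNS.BooleanClause.Occur.MUST);
	q.Add (tail, LNS.BooleanClause.Occur.SHOULD);

	return q;
}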
static protected void QueryPartToQuery (QueryPart abstract_part,
					bool only_build_primary_query,
					ArrayList term_list,
					QueryPartHook query_part_hook,
					out LNS.Query primary_query,
					out LNS.Query secondary_query,
					out HitFilter hit_filter)
{
	primary_query = null;
	secondary_query = null;

	// By default, we assume that our Lucene queries will return exactly the
	// matching set of objects.  We need to set the hit filter if further
	// refinement of the search results is required.  (As in the case of
	// date range queries, for example.)  We essentially have to do this
	// to make OR queries work correctly.
	hit_filter = true_hit_filter;

	if (abstract_part == null)
		return;

	// The exception is when dealing with a prohibited part.  Just return
	// null for the hit filter in that case.  This works since
	// prohibited parts are not allowed inside of OR queries.
	if (abstract_part.Logic == QueryPartLogic.Prohibited)
		hit_filter = null;

	// Run the backend hook first.
	// This gives the backend a chance to modify or create new queries
	// based on backend-specific properties.
	if (query_part_hook != null)
		abstract_part = query_part_hook (abstract_part);

	if (abstract_part == null)
		return;

	if (abstract_part is QueryPart_Text) {
		QueryPart_Text part = (QueryPart_Text) abstract_part;

		if (! (part.SearchFullText || part.SearchTextProperties))
			return;

		LNS.BooleanQuery p_query = new LNS.BooleanQuery ();
		LNS.BooleanQuery s_query = new LNS.BooleanQuery ();

		bool added_subquery = false;

		if (part.SearchFullText) {
			LNS.Query subquery;
			subquery = StringToQuery ("Text", part.Text, term_list);
			if (subquery != null) {
				p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
				added_subquery = true;
			}

			// FIXME: HotText is ignored for now!
			// subquery = StringToQuery ("HotText", part.Text);
			// if (subquery != null) {
			//	p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
			//	added_subquery = true;
			// }
		}

		if (part.SearchTextProperties) {
			LNS.Query subquery;
			subquery = StringToQuery ("PropertyText", part.Text, term_list);
			if (subquery != null) {
				p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
				// Properties can live in either index
				if (! only_build_primary_query)
					s_query.Add (subquery.Clone () as LNS.Query, LNS.BooleanClause.Occur.SHOULD);
				added_subquery = true;
			}

			// The "added_subquery" check is to handle the situation where
			// part of the text is a stop word.  Normally, a search for
			// "hello world" would break down into this query:
			//
			// (Text:hello OR PropertyText:hello OR PropertyKeyword:hello)
			// AND (Text:world OR PropertyText:world OR PropertyKeyword:world)
			//
			// This fails with stop words, though.  Let's assume that "world"
			// is a stop word.  You would end up with:
			//
			// (Text:hello OR PropertyText:hello OR PropertyKeyword:hello)
			// AND (PropertyKeyword:world)
			//
			// Which is not what we want.  We'd want to match documents that
			// had only "hello" without also having a keyword "world".  In
			// this case, don't create the PropertyKeyword part of the query,
			// since it would be included in the larger set if it weren't
			// required anyway.
			if (added_subquery) {
				Term term;
				term = new Term ("PropertyKeyword", part.Text.ToLower ()); // make sure text is lowercased

				// FIXME: Terms are already added to term_list, but they may
				// have been tokenized.  The term here is the non-tokenized
				// version.  Should this be added to term_list?
				// term_list is used to calculate scores.
				if (term_list != null)
					term_list.Add (term);

				subquery = new LNS.TermQuery (term);
				p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
				// Properties can live in either index
				if (! only_build_primary_query)
					s_query.Add (subquery.Clone () as LNS.Query, LNS.BooleanClause.Occur.SHOULD);
			} else {
				// Reset these so we return a null query
				p_query = null;
				s_query = null;
			}
		}

		primary_query = p_query;
		if (! only_build_primary_query)
			secondary_query = s_query;

		return;
	}

	if (abstract_part is QueryPart_Wildcard) {
		QueryPart_Wildcard part = (QueryPart_Wildcard) abstract_part;

		LNS.BooleanQuery p_query = new LNS.BooleanQuery ();
		LNS.BooleanQuery s_query = new LNS.BooleanQuery ();

		Term term;
		LNS.Query subquery;

		// Lower case the terms for searching
		string query_string_lower = part.QueryString.ToLower ();

		// Search text content
		if (! part.PropertyOnly) {
			term = new Term ("Text", query_string_lower);
			subquery = new LNS.WildcardQuery (term);
			p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
			term_list.Add (term);
		}

		// Search text properties
		term = new Term ("PropertyText", query_string_lower);
		subquery = new LNS.WildcardQuery (term);
		p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
		// Properties can live in either index
		if (! only_build_primary_query)
			s_query.Add (subquery.Clone () as LNS.Query, LNS.BooleanClause.Occur.SHOULD);
		term_list.Add (term);

		if (! part.PropertyOnly) {
			// Search property keywords
			term = new Term ("PropertyKeyword", query_string_lower);
			term_list.Add (term);
			subquery = new LNS.WildcardQuery (term);
			p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
			// Properties can live in either index
			if (! only_build_primary_query)
				s_query.Add (subquery.Clone () as LNS.Query, LNS.BooleanClause.Occur.SHOULD);
		}

		primary_query = p_query;
		if (! only_build_primary_query)
			secondary_query = s_query;

		return;
	}

	if (abstract_part is QueryPart_DateRange) {
		QueryPart_DateRange part = (QueryPart_DateRange) abstract_part;

		// FIXME: We don't handle prohibited queries with sub-date
		// accuracy.  For example, if we say we prohibit matches
		// between 5 May 2007 at 2 PM and 8 May at 5 AM, we'll
		// miss any matches that happen between midnight and 2 PM
		// on 5 May 2007 and between midnight and 5 AM on 8 May.
		primary_query = GetDateRangeQuery (part, out hit_filter);
		// Date properties can live in either index
		if (! only_build_primary_query && primary_query != null)
			secondary_query = primary_query.Clone () as LNS.Query;

		return;
	}

	if (abstract_part is QueryPart_Or) {
		QueryPart_Or part = (QueryPart_Or) abstract_part;

		// Assemble a new BooleanQuery combining all of the sub-parts.
		LNS.BooleanQuery p_query;
		p_query = new LNS.BooleanQuery ();

		LNS.BooleanQuery s_query = null;
		if (! only_build_primary_query)
			s_query = new LNS.BooleanQuery ();

		primary_query = p_query;
		secondary_query = s_query;

		OrHitFilter or_hit_filter = null;

		foreach (QueryPart sub_part in part.SubParts) {
			LNS.Query p_subq, s_subq;
			HitFilter sub_hit_filter; // FIXME: This is (and must be) ignored

			// FIXME: Any subpart in an OR which has a hit filter won't work
			// correctly, because we can't tell which part of an OR we matched
			// against to filter correctly.  This affects date range queries.
			QueryPartToQuery (sub_part, only_build_primary_query,
					  term_list, query_part_hook,
					  out p_subq, out s_subq, out sub_hit_filter);
			if (p_subq != null)
				p_query.Add (p_subq, LNS.BooleanClause.Occur.SHOULD);
			if (s_subq != null)
				s_query.Add (s_subq, LNS.BooleanClause.Occur.SHOULD);
			if (sub_hit_filter != null) {
				if (or_hit_filter == null)
					or_hit_filter = new OrHitFilter ();
				or_hit_filter.Add (sub_hit_filter);
			}
		}

		if (or_hit_filter != null)
			hit_filter = new HitFilter (or_hit_filter.HitFilter);

		return;
	}

	if (abstract_part is QueryPart_Uri) {
		QueryPart_Uri part = (QueryPart_Uri) abstract_part;

		// Do a term query on the Uri field.
		// This is probably less efficient than using a TermEnum,
		// but it is required for the query API, where the uri query
		// can be part of a prohibited query or a boolean OR query.
		Term term;
		term = new Term ("Uri", UriFu.UriToEscapedString (part.Uri));
		if (term_list != null)
			term_list.Add (term);
		primary_query = new LNS.TermQuery (term);

		// Query only the primary index
		return;
	}

	if (abstract_part is QueryPart_Property) {
		QueryPart_Property part = (QueryPart_Property) abstract_part;

		string field_name;
		if (part.Key == QueryPart_Property.AllProperties)
			field_name = TypeToWildcardField (part.Type);
		else
			field_name = PropertyToFieldName (part.Type, part.Key);

		// Details of the conversion here depend on BeagrepAnalyzer.TokenStream.
		if (part.Type == PropertyType.Text)
			primary_query = StringToQuery (field_name, part.Value, term_list);
		else {
			Term term;

			// FIXME: Handle date queries for other date fields
			if (part.Type == PropertyType.Internal ||
			    field_name.StartsWith ("prop:k:" + Property.PrivateNamespace))
				term = new Term (field_name, part.Value);
			else
				term = new Term (field_name, part.Value.ToLower ());

			if (term_list != null)
				term_list.Add (term);

			primary_query = new LNS.TermQuery (term);
		}

		// Properties can live in either index
		if (! only_build_primary_query && primary_query != null)
			secondary_query = primary_query.Clone () as LNS.Query;

		return;
	}

	throw new Exception ("Unhandled QueryPart type! " + abstract_part.ToString ());
}
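// A minimal usage sketch (hedged; not from the original source): building
// the primary/secondary Lucene queries for a single free-text part.  The
// part contents here are hypothetical.
static void ExampleTextPartToQuery ()
{
	QueryPart_Text part = new QueryPart_Text ();
	part.Text = "hello";
	part.SearchFullText = true;
	part.SearchTextProperties = true;

	ArrayList term_list = new ArrayList ();
	LNS.Query primary, secondary;
	HitFilter filter;

	QueryPartToQuery (part, false, term_list, null,
			  out primary, out secondary, out filter);

	// primary is roughly:
	//   Text:hello OR PropertyText:hello OR PropertyKeyword:hello
	// secondary holds only the property clauses, since full text
	// lives only in the primary index.
}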
public void Add (HitFilter hit_filter)
{
	all.Add (hit_filter);
}
public NotHitFilter (HitFilter original)
{
	this.original = original;
}
public void Add (HitFilter hit_filter)
{
	if (hit_filter == true_hit_filter)
		contains_known_true = true;
	all.Add (hit_filter);
}
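// The filter evaluation bodies are not part of this excerpt.  A hedged
// sketch of how OrHitFilter.HitFilter presumably evaluates, given the Add
// above: any accepting member accepts the hit, and a known-true member
// short-circuits everything.  (Assumed semantics, not the verified
// implementation; NotHitFilter would likewise just invert its wrapped
// filter.)
public bool HitFilter (Hit hit)
{
	if (contains_known_true)
		return true;
	foreach (HitFilter hf in all)
		if (hf (hit))
			return true;
	return false;
}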
////////////////////////////////////////////////////////////////

// search_subset_uris is a list of Uris that this search should be
// limited to.
public void DoQuery (Query query, IQueryResult result,
		     ICollection search_subset_uris, // should be internal uris
		     QueryPartHook query_part_hook,
		     HitFilter hit_filter)
{
	if (Debug)
		Logger.Log.Debug ("###### {0}: Starting low-level queries", IndexName);

	Stopwatch total, a, b, c, d, e, f;

	total = new Stopwatch ();
	a = new Stopwatch ();
	b = new Stopwatch ();
	c = new Stopwatch ();
	d = new Stopwatch ();
	e = new Stopwatch ();
	f = new Stopwatch ();

	total.Start ();

	a.Start ();

	ArrayList primary_required_part_queries;
	ArrayList secondary_required_part_queries;

	LNS.BooleanQuery primary_prohibited_part_query;
	LNS.BooleanQuery secondary_prohibited_part_query;

	AndHitFilter all_hit_filters;

	ArrayList term_list;

	// Assemble all of the parts into a bunch of Lucene queries
	term_list = AssembleQuery (query, query_part_hook, hit_filter,
				   out primary_required_part_queries,
				   out secondary_required_part_queries,
				   out primary_prohibited_part_query,
				   out secondary_prohibited_part_query,
				   out all_hit_filters);

	a.Stop ();
	if (Debug)
		Log.Debug ("###### {0}: Building queries took {1}", IndexName, a);

	// If we have no required parts, give up.
	if (primary_required_part_queries == null)
		return;

	b.Start ();

	//
	// Now that we have all of these nice queries, let's execute them!
	//

	IndexReader primary_reader;
	LNS.IndexSearcher primary_searcher;
	IndexReader secondary_reader;
	LNS.IndexSearcher secondary_searcher;

	// Create the searchers that we will need.
	if (! BuildSearchers (out primary_reader, out primary_searcher,
			      out secondary_reader, out secondary_searcher))
		return;

	b.Stop ();
	if (Debug)
		Log.Debug ("###### {0}: Readers/searchers built in {1}", IndexName, b);

	// Build whitelists and blacklists for search subsets.
	c.Start ();

	// Possibly create our whitelists from the search subset.
	LuceneBitArray primary_whitelist, secondary_whitelist;
	CreateQueryWhitelists (search_subset_uris,
			       primary_searcher, secondary_searcher,
			       primary_prohibited_part_query,
			       secondary_prohibited_part_query,
			       out primary_whitelist, out secondary_whitelist);

	c.Stop ();
	if (Debug)
		Log.Debug ("###### {0}: Whitelists and blacklists built in {1}", IndexName, c);

	// Now run the low level queries against our indexes.
	d.Start ();

	BetterBitArray primary_matches = null;

	if (primary_required_part_queries != null) {
		if (secondary_searcher != null)
			primary_matches = DoRequiredQueries_TwoIndex (primary_searcher,
								      secondary_searcher,
								      primary_required_part_queries,
								      secondary_required_part_queries,
								      primary_whitelist,
								      secondary_whitelist);
		else
			primary_matches = DoRequiredQueries (primary_searcher,
							     primary_required_part_queries,
							     primary_whitelist);
	}

	d.Stop ();
	if (Debug)
		Logger.Log.Debug ("###### {0}: Low-level queries finished in {1}", IndexName, d);

	e.Start ();

	// Only generate results if we got some matches
	if (primary_matches != null && primary_matches.ContainsTrue ()) {
		GenerateQueryResults (primary_reader, secondary_reader,
				      primary_matches, result, term_list,
				      query.MaxHits,
				      new HitFilter (all_hit_filters.HitFilter),
				      IndexName);
	}

	e.Stop ();
	if (Debug)
		Log.Debug ("###### {0}: Query results generated in {1}", IndexName, e);

	//
	// Finally, we clean up after ourselves.
	//

	f.Start ();
	CloseSearchers (primary_reader, primary_searcher,
			secondary_reader, secondary_searcher);
	f.Stop ();
	if (Debug)
		Log.Debug ("###### {0}: Readers/searchers released in {1}", IndexName, f);

	total.Stop ();
	if (Debug) {
		Log.Debug ("###### {0}: Query time breakdown:", IndexName);
		Log.Debug ("###### {0}:   Build queries  {1,6} ({2:0.0}%)", IndexName, a, 100 * a.ElapsedTime / total.ElapsedTime);
		Log.Debug ("###### {0}:   Got readers    {1,6} ({2:0.0}%)", IndexName, b, 100 * b.ElapsedTime / total.ElapsedTime);
		Log.Debug ("###### {0}:   Whitelists     {1,6} ({2:0.0}%)", IndexName, c, 100 * c.ElapsedTime / total.ElapsedTime);
		Log.Debug ("###### {0}:   Queries        {1,6} ({2:0.0}%)", IndexName, d, 100 * d.ElapsedTime / total.ElapsedTime);
		Log.Debug ("###### {0}:   Gen'd Results  {1,6} ({2:0.0}%)", IndexName, e, 100 * e.ElapsedTime / total.ElapsedTime);
		Log.Debug ("###### {0}:   Reader cleanup {1,6} ({2:0.0}%)", IndexName, f, 100 * f.ElapsedTime / total.ElapsedTime);
		Log.Debug ("###### {0}:   TOTAL          {1,6}", IndexName, total);

		Logger.Log.Debug ("###### {0}: Total query run in {1}", IndexName, total);
	}
}
private static ArrayList FindRecentResults (IndexReader primary_reader,
					    IndexReader secondary_reader,
					    BetterBitArray primary_matches,
					    Dictionary<int, Hit> hits_by_id,
					    int max_results,
					    ref int total_number_of_matches,
					    HitFilter hit_filter,
					    string index_name)
{
	Stopwatch b = new Stopwatch ();
	b.Start ();

	int count = 0;
	Document doc;

	ArrayList all_docs = null;
	TopScores top_docs = null;
	TermDocs term_docs = null;

	if (primary_matches.TrueCount > max_results)
		top_docs = new TopScores (max_results);
	else
		all_docs = new ArrayList (primary_matches.TrueCount);

	if (secondary_reader != null)
		term_docs = secondary_reader.TermDocs ();

	for (int match_index = primary_matches.Count; ; match_index --) {
		// Walk across the matches backwards, since newer
		// documents are more likely to be at the end of
		// the index.
		match_index = primary_matches.GetPreviousTrueIndex (match_index);
		if (match_index < 0)
			break;

		count++;

		doc = primary_reader.Document (match_index, fields_timestamp_uri);

		// Check the timestamp: if we have already reached our
		// limit, we might be able to reject it immediately.
		string timestamp_str;
		long timestamp_num = 0;

		timestamp_str = doc.Get ("Timestamp");
		if (timestamp_str == null) {
			Logger.Log.Warn ("No timestamp on {0}!", GetUriFromDocument (doc));
		} else {
			timestamp_num = Int64.Parse (timestamp_str);
			if (top_docs != null && ! top_docs.WillAccept (timestamp_num))
				continue;
		}

		// Get the actual hit now.
		// doc was created with only two fields, so first get the
		// complete Lucene document for the primary document.
		// Also run our hit_filter now, if we have one.  Since we
		// insist on returning the max_results most recent hits,
		// any filtering should happen now and not later.
		Hit hit = CreateHit (primary_reader.Document (match_index), secondary_reader, term_docs);
		if (hit_filter != null && ! hit_filter (hit)) {
			if (Debug)
				Log.Debug ("Filtered out {0}", hit.Uri);
			total_number_of_matches --;
			continue;
		}

		hits_by_id [match_index] = hit;

		// Add the document to the appropriate data structure.
		// We use the timestamp_num as the score, so high
		// scores correspond to more-recent timestamps.
		if (all_docs != null)
			all_docs.Add (hit);
		else
			top_docs.Add (timestamp_num, hit);
	}

	if (term_docs != null)
		term_docs.Close ();

	b.Stop ();

	if (Debug)
		Log.Debug (">>> {0}: Instantiated and scanned {1} documents in {2}", index_name, count, b);

	if (all_docs != null) {
		// Sort results before sending
		all_docs.Sort ();
		return all_docs;
	} else {
		return top_docs.TopScoringObjects;
	}
}
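// TopScores itself is not shown in this excerpt.  A minimal sketch of the
// behavior FindRecentResults relies on, assuming a simple bounded collection
// kept sorted in descending score order (the real implementation likely
// differs):
class TopScoresSketch {
	private long [] scores;
	private object [] objs;
	private int count = 0;

	public TopScoresSketch (int capacity)
	{
		scores = new long [capacity];
		objs = new object [capacity];
	}

	// True if an entry with this score could still make the cut.
	public bool WillAccept (long score)
	{
		return count < scores.Length || score > scores [count - 1];
	}

	public void Add (long score, object obj)
	{
		if (! WillAccept (score))
			return;
		if (count < scores.Length)
			count++;
		// Shift lower-scored entries down, dropping the lowest
		// when the collection is already full.
		int i = count - 1;
		while (i > 0 && scores [i - 1] < score) {
			scores [i] = scores [i - 1];
			objs [i] = objs [i - 1];
			i--;
		}
		scores [i] = score;
		objs [i] = obj;
	}

	public ArrayList TopScoringObjects {
		get {
			ArrayList list = new ArrayList (count);
			for (int i = 0; i < count; i++)
				list.Add (objs [i]);
			return list;
		}
	}
}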
////////////////////////////////////////////////////////////////

// Returns the list of terms in the query
private ArrayList AssembleQuery (Query query,
				 QueryPartHook query_part_hook,
				 HitFilter hit_filter,
				 out ArrayList primary_required_part_queries,
				 out ArrayList secondary_required_part_queries,
				 out LNS.BooleanQuery primary_prohibited_part_query,
				 out LNS.BooleanQuery secondary_prohibited_part_query,
				 out AndHitFilter all_hit_filters)
{
	primary_required_part_queries = null;
	secondary_required_part_queries = null;
	primary_prohibited_part_query = null;
	secondary_prohibited_part_query = null;

	all_hit_filters = new AndHitFilter ();
	if (hit_filter != null)
		all_hit_filters.Add (hit_filter);

	ArrayList term_list = new ArrayList ();

	foreach (QueryPart part in query.Parts) {
		LNS.Query primary_part_query;
		LNS.Query secondary_part_query;
		HitFilter part_hit_filter;

		QueryPartToQuery (part, false, // we want both primary and secondary queries
				  part.Logic == QueryPartLogic.Required ? term_list : null,
				  query_part_hook,
				  out primary_part_query,
				  out secondary_part_query,
				  out part_hit_filter);

		if (primary_part_query == null)
			continue;

		switch (part.Logic) {
		case QueryPartLogic.Required:
			if (primary_required_part_queries == null) {
				primary_required_part_queries = new ArrayList ();
				secondary_required_part_queries = new ArrayList ();
			}
			primary_required_part_queries.Add (primary_part_query);
			secondary_required_part_queries.Add (secondary_part_query);

			if (part_hit_filter != null)
				all_hit_filters.Add (part_hit_filter);

			break;

		case QueryPartLogic.Prohibited:
			if (primary_prohibited_part_query == null)
				primary_prohibited_part_query = new LNS.BooleanQuery ();
			primary_prohibited_part_query.Add (primary_part_query, LNS.BooleanClause.Occur.SHOULD);

			if (secondary_part_query != null) {
				if (secondary_prohibited_part_query == null)
					secondary_prohibited_part_query = new LNS.BooleanQuery ();
				secondary_prohibited_part_query.Add (secondary_part_query, LNS.BooleanClause.Occur.SHOULD);
			}

			if (part_hit_filter != null) {
				NotHitFilter nhf;
				nhf = new NotHitFilter (part_hit_filter);
				all_hit_filters.Add (new HitFilter (nhf.HitFilter));
			}

			break;
		}
	}

	return term_list;
}
// There are two ways we can determine the max_results
// most recent items:
//
// One is to instantiate Lucene documents for each of
// the document IDs in primary_matches.  This is a
// fairly expensive operation.
//
// The other is to walk through the list of all
// document IDs in descending time order.  This is
// a less expensive operation, but adds up over time
// on large data sets.
//
// We can walk about 2.5 docs for every Document we
// instantiate.  So what we'll do, if we have more
// matches than available hits, is walk (m * 1.25)
// docs to see if we can fill out the top 100 hits.
// If not, we'll fall back to creating documents
// for all of them.

private static ArrayList ScanRecentDocs (IndexReader primary_reader,
					 IndexReader secondary_reader,
					 BetterBitArray primary_matches,
					 Dictionary<int, Hit> hits_by_id,
					 int max_results,
					 ref int total_number_of_matches,
					 HitFilter hit_filter,
					 string index_name)
{
	Stopwatch a = new Stopwatch ();
	a.Start ();

	TermDocs docs = primary_reader.TermDocs ();
	TermEnum enumerator = primary_reader.Terms (new Term ("InvertedTimestamp", String.Empty));
	ArrayList results = new ArrayList (max_results);
	int docs_found = 0;
	int docs_walked = 0;
	int hit_filter_removed = 0;
	int max_docs = (int) (primary_matches.TrueCount * 1.25);

	Term term;
	TermDocs secondary_term_docs = null;
	if (secondary_reader != null)
		secondary_term_docs = secondary_reader.TermDocs ();

	do {
		term = enumerator.Term ();

		if (term.Field () != "InvertedTimestamp")
			break;

		docs.Seek (enumerator);

		while (docs.Next ()
		       && docs_found < max_results
		       && docs_walked < max_docs) {
			int doc_id = docs.Doc ();

			if (primary_matches.Get (doc_id)) {
				Document doc = primary_reader.Document (doc_id);
				Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs);

				// If we have a HitFilter, apply it.
				if (hit_filter != null && ! hit_filter (hit)) {
					if (Debug)
						Log.Debug ("Filtered out {0}", hit.Uri);
					hit_filter_removed ++;
					continue;
				}
				hits_by_id [doc_id] = hit;
				// Add the result, last modified first
				results.Add (hit);
				docs_found++;
			}

			docs_walked++;
		}
	} while (enumerator.Next ()
		 && docs_found < max_results
		 && docs_walked < max_docs);

	docs.Close ();
	enumerator.Close ();
	if (secondary_term_docs != null)
		secondary_term_docs.Close ();

	// If we've found all the docs we can return in a subset,
	// fantastic: we've probably short-circuited a slow search.
	if (docs_found != max_results) {
		// Bad luck!  Not all docs were found.
		// Start afresh, this time traversing all results.
		results = null;
	} else {
		// Adjust total_number_of_matches.  We need to do this to avoid scenarios like the
		// following: max_hits = 100.  Matched 100 results.  But the hit filter removed 30,
		// so 70 results will be returned.  We want to avoid saying "Showing top 70 of 100".
		// Note that since we are not passing every document in the index through the
		// hit_filter, when we say "Showing top 100 of 1234", the 1234 could actually be
		// much less.  But since max_hits was 100, that will not mislead the user.
		total_number_of_matches -= hit_filter_removed;
	}

	a.Stop ();
	if (Debug) {
		Log.Debug (">>> {0}: Walked {1} items, populated an enum with {2} items in {3}", index_name, docs_walked, docs_found, a);

		if (docs_found == max_results)
			Log.Debug (">>> {0}: Successfully short circuited timestamp ordering!", index_name);
	}

	return results;
}
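// A worked example of the walk budget above (hedged; the match count is
// hypothetical).  The 1.25 multiplier appears chosen so that the walk costs
// about half as much as instantiating a Document for every match, given the
// ~2.5 walked docs per instantiated Document noted in the header comment.
static int ExampleWalkBudget ()
{
	int true_count = 10000;                   // primary_matches.TrueCount
	int max_docs = (int) (true_count * 1.25); // walk at most 12,500 postings
	// If max_results hits are not found within max_docs walked docs,
	// ScanRecentDocs returns null and FindRecentResults runs instead.
	return max_docs;
}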
private static void GenerateQueryResults (IndexReader primary_reader,
					  IndexReader secondary_reader,
					  BetterBitArray primary_matches,
					  IQueryResult result,
					  ICollection query_term_list,
					  int max_results,
					  HitFilter hit_filter,
					  string index_name)
{
	int num_hits;

	if (Debug)
		Logger.Log.Debug (">>> {0}: Initially handed {1} matches", index_name, primary_matches.TrueCount);

	if (primary_matches.TrueCount <= max_results) {
		if (Debug)
			Logger.Log.Debug (">>> {0}: Initial count is within our limit of {1}", index_name, max_results);
		num_hits = primary_matches.TrueCount;
	} else {
		if (Debug)
			Logger.Log.Debug (">>> {0}: Number of hits is capped at {1}", index_name, max_results);
		num_hits = max_results;
	}

	Stopwatch total, d, e;
	total = new Stopwatch ();
	d = new Stopwatch ();
	e = new Stopwatch ();

	total.Start ();

	ArrayList final_list_of_hits = null;

	// This is used only for scoring
	Dictionary<int, Hit> hits_by_id = new Dictionary<int, Hit> (num_hits);

	int total_number_of_matches = primary_matches.TrueCount;

	if (primary_matches.TrueCount > max_results)
		final_list_of_hits = ScanRecentDocs (primary_reader,
						     secondary_reader,
						     primary_matches,
						     hits_by_id,
						     max_results,
						     ref total_number_of_matches,
						     hit_filter,
						     index_name);

	if (final_list_of_hits == null)
		final_list_of_hits = FindRecentResults (primary_reader,
							secondary_reader,
							primary_matches,
							hits_by_id,
							max_results,
							ref total_number_of_matches,
							hit_filter,
							index_name);

	d.Start ();

	ScoreHits (hits_by_id, primary_reader, query_term_list);
	hits_by_id = null;

	d.Stop ();

	if (Debug)
		Log.Debug (">>> {0}: Scored hits in {1}", index_name, d);

	e.Start ();

	// 25 hits seems to be the sweet spot: anything lower and
	// serialization overhead gets us; anything higher takes
	// longer to send out.
	const int MAX_QUEUED_HITS = 25;

	// Break up the hits into reasonably sized chunks for
	// sending over the wire.
	for (int i = 0; i < final_list_of_hits.Count; ++i) {
		// Flush our hits
		if (i > 0 && i % MAX_QUEUED_HITS == 0) {
			result.Add (final_list_of_hits.GetRange (0, MAX_QUEUED_HITS));
			final_list_of_hits.RemoveRange (0, MAX_QUEUED_HITS);
			i -= MAX_QUEUED_HITS;
		}
	}

	// Flush the remaining hits
	result.Add (final_list_of_hits, total_number_of_matches);
	final_list_of_hits = null;

	e.Stop ();

	if (Debug)
		Log.Debug (">>> {0}: Hit filters executed and results sent in {1}", index_name, e);

	total.Stop ();
	if (Debug) {
		Logger.Log.Debug (">>> {0}: GenerateQueryResults time statistics:", index_name);
		//Logger.Log.Debug (">>> {0}:   Short circuit  {1,6} ({2:0.0}%)", index_name, a == null ? "N/A" : a.ToString (), a == null ? 0.0 : 100 * a.ElapsedTime / total.ElapsedTime);
		//Logger.Log.Debug (">>> {0}:   Create docs    {1,6} ({2:0.0}%)", index_name, b, 100 * b.ElapsedTime / total.ElapsedTime);
		//Logger.Log.Debug (">>> {0}:   Hit assembly   {1,6} ({2:0.0}%)", index_name, c, 100 * c.ElapsedTime / total.ElapsedTime);
		Logger.Log.Debug (">>> {0}:   Scored hits    {1,6} ({2:0.0}%)", index_name, d, 100 * d.ElapsedTime / total.ElapsedTime);
		Logger.Log.Debug (">>> {0}:   Results sent   {1,6} ({2:0.0}%)", index_name, e, 100 * e.ElapsedTime / total.ElapsedTime);
		Logger.Log.Debug (">>> {0}:   TOTAL          {1,6}", index_name, total);
	}
}
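// The flush loop above mutates the list while iterating, which works but is
// easy to misread.  An equivalent, more direct formulation of the same
// chunking (a hedged sketch, using the same result.Add overloads as above):
static void ExampleFlushHits (IQueryResult result, ArrayList final_list_of_hits, int total_number_of_matches)
{
	const int MAX_QUEUED_HITS = 25;

	// Send full chunks of MAX_QUEUED_HITS hits...
	while (final_list_of_hits.Count > MAX_QUEUED_HITS) {
		result.Add (final_list_of_hits.GetRange (0, MAX_QUEUED_HITS));
		final_list_of_hits.RemoveRange (0, MAX_QUEUED_HITS);
	}

	// ...then flush whatever remains along with the match count.
	result.Add (final_list_of_hits, total_number_of_matches);
}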