// Returns a list of all files and directories in dir
static ICollection GetAllItemsInDirectory(DirectoryInfo dir)
{
    // Form the query
    string parent_uri_str = PathToUri(dir.FullName).ToString();

    // Instead of taking the painful route of using BeagrepAnalyzer,
    // let's just add the prefix manually.
    // LuceneCommon thinks exposing the secret property-type encoding
    // is bad; I think so too... except for now.
    string key = "prop:k:" + Property.ParentDirUriPropKey;
    //Logger.Log.Debug ("Querying for {0}={1}", parent_uri_str, key);
    LNS.Query query = new LNS.TermQuery(new Term(key, parent_uri_str));

    // Do the search
    LNS.IndexSearcher searcher;
    searcher = LuceneCommon.GetSearcher(driver.PrimaryStore);

    BetterBitArray matches;
    matches = new BetterBitArray(searcher.MaxDoc());

    BitArrayHitCollector collector;
    collector = new BitArrayHitCollector(matches);

    searcher.Search(query, null, collector);

    // Finally we pull all of the matching documents,
    // convert them to Dirent, and store them in a list.
    ArrayList match_list = new ArrayList();
    int i = 0;
    while (i < matches.Count) {
        i = matches.GetNextTrueIndex(i);
        if (i >= matches.Count)
            break;

        Document doc;
        doc = searcher.Doc(i);

        Dirent info;
        info = DocumentToDirent(doc);

        match_list.Add(info);

        ++i;
    }

    LuceneCommon.ReleaseSearcher(searcher);
    //Logger.Log.Debug ("Found {0} items in {1}", match_list.Count, dir.FullName);

    return match_list;
}
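// Usage sketch (hypothetical caller, not part of the original file;
// the path is illustrative and Dirent's members are assumed to have a
// usable ToString):
//
//     ICollection items = GetAllItemsInDirectory(new DirectoryInfo("/tmp"));
//     foreach (Dirent entry in items)
//         Console.WriteLine(entry);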
private void UseScratch()
{
    if (scratch == null)
        scratch = new BetterBitArray(searcher.MaxDoc());
    else
        scratch.SetAll(false);

    collector.Array = scratch;
}
////////////////////////////////////////////////////////////////

//
// Special logic for handling our set of required queries
//

// This is the easy case: we just combine all of the queries
// into one big BooleanQuery.
private static BetterBitArray DoRequiredQueries(LNS.IndexSearcher primary_searcher,
                                                ArrayList primary_queries,
                                                BetterBitArray primary_whitelist)
{
    LNS.BooleanQuery combined_query;
    combined_query = new LNS.BooleanQuery();
    foreach (LNS.Query query in primary_queries)
        combined_query.Add(query, LNS.BooleanClause.Occur.MUST);

    LuceneBitArray matches;
    matches = new LuceneBitArray(primary_searcher, combined_query);
    if (primary_whitelist != null)
        matches.And(primary_whitelist);

    return matches;
}
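// For illustration (the field names here are hypothetical): required
// parts such as [body:foo] and [body:bar] end up as the single Lucene
// query "+body:foo +body:bar", evaluated in one pass and then ANDed
// against the whitelist.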
public BitArrayHitCollector(BetterBitArray matches)
{
    this.matches = matches;
}
private static void GenerateQueryResults(IndexReader primary_reader,
                                         IndexReader secondary_reader,
                                         BetterBitArray primary_matches,
                                         IQueryResult result,
                                         ICollection query_term_list,
                                         int max_results,
                                         HitFilter hit_filter,
                                         string index_name)
{
    int num_hits;

    if (Debug)
        Logger.Log.Debug(">>> {0}: Initially handed {1} matches", index_name, primary_matches.TrueCount);

    if (primary_matches.TrueCount <= max_results) {
        if (Debug)
            Logger.Log.Debug(">>> {0}: Initial count is within our limit of {1}", index_name, max_results);
        num_hits = primary_matches.TrueCount;
    } else {
        if (Debug)
            Logger.Log.Debug(">>> {0}: Number of hits is capped at {1}", index_name, max_results);
        num_hits = max_results;
    }

    Stopwatch total, d, e;
    total = new Stopwatch();
    d = new Stopwatch();
    e = new Stopwatch();

    total.Start();

    ArrayList final_list_of_hits = null;

    // This is used only for scoring
    Dictionary<int, Hit> hits_by_id = new Dictionary<int, Hit>(num_hits);

    int total_number_of_matches = primary_matches.TrueCount;

    if (primary_matches.TrueCount > max_results)
        final_list_of_hits = ScanRecentDocs(primary_reader,
                                            secondary_reader,
                                            primary_matches,
                                            hits_by_id,
                                            max_results,
                                            ref total_number_of_matches,
                                            hit_filter,
                                            index_name);

    if (final_list_of_hits == null)
        final_list_of_hits = FindRecentResults(primary_reader,
                                               secondary_reader,
                                               primary_matches,
                                               hits_by_id,
                                               max_results,
                                               ref total_number_of_matches,
                                               hit_filter,
                                               index_name);

    d.Start();

    ScoreHits(hits_by_id, primary_reader, query_term_list);
    hits_by_id = null;

    d.Stop();

    if (Debug)
        Log.Debug(">>> {0}: Scored hits in {1}", index_name, d);

    e.Start();

    // 25 hits seems to be the sweet spot: anything lower and the
    // serialization overhead gets us; anything higher takes longer
    // to send out.
    const int MAX_QUEUED_HITS = 25;
    int sent_index = 0;

    // Break up the hits into reasonably sized chunks for
    // sending over the wire.
    for (int i = 0; i < final_list_of_hits.Count; ++i) {
        // Flush our hits
        if (i > 0 && i % MAX_QUEUED_HITS == 0) {
            result.Add(final_list_of_hits.GetRange(0, MAX_QUEUED_HITS));
            final_list_of_hits.RemoveRange(0, MAX_QUEUED_HITS);
            i -= MAX_QUEUED_HITS;
        }
    }

    // Flush the remaining hits
    result.Add(final_list_of_hits, total_number_of_matches);
    final_list_of_hits = null;

    e.Stop();

    if (Debug)
        Log.Debug(">>> {0}: Hit filters executed and results sent in {1}", index_name, e);

    total.Stop();
    if (Debug) {
        Logger.Log.Debug(">>> {0}: GenerateQueryResults time statistics:", index_name);
        //Logger.Log.Debug (">>> {0}: Short circuit {1,6} ({2:0.0}%)", index_name, a == null ? "N/A" : a.ToString (), a == null ? 0.0 : 100 * a.ElapsedTime / total.ElapsedTime);
        //Logger.Log.Debug (">>> {0}: Create docs {1,6} ({2:0.0}%)", index_name, b, 100 * b.ElapsedTime / total.ElapsedTime);
        //Logger.Log.Debug (">>> {0}: Hit assembly {1,6} ({2:0.0}%)", index_name, c, 100 * c.ElapsedTime / total.ElapsedTime);
        Logger.Log.Debug(">>> {0}: Scored hits  {1,6} ({2:0.0}%)", index_name, d, 100 * d.ElapsedTime / total.ElapsedTime);
        Logger.Log.Debug(">>> {0}: Results sent {1,6} ({2:0.0}%)", index_name, e, 100 * e.ElapsedTime / total.ElapsedTime);
        Logger.Log.Debug(">>> {0}: TOTAL        {1,6}", index_name, total);
    }
}
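// Worked example of the chunking above: with 60 final hits and
// MAX_QUEUED_HITS = 25, the loop sends two chunks of 25 (shrinking the
// list and rewinding i after each flush), and the trailing
// result.Add() flushes the remaining 10 along with
// total_number_of_matches.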
// Any whitelists that are passed in must be fully joined, or
// query results will be incorrect.
private static BetterBitArray DoRequiredQueries_TwoIndex(LNS.IndexSearcher primary_searcher,
                                                         LNS.IndexSearcher secondary_searcher,
                                                         ArrayList primary_queries,
                                                         ArrayList secondary_queries,
                                                         BetterBitArray primary_whitelist,
                                                         BetterBitArray secondary_whitelist)
{
    ArrayList match_info_list;
    match_info_list = new ArrayList();

    // First, do all of the low-level queries
    // and store them in our MatchInfo
    for (int i = 0; i < primary_queries.Count; ++i) {
        LNS.Query pq, sq;
        pq = primary_queries [i] as LNS.Query;
        sq = secondary_queries [i] as LNS.Query;

        LuceneBitArray p_matches = null, s_matches = null;

        p_matches = new LuceneBitArray(primary_searcher);
        if (pq != null) {
            p_matches.Or(pq);
            if (primary_whitelist != null)
                p_matches.And(primary_whitelist);
        }

        s_matches = new LuceneBitArray(secondary_searcher);
        if (sq != null) {
            s_matches.Or(sq);
            if (secondary_whitelist != null)
                s_matches.And(secondary_whitelist);
        }

        MatchInfo info;
        info = new MatchInfo();
        info.PrimaryMatches = p_matches;
        info.SecondaryMatches = s_matches;
        info.RestrictBy(null); // a hack to initialize the UpperBound
        match_info_list.Add(info);
    }

    // We want to be smart about the order we do this in,
    // to minimize the expense of the Join.
    while (match_info_list.Count > 1) {
        // Linear scan to find the minimum
        int index_min = 0;
        for (int i = 1; i < match_info_list.Count; ++i)
            if (((MatchInfo) match_info_list [i]).CompareTo((MatchInfo) match_info_list [index_min]) < 0)
                index_min = i;

        MatchInfo smallest;
        smallest = match_info_list [index_min] as MatchInfo;
        match_info_list.RemoveAt(index_min);

        // We can short-circuit if our smallest set of
        // matches is empty.
        if (smallest.UpperBound == 0)
            return smallest.PrimaryMatches; // this must be an empty array

        smallest.Join();

        foreach (MatchInfo info in match_info_list)
            info.RestrictBy(smallest);
    }

    // For the final pair, we don't need to do a full join:
    // mapping the secondary onto the primary is sufficient
    MatchInfo last;
    last = match_info_list [0] as MatchInfo;
    last.SecondaryMatches.ProjectOnto(last.PrimaryMatches);

    return last.PrimaryMatches;
}
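// Illustration of the ordering heuristic (hypothetical numbers):
// given MatchInfos with upper bounds { 500, 40, 3 }, the loop joins
// the 3-match set first and uses it to restrict the other two, so the
// expensive joins always run against the smallest candidate sets, and
// an empty set short-circuits the whole query.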
////////////////////////////////////////////////////////////////

public void DoQuery(Query query,
                    IQueryResult result,
                    ICollection search_subset_uris, // should be internal uris
                    QueryPartHook query_part_hook,
                    HitFilter hit_filter)
{
    if (Debug)
        Logger.Log.Debug("###### {0}: Starting low-level queries", IndexName);

    Stopwatch total, a, b, c, d, e, f;

    total = new Stopwatch();
    a = new Stopwatch();
    b = new Stopwatch();
    c = new Stopwatch();
    d = new Stopwatch();
    e = new Stopwatch();
    f = new Stopwatch();

    total.Start();

    a.Start();

    ArrayList primary_required_part_queries;
    ArrayList secondary_required_part_queries;

    LNS.BooleanQuery primary_prohibited_part_query;
    LNS.BooleanQuery secondary_prohibited_part_query;

    AndHitFilter all_hit_filters;

    ArrayList term_list;

    // Assemble all of the parts into a bunch of Lucene queries
    term_list = AssembleQuery(query,
                              query_part_hook,
                              hit_filter,
                              out primary_required_part_queries,
                              out secondary_required_part_queries,
                              out primary_prohibited_part_query,
                              out secondary_prohibited_part_query,
                              out all_hit_filters);

    a.Stop();
    if (Debug)
        Log.Debug("###### {0}: Building queries took {1}", IndexName, a);

    // If we have no required parts, give up.
    if (primary_required_part_queries == null)
        return;

    b.Start();

    //
    // Now that we have all of these nice queries, let's execute them!
    //

    IndexReader primary_reader;
    LNS.IndexSearcher primary_searcher;
    IndexReader secondary_reader;
    LNS.IndexSearcher secondary_searcher;

    // Create the searchers that we will need.
    if (!BuildSearchers(out primary_reader,
                        out primary_searcher,
                        out secondary_reader,
                        out secondary_searcher))
        return;

    b.Stop();
    if (Debug)
        Log.Debug("###### {0}: Readers/searchers built in {1}", IndexName, b);

    // Build whitelists and blacklists for search subsets.
    c.Start();

    // Possibly create our whitelists from the search subset.
    LuceneBitArray primary_whitelist, secondary_whitelist;
    CreateQueryWhitelists(search_subset_uris,
                          primary_searcher,
                          secondary_searcher,
                          primary_prohibited_part_query,
                          secondary_prohibited_part_query,
                          out primary_whitelist,
                          out secondary_whitelist);

    c.Stop();
    if (Debug)
        Log.Debug("###### {0}: Whitelists and blacklists built in {1}", IndexName, c);

    // Now run the low level queries against our indexes.
    d.Start();

    BetterBitArray primary_matches = null;

    if (primary_required_part_queries != null) {
        if (secondary_searcher != null)
            primary_matches = DoRequiredQueries_TwoIndex(primary_searcher,
                                                         secondary_searcher,
                                                         primary_required_part_queries,
                                                         secondary_required_part_queries,
                                                         primary_whitelist,
                                                         secondary_whitelist);
        else
            primary_matches = DoRequiredQueries(primary_searcher,
                                                primary_required_part_queries,
                                                primary_whitelist);
    }

    d.Stop();
    if (Debug)
        Logger.Log.Debug("###### {0}: Low-level queries finished in {1}", IndexName, d);

    e.Start();
    // Only generate results if we got some matches
    if (primary_matches != null && primary_matches.ContainsTrue())
        GenerateQueryResults(primary_reader,
                             secondary_reader,
                             primary_matches,
                             result,
                             term_list,
                             query.MaxHits,
                             new HitFilter(all_hit_filters.HitFilter),
                             IndexName);

    e.Stop();

    if (Debug)
        Log.Debug("###### {0}: Query results generated in {1}", IndexName, e);

    //
    // Finally, we clean up after ourselves.
    //

    f.Start();
    CloseSearchers(primary_reader, primary_searcher, secondary_reader, secondary_searcher);
    f.Stop();

    if (Debug)
        Log.Debug("###### {0}: Readers/searchers released in {1}", IndexName, f);

    total.Stop();

    if (Debug) {
        Log.Debug("###### {0}: Query time breakdown:", IndexName);
        Log.Debug("###### {0}:  Build queries  {1,6} ({2:0.0}%)", IndexName, a, 100 * a.ElapsedTime / total.ElapsedTime);
        Log.Debug("###### {0}:  Got readers    {1,6} ({2:0.0}%)", IndexName, b, 100 * b.ElapsedTime / total.ElapsedTime);
        Log.Debug("###### {0}:  Whitelists     {1,6} ({2:0.0}%)", IndexName, c, 100 * c.ElapsedTime / total.ElapsedTime);
        Log.Debug("###### {0}:  Queries        {1,6} ({2:0.0}%)", IndexName, d, 100 * d.ElapsedTime / total.ElapsedTime);
        Log.Debug("###### {0}:  Gen'd Results  {1,6} ({2:0.0}%)", IndexName, e, 100 * e.ElapsedTime / total.ElapsedTime);
        Log.Debug("###### {0}:  Reader cleanup {1,6} ({2:0.0}%)", IndexName, f, 100 * f.ElapsedTime / total.ElapsedTime);
        Log.Debug("###### {0}:  TOTAL          {1,6}", IndexName, total);

        Logger.Log.Debug("###### {0}: Total query run in {1}", IndexName, total);
    }
}
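// Usage sketch (hypothetical caller; assumes "query" is a populated
// Query and "result" an IQueryResult sink, and that the null
// arguments are acceptable as in DoCountMatchQuery below):
//
//     driver.DoQuery(query, result,
//                    null,   // search_subset_uris: no subset restriction
//                    null,   // query_part_hook
//                    null);  // hit_filter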
////////////////////////////////////////////////////////////////

public int DoCountMatchQuery(Query query, QueryPartHook query_part_hook)
{
    if (Debug)
        Logger.Log.Debug("###### {0}: Starting low-level queries", IndexName);

    Stopwatch total;
    total = new Stopwatch();
    total.Start();

    ArrayList primary_required_part_queries;
    ArrayList secondary_required_part_queries;

    LNS.BooleanQuery primary_prohibited_part_query;
    LNS.BooleanQuery secondary_prohibited_part_query;

    AndHitFilter all_hit_filters;

    ArrayList term_list;

    term_list = AssembleQuery(query,
                              query_part_hook,
                              null,
                              out primary_required_part_queries,
                              out secondary_required_part_queries,
                              out primary_prohibited_part_query,
                              out secondary_prohibited_part_query,
                              out all_hit_filters);

    // If we have no required parts, give up.
    if (primary_required_part_queries == null)
        return 0;

    IndexReader primary_reader;
    LNS.IndexSearcher primary_searcher;
    IndexReader secondary_reader;
    LNS.IndexSearcher secondary_searcher;

    if (!BuildSearchers(out primary_reader,
                        out primary_searcher,
                        out secondary_reader,
                        out secondary_searcher))
        return 0;

    // Build whitelists and blacklists for search subsets.
    LuceneBitArray primary_whitelist, secondary_whitelist;
    CreateQueryWhitelists(null,
                          primary_searcher,
                          secondary_searcher,
                          primary_prohibited_part_query,
                          secondary_prohibited_part_query,
                          out primary_whitelist,
                          out secondary_whitelist);

    // Now run the low level queries against our indexes.
    BetterBitArray primary_matches = null;

    if (primary_required_part_queries != null) {
        if (secondary_searcher != null)
            primary_matches = DoRequiredQueries_TwoIndex(primary_searcher,
                                                         secondary_searcher,
                                                         primary_required_part_queries,
                                                         secondary_required_part_queries,
                                                         primary_whitelist,
                                                         secondary_whitelist);
        else
            primary_matches = DoRequiredQueries(primary_searcher,
                                                primary_required_part_queries,
                                                primary_whitelist);
    }

    int result = 0;
    // FIXME: Pass the count through uri-filter and other validation checks
    if (primary_matches != null)
        result = primary_matches.TrueCount;

    CloseSearchers(primary_reader, primary_searcher, secondary_reader, secondary_searcher);

    total.Stop();
    if (Debug)
        Logger.Log.Debug("###### {0}: Total query run in {1}", IndexName, total);

    return result;
}
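// Note: per the FIXME above, the returned count is the raw TrueCount
// of the primary matches; hits that a HitFilter would later reject
// during DoQuery are still included in this count.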
private static ArrayList FindRecentResults(IndexReader primary_reader,
                                           IndexReader secondary_reader,
                                           BetterBitArray primary_matches,
                                           Dictionary<int, Hit> hits_by_id,
                                           int max_results,
                                           ref int total_number_of_matches,
                                           HitFilter hit_filter,
                                           string index_name)
{
    Stopwatch b = new Stopwatch();
    b.Start();

    int count = 0;
    Document doc;

    ArrayList all_docs = null;
    TopScores top_docs = null;
    TermDocs term_docs = null;

    if (primary_matches.TrueCount > max_results)
        top_docs = new TopScores(max_results);
    else
        all_docs = new ArrayList(primary_matches.TrueCount);

    if (secondary_reader != null)
        term_docs = secondary_reader.TermDocs();

    for (int match_index = primary_matches.Count; ; match_index--) {
        // Walk across the matches backwards, since newer
        // documents are more likely to be at the end of
        // the index.
        match_index = primary_matches.GetPreviousTrueIndex(match_index);
        if (match_index < 0)
            break;

        count++;

        doc = primary_reader.Document(match_index, fields_timestamp_uri);

        // Check the timestamp --- if we have already reached our
        // limit, we might be able to reject it immediately.
        string timestamp_str;
        long timestamp_num = 0;

        timestamp_str = doc.Get("Timestamp");
        if (timestamp_str == null) {
            Logger.Log.Warn("No timestamp on {0}!", GetUriFromDocument(doc));
        } else {
            timestamp_num = Int64.Parse(doc.Get("Timestamp"));
            if (top_docs != null && !top_docs.WillAccept(timestamp_num))
                continue;
        }

        // Get the actual hit now.
        // doc was created with only 2 fields, so first get the
        // complete Lucene document for the primary document.
        // Also run our hit_filter now, if we have one. Since we
        // insist on returning the max_results most recent hits, any
        // filtering must happen now and not later.
        Hit hit = CreateHit(primary_reader.Document(match_index), secondary_reader, term_docs);
        if (hit_filter != null && !hit_filter(hit)) {
            if (Debug)
                Log.Debug("Filtered out {0}", hit.Uri);
            total_number_of_matches--;
            continue;
        }

        hits_by_id [match_index] = hit;

        // Add the document to the appropriate data structure.
        // We use the timestamp_num as the score, so high
        // scores correspond to more-recent timestamps.
        if (all_docs != null)
            all_docs.Add(hit);
        else
            top_docs.Add(timestamp_num, hit);
    }

    if (term_docs != null)
        term_docs.Close();

    b.Stop();

    if (Debug)
        Log.Debug(">>> {0}: Instantiated and scanned {1} documents in {2}", index_name, count, b);

    if (all_docs != null) {
        // Sort results before sending
        all_docs.Sort();
        return all_docs;
    } else {
        return top_docs.TopScoringObjects;
    }
}
// There are two ways we can determine the max_results
// most recent items:
//
// One is to instantiate Lucene documents for each of
// the document IDs in primary_matches. This is a
// fairly expensive operation.
//
// The other is to walk through the list of all
// document IDs in descending time order. This is
// a less expensive operation, but adds up over time
// on large data sets.
//
// We can walk about 2.5 docs for every Document we
// instantiate. So what we'll do, if we have more
// matches than available hits, is walk (m * 1.25)
// docs to see if we can fill out the top 100 hits.
// If not, we'll fall back to creating documents
// for all of them.

private static ArrayList ScanRecentDocs(IndexReader primary_reader,
                                        IndexReader secondary_reader,
                                        BetterBitArray primary_matches,
                                        Dictionary<int, Hit> hits_by_id,
                                        int max_results,
                                        ref int total_number_of_matches,
                                        HitFilter hit_filter,
                                        string index_name)
{
    Stopwatch a = new Stopwatch();
    a.Start();

    TermDocs docs = primary_reader.TermDocs();
    TermEnum enumerator = primary_reader.Terms(new Term("InvertedTimestamp", String.Empty));
    ArrayList results = new ArrayList(max_results);
    int docs_found = 0;
    int docs_walked = 0;
    int hit_filter_removed = 0;
    int max_docs = (int) (primary_matches.TrueCount * 1.25);

    Term term;
    TermDocs secondary_term_docs = null;
    if (secondary_reader != null)
        secondary_term_docs = secondary_reader.TermDocs();

    do {
        term = enumerator.Term();

        if (term.Field() != "InvertedTimestamp")
            break;

        docs.Seek(enumerator);

        while (docs.Next()
               && docs_found < max_results
               && docs_walked < max_docs) {
            int doc_id = docs.Doc();

            if (primary_matches.Get(doc_id)) {
                Document doc = primary_reader.Document(doc_id);
                Hit hit = CreateHit(doc, secondary_reader, secondary_term_docs);

                // If we have a HitFilter, apply it.
                if (hit_filter != null && !hit_filter(hit)) {
                    if (Debug)
                        Log.Debug("Filtered out {0}", hit.Uri);
                    hit_filter_removed++;
                    continue;
                }
                hits_by_id [doc_id] = hit;
                // Add the result, last modified first
                results.Add(hit);
                docs_found++;
            }

            docs_walked++;
        }
    } while (enumerator.Next()
             && docs_found < max_results
             && docs_walked < max_docs);

    docs.Close();
    if (secondary_term_docs != null)
        secondary_term_docs.Close();

    // If we've found all the docs we can return in a subset,
    // fantastic: we've probably short-circuited a slow search.
    if (docs_found != max_results) {
        // Bad luck! Not all docs were found within our walk budget.
        // Start afresh, this time traversing all results.
        results = null;
    } else {
        // Adjust total_number_of_matches. We need to do this to avoid scenarios like the following:
        // max_hits = 100. Matched 100 results, but the hit filter removed 30, so 70 results will be
        // returned. We want to avoid saying "Showing top 70 of 100". Note that since we are not passing
        // every document in the index through the hit_filter, when we say "Showing top 100 of 1234", the
        // 1234 could actually be much less. But since max_hits was 100, that will not mislead the user.
        total_number_of_matches -= hit_filter_removed;
    }

    a.Stop();
    if (Debug) {
        Log.Debug(">>> {0}: Walked {1} items, populated an enum with {2} items in {3}", index_name, docs_walked, docs_found, a);

        if (docs_found == max_results)
            Log.Debug(">>> {0}: Successfully short circuited timestamp ordering!", index_name);
    }

    return results;
}
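// Worked example of the walk budget (hypothetical numbers): with
// primary_matches.TrueCount == 1000 and max_results == 100, max_docs
// is 1250, so at most 1250 doc ids are walked in InvertedTimestamp
// order while trying to collect the 100 most recent hits. If fewer
// than 100 are found within the budget, this returns null and the
// caller (GenerateQueryResults) falls back to FindRecentResults.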
public LuceneBitArray(LNS.IndexSearcher searcher)
    : base(searcher.MaxDoc())
{
    this.searcher = searcher;
    this.collector = new BitArrayHitCollector();
    this.scratch = null;
}
// Returns all directories whose name matches "name";
// a null name matches every directory.
public ICollection GetAllDirectoryNameInfo(string name)
{
    // First we assemble a query to find all of the directories.
    string field_name;
    field_name = PropertyToFieldName(PropertyType.Keyword, Property.IsDirectoryPropKey);

    LNS.Query isdir_query = new LNS.TermQuery(new Term(field_name, "true"));

    LNS.Query query = null;

    if (name == null) {
        query = isdir_query;
    } else {
        string dirname_field;
        dirname_field = PropertyToFieldName(PropertyType.Text, Property.TextFilenamePropKey);

        LNS.Query dirname_query;
        dirname_query = LuceneCommon.StringToQuery(dirname_field, name, null);

        LNS.BooleanQuery bool_query = new LNS.BooleanQuery();
        bool_query.Add(isdir_query, LNS.BooleanClause.Occur.MUST);
        bool_query.Add(dirname_query, LNS.BooleanClause.Occur.MUST);

        query = bool_query;
    }

    // Then we actually run the query
    LNS.IndexSearcher searcher;
    //searcher = new LNS.IndexSearcher (SecondaryStore);
    searcher = LuceneCommon.GetSearcher(SecondaryStore);

    BetterBitArray matches;
    matches = new BetterBitArray(searcher.MaxDoc());

    BitArrayHitCollector collector;
    collector = new BitArrayHitCollector(matches);

    searcher.Search(query, null, collector);

    // Finally we pull all of the matching documents,
    // convert them to NameInfo, and store them in a list.
    ArrayList match_list = new ArrayList();
    int i = 0;
    while (i < matches.Count) {
        i = matches.GetNextTrueIndex(i);
        if (i >= matches.Count)
            break;

        Document doc;
        doc = searcher.Doc(i, fields_nameinfo);

        NameInfo info;
        info = DocumentToNameInfo(doc);

        match_list.Add(info);

        ++i;
    }

    LuceneCommon.ReleaseSearcher(searcher);

    return match_list;
}
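// Usage sketch (hypothetical caller; the directory name is
// illustrative):
//
//     ICollection dirs = GetAllDirectoryNameInfo("photos");
//     // GetAllDirectoryNameInfo(null) returns every directory
//     // in the index.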