// Makes sure a scratch array exists (sized to searcher.MaxDoc ()),
// or resets the existing one to all-false, and then hands it to
// the collector as its target array.
private void UseScratch ()
{
	if (scratch != null)
		scratch.SetAll (false);
	else
		scratch = new BetterBitArray (searcher.MaxDoc ());

	collector.Array = scratch;
}
// Copy constructor: takes over the bit length of "bits" and copies
// its backing words into a freshly allocated array.
// Throws ArgumentNullException if bits is null.
public BetterBitArray (BetterBitArray bits)
{
	if (bits == null)
		throw new ArgumentNullException ("bits");

	// One int stores 32 bits; round up to whole words.
	int word_count = (bits._length + 31) / 32;

	_length = bits._length;
	_array = new int [word_count];
	Array.Copy (bits._array, _array, word_count);
}
// Validates the right-hand operand of a bitwise operation.
// Throws ArgumentNullException if operand is null, and
// ArgumentException if its length differs from this array's length.
void checkOperand(BetterBitArray operand)
{
    if (operand == null)
    {
        // Name the parameter so the failure is diagnosable from the message.
        throw new ArgumentNullException("operand");
    }

    if (operand._length != _length)
    {
        throw new ArgumentException("Operand must have the same length as this array.", "operand");
    }
}
// Randomized self-test for GetNextTrueIndex: sets random bits, records
// them in a hashtable, then walks the array and checks that exactly the
// recorded indices are reported.
static void Main()
{
    Random rng = new Random();

    for (int trial = 0; trial < 10000; ++trial)
    {
        bool failed = false;
        BetterBitArray ba = new BetterBitArray(rng.Next(100000) + 1);

        // Keys are the indices that were set; the stored value is irrelevant.
        Hashtable true_hash = new Hashtable();

        int N = 1 + rng.Next(10000);
        for (int k = 0; k < N; ++k)
        {
            int j = rng.Next(ba.Count);
            ba[j] = true;
            true_hash[j] = true;
        }

        int i = 0;
        while (i < ba.Count)
        {
            i = ba.GetNextTrueIndex(i);
            if (i < ba.Count)
            {
                if (true_hash.Contains(i))
                {
                    true_hash.Remove(i);
                }
                else
                {
                    Console.WriteLine("Spurious true at {0}", i);
                    failed = true;
                }
            }
            ++i;
        }

        if (true_hash.Count > 0)
        {
            Console.WriteLine("Missed some trues:");
            // The missed indices are the KEYS. The values are boxed bools,
            // and unboxing them as int would throw InvalidCastException.
            foreach (int missed in true_hash.Keys)
            {
                Console.WriteLine(" {0}", missed);
            }
            failed = true;
        }

        Console.WriteLine("Trial #{0}: {1}", trial + 1, failed ? "FAILED" : "ok");
    }
}
// Copy constructor: builds a new array with the same length and the
// same backing words as "bits".
// Throws ArgumentNullException if bits is null.
public BetterBitArray(BetterBitArray bits)
{
    if (bits == null)
    {
        throw new ArgumentNullException("bits");
    }

    int sourceLength = bits._length;

    // 32 bits per int, rounded up to whole words.
    int[] words = new int[(sourceLength + 31) / 32];
    Array.Copy(bits._array, words, words.Length);

    _length = sourceLength;
    _array = words;
}
// Creates an array of "length" small integers, each stored in just
// enough bits to hold values up to max_int.
//
// Throws ArgumentOutOfRangeException if max_int is negative, and
// OverflowException if length * bits-per-int does not fit in an int.
public SmallIntArray (int length, int max_int)
{
	// An arithmetic right shift of a negative number never reaches zero,
	// so without this guard the bit-counting loop would never terminate.
	if (max_int < 0)
		throw new ArgumentOutOfRangeException ("max_int", "max_int must be non-negative");

	this.length = length;
	this.max_int = max_int;

	// Count the number of significant bits in max_int.
	this.bits_per_int = 0;
	for (int remaining = max_int; remaining != 0; remaining >>= 1)
		++this.bits_per_int;

	// checked: fail loudly rather than allocate a wrapped-around size.
	this.bit_array = new BetterBitArray (checked (this.length * this.bits_per_int));
}
// Creates an array of "length" small integers, each stored in just
// enough bits to hold values up to max_int.
//
// Throws ArgumentOutOfRangeException if max_int is negative, and
// OverflowException if length * bits-per-int does not fit in an int.
public SmallIntArray(int length, int max_int)
{
    // An arithmetic right shift of a negative number never reaches zero,
    // so without this guard the bit-counting loop would never terminate.
    if (max_int < 0)
    {
        throw new ArgumentOutOfRangeException("max_int", "max_int must be non-negative");
    }

    this.length = length;
    this.max_int = max_int;

    // Count the number of significant bits in max_int.
    this.bits_per_int = 0;
    for (int remaining = max_int; remaining != 0; remaining >>= 1)
    {
        ++this.bits_per_int;
    }

    // checked: fail loudly rather than allocate a wrapped-around size.
    this.bit_array = new BetterBitArray(checked(this.length * this.bits_per_int));
}
// In-place bitwise XOR with "value" (validated by checkOperand).
// Returns this instance so calls can be chained.
public BetterBitArray Xor(BetterBitArray value)
{
    checkOperand(value);

    int[] other = value._array;
    int wordCount = (_length + 31) / 32;

    int w = 0;
    while (w < wordCount)
    {
        _array[w] ^= other[w];
        w++;
    }

    _version++;

    // The result's contents are unknown without a rescan, so reset the
    // cached state (-1 presumably means "count not computed").
    _contains_true = ContainsTrueState.Maybe;
    _cached_true_count = -1;

    return this;
}
// Adds one to every slot whose index is set in bit_array.
//
// Throws ArgumentNullException if bit_array is null, and
// ArgumentException if its length differs from this array's length.
public void Incr(BetterBitArray bit_array)
{
    if (bit_array == null)
    {
        throw new ArgumentNullException("bit_array");
    }

    // ArgumentException derives from Exception, so callers that catch
    // Exception keep working.
    if (bit_array.Length != length)
    {
        throw new ArgumentException("Incr BetterBitArray has wrong length!", "bit_array");
    }

    int i = 0;
    while (i < length)
    {
        // Jump to the next set bit; a result >= length means none remain.
        i = bit_array.GetNextTrueIndex(i);
        if (i >= length)
        {
            break;
        }

        Set(i, Get(i) + 1);
        ++i;
    }
}
// In-place bitwise OR with "value" (validated by checkOperand).
// Returns this instance so calls can be chained.
public BetterBitArray Or(BetterBitArray value)
{
    checkOperand(value);

    int[] other = value._array;
    int wordCount = (_length + 31) / 32;
    for (int w = 0; w < wordCount; w++)
    {
        _array[w] |= other[w];
    }

    _version++;

    // If either side was already known to contain a true bit, so does the
    // union; otherwise we cannot tell without scanning.
    bool knownTrue = _contains_true == ContainsTrueState.Yes
        || value._contains_true == ContainsTrueState.Yes;
    _contains_true = knownTrue ? ContainsTrueState.Yes : ContainsTrueState.Maybe;

    // Reset the cached count (-1 presumably means "not computed").
    _cached_true_count = -1;

    return this;
}
// XORs the bits of "value" into this array, word by word, after
// checkOperand has validated it. Returns this instance.
public BetterBitArray Xor (BetterBitArray value)
{
	checkOperand (value);

	int word_count = (_length + 31) / 32;
	int [] rhs = value._array;
	for (int index = 0; index != word_count; ++index)
		_array [index] = _array [index] ^ rhs [index];

	_version++;

	// Cached state no longer describes the contents: mark both as
	// unknown (-1 presumably means "count not computed").
	_cached_true_count = -1;
	_contains_true = ContainsTrueState.Maybe;

	return this;
}
// Adds one to every slot whose index is set in bit_array.
//
// Throws ArgumentNullException if bit_array is null, and
// ArgumentException if its length differs from this array's length.
public void Incr (BetterBitArray bit_array)
{
	if (bit_array == null)
		throw new ArgumentNullException ("bit_array");

	// ArgumentException derives from Exception, so callers that catch
	// Exception keep working.
	if (bit_array.Length != length)
		throw new ArgumentException ("Incr BetterBitArray has wrong length!", "bit_array");

	int i = 0;
	while (i < length) {
		// Jump to the next set bit; a result >= length means none remain.
		i = bit_array.GetNextTrueIndex (i);
		if (i >= length)
			break;

		Set (i, Get (i) + 1);
		++i;
	}
}
// Returns a list of all files and directories in dir, as Dirent
// objects built from the matching documents in the primary store.
static ICollection GetAllItemsInDirectory (DirectoryInfo dir)
{
	// Form the query.
	string parent_uri_str = PathToUri (dir.FullName).ToString ();

	// Instead of taking the painful way of using BeagleAnalyzer, let's just add the prefix manually.
	// LuceneCommon thinks exposing secret property type encoding is bad, I think so too... except for now.
	string key = "prop:k:" + Property.ParentDirUriPropKey;
	//Logger.Log.Debug ("Querying for {0}={1}", parent_uri_str, key);
	LNS.Query query = new LNS.TermQuery (new Term (key, parent_uri_str));

	ArrayList match_list = new ArrayList ();

	// Do the search. The searcher is handed back in a finally block so
	// that it is released even if the search or document loading throws.
	LNS.IndexSearcher searcher = LuceneCommon.GetSearcher (driver.PrimaryStore);
	try {
		BetterBitArray matches = new BetterBitArray (searcher.MaxDoc ());
		BitArrayHitCollector collector = new BitArrayHitCollector (matches);
		searcher.Search (query, null, collector);

		// Finally we pull all of the matching documents,
		// convert them to Dirent, and store them in a list.
		int i = 0;
		while (i < matches.Count) {
			i = matches.GetNextTrueIndex (i);
			if (i >= matches.Count)
				break;

			Document doc = searcher.Doc (i);
			Dirent info = DocumentToDirent (doc);
			match_list.Add (info);

			++i;
		}
	} finally {
		LuceneCommon.ReleaseSearcher (searcher);
	}

	//Logger.Log.Debug ("Found {0} items in {1}", match_list.Count, dir.FullName);
	return match_list;
}
// Creates a collector that uses the supplied bit array as its
// "matches" target.
public BitArrayHitCollector (BetterBitArray matches)
{
	this.matches = matches;
}
// Instantiates hits for the matches in primary_matches, walking the
// document ids from highest to lowest, and returns at most max_results
// of them keyed on timestamp.
//
// Every accepted hit is also recorded in hits_by_id under its document
// id. total_number_of_matches is decremented for each hit rejected by
// hit_filter.
//
// If there are no more matches than max_results, all hits are returned
// in a sorted ArrayList; otherwise a TopScores structure keeps only the
// max_results highest timestamps.
private static ArrayList FindRecentResults (IndexReader primary_reader,
					    IndexReader secondary_reader,
					    BetterBitArray primary_matches,
					    Dictionary<int, Hit> hits_by_id,
					    int max_results,
					    ref int total_number_of_matches,
					    HitFilter hit_filter,
					    string index_name)
{
	Stopwatch b = new Stopwatch ();
	b.Start ();

	int count = 0;
	Document doc;

	ArrayList all_docs = null;
	TopScores top_docs = null;
	TermDocs term_docs = null;

	if (primary_matches.TrueCount > max_results)
		top_docs = new TopScores (max_results);
	else
		all_docs = new ArrayList (primary_matches.TrueCount);

	if (secondary_reader != null)
		term_docs = secondary_reader.TermDocs ();

	// try/finally so term_docs is closed even if reading a document,
	// parsing a timestamp or running the hit filter throws.
	try {
		for (int match_index = primary_matches.Count; ; match_index --) {

			// Walk across the matches backwards, since newer
			// documents are more likely to be at the end of
			// the index.
			match_index = primary_matches.GetPreviousTrueIndex (match_index);
			if (match_index < 0)
				break;

			count++;

			doc = primary_reader.Document (match_index, fields_timestamp_uri);

			// Check the timestamp --- if we have already reached our
			// limit, we might be able to reject it immediately.
			string timestamp_str;
			long timestamp_num = 0;

			timestamp_str = doc.Get ("Timestamp");
			if (timestamp_str == null) {
				Logger.Log.Warn ("No timestamp on {0}!", GetUriFromDocument (doc));
			} else {
				// Reuse the value fetched above instead of
				// looking the field up a second time.
				timestamp_num = Int64.Parse (timestamp_str);
				if (top_docs != null && ! top_docs.WillAccept (timestamp_num))
					continue;
			}

			// Get the actual hit now.
			// doc was created with only 2 fields, so first get the complete lucene document for primary document.
			// Also run our hit_filter now, if we have one. Since we insist on returning max_results
			// most recent hits, any hits that would be filtered out should happen now and not later.
			Hit hit = CreateHit (primary_reader.Document (match_index), secondary_reader, term_docs);
			if (hit_filter != null && ! hit_filter (hit)) {
				if (Debug)
					Log.Debug ("Filtered out {0}", hit.Uri);
				total_number_of_matches --;
				continue;
			}

			hits_by_id [match_index] = hit;

			// Add the document to the appropriate data structure.
			// We use the timestamp_num as the score, so high
			// scores correspond to more-recent timestamps.
			if (all_docs != null)
				all_docs.Add (hit);
			else
				top_docs.Add (timestamp_num, hit);
		}
	} finally {
		if (term_docs != null)
			term_docs.Close ();
	}

	b.Stop ();

	if (Debug)
		Log.Debug (">>> {0}: Instantiated and scanned {1} documents in {2}", index_name, count, b);

	if (all_docs != null) {
		// Sort results before sending
		all_docs.Sort ();
		return all_docs;
	} else {
		return top_docs.TopScoringObjects;
	}
}
// There are two ways we can determine the max_results
// most recent items:
//
// One is to instantiate Lucene documents for each of
// the document IDs in primary_matches.  This is a
// fairly expensive operation.
//
// The other is to walk through the list of all
// document IDs in descending time order.  This is
// a less expensive operation, but adds up over time
// on large data sets.
//
// We can walk about 2.5 docs for every Document we
// instantiate.  So what we'll do, if we have more
// matches than available hits, is walk (m * 1.25)
// docs to see if we can fill out the top 100 hits.
// If not, we'll fall back to creating documents
// for all of them.
//
// Returns the list of hits (most recent first) when exactly
// max_results hits were found, or null when the walk could not
// fill the list and the caller has to fall back to a full scan.
// In the successful case total_number_of_matches is reduced by
// the number of hits rejected by hit_filter.
private static ArrayList ScanRecentDocs (IndexReader primary_reader,
					 IndexReader secondary_reader,
					 BetterBitArray primary_matches,
					 Dictionary<int, Hit> hits_by_id,
					 int max_results,
					 ref int total_number_of_matches,
					 HitFilter hit_filter,
					 string index_name)
{
	Stopwatch a = new Stopwatch ();
	a.Start ();

	ArrayList results = new ArrayList (max_results);
	int docs_found = 0;
	int docs_walked = 0;
	int hit_filter_removed = 0;
	int max_docs = (int) (primary_matches.TrueCount * 1.25);

	TermDocs docs = null;
	TermEnum enumerator = null;
	TermDocs secondary_term_docs = null;

	// try/finally so every Lucene enumerator opened here is closed,
	// even if document loading or the hit filter throws.
	try {
		docs = primary_reader.TermDocs ();
		enumerator = primary_reader.Terms (new Term ("InvertedTimestamp", String.Empty));

		if (secondary_reader != null)
			secondary_term_docs = secondary_reader.TermDocs ();

		do {
			Term term = enumerator.Term ();

			// A null term means the enumeration is already past its
			// last term (e.g. the index has no InvertedTimestamp
			// terms at all).
			if (term == null || term.Field () != "InvertedTimestamp")
				break;

			docs.Seek (enumerator);

			while (docs.Next ()
			       && docs_found < max_results
			       && docs_walked < max_docs) {
				int doc_id = docs.Doc ();

				if (primary_matches.Get (doc_id)) {
					Document doc = primary_reader.Document (doc_id);
					Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs);

					// If we have a HitFilter, apply it.
					// Note that a rejected hit is not counted in docs_walked.
					if (hit_filter != null && ! hit_filter (hit)) {
						if (Debug)
							Log.Debug ("Filtered out {0}", hit.Uri);
						hit_filter_removed ++;
						continue;
					}

					hits_by_id [doc_id] = hit;

					// Add the result, last modified first
					results.Add (hit);
					docs_found++;
				}

				docs_walked++;
			}
		} while (enumerator.Next ()
			 && docs_found < max_results
			 && docs_walked < max_docs);
	} finally {
		if (enumerator != null)
			enumerator.Close ();
		if (docs != null)
			docs.Close ();
		if (secondary_term_docs != null)
			secondary_term_docs.Close ();
	}

	// If we've found all the docs we can return in a subset!
	// Fantastic, we've probably short circuited a slow search.
	if (docs_found != max_results) {
		// Otherwise bad luck! Not all docs found
		// Start afresh - this time traversing all results
		results = null;
	} else {
		// Adjust total_number_of_matches. We need to do this to avoid scenarios like the following:
		// max_hits = 100. Matched 100 results. But hit filter removed 30. So 70 results will be returned.
		// We want to avoid saying "Showing top 70 of 100". Note that since we are not passing
		// every document in the index through the hit_filter, when we say "Showing top 100 of 1234", the
		// 1234 could actually be much less. But since max_hits was 100, that will not mislead the user.
		total_number_of_matches -= hit_filter_removed;
	}

	a.Stop ();
	if (Debug) {
		Log.Debug (">>> {0}: Walked {1} items, populated an enum with {2} items in {3}", index_name, docs_walked, docs_found, a);

		if (docs_found == max_results)
			Log.Debug (">>> {0}: Successfully short circuited timestamp ordering!", index_name);
	}

	return results;
}
// Turns the bit array of primary matches into Hit objects, scores
// them, and sends them to "result" in chunks.
//
// When there are more matches than max_results, the cheaper
// ScanRecentDocs walk is tried first; if it returns null, we fall
// back to FindRecentResults.
private static void GenerateQueryResults (IndexReader primary_reader,
					  IndexReader secondary_reader,
					  BetterBitArray primary_matches,
					  IQueryResult result,
					  ICollection query_term_list,
					  int max_results,
					  HitFilter hit_filter,
					  string index_name)
{
	int num_hits;

	if (Debug)
		Logger.Log.Debug (">>> {0}: Initially handed {1} matches", index_name, primary_matches.TrueCount);

	if (primary_matches.TrueCount <= max_results) {
		if (Debug)
			Logger.Log.Debug (">>> {0}: Initial count is within our limit of {1}", index_name, max_results);
		num_hits = primary_matches.TrueCount;
	} else {
		if (Debug)
			Logger.Log.Debug (">>> {0}: Number of hits is capped at {1}", index_name, max_results);
		num_hits = max_results;
	}

	Stopwatch total, d, e;
	total = new Stopwatch ();
	d = new Stopwatch ();
	e = new Stopwatch ();

	total.Start ();

	ArrayList final_list_of_hits = null;

	// This is used only for scoring
	Dictionary<int, Hit> hits_by_id = new Dictionary<int, Hit> (num_hits);

	int total_number_of_matches = primary_matches.TrueCount;

	if (primary_matches.TrueCount > max_results)
		final_list_of_hits = ScanRecentDocs (primary_reader,
						     secondary_reader,
						     primary_matches,
						     hits_by_id,
						     max_results,
						     ref total_number_of_matches,
						     hit_filter,
						     index_name);

	if (final_list_of_hits == null)
		final_list_of_hits = FindRecentResults (primary_reader,
							secondary_reader,
							primary_matches,
							hits_by_id,
							max_results,
							ref total_number_of_matches,
							hit_filter,
							index_name);

	d.Start ();

	ScoreHits (hits_by_id, primary_reader, query_term_list);
	hits_by_id = null;

	d.Stop ();

	if (Debug)
		Log.Debug (">>> {0}: Scored hits in {1}", index_name, d);

	e.Start ();

	// 25 hits seems to be the sweet spot: anything lower
	// and serialization overhead gets us, higher takes
	// longer to send out.
	const int MAX_QUEUED_HITS = 25;

	// Break up the hits into reasonably sized chunks for
	// sending over the wire.  Full chunks are sent while more
	// than MAX_QUEUED_HITS remain; the last (possibly full)
	// chunk is sent below together with the total match count.
	while (final_list_of_hits.Count > MAX_QUEUED_HITS) {
		// Copy the chunk: ArrayList.GetRange returns a view that is
		// invalidated as soon as the underlying list is modified,
		// which the RemoveRange below does.
		ArrayList chunk = new ArrayList (final_list_of_hits.GetRange (0, MAX_QUEUED_HITS));
		result.Add (chunk);
		final_list_of_hits.RemoveRange (0, MAX_QUEUED_HITS);
	}

	// Flush the remaining hits
	result.Add (final_list_of_hits, total_number_of_matches);
	final_list_of_hits = null;

	e.Stop ();

	if (Debug)
		Log.Debug (">>> {0}: Hit filters executed and results sent in {1}", index_name, e);

	total.Stop ();

	if (Debug) {
		Logger.Log.Debug (">>> {0}: GenerateQueryResults time statistics:", index_name);
		Logger.Log.Debug (">>> {0}: Scored hits {1,6} ({2:0.0}%)", index_name, d, 100 * d.ElapsedTime / total.ElapsedTime);
		Logger.Log.Debug (">>> {0}: Results sent {1,6} ({2:0.0}%)", index_name, e, 100 * e.ElapsedTime / total.ElapsedTime);
		Logger.Log.Debug (">>> {0}: TOTAL {1,6}", index_name, total);
	}
}
// Runs each required query pair against the primary and secondary
// indexes and combines the results into one primary bit array.
//
// Any whitelists that are passed in must be fully joined, or
// query results will be incorrect.
private static BetterBitArray DoRequiredQueries_TwoIndex (LNS.IndexSearcher primary_searcher,
							  LNS.IndexSearcher secondary_searcher,
							  ArrayList primary_queries,
							  ArrayList secondary_queries,
							  BetterBitArray primary_whitelist,
							  BetterBitArray secondary_whitelist)
{
	ArrayList pending = new ArrayList ();

	// Phase one: run every low-level query and wrap each
	// primary/secondary pair of results in a MatchInfo.
	for (int n = 0; n < primary_queries.Count; ++n) {
		LNS.Query primary_query = primary_queries [n] as LNS.Query;
		LNS.Query secondary_query = secondary_queries [n] as LNS.Query;

		LuceneBitArray primary_bits = new LuceneBitArray (primary_searcher);
		if (primary_query != null) {
			primary_bits.Or (primary_query);
			if (primary_whitelist != null)
				primary_bits.And (primary_whitelist);
		}

		LuceneBitArray secondary_bits = new LuceneBitArray (secondary_searcher);
		if (secondary_query != null) {
			secondary_bits.Or (secondary_query);
			if (secondary_whitelist != null)
				secondary_bits.And (secondary_whitelist);
		}

		MatchInfo entry = new MatchInfo ();
		entry.PrimaryMatches = primary_bits;
		entry.SecondaryMatches = secondary_bits;
		entry.RestrictBy (null); // a hack to initialize the UpperBound
		pending.Add (entry);
	}

	// Phase two: repeatedly pull out the smallest MatchInfo,
	// join it, and use it to restrict the rest.  Going smallest
	// first keeps the Joins cheap.
	while (pending.Count > 1) {

		// Linear scan for the minimum (first one wins on ties).
		int smallest_at = 0;
		MatchInfo smallest = (MatchInfo) pending [0];
		for (int n = 1; n < pending.Count; ++n) {
			MatchInfo candidate = (MatchInfo) pending [n];
			if (candidate.CompareTo (smallest) < 0) {
				smallest = candidate;
				smallest_at = n;
			}
		}
		pending.RemoveAt (smallest_at);

		// An empty smallest set means nothing can match at all,
		// so its (empty) primary array is the answer.
		if (smallest.UpperBound == 0)
			return smallest.PrimaryMatches;

		smallest.Join ();

		foreach (MatchInfo other in pending)
			other.RestrictBy (smallest);
	}

	// For the final pair, we don't need to do a full join:
	// mapping the secondary onto the primary is sufficient
	MatchInfo survivor = (MatchInfo) pending [0];
	survivor.SecondaryMatches.ProjectOnto (survivor.PrimaryMatches);

	return survivor.PrimaryMatches;
}
////////////////////////////////////////////////////////////////

//
// Special logic for handling our set of required queries
//

// Single-index case, which is the easy one: AND all of the required
// queries together in one BooleanQuery, run it, and intersect the
// result with the whitelist if one was given.
private static BetterBitArray DoRequiredQueries (LNS.IndexSearcher primary_searcher,
						 ArrayList primary_queries,
						 BetterBitArray primary_whitelist)
{
	LNS.BooleanQuery all_required = new LNS.BooleanQuery ();
	foreach (LNS.Query required in primary_queries)
		all_required.Add (required, LNS.BooleanClause.Occur.MUST);

	LuceneBitArray matches = new LuceneBitArray (primary_searcher, all_required);

	if (primary_whitelist == null)
		return matches;

	matches.And (primary_whitelist);
	return matches;
}
// Randomized self-test for GetNextTrueIndex: sets random bits, records
// them in a hashtable, then walks the array and checks that exactly the
// recorded indices are reported.
static void Main ()
{
	Random rng = new Random ();

	for (int trial = 0; trial < 10000; ++trial) {

		bool failed = false;
		BetterBitArray ba = new BetterBitArray (rng.Next (100000) + 1);

		// Keys are the indices that were set; the stored value is irrelevant.
		Hashtable true_hash = new Hashtable ();

		int N = 1 + rng.Next (10000);
		for (int k = 0; k < N; ++k) {
			int j = rng.Next (ba.Count);
			ba [j] = true;
			true_hash [j] = true;
		}

		int i = 0;
		while (i < ba.Count) {
			i = ba.GetNextTrueIndex (i);
			if (i < ba.Count) {
				if (true_hash.Contains (i)) {
					true_hash.Remove (i);
				} else {
					Console.WriteLine ("Spurious true at {0}", i);
					failed = true;
				}
			}
			++i;
		}

		if (true_hash.Count > 0) {
			Console.WriteLine ("Missed some trues:");
			// The missed indices are the KEYS.  The values are boxed
			// bools, and unboxing them as int would throw
			// InvalidCastException.
			foreach (int missed in true_hash.Keys)
				Console.WriteLine (" {0}", missed);
			failed = true;
		}

		Console.WriteLine ("Trial #{0}: {1}", trial+1, failed ? "FAILED" : "ok");
	}
}
// Creates a bit array sized to searcher.MaxDoc (), remembering the
// searcher and setting up a fresh hit collector.  The scratch array
// starts out unallocated.
public LuceneBitArray (LNS.IndexSearcher searcher)
	: base (searcher.MaxDoc ())
{
	this.scratch = null;
	this.collector = new BitArrayHitCollector ();
	this.searcher = searcher;
}
// Creates an enumerator over ba with its index at -1, recording the
// array's current _version alongside it.
public BetterBitArrayEnumerator (BetterBitArray ba)
{
	_bitArray = ba;
	_version = ba._version;
	_index = -1;
}
// Return all directories with the given name, as a list of NameInfo
// objects.  If name is null, every directory is returned.
public ICollection GetAllDirectoryNameInfo (string name)
{
	// First we assemble a query to find all of the directories.
	string field_name;
	field_name = PropertyToFieldName (PropertyType.Keyword,
					  Property.IsDirectoryPropKey);
	LNS.Query isdir_query = new LNS.TermQuery (new Term (field_name, "true"));

	LNS.Query query = null;

	if (name == null) {
		query = isdir_query;
	} else {
		string dirname_field;
		dirname_field = PropertyToFieldName (PropertyType.Text,
						     Property.TextFilenamePropKey);
		LNS.Query dirname_query;
		dirname_query = LuceneCommon.StringToQuery (dirname_field, name, null);

		LNS.BooleanQuery bool_query = new LNS.BooleanQuery ();
		bool_query.Add (isdir_query, LNS.BooleanClause.Occur.MUST);
		bool_query.Add (dirname_query, LNS.BooleanClause.Occur.MUST);

		query = bool_query;
	}

	ArrayList match_list = new ArrayList ();

	// Then we actually run the query.  The searcher is handed back in
	// a finally block so that it is released even if the search or
	// document loading throws.
	LNS.IndexSearcher searcher = LuceneCommon.GetSearcher (SecondaryStore);
	try {
		BetterBitArray matches = new BetterBitArray (searcher.MaxDoc ());
		BitArrayHitCollector collector = new BitArrayHitCollector (matches);
		searcher.Search (query, null, collector);

		// Finally we pull all of the matching documents,
		// convert them to NameInfo, and store them in a list.
		int i = 0;
		while (i < matches.Count) {
			i = matches.GetNextTrueIndex (i);
			if (i >= matches.Count)
				break;

			Document doc = searcher.Doc (i, fields_nameinfo);
			NameInfo info = DocumentToNameInfo (doc);
			match_list.Add (info);

			++i;
		}
	} finally {
		LuceneCommon.ReleaseSearcher (searcher);
	}

	return match_list;
}
// Validates the right-hand operand of a bitwise operation.
// Throws ArgumentNullException if operand is null, and
// ArgumentException if its length differs from this array's length.
void checkOperand (BetterBitArray operand)
{
	// Name the parameter so the failure is diagnosable from the message.
	if (operand == null)
		throw new ArgumentNullException ("operand");

	if (operand._length != _length)
		throw new ArgumentException ("Operand must have the same length as this array.", "operand");
}
// Creates an enumerator over ba with its index at -1, recording the
// array's current _version alongside it.
public BetterBitArrayEnumerator(BetterBitArray ba)
{
    _bitArray = ba;
    _version = ba._version;
    _index = -1;
}