static void LaunchHelper ()
{
    // If we are in the process of shutting down, return immediately.
    if (Shutdown.ShutdownRequested)
        return;

    lock (helper_lock) {
        // If a helper appears to be running, return immediately.
        if (CheckHelper ())
            return;

        Logger.Log.Debug ("Launching helper process");

        SafeProcess p = new SafeProcess ();
        string[] args = new string [3];
        args [0] = helper_path;

        if (BeagrepDaemon.DisableTextCache)
            args [1] = "--disable-text-cache";
        else
            args [1] = String.Empty;

        if (Log.Level == LogLevel.Debug)
            args [2] = "--debug";
        else
            args [2] = String.Empty;

        p.Arguments = args;
        p.RedirectStandardOutput = false;
        p.RedirectStandardError = false;

        p.Start ();
        Logger.Log.Debug ("IndexHelper PID is {0}", p.Id);

        // Poll the helper's socket.  Wait up to a minute
        // (500 ms * 120 times) for the helper to be ready
        // to handle requests.
        Stopwatch watch = new Stopwatch ();
        watch.Start ();

        int poll_count = 0;
        bool found_helper;
        do {
            Thread.Sleep (500);
            ++poll_count;
            found_helper = CheckHelper ();
        } while (poll_count < 120 && ! found_helper && ! Shutdown.ShutdownRequested);

        watch.Stop ();

        if (! found_helper)
            throw new Exception (String.Format ("Couldn't launch helper process {0}", p.Id));

        Logger.Log.Debug ("Found IndexHelper ({0}) in {1}", p.Id, watch);

        helper_pid = p.Id;
    }
}
////////////////////////////////////////////////////////////////

public int DoCountMatchQuery (Query query, QueryPartHook query_part_hook)
{
    if (Debug)
        Logger.Log.Debug ("###### {0}: Starting low-level queries", IndexName);

    Stopwatch total;
    total = new Stopwatch ();
    total.Start ();

    ArrayList primary_required_part_queries;
    ArrayList secondary_required_part_queries;

    LNS.BooleanQuery primary_prohibited_part_query;
    LNS.BooleanQuery secondary_prohibited_part_query;

    AndHitFilter all_hit_filters;

    ArrayList term_list;

    term_list = AssembleQuery (query, query_part_hook, null,
                               out primary_required_part_queries,
                               out secondary_required_part_queries,
                               out primary_prohibited_part_query,
                               out secondary_prohibited_part_query,
                               out all_hit_filters);

    // If we have no required parts, give up.
    if (primary_required_part_queries == null)
        return 0;

    IndexReader primary_reader;
    LNS.IndexSearcher primary_searcher;
    IndexReader secondary_reader;
    LNS.IndexSearcher secondary_searcher;

    if (! BuildSearchers (out primary_reader, out primary_searcher, out secondary_reader, out secondary_searcher))
        return 0;

    // Build whitelists and blacklists for search subsets.
    LuceneBitArray primary_whitelist, secondary_whitelist;
    CreateQueryWhitelists (null, primary_searcher, secondary_searcher,
                           primary_prohibited_part_query, secondary_prohibited_part_query,
                           out primary_whitelist, out secondary_whitelist);

    // Now run the low level queries against our indexes.
    BetterBitArray primary_matches = null;

    if (primary_required_part_queries != null) {
        if (secondary_searcher != null)
            primary_matches = DoRequiredQueries_TwoIndex (primary_searcher, secondary_searcher,
                                                          primary_required_part_queries,
                                                          secondary_required_part_queries,
                                                          primary_whitelist, secondary_whitelist);
        else
            primary_matches = DoRequiredQueries (primary_searcher, primary_required_part_queries, primary_whitelist);
    }

    int result = 0;
    // FIXME: Pass the count through uri-filter and other validation checks
    if (primary_matches != null)
        result = primary_matches.TrueCount;

    CloseSearchers (primary_reader, primary_searcher, secondary_reader, secondary_searcher);

    total.Stop ();
    if (Debug)
        Logger.Log.Debug ("###### {0}: Total query run in {1}", IndexName, total);

    return result;
}
public static bool StartupProcess ()
{
    // Profile our initialization
    Stopwatch stopwatch = new Stopwatch ();
    stopwatch.Start ();

    // Fire up our server
    if (! StartServer ()) {
        if (! arg_replace) {
            Logger.Log.Error ("Could not set up the listener for beagrep requests. " +
                              "There is probably another beagrepd instance running. " +
                              "Use --replace to replace the running service");
            Environment.Exit (1);
        }

        ReplaceExisting ();
    }

    // Set up out-of-process indexing
    LuceneQueryable.IndexerHook = new LuceneQueryable.IndexerCreator (RemoteIndexer.NewRemoteIndexer);

    Config config = Conf.Get (Conf.Names.DaemonConfig);

    // Initialize synchronization to keep the indexes local if PathFinder.StorageDir
    // is on a non-block device, or if BEAGREP_SYNCHRONIZE_LOCALLY is set
    if ((! SystemInformation.IsPathOnBlockDevice (PathFinder.StorageDir) &&
         config.GetOption (Conf.Names.IndexSynchronization, true)) ||
        Environment.GetEnvironmentVariable ("BEAGREP_SYNCHRONIZE_LOCALLY") != null)
        IndexSynchronization.Initialize ();

    // Start the query driver.
    Logger.Log.Debug ("Starting QueryDriver");
    QueryDriver.Start ();

    // Start our battery monitor so we can shut down the
    // scheduler if needed.
    BatteryMonitor.Init ();

    bool initially_on_battery = ! BatteryMonitor.UsingAC && ! config.GetOption (Conf.Names.IndexOnBattery, false);

    // Start the Global Scheduler thread
    if (! arg_disable_scheduler) {
        if (! initially_on_battery) {
            Logger.Log.Debug ("Starting Scheduler thread");
            Scheduler.Global.Start ();
        } else {
            Log.Debug ("Beagrep started on battery, not starting scheduler thread");
        }
    }

    // Start our Inotify threads
    Inotify.Start ();

    // Test if the FileAdvise stuff is working: This will print a
    // warning if not.  The actual advice calls will fail silently.
    FileAdvise.TestAdvise ();

#if ENABLE_AVAHI
    zeroconf = new Beagrep.Daemon.Network.Zeroconf ();
#endif

    Conf.WatchForUpdates ();

    stopwatch.Stop ();

    Logger.Log.Debug ("Daemon initialization finished after {0}", stopwatch);

    SystemInformation.LogMemoryUsage ();

    if (arg_indexing_test_mode) {
        Thread.Sleep (1000); // Ugly paranoia: wait a second for the backends to settle.
        Logger.Log.Debug ("Running in indexing test mode");
        Scheduler.Global.EmptyQueueEvent += OnEmptySchedulerQueue;
        Scheduler.Global.Add (null); // pulse the scheduler
    }

    return false;
}
private static ArrayList FindRecentResults (IndexReader primary_reader,
                                            IndexReader secondary_reader,
                                            BetterBitArray primary_matches,
                                            Dictionary<int, Hit> hits_by_id,
                                            int max_results,
                                            ref int total_number_of_matches,
                                            HitFilter hit_filter,
                                            string index_name)
{
    Stopwatch b = new Stopwatch ();
    b.Start ();

    int count = 0;
    Document doc;

    ArrayList all_docs = null;
    TopScores top_docs = null;
    TermDocs term_docs = null;

    if (primary_matches.TrueCount > max_results)
        top_docs = new TopScores (max_results);
    else
        all_docs = new ArrayList (primary_matches.TrueCount);

    if (secondary_reader != null)
        term_docs = secondary_reader.TermDocs ();

    for (int match_index = primary_matches.Count; ; match_index --) {
        // Walk across the matches backwards, since newer
        // documents are more likely to be at the end of
        // the index.
        match_index = primary_matches.GetPreviousTrueIndex (match_index);
        if (match_index < 0)
            break;

        count++;

        doc = primary_reader.Document (match_index, fields_timestamp_uri);

        // Check the timestamp --- if we have already reached our
        // limit, we might be able to reject it immediately.
        string timestamp_str;
        long timestamp_num = 0;

        timestamp_str = doc.Get ("Timestamp");
        if (timestamp_str == null) {
            Logger.Log.Warn ("No timestamp on {0}!", GetUriFromDocument (doc));
        } else {
            timestamp_num = Int64.Parse (doc.Get ("Timestamp"));
            if (top_docs != null && ! top_docs.WillAccept (timestamp_num))
                continue;
        }

        // Get the actual hit now.
        // doc was created with only 2 fields, so first get the complete lucene
        // document for the primary document.
        // Also run our hit_filter now, if we have one.  Since we insist on returning
        // the max_results most recent hits, any filtering has to happen now and not later.
        Hit hit = CreateHit (primary_reader.Document (match_index), secondary_reader, term_docs);
        if (hit_filter != null && ! hit_filter (hit)) {
            if (Debug)
                Log.Debug ("Filtered out {0}", hit.Uri);
            total_number_of_matches --;
            continue;
        }

        hits_by_id [match_index] = hit;

        // Add the document to the appropriate data structure.
        // We use the timestamp_num as the score, so high
        // scores correspond to more-recent timestamps.
        if (all_docs != null)
            all_docs.Add (hit);
        else
            top_docs.Add (timestamp_num, hit);
    }

    if (term_docs != null)
        term_docs.Close ();

    b.Stop ();

    if (Debug)
        Log.Debug (">>> {0}: Instantiated and scanned {1} documents in {2}", index_name, count, b);

    if (all_docs != null) {
        // Sort results before sending
        all_docs.Sort ();
        return all_docs;
    } else {
        return top_docs.TopScoringObjects;
    }
}
private IndexerReceipt [] Flush_Unlocked (IndexerRequest request)
{
    ArrayList receipt_queue;
    receipt_queue = new ArrayList ();

    IndexReader primary_reader, secondary_reader;
    primary_reader = IndexReader.Open (PrimaryStore);
    secondary_reader = IndexReader.Open (SecondaryStore);

    // Step #1: Make our first pass over the list of
    // indexables that make up our request.  For each add
    // or property change in the request, get the Lucene
    // documents so we can move forward any persistent
    // properties (for adds) or all old properties (for
    // property changes).
    //
    // Then, for each add or remove in the request,
    // delete the associated documents from the index.
    // Note that we previously cached added documents so
    // that we can move persistent properties forward.

    // parent_child_old_props is a double-nested hashtable (a depth-2 tree)
    // indexed by the parent uri; each entry stores another hashtable, indexed
    // by the parent and child uris, holding their old documents.
    // FIXME: a 2-level hashtable is a waste for any non-child document.
    // Replace this by a better data structure.
    Hashtable parent_child_old_props = UriFu.NewHashtable ();
    TermDocs term_docs = secondary_reader.TermDocs ();
    int delete_count = 0;

    IEnumerable request_indexables = request.Indexables;

    foreach (Indexable indexable in request_indexables) {
        string uri_str = UriFu.UriToEscapedString (indexable.Uri);
        Term term;

        // Store the necessary properties from old documents for re-addition
        if (indexable.Type == IndexableType.Add ||
            indexable.Type == IndexableType.PropertyChange) {

            term = new Term ("Uri", uri_str);
            term_docs.Seek (term);

            Hashtable this_parent_child_props = null;

            if (term_docs.Next ()) {
                this_parent_child_props = UriFu.NewHashtable ();
                this_parent_child_props [indexable.Uri] = secondary_reader.Document (term_docs.Doc ());
                parent_child_old_props [indexable.Uri] = this_parent_child_props;
            }

            term = new Term ("ParentUri", uri_str);
            term_docs.Seek (term);

            while (term_docs.Next ()) {
                Document doc = secondary_reader.Document (term_docs.Doc ());

                string child_uri_str = doc.Get ("Uri");
                Uri child_uri = UriFu.EscapedStringToUri (child_uri_str);
                // Any valid lucene document *should* have a Uri, so no need to check for null.
                // Store the child documents too, to save the persistent properties
                // of child documents.
                this_parent_child_props [child_uri] = doc;
            }
        }

        // Now remove (non-remove indexables will be re-added in the next block)
        Logger.Log.Debug ("-{0}", indexable.DisplayUri);

        int num_delete = 0;

        term = new Term ("Uri", uri_str);
        // For property changes, only the secondary index is modified
        secondary_reader.DeleteDocuments (term);

        // Now remove from everywhere else (if asked to remove, or if asked to add,
        // in which case we first remove and then add).
        // So we also need to remove child documents.
        if (indexable.Type != IndexableType.PropertyChange) {
            num_delete = primary_reader.DeleteDocuments (term);

            // When we delete an indexable, also delete any children.
            // FIXME: Shouldn't we also delete any children of children, etc.?
            term = new Term ("ParentUri", uri_str);
            num_delete += primary_reader.DeleteDocuments (term);
            secondary_reader.DeleteDocuments (term);
        }

        // If this is a strict removal (and not a deletion that
        // we are doing in anticipation of adding something back),
        // queue up a removed receipt.
        if (indexable.Type == IndexableType.Remove) {
            IndexerRemovedReceipt r;
            r = new IndexerRemovedReceipt (indexable.Id);
            r.NumRemoved = num_delete;
            receipt_queue.Add (r);
        }

        delete_count += num_delete;
    }

    term_docs.Close ();

    if (HaveItemCount)
        AdjustItemCount (-delete_count);
    else
        SetItemCount (primary_reader);

    // We are now done with the readers, so we close them.
    // And also free them.  Somehow not freeing them prevents
    // them from being GCed at all.
    primary_reader.Close ();
    primary_reader = null;
    secondary_reader.Close ();
    secondary_reader = null;

    // FIXME: If we crash at exactly this point, we are in
    // trouble.  Items will have been dropped from the index
    // without the proper replacements being added.  We can
    // hopefully fix this when we move to Lucene 2.1.

    // Step #2: Make another pass across our list of indexables
    // and write out any new documents.

    if (text_cache != null)
        text_cache.BeginTransaction ();

    IndexWriter primary_writer, secondary_writer;
    // FIXME: A lock-obtain timeout can happen here; if it does,
    // an exception will be thrown and this method will break in the middle,
    // leaving IndexWriters unclosed!  The same goes for any other
    // Lucene.Net index-modification method.
    primary_writer = new IndexWriter (PrimaryStore, IndexingAnalyzer, false);
    secondary_writer = null;

    foreach (Indexable indexable in request_indexables) {
        // If shutdown has been started, break here.
        // FIXME: Some more processing will continue, much of it concerning
        // receipts, but the daemon will ignore receipts at this point anyway;
        // what is the fastest way to stop from here?
        if (Shutdown.ShutdownRequested) {
            Log.Debug ("Shutdown initiated. Breaking while flushing indexables.");
            break;
        }

        // Receipts for removes were generated in the
        // previous block.  Now we just have to remove
        // items from the text cache.
        if (indexable.Type == IndexableType.Remove) {
            if (text_cache != null)
                text_cache.Delete (indexable.Uri);

            continue;
        }

        IndexerAddedReceipt r;
        Hashtable prop_change_docs = (Hashtable) parent_child_old_props [indexable.Uri];

        if (indexable.Type == IndexableType.PropertyChange) {

            Logger.Log.Debug ("+{0} (props only)", indexable.DisplayUri);

            r = new IndexerAddedReceipt (indexable.Id);
            r.PropertyChangesOnly = true;
            receipt_queue.Add (r);

            Document doc;
            if (prop_change_docs == null)
                doc = null;
            else
                doc = (Document) prop_change_docs [indexable.Uri];

            Document new_doc;
            new_doc = RewriteDocument (doc, indexable);

            // Write out the new document...
            if (secondary_writer == null)
                secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
            secondary_writer.AddDocument (new_doc);

            // Get child property change indexables...
            ArrayList prop_change_indexables;
            prop_change_indexables = GetChildPropertyChange (prop_change_docs, indexable);
            // ...and store them; no need to delete them first, since they were
            // already removed from the index.
            if (prop_change_indexables == null)
                continue;

            foreach (Indexable prop_change_indexable in prop_change_indexables) {
                Log.Debug ("+{0} (props only, generated indexable)", prop_change_indexable.Uri);
                doc = (Document) prop_change_docs [prop_change_indexable.Uri];
                new_doc = RewriteDocument (doc, prop_change_indexable);
                secondary_writer.AddDocument (new_doc);
            }

            continue; // ...and proceed to the next Indexable
        }

        // If we reach this point we know we are dealing with an IndexableType.Add
        if (indexable.Type != IndexableType.Add)
            throw new Exception ("When I said it was an IndexableType.Add, I meant it!");

        r = AddIndexableToIndex (indexable, primary_writer, ref secondary_writer, prop_change_docs);
        if (r != null)
            receipt_queue.Add (r);
    }

    if (text_cache != null)
        text_cache.CommitTransaction ();

    if (Shutdown.ShutdownRequested) {
        foreach (DeferredInfo di in deferred_indexables)
            di.Cleanup ();
        deferred_indexables.Clear ();

        foreach (Indexable indexable in request_indexables)
            indexable.Cleanup ();

        primary_writer.Close ();
        if (secondary_writer != null)
            secondary_writer.Close ();

        return null;
    }

    if (request.OptimizeIndex) {
        Stopwatch watch = new Stopwatch ();
        Logger.Log.Debug ("Optimizing {0}", IndexName);
        watch.Start ();

        primary_writer.Optimize ();
        if (secondary_writer == null)
            secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
        secondary_writer.Optimize ();

        watch.Stop ();
        Logger.Log.Debug ("{0} optimized in {1}", IndexName, watch);
    }

    // Step #4: Close our writers and return the events to
    // indicate what has happened.
    primary_writer.Close ();
    if (secondary_writer != null)
        secondary_writer.Close ();

    // Send a single IndexerIndexablesReceipt if there were deferred indexables
    if (deferred_indexables.Count > 0) {
        Log.Debug ("{0} indexables generated more indexables; asking daemon to schedule their indexing.", deferred_indexables.Count);
        IndexerIndexablesReceipt r = new IndexerIndexablesReceipt ();
        receipt_queue.Add (r);
    }

    IndexerReceipt [] receipt_array;
    receipt_array = new IndexerReceipt [receipt_queue.Count];
    for (int i = 0; i < receipt_queue.Count; ++i)
        receipt_array [i] = (IndexerReceipt) receipt_queue [i];

    return receipt_array;
}
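// Illustrative sketch (not part of the original source): the shape of the
// parent_child_old_props structure built in Step #1 of Flush_Unlocked above.
// The class, the URIs and the string "documents" below are hypothetical stand-ins;
// the real code uses UriFu.NewHashtable () and Lucene Documents.
static class ParentChildPropsExample
{
    static void Demo ()
    {
        // Outer table: parent uri -> inner table.
        System.Collections.Hashtable parent_child_old_props = new System.Collections.Hashtable ();

        // Inner table: the parent uri and each child uri map to that uri's old document.
        System.Collections.Hashtable this_parent_child_props = new System.Collections.Hashtable ();
        System.Uri parent = new System.Uri ("file:///tmp/example.zip");        // hypothetical
        System.Uri child = new System.Uri ("file:///tmp/example.zip#readme");  // hypothetical

        this_parent_child_props [parent] = "old parent document";
        this_parent_child_props [child] = "old child document";
        parent_child_old_props [parent] = this_parent_child_props;

        // Step #2 then fetches every cached old document for an indexable in one lookup:
        System.Collections.Hashtable prop_change_docs =
            (System.Collections.Hashtable) parent_child_old_props [parent];
        System.Console.WriteLine ("{0} old documents cached for {1}", prop_change_docs.Count, parent);
    }
}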
// There are two ways we can determine the max_results
// most recent items:
//
// One is to instantiate Lucene documents for each of
// the document IDs in primary_matches.  This is a
// fairly expensive operation.
//
// The other is to walk through the list of all
// document IDs in descending time order.  This is
// a less expensive operation, but adds up over time
// on large data sets.
//
// We can walk about 2.5 docs for every Document we
// instantiate.  So what we'll do, if we have more
// matches than available hits, is walk (m * 1.25)
// docs to see if we can fill out the top 100 hits.
// If not, we'll fall back to creating documents
// for all of them.

private static ArrayList ScanRecentDocs (IndexReader primary_reader,
                                         IndexReader secondary_reader,
                                         BetterBitArray primary_matches,
                                         Dictionary<int, Hit> hits_by_id,
                                         int max_results,
                                         ref int total_number_of_matches,
                                         HitFilter hit_filter,
                                         string index_name)
{
    Stopwatch a = new Stopwatch ();
    a.Start ();

    TermDocs docs = primary_reader.TermDocs ();
    TermEnum enumerator = primary_reader.Terms (new Term ("InvertedTimestamp", String.Empty));
    ArrayList results = new ArrayList (max_results);
    int docs_found = 0;
    int docs_walked = 0;
    int hit_filter_removed = 0;
    int max_docs = (int) (primary_matches.TrueCount * 1.25);

    Term term;
    TermDocs secondary_term_docs = null;
    if (secondary_reader != null)
        secondary_term_docs = secondary_reader.TermDocs ();

    do {
        term = enumerator.Term ();

        if (term.Field () != "InvertedTimestamp")
            break;

        docs.Seek (enumerator);

        while (docs.Next () && docs_found < max_results && docs_walked < max_docs) {
            int doc_id = docs.Doc ();

            if (primary_matches.Get (doc_id)) {
                Document doc = primary_reader.Document (doc_id);
                Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs);

                // If we have a HitFilter, apply it.
                if (hit_filter != null && ! hit_filter (hit)) {
                    if (Debug)
                        Log.Debug ("Filtered out {0}", hit.Uri);
                    hit_filter_removed ++;
                    continue;
                }
                hits_by_id [doc_id] = hit;
                // Add the result, last modified first
                results.Add (hit);
                docs_found++;
            }

            docs_walked++;
        }
    } while (enumerator.Next () && docs_found < max_results && docs_walked < max_docs);

    docs.Close ();
    if (secondary_term_docs != null)
        secondary_term_docs.Close ();

    // If we've found all the docs we can return in a subset!
    // Fantastic, we've probably short circuited a slow search.
    if (docs_found != max_results) {
        // Otherwise bad luck!  Not all docs found.
        // Start afresh - this time traversing all results.
        results = null;
    } else {
        // Adjust total_number_of_matches.  We need to do this to avoid scenarios like the following:
        // max_hits = 100.  Matched 100 results.  But hit filter removed 30.  So 70 results will be returned.
        // We want to avoid saying "Showing top 70 of 100".  Note that since we are not passing
        // every document in the index through the hit_filter, when we say "Showing top 100 of 1234", the
        // 1234 could actually be much less.  But since max_hits was 100, that will not mislead the user.
        total_number_of_matches -= hit_filter_removed;
    }

    a.Stop ();
    if (Debug) {
        Log.Debug (">>> {0}: Walked {1} items, populated an enum with {2} items in {3}", index_name, docs_walked, docs_found, a);

        if (docs_found == max_results)
            Log.Debug (">>> {0}: Successfully short circuited timestamp ordering!", index_name);
    }

    return results;
}
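// Illustrative sketch (not part of the original source): the walk budget that the
// comment above ScanRecentDocs describes, shown with made-up numbers.  The 1.25
// factor and the fallback rule are taken from ScanRecentDocs/GenerateQueryResults;
// the class and variable names here are hypothetical.
static class ScanBudgetExample
{
    static void Demo ()
    {
        int matches = 1000;     // hypothetical primary_matches.TrueCount
        int max_results = 100;  // hypothetical query.MaxHits

        // Same computation as ScanRecentDocs: walk at most 1.25 * matches docs
        // in InvertedTimestamp order before giving up on the short circuit.
        int max_docs = (int) (matches * 1.25); // 1250

        System.Console.WriteLine ("Walk up to {0} docs looking for the {1} most recent of {2} matches",
                                  max_docs, max_results, matches);

        // If fewer than max_results matches turn up within that budget,
        // ScanRecentDocs returns null and GenerateQueryResults falls back to
        // FindRecentResults, which instantiates a document for every match.
    }
}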
private static void GenerateQueryResults (IndexReader primary_reader,
                                          IndexReader secondary_reader,
                                          BetterBitArray primary_matches,
                                          IQueryResult result,
                                          ICollection query_term_list,
                                          int max_results,
                                          HitFilter hit_filter,
                                          string index_name)
{
    int num_hits;

    if (Debug)
        Logger.Log.Debug (">>> {0}: Initially handed {1} matches", index_name, primary_matches.TrueCount);

    if (primary_matches.TrueCount <= max_results) {
        if (Debug)
            Logger.Log.Debug (">>> {0}: Initial count is within our limit of {1}", index_name, max_results);
        num_hits = primary_matches.TrueCount;
    } else {
        if (Debug)
            Logger.Log.Debug (">>> {0}: Number of hits is capped at {1}", index_name, max_results);
        num_hits = max_results;
    }

    Stopwatch total, d, e;
    total = new Stopwatch ();
    d = new Stopwatch ();
    e = new Stopwatch ();

    total.Start ();

    ArrayList final_list_of_hits = null;

    // This is used only for scoring
    Dictionary<int, Hit> hits_by_id = new Dictionary<int, Hit> (num_hits);

    int total_number_of_matches = primary_matches.TrueCount;

    if (primary_matches.TrueCount > max_results)
        final_list_of_hits = ScanRecentDocs (primary_reader, secondary_reader, primary_matches,
                                             hits_by_id, max_results,
                                             ref total_number_of_matches, hit_filter, index_name);

    if (final_list_of_hits == null)
        final_list_of_hits = FindRecentResults (primary_reader, secondary_reader, primary_matches,
                                                hits_by_id, max_results,
                                                ref total_number_of_matches, hit_filter, index_name);

    d.Start ();

    ScoreHits (hits_by_id, primary_reader, query_term_list);
    hits_by_id = null;

    d.Stop ();

    if (Debug)
        Log.Debug (">>> {0}: Scored hits in {1}", index_name, d);

    e.Start ();

    // 25 hits seems to be the sweet spot: anything lower
    // and serialization overhead gets us, higher takes
    // longer to send out.
    const int MAX_QUEUED_HITS = 25;
    int sent_index = 0;

    // Break up the hits into reasonably sized chunks for
    // sending over the wire.
    for (int i = 0; i < final_list_of_hits.Count; ++i) {
        // Flush our hits
        if (i > 0 && i % MAX_QUEUED_HITS == 0) {
            result.Add (final_list_of_hits.GetRange (0, MAX_QUEUED_HITS));
            final_list_of_hits.RemoveRange (0, MAX_QUEUED_HITS);
            i -= MAX_QUEUED_HITS;
        }
    }

    // Flush the remaining hits
    result.Add (final_list_of_hits, total_number_of_matches);
    final_list_of_hits = null;

    e.Stop ();

    if (Debug)
        Log.Debug (">>> {0}: Hit filters executed and results sent in {1}", index_name, e);

    total.Stop ();

    if (Debug) {
        Logger.Log.Debug (">>> {0}: GenerateQueryResults time statistics:", index_name);
        //Logger.Log.Debug (">>> {0}: Short circuit {1,6} ({2:0.0}%)", index_name, a == null ? "N/A" : a.ToString (), a == null ? 0.0 : 100 * a.ElapsedTime / total.ElapsedTime);
        //Logger.Log.Debug (">>> {0}: Create docs {1,6} ({2:0.0}%)", index_name, b, 100 * b.ElapsedTime / total.ElapsedTime);
        //Logger.Log.Debug (">>> {0}: Hit assembly {1,6} ({2:0.0}%)", index_name, c, 100 * c.ElapsedTime / total.ElapsedTime);
        Logger.Log.Debug (">>> {0}: Scored hits {1,6} ({2:0.0}%)", index_name, d, 100 * d.ElapsedTime / total.ElapsedTime);
        Logger.Log.Debug (">>> {0}: Results sent {1,6} ({2:0.0}%)", index_name, e, 100 * e.ElapsedTime / total.ElapsedTime);
        Logger.Log.Debug (">>> {0}: TOTAL {1,6}", index_name, total);
    }
}
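// Illustrative sketch (not part of the original source): the chunked result-sending
// pattern used by GenerateQueryResults above, isolated from the Lucene machinery.
// "send_chunk" is a hypothetical stand-in for IQueryResult.Add; the 25-hit chunk
// size mirrors the MAX_QUEUED_HITS constant.
static class ChunkedSendExample
{
    const int MAX_QUEUED_HITS = 25;

    static void SendInChunks (System.Collections.ArrayList hits,
                              System.Action<System.Collections.ArrayList> send_chunk)
    {
        // Peel full chunks of MAX_QUEUED_HITS off the front of the list...
        while (hits.Count > MAX_QUEUED_HITS) {
            send_chunk (hits.GetRange (0, MAX_QUEUED_HITS));
            hits.RemoveRange (0, MAX_QUEUED_HITS);
        }

        // ...and flush whatever remains in one final call, which is where the real
        // code also passes total_number_of_matches along with the last batch.
        send_chunk (hits);
    }
}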
////////////////////////////////////////////////////////////////

public void DoQuery (Query query,
                     IQueryResult result,
                     ICollection search_subset_uris, // should be internal uris
                     QueryPartHook query_part_hook,
                     HitFilter hit_filter)
{
    if (Debug)
        Logger.Log.Debug ("###### {0}: Starting low-level queries", IndexName);

    Stopwatch total, a, b, c, d, e, f;

    total = new Stopwatch ();
    a = new Stopwatch ();
    b = new Stopwatch ();
    c = new Stopwatch ();
    d = new Stopwatch ();
    e = new Stopwatch ();
    f = new Stopwatch ();

    total.Start ();

    a.Start ();

    ArrayList primary_required_part_queries;
    ArrayList secondary_required_part_queries;

    LNS.BooleanQuery primary_prohibited_part_query;
    LNS.BooleanQuery secondary_prohibited_part_query;

    AndHitFilter all_hit_filters;

    ArrayList term_list;

    // Assemble all of the parts into a bunch of Lucene queries
    term_list = AssembleQuery (query, query_part_hook, hit_filter,
                               out primary_required_part_queries,
                               out secondary_required_part_queries,
                               out primary_prohibited_part_query,
                               out secondary_prohibited_part_query,
                               out all_hit_filters);

    a.Stop ();
    if (Debug)
        Log.Debug ("###### {0}: Building queries took {1}", IndexName, a);

    // If we have no required parts, give up.
    if (primary_required_part_queries == null)
        return;

    b.Start ();

    //
    // Now that we have all of these nice queries, let's execute them!
    //

    IndexReader primary_reader;
    LNS.IndexSearcher primary_searcher;
    IndexReader secondary_reader;
    LNS.IndexSearcher secondary_searcher;

    // Create the searchers that we will need.
    if (! BuildSearchers (out primary_reader, out primary_searcher, out secondary_reader, out secondary_searcher))
        return;

    b.Stop ();
    if (Debug)
        Log.Debug ("###### {0}: Readers/searchers built in {1}", IndexName, b);

    // Build whitelists and blacklists for search subsets.
    c.Start ();

    // Possibly create our whitelists from the search subset.
    LuceneBitArray primary_whitelist, secondary_whitelist;
    CreateQueryWhitelists (search_subset_uris, primary_searcher, secondary_searcher,
                           primary_prohibited_part_query, secondary_prohibited_part_query,
                           out primary_whitelist, out secondary_whitelist);

    c.Stop ();
    if (Debug)
        Log.Debug ("###### {0}: Whitelists and blacklists built in {1}", IndexName, c);

    // Now run the low level queries against our indexes.
    d.Start ();

    BetterBitArray primary_matches = null;

    if (primary_required_part_queries != null) {
        if (secondary_searcher != null)
            primary_matches = DoRequiredQueries_TwoIndex (primary_searcher, secondary_searcher,
                                                          primary_required_part_queries,
                                                          secondary_required_part_queries,
                                                          primary_whitelist, secondary_whitelist);
        else
            primary_matches = DoRequiredQueries (primary_searcher, primary_required_part_queries, primary_whitelist);
    }

    d.Stop ();
    if (Debug)
        Logger.Log.Debug ("###### {0}: Low-level queries finished in {1}", IndexName, d);

    e.Start ();

    // Only generate results if we got some matches
    if (primary_matches != null && primary_matches.ContainsTrue ()) {
        GenerateQueryResults (primary_reader, secondary_reader, primary_matches, result,
                              term_list, query.MaxHits,
                              new HitFilter (all_hit_filters.HitFilter), IndexName);
    }

    e.Stop ();
    if (Debug)
        Log.Debug ("###### {0}: Query results generated in {1}", IndexName, e);

    //
    // Finally, we clean up after ourselves.
    //

    f.Start ();
    CloseSearchers (primary_reader, primary_searcher, secondary_reader, secondary_searcher);
    f.Stop ();
    if (Debug)
        Log.Debug ("###### {0}: Readers/searchers released in {1}", IndexName, f);

    total.Stop ();

    if (Debug) {
        Log.Debug ("###### {0}: Query time breakdown:", IndexName);
        Log.Debug ("###### {0}: Build queries {1,6} ({2:0.0}%)", IndexName, a, 100 * a.ElapsedTime / total.ElapsedTime);
        Log.Debug ("###### {0}: Got readers {1,6} ({2:0.0}%)", IndexName, b, 100 * b.ElapsedTime / total.ElapsedTime);
        Log.Debug ("###### {0}: Whitelists {1,6} ({2:0.0}%)", IndexName, c, 100 * c.ElapsedTime / total.ElapsedTime);
        Log.Debug ("###### {0}: Queries {1,6} ({2:0.0}%)", IndexName, d, 100 * d.ElapsedTime / total.ElapsedTime);
        Log.Debug ("###### {0}: Gen'd Results {1,6} ({2:0.0}%)", IndexName, e, 100 * e.ElapsedTime / total.ElapsedTime);
        Log.Debug ("###### {0}: Reader cleanup {1,6} ({2:0.0}%)", IndexName, f, 100 * f.ElapsedTime / total.ElapsedTime);
        Log.Debug ("###### {0}: TOTAL {1,6}", IndexName, total);

        Logger.Log.Debug ("###### {0}: Total query run in {1}", IndexName, total);
    }
}
static void DoMain (string [] args) { SystemInformation.SetProcessName ("beagrep-build-index"); if (args.Length < 2) PrintUsage (); ArrayList allowed_patterns = new ArrayList (); ArrayList denied_patterns = new ArrayList (); ArrayList denied_dir_patterns = new ArrayList (); int i = 0; while (i < args.Length) { string arg = args [i]; ++i; string next_arg = i < args.Length ? args [i] : null; switch (arg) { case "-h": case "--help": PrintUsage (); break; case "--tag": if (next_arg != null) arg_tag = next_arg; ++i; break; case "-r": case "--recursive": arg_recursive = true; break; case "--enable-deletion": arg_delete = true; break; case "--disable-directories": arg_disable_directories = true; break; case "--enable-text-cache": arg_cache_text = true; break; case "--target": if (next_arg != null) arg_output = Path.IsPathRooted (next_arg) ? next_arg : Path.GetFullPath (next_arg); ++i; break; case "--disable-filtering": arg_disable_filtering = true; break; case "--allow-pattern": if (next_arg == null) break; if (next_arg.IndexOf (',') != -1) { foreach (string pattern in next_arg.Split (',')) allowed_patterns.Add (pattern); } else { allowed_patterns.Add (next_arg); } ++i; break; case "--deny-directory-pattern": if (next_arg == null) break; if (next_arg.IndexOf (',') != -1) { foreach (string pattern in next_arg.Split (',')) denied_dir_patterns.Add (pattern); } else { denied_dir_patterns.Add (next_arg); } ++i; break; case "--deny-pattern": if (next_arg == null) break; if (next_arg.IndexOf (',') != -1) { foreach (string pattern in next_arg.Split (',')) denied_patterns.Add (pattern); } else { denied_patterns.Add (next_arg); } ++i; break; case "--disable-restart": arg_disable_restart = true; break; case "--source": if (next_arg == null) break; arg_source = next_arg; ++i; break; default: if (arg.StartsWith ("-") || arg.StartsWith ("--")) PrintUsage (); string path = Path.IsPathRooted (arg) ? arg : Path.GetFullPath (arg); if (path != "/" && path.EndsWith ("/")) path = path.TrimEnd ('/'); if (Directory.Exists (path)) pending_directories.Enqueue (new DirectoryInfo (path)); else if (File.Exists (path)) pending_files.Enqueue (new FileInfo (path)); break; } } ///////////////////////////////////////////////////////// if (arg_output == null) { Logger.Log.Error ("--target must be specified"); Environment.Exit (1); } // Set the storage dir, this should be used to store log messages // and filterver.dat PathFinder.StorageDir = arg_output; foreach (FileSystemInfo info in pending_directories) { if (Path.GetFullPath (arg_output) == info.FullName) { Logger.Log.Error ("Target directory cannot be one of the source paths."); Environment.Exit (1); } } foreach (FileSystemInfo info in pending_files) { if (Path.GetFullPath (arg_output) == info.FullName) { Logger.Log.Error ("Target directory cannot be one of the source paths."); Environment.Exit (1); } } if (!Directory.Exists (Path.GetDirectoryName (arg_output))) { Logger.Log.Error ("Index directory not available for construction: {0}", arg_output); Environment.Exit (1); } // Be *EXTRA PARANOID* about the contents of the target // directory, because creating an indexing driver will // nuke it. 
if (Directory.Exists (arg_output)) { foreach (FileInfo info in DirectoryWalker.GetFileInfos (arg_output)) { if (Array.IndexOf (allowed_files, info.Name) == -1) { Logger.Log.Error ("{0} doesn't look safe to delete: non-Beagrep file {1} was found", arg_output, info.FullName); Environment.Exit (1); } } foreach (DirectoryInfo info in DirectoryWalker.GetDirectoryInfos (arg_output)) { if (Array.IndexOf (allowed_dirs, info.Name) == -1) { Logger.Log.Error ("{0} doesn't look safe to delete: non-Beagrep directory {1} was found", arg_output, info.FullName); Environment.Exit (1); } } } string config_file_path = Path.Combine (arg_output, "StaticIndex.xml"); string prev_source = null; if (File.Exists (config_file_path)) { Config static_index_config = Conf.LoadFrom (config_file_path); if (static_index_config == null) { Log.Error ("Invalid configuation file {0}", config_file_path); Environment.Exit (1); } prev_source = static_index_config.GetOption ("Source", null); if (arg_source != null && prev_source != arg_source) { Log.Error ("Source already set to {0} for existing static index. Cannot set source to {1}.", prev_source, arg_source); Environment.Exit (1); } // If arg_source is not given, and prev_source is present, use prev_source // as the arg_source. This is useful for re-running build-index without // giving --arg_source for already existing static index arg_source = prev_source; } if (arg_source == null) { DirectoryInfo dir = new DirectoryInfo (StringFu.SanitizePath (arg_output)); arg_source = dir.Name; } string global_files_config = Path.Combine (PathFinder.ConfigDataDir, "config-files"); global_files_config = Path.Combine (global_files_config, Conf.Names.FilesQueryableConfig + ".xml"); if (! File.Exists (global_files_config)) { Log.Error ("Global configuration file not found {0}", global_files_config); Environment.Exit (0); } // Setup regexes for allowed/denied patterns if (allowed_patterns.Count > 0) { allowed_regex = StringFu.GetPatternRegex (allowed_patterns); } else { // Read the exclude values from config // For system-wide indexes, only the global config value will be used Config config = Conf.Get (Conf.Names.FilesQueryableConfig); List<string[]> values = config.GetListOptionValues (Conf.Names.ExcludePattern); if (values != null) foreach (string[] exclude in values) denied_patterns.Add (exclude [0]); if (denied_patterns.Count > 0) denied_regex = StringFu.GetPatternRegex (denied_patterns); } if (denied_dir_patterns.Count > 0) { denied_dir_regex = StringFu.GetPatternRegex (denied_dir_patterns); Log.Always("Will ignore directories matching regular expression: {0}", denied_dir_regex); } Log.Always ("Starting beagrep-build-index (pid {0}) at {1}", Process.GetCurrentProcess ().Id, DateTime.Now); driver = new LuceneIndexingDriver (arg_output, MINOR_VERSION, false); driver.TextCache = (arg_cache_text) ? new TextCache (arg_output) : null; if (driver.TextCache != null) driver.TextCache.WorldReadable = true; backing_fa_store = new FileAttributesStore_Sqlite (driver.TopDirectory, driver.Fingerprint); fa_store = new FileAttributesStore (backing_fa_store); // Set up signal handlers #if MONO_1_9 Shutdown.SetupSignalHandlers (delegate (int signal) { if (signal == (int) Mono.Unix.Native.Signum.SIGINT || signal == (int) Mono.Unix.Native.Signum.SIGTERM) Shutdown.BeginShutdown (); }); #else SetupSignalHandlers (); #endif Thread monitor_thread = null; Stopwatch watch = new Stopwatch (); watch.Start (); if (!arg_disable_restart) { // Start the thread that monitors memory usage. 
monitor_thread = ExceptionHandlingThread.Start (new ThreadStart (MemoryMonitorWorker)); } // Start indexworker to do the crawling and indexing IndexWorker (); // Join any threads so that we know that we're the only thread still running if (monitor_thread != null) monitor_thread.Join (); watch.Stop (); Logger.Log.Debug ("Elapsed time {0}.", watch); // Write this after indexing is done. This is because, if creating a new index, // LuceneIndexingDriver.Create() is called which purges the entire directory. if (prev_source == null) { Config static_index_config = Conf.LoadNew ("StaticIndex.xml"); // Write StaticIndex.xml containing: // The name of the source static_index_config.SetOption ("Source", arg_source); static_index_config ["Source"].Description = "Source of the static index"; Conf.SaveTo (static_index_config, config_file_path); } if (restart) { Logger.Log.Debug ("Restarting beagrep-build-index"); Process p = new Process (); p.StartInfo.UseShellExecute = false; // FIXME: Maybe this isn't the right way to do things? It should be ok, // the PATH is inherited from the shell script which runs mono itself. p.StartInfo.FileName = "mono"; p.StartInfo.Arguments = String.Join (" ", Environment.GetCommandLineArgs ()); p.Start (); } Log.Always ("Exiting beagrep-build-index (pid {0}) at {1}", Process.GetCurrentProcess ().Id, DateTime.Now); }
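// The option loop in DoMain above relies on a one-token lookahead: "next_arg" peeks at
// the following argument and "++i" consumes it only for options that take a value.
// Below is a minimal, standalone sketch of that idiom using only BCL types; the names
// (MiniArgs, target, allow, ...) are illustrative and are not part of the Beagrep sources.
using System;
using System.Collections;

static class MiniArgs {
	static void Main (string [] args)
	{
		string target = null;
		bool recursive = false;
		ArrayList allow = new ArrayList ();

		int i = 0;
		while (i < args.Length) {
			string arg = args [i];
			++i;
			// Peek at the next token; only value-taking options consume it with ++i,
			// so a trailing "--target" with no value falls through instead of
			// reading past the end of the array.
			string next_arg = i < args.Length ? args [i] : null;

			switch (arg) {
			case "-r":
			case "--recursive":
				recursive = true;
				break;
			case "--target":
				if (next_arg != null)
					target = next_arg;
				++i;
				break;
			case "--allow-pattern":
				if (next_arg == null)
					break;
				// A comma-separated value expands into several patterns,
				// mirroring the expansion done in the real option handling.
				foreach (string pattern in next_arg.Split (','))
					allow.Add (pattern);
				++i;
				break;
			default:
				Console.WriteLine ("source path: {0}", arg);
				break;
			}
		}

		Console.WriteLine ("target={0} recursive={1} patterns={2}", target, recursive, allow.Count);
	}
}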
private static ArrayList FindRecentResults(IndexReader primary_reader, IndexReader secondary_reader, BetterBitArray primary_matches, Dictionary <int, Hit> hits_by_id, int max_results, ref int total_number_of_matches, HitFilter hit_filter, string index_name) { Stopwatch b = new Stopwatch(); b.Start(); int count = 0; Document doc; ArrayList all_docs = null; TopScores top_docs = null; TermDocs term_docs = null; if (primary_matches.TrueCount > max_results) { top_docs = new TopScores(max_results); } else { all_docs = new ArrayList(primary_matches.TrueCount); } if (secondary_reader != null) { term_docs = secondary_reader.TermDocs(); } for (int match_index = primary_matches.Count; ; match_index--) { // Walk across the matches backwards, since newer // documents are more likely to be at the end of // the index. match_index = primary_matches.GetPreviousTrueIndex(match_index); if (match_index < 0) { break; } count++; doc = primary_reader.Document(match_index, fields_timestamp_uri); // Check the timestamp --- if we have already reached our // limit, we might be able to reject it immediately. string timestamp_str; long timestamp_num = 0; timestamp_str = doc.Get("Timestamp"); if (timestamp_str == null) { Logger.Log.Warn("No timestamp on {0}!", GetUriFromDocument(doc)); } else { timestamp_num = Int64.Parse(doc.Get("Timestamp")); if (top_docs != null && !top_docs.WillAccept(timestamp_num)) { continue; } } // Get the actual hit now // doc was created with only 2 fields, so first get the complete lucene document for primary document. // Also run our hit_filter now, if we have one. Since we insist of returning max_results // most recent hits, any hits that would be filtered out should happen now and not later. Hit hit = CreateHit(primary_reader.Document(match_index), secondary_reader, term_docs); if (hit_filter != null && !hit_filter(hit)) { if (Debug) { Log.Debug("Filtered out {0}", hit.Uri); } total_number_of_matches--; continue; } hits_by_id [match_index] = hit; // Add the document to the appropriate data structure. // We use the timestamp_num as the score, so high // scores correspond to more-recent timestamps. if (all_docs != null) { all_docs.Add(hit); } else { top_docs.Add(timestamp_num, hit); } } if (term_docs != null) { term_docs.Close(); } b.Stop(); if (Debug) { Log.Debug(">>> {0}: Instantiated and scanned {1} documents in {2}", index_name, count, b); } if (all_docs != null) { // Sort results before sending all_docs.Sort(); return(all_docs); } else { return(top_docs.TopScoringObjects); } }
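// FindRecentResults above either keeps every hit (when there are few matches) or funnels
// hits through a bounded "TopScores" collector keyed on the timestamp, calling WillAccept()
// first so it can skip building Hit objects that could never qualify. TopScores itself is
// internal to Beagrep; the class below is only an illustrative stand-in for the same idea,
// written against plain .NET collections.
using System.Collections.Generic;

class BoundedTopByKey<TValue> {
	private readonly int max_count;
	// Kept sorted ascending by key, so index 0 is always the smallest (oldest) key.
	private readonly List<KeyValuePair<long, TValue>> items = new List<KeyValuePair<long, TValue>> ();

	public BoundedTopByKey (int max_count)
	{
		this.max_count = max_count;
	}

	// True if a candidate with this key could still enter the top set; callers can use
	// this to avoid expensive work for hits that would be rejected anyway.
	public bool WillAccept (long key)
	{
		return items.Count < max_count || key > items [0].Key;
	}

	public void Add (long key, TValue value)
	{
		if (! WillAccept (key))
			return;
		if (items.Count == max_count)
			items.RemoveAt (0);	// evict the current smallest key
		int pos = 0;
		while (pos < items.Count && items [pos].Key <= key)
			++pos;
		items.Insert (pos, new KeyValuePair<long, TValue> (key, value));
	}

	// Largest keys (most recent timestamps) first.
	public IEnumerable<TValue> TopScoringObjects {
		get {
			for (int i = items.Count - 1; i >= 0; i--)
				yield return items [i].Value;
		}
	}
}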
protected bool Exists () { if (! (Directory.Exists (top_dir) && File.Exists (VersionFile) && File.Exists (FingerprintFile) && Directory.Exists (PrimaryIndexDirectory) && IndexReader.IndexExists (PrimaryIndexDirectory) && Directory.Exists (SecondaryIndexDirectory) && IndexReader.IndexExists (SecondaryIndexDirectory) && Directory.Exists (LockDirectory))) return false; // Check the index's version number. If it is wrong, // declare the index non-existent. StreamReader version_reader; string version_str; version_reader = new StreamReader (VersionFile); version_str = version_reader.ReadLine (); version_reader.Close (); int current_major_version, current_minor_version; int i = version_str.IndexOf ('.'); try { if (i != -1) { current_major_version = Convert.ToInt32 (version_str.Substring (0, i)); current_minor_version = Convert.ToInt32 (version_str.Substring (i+1)); } else { current_minor_version = Convert.ToInt32 (version_str); current_major_version = 0; } } catch (FormatException) { // Something wrong with the version file. return false; } if (current_major_version != MAJOR_VERSION || (minor_version >= 0 && current_minor_version != minor_version)) { Logger.Log.Debug ("Version mismatch in {0}", index_name); Logger.Log.Debug ("Index has version {0}.{1}, expected {2}.{3}", current_major_version, current_minor_version, MAJOR_VERSION, minor_version); return false; } // Check the lock directory: If there is a dangling write lock, // assume that the index is corrupted and declare it non-existent. DirectoryInfo lock_dir_info; lock_dir_info = new DirectoryInfo (LockDirectory); bool dangling_lock = false; foreach (FileInfo info in lock_dir_info.GetFiles ()) { if (IsDanglingLock (info)) { Logger.Log.Warn ("Found a dangling index lock on {0}.", info.FullName); dangling_lock = true; } } if (dangling_lock) { Beagrep.Util.Stopwatch w = new Beagrep.Util.Stopwatch (); w.Start (); if (VerifyLuceneIndex (PrimaryIndexDirectory) && VerifyLuceneIndex (SecondaryIndexDirectory)) { w.Stop (); Log.Warn ("Indexes verified in {0}. Deleting stale lock files.", w); try { foreach (FileInfo info in lock_dir_info.GetFiles ()) info.Delete (); } catch { Log.Warn ("Could not delete lock files."); return false; } return true; } else return false; } return true; }
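// Exists() above accepts a version file containing either "MAJOR.MINOR" or a bare number
// (in which case the major version defaults to 0). This is a standalone sketch of just
// that parsing rule, assuming nothing beyond the BCL; TryParseIndexVersion is an
// illustrative name, not a Beagrep API.
using System;

static class IndexVersion {
	public static bool TryParseIndexVersion (string version_str, out int major, out int minor)
	{
		major = 0;
		minor = 0;
		if (String.IsNullOrEmpty (version_str))
			return false;
		int dot = version_str.IndexOf ('.');
		try {
			if (dot != -1) {
				// "MAJOR.MINOR" form
				major = Convert.ToInt32 (version_str.Substring (0, dot));
				minor = Convert.ToInt32 (version_str.Substring (dot + 1));
			} else {
				// Bare number: treat it as the minor version
				minor = Convert.ToInt32 (version_str);
			}
		} catch (FormatException) {
			// Malformed version file: caller should treat the index as non-existent.
			return false;
		}
		return true;
	}
}
// For example, "2.1" parses to major 2 / minor 1, and "13" to major 0 / minor 13,
// matching the comparison against MAJOR_VERSION and minor_version in Exists().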
private IndexerReceipt [] Flush_Unlocked(IndexerRequest request) { ArrayList receipt_queue; receipt_queue = new ArrayList(); IndexReader primary_reader, secondary_reader; primary_reader = IndexReader.Open(PrimaryStore); secondary_reader = IndexReader.Open(SecondaryStore); // Step #1: Make our first pass over the list of // indexables that make up our request. For each add // or property change in the request, get the Lucene // documents so we can move forward any persistent // properties (for adds) or all old properties (for // property changes). // // Then, for each add or remove in the request, // delete the associated documents from the index. // Note that we previously cached added documents so // that we can move persistent properties forward. // parent_child_old_props is double-nested hashtable (depth-2 tree) // indexed by the parent uri, it stores another hashtable indexed by the (parent+child documents) // FIXME: 2-level hashtable is a waste for any non-child document. // Replace this by a better data structure. Hashtable parent_child_old_props = UriFu.NewHashtable(); TermDocs term_docs = secondary_reader.TermDocs(); int delete_count = 0; IEnumerable request_indexables = request.Indexables; foreach (Indexable indexable in request_indexables) { string uri_str = UriFu.UriToEscapedString(indexable.Uri); Term term; // Store the necessary properties from old documents for re-addition if (indexable.Type == IndexableType.Add || indexable.Type == IndexableType.PropertyChange) { term = new Term("Uri", uri_str); term_docs.Seek(term); Hashtable this_parent_child_props = null; if (term_docs.Next()) { this_parent_child_props = UriFu.NewHashtable(); this_parent_child_props [indexable.Uri] = secondary_reader.Document(term_docs.Doc()); parent_child_old_props [indexable.Uri] = this_parent_child_props; } term = new Term("ParentUri", uri_str); term_docs.Seek(term); while (term_docs.Next()) { Document doc = secondary_reader.Document(term_docs.Doc()); string child_uri_str = doc.Get("Uri"); Uri child_uri = UriFu.EscapedStringToUri(child_uri_str); // Any valid lucene document *should* have a Uri, so no need to check for null // Store the child documents too, to save persistent-properties // of child documents this_parent_child_props [child_uri] = doc; } } // Now remove (non-remove indexables will be re-added in next block) Logger.Log.Debug("-{0}", indexable.DisplayUri); int num_delete = 0; term = new Term("Uri", uri_str); // For property changes, only secondary index is modified secondary_reader.DeleteDocuments(term); // Now remove from everywhere else (if asked to remove or if asked to add, in which case // we first remove and then add) // So we also need to remove child documents if (indexable.Type != IndexableType.PropertyChange) { num_delete = primary_reader.DeleteDocuments(term); // When we delete an indexable, also delete any children. // FIXME: Shouldn't we also delete any children of children, etc.? term = new Term("ParentUri", uri_str); num_delete += primary_reader.DeleteDocuments(term); secondary_reader.DeleteDocuments(term); } // If this is a strict removal (and not a deletion that // we are doing in anticipation of adding something back), // queue up a removed receipt. 
if (indexable.Type == IndexableType.Remove) { IndexerRemovedReceipt r; r = new IndexerRemovedReceipt(indexable.Id); r.NumRemoved = num_delete; receipt_queue.Add(r); } delete_count += num_delete; } term_docs.Close(); if (HaveItemCount) { AdjustItemCount(-delete_count); } else { SetItemCount(primary_reader); } // We are now done with the readers, so we close them. // And also free them. Somehow not freeing them is preventing them from // GCed at all. primary_reader.Close(); primary_reader = null; secondary_reader.Close(); secondary_reader = null; // FIXME: If we crash at exactly this point, we are in // trouble. Items will have been dropped from the index // without the proper replacements being added. We can // hopefully fix this when we move to Lucene 2.1. // Step #2: Make another pass across our list of indexables // and write out any new documents. if (text_cache != null) { text_cache.BeginTransaction(); } IndexWriter primary_writer, secondary_writer; // FIXME: Lock obtain time-out can happen here; if that happens, // an exception will be thrown and this method will break in the middle // leaving IndexWriters unclosed! Same for any Lucene.Net-index modification // methods. primary_writer = new IndexWriter(PrimaryStore, IndexingAnalyzer, false); secondary_writer = null; foreach (Indexable indexable in request_indexables) { // If shutdown has been started, break here // FIXME: Some more processing will continue, a lot of them // concerning receipts, but the daemon will anyway ignore receipts // now, what is the fastest way to stop from here ? if (Shutdown.ShutdownRequested) { Log.Debug("Shutdown initiated. Breaking while flushing indexables."); break; } // Receipts for removes were generated in the // previous block. Now we just have to remove // items from the text cache. if (indexable.Type == IndexableType.Remove) { if (text_cache != null) { text_cache.Delete(indexable.Uri); } continue; } IndexerAddedReceipt r; Hashtable prop_change_docs = (Hashtable)parent_child_old_props [indexable.Uri]; if (indexable.Type == IndexableType.PropertyChange) { Logger.Log.Debug("+{0} (props only)", indexable.DisplayUri); r = new IndexerAddedReceipt(indexable.Id); r.PropertyChangesOnly = true; receipt_queue.Add(r); Document doc; if (prop_change_docs == null) { doc = null; } else { doc = (Document)prop_change_docs [indexable.Uri]; } Document new_doc; new_doc = RewriteDocument(doc, indexable); // Write out the new document... if (secondary_writer == null) { secondary_writer = new IndexWriter(SecondaryStore, IndexingAnalyzer, false); } secondary_writer.AddDocument(new_doc); // Get child property change indexables... 
ArrayList prop_change_indexables; prop_change_indexables = GetChildPropertyChange(prop_change_docs, indexable); // and store them; no need to delete them first, since they were already removed from the index if (prop_change_indexables == null) { continue; } foreach (Indexable prop_change_indexable in prop_change_indexables) { Log.Debug("+{0} (props only, generated indexable)", prop_change_indexable.Uri); doc = (Document)prop_change_docs [prop_change_indexable.Uri]; new_doc = RewriteDocument(doc, prop_change_indexable); secondary_writer.AddDocument(new_doc); } continue; // ...and proceed to the next Indexable } // If we reach this point we know we are dealing with an IndexableType.Add if (indexable.Type != IndexableType.Add) { throw new Exception("When I said it was an IndexableType.Add, I meant it!"); } r = AddIndexableToIndex(indexable, primary_writer, ref secondary_writer, prop_change_docs); if (r != null) { receipt_queue.Add(r); } } if (text_cache != null) { text_cache.CommitTransaction(); } if (Shutdown.ShutdownRequested) { foreach (DeferredInfo di in deferred_indexables) { di.Cleanup(); } deferred_indexables.Clear(); foreach (Indexable indexable in request_indexables) { indexable.Cleanup(); } primary_writer.Close(); if (secondary_writer != null) { secondary_writer.Close(); } return(null); } if (request.OptimizeIndex) { Stopwatch watch = new Stopwatch(); Logger.Log.Debug("Optimizing {0}", IndexName); watch.Start(); primary_writer.Optimize(); if (secondary_writer == null) { secondary_writer = new IndexWriter(SecondaryStore, IndexingAnalyzer, false); } secondary_writer.Optimize(); watch.Stop(); Logger.Log.Debug("{0} optimized in {1}", IndexName, watch); } // Step #4. Close our writers and return the events to // indicate what has happened. primary_writer.Close(); if (secondary_writer != null) { secondary_writer.Close(); } // Send a single IndexerIndexablesReceipt if there were deferred indexables if (deferred_indexables.Count > 0) { Log.Debug("{0} indexables generated more indexables; asking daemon to schedule their indexing.", deferred_indexables.Count); IndexerIndexablesReceipt r = new IndexerIndexablesReceipt(); receipt_queue.Add(r); } IndexerReceipt [] receipt_array; receipt_array = new IndexerReceipt [receipt_queue.Count]; for (int i = 0; i < receipt_queue.Count; ++i) { receipt_array [i] = (IndexerReceipt)receipt_queue [i]; } return(receipt_array); }
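// Flush_Unlocked above follows the pre-2.x Lucene flush pattern: deletions go through an
// IndexReader (keyed by the "Uri" and "ParentUri" terms), the readers are closed, and only
// then are replacement documents written through an IndexWriter opened with create = false.
// The helper below is a condensed, illustrative sketch of one such delete-then-re-add
// cycle, not Beagrep code: "store" stands in for PrimaryStore, "analyzer" for
// IndexingAnalyzer, and the caller is assumed to supply the already-built replacement
// Document.
using Lucene.Net.Analysis;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Store;

static class FlushSketch {
	public static int ReplaceByUri (Directory store, Analyzer analyzer, string uri_str, Document new_doc)
	{
		int num_delete = 0;

		// Pass 1: delete the old document and any child documents it generated.
		IndexReader reader = IndexReader.Open (store);
		num_delete += reader.DeleteDocuments (new Term ("Uri", uri_str));
		num_delete += reader.DeleteDocuments (new Term ("ParentUri", uri_str));
		reader.Close ();

		// Pass 2: append the replacement through a writer. create = false appends to the
		// existing index instead of recreating (and thus wiping) it.
		IndexWriter writer = new IndexWriter (store, analyzer, false);
		writer.AddDocument (new_doc);
		writer.Close ();

		return num_delete;
	}
}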