Beispiel #1
0
        static void DoMain(string [] args)
        {
            SystemInformation.SetProcessName("beagrep-build-index");

            if (args.Length < 2)
            {
                PrintUsage();
            }

            ArrayList allowed_patterns    = new ArrayList();
            ArrayList denied_patterns     = new ArrayList();
            ArrayList denied_dir_patterns = new ArrayList();

            int i = 0;

            while (i < args.Length)
            {
                string arg = args [i];
                ++i;
                string next_arg = i < args.Length ? args [i] : null;

                switch (arg)
                {
                case "-h":
                case "--help":
                    PrintUsage();
                    break;

                case "--tag":
                    if (next_arg != null)
                    {
                        arg_tag = next_arg;
                    }
                    ++i;
                    break;

                case "-r":
                case "--recursive":
                    arg_recursive = true;
                    break;

                case "--enable-deletion":
                    arg_delete = true;
                    break;

                case "--disable-directories":
                    arg_disable_directories = true;
                    break;

                case "--enable-text-cache":
                    arg_cache_text = true;
                    break;

                case "--target":
                    if (next_arg != null)
                    {
                        arg_output = Path.IsPathRooted(next_arg) ? next_arg : Path.GetFullPath(next_arg);
                    }
                    ++i;
                    break;

                case "--disable-filtering":
                    arg_disable_filtering = true;
                    break;

                case "--allow-pattern":
                    if (next_arg == null)
                    {
                        break;
                    }

                    if (next_arg.IndexOf(',') != -1)
                    {
                        foreach (string pattern in next_arg.Split(','))
                        {
                            allowed_patterns.Add(pattern);
                        }
                    }
                    else
                    {
                        allowed_patterns.Add(next_arg);
                    }

                    ++i;
                    break;

                case "--deny-directory-pattern":
                    if (next_arg == null)
                    {
                        break;
                    }

                    if (next_arg.IndexOf(',') != -1)
                    {
                        foreach (string pattern in next_arg.Split(','))
                        {
                            denied_dir_patterns.Add(pattern);
                        }
                    }
                    else
                    {
                        denied_dir_patterns.Add(next_arg);
                    }

                    ++i;
                    break;

                case "--deny-pattern":
                    if (next_arg == null)
                    {
                        break;
                    }

                    if (next_arg.IndexOf(',') != -1)
                    {
                        foreach (string pattern in next_arg.Split(','))
                        {
                            denied_patterns.Add(pattern);
                        }
                    }
                    else
                    {
                        denied_patterns.Add(next_arg);
                    }

                    ++i;
                    break;

                case "--disable-restart":
                    arg_disable_restart = true;
                    break;

                case "--source":
                    if (next_arg == null)
                    {
                        break;
                    }

                    arg_source = next_arg;
                    ++i;
                    break;

                default:
                    if (arg.StartsWith("-") || arg.StartsWith("--"))
                    {
                        PrintUsage();
                    }

                    string path = Path.IsPathRooted(arg) ? arg : Path.GetFullPath(arg);
                    if (path != "/" && path.EndsWith("/"))
                    {
                        path = path.TrimEnd('/');
                    }

                    if (Directory.Exists(path))
                    {
                        pending_directories.Enqueue(new DirectoryInfo(path));
                    }
                    else if (File.Exists(path))
                    {
                        pending_files.Enqueue(new FileInfo(path));
                    }
                    break;
                }
            }

            /////////////////////////////////////////////////////////

            if (arg_output == null)
            {
                Logger.Log.Error("--target must be specified");
                Environment.Exit(1);
            }

            // Set the storage dir, this should be used to store log messages
            // and filterver.dat
            PathFinder.StorageDir = arg_output;

            foreach (FileSystemInfo info in pending_directories)
            {
                if (Path.GetFullPath(arg_output) == info.FullName)
                {
                    Logger.Log.Error("Target directory cannot be one of the source paths.");
                    Environment.Exit(1);
                }
            }

            foreach (FileSystemInfo info in pending_files)
            {
                if (Path.GetFullPath(arg_output) == info.FullName)
                {
                    Logger.Log.Error("Target directory cannot be one of the source paths.");
                    Environment.Exit(1);
                }
            }

            if (!Directory.Exists(Path.GetDirectoryName(arg_output)))
            {
                Logger.Log.Error("Index directory not available for construction: {0}", arg_output);
                Environment.Exit(1);
            }

            // Be *EXTRA PARANOID* about the contents of the target
            // directory, because creating an indexing driver will
            // nuke it.
            if (Directory.Exists(arg_output))
            {
                foreach (FileInfo info in DirectoryWalker.GetFileInfos(arg_output))
                {
                    if (Array.IndexOf(allowed_files, info.Name) == -1)
                    {
                        Logger.Log.Error("{0} doesn't look safe to delete: non-Beagrep file {1} was found", arg_output, info.FullName);
                        Environment.Exit(1);
                    }
                }

                foreach (DirectoryInfo info in DirectoryWalker.GetDirectoryInfos(arg_output))
                {
                    if (Array.IndexOf(allowed_dirs, info.Name) == -1)
                    {
                        Logger.Log.Error("{0} doesn't look safe to delete: non-Beagrep directory {1} was found", arg_output, info.FullName);
                        Environment.Exit(1);
                    }
                }
            }

            string config_file_path = Path.Combine(arg_output, "StaticIndex.xml");
            string prev_source      = null;

            if (File.Exists(config_file_path))
            {
                Config static_index_config = Conf.LoadFrom(config_file_path);
                if (static_index_config == null)
                {
                    Log.Error("Invalid configuation file {0}", config_file_path);
                    Environment.Exit(1);
                }

                prev_source = static_index_config.GetOption("Source", null);
                if (arg_source != null && prev_source != arg_source)
                {
                    Log.Error("Source already set to {0} for existing static index. Cannot set source to {1}.", prev_source, arg_source);
                    Environment.Exit(1);
                }

                // If arg_source is not given, and prev_source is present, use prev_source
                // as the arg_source. This is useful for re-running build-index without
                // giving --arg_source for already existing static index
                arg_source = prev_source;
            }


            if (arg_source == null)
            {
                DirectoryInfo dir = new DirectoryInfo(StringFu.SanitizePath(arg_output));
                arg_source = dir.Name;
            }

            string global_files_config = Path.Combine(PathFinder.ConfigDataDir, "config-files");

            global_files_config = Path.Combine(global_files_config, Conf.Names.FilesQueryableConfig + ".xml");
            if (!File.Exists(global_files_config))
            {
                Log.Error("Global configuration file not found {0}", global_files_config);
                Environment.Exit(0);
            }

            // Setup regexes for allowed/denied patterns
            if (allowed_patterns.Count > 0)
            {
                allowed_regex = StringFu.GetPatternRegex(allowed_patterns);
            }
            else
            {
                // Read the exclude values from config
                // For system-wide indexes, only the global config value will be used
                Config          config = Conf.Get(Conf.Names.FilesQueryableConfig);
                List <string[]> values = config.GetListOptionValues(Conf.Names.ExcludePattern);
                if (values != null)
                {
                    foreach (string[] exclude in values)
                    {
                        denied_patterns.Add(exclude [0]);
                    }
                }

                if (denied_patterns.Count > 0)
                {
                    denied_regex = StringFu.GetPatternRegex(denied_patterns);
                }
            }

            if (denied_dir_patterns.Count > 0)
            {
                denied_dir_regex = StringFu.GetPatternRegex(denied_dir_patterns);
                Log.Always("Will ignore directories matching regular expression: {0}", denied_dir_regex);
            }

            Log.Always("Starting beagrep-build-index (pid {0}) at {1}", Process.GetCurrentProcess().Id, DateTime.Now);

            driver           = new LuceneIndexingDriver(arg_output, MINOR_VERSION, false);
            driver.TextCache = (arg_cache_text) ? new TextCache(arg_output) : null;
            if (driver.TextCache != null)
            {
                driver.TextCache.WorldReadable = true;
            }

            backing_fa_store = new FileAttributesStore_Sqlite(driver.TopDirectory, driver.Fingerprint);
            fa_store         = new FileAttributesStore(backing_fa_store);

            // Set up signal handlers
#if MONO_1_9
            Shutdown.SetupSignalHandlers(delegate(int signal)
            {
                if (signal == (int)Mono.Unix.Native.Signum.SIGINT ||
                    signal == (int)Mono.Unix.Native.Signum.SIGTERM)
                {
                    Shutdown.BeginShutdown();
                }
            });
#else
            SetupSignalHandlers();
#endif

            Thread monitor_thread = null;

            Stopwatch watch = new Stopwatch();
            watch.Start();

            if (!arg_disable_restart)
            {
                // Start the thread that monitors memory usage.
                monitor_thread = ExceptionHandlingThread.Start(new ThreadStart(MemoryMonitorWorker));
            }

            // Start indexworker to do the crawling and indexing
            IndexWorker();

            // Join any threads so that we know that we're the only thread still running
            if (monitor_thread != null)
            {
                monitor_thread.Join();
            }

            watch.Stop();
            Logger.Log.Debug("Elapsed time {0}.", watch);

            // Write this after indexing is done. This is because, if creating a new index,
            // LuceneIndexingDriver.Create() is called which purges the entire directory.

            if (prev_source == null)
            {
                Config static_index_config = Conf.LoadNew("StaticIndex.xml");

                // Write StaticIndex.xml containing:
                // The name of the source
                static_index_config.SetOption("Source", arg_source);
                static_index_config ["Source"].Description = "Source of the static index";


                Conf.SaveTo(static_index_config, config_file_path);
            }

            if (restart)
            {
                Logger.Log.Debug("Restarting beagrep-build-index");
                Process p = new Process();
                p.StartInfo.UseShellExecute = false;
                // FIXME: Maybe this isn't the right way to do things?  It should be ok,
                // the PATH is inherited from the shell script which runs mono itself.
                p.StartInfo.FileName  = "mono";
                p.StartInfo.Arguments = String.Join(" ", Environment.GetCommandLineArgs());
                p.Start();
            }

            Log.Always("Exiting beagrep-build-index (pid {0}) at {1}", Process.GetCurrentProcess().Id, DateTime.Now);
        }
Beispiel #2
0
		static void LaunchHelper ()
		{
			// If we are in the process of shutting down, return immediately.
			if (Shutdown.ShutdownRequested)
				return;

			lock (helper_lock) {

				// If a helper appears to be running, return immediately.
				if (CheckHelper ())
					return;
				
				Logger.Log.Debug ("Launching helper process");

				SafeProcess p = new SafeProcess ();
				string[] args = new string [3];
				args [0] = helper_path;

				if (BeagrepDaemon.DisableTextCache)
					args [1] = "--disable-text-cache";
				else
					args [1] = String.Empty;

				if (Log.Level == LogLevel.Debug)
					args [2] = "--debug";
				else
					args [2] = String.Empty;

				p.Arguments = args;
				p.RedirectStandardOutput = false;
				p.RedirectStandardError = false;
				p.Start ();

				Logger.Log.Debug ("IndexHelper PID is {0}", p.Id);

				// Poll the helper's socket.  Wait up to a minute
				// (500 ms * 120 times) for the helper to be ready
				// to handle requests.
				Stopwatch watch = new Stopwatch ();
				watch.Start ();
				int poll_count = 0;
				bool found_helper;
				do {
					Thread.Sleep (500);
					++poll_count;
					found_helper = CheckHelper ();
				} while (poll_count < 120 
					 && ! found_helper
					 && ! Shutdown.ShutdownRequested);
				watch.Stop ();
				
				if (! found_helper)
					throw new Exception (String.Format ("Couldn't launch helper process {0}", p.Id));

				Logger.Log.Debug ("Found IndexHelper ({0}) in {1}", p.Id, watch);
				helper_pid = p.Id;
			}
		}
        ////////////////////////////////////////////////////////////////

        public void DoQuery(Query query,
                            IQueryResult result,
                            ICollection search_subset_uris,                  // should be internal uris
                            QueryPartHook query_part_hook,
                            HitFilter hit_filter)
        {
            if (Debug)
            {
                Logger.Log.Debug("###### {0}: Starting low-level queries", IndexName);
            }

            Stopwatch total, a, b, c, d, e, f;

            total = new Stopwatch();
            a     = new Stopwatch();
            b     = new Stopwatch();
            c     = new Stopwatch();
            d     = new Stopwatch();
            e     = new Stopwatch();
            f     = new Stopwatch();

            total.Start();
            a.Start();

            ArrayList primary_required_part_queries;
            ArrayList secondary_required_part_queries;

            LNS.BooleanQuery primary_prohibited_part_query;
            LNS.BooleanQuery secondary_prohibited_part_query;

            AndHitFilter all_hit_filters;

            ArrayList term_list;

            // Assemble all of the parts into a bunch of Lucene queries

            term_list = AssembleQuery(query,
                                      query_part_hook,
                                      hit_filter,
                                      out primary_required_part_queries,
                                      out secondary_required_part_queries,
                                      out primary_prohibited_part_query,
                                      out secondary_prohibited_part_query,
                                      out all_hit_filters);

            a.Stop();
            if (Debug)
            {
                Log.Debug("###### {0}: Building queries took {1}", IndexName, a);
            }

            // If we have no required parts, give up.
            if (primary_required_part_queries == null)
            {
                return;
            }

            b.Start();

            //
            // Now that we have all of these nice queries, let's execute them!
            //

            IndexReader primary_reader;

            LNS.IndexSearcher primary_searcher;
            IndexReader       secondary_reader;

            LNS.IndexSearcher secondary_searcher;

            // Create the searchers that we will need.
            if (!BuildSearchers(out primary_reader, out primary_searcher, out secondary_reader, out secondary_searcher))
            {
                return;
            }

            b.Stop();
            if (Debug)
            {
                Log.Debug("###### {0}: Readers/searchers built in {1}", IndexName, b);
            }

            // Build whitelists and blacklists for search subsets.
            c.Start();

            // Possibly create our whitelists from the search subset.
            LuceneBitArray primary_whitelist, secondary_whitelist;

            CreateQueryWhitelists(search_subset_uris,
                                  primary_searcher,
                                  secondary_searcher,
                                  primary_prohibited_part_query,
                                  secondary_prohibited_part_query,
                                  out primary_whitelist,
                                  out secondary_whitelist);

            c.Stop();
            if (Debug)
            {
                Log.Debug("###### {0}: Whitelists and blacklists built in {1}", IndexName, c);
            }

            // Now run the low level queries against our indexes.
            d.Start();

            BetterBitArray primary_matches = null;

            if (primary_required_part_queries != null)
            {
                if (secondary_searcher != null)
                {
                    primary_matches = DoRequiredQueries_TwoIndex(primary_searcher,
                                                                 secondary_searcher,
                                                                 primary_required_part_queries,
                                                                 secondary_required_part_queries,
                                                                 primary_whitelist,
                                                                 secondary_whitelist);
                }
                else
                {
                    primary_matches = DoRequiredQueries(primary_searcher,
                                                        primary_required_part_queries,
                                                        primary_whitelist);
                }
            }

            d.Stop();
            if (Debug)
            {
                Logger.Log.Debug("###### {0}: Low-level queries finished in {1}", IndexName, d);
            }

            e.Start();
            // Only generate results if we got some matches
            if (primary_matches != null && primary_matches.ContainsTrue())
            {
                GenerateQueryResults(primary_reader,
                                     secondary_reader,
                                     primary_matches,
                                     result,
                                     term_list,
                                     query.MaxHits,
                                     new HitFilter(all_hit_filters.HitFilter),
                                     IndexName);
            }

            e.Stop();

            if (Debug)
            {
                Log.Debug("###### {0}: Query results generated in {1}", IndexName, e);
            }

            //
            // Finally, we clean up after ourselves.
            //

            f.Start();
            CloseSearchers(primary_reader, primary_searcher, secondary_reader, secondary_searcher);
            f.Stop();

            if (Debug)
            {
                Log.Debug("###### {0}: Readers/searchers released in {1}", IndexName, f);
            }

            total.Stop();
            if (Debug)
            {
                Log.Debug("###### {0}: Query time breakdown:", IndexName);
                Log.Debug("###### {0}:    Build queries {1,6} ({2:0.0}%)", IndexName, a, 100 * a.ElapsedTime / total.ElapsedTime);
                Log.Debug("###### {0}:      Got readers {1,6} ({2:0.0}%)", IndexName, b, 100 * b.ElapsedTime / total.ElapsedTime);
                Log.Debug("###### {0}:       Whitelists {1,6} ({2:0.0}%)", IndexName, c, 100 * c.ElapsedTime / total.ElapsedTime);
                Log.Debug("###### {0}:          Queries {1,6} ({2:0.0}%)", IndexName, d, 100 * d.ElapsedTime / total.ElapsedTime);
                Log.Debug("###### {0}:    Gen'd Results {1,6} ({2:0.0}%)", IndexName, e, 100 * e.ElapsedTime / total.ElapsedTime);
                Log.Debug("###### {0}:   Reader cleanup {1,6} ({2:0.0}%)", IndexName, f, 100 * f.ElapsedTime / total.ElapsedTime);
                Log.Debug("###### {0}:            TOTAL {1,6}", IndexName, total);

                Logger.Log.Debug("###### {0}: Total query run in {1}", IndexName, total);
            }
        }
        private static void GenerateQueryResults(IndexReader primary_reader,
                                                 IndexReader secondary_reader,
                                                 BetterBitArray primary_matches,
                                                 IQueryResult result,
                                                 ICollection query_term_list,
                                                 int max_results,
                                                 HitFilter hit_filter,
                                                 string index_name)
        {
            int num_hits;

            if (Debug)
            {
                Logger.Log.Debug(">>> {0}: Initially handed {1} matches", index_name, primary_matches.TrueCount);
            }

            if (primary_matches.TrueCount <= max_results)
            {
                if (Debug)
                {
                    Logger.Log.Debug(">>> {0}: Initial count is within our limit of {1}", index_name, max_results);
                }
                num_hits = primary_matches.TrueCount;
            }
            else
            {
                if (Debug)
                {
                    Logger.Log.Debug(">>> {0}: Number of hits is capped at {1}", index_name, max_results);
                }
                num_hits = max_results;
            }

            Stopwatch total, d, e;

            total = new Stopwatch();
            d     = new Stopwatch();
            e     = new Stopwatch();

            total.Start();

            ArrayList final_list_of_hits = null;

            // This is used only for scoring
            Dictionary <int, Hit> hits_by_id = new Dictionary <int, Hit> (num_hits);

            int total_number_of_matches = primary_matches.TrueCount;

            if (primary_matches.TrueCount > max_results)
            {
                final_list_of_hits = ScanRecentDocs(primary_reader,
                                                    secondary_reader,
                                                    primary_matches,
                                                    hits_by_id,
                                                    max_results,
                                                    ref total_number_of_matches,
                                                    hit_filter,
                                                    index_name);
            }

            if (final_list_of_hits == null)
            {
                final_list_of_hits = FindRecentResults(primary_reader,
                                                       secondary_reader,
                                                       primary_matches,
                                                       hits_by_id,
                                                       max_results,
                                                       ref total_number_of_matches,
                                                       hit_filter,
                                                       index_name);
            }

            d.Start();

            ScoreHits(hits_by_id, primary_reader, query_term_list);
            hits_by_id = null;

            d.Stop();

            if (Debug)
            {
                Log.Debug(">>> {0}: Scored hits in {1}", index_name, d);
            }

            e.Start();

            // 25 hits seems to be the sweet spot: anything lower
            // and serialization overhead gets us, higher takes
            // longer to send out.
            const int MAX_QUEUED_HITS = 25;
            int       sent_index      = 0;

            // Break up the hits into reasonably sized chunks for
            // sending over the wire.
            for (int i = 0; i < final_list_of_hits.Count; ++i)
            {
                // Flush our hits
                if (i > 0 && i % MAX_QUEUED_HITS == 0)
                {
                    result.Add(final_list_of_hits.GetRange(0, MAX_QUEUED_HITS));
                    final_list_of_hits.RemoveRange(0, MAX_QUEUED_HITS);
                    i -= MAX_QUEUED_HITS;
                }
            }

            // Flush the remaining hits
            result.Add(final_list_of_hits, total_number_of_matches);
            final_list_of_hits = null;

            e.Stop();

            if (Debug)
            {
                Log.Debug(">>> {0}: Hit filters executed and results sent in {1}", index_name, e);
            }

            total.Stop();

            if (Debug)
            {
                Logger.Log.Debug(">>> {0}: GenerateQueryResults time statistics:", index_name);
                //Logger.Log.Debug (">>> {0}:   Short circuit {1,6} ({2:0.0}%)", index_name, a == null ? "N/A" : a.ToString (), a == null ? 0.0 : 100 * a.ElapsedTime / total.ElapsedTime);
                //Logger.Log.Debug (">>> {0}:     Create docs {1,6} ({2:0.0}%)", index_name, b, 100 * b.ElapsedTime / total.ElapsedTime);
                //Logger.Log.Debug (">>> {0}:    Hit assembly {1,6} ({2:0.0}%)", index_name, c, 100 * c.ElapsedTime / total.ElapsedTime);
                Logger.Log.Debug(">>> {0}:     Scored hits {1,6} ({2:0.0}%)", index_name, d, 100 * d.ElapsedTime / total.ElapsedTime);
                Logger.Log.Debug(">>> {0}:    Results sent {1,6} ({2:0.0}%)", index_name, e, 100 * e.ElapsedTime / total.ElapsedTime);
                Logger.Log.Debug(">>> {0}:           TOTAL {1,6}", index_name, total);
            }
        }
		////////////////////////////////////////////////////////////////

		public int DoCountMatchQuery (Query query, QueryPartHook query_part_hook)
		{
			if (Debug)
				Logger.Log.Debug ("###### {0}: Starting low-level queries", IndexName);

			Stopwatch total;
			total = new Stopwatch ();
			total.Start ();

			ArrayList primary_required_part_queries;
			ArrayList secondary_required_part_queries;

			LNS.BooleanQuery primary_prohibited_part_query;
			LNS.BooleanQuery secondary_prohibited_part_query;

			AndHitFilter all_hit_filters;

			ArrayList term_list;
			term_list = AssembleQuery ( query,
						    query_part_hook,
						    null,
						    out primary_required_part_queries,
						    out secondary_required_part_queries,
						    out primary_prohibited_part_query,
						    out secondary_prohibited_part_query,
						    out all_hit_filters);

			// If we have no required parts, give up.
			if (primary_required_part_queries == null)
				return 0;

			IndexReader primary_reader;
			LNS.IndexSearcher primary_searcher;
			IndexReader secondary_reader;
			LNS.IndexSearcher secondary_searcher;

			if (! BuildSearchers (out primary_reader, out primary_searcher, out secondary_reader, out secondary_searcher))
				return 0;

			// Build whitelists and blacklists for search subsets.
			LuceneBitArray primary_whitelist, secondary_whitelist;
			CreateQueryWhitelists (null,
				primary_searcher,
				secondary_searcher,
				primary_prohibited_part_query,
				secondary_prohibited_part_query,
				out primary_whitelist,
				out secondary_whitelist);

			// Now run the low level queries against our indexes.
			BetterBitArray primary_matches = null;
			if (primary_required_part_queries != null) {

				if (secondary_searcher != null)
					primary_matches = DoRequiredQueries_TwoIndex (primary_searcher,
										      secondary_searcher,
										      primary_required_part_queries,
										      secondary_required_part_queries,
										      primary_whitelist,
										      secondary_whitelist);
				else
					primary_matches = DoRequiredQueries (primary_searcher,
									     primary_required_part_queries,
									     primary_whitelist);

			} 

			int result = 0;
			// FIXME: Pass the count through uri-filter and other validation checks
			if (primary_matches != null)
				result = primary_matches.TrueCount;

			CloseSearchers (primary_reader, primary_searcher, secondary_reader, secondary_searcher);

			total.Stop ();
			if (Debug)
				Logger.Log.Debug ("###### {0}: Total query run in {1}", IndexName, total);

			return result;
		}
        ////////////////////////////////////////////////////////////////

        public int DoCountMatchQuery(Query query, QueryPartHook query_part_hook)
        {
            if (Debug)
            {
                Logger.Log.Debug("###### {0}: Starting low-level queries", IndexName);
            }

            Stopwatch total;

            total = new Stopwatch();
            total.Start();

            ArrayList primary_required_part_queries;
            ArrayList secondary_required_part_queries;

            LNS.BooleanQuery primary_prohibited_part_query;
            LNS.BooleanQuery secondary_prohibited_part_query;

            AndHitFilter all_hit_filters;

            ArrayList term_list;

            term_list = AssembleQuery(query,
                                      query_part_hook,
                                      null,
                                      out primary_required_part_queries,
                                      out secondary_required_part_queries,
                                      out primary_prohibited_part_query,
                                      out secondary_prohibited_part_query,
                                      out all_hit_filters);

            // If we have no required parts, give up.
            if (primary_required_part_queries == null)
            {
                return(0);
            }

            IndexReader primary_reader;

            LNS.IndexSearcher primary_searcher;
            IndexReader       secondary_reader;

            LNS.IndexSearcher secondary_searcher;

            if (!BuildSearchers(out primary_reader, out primary_searcher, out secondary_reader, out secondary_searcher))
            {
                return(0);
            }

            // Build whitelists and blacklists for search subsets.
            LuceneBitArray primary_whitelist, secondary_whitelist;

            CreateQueryWhitelists(null,
                                  primary_searcher,
                                  secondary_searcher,
                                  primary_prohibited_part_query,
                                  secondary_prohibited_part_query,
                                  out primary_whitelist,
                                  out secondary_whitelist);

            // Now run the low level queries against our indexes.
            BetterBitArray primary_matches = null;

            if (primary_required_part_queries != null)
            {
                if (secondary_searcher != null)
                {
                    primary_matches = DoRequiredQueries_TwoIndex(primary_searcher,
                                                                 secondary_searcher,
                                                                 primary_required_part_queries,
                                                                 secondary_required_part_queries,
                                                                 primary_whitelist,
                                                                 secondary_whitelist);
                }
                else
                {
                    primary_matches = DoRequiredQueries(primary_searcher,
                                                        primary_required_part_queries,
                                                        primary_whitelist);
                }
            }

            int result = 0;

            // FIXME: Pass the count through uri-filter and other validation checks
            if (primary_matches != null)
            {
                result = primary_matches.TrueCount;
            }

            CloseSearchers(primary_reader, primary_searcher, secondary_reader, secondary_searcher);

            total.Stop();
            if (Debug)
            {
                Logger.Log.Debug("###### {0}: Total query run in {1}", IndexName, total);
            }

            return(result);
        }
Beispiel #7
0
        static void LaunchHelper()
        {
            // If we are in the process of shutting down, return immediately.
            if (Shutdown.ShutdownRequested)
            {
                return;
            }

            lock (helper_lock) {
                // If a helper appears to be running, return immediately.
                if (CheckHelper())
                {
                    return;
                }

                Logger.Log.Debug("Launching helper process");

                SafeProcess p    = new SafeProcess();
                string[]    args = new string [3];
                args [0] = helper_path;

                if (BeagrepDaemon.DisableTextCache)
                {
                    args [1] = "--disable-text-cache";
                }
                else
                {
                    args [1] = String.Empty;
                }

                if (Log.Level == LogLevel.Debug)
                {
                    args [2] = "--debug";
                }
                else
                {
                    args [2] = String.Empty;
                }

                p.Arguments = args;
                p.RedirectStandardOutput = false;
                p.RedirectStandardError  = false;
                p.Start();

                Logger.Log.Debug("IndexHelper PID is {0}", p.Id);

                // Poll the helper's socket.  Wait up to a minute
                // (500 ms * 120 times) for the helper to be ready
                // to handle requests.
                Stopwatch watch = new Stopwatch();
                watch.Start();
                int  poll_count = 0;
                bool found_helper;
                do
                {
                    Thread.Sleep(500);
                    ++poll_count;
                    found_helper = CheckHelper();
                } while (poll_count < 120 &&
                         !found_helper &&
                         !Shutdown.ShutdownRequested);
                watch.Stop();

                if (!found_helper)
                {
                    throw new Exception(String.Format("Couldn't launch helper process {0}", p.Id));
                }

                Logger.Log.Debug("Found IndexHelper ({0}) in {1}", p.Id, watch);
                helper_pid = p.Id;
            }
        }
Beispiel #8
0
                public static bool StartupProcess ()
                {
                        // Profile our initialization
                        Stopwatch stopwatch = new Stopwatch ();
                        stopwatch.Start ();

                        // Fire up our server
                        if (! StartServer ()) {
                                if (! arg_replace) {
                                        Logger.Log.Error ("Could not set up the listener for beagrep requests.  "
                                                          + "There is probably another beagrepd instance running.  "
                                                          + "Use --replace to replace the running service");
                                        Environment.Exit (1);
                                }

                                ReplaceExisting ();
                        }

                        // Set up out-of-process indexing
                        LuceneQueryable.IndexerHook = new LuceneQueryable.IndexerCreator (RemoteIndexer.NewRemoteIndexer);

                        Config config = Conf.Get (Conf.Names.DaemonConfig);

                        // Initialize synchronization to keep the indexes local if PathFinder.StorageDir
                        // is on a non-block device, or if BEAGREP_SYNCHRONIZE_LOCALLY is set

                        if ((! SystemInformation.IsPathOnBlockDevice (PathFinder.StorageDir) &&
                             config.GetOption (Conf.Names.IndexSynchronization, true)) ||
                            Environment.GetEnvironmentVariable ("BEAGREP_SYNCHRONIZE_LOCALLY") != null)
                                IndexSynchronization.Initialize ();

                        // Start the query driver.
                        Logger.Log.Debug ("Starting QueryDriver");
                        QueryDriver.Start ();

                        // Start our battery monitor so we can shut down the
                        // scheduler if needed.
                        BatteryMonitor.Init ();

                        bool initially_on_battery = ! BatteryMonitor.UsingAC && ! config.GetOption (Conf.Names.IndexOnBattery, false);

                        // Start the Global Scheduler thread
                        if (! arg_disable_scheduler) {
                                if (! initially_on_battery) {
                                        Logger.Log.Debug ("Starting Scheduler thread");
                                        Scheduler.Global.Start ();
                                } else {
                                        Log.Debug ("Beagrep started on battery, not starting scheduler thread");
                                }
                        }

                        // Start our Inotify threads
                        Inotify.Start ();

                        // Test if the FileAdvise stuff is working: This will print a
                        // warning if not.  The actual advice calls will fail silently.
                        FileAdvise.TestAdvise ();

#if ENABLE_AVAHI
                        zeroconf = new Beagrep.Daemon.Network.Zeroconf ();
#endif

                        Conf.WatchForUpdates ();

                        stopwatch.Stop ();

                        Logger.Log.Debug ("Daemon initialization finished after {0}", stopwatch);

                        SystemInformation.LogMemoryUsage ();

                        if (arg_indexing_test_mode) {
                                Thread.Sleep (1000); // Ugly paranoia: wait a second for the backends to settle.
                                Logger.Log.Debug ("Running in indexing test mode");
                                Scheduler.Global.EmptyQueueEvent += OnEmptySchedulerQueue;
                                Scheduler.Global.Add (null); // pulse the scheduler
                        }

                        return false;
                }
		private static ArrayList   FindRecentResults (IndexReader	    primary_reader,
							      IndexReader	    secondary_reader,
							      BetterBitArray	    primary_matches,
							      Dictionary<int, Hit>  hits_by_id,
							      int		    max_results,
							      ref int		    total_number_of_matches,
							      HitFilter		    hit_filter,
							      string		    index_name)
		{
			Stopwatch b = new Stopwatch ();
			b.Start ();
			
			int count = 0;
			Document doc;

			ArrayList all_docs = null;
			TopScores top_docs = null;
			TermDocs term_docs = null;

			if (primary_matches.TrueCount > max_results)
				top_docs = new TopScores (max_results);
			else
				all_docs = new ArrayList (primary_matches.TrueCount);

			if (secondary_reader != null)
				term_docs = secondary_reader.TermDocs ();

			for (int match_index = primary_matches.Count; ; match_index --) {
				// Walk across the matches backwards, since newer
				// documents are more likely to be at the end of
				// the index.
				match_index = primary_matches.GetPreviousTrueIndex (match_index);
				if (match_index < 0)
					break;

				count++;

				doc = primary_reader.Document (match_index, fields_timestamp_uri);

				// Check the timestamp --- if we have already reached our
				// limit, we might be able to reject it immediately.
				string timestamp_str;
				long timestamp_num = 0;

				timestamp_str = doc.Get ("Timestamp");
				if (timestamp_str == null) {
					Logger.Log.Warn ("No timestamp on {0}!", GetUriFromDocument (doc));
				} else {
					timestamp_num = Int64.Parse (doc.Get ("Timestamp"));
					if (top_docs != null && ! top_docs.WillAccept (timestamp_num))
						continue;
				}

				// Get the actual hit now
				// doc was created with only 2 fields, so first get the complete lucene document for primary document.
				// Also run our hit_filter now, if we have one. Since we insist of returning max_results
				// most recent hits, any hits that would be filtered out should happen now and not later.
				Hit hit = CreateHit (primary_reader.Document (match_index), secondary_reader, term_docs);
				if (hit_filter != null && ! hit_filter (hit)) {
					if (Debug)
						Log.Debug ("Filtered out {0}", hit.Uri);
					total_number_of_matches --;
					continue;
				}

				hits_by_id [match_index] = hit;

				// Add the document to the appropriate data structure.
				// We use the timestamp_num as the score, so high
				// scores correspond to more-recent timestamps.
				if (all_docs != null)
					all_docs.Add (hit);
				else
					top_docs.Add (timestamp_num, hit);
			}

			if (term_docs != null)
				term_docs.Close ();

			b.Stop ();

			if (Debug)
				Log.Debug (">>> {0}: Instantiated and scanned {1} documents in {2}", index_name, count, b);

			if (all_docs != null) {
				// Sort results before sending
				all_docs.Sort ();
				return all_docs;
			} else {
				return top_docs.TopScoringObjects;
			}
		}
		private IndexerReceipt [] Flush_Unlocked (IndexerRequest request)
		{
			ArrayList receipt_queue;
			receipt_queue = new ArrayList ();

			IndexReader primary_reader, secondary_reader;
			primary_reader = IndexReader.Open (PrimaryStore);
			secondary_reader = IndexReader.Open (SecondaryStore);

			// Step #1: Make our first pass over the list of
			// indexables that make up our request.  For each add
			// or property change in the request, get the Lucene
			// documents so we can move forward any persistent
			// properties (for adds) or all old properties (for
			// property changes).
			//
			// Then, for each add or remove in the request,
			// delete the associated documents from the index.
			// Note that we previously cached added documents so
			// that we can move persistent properties forward.

			// parent_child_old_props is double-nested hashtable (depth-2 tree)
			// indexed by the parent uri, it stores another hashtable indexed by the (parent+child documents)
			// FIXME: 2-level hashtable is a waste for any non-child document.
			// Replace this by a better data structure.
			Hashtable parent_child_old_props = UriFu.NewHashtable ();
			TermDocs term_docs = secondary_reader.TermDocs ();
			int delete_count = 0;

			IEnumerable request_indexables = request.Indexables;

			foreach (Indexable indexable in request_indexables) {

				string uri_str = UriFu.UriToEscapedString (indexable.Uri);
				Term term;

				// Store the necessary properties from old documents for re-addition
				if (indexable.Type == IndexableType.Add ||
				    indexable.Type == IndexableType.PropertyChange) {

					term = new Term ("Uri", uri_str);
					term_docs.Seek (term);

					Hashtable this_parent_child_props = null;

					if (term_docs.Next ()) {
						this_parent_child_props = UriFu.NewHashtable ();
						this_parent_child_props [indexable.Uri] = secondary_reader.Document (term_docs.Doc ());
						parent_child_old_props [indexable.Uri] = this_parent_child_props;
					}

					term = new Term ("ParentUri", uri_str);
					term_docs.Seek (term);

					while (term_docs.Next ()) {
						Document doc = secondary_reader.Document (term_docs.Doc ());

						string child_uri_str = doc.Get ("Uri");
						Uri child_uri = UriFu.EscapedStringToUri (child_uri_str);
						// Any valid lucene document *should* have a Uri, so no need to check for null
						// Store the child documents too, to save persistent-properties
						// of child documents
						this_parent_child_props [child_uri] = doc;
					}
				}

				// Now remove (non-remove indexables will be re-added in next block)
				Logger.Log.Debug ("-{0}", indexable.DisplayUri);
				
				int num_delete = 0;

				term = new Term ("Uri", uri_str);
				// For property changes, only secondary index is modified
				secondary_reader.DeleteDocuments (term);

				// Now remove from everywhere else (if asked to remove or if asked to add, in which case
				// we first remove and then add)
				// So we also need to remove child documents
				if (indexable.Type != IndexableType.PropertyChange) {
					num_delete = primary_reader.DeleteDocuments (term);

					// When we delete an indexable, also delete any children.
					// FIXME: Shouldn't we also delete any children of children, etc.?
					term = new Term ("ParentUri", uri_str);
					num_delete += primary_reader.DeleteDocuments (term);
					secondary_reader.DeleteDocuments (term);
				}

				// If this is a strict removal (and not a deletion that
				// we are doing in anticipation of adding something back),
				// queue up a removed receipt.
				if (indexable.Type == IndexableType.Remove) {
					IndexerRemovedReceipt r;
					r = new IndexerRemovedReceipt (indexable.Id);
					r.NumRemoved = num_delete;
					receipt_queue.Add (r);
				}

				delete_count += num_delete;
			}

			term_docs.Close ();

			if (HaveItemCount)
				AdjustItemCount (-delete_count);
			else
				SetItemCount (primary_reader);
			
			// We are now done with the readers, so we close them.
			// And also free them. Somehow not freeing them is preventing them from
			// GCed at all.
			primary_reader.Close ();
			primary_reader = null;
			secondary_reader.Close ();
			secondary_reader = null;

			// FIXME: If we crash at exactly this point, we are in
			// trouble.  Items will have been dropped from the index
			// without the proper replacements being added.  We can
			// hopefully fix this when we move to Lucene 2.1.

			// Step #2: Make another pass across our list of indexables
			// and write out any new documents.

			if (text_cache != null)
				text_cache.BeginTransaction ();
				
			IndexWriter primary_writer, secondary_writer;
			// FIXME: Lock obtain time-out can happen here; if that happens,
			// an exception will be thrown and this method will break in the middle
			// leaving IndexWriters unclosed! Same for any Lucene.Net-index modification
			// methods.
			primary_writer = new IndexWriter (PrimaryStore, IndexingAnalyzer, false);
			secondary_writer = null;

			foreach (Indexable indexable in request_indexables) {
				// If shutdown has been started, break here
				// FIXME: Some more processing will continue, a lot of them
				// concerning receipts, but the daemon will anyway ignore receipts
				// now, what is the fastest way to stop from here ?
				if (Shutdown.ShutdownRequested) {
					Log.Debug ("Shutdown initiated. Breaking while flushing indexables.");
					break;
				}

				// Receipts for removes were generated in the
				// previous block.  Now we just have to remove
				// items from the text cache.
				if (indexable.Type == IndexableType.Remove) {
					if (text_cache != null)
						text_cache.Delete (indexable.Uri);

					continue;
				}

				IndexerAddedReceipt r;
				Hashtable prop_change_docs = (Hashtable) parent_child_old_props [indexable.Uri];

				if (indexable.Type == IndexableType.PropertyChange) {

					Logger.Log.Debug ("+{0} (props only)", indexable.DisplayUri);

					r = new IndexerAddedReceipt (indexable.Id);
					r.PropertyChangesOnly = true;
					receipt_queue.Add (r);

					Document doc;
					if (prop_change_docs == null)
						doc = null;
					else
						doc = (Document) prop_change_docs [indexable.Uri];

					Document new_doc;
					new_doc = RewriteDocument (doc, indexable);

					// Write out the new document...
					if (secondary_writer == null)
						secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
					secondary_writer.AddDocument (new_doc);

					// Get child property change indexables...
					ArrayList prop_change_indexables;
					prop_change_indexables = GetChildPropertyChange (prop_change_docs, indexable);
					// and store them; no need to delete them first, since they were already removed from the index
					if (prop_change_indexables == null)
						continue;

					foreach (Indexable prop_change_indexable in prop_change_indexables) {
						Log.Debug ("+{0} (props only, generated indexable)", prop_change_indexable.Uri);
						doc = (Document) prop_change_docs [prop_change_indexable.Uri];
						new_doc = RewriteDocument (doc, prop_change_indexable);
						secondary_writer.AddDocument (new_doc);
					}

					continue; // ...and proceed to the next Indexable
				}

				// If we reach this point we know we are dealing with an IndexableType.Add

				if (indexable.Type != IndexableType.Add)
					throw new Exception ("When I said it was an IndexableType.Add, I meant it!");

				r = AddIndexableToIndex (indexable, primary_writer, ref secondary_writer, prop_change_docs);
				if (r != null)
					receipt_queue.Add (r);
			}

			if (text_cache != null)
				text_cache.CommitTransaction ();

			if (Shutdown.ShutdownRequested) {
				foreach (DeferredInfo di in deferred_indexables)
					di.Cleanup ();
				deferred_indexables.Clear ();

				foreach (Indexable indexable in request_indexables)
					indexable.Cleanup ();

				primary_writer.Close ();
				if (secondary_writer != null)
					secondary_writer.Close ();

				return null;
			}

			if (request.OptimizeIndex) {
				Stopwatch watch = new Stopwatch ();
				Logger.Log.Debug ("Optimizing {0}", IndexName);
				watch.Start ();
				primary_writer.Optimize ();
				if (secondary_writer == null)
					secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
				secondary_writer.Optimize ();
				watch.Stop ();
				Logger.Log.Debug ("{0} optimized in {1}", IndexName, watch);
			}

			// Step #4. Close our writers and return the events to
			// indicate what has happened.

			primary_writer.Close ();
			if (secondary_writer != null)
				secondary_writer.Close ();

			// Send a single IndexerIndexablesReceipt if there were deferred indexables
			if (deferred_indexables.Count > 0) {
				Log.Debug ("{0} indexables generated more indexables; asking daemon to schedule their indexing.", deferred_indexables.Count);
				IndexerIndexablesReceipt r = new IndexerIndexablesReceipt ();
				receipt_queue.Add (r);
			}

			IndexerReceipt [] receipt_array;
			receipt_array = new IndexerReceipt [receipt_queue.Count];
			for (int i = 0; i < receipt_queue.Count; ++i)
				receipt_array [i] = (IndexerReceipt) receipt_queue [i];

			return receipt_array;
		}
		// There are two ways we can determine the max_results
		// most recent items:  
		//
		// One is to instantiate Lucene documents for each of
		// the document IDs in primary_matches.  This is a
		// fairly expensive operation.
		//
		// The other is to walk through the list of all
		// document IDs in descending time order.  This is
		// a less expensive operation, but adds up over time
		// on large data sets.
		//
		// We can walk about 2.5 docs for every Document we
		// instantiate.  So what we'll do, if we have more
		// matches than available hits, is walk (m * 1.25)
		// docs to see if we can fill out the top 100 hits.
		// If not, we'll fall back to creating documents
		// for all of them.

		private static ArrayList ScanRecentDocs (IndexReader	    primary_reader,
						    IndexReader		    secondary_reader,
						    BetterBitArray	    primary_matches,
						    Dictionary<int, Hit>    hits_by_id,
						    int			    max_results,
						    ref int		    total_number_of_matches,
						    HitFilter		    hit_filter,
						    string		    index_name)
		{
			Stopwatch a = new Stopwatch ();
			a.Start ();

			TermDocs docs = primary_reader.TermDocs ();
			TermEnum enumerator = primary_reader.Terms (new Term ("InvertedTimestamp", String.Empty));
			ArrayList results = new ArrayList (max_results);
			int docs_found = 0;
			int docs_walked = 0;
			int hit_filter_removed = 0;
			int max_docs = (int) (primary_matches.TrueCount * 1.25);

			Term term;
			TermDocs secondary_term_docs = null;
			if (secondary_reader != null)
				secondary_term_docs = secondary_reader.TermDocs ();

			do {
				term = enumerator.Term ();
			
				if (term.Field () != "InvertedTimestamp")
					break;

				docs.Seek (enumerator);

				while (docs.Next ()
				       && docs_found < max_results
				       && docs_walked < max_docs) {
					int doc_id = docs.Doc ();

					if (primary_matches.Get (doc_id)) {
						Document doc = primary_reader.Document (doc_id);
						Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs);

						// If we have a HitFilter, apply it.
						if (hit_filter != null && ! hit_filter (hit)) {
							if (Debug)
								Log.Debug ("Filtered out {0}", hit.Uri);
							hit_filter_removed ++;
							continue;
						}
						hits_by_id [doc_id] = hit;
						// Add the result, last modified first
						results.Add (hit);
						docs_found++;
					}
			
					docs_walked++;
				}
			} while (enumerator.Next ()
				 && docs_found < max_results
				 && docs_walked < max_docs);

			docs.Close ();
			if (secondary_term_docs != null)
				secondary_term_docs.Close ();

			// If we've found all the docs we can return in a subset!
			// Fantastic, we've probably short circuited a slow search.
			if (docs_found != max_results) {
				// Otherwise bad luck! Not all docs found
				// Start afresh - this time traversing all results
				results = null;
			} else {
				// Adjust total_number_of_matches. We need to do this to avoid scenarios like the following:
				// max_hits = 100. Matched 100 results. But hit filter removed 30. So 70 results will be returned.
				// We want to avoid saying "Showing top 70 of 100". Note that since we are not passing
				// every document in the index through the hit_filter, when we say "Showing top 100 of 1234", the
				// 1234 could actually be much less. But since max_hits was 100, that will not mislead the user.
				total_number_of_matches -= hit_filter_removed;
			}

			a.Stop ();
			if (Debug) {
				Log.Debug (">>> {0}: Walked {1} items, populated an enum with {2} items in {3}", index_name, docs_walked, docs_found, a);
				
				if (docs_found == max_results)
					Log.Debug (">>> {0}: Successfully short circuited timestamp ordering!", index_name);
			}

			return results;
		}
		private static void GenerateQueryResults (IndexReader       primary_reader,
							  IndexReader       secondary_reader,
							  BetterBitArray    primary_matches,
							  IQueryResult      result,
							  ICollection       query_term_list,
							  int               max_results,
							  HitFilter         hit_filter,
							  string            index_name)
		{
			int num_hits;

			if (Debug)
				Logger.Log.Debug (">>> {0}: Initially handed {1} matches", index_name, primary_matches.TrueCount);

			if (primary_matches.TrueCount <= max_results) {
				if (Debug)
					Logger.Log.Debug (">>> {0}: Initial count is within our limit of {1}", index_name, max_results);
				num_hits = primary_matches.TrueCount;
			} else {
				if (Debug)
					Logger.Log.Debug (">>> {0}: Number of hits is capped at {1}", index_name, max_results);
				num_hits = max_results;
			}

			Stopwatch total, d, e;
			total = new Stopwatch ();
			d = new Stopwatch ();
			e = new Stopwatch ();

			total.Start ();

			ArrayList final_list_of_hits = null;

			// This is used only for scoring
			Dictionary<int, Hit> hits_by_id = new Dictionary<int, Hit> (num_hits);

			int total_number_of_matches = primary_matches.TrueCount;

			if (primary_matches.TrueCount > max_results)
				final_list_of_hits = ScanRecentDocs (primary_reader,
					secondary_reader,
					primary_matches,
					hits_by_id,
					max_results,
					ref total_number_of_matches,
					hit_filter,
					index_name);

			if (final_list_of_hits == null)
				final_list_of_hits = FindRecentResults (primary_reader,
					secondary_reader,
					primary_matches,
					hits_by_id,
					max_results,
					ref total_number_of_matches,
					hit_filter,
					index_name);

			d.Start ();

			ScoreHits (hits_by_id, primary_reader, query_term_list);
			hits_by_id = null;

			d.Stop ();

			if (Debug)
				Log.Debug (">>> {0}: Scored hits in {1}", index_name, d);

			e.Start ();

			// 25 hits seems to be the sweet spot: anything lower
			// and serialization overhead gets us, higher takes
			// longer to send out.
			const int MAX_QUEUED_HITS = 25;
			int sent_index = 0;

			// Break up the hits into reasonably sized chunks for
			// sending over the wire.
			for (int i = 0; i < final_list_of_hits.Count; ++i) {
				// Flush our hits
				if (i > 0 && i % MAX_QUEUED_HITS == 0) {
					result.Add (final_list_of_hits.GetRange (0, MAX_QUEUED_HITS));
					final_list_of_hits.RemoveRange (0, MAX_QUEUED_HITS);
					i -= MAX_QUEUED_HITS;
				}
			}

			// Flush the remaining hits
			result.Add (final_list_of_hits, total_number_of_matches);
			final_list_of_hits = null;

			e.Stop ();

			if (Debug)
				Log.Debug (">>> {0}: Hit filters executed and results sent in {1}", index_name, e);

			total.Stop ();

			if (Debug) {
				Logger.Log.Debug (">>> {0}: GenerateQueryResults time statistics:", index_name);
				//Logger.Log.Debug (">>> {0}:   Short circuit {1,6} ({2:0.0}%)", index_name, a == null ? "N/A" : a.ToString (), a == null ? 0.0 : 100 * a.ElapsedTime / total.ElapsedTime);
				//Logger.Log.Debug (">>> {0}:     Create docs {1,6} ({2:0.0}%)", index_name, b, 100 * b.ElapsedTime / total.ElapsedTime);
				//Logger.Log.Debug (">>> {0}:    Hit assembly {1,6} ({2:0.0}%)", index_name, c, 100 * c.ElapsedTime / total.ElapsedTime);
				Logger.Log.Debug (">>> {0}:     Scored hits {1,6} ({2:0.0}%)", index_name, d, 100 * d.ElapsedTime / total.ElapsedTime);
				Logger.Log.Debug (">>> {0}:    Results sent {1,6} ({2:0.0}%)", index_name, e, 100 * e.ElapsedTime / total.ElapsedTime);
				Logger.Log.Debug (">>> {0}:           TOTAL {1,6}", index_name, total);
			}
		}
		////////////////////////////////////////////////////////////////

		public void DoQuery (Query               query,
				     IQueryResult        result,
				     ICollection         search_subset_uris, // should be internal uris
				     QueryPartHook       query_part_hook,
				     HitFilter           hit_filter)
		{
			if (Debug)
				Logger.Log.Debug ("###### {0}: Starting low-level queries", IndexName);

			Stopwatch total, a, b, c, d, e, f;

			total = new Stopwatch ();
			a = new Stopwatch ();
			b = new Stopwatch ();
			c = new Stopwatch ();
			d = new Stopwatch ();
			e = new Stopwatch ();
			f = new Stopwatch ();

			total.Start ();
			a.Start ();

			ArrayList primary_required_part_queries;
			ArrayList secondary_required_part_queries;

			LNS.BooleanQuery primary_prohibited_part_query;
			LNS.BooleanQuery secondary_prohibited_part_query;

			AndHitFilter all_hit_filters;

			ArrayList term_list;

			// Assemble all of the parts into a bunch of Lucene queries

			term_list = AssembleQuery (query,
				query_part_hook,
				hit_filter,
				out primary_required_part_queries,
				out secondary_required_part_queries,
				out primary_prohibited_part_query,
				out secondary_prohibited_part_query,
				out all_hit_filters);

			a.Stop ();
			if (Debug)
				Log.Debug ("###### {0}: Building queries took {1}", IndexName, a);

			// If we have no required parts, give up.
			if (primary_required_part_queries == null)
				return;

			b.Start ();
			
			//
			// Now that we have all of these nice queries, let's execute them!
			//

			IndexReader primary_reader;
			LNS.IndexSearcher primary_searcher;
			IndexReader secondary_reader;
			LNS.IndexSearcher secondary_searcher;

			// Create the searchers that we will need.
			if (! BuildSearchers (out primary_reader, out primary_searcher, out secondary_reader, out secondary_searcher))
				return;

			b.Stop ();
			if (Debug)
				Log.Debug ("###### {0}: Readers/searchers built in {1}", IndexName, b);

			// Build whitelists and blacklists for search subsets.
			c.Start ();

			// Possibly create our whitelists from the search subset.
			LuceneBitArray primary_whitelist, secondary_whitelist;
			CreateQueryWhitelists (search_subset_uris,
				primary_searcher,
				secondary_searcher,
				primary_prohibited_part_query,
				secondary_prohibited_part_query,
				out primary_whitelist,
				out secondary_whitelist);

			c.Stop ();
			if (Debug)
				Log.Debug ("###### {0}: Whitelists and blacklists built in {1}", IndexName, c);

			// Now run the low level queries against our indexes.
			d.Start ();

			BetterBitArray primary_matches = null;

			if (primary_required_part_queries != null) {

				if (secondary_searcher != null)
					primary_matches = DoRequiredQueries_TwoIndex (primary_searcher,
										      secondary_searcher,
										      primary_required_part_queries,
										      secondary_required_part_queries,
										      primary_whitelist,
										      secondary_whitelist);
				else
					primary_matches = DoRequiredQueries (primary_searcher,
									     primary_required_part_queries,
									     primary_whitelist);

			} 

			d.Stop ();
			if (Debug)
				Logger.Log.Debug ("###### {0}: Low-level queries finished in {1}", IndexName, d);

			e.Start ();
			// Only generate results if we got some matches
			if (primary_matches != null && primary_matches.ContainsTrue ()) {
				GenerateQueryResults (primary_reader,
						      secondary_reader,
						      primary_matches,
						      result,
						      term_list,
						      query.MaxHits,
						      new HitFilter (all_hit_filters.HitFilter),
						      IndexName);
			}

			e.Stop ();

			if (Debug)
				Log.Debug ("###### {0}: Query results generated in {1}", IndexName, e);

			//
			// Finally, we clean up after ourselves.
			//

			f.Start ();
			CloseSearchers (primary_reader, primary_searcher, secondary_reader, secondary_searcher);
			f.Stop ();
			
			if (Debug)
				Log.Debug ("###### {0}: Readers/searchers released in {1}", IndexName, f);

			total.Stop ();
			if (Debug) {
				Log.Debug ("###### {0}: Query time breakdown:", IndexName);
				Log.Debug ("###### {0}:    Build queries {1,6} ({2:0.0}%)", IndexName, a, 100 * a.ElapsedTime / total.ElapsedTime);
				Log.Debug ("###### {0}:      Got readers {1,6} ({2:0.0}%)", IndexName, b, 100 * b.ElapsedTime / total.ElapsedTime);
				Log.Debug ("###### {0}:       Whitelists {1,6} ({2:0.0}%)", IndexName, c, 100 * c.ElapsedTime / total.ElapsedTime);
				Log.Debug ("###### {0}:          Queries {1,6} ({2:0.0}%)", IndexName, d, 100 * d.ElapsedTime / total.ElapsedTime);
				Log.Debug ("###### {0}:    Gen'd Results {1,6} ({2:0.0}%)", IndexName, e, 100 * e.ElapsedTime / total.ElapsedTime);
				Log.Debug ("###### {0}:   Reader cleanup {1,6} ({2:0.0}%)", IndexName, f, 100 * f.ElapsedTime / total.ElapsedTime);
				Log.Debug ("###### {0}:            TOTAL {1,6}", IndexName, total);

				Logger.Log.Debug ("###### {0}: Total query run in {1}", IndexName, total);
			}

		}
Beispiel #14
0
        public static bool StartupProcess()
        {
            // Profile our initialization
            Stopwatch stopwatch = new Stopwatch();

            stopwatch.Start();

            // Fire up our server
            if (!StartServer())
            {
                if (!arg_replace)
                {
                    Logger.Log.Error("Could not set up the listener for beagrep requests.  "
                                     + "There is probably another beagrepd instance running.  "
                                     + "Use --replace to replace the running service");
                    Environment.Exit(1);
                }

                ReplaceExisting();
            }

            // Set up out-of-process indexing
            LuceneQueryable.IndexerHook = new LuceneQueryable.IndexerCreator(RemoteIndexer.NewRemoteIndexer);

            Config config = Conf.Get(Conf.Names.DaemonConfig);

            // Initialize synchronization to keep the indexes local if PathFinder.StorageDir
            // is on a non-block device, or if BEAGREP_SYNCHRONIZE_LOCALLY is set

            if ((!SystemInformation.IsPathOnBlockDevice(PathFinder.StorageDir) &&
                 config.GetOption(Conf.Names.IndexSynchronization, true)) ||
                Environment.GetEnvironmentVariable("BEAGREP_SYNCHRONIZE_LOCALLY") != null)
            {
                IndexSynchronization.Initialize();
            }

            // Start the query driver.
            Logger.Log.Debug("Starting QueryDriver");
            QueryDriver.Start();

            // Start our battery monitor so we can shut down the
            // scheduler if needed.
            BatteryMonitor.Init();

            bool initially_on_battery = !BatteryMonitor.UsingAC && !config.GetOption(Conf.Names.IndexOnBattery, false);

            // Start the Global Scheduler thread
            if (!arg_disable_scheduler)
            {
                if (!initially_on_battery)
                {
                    Logger.Log.Debug("Starting Scheduler thread");
                    Scheduler.Global.Start();
                }
                else
                {
                    Log.Debug("Beagrep started on battery, not starting scheduler thread");
                }
            }

            // Start our Inotify threads
            Inotify.Start();

            // Test if the FileAdvise stuff is working: This will print a
            // warning if not.  The actual advice calls will fail silently.
            FileAdvise.TestAdvise();

#if ENABLE_AVAHI
            zeroconf = new Beagrep.Daemon.Network.Zeroconf();
#endif

            Conf.WatchForUpdates();

            stopwatch.Stop();

            Logger.Log.Debug("Daemon initialization finished after {0}", stopwatch);

            SystemInformation.LogMemoryUsage();

            if (arg_indexing_test_mode)
            {
                Thread.Sleep(1000);                  // Ugly paranoia: wait a second for the backends to settle.
                Logger.Log.Debug("Running in indexing test mode");
                Scheduler.Global.EmptyQueueEvent += OnEmptySchedulerQueue;
                Scheduler.Global.Add(null);                  // pulse the scheduler
            }

            return(false);
        }
        // There are two ways we can determine the max_results
        // most recent items:
        //
        // One is to instantiate Lucene documents for each of
        // the document IDs in primary_matches.  This is a
        // fairly expensive operation.
        //
        // The other is to walk through the list of all
        // document IDs in descending time order.  This is
        // a less expensive operation, but adds up over time
        // on large data sets.
        //
        // We can walk about 2.5 docs for every Document we
        // instantiate.  So what we'll do, if we have more
        // matches than available hits, is walk (m * 1.25)
        // docs to see if we can fill out the top 100 hits.
        // If not, we'll fall back to creating documents
        // for all of them.

        private static ArrayList ScanRecentDocs(IndexReader primary_reader,
                                                IndexReader secondary_reader,
                                                BetterBitArray primary_matches,
                                                Dictionary <int, Hit> hits_by_id,
                                                int max_results,
                                                ref int total_number_of_matches,
                                                HitFilter hit_filter,
                                                string index_name)
        {
            Stopwatch a = new Stopwatch();

            a.Start();

            TermDocs  docs               = primary_reader.TermDocs();
            TermEnum  enumerator         = primary_reader.Terms(new Term("InvertedTimestamp", String.Empty));
            ArrayList results            = new ArrayList(max_results);
            int       docs_found         = 0;
            int       docs_walked        = 0;
            int       hit_filter_removed = 0;
            int       max_docs           = (int)(primary_matches.TrueCount * 1.25);

            Term     term;
            TermDocs secondary_term_docs = null;

            if (secondary_reader != null)
            {
                secondary_term_docs = secondary_reader.TermDocs();
            }

            do
            {
                term = enumerator.Term();

                if (term.Field() != "InvertedTimestamp")
                {
                    break;
                }

                docs.Seek(enumerator);

                while (docs.Next() &&
                       docs_found < max_results &&
                       docs_walked < max_docs)
                {
                    int doc_id = docs.Doc();

                    if (primary_matches.Get(doc_id))
                    {
                        Document doc = primary_reader.Document(doc_id);
                        Hit      hit = CreateHit(doc, secondary_reader, secondary_term_docs);

                        // If we have a HitFilter, apply it.
                        if (hit_filter != null && !hit_filter(hit))
                        {
                            if (Debug)
                            {
                                Log.Debug("Filtered out {0}", hit.Uri);
                            }
                            hit_filter_removed++;
                            continue;
                        }
                        hits_by_id [doc_id] = hit;
                        // Add the result, last modified first
                        results.Add(hit);
                        docs_found++;
                    }

                    docs_walked++;
                }
            } while (enumerator.Next() &&
                     docs_found < max_results &&
                     docs_walked < max_docs);

            docs.Close();
            if (secondary_term_docs != null)
            {
                secondary_term_docs.Close();
            }

            // If we've found all the docs we can return in a subset!
            // Fantastic, we've probably short circuited a slow search.
            if (docs_found != max_results)
            {
                // Otherwise bad luck! Not all docs found
                // Start afresh - this time traversing all results
                results = null;
            }
            else
            {
                // Adjust total_number_of_matches. We need to do this to avoid scenarios like the following:
                // max_hits = 100. Matched 100 results. But hit filter removed 30. So 70 results will be returned.
                // We want to avoid saying "Showing top 70 of 100". Note that since we are not passing
                // every document in the index through the hit_filter, when we say "Showing top 100 of 1234", the
                // 1234 could actually be much less. But since max_hits was 100, that will not mislead the user.
                total_number_of_matches -= hit_filter_removed;
            }

            a.Stop();
            if (Debug)
            {
                Log.Debug(">>> {0}: Walked {1} items, populated an enum with {2} items in {3}", index_name, docs_walked, docs_found, a);

                if (docs_found == max_results)
                {
                    Log.Debug(">>> {0}: Successfully short circuited timestamp ordering!", index_name);
                }
            }

            return(results);
        }
Beispiel #16
0
                static void DoMain (string [] args)
                {
                        SystemInformation.SetProcessName ("beagrep-build-index");

                        if (args.Length < 2)
                                PrintUsage ();

                        ArrayList allowed_patterns = new ArrayList ();
                        ArrayList denied_patterns = new ArrayList ();
                        ArrayList denied_dir_patterns = new ArrayList ();

                        int i = 0;
                        while (i < args.Length) {

                                string arg = args [i];
                                ++i;
                                string next_arg = i < args.Length ? args [i] : null;

                                switch (arg) {
                                case "-h":
                                case "--help":
                                        PrintUsage ();
                                        break;

                                case "--tag":
                                        if (next_arg != null)
                                                arg_tag = next_arg;
                                        ++i;
                                        break;

                                case "-r":
                                case "--recursive":
                                        arg_recursive = true;
                                        break;

                                case "--enable-deletion":
                                        arg_delete = true;
                                        break;

                                case "--disable-directories":
                                        arg_disable_directories = true;
                                        break;

                                case "--enable-text-cache":
                                        arg_cache_text = true;
                                        break;

                                case "--target":
                                        if (next_arg != null)
                                                arg_output = Path.IsPathRooted (next_arg) ? next_arg : Path.GetFullPath (next_arg);
                                        ++i;
                                        break;

                                case "--disable-filtering":
                                        arg_disable_filtering = true;
                                        break;

                                case "--allow-pattern":
                                        if (next_arg == null)
                                                break;

                                        if (next_arg.IndexOf (',') != -1) {
                                                foreach (string pattern in next_arg.Split (','))
                                                        allowed_patterns.Add (pattern);

                                        } else {
                                                allowed_patterns.Add (next_arg);
                                        }

                                        ++i;
                                        break;

                                case "--deny-directory-pattern":
                                        if (next_arg == null)
                                                break;

                                        if (next_arg.IndexOf (',') != -1) {
                                                foreach (string pattern in next_arg.Split (','))
                                                        denied_dir_patterns.Add (pattern);
                                        } else {
                                                denied_dir_patterns.Add (next_arg);
                                        }

                                        ++i;
                                        break;

                                case "--deny-pattern":
                                        if (next_arg == null)
                                                break;

                                        if (next_arg.IndexOf (',') != -1) {
                                                foreach (string pattern in next_arg.Split (','))
                                                        denied_patterns.Add (pattern);

                                        } else {
                                                denied_patterns.Add (next_arg);
                                        }

                                        ++i;
                                        break;

                                case "--disable-restart":
                                        arg_disable_restart = true;
                                        break;

                                case "--source":
                                        if (next_arg == null)
                                                break;

                                        arg_source = next_arg;
                                        ++i;
                                        break;

                                default:
                                        if (arg.StartsWith ("-") || arg.StartsWith ("--"))
                                                PrintUsage ();

                                        string path = Path.IsPathRooted (arg) ? arg : Path.GetFullPath (arg);
                                        if (path != "/" && path.EndsWith ("/"))
                                                path = path.TrimEnd ('/');

                                        if (Directory.Exists (path))
                                                pending_directories.Enqueue (new DirectoryInfo (path));
                                        else if (File.Exists (path))
                                                pending_files.Enqueue (new FileInfo (path));
                                        break;
                                }
                        }

                        /////////////////////////////////////////////////////////

                        if (arg_output == null) {
                                Logger.Log.Error ("--target must be specified");
                                Environment.Exit (1);
                        }

                        // Set the storage dir, this should be used to store log messages
                        // and filterver.dat
                        PathFinder.StorageDir = arg_output;

                        foreach (FileSystemInfo info in pending_directories) {
                                if (Path.GetFullPath (arg_output) == info.FullName) {
                                        Logger.Log.Error ("Target directory cannot be one of the source paths.");
                                        Environment.Exit (1);
                                }
                        }

                        foreach (FileSystemInfo info in pending_files) {
                                if (Path.GetFullPath (arg_output) == info.FullName) {
                                        Logger.Log.Error ("Target directory cannot be one of the source paths.");
                                        Environment.Exit (1);
                                }
                        }

                        if (!Directory.Exists (Path.GetDirectoryName (arg_output))) {
                                Logger.Log.Error ("Index directory not available for construction: {0}", arg_output);
                                Environment.Exit (1);
                        }

                        // Be *EXTRA PARANOID* about the contents of the target
                        // directory, because creating an indexing driver will
                        // nuke it.
                        if (Directory.Exists (arg_output)) {

                                foreach (FileInfo info in DirectoryWalker.GetFileInfos (arg_output)) {
                                        if (Array.IndexOf (allowed_files, info.Name) == -1) {
                                                Logger.Log.Error ("{0} doesn't look safe to delete: non-Beagrep file {1} was found", arg_output, info.FullName);
                                                Environment.Exit (1);
                                        }
                                }

                                foreach (DirectoryInfo info in DirectoryWalker.GetDirectoryInfos (arg_output)) {
                                        if (Array.IndexOf (allowed_dirs, info.Name) == -1) {
                                                Logger.Log.Error ("{0} doesn't look safe to delete: non-Beagrep directory {1} was found", arg_output, info.FullName);
                                                Environment.Exit (1);
                                        }
                                }
                        }

                        string config_file_path = Path.Combine (arg_output, "StaticIndex.xml");
                        string prev_source = null;
                        if (File.Exists (config_file_path)) {
                                Config static_index_config = Conf.LoadFrom (config_file_path);
                                if (static_index_config == null) {
                                        Log.Error ("Invalid configuation file {0}", config_file_path);
                                        Environment.Exit (1);
                                }

                                prev_source = static_index_config.GetOption ("Source", null);
                                if (arg_source != null && prev_source != arg_source) {
                                        Log.Error ("Source already set to {0} for existing static index. Cannot set source to {1}.", prev_source, arg_source);
                                        Environment.Exit (1);
                                }

                                // If arg_source is not given, and prev_source is present, use prev_source
                                // as the arg_source. This is useful for re-running build-index without
                                // giving --arg_source for already existing static index
                                arg_source = prev_source;
                        }


                        if (arg_source == null) {
                                DirectoryInfo dir = new DirectoryInfo (StringFu.SanitizePath (arg_output));
                                arg_source = dir.Name;
                        }

                        string global_files_config = Path.Combine (PathFinder.ConfigDataDir, "config-files");
                        global_files_config = Path.Combine (global_files_config, Conf.Names.FilesQueryableConfig + ".xml");
                        if (! File.Exists (global_files_config)) {
                                Log.Error ("Global configuration file not found {0}", global_files_config);
                                Environment.Exit (0);
                        }

                        // Setup regexes for allowed/denied patterns
                        if (allowed_patterns.Count > 0) {
                                allowed_regex = StringFu.GetPatternRegex (allowed_patterns);
                        } else {
                                // Read the exclude values from config
                                // For system-wide indexes, only the global config value will be used
                                Config config = Conf.Get (Conf.Names.FilesQueryableConfig);
                                List<string[]> values = config.GetListOptionValues (Conf.Names.ExcludePattern);
                                if (values != null)
                                        foreach (string[] exclude in values)
                                                denied_patterns.Add (exclude [0]);

                                if (denied_patterns.Count > 0)
                                        denied_regex = StringFu.GetPatternRegex (denied_patterns);
                        }

                        if (denied_dir_patterns.Count > 0) {
                                denied_dir_regex = StringFu.GetPatternRegex (denied_dir_patterns);
                                Log.Always("Will ignore directories matching regular expression: {0}", denied_dir_regex);
                        }

                        Log.Always ("Starting beagrep-build-index (pid {0}) at {1}", Process.GetCurrentProcess ().Id, DateTime.Now);

                        driver = new LuceneIndexingDriver (arg_output, MINOR_VERSION, false);
                        driver.TextCache = (arg_cache_text) ? new TextCache (arg_output) : null;
                        if (driver.TextCache != null)
                                driver.TextCache.WorldReadable = true;

                        backing_fa_store = new FileAttributesStore_Sqlite (driver.TopDirectory, driver.Fingerprint);
                        fa_store = new FileAttributesStore (backing_fa_store);

                        // Set up signal handlers
#if MONO_1_9
                        Shutdown.SetupSignalHandlers (delegate (int signal)
                                                        {
                                                                if (signal == (int) Mono.Unix.Native.Signum.SIGINT ||
                                                                    signal == (int) Mono.Unix.Native.Signum.SIGTERM)
                                                                        Shutdown.BeginShutdown ();
                                                        });
#else
                        SetupSignalHandlers ();
#endif

                        Thread monitor_thread = null;

                        Stopwatch watch = new Stopwatch ();
                        watch.Start ();

                        if (!arg_disable_restart) {
                                // Start the thread that monitors memory usage.
                                monitor_thread = ExceptionHandlingThread.Start (new ThreadStart (MemoryMonitorWorker));
                        }

                        // Start indexworker to do the crawling and indexing
                        IndexWorker ();

                        // Join any threads so that we know that we're the only thread still running
                        if (monitor_thread != null)
                                monitor_thread.Join ();

                        watch.Stop ();
                        Logger.Log.Debug ("Elapsed time {0}.", watch);

                        // Write this after indexing is done. This is because, if creating a new index,
                        // LuceneIndexingDriver.Create() is called which purges the entire directory.

                        if (prev_source == null) {
                                Config static_index_config = Conf.LoadNew ("StaticIndex.xml");

                                // Write StaticIndex.xml containing:
                                // The name of the source
                                static_index_config.SetOption ("Source", arg_source);
                                static_index_config ["Source"].Description = "Source of the static index";


                                Conf.SaveTo (static_index_config, config_file_path);
                        }

                        if (restart) {
                                Logger.Log.Debug ("Restarting beagrep-build-index");
                                Process p = new Process ();
                                p.StartInfo.UseShellExecute = false;
                                // FIXME: Maybe this isn't the right way to do things?  It should be ok,
                                // the PATH is inherited from the shell script which runs mono itself.
                                p.StartInfo.FileName = "mono";
                                p.StartInfo.Arguments = String.Join (" ", Environment.GetCommandLineArgs ());
                                p.Start ();
                        }

                        Log.Always ("Exiting beagrep-build-index (pid {0}) at {1}", Process.GetCurrentProcess ().Id, DateTime.Now);
                }
        private static ArrayList   FindRecentResults(IndexReader primary_reader,
                                                     IndexReader secondary_reader,
                                                     BetterBitArray primary_matches,
                                                     Dictionary <int, Hit> hits_by_id,
                                                     int max_results,
                                                     ref int total_number_of_matches,
                                                     HitFilter hit_filter,
                                                     string index_name)
        {
            Stopwatch b = new Stopwatch();

            b.Start();

            int      count = 0;
            Document doc;

            ArrayList all_docs  = null;
            TopScores top_docs  = null;
            TermDocs  term_docs = null;

            if (primary_matches.TrueCount > max_results)
            {
                top_docs = new TopScores(max_results);
            }
            else
            {
                all_docs = new ArrayList(primary_matches.TrueCount);
            }

            if (secondary_reader != null)
            {
                term_docs = secondary_reader.TermDocs();
            }

            for (int match_index = primary_matches.Count; ; match_index--)
            {
                // Walk across the matches backwards, since newer
                // documents are more likely to be at the end of
                // the index.
                match_index = primary_matches.GetPreviousTrueIndex(match_index);
                if (match_index < 0)
                {
                    break;
                }

                count++;

                doc = primary_reader.Document(match_index, fields_timestamp_uri);

                // Check the timestamp --- if we have already reached our
                // limit, we might be able to reject it immediately.
                string timestamp_str;
                long   timestamp_num = 0;

                timestamp_str = doc.Get("Timestamp");
                if (timestamp_str == null)
                {
                    Logger.Log.Warn("No timestamp on {0}!", GetUriFromDocument(doc));
                }
                else
                {
                    timestamp_num = Int64.Parse(doc.Get("Timestamp"));
                    if (top_docs != null && !top_docs.WillAccept(timestamp_num))
                    {
                        continue;
                    }
                }

                // Get the actual hit now
                // doc was created with only 2 fields, so first get the complete lucene document for primary document.
                // Also run our hit_filter now, if we have one. Since we insist of returning max_results
                // most recent hits, any hits that would be filtered out should happen now and not later.
                Hit hit = CreateHit(primary_reader.Document(match_index), secondary_reader, term_docs);
                if (hit_filter != null && !hit_filter(hit))
                {
                    if (Debug)
                    {
                        Log.Debug("Filtered out {0}", hit.Uri);
                    }
                    total_number_of_matches--;
                    continue;
                }

                hits_by_id [match_index] = hit;

                // Add the document to the appropriate data structure.
                // We use the timestamp_num as the score, so high
                // scores correspond to more-recent timestamps.
                if (all_docs != null)
                {
                    all_docs.Add(hit);
                }
                else
                {
                    top_docs.Add(timestamp_num, hit);
                }
            }

            if (term_docs != null)
            {
                term_docs.Close();
            }

            b.Stop();

            if (Debug)
            {
                Log.Debug(">>> {0}: Instantiated and scanned {1} documents in {2}", index_name, count, b);
            }

            if (all_docs != null)
            {
                // Sort results before sending
                all_docs.Sort();
                return(all_docs);
            }
            else
            {
                return(top_docs.TopScoringObjects);
            }
        }
Beispiel #18
0
                protected bool Exists ()
                {
                        if (! (Directory.Exists (top_dir)
                               && File.Exists (VersionFile)
                               && File.Exists (FingerprintFile)
                               && Directory.Exists (PrimaryIndexDirectory)
                               && IndexReader.IndexExists (PrimaryIndexDirectory)
                               && Directory.Exists (SecondaryIndexDirectory)
                               && IndexReader.IndexExists (SecondaryIndexDirectory)
                               && Directory.Exists (LockDirectory)))
                                return false;

                        // Check the index's version number.  If it is wrong,
                        // declare the index non-existent.

                        StreamReader version_reader;
                        string version_str;
                        version_reader = new StreamReader (VersionFile);
                        version_str = version_reader.ReadLine ();
                        version_reader.Close ();

                        int current_major_version, current_minor_version;
                        int i = version_str.IndexOf ('.');

                        try {
                                if (i != -1) {
                                        current_major_version = Convert.ToInt32 (version_str.Substring (0, i));
                                        current_minor_version = Convert.ToInt32 (version_str.Substring (i+1));
                                } else {
                                        current_minor_version = Convert.ToInt32 (version_str);
                                        current_major_version = 0;
                                }
                        } catch (FormatException) {
                                // Something wrong with the version file.
                                return false;
                        }

                        if (current_major_version != MAJOR_VERSION
                            || (minor_version >= 0 && current_minor_version != minor_version)) {
                                Logger.Log.Debug ("Version mismatch in {0}", index_name);
                                Logger.Log.Debug ("Index has version {0}.{1}, expected {2}.{3}",
                                                  current_major_version, current_minor_version,
                                                  MAJOR_VERSION, minor_version);
                                return false;
                        }

                        // Check the lock directory: If there is a dangling write lock,
                        // assume that the index is corrupted and declare it non-existent.
                        DirectoryInfo lock_dir_info;
                        lock_dir_info = new DirectoryInfo (LockDirectory);
                        bool dangling_lock = false;

                        foreach (FileInfo info in lock_dir_info.GetFiles ()) {
                                if (IsDanglingLock (info)) {
                                        Logger.Log.Warn ("Found a dangling index lock on {0}.", info.FullName);
                                        dangling_lock = true;
                                }
                        }

                        if (dangling_lock) {
                                Beagrep.Util.Stopwatch w = new Beagrep.Util.Stopwatch ();
                                w.Start ();

                                if (VerifyLuceneIndex (PrimaryIndexDirectory) &&
                                    VerifyLuceneIndex (SecondaryIndexDirectory)) {
                                        w.Stop ();

                                        Log.Warn ("Indexes verified in {0}.  Deleting stale lock files.", w);

                                        try {
                                                foreach (FileInfo info in lock_dir_info.GetFiles ())
                                                        info.Delete ();
                                        } catch {
                                                Log.Warn ("Could not delete lock files.");
                                                return false;
                                        }
                                        return true;
                                } else
                                        return false;
                        }

                        return true;
                }
Beispiel #19
0
        private IndexerReceipt [] Flush_Unlocked(IndexerRequest request)
        {
            ArrayList receipt_queue;

            receipt_queue = new ArrayList();

            IndexReader primary_reader, secondary_reader;

            primary_reader   = IndexReader.Open(PrimaryStore);
            secondary_reader = IndexReader.Open(SecondaryStore);

            // Step #1: Make our first pass over the list of
            // indexables that make up our request.  For each add
            // or property change in the request, get the Lucene
            // documents so we can move forward any persistent
            // properties (for adds) or all old properties (for
            // property changes).
            //
            // Then, for each add or remove in the request,
            // delete the associated documents from the index.
            // Note that we previously cached added documents so
            // that we can move persistent properties forward.

            // parent_child_old_props is double-nested hashtable (depth-2 tree)
            // indexed by the parent uri, it stores another hashtable indexed by the (parent+child documents)
            // FIXME: 2-level hashtable is a waste for any non-child document.
            // Replace this by a better data structure.
            Hashtable parent_child_old_props = UriFu.NewHashtable();
            TermDocs  term_docs    = secondary_reader.TermDocs();
            int       delete_count = 0;

            IEnumerable request_indexables = request.Indexables;

            foreach (Indexable indexable in request_indexables)
            {
                string uri_str = UriFu.UriToEscapedString(indexable.Uri);
                Term   term;

                // Store the necessary properties from old documents for re-addition
                if (indexable.Type == IndexableType.Add ||
                    indexable.Type == IndexableType.PropertyChange)
                {
                    term = new Term("Uri", uri_str);
                    term_docs.Seek(term);

                    Hashtable this_parent_child_props = null;

                    if (term_docs.Next())
                    {
                        this_parent_child_props = UriFu.NewHashtable();
                        this_parent_child_props [indexable.Uri] = secondary_reader.Document(term_docs.Doc());
                        parent_child_old_props [indexable.Uri]  = this_parent_child_props;
                    }

                    term = new Term("ParentUri", uri_str);
                    term_docs.Seek(term);

                    while (term_docs.Next())
                    {
                        Document doc = secondary_reader.Document(term_docs.Doc());

                        string child_uri_str = doc.Get("Uri");
                        Uri    child_uri     = UriFu.EscapedStringToUri(child_uri_str);
                        // Any valid lucene document *should* have a Uri, so no need to check for null
                        // Store the child documents too, to save persistent-properties
                        // of child documents
                        this_parent_child_props [child_uri] = doc;
                    }
                }

                // Now remove (non-remove indexables will be re-added in next block)
                Logger.Log.Debug("-{0}", indexable.DisplayUri);

                int num_delete = 0;

                term = new Term("Uri", uri_str);
                // For property changes, only secondary index is modified
                secondary_reader.DeleteDocuments(term);

                // Now remove from everywhere else (if asked to remove or if asked to add, in which case
                // we first remove and then add)
                // So we also need to remove child documents
                if (indexable.Type != IndexableType.PropertyChange)
                {
                    num_delete = primary_reader.DeleteDocuments(term);

                    // When we delete an indexable, also delete any children.
                    // FIXME: Shouldn't we also delete any children of children, etc.?
                    term        = new Term("ParentUri", uri_str);
                    num_delete += primary_reader.DeleteDocuments(term);
                    secondary_reader.DeleteDocuments(term);
                }

                // If this is a strict removal (and not a deletion that
                // we are doing in anticipation of adding something back),
                // queue up a removed receipt.
                if (indexable.Type == IndexableType.Remove)
                {
                    IndexerRemovedReceipt r;
                    r            = new IndexerRemovedReceipt(indexable.Id);
                    r.NumRemoved = num_delete;
                    receipt_queue.Add(r);
                }

                delete_count += num_delete;
            }

            term_docs.Close();

            if (HaveItemCount)
            {
                AdjustItemCount(-delete_count);
            }
            else
            {
                SetItemCount(primary_reader);
            }

            // We are now done with the readers, so we close them.
            // And also free them. Somehow not freeing them is preventing them from
            // GCed at all.
            primary_reader.Close();
            primary_reader = null;
            secondary_reader.Close();
            secondary_reader = null;

            // FIXME: If we crash at exactly this point, we are in
            // trouble.  Items will have been dropped from the index
            // without the proper replacements being added.  We can
            // hopefully fix this when we move to Lucene 2.1.

            // Step #2: Make another pass across our list of indexables
            // and write out any new documents.

            if (text_cache != null)
            {
                text_cache.BeginTransaction();
            }

            IndexWriter primary_writer, secondary_writer;

            // FIXME: Lock obtain time-out can happen here; if that happens,
            // an exception will be thrown and this method will break in the middle
            // leaving IndexWriters unclosed! Same for any Lucene.Net-index modification
            // methods.
            primary_writer   = new IndexWriter(PrimaryStore, IndexingAnalyzer, false);
            secondary_writer = null;

            foreach (Indexable indexable in request_indexables)
            {
                // If shutdown has been started, break here
                // FIXME: Some more processing will continue, a lot of them
                // concerning receipts, but the daemon will anyway ignore receipts
                // now, what is the fastest way to stop from here ?
                if (Shutdown.ShutdownRequested)
                {
                    Log.Debug("Shutdown initiated. Breaking while flushing indexables.");
                    break;
                }

                // Receipts for removes were generated in the
                // previous block.  Now we just have to remove
                // items from the text cache.
                if (indexable.Type == IndexableType.Remove)
                {
                    if (text_cache != null)
                    {
                        text_cache.Delete(indexable.Uri);
                    }

                    continue;
                }

                IndexerAddedReceipt r;
                Hashtable           prop_change_docs = (Hashtable)parent_child_old_props [indexable.Uri];

                if (indexable.Type == IndexableType.PropertyChange)
                {
                    Logger.Log.Debug("+{0} (props only)", indexable.DisplayUri);

                    r = new IndexerAddedReceipt(indexable.Id);
                    r.PropertyChangesOnly = true;
                    receipt_queue.Add(r);

                    Document doc;
                    if (prop_change_docs == null)
                    {
                        doc = null;
                    }
                    else
                    {
                        doc = (Document)prop_change_docs [indexable.Uri];
                    }

                    Document new_doc;
                    new_doc = RewriteDocument(doc, indexable);

                    // Write out the new document...
                    if (secondary_writer == null)
                    {
                        secondary_writer = new IndexWriter(SecondaryStore, IndexingAnalyzer, false);
                    }
                    secondary_writer.AddDocument(new_doc);

                    // Get child property change indexables...
                    ArrayList prop_change_indexables;
                    prop_change_indexables = GetChildPropertyChange(prop_change_docs, indexable);
                    // and store them; no need to delete them first, since they were already removed from the index
                    if (prop_change_indexables == null)
                    {
                        continue;
                    }

                    foreach (Indexable prop_change_indexable in prop_change_indexables)
                    {
                        Log.Debug("+{0} (props only, generated indexable)", prop_change_indexable.Uri);
                        doc     = (Document)prop_change_docs [prop_change_indexable.Uri];
                        new_doc = RewriteDocument(doc, prop_change_indexable);
                        secondary_writer.AddDocument(new_doc);
                    }

                    continue;                     // ...and proceed to the next Indexable
                }

                // If we reach this point we know we are dealing with an IndexableType.Add

                if (indexable.Type != IndexableType.Add)
                {
                    throw new Exception("When I said it was an IndexableType.Add, I meant it!");
                }

                r = AddIndexableToIndex(indexable, primary_writer, ref secondary_writer, prop_change_docs);
                if (r != null)
                {
                    receipt_queue.Add(r);
                }
            }

            if (text_cache != null)
            {
                text_cache.CommitTransaction();
            }

            if (Shutdown.ShutdownRequested)
            {
                foreach (DeferredInfo di in deferred_indexables)
                {
                    di.Cleanup();
                }
                deferred_indexables.Clear();

                foreach (Indexable indexable in request_indexables)
                {
                    indexable.Cleanup();
                }

                primary_writer.Close();
                if (secondary_writer != null)
                {
                    secondary_writer.Close();
                }

                return(null);
            }

            if (request.OptimizeIndex)
            {
                Stopwatch watch = new Stopwatch();
                Logger.Log.Debug("Optimizing {0}", IndexName);
                watch.Start();
                primary_writer.Optimize();
                if (secondary_writer == null)
                {
                    secondary_writer = new IndexWriter(SecondaryStore, IndexingAnalyzer, false);
                }
                secondary_writer.Optimize();
                watch.Stop();
                Logger.Log.Debug("{0} optimized in {1}", IndexName, watch);
            }

            // Step #4. Close our writers and return the events to
            // indicate what has happened.

            primary_writer.Close();
            if (secondary_writer != null)
            {
                secondary_writer.Close();
            }

            // Send a single IndexerIndexablesReceipt if there were deferred indexables
            if (deferred_indexables.Count > 0)
            {
                Log.Debug("{0} indexables generated more indexables; asking daemon to schedule their indexing.", deferred_indexables.Count);
                IndexerIndexablesReceipt r = new IndexerIndexablesReceipt();
                receipt_queue.Add(r);
            }

            IndexerReceipt [] receipt_array;
            receipt_array = new IndexerReceipt [receipt_queue.Count];
            for (int i = 0; i < receipt_queue.Count; ++i)
            {
                receipt_array [i] = (IndexerReceipt)receipt_queue [i];
            }

            return(receipt_array);
        }