Esempio n. 1
0
        /// <summary>
        /// Runs the crawl loop: reconstructs the persisted graph, builds the seed
        /// queue, and processes seeds one at a time until the queue is empty or
        /// cancellation is requested. Raises status/progress events for observers.
        /// </summary>
        public void Crawl()
        {
            this.RaiseStatusEvent("Crawling started");
            this.RaiseProgressEvent(0.0);

            // Resume from previously persisted state, if any.
            this.graph = Graph.Reconstruct(this.configuration.OutputPath);

            var queue = Crawler.CreateQueue(this.graph, this.seeds);

            while (queue.Count != 0)
            {
                // Check cancellation BEFORE dequeuing: the previous order dequeued
                // a seed and then discarded it on cancellation, losing that entry.
                if (this.cancellationToken.IsCancellationRequested)
                {
                    Trace.TraceInformation("Crawl cancel requested");
                    break;
                }

                var seed = queue.Dequeue().Value;

                var site = new Site(seed, this.configuration);

                try
                {
                    string info = string.Format("Crawling {0}", seed.Host);
                    Trace.TraceInformation(info);
                    this.RaiseStatusEvent(info);

                    var policy = this.RetrievePolicy(site);

                    this.RetrieveSitemap(site, policy);

                    if (this.graph.Exists(seed))
                    {
                        Trace.TraceInformation(string.Format("Seed {0} exists in the graph", seed.Host));

                        // TO-DO: Rediscover in case if seed info is outdated

                        continue;
                    }

                    var attributes = new Dictionary<string, string>()
                    {
                        { "robots", policy.IsRobots.ToString() },
                        { "sitemap", policy.IsSitemap.ToString() }
                    };

                    this.graph.AddVertex(seed, attributes);

                    this.Start(seed, site, policy);

                    this.graph.MarkAsDiscovered(seed);

                    info = string.Format("Crawling {0} completed", seed.Host);
                    Trace.TraceInformation(info);
                    this.RaiseStatusEvent(info);

                    // Newly discovered hosts become candidates for subsequent iterations.
                    Crawler.UpdateQueue(this.graph, queue);
                }
                catch (Exception exception)
                {
                    // Log the full exception (type + stack trace), not just Message:
                    // the host is quarantined below, so this log line is the only
                    // diagnostic left for fixing the underlying failure.
                    Trace.TraceError(string.Format("Failed to crawl {0}. {1}", seed.Host, exception));

                    // Avoid crawling this host until a bug fix or
                    // functionality implemented (should throw NotImplementedException)
                    this.graph.MarkAsDoNotProcess(seed);
                }
                finally
                {
                    // File.Delete is a no-op when the file does not exist, so this
                    // is safe even if robots.txt was never downloaded.
                    if (!this.configuration.SaveRobotsFile)
                    {
                        File.Delete(site.RobotsPath);
                    }

                    if (this.configuration.SerializeGraph)
                    {
                        // Saving vertices under separate paths
                        this.graph.Persist(this.configuration.OutputPath, true);
                    }

                    // Deleting empty paths for downloaded html files
                    if (site.Configuration.DeleteHtmlAfterScrape && Directory.Exists(site.HtmlDownloadPath))
                    {
                        Directory.Delete(site.HtmlDownloadPath, true);
                    }

                    site.Serialize();
                }
            }

            if (this.configuration.SerializeGraph)
            {
                // Persisting graph as a whole
                this.graph.Serialize(this.configuration.GraphFilePath);
            }

            this.RaiseStatusEvent("Crawling completed");
            this.RaiseProgressEvent(1.0);
        }