Example #1
        static void Main(string[] args)
        {
            var urisToCrawl      = GetSiteToCrawl(Path.Combine(System.AppDomain.CurrentDomain.BaseDirectory, @"FAQ\CrawlUrls.txt"));
            var crawlRuleContent = GetCrawlRuleFileContent(Path.Combine(System.AppDomain.CurrentDomain.BaseDirectory, @"FAQ\CrawlRules.txt"));
            var decisionMaker    = new CrawlDecisionMakerWithCrawlRules(crawlRuleContent);

            XmlConfigurator.Configure(); //Configure log4net from the application config file

            var config = AbotXConfigurationSectionHandler.LoadFromXml().Convert();

            config.IsJavascriptRenderingEnabled = true;
            config.JavascriptRenderingWaitTimeInMilliseconds = 3000;
            config.MaxConcurrentSiteCrawls = 1;
            config.MaxConcurrentThreads    = 2;

            var impls = new ImplementationOverride(config);

            impls.CrawlDecisionMaker = decisionMaker;
            var crawler = new CrawlerX(config, impls);

            //Register for page level events (handler methods are defined elsewhere in the class)
            crawler.PageCrawlStarting        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompleted       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowed      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;

            foreach (var uriToCrawl in urisToCrawl)
            {
                var result = crawler.Crawl(uriToCrawl);
            }

            Console.Read();
        }
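
The GetSiteToCrawl and GetCrawlRuleFileContent helpers referenced above are not part of this listing. A minimal sketch of what they might look like, assuming CrawlUrls.txt holds one absolute URI per line and CrawlRules.txt is handed to CrawlDecisionMakerWithCrawlRules as raw text (both file formats are assumptions, and System.IO plus System.Linq are required):

        //Hypothetical helper: reads one absolute URI per non-empty line (format assumed)
        static IEnumerable<Uri> GetSiteToCrawl(string filePath)
        {
            return File.ReadAllLines(filePath)
                       .Where(line => !string.IsNullOrWhiteSpace(line))
                       .Select(line => new Uri(line.Trim()));
        }

        //Hypothetical helper: returns the rule file content as raw text (format assumed)
        static string GetCrawlRuleFileContent(string filePath)
        {
            return File.ReadAllText(filePath);
        }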
Example #2
        public void Run()
        {
            var resources = SqlHelper.UnloadResources();

            if (!resources.Any())
            {
                return;
            }

            var siteToCrawls = new List<SiteToCrawl>();

            foreach (var res in resources)
            {
                for (var i = 0; i < 10; i++)
                {
                    siteToCrawls.Add(new SiteToCrawl
                    {
                        Uri     = new Uri(string.Format(_urlPattern, res, 10 * i)),
                        SiteBag = new { Name = res, Number = i + 1 }
                    });
                }
            }

            CrawlConfigurationX config = AbotXConfigurationSectionHandler.LoadFromXml().Convert();

            XmlConfigurator.Configure(); //So the log4net logger is configured before crawling starts

            var siteToCrawlProvider = new SiteToCrawlProvider();

            siteToCrawlProvider.AddSitesToCrawl(siteToCrawls);

            //Create the crawl engine instance
            var impls = new ParallelImplementationOverride(
                config,
                new ParallelImplementationContainer
                {
                    SiteToCrawlProvider = siteToCrawlProvider
                });

            _crawlerEngine = new ParallelCrawlerEngine(config, impls);

            //Register for site level events
            _crawlerEngine.AllCrawlsCompleted += (sender, eventArgs) =>
            {
                Console.WriteLine("Completed crawling all sites");
                _crawlerEngine.Stop(true);
                Run();
            };
            _crawlerEngine.SiteCrawlCompleted += (sender, eventArgs) =>
            {
                Console.WriteLine("Completed crawling site {0}", eventArgs.CrawledSite.SiteToCrawl.Uri);
            };
            _crawlerEngine.CrawlerInstanceCreated += (sender, eventArgs) =>
            {
                eventArgs.Crawler.CrawlBag = eventArgs.SiteToCrawl.SiteBag;
                //Register for crawler level events. These are Abot's events!!!
                eventArgs.Crawler.PageCrawlCompleted += (abotSender, abotEventArgs) =>
                {
                    var         crawlX      = abotSender as CrawlerX;
                    CrawledPage crawledPage = abotEventArgs.CrawledPage;

                    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                    {
                        Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
                    }
                    else
                    {
                        if (string.IsNullOrEmpty(crawledPage.Content.Text))
                        {
                            Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
                        }
                        else
                        {
                            try
                            {
                                if (crawledPage.CrawlDepth == 1)
                                {
                                    Console.WriteLine("Depth: {0} --- Crawl of page succeeded {1}", crawledPage.CrawlDepth, crawledPage.Uri.AbsoluteUri);
                                    var item = new CrawledItem()
                                    {
                                        Name       = crawlX.CrawlBag.Name,
                                        PageNumber = crawlX.CrawlBag.Number,
                                        Url        = crawledPage.Uri.AbsoluteUri,
                                        Detail     = crawledPage.Content.Text
                                    };

                                    SqlHelper.Store(new List<CrawledItem> { item });
                                }
                            }
                            catch (Exception e)
                            {
                                Console.WriteLine(e.Message);
                            }
                        }
                    }

                    //var htmlAgilityPackDocument = crawledPage.HtmlDocument; //Html Agility Pack parser
                    //var angleSharpHtmlDocument = crawledPage.AngleSharpHtmlDocument; //AngleSharp parser
                };
                eventArgs.Crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
                {
                    //Only follow Baidu search result pages at depth 0 and Baidu redirect links at depth 1
                    var skipAtDepth0 = pageToCrawl.CrawlDepth == 0 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/s?wd");
                    var skipAtDepth1 = pageToCrawl.CrawlDepth == 1 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/link");

                    if (skipAtDepth0 || skipAtDepth1)
                    {
                        return new CrawlDecision
                        {
                            Allow  = false,
                            Reason = "Only Baidu search result and redirect pages are crawled"
                        };
                    }

                    return new CrawlDecision { Allow = true };
                });
            };

            _crawlerEngine.StartAsync();
        }
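
The fields and types used by Run() above (_crawlerEngine, _urlPattern, CrawledItem, SqlHelper) are defined elsewhere. A minimal sketch under stated assumptions: the Baidu search URL pattern and the CrawledItem shape are inferred from how the code uses them, not taken from the original source:

        //Hypothetical members; the URL pattern is an assumption inferred from the
        //"www.baidu.com/s?wd" check and the 10 * i result offset used above
        private ParallelCrawlerEngine _crawlerEngine;
        private readonly string _urlPattern = "http://www.baidu.com/s?wd={0}&pn={1}";

        //Shape inferred from the properties assigned in the PageCrawlCompleted handler
        public class CrawledItem
        {
            public string Name       { get; set; }
            public int    PageNumber { get; set; }
            public string Url        { get; set; }
            public string Detail     { get; set; }
        }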