Example #1
        private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
        {
            IWebCrawler crawler = new PoliteWebCrawler();

            //Crawl only treasury.gov pages, skipping PDF links
            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                return new CrawlDecision {
                    Allow =
                        (pageToCrawl.Uri.AbsoluteUri.StartsWith("https://home.treasury.gov") ||
                         pageToCrawl.Uri.AbsoluteUri.StartsWith("https://www.treasury.gov")) &&
                        !pageToCrawl.Uri.AbsoluteUri.EndsWith(".pdf")
                };
            });


            //Always allow the page content to be downloaded
            crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
            {
                return new CrawlDecision { Allow = true };
            });

            //Register a lambda expression that decides whether the links found on a page should be crawled.
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method; here it allows all page links.
            crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
            {
                return new CrawlDecision { Allow = true };
            });

            return crawler;
        }
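For context, a minimal usage sketch (not part of the original example; the start URL and error handling are assumptions) showing how this factory method might be called:

        IWebCrawler crawler = GetCustomBehaviorUsingLambdaWebCrawler();
        //Crawl() blocks until the crawl completes
        CrawlResult result = crawler.Crawl(new Uri("https://home.treasury.gov"));
        if (result.ErrorOccurred)
            Console.WriteLine("Crawl of {0} failed: {1}", result.RootUri, result.ErrorException.Message);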
Example #2
        private void doWork(object sender, DoWorkEventArgs e)
        {
            CrawlConfiguration crawlConfig = new CrawlConfiguration();

            crawlConfig.MaxConcurrentThreads        = AppSettings.Settings.Crawler.MaxConcurrentThreads;
            crawlConfig.MaxPagesToCrawl             = AppSettings.Settings.Crawler.MaxPagesToCrawl;
            crawlConfig.UserAgentString             = AppSettings.Settings.Crawler.UserAgentString;
            crawlConfig.HttpRequestMaxAutoRedirects = AppSettings.Settings.Crawler.HttpRequestMaxAutoRedirects;
            crawlConfig.MaxCrawlDepth = AppSettings.Settings.Crawler.MaxCrawlDepth;

            //Pass null to use Abot's defaults for every dependency except the page requester (5th argument)
            PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, new SitePageRequester(crawlConfig), null, null, null, null);

            crawler.PageCrawlStartingAsync        += crawler_PageCrawlStartingAsync;
            crawler.PageCrawlCompletedAsync       += crawler_PageCrawlCompletedAsync;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowedAsync;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowedAsync;
            crawler.ShouldCrawlPage(crawler_ShouldCrawlPage);

            m_globalWordsCounter = 0;

            updateCrawlingStarted();

            Uri         url           = new Uri(siteURL.Text);
            CrawlResult crawlerResult = crawler.Crawl(url);

            if (m_backgroundWorker.CancellationPending)
            {
                e.Cancel = true;
                m_backgroundWorker.ReportProgress(0);
            }

            string elapsedTimeStr = crawlerResult.Elapsed.ToString(@"dd\.hh\:mm\:ss");

            log.Debug("*** Crawling completed. Elapsed time: " + elapsedTimeStr);
            log.Debug("*** Total words found in this site: " + m_globalWordsCounter.ToString());

            // create CSV report
            createCSVFile(m_globalWordsCounter);

            updateCrawlingFinished();
            done();
        }
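The event handlers wired up above are not included in this example. As a hypothetical sketch (the handler body is an assumption; only the PageCrawlCompletedArgs signature comes from Abot), a completed-page handler typically looks like:

        //Hypothetical sketch; the source project's crawler_PageCrawlCompletedAsync is not shown.
        //Requires using System.Net; for HttpStatusCode.
        private void crawler_PageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                log.Debug("Crawl of page failed: " + crawledPage.Uri.AbsoluteUri);
            else
                log.Debug("Crawl of page succeeded: " + crawledPage.Uri.AbsoluteUri);
        }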
Example #3
        private PoliteWebCrawler SetUp()
        {
            XmlConfigurator.Configure();

            CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();

            crawlConfig.MaxConcurrentThreads = 5; //this overrides the config value

            //Be careful with the crawl depth; even 0 already returns plenty of records
            crawlConfig.MaxCrawlDepth = 0;

            //Pass the modified config to the crawler; a parameterless PoliteWebCrawler would reload app.config and ignore the overrides above
            PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);

            crawler.ShouldCrawlPage(ShouldCrawlPage);

            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
            return crawler;
        }
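The ShouldCrawlPage method registered above is not included in the example. A hypothetical sketch of such a predicate (the internal-link rule is an assumption, not the source project's actual logic):

        //Hypothetical predicate; only follows pages under the crawl's root uri
        private CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if (!pageToCrawl.Uri.AbsoluteUri.StartsWith(crawlContext.RootUri.AbsoluteUri))
                return new CrawlDecision { Allow = false, Reason = "Not internal to the root uri" };

            return new CrawlDecision { Allow = true };
        }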
Example #4
        public PoliteWebCrawler CreateCrawler()
        {
            _dataFinder = new DataFinder(new KomputronikDataExtractor());

            XmlConfigurator.Configure();

            CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();

            crawlConfig.MaxConcurrentThreads = 15; //this overrides the config value

            crawlConfig.MaxCrawlDepth = 15;

            //Pass the modified config to the crawler; a parameterless PoliteWebCrawler would reload app.config and ignore the overrides above
            PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);

            crawler.ShouldCrawlPage(ShouldCrawlPage);
            crawler.ShouldDownloadPageContent(ShouldCrawlPageContent);
            crawler.ShouldCrawlPageLinks(ShouldCrawlPageLinks);

            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
            return crawler;
        }
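A minimal usage sketch (assumed, not from the original); the root URL is a placeholder guessed from the KomputronikDataExtractor name:

        PoliteWebCrawler crawler = CreateCrawler();
        CrawlResult result = crawler.Crawl(new Uri("https://www.komputronik.pl")); //placeholder root url
        Console.WriteLine("Crawl completed in {0}", result.Elapsed);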