/// <summary>
/// Crawls the web starting at <paramref name="rootURI"/>, creating a project record for the
/// run and persisting results through the database connection pool. Optionally restricts the
/// crawl to pages whose URI matches every regex pattern in <paramref name="uriContains"/>.
/// </summary>
/// <param name="rootURI">Absolute URI the crawl starts from. Must be non-null and non-empty.</param>
/// <param name="uriContains">Optional regex patterns a page URI must match to be crawled; null or empty means no filtering.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="rootURI"/> is null.</exception>
/// <exception cref="ArgumentException">Thrown when <paramref name="rootURI"/> is empty.</exception>
/// <exception cref="UriFormatException">Thrown when <paramref name="rootURI"/> is not a valid URI.</exception>
public void RunWebCrawler(string rootURI, string[] uriContains)
{
    // Fixed: empty string previously threw ArgumentNullException (wrong type, no param name).
    if (rootURI == null)
        throw new ArgumentNullException(nameof(rootURI));
    if (rootURI.Length == 0)
        throw new ArgumentException("Root URI must not be empty.", nameof(rootURI));

    Uri uri = new Uri(rootURI);

    CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    crawlConfig.CrawlTimeoutSeconds = 100;
    crawlConfig.MaxConcurrentThreads = 10;
    crawlConfig.MaxPagesToCrawl = 1000;

    // Configure log4net from app.config.
    XmlConfigurator.Configure();

    // BUG FIX: crawlConfig was configured but never passed to the crawler — the
    // parameterless constructor re-reads app.config and the overrides above were ignored.
    // Passing null for the remaining collaborators keeps Abot's defaults.
    PoliteWebCrawler webCrawler = new PoliteWebCrawler(
        crawlConfig, null, null, null, null, null, null, null, null);
    webCrawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    webCrawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    webCrawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    webCrawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    // Fixed: the result of DBConfiguration.GetDBConfiguration() was immediately
    // overwritten by the XML-loaded configuration — the first call was dead code.
    DBConfiguration dbConfig = DBConfigurationSectionHandler.LoadFromXml().Convert();
    IDatabase database = DBFactory.GetDatabase(dbConfig.DatabaseType);
    Func<IDatabase> databaseGenerator = database.GetDatabaseGenerator();
    _databaseConnectionPool = new DBConnectionPool(databaseGenerator);

    string userGUID = Guid.NewGuid().ToString();
    CreateProject(rootURI, userGUID);

    // Reinstated from the commented-out block: only crawl pages whose URI matches every
    // supplied pattern. Registered once — repeated ShouldCrawlPage registrations in the
    // original commented loop would have overwritten each other, keeping only the last.
    if (uriContains != null && uriContains.Length > 0)
    {
        webCrawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
        {
            string pageUri = pageToCrawl.Uri.ToString();
            foreach (string pattern in uriContains)
            {
                if (!Regex.IsMatch(pageUri, pattern, RegexOptions.IgnoreCase))
                    return new CrawlDecision { Allow = false, Reason = "Uri does not match required pattern: " + pattern };
            }
            return new CrawlDecision { Allow = true };
        });
    }

    try
    {
        CrawlResult result = webCrawler.Crawl(uri);

        if (result.ErrorOccurred)
            Console.WriteLine("Crawl of {0} completed with ERROR!!!", result.RootUri.AbsoluteUri);
        else
            Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }
    finally
    {
        // Fixed: pool cleanup now runs even if the crawl throws, preventing leaked connections.
        _databaseConnectionPool.CloseAllConnections();
        _databaseConnectionPool = null;
    }
}
/// <summary>
/// Crawls the web starting at <paramref name="rootURI"/>, persisting results through the
/// database connection pool.
/// NOTE(review): this method has the same signature as the other RunWebCrawler variant in
/// this file (String == string) — two identical signatures cannot coexist in one class;
/// confirm which variant is current and remove the other.
/// </summary>
/// <param name="rootURI">Absolute URI the crawl starts from. Must be non-null and non-empty.</param>
/// <param name="uriContains">Reserved for URI filtering; currently unused (the original body had an empty loop over it).</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="rootURI"/> is null.</exception>
/// <exception cref="ArgumentException">Thrown when <paramref name="rootURI"/> is empty.</exception>
/// <exception cref="UriFormatException">Thrown when <paramref name="rootURI"/> is not a valid URI.</exception>
public void RunWebCrawler(string rootURI, string[] uriContains)
{
    // Fixed: empty string previously threw ArgumentNullException (wrong type, no param name).
    if (rootURI == null)
        throw new ArgumentNullException(nameof(rootURI));
    if (rootURI.Length == 0)
        throw new ArgumentException("Root URI must not be empty.", nameof(rootURI));

    Uri uri = new Uri(rootURI);

    CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    crawlConfig.CrawlTimeoutSeconds = 100;
    crawlConfig.MaxConcurrentThreads = 10;
    crawlConfig.MaxPagesToCrawl = 1000;

    // Configure log4net from app.config.
    XmlConfigurator.Configure();

    // BUG FIX: crawlConfig was configured but never passed to the crawler — the
    // parameterless constructor re-reads app.config and the overrides above were ignored.
    // Passing null for the remaining collaborators keeps Abot's defaults.
    PoliteWebCrawler webCrawler = new PoliteWebCrawler(
        crawlConfig, null, null, null, null, null, null, null, null);
    webCrawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    webCrawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    webCrawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    webCrawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    // Fixed: the result of DBConfiguration.GetDBConfiguration() was immediately
    // overwritten by the XML-loaded configuration — the first call was dead code.
    // Also removed the original empty `foreach` over uriContains (no-op).
    DBConfiguration dbConfig = DBConfigurationSectionHandler.LoadFromXml().Convert();
    IDatabase database = DBFactory.GetDatabase(dbConfig.DatabaseType);
    Func<IDatabase> databaseGenerator = database.GetDatabaseGenerator();
    _databaseConnectionPool = new DBConnectionPool(databaseGenerator);

    try
    {
        CrawlResult result = webCrawler.Crawl(uri);

        if (result.ErrorOccurred)
            Console.WriteLine("Crawl of {0} completed with ERROR!!!", result.RootUri.AbsoluteUri);
        else
            Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }
    finally
    {
        // Fixed: pool cleanup now runs even if the crawl throws, preventing leaked connections.
        _databaseConnectionPool.CloseAllConnections();
        _databaseConnectionPool = null;
    }
}