static void DoCrawl()
{
    CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    crawlConfig.CrawlTimeoutSeconds = 100;
    crawlConfig.MaxConcurrentThreads = 10;
    crawlConfig.MaxPagesToCrawl = 5000;
    crawlConfig.UserAgentString = "abot v1.0 http://code.google.com/p/abot";
    //crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue1", "1111");
    //crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue2", "2222");

    //Pass the crawlConfig built above (new PoliteWebCrawler() would load from app.config instead,
    //leaving the manual settings above unused)
    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig);
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    CrawlResult result = crawler.Crawl(new Uri("http://sunnah.com/"));

    Console.WriteLine("Crawled content count: " + result.CrawlContext.CrawledCount);
    if (result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }
}
/// <summary>
/// Initializes the crawler from configuration and stores a definition of the instance
/// </summary>
/// <param name="seedUrl">The URL the crawl starts from</param>
/// <param name="sessionId">Identifier of the crawl session</param>
/// <param name="crawlerId">Identifier of this crawler instance</param>
public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId)
{
    var config = new CrawlConfiguration();
    var abotSection = AbotConfigurationSectionHandler.LoadFromXml();
    if (abotSection != null)
    {
        config = abotSection.Convert();
        _logger.InfoFormat("CrawlConfiguration loaded from app.config");
    }
    else
    {
        //No "abot" section found, so fall back to hard-coded defaults
        config.CrawlTimeoutSeconds = 100;
        config.MaxConcurrentThreads = 1;
        config.MaxPagesToCrawl = long.MaxValue;
        config.IsExternalPageCrawlingEnabled = false;
        config.IsExternalPageLinksCrawlingEnabled = false;
        config.MinCrawlDelayPerDomainMilliSeconds = 10000;
        config.DownloadableContentTypes = "text/html, text/plain";
        config.IsHttpRequestAutoRedirectsEnabled = true;
        config.IsUriRecrawlingEnabled = false;
        config.UserAgentString = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0";
        _logger.InfoFormat("Default CrawlConfiguration loaded");
    }

    return InitializeCrawler(seedUrl, sessionId, crawlerId, config);
}
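A minimal sketch of calling the method above, assuming it lives on a service class (CrawlerService is a hypothetical name; the four-argument InitializeCrawler overload it delegates to is not shown here):

var service = new CrawlerService(); //hypothetical enclosing class, for illustration only
bool initialized = service.InitializeCrawler("http://example.com/", sessionId: 1, crawlerId: 1);
if (!initialized)
    Console.WriteLine("Crawler initialization failed");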
public AbotConfigurationSectionHandlerTest()
{
    var builder = new ConfigurationBuilder();
    builder.AddJsonFile("appsettings.json");
    var cr = builder.Build();

    _uut = new AbotConfigurationSectionHandler(cr);
}
private CrawlConfiguration GetCrawlConfigurationFromConfigFile()
{
    AbotConfigurationSectionHandler configFromFile = AbotConfigurationSectionHandler.LoadFromXml();
    if (configFromFile == null)
    {
        throw new ApplicationException("Config section \"abot\" was NOT found");
    }

    return configFromFile.Convert();
}
private CrawlConfiguration GetCrawlConfigurationFromConfigFile()
{
    AbotConfigurationSectionHandler configFromFile = AbotConfigurationSectionHandler.LoadFromXml();
    if (configFromFile == null)
    {
        throw new InvalidOperationException("abot config section was NOT found");
    }

    _logger.DebugFormat("abot config section was found");
    return configFromFile.Convert();
}
public void Crawl_RetryEnabled_VerifyCrawlResultIsAsExpected()
{
    new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" }).MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

    CrawlConfiguration configuration = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    configuration.MaxRetryCount = 3;
    configuration.MinRetryDelayInMilliseconds = 2000;

    base.CrawlAndAssert(new PoliteWebCrawler(configuration));
}
private static PoliteWebCrawler CreateCrawler(int recursionDepth, int maxLinks)
{
    CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    crawlConfig.MaxCrawlDepth = recursionDepth;
    crawlConfig.MaxConcurrentThreads = 20;
    crawlConfig.MaxLinksPerPage = maxLinks;

    //Must pass an instance of AngleSharpHyperlinkParser to override the default customized HAP parser,
    //which is incompatible with my installed HAP dll
    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, new AngleSharpHyperlinkParser(), null, null, null);
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;

    return crawler;
}
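A usage sketch for the factory above (the seed URL and argument values are placeholders); as the other examples on this page show, Crawl() blocks until the crawl completes and returns a CrawlResult:

PoliteWebCrawler crawler = CreateCrawler(recursionDepth: 2, maxLinks: 100);
CrawlResult result = crawler.Crawl(new Uri("http://example.com/"));
Console.WriteLine("Crawled {0} pages", result.CrawlContext.CrawledCount);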
private CrawlConfiguration GetCrawlConfigurationFromConfigFile()
{
    AbotConfigurationSectionHandler configFromFile = null;
    try
    {
        configFromFile = AbotConfigurationSectionHandler.LoadFromXml();
    }
    catch { } //Swallow config load errors and fall through to the null check below

    if (configFromFile == null)
    {
        _logger.DebugFormat("abot config section was NOT found");
        return null;
    }

    _logger.DebugFormat("abot config section was found");
    return configFromFile.Convert();
}
private WebCrawler CreateCrawler(IThreadManager threadManager)
{
    CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    crawlConfig.MaxConcurrentThreads = 10; //this overrides the config value
    crawlConfig.MaxCrawlDepth = 3;

    //Will use the manually created crawlConfig object above
    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, new AmazonPageDecisionMaker(_amazonHelper), threadManager, null, new AmazonPageRequester(crawlConfig), new AmazonHyperLinkParser(_amazonHelper), null, null, null);
    crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowed += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;

    return crawler;
}
private void ExecuteStartCrawlCommand()
{
    try
    {
        CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
        crawlConfig.MaxCrawlDepth = Convert.ToInt32(MaxDepth);   //this overrides the config value
        crawlConfig.MaxPagesToCrawl = Convert.ToInt32(MaxPages); //this overrides the config value

        WelcomeTitle = string.Empty;

        AbotManager m = new AbotManager();
        m.MessageUpdate += M_MessageUpdate;
        m.RunCrawl(_crawlUrl, _localFolder, crawlConfig);
    }
    catch (Exception ex)
    {
        M_MessageUpdate(this, new MessageEventArgs(ex.Message));
    }
}
public async Task Crawl_RetryEnabled_VerifyCrawlResultIsAsExpected()
{
    await new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" }).MakeRequestAsync(new Uri("http://localhost:1111/PageGenerator/ClearCounters"));

    var builder = new ConfigurationBuilder();
    builder.AddJsonFile("appsettings.json");
    var cr = builder.Build();

    CrawlConfiguration configuration = new AbotConfigurationSectionHandler(cr).Convert();
    configuration.MaxRetryCount = 3;
    configuration.MinRetryDelayInMilliseconds = 2000;

    await base.CrawlAndAssertAsync(new PoliteWebCrawler(configuration));
}
private PoliteWebCrawler SetUp()
{
    XmlConfigurator.Configure();

    CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    crawlConfig.MaxConcurrentThreads = 5; //this overrides the config value
    //Careful with the depth: even 0 already returns plenty of records
    crawlConfig.MaxCrawlDepth = 0;

    //Pass the crawlConfig built above (new PoliteWebCrawler() would load from app.config instead)
    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig);
    crawler.ShouldCrawlPage(ShouldCrawlPage);
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    return crawler;
}
public void SectionHandlerDefaults_MatchPocoDefaults()
{
    _uut = new AbotConfigurationSectionHandler();
    CrawlConfiguration pocoDefaults = new CrawlConfiguration();

    Assert.AreEqual(pocoDefaults.ConfigurationExtensions.Count, _uut.ExtensionValues.Count);
    Assert.AreEqual(pocoDefaults.CrawlTimeoutSeconds, _uut.CrawlBehavior.CrawlTimeoutSeconds);
    Assert.AreEqual(pocoDefaults.DownloadableContentTypes, _uut.CrawlBehavior.DownloadableContentTypes);
    Assert.AreEqual(pocoDefaults.IsExternalPageCrawlingEnabled, _uut.CrawlBehavior.IsExternalPageCrawlingEnabled);
    Assert.AreEqual(pocoDefaults.IsExternalPageLinksCrawlingEnabled, _uut.CrawlBehavior.IsExternalPageLinksCrawlingEnabled);
    Assert.AreEqual(pocoDefaults.IsRespectRobotsDotTextEnabled, _uut.Politeness.IsRespectRobotsDotTextEnabled);
    Assert.AreEqual(pocoDefaults.IsRespectMetaRobotsNoFollowEnabled, _uut.Politeness.IsRespectMetaRobotsNoFollowEnabled);
    Assert.AreEqual(pocoDefaults.IsRespectHttpXRobotsTagHeaderNoFollowEnabled, _uut.Politeness.IsRespectHttpXRobotsTagHeaderNoFollowEnabled);
    Assert.AreEqual(pocoDefaults.IsRespectAnchorRelNoFollowEnabled, _uut.Politeness.IsRespectAnchorRelNoFollowEnabled);
    Assert.AreEqual(pocoDefaults.IsIgnoreRobotsDotTextIfRootDisallowedEnabled, _uut.Politeness.IsIgnoreRobotsDotTextIfRootDisallowedEnabled);
    Assert.AreEqual(pocoDefaults.IsUriRecrawlingEnabled, _uut.CrawlBehavior.IsUriRecrawlingEnabled);
    Assert.AreEqual(pocoDefaults.MaxConcurrentThreads, _uut.CrawlBehavior.MaxConcurrentThreads);
    Assert.AreEqual(pocoDefaults.MaxRobotsDotTextCrawlDelayInSeconds, _uut.Politeness.MaxRobotsDotTextCrawlDelayInSeconds);
    Assert.AreEqual(pocoDefaults.MaxPagesToCrawl, _uut.CrawlBehavior.MaxPagesToCrawl);
    Assert.AreEqual(pocoDefaults.MaxPagesToCrawlPerDomain, _uut.CrawlBehavior.MaxPagesToCrawlPerDomain);
    Assert.AreEqual(pocoDefaults.MinCrawlDelayPerDomainMilliSeconds, _uut.Politeness.MinCrawlDelayPerDomainMilliSeconds);
    Assert.AreEqual(pocoDefaults.UserAgentString, _uut.CrawlBehavior.UserAgentString);
    Assert.AreEqual(pocoDefaults.RobotsDotTextUserAgentString, _uut.Politeness.RobotsDotTextUserAgentString);
    Assert.AreEqual(pocoDefaults.MaxPageSizeInBytes, _uut.CrawlBehavior.MaxPageSizeInBytes);
    Assert.AreEqual(pocoDefaults.HttpServicePointConnectionLimit, _uut.CrawlBehavior.HttpServicePointConnectionLimit);
    Assert.AreEqual(pocoDefaults.IsSslCertificateValidationEnabled, _uut.CrawlBehavior.IsSslCertificateValidationEnabled);
    Assert.AreEqual(pocoDefaults.HttpRequestTimeoutInSeconds, _uut.CrawlBehavior.HttpRequestTimeoutInSeconds);
    Assert.AreEqual(pocoDefaults.HttpRequestMaxAutoRedirects, _uut.CrawlBehavior.HttpRequestMaxAutoRedirects);
    Assert.AreEqual(pocoDefaults.IsHttpRequestAutoRedirectsEnabled, _uut.CrawlBehavior.IsHttpRequestAutoRedirectsEnabled);
    Assert.AreEqual(pocoDefaults.IsHttpRequestAutomaticDecompressionEnabled, _uut.CrawlBehavior.IsHttpRequestAutomaticDecompressionEnabled);
    Assert.AreEqual(pocoDefaults.IsSendingCookiesEnabled, _uut.CrawlBehavior.IsSendingCookiesEnabled);
    Assert.AreEqual(pocoDefaults.MaxMemoryUsageCacheTimeInSeconds, _uut.CrawlBehavior.MaxMemoryUsageCacheTimeInSeconds);
    Assert.AreEqual(pocoDefaults.MaxMemoryUsageInMb, _uut.CrawlBehavior.MaxMemoryUsageInMb);
    Assert.AreEqual(pocoDefaults.MinAvailableMemoryRequiredInMb, _uut.CrawlBehavior.MinAvailableMemoryRequiredInMb);
    Assert.AreEqual(pocoDefaults.MaxCrawlDepth, _uut.CrawlBehavior.MaxCrawlDepth);
    Assert.AreEqual(pocoDefaults.MaxLinksPerPage, _uut.CrawlBehavior.MaxLinksPerPage);
    Assert.AreEqual(pocoDefaults.IsForcedLinkParsingEnabled, _uut.CrawlBehavior.IsForcedLinkParsingEnabled);
    Assert.AreEqual(pocoDefaults.MaxRetryCount, _uut.CrawlBehavior.MaxRetryCount);
    Assert.AreEqual(pocoDefaults.MinRetryDelayInMilliseconds, _uut.CrawlBehavior.MinRetryDelayInMilliseconds);
}
public PoliteWebCrawler CreateCrawler()
{
    _dataFinder = new DataFinder(new KomputronikDataExtractor());

    XmlConfigurator.Configure();

    CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    crawlConfig.MaxConcurrentThreads = 15; //this overrides the config value
    crawlConfig.MaxCrawlDepth = 15;

    //Pass the crawlConfig built above (new PoliteWebCrawler() would load from app.config instead)
    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig);
    crawler.ShouldCrawlPage(ShouldCrawlPage);
    crawler.ShouldDownloadPageContent(ShouldCrawlPageContent);
    crawler.ShouldCrawlPageLinks(ShouldCrawlPageLinks);
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    return crawler;
}
public int DoCrawl()
{
    CrawlConfiguration CConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    CConfig.MaxConcurrentThreads = maxConcurrentThreads;
    CConfig.MaxPagesToCrawl = maxPagesToCrawl;
    CConfig.CrawlTimeoutSeconds = crawlTimeoutSeconds;
    CConfig.HttpRequestTimeoutInSeconds = httpRequestTimeoutInSeconds;
    CConfig.LoginUser = loginUser;
    CConfig.LoginPassword = loginPassword;

    Console.WriteLine("Doing Crawl With Slack " + (slackBotEnabled ? "Enabled" : "Disabled"));

    PoliteWebCrawler crawler = new PoliteWebCrawler(CConfig, null, null, null, null, null, null, null, null);
    //PoliteWebCrawler crawler = new PoliteWebCrawler();

    errors = new List<Errors>();

    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    CrawlResult result = crawler.Crawl(new Uri(URL)); //This is synchronous, it will not go to the next line until the crawl has completed

    if (result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }

    //Group the collected errors by HTTP status code and report each group
    IEnumerable<Errors> EnumList = errors.AsEnumerable();
    for (int i = 0; i < 525; i++)
    {
        if (EnumList.Any(x => x.ErrorCode == i))
        {
            returnInt = 1;
            Console.ForegroundColor = ConsoleColor.Yellow;
            Console.WriteLine(i + " (" + getErrorName(i) + ") Errors:");
            slackMessage += i + " (" + getErrorName(i) + ") Errors:\n";
            Console.ForegroundColor = ConsoleColor.Red;
            foreach (Errors err in EnumList.Where(x => x.ErrorCode == i))
            {
                Console.WriteLine(" " + err.ErrorURL);
                slackMessage += " " + err.ErrorURL + "\n";
            }
        }
    }
    Console.ResetColor();

    if (slackMessage == "")
    {
        slackMessage = "No Errors In WebPage!";
    }

    Console.ForegroundColor = ConsoleColor.Green;
    Console.WriteLine("Done");
    Console.ResetColor();

    return returnInt;
}
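A hedged sketch of a call site for DoCrawl() above, assuming the enclosing class (named SiteChecker here purely for illustration) initializes returnInt to 0, so the return value can double as a process exit code:

var checker = new SiteChecker();          //hypothetical enclosing class
Environment.ExitCode = checker.DoCrawl(); //0 = no error pages found, 1 = at least one error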
static void Main(string[] args)
{
    CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    crawlConfig.MaxConcurrentThreads = 5; //this overrides the config value
    crawlConfig.MaxCrawlDepth = 0;

    //Pass the crawlConfig built above (new PoliteWebCrawler() would load from app.config instead)
    crawler = new PoliteWebCrawler(crawlConfig);
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    //var doc = new HtmlDocument();
    //doc.Load(@"C:\Users\lucao\Downloads\keketest.html");
    //var embedNodes = doc.DocumentNode.SelectSingleNode("//script[contains(text(), 'thunder_url')]");
    //var domain = Regex.Match(embedNodes.InnerText, @".*domain.*'(.*)'").Groups[1].ToString();
    //var thunder_url = Regex.Match(embedNodes.InnerText, ".*thunder_url.*\"(.*)\"").Groups[1].ToString();
    //var downloadMp3Link = domain + thunder_url;

    CrawlResult result;
    for (int i = 58; i > 30; i--)
    {
        DownloadLinkList.Clear();
        Thread.Sleep(60000);

        result = crawler.Crawl(new Uri($"http://www.kekenet.com/Article/15410/List_{i}.shtml"));
        if (result.ErrorOccurred)
        {
            Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
        }
        else
        {
            Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
        }

        //Crawl each article page found on the list page to collect MP3 links
        if (DownloadLinkList.Count > 0)
        {
            DownloadMP3LinkList.Clear();
            foreach (var link in DownloadLinkList)
            {
                var sub_crawler = new PoliteWebCrawler();
                sub_crawler.PageCrawlStartingAsync += sub_crawler_ProcessPageCrawlStarting;
                sub_crawler.PageCrawlCompletedAsync += sub_crawler_ProcessPageCrawlCompleted;
                sub_crawler.PageCrawlDisallowedAsync += sub_crawler_PageCrawlDisallowed;
                sub_crawler.PageLinksCrawlDisallowedAsync += sub_crawler_PageLinksCrawlDisallowed;
                sub_crawler.Crawl(new Uri(link));
                Thread.Sleep(20000);
                sub_crawler?.Dispose();
            }
        }

        //e.g. "http://k6.kekenet.com/Sound/2018/01/scad180110.mp3"
        if (DownloadMP3LinkList.Count > 0)
        {
            foreach (var mp3Link in DownloadMP3LinkList)
            {
                WebClient client = new WebClient();
                Uri ur = new Uri(mp3Link);
                client.DownloadProgressChanged += WebClientDownloadProgressChanged;
                client.DownloadDataCompleted += WebClientDownloadCompleted;
                var file = @"C:\Users\lucao\Downloads\keke\" + mp3Link.Split('/').Last().ToString();
                client.DownloadFile(ur, file); //Note: synchronous download; the async progress/completion handlers above only fire for the async Download* methods
                Thread.Sleep(60000);
            }
        }
    }
}