/// <summary>
/// Parse the sitemap URLs declared in <c>RobotsDotText</c> into <c>RootSitemap</c>.
/// </summary>
/// <returns>
/// True when the root sitemap ends up containing at least one parsed sitemap; otherwise false.
/// </returns>
protected virtual bool TryParseRobotsSitemaps()
{
    // A robots.txt file may declare more than one "Sitemap:" entry.
    IList<string> sitemaps = RobotsDotText?.Robots.GetSitemapUrls();

    if (sitemaps?.Count > 0)
    {
        Logger.DebugFormat("Start parse site using sitemap.xml...");

        // Build the root sitemap from every entry that parses as an absolute Uri.
        // Malformed entries are logged and skipped instead of aborting the whole parse
        // (previously a malformed FIRST entry returned false and discarded all valid ones).
        // Materialized once so the logging side effect does not re-run on re-enumeration.
        var parsedSitemaps = sitemaps
            .Select(x =>
            {
                if (Uri.TryCreate(x, UriKind.Absolute, out Uri sitemapUri))
                {
                    return new Sitemap(sitemapUri);
                }

                Logger.WarnFormat("Can't parse {0} to Uri object", x);
                return null;
            })
            .Where(x => x != null)
            .ToList();

        RootSitemap = new RobotsSitemap(
            sitemaps: parsedSitemaps,
            sitemapLocation: new Uri(RobotsDotText.Robots.BaseUri, Core.Robots.RobotsDotTextFinder.RobotsTxt));
    }

    return RootSitemap?.Sitemaps != null && RootSitemap.Sitemaps.Any();
}
/// <summary>
/// Synchronously loads the given sitemap through the adaptee's async loader.
/// </summary>
/// <param name="sitemap">Sitemap to load; mapped to the adaptee's sitemap type first.</param>
/// <returns>A freshly loaded <c>RobotsSitemap</c> wrapping the adaptee's result.</returns>
public IRobotsSitemap Load(IRobotsSitemap sitemap) =>
    // GetAwaiter().GetResult() instead of .Result so a failed load throws the original
    // exception rather than an AggregateException wrapper.
    // NOTE(review): still sync-over-async — deadlock risk under a synchronization
    // context; prefer exposing an async Load if the interface ever allows it.
    new RobotsSitemap(_adapteeObject.LoadAsync(RobotsSitemap.MapIRobotsSitemapToSitemap(sitemap)).GetAwaiter().GetResult());
// NOTE(review): removed dead commented-out IsBrowserLoaded handler and the inline
// /*async*/ scaffolding; the planned async migration is tracked by the TODO below.
/// <summary>
/// Recursively processes a sitemap node: loads it on demand, descends into nested
/// sitemaps, then schedules and crawls the page URLs listed by this node.
/// </summary>
/// <param name="sitemap">Sitemap node to process; loaded via <c>SitemapLoader</c> if not yet loaded.</param>
/// <param name="cancellationTokenSource">
/// Cancellation source; when cancellation is requested, recursion stops early and
/// the results gathered so far are returned.
/// </param>
/// <returns>Crawl results accumulated from this sitemap and all nested sitemaps.</returns>
protected virtual IEnumerable<CrawlResult> GetSitemapResults(IRobotsSitemap sitemap, CancellationTokenSource cancellationTokenSource)
{
    List<CrawlResult> results = new List<CrawlResult>();

    // Honor cancellation: the token source was previously threaded through the
    // recursion but never consulted, so cancellation had no effect.
    if (cancellationTokenSource != null && cancellationTokenSource.IsCancellationRequested)
    {
        return results;
    }

    if (!sitemap.IsLoaded)
    {
        sitemap = SitemapLoader.Load(sitemap);
    }

    // Sitemap-index case: recurse into each nested sitemap first.
    if (sitemap.Sitemaps != null && sitemap.Sitemaps.Any())
    {
        Logger.InfoFormat("Sitemap: {0} | Inner sitemaps' count: {1}", sitemap.Location, sitemap.Sitemaps.Count());
        foreach (IRobotsSitemap derivedSitemap in sitemap.Sitemaps)
        {
            results.AddRange(GetSitemapResults(derivedSitemap, cancellationTokenSource));
        }
    }

    // URL-set case: schedule every listed page and crawl them.
    if (sitemap.Items != null && sitemap.Items.Any())
    {
        Logger.InfoFormat("Sitemap: {0} | Uris' count: {1}", sitemap.Location, sitemap.Items.Count());
        CrawlContext.Scheduler.Add(sitemap.Items.Select(x => new PageToCrawl(x.Location)));

        CrawlResult crawlResult = new CrawlResult();
        CrawlComplete = false;
        // TODO(review): was slated to become `await Task.Run(() => ParallelCrawlSite(...))`;
        // kept synchronous until the call chain is made async end to end.
        ParallelCrawlSite(crawlResult);
        results.Add(crawlResult);
    }

    return results;
}
/// <summary>
/// Unwraps the underlying <c>Louw.SitemapParser.Sitemap</c> from an <c>IRobotsSitemap</c> adapter.
/// Returns null when the argument is null or is not a <c>RobotsSitemap</c>.
/// </summary>
public static Louw.SitemapParser.Sitemap MapIRobotsSitemapToSitemap(IRobotsSitemap robotsSitemap) =>
    robotsSitemap is RobotsSitemap adapter ? adapter._adapteeObject : null;