Example #1
0
        /// <summary>
        /// Builds <c>RootSitemap</c> from the sitemap URLs returned by
        /// <c>RobotsDotText.Robots.GetSitemapUrls()</c>.
        /// </summary>
        /// <remarks>
        /// When no robots.txt is available or it lists no sitemaps, <c>RootSitemap</c> is left
        /// untouched and the result reflects whatever it currently holds.
        /// If the first listed URL is not a valid absolute URI the method logs a warning and
        /// returns <c>false</c> without assigning <c>RootSitemap</c>; invalid URLs at later
        /// positions are silently skipped.
        /// </remarks>
        /// <returns>
        /// <c>true</c> when <c>RootSitemap</c> exposes at least one sitemap after the call;
        /// otherwise <c>false</c>.
        /// </returns>
        protected virtual bool TryParseRobotsSitemaps()
        {
            IList <string> sitemaps = RobotsDotText?.Robots.GetSitemapUrls();

            // robots.txt may declare more than one sitemap.
            if (sitemaps?.Count > 0)
            {
                Logger.DebugFormat("Start parse site using sitemap.xml...");

                // Only validity of the first entry is checked here; the parsed value itself is not needed.
                if (!Uri.TryCreate(sitemaps[0], UriKind.Absolute, out _))
                {
                    Logger.WarnFormat("Can't parse {0} to Uri object", sitemaps[0]);
                    return false;
                }

                // Materialize the query once: a deferred Select would create new Sitemap
                // instances every time the resulting sequence is enumerated.
                var parsedSitemaps = sitemaps
                    .Select(x => Uri.TryCreate(x, UriKind.Absolute, out Uri sitemapUri)
                        ? new Sitemap(sitemapUri)
                        : null)
                    .Where(x => x != null)
                    .ToList();

                // The root sitemap is located at the robots.txt address of the site.
                RootSitemap = new RobotsSitemap(
                    sitemaps: parsedSitemaps,
                    sitemapLocation: new Uri(RobotsDotText.Robots.BaseUri, Core.Robots.RobotsDotTextFinder.RobotsTxt));
            }

            return RootSitemap?.Sitemaps != null && RootSitemap.Sitemaps.Any();
        }
Example #2
0
 /// <summary>
 /// Synchronously loads the given sitemap through the adaptee object and wraps the
 /// outcome in a new <c>RobotsSitemap</c>.
 /// </summary>
 /// <param name="sitemap">
 /// Sitemap to load; it is converted with <c>RobotsSitemap.MapIRobotsSitemapToSitemap</c>
 /// before being handed to <c>LoadAsync</c>.
 /// </param>
 /// <returns>A new <c>RobotsSitemap</c> created from the result of <c>LoadAsync</c>.</returns>
 public IRobotsSitemap Load(IRobotsSitemap sitemap)
 {
     var adaptee = RobotsSitemap.MapIRobotsSitemapToSitemap(sitemap);
     var pendingLoad = _adapteeObject.LoadAsync(adaptee);

     // Reading Result blocks the calling thread until the asynchronous load has finished.
     return new RobotsSitemap(pendingLoad.Result);
 }
Example #3
0
        //protected void IsBrowserLoaded(object sender, System.EventArgs e)
        //{
        //	// No need more calls
        //	Browser.BrowserInitialized -= IsBrowserLoaded;

        //	// Continue main thread
        //	SemaphoreObj.Release();
        //}

        /// <summary>
        /// Crawls the URLs listed in <paramref name="sitemap"/> and, recursively, in its nested sitemaps.
        /// </summary>
        /// <param name="sitemap">
        /// Sitemap to process. When its <c>IsLoaded</c> flag is false it is first loaded through
        /// <c>SitemapLoader.Load</c>.
        /// </param>
        /// <param name="cancellationTokenSource">
        /// Forwarded unchanged to the recursive calls; this method does not inspect it itself.
        /// </param>
        /// <returns>
        /// The results of all nested sitemaps (in enumeration order) followed by one
        /// <c>CrawlResult</c> for this sitemap's own items, if it has any.
        /// </returns>
        protected virtual IEnumerable <CrawlResult> GetSitemapResults(IRobotsSitemap sitemap, CancellationTokenSource cancellationTokenSource)
        {
            List <CrawlResult> results = new List <CrawlResult>();

            if (!sitemap.IsLoaded)
            {
                sitemap = SitemapLoader.Load(sitemap);
            }

            // Snapshot once so a deferred sequence is not re-evaluated for the
            // emptiness check, the count in the log message and the loop.
            var innerSitemaps = sitemap.Sitemaps?.ToList();
            if (innerSitemaps != null && innerSitemaps.Count > 0)
            {
                Logger.InfoFormat("Sitemap: {0} | Inner sitemaps' count: {1}", sitemap.Location, innerSitemaps.Count);

                foreach (IRobotsSitemap derivedSitemap in innerSitemaps)
                {
                    results.AddRange(GetSitemapResults(derivedSitemap, cancellationTokenSource));
                }
            }

            // Items are read only after the nested sitemaps have been processed.
            var items = sitemap.Items?.ToList();
            if (items != null && items.Count > 0)
            {
                Logger.InfoFormat("Sitemap: {0} | Uris' count: {1}", sitemap.Location, items.Count);

                // Queue every URL of this sitemap, then run the crawl synchronously.
                CrawlContext.Scheduler.Add(items.Select(x => new PageToCrawl(x.Location)));

                CrawlResult crawlResult = new CrawlResult();
                CrawlComplete = false;
                ParallelCrawlSite(crawlResult);
                results.Add(crawlResult);
            }

            return results;
        }
Example #4
0
 /// <summary>
 /// Extracts the wrapped <c>Louw.SitemapParser.Sitemap</c> from an <c>IRobotsSitemap</c>.
 /// </summary>
 /// <param name="robotsSitemap">Sitemap abstraction to unwrap; may be <c>null</c>.</param>
 /// <returns>
 /// The <c>_adapteeObject</c> of the argument when it is a <c>RobotsSitemap</c>;
 /// otherwise <c>null</c> (this includes a <c>null</c> argument).
 /// </returns>
 public static Louw.SitemapParser.Sitemap MapIRobotsSitemapToSitemap(IRobotsSitemap robotsSitemap)
 {
     if (robotsSitemap is RobotsSitemap concreteSitemap)
     {
         return concreteSitemap._adapteeObject;
     }

     // Not a RobotsSitemap (or null): there is nothing to unwrap.
     return null;
 }