Exemple #1
0
 /// <summary>
 /// Announces that the actor is ready and registers its asynchronous message handlers.
 /// Both handlers download the HTML behind the URL carried by the incoming message and
 /// forward the resulting <c>HtmlContentMessage</c> to other actors selected by path.
 /// </summary>
 private void Ready()
 {
     ColorConsole.WriteLine($"{PageActorName} has become Ready", ConsoleColor.Magenta);

     // URL + object parsing request: the downloaded page goes to the URL parser only.
     ReceiveAsync <UrlForUrlAndObjectParsingMessage>(async request =>
     {
         ColorConsole.WriteLine($"{PageActorName} started downloading: {request.Url}", ConsoleColor.Magenta);
         HtmlContentMessage page = await GetHtmlContent(request.Url);
         ColorConsole.WriteLine($"{PageActorName} finished downloading: {request.Url}", ConsoleColor.Magenta);

         var urlParser = Context.ActorSelection(ActorPaths.UrlParser);
         urlParser.Tell(page);

         // Forwarding to the object parser and URL tracker is currently disabled for this message type:
         //Context.ActorSelection(ActorPaths.ObjectParser).Tell(htmlContent);
         //Context.ActorSelection(ActorPaths.UrlTracker).Tell(new ProcessedUrlMessage(htmlContent.SourceUrl));
     });

     // Object-only parsing request: the page goes to the object parser, and the
     // URL tracker is told that the page's source URL has been processed.
     ReceiveAsync <UrlForObjectParsingMessage>(async request =>
     {
         ColorConsole.WriteLine($"{PageActorName} started downloading: {request.Url}", ConsoleColor.Magenta);
         HtmlContentMessage page = await GetHtmlContent(request.Url);
         ColorConsole.WriteLine($"{PageActorName} finished downloading: {request.Url}", ConsoleColor.Magenta);

         var objectParser = Context.ActorSelection(ActorPaths.ObjectParser);
         objectParser.Tell(page);

         var urlTracker = Context.ActorSelection(ActorPaths.UrlTracker);
         urlTracker.Tell(new ProcessedUrlMessage(page.SourceUrl));
     });
 }
Exemple #2
0
        /// <summary>
        /// Extracts the links contained in a downloaded document.
        /// If the document contains a sitemap <c>urlset</c> element, the text of every
        /// <c>loc</c> element is returned as a URI. Otherwise the <c>href</c> of every
        /// anchor tag is returned, resolved against <c>htmlContent.SourceUrl</c>.
        /// </summary>
        /// <param name="htmlContent">Message carrying the document markup and the URL it came from.</param>
        /// <returns>
        /// The URIs found, in document order. The list is empty when the document has no
        /// markup or no links. Entries that cannot be parsed as a URI are skipped.
        /// </returns>
        private List <Uri> ParseUrlsFromHtmlContent(HtmlContentMessage htmlContent)
        {
            List <Uri> urlsFound = new List <Uri>();

            // A page without markup has no links; return early instead of letting LoadHtml throw on null.
            if (string.IsNullOrEmpty(htmlContent.Html))
            {
                return(urlsFound);
            }

            HtmlDocument htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(htmlContent.Html);

            if (htmlDoc.DocumentNode.SelectSingleNode("//urlset[starts-with(@xmlns, 'http://www.sitemaps.org')]") != null) // if sitemap
            {
                var locs = htmlDoc.DocumentNode.SelectNodes("//loc");
                if (locs != null)
                {
                    foreach (var loc in locs)
                    {
                        // Sitemap values are entity-escaped (e.g. "&amp;") and are often surrounded
                        // by whitespace from pretty-printing, so decode and trim before parsing.
                        string value = WebUtility.HtmlDecode(loc.InnerText).Trim();
                        if (value.Length == 0)
                        {
                            continue;
                        }

                        // TryCreate so that one malformed entry does not abort the whole document.
                        Uri url;
                        if (Uri.TryCreate(value, UriKind.RelativeOrAbsolute, out url))
                        {
                            urlsFound.Add(url);
                        }
                    }
                }
            }
            else
            {
                var aTags = htmlDoc.DocumentNode.SelectNodes("//a[@href]");
                if (aTags != null)
                {
                    foreach (var aTag in aTags)
                    {
                        string hrefValue = WebUtility.HtmlDecode(aTag.Attributes["href"].Value);

                        // Parse the href, then resolve it against the page URL. Both steps use
                        // TryCreate so a malformed href (or an unusable base URL) only skips
                        // this link rather than throwing away every link on the page.
                        Uri link;
                        Uri resolved;
                        if (Uri.TryCreate(hrefValue, UriKind.RelativeOrAbsolute, out link) &&
                            Uri.TryCreate(htmlContent.SourceUrl, link, out resolved))
                        {
                            urlsFound.Add(resolved);
                        }
                    }
                }
            }
            return(urlsFound);
        }