/// <summary>
/// Logs that the actor is ready and registers the two asynchronous download handlers.
/// </summary>
/// <remarks>
/// Both handlers download the page for the incoming URL via <c>GetHtmlContent</c> and
/// log the start and end of the download; they differ only in where the downloaded
/// content is sent afterwards.
/// </remarks>
private void Ready()
{
    ColorConsole.WriteLine($"{PageActorName} has become Ready", ConsoleColor.Magenta);

    // URL + object parsing request: the downloaded page goes to the URL parser only.
    ReceiveAsync<UrlForUrlAndObjectParsingMessage>(async message =>
    {
        ColorConsole.WriteLine($"{PageActorName} started downloading: {message.Url}", ConsoleColor.Magenta);
        HtmlContentMessage page = await GetHtmlContent(message.Url);
        ColorConsole.WriteLine($"{PageActorName} finished downloading: {message.Url}", ConsoleColor.Magenta);

        var urlParser = Context.ActorSelection(ActorPaths.UrlParser);
        urlParser.Tell(page);

        // Forwarding to ActorPaths.ObjectParser and sending a ProcessedUrlMessage to
        // ActorPaths.UrlTracker are currently disabled for this message type.
    });

    // Object-only parsing request: the page goes to the object parser, and the
    // URL tracker is then told that the source URL has been processed.
    ReceiveAsync<UrlForObjectParsingMessage>(async message =>
    {
        ColorConsole.WriteLine($"{PageActorName} started downloading: {message.Url}", ConsoleColor.Magenta);
        HtmlContentMessage page = await GetHtmlContent(message.Url);
        ColorConsole.WriteLine($"{PageActorName} finished downloading: {message.Url}", ConsoleColor.Magenta);

        var objectParser = Context.ActorSelection(ActorPaths.ObjectParser);
        objectParser.Tell(page);

        var urlTracker = Context.ActorSelection(ActorPaths.UrlTracker);
        urlTracker.Tell(new ProcessedUrlMessage(page.SourceUrl));
    });
}
/// <summary>
/// Extracts the URLs referenced by a downloaded document.
/// </summary>
/// <remarks>
/// If the document contains a <c>urlset</c> element whose <c>xmlns</c> starts with
/// <c>http://www.sitemaps.org</c>, it is treated as a sitemap and the text of every
/// <c>loc</c> element is returned. Otherwise the <c>href</c> of every <c>a</c> element
/// is returned, resolved against <c>htmlContent.SourceUrl</c>.
/// Entries that cannot be parsed as a URI are skipped rather than aborting the whole
/// extraction.
/// </remarks>
/// <param name="htmlContent">The downloaded content and the URL it was fetched from.</param>
/// <returns>The URLs found, in document order; empty when none are found.</returns>
private List<Uri> ParseUrlsFromHtmlContent(HtmlContentMessage htmlContent)
{
    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(htmlContent.Html);

    List<Uri> urlsFound = new List<Uri>();

    bool isSitemap = htmlDoc.DocumentNode.SelectSingleNode(
        "//urlset[starts-with(@xmlns, 'http://www.sitemaps.org')]") != null;

    if (isSitemap)
    {
        var locs = htmlDoc.DocumentNode.SelectNodes("//loc");
        if (locs == null)
        {
            return urlsFound;
        }

        foreach (var loc in locs)
        {
            // Decode entities (e.g. &amp;) the same way the anchor branch does, and
            // strip the whitespace/newlines that commonly surround <loc> text.
            string value = WebUtility.HtmlDecode(loc.InnerText ?? string.Empty).Trim();

            Uri url;
            if (Uri.TryCreate(value, UriKind.RelativeOrAbsolute, out url))
            {
                urlsFound.Add(url);
            }
        }

        return urlsFound;
    }

    var aTags = htmlDoc.DocumentNode.SelectNodes("//a[@href]");
    if (aTags == null)
    {
        return urlsFound;
    }

    foreach (var aTag in aTags)
    {
        string hrefValue = WebUtility.HtmlDecode(aTag.Attributes["href"].Value);

        // TryCreate instead of the constructor: a single malformed href must not
        // throw and discard every other link on the page.
        Uri parsed;
        if (!Uri.TryCreate(hrefValue, UriKind.RelativeOrAbsolute, out parsed))
        {
            continue;
        }

        // Resolve relative links against the page they were found on.
        Uri resolved;
        if (Uri.TryCreate(htmlContent.SourceUrl, parsed, out resolved))
        {
            urlsFound.Add(resolved);
        }
    }

    return urlsFound;
}