public async Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
{
    if (propertyBag.StatusCode != HttpStatusCode.OK)
    {
        return;
    }

    if (!IsXmlContent(propertyBag.ContentType))
    {
        return;
    }

    using (var reader = propertyBag.GetResponse())
    using (var sr = new StreamReader(reader))
    {
        var mydoc = XDocument.Load(sr);
        if (mydoc.Root == null)
        {
            return;
        }

        var qualifiedName = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");
        var urlNodes =
            from e in mydoc.Descendants(qualifiedName)
            where !e.Value.IsNullOrEmpty()
                && e.Value.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            select e.Value;

        foreach (var url in urlNodes)
        {
            // Add new crawler steps
            var baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
            var decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(url);
            var normalizedLink = NormalizeLink(baseUrl, decodedLink);
            if (normalizedLink.IsNullOrEmpty())
            {
                continue;
            }

            await crawler.AddStepAsync(new Uri(normalizedLink),
                propertyBag.Step.Depth + 1,
                propertyBag.Step,
                new Dictionary<string, object>
                {
                    { Resources.PropertyBagKeyOriginalUrl, url },
                    { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
                });
        }
    }
}
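The IsXmlContent helper the sitemap processor calls is not shown in this excerpt. A minimal sketch, assuming it only needs to match the content types commonly used for sitemaps (the accepted list here is an assumption, not the library's actual implementation):

// Hypothetical sketch: accept content types commonly used for XML sitemaps.
// The list of media types is an assumption; the real helper may differ.
private static bool IsXmlContent(string contentType)
{
    if (string.IsNullOrEmpty(contentType))
    {
        return false;
    }

    // Strip any parameter suffix, e.g. "text/xml; charset=utf-8"
    var mediaType = contentType.Split(';')[0].Trim();
    return mediaType.Equals("text/xml", StringComparison.OrdinalIgnoreCase)
        || mediaType.Equals("application/xml", StringComparison.OrdinalIgnoreCase);
}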
public virtual async Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
{
    // Get text from previous pipeline step
    var text = propertyBag.Text;
    if (this.HasTextStripRules)
    {
        text = this.StripText(text);
    }

    if (text.IsNullOrEmpty())
    {
        return;
    }

    if (this.HasLinkStripRules)
    {
        text = this.StripLinks(text);
    }

    // Find links
    var matches = s_LinkRegex.Value.Matches(text);
    foreach (var match in matches.Cast<Match>().Where(m => m.Success))
    {
        var link = match.Value;
        if (link.IsNullOrEmpty())
        {
            continue;
        }

        var baseUrl = propertyBag.ResponseUri.GetLeftPath();
        var normalizedLink = link.NormalizeUrl(baseUrl);
        if (normalizedLink.IsNullOrEmpty())
        {
            continue;
        }

        // Add new step to crawler
        await crawler.AddStepAsync(new Uri(normalizedLink),
            propertyBag.Step.Depth + 1,
            propertyBag.Step,
            new Dictionary<string, object>
            {
                { Resources.PropertyBagKeyOriginalUrl, new Uri(link) },
                { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
            }).ConfigureAwait(false);
    }
}
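The s_LinkRegex field used above is a lazily created Regex. A sketch of how such a field might be declared; the pattern shown is a simplified placeholder, not the library's actual expression:

// Hypothetical sketch: a lazily compiled URL-matching regex.
// The pattern is a simplified placeholder for illustration only.
private static readonly Lazy<Regex> s_LinkRegex = new Lazy<Regex>(() =>
    new Regex(@"https?://[^\s""'<>]+",
        RegexOptions.Compiled | RegexOptions.IgnoreCase));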
public virtual async Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
{
    AspectF.Define.
        NotNull(crawler, "crawler").
        NotNull(propertyBag, "propertyBag");

    if (propertyBag.StatusCode != HttpStatusCode.OK)
    {
        return;
    }

    if (!IsHtmlContent(propertyBag.ContentType))
    {
        return;
    }

    var htmlDoc = new HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };

    using (var reader = propertyBag.GetResponse())
    {
        var documentEncoding = htmlDoc.DetectEncoding(reader);
        reader.Seek(0, SeekOrigin.Begin);
        if (!documentEncoding.IsNull())
        {
            htmlDoc.Load(reader, documentEncoding, true);
        }
        else
        {
            htmlDoc.Load(reader, true);
        }
    }

    var originalContent = htmlDoc.DocumentNode.OuterHtml;
    if (this.HasTextStripRules || this.HasSubstitutionRules)
    {
        var content = this.StripText(originalContent);
        content = this.Substitute(content, propertyBag.Step);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    propertyBag["HtmlDoc"].Value = htmlDoc;

    // Extract title
    var nodes = htmlDoc.DocumentNode.SelectNodes("//title");
    if (!nodes.IsNull())
    {
        propertyBag.Title = string.Join(";", nodes.
            Select(n => n.InnerText).
            ToArray()).Trim();
    }

    // Extract meta data
    nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
    if (!nodes.IsNull())
    {
        propertyBag["Meta"].Value = (
            from entry in nodes
            let name = entry.Attributes["name"]
            let content = entry.Attributes["content"]
            where !name.IsNull() && !name.Value.IsNullOrEmpty()
                && !content.IsNull() && !content.Value.IsNullOrEmpty()
            select name.Value + ": " + content.Value).ToArray();
    }

    // Extract text
    propertyBag.Text = htmlDoc.ExtractText().Trim();

    if (this.HasLinkStripRules || this.HasTextStripRules)
    {
        var content = this.StripLinks(originalContent);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    var baseUrl = propertyBag.ResponseUri.GetLeftPath();

    // Extract head base
    nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
    if (!nodes.IsNull())
    {
        baseUrl = nodes.
            Select(entry => new { entry, href = entry.Attributes["href"] }).
            Where(@t => [email protected]()
                && [email protected]()
                && Uri.IsWellFormedUriString(@t.href.Value, UriKind.RelativeOrAbsolute)).
            Select(@t => @t.href.Value).
            AddToEnd(baseUrl).
            FirstOrDefault();
    }

    // Extract links
    var links = htmlDoc.GetLinks();
    foreach (var link in links.Links.Union(links.References))
    {
        if (link.IsNullOrEmpty())
        {
            continue;
        }

        var decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
        var normalizedLink = this.NormalizeLink(baseUrl, decodedLink);
        if (normalizedLink.IsNullOrEmpty())
        {
            continue;
        }

        await crawler.AddStepAsync(new Uri(normalizedLink),
            propertyBag.Step.Depth + 1,
            propertyBag.Step,
            new Dictionary<string, object>
            {
                { Resources.PropertyBagKeyOriginalUrl, link },
                { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
            }).ConfigureAwait(false);
    }
}
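Both the sitemap and HTML processors funnel candidate URLs through NormalizeLink before queuing them, but its body is not part of this excerpt. A minimal sketch, assuming it only needs to resolve relative links against the base URL and reject anything that is not absolute http(s); the real implementation may also strip fragments or apply further canonicalization:

// Hypothetical sketch of link normalization: resolve against the base URL
// and keep only absolute http/https results. The real helper may do more
// (fragment stripping, canonicalization, scheme filtering).
protected virtual string NormalizeLink(string baseUrl, string link)
{
    if (!Uri.TryCreate(new Uri(baseUrl), link, out Uri absolute))
    {
        return null;
    }

    if (absolute.Scheme != Uri.UriSchemeHttp && absolute.Scheme != Uri.UriSchemeHttps)
    {
        return null;
    }

    return absolute.AbsoluteUri;
}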
/// <summary>
/// Adds a crawler step synchronously by blocking on the asynchronous version.
/// </summary>
/// <param name="crawler">The crawler to queue the step on.</param>
/// <param name="uri">The URL to crawl.</param>
/// <param name="depth">The depth of the URL.</param>
public static void AddStep(this ICrawler crawler, Uri uri, int depth)
{
    // Blocks the calling thread; prefer AddStepAsync where possible.
    crawler.AddStepAsync(uri, depth).Wait();
}
/// <summary>
/// Queues a new step on the crawler queue.
/// </summary>
/// <param name="crawler">The crawler to queue the step on.</param>
/// <param name="uri">The URL to crawl.</param>
/// <param name="depth">The depth of the URL.</param>
public static async Task AddStepAsync(this ICrawler crawler, Uri uri, int depth)
{
    await crawler.AddStepAsync(uri, depth, null, null).ConfigureAwait(false);
}
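Taken together, these overloads let callers seed a crawl without building the property dictionary by hand. A usage sketch; crawler construction is elided, and CreateCrawler is an assumed factory standing in for whatever concrete crawler the host application wires up:

// Hypothetical usage: seed the crawler with a start URL at depth 0.
ICrawler crawler = CreateCrawler(); // assumed factory, not part of the library
await crawler.AddStepAsync(new Uri("http://example.com/"), 0);

// Or, from synchronous code (blocks until the step is queued):
crawler.AddStep(new Uri("http://example.com/sitemap.xml"), 0);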