/// <summary>
/// Gathers the CSS used by a document: the child content of every <c>style</c> tag plus the
/// downloaded text of every <c>link</c> tag that is marked as a stylesheet.
/// </summary>
/// <remarks>
/// After collection, <c>Remove()</c> is called on every tag returned by
/// <c>FindTagsByName("style", "link")</c> - including <c>link</c> tags that were not
/// treated as stylesheets.
/// Relative <c>href</c> values are resolved against the current HTTP request URL when one is
/// available; links that still are not absolute are skipped.
/// Exceptions raised while parsing the URI or downloading the stylesheet are not caught here.
/// </remarks>
/// <param name="doc">Document whose <c>style</c> and <c>link</c> tags are processed.</param>
/// <returns>The collected CSS text; an empty string when nothing was collected.</returns>
private static string ExtractCss(HtmlDocument doc)
{
    var styleTags = doc.FindTagsByName("style", "link").ToArray();
    var css = new StringBuilder();

    foreach (var tag in styleTags)
    {
        if (tag.IsType("style"))
        {
            css.AppendLine(tag.ChildElements.ToSeparatedString(" "));
            continue;
        }

        if (!tag.IsType("link"))
        {
            continue;
        }

        // Attribute values are machine tokens, so compare ordinally and ignore case
        // for both attributes (the rel check used to be case- and culture-sensitive).
        var isStylesheet =
            string.Equals(tag.Attributes["type"], "text/css", StringComparison.OrdinalIgnoreCase) ||
            string.Equals(tag.Attributes["rel"], "stylesheet", StringComparison.OrdinalIgnoreCase);
        if (!isStylesheet)
        {
            continue;
        }

        var src = tag.Attributes["href"];
        if (string.IsNullOrEmpty(src))
        {
            continue;
        }

        var uri = new Uri(src, UriKind.RelativeOrAbsolute);
        if (!uri.IsAbsoluteUri && HttpContext.Current != null && HttpContext.Current.Request != null)
        {
            // Resolve a relative href against the URL of the request being served.
            uri = new Uri(HttpContext.Current.Request.Url, uri);
        }

        if (!uri.IsAbsoluteUri)
        {
            // No base URL available, so there is nothing to download from.
            continue;
        }

        // WebClient is IDisposable; release it as soon as the download completes.
        using (var client = new WebClient())
        {
            css.Append(client.DownloadString(uri));
        }
    }

    foreach (var tag in styleTags)
    {
        tag.Remove();
    }

    return css.ToString();
}
/// <summary>
/// Populates <c>Title</c>, <c>MetaData</c> and <c>FilteredContent</c> on
/// <paramref name="crawlData"/> from its <c>OriginalContent</c>.
/// </summary>
/// <remarks>
/// The original content is first passed through <c>StripText</c> once per entry in
/// <c>_indexStrips</c>, and the result is parsed as an <c>HtmlDocument</c>.
/// </remarks>
/// <param name="crawlData">Crawl record that is read and updated in place.</param>
/// <exception cref="ApplicationException">
/// Two <c>meta</c> tags produced the same key.
/// </exception>
private void DoExtractContent(CrawlData crawlData)
{
    var stripped = _indexStrips.Aggregate(
        crawlData.OriginalContent,
        (content, entry) => StripText(content, entry.Key, entry.Value));
    var document = new HtmlDocument(stripped);

    // The first title tag found supplies the title.
    var titleTexts = from titleTag in document.FindTagsByName("title")
                     select titleTag.InnerText;
    crawlData.Title = titleTexts.FirstOrDefault();

    var collected = new CaseInvariantNameValueCollection();
    foreach (var candidate in document.FindTagsByName("meta").Select(GetKeyValue))
    {
        // Skip meta tags for which GetKeyValue produced no pair.
        if (!candidate.HasValue)
        {
            continue;
        }

        var pair = candidate.Value;
        if (collected.HasKey(pair.Key))
        {
            throw new ApplicationException("Duplicate meta tags:" + pair.Key);
        }

        collected.Append(pair.Key, pair.Value);
    }

    crawlData.MetaData = collected.ToDictionary();
    crawlData.FilteredContent = document.ExtractContent();
}
/// <summary>
/// Sets <c>Links</c> on <paramref name="data"/> to the <c>href</c> attribute value of every
/// <c>a</c> tag found in its content.
/// </summary>
/// <remarks>
/// The original content is first passed through <c>StripText</c> once per entry in
/// <c>_followStrips</c>, and the result is parsed as an <c>HtmlDocument</c>.
/// </remarks>
/// <param name="data">Crawl record that is read and updated in place.</param>
private void DoExtractLinks(CrawlData data)
{
    var stripped = _followStrips.Aggregate(
        data.OriginalContent,
        (content, entry) => StripText(content, entry.Key, entry.Value));
    var document = new HtmlDocument(stripped);

    var hrefs = from anchor in document.FindTagsByName("a")
                select anchor.Attributes["href"];
    data.Links = hrefs.ToArray();
}
/// <summary>
/// Looks up the first <c>body</c> tag of a document.
/// </summary>
/// <param name="htmlDocument">Document to search.</param>
/// <returns>
/// The first tag returned by <c>FindTagsByName("body")</c>, or the default value of
/// <c>HtmlTag</c> when there is none.
/// </returns>
private static HtmlTag ExtractBodyTag(HtmlDocument htmlDocument)
{
    var bodyTags = htmlDocument.FindTagsByName("body");
    return bodyTags.FirstOrDefault();
}