Ejemplo n.º 1
0
 /// <summary>
 /// Gathers the CSS referenced by a document into a single string and strips
 /// the originating tags from the document.
 /// </summary>
 /// <param name="doc">Document whose <c>style</c> and <c>link</c> tags are processed.</param>
 /// <returns>
 /// The text of every <c>style</c> tag followed by the downloaded content of
 /// every stylesheet <c>link</c>, in document order as returned by
 /// <c>FindTagsByName</c>.
 /// </returns>
 /// <remarks>
 /// Every <c>style</c> and <c>link</c> tag found is removed from the document,
 /// including <c>link</c> tags that were not treated as stylesheets.
 /// Download failures from <see cref="WebClient.DownloadString(Uri)"/> are not caught here.
 /// </remarks>
 private static string ExtractCss(HtmlDocument doc)
 {
     // Materialize up front: the tags are removed from the document afterwards.
     var styleTags = doc.FindTagsByName("style", "link").ToArray();
     var css = new StringBuilder();
     foreach (var tag in styleTags)
     {
         if (tag.IsType("style"))
         {
             css.AppendLine(tag.ChildElements.ToSeparatedString(" "));
             continue;
         }

         if (!tag.IsType("link"))
             continue;

         // MIME types and rel keywords are ASCII case-insensitive in HTML, so
         // compare both the same way (ordinal, ignoring case).
         var isStylesheet =
             string.Equals(tag.Attributes["type"], "text/css", StringComparison.OrdinalIgnoreCase) ||
             string.Equals(tag.Attributes["rel"], "stylesheet", StringComparison.OrdinalIgnoreCase);
         if (!isStylesheet)
             continue;

         var src = tag.Attributes["href"];
         if (string.IsNullOrEmpty(src))
             continue;

         // A malformed href is skipped like any other unusable link instead of
         // aborting the whole extraction with a UriFormatException.
         Uri uri;
         if (!Uri.TryCreate(src, UriKind.RelativeOrAbsolute, out uri))
             continue;

         // Relative links can only be resolved against the current request, if any.
         if (!uri.IsAbsoluteUri && HttpContext.Current != null && HttpContext.Current.Request != null)
         {
             uri = new Uri(HttpContext.Current.Request.Url, uri);
         }
         if (!uri.IsAbsoluteUri)
             continue;

         // WebClient is IDisposable; release it as soon as the download completes.
         using (var client = new WebClient())
         {
             css.Append(client.DownloadString(uri));
         }
     }
     foreach (var tag in styleTags)
     {
         tag.Remove();
     }
     return css.ToString();
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Fills in the title, metadata and filtered content of
        /// <paramref name="crawlData"/> from its original content, after applying
        /// every configured index strip to that content.
        /// </summary>
        /// <param name="crawlData">Crawl record that is read from and written to.</param>
        /// <exception cref="ApplicationException">
        /// Two meta tags produced the same key.
        /// </exception>
        private void DoExtractContent(CrawlData crawlData)
        {
            // Fold the index strips over the raw content, one strip at a time.
            var strippedContent = _indexStrips.Aggregate(
                crawlData.OriginalContent,
                (remaining, rule) => StripText(remaining, rule.Key, rule.Value));
            var document = new HtmlDocument(strippedContent);

            // Title is the inner text of the first title tag, or the default when there is none.
            var titleTexts = document.FindTagsByName("title").Select(titleTag => titleTag.InnerText);
            crawlData.Title = titleTexts.FirstOrDefault();

            var collected = new CaseInvariantNameValueCollection();
            foreach (var candidate in document.FindTagsByName("meta").Select(GetKeyValue))
            {
                // Meta tags that yield no key/value pair are ignored.
                if (!candidate.HasValue)
                {
                    continue;
                }

                var entry = candidate.Value;
                if (collected.HasKey(entry.Key))
                {
                    throw new ApplicationException("Duplicate meta tags:" + entry.Key);
                }

                collected.Append(entry.Key, entry.Value);
            }

            crawlData.MetaData = collected.ToDictionary();
            crawlData.FilteredContent = document.ExtractContent();
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Stores in <paramref name="data"/> the <c>href</c> attribute of every
        /// anchor tag found in its original content, after applying every
        /// configured follow strip to that content.
        /// </summary>
        /// <param name="data">Crawl record whose <c>Links</c> are populated.</param>
        private void DoExtractLinks(CrawlData data)
        {
            // Fold the follow strips over the raw content, one strip at a time.
            var strippedContent = _followStrips.Aggregate(
                data.OriginalContent,
                (remaining, rule) => StripText(remaining, rule.Key, rule.Value));
            var document = new HtmlDocument(strippedContent);

            // One entry per anchor, in the order FindTagsByName yields them.
            var anchors = document.FindTagsByName("a");
            data.Links = anchors.Select(anchor => anchor.Attributes["href"]).ToArray();
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Looks up the first <c>body</c> tag of a document.
 /// </summary>
 /// <param name="htmlDocument">Document to search.</param>
 /// <returns>The first <c>body</c> tag, or the default value when none is found.</returns>
 private static HtmlTag ExtractBodyTag(HtmlDocument htmlDocument)
 {
     var bodyTags = htmlDocument.FindTagsByName("body");
     return bodyTags.FirstOrDefault();
 }