Ejemplo n.º 1
0
        /// <summary>
        /// Parses a raw HTML page into a single <see cref="ParsedContent"/>: reads the head
        /// metadata, removes non-content elements, collects anchor links and stores the
        /// trimmed inner text of the body as the plain content.
        /// </summary>
        /// <param name="rawContent">
        /// HTML markup of the page. Null or empty input produces a freshly constructed,
        /// unfilled <see cref="ParsedContent"/>.
        /// </param>
        /// <param name="linkOpts">
        /// Link-following options attached to the result; <c>CurrentDepth</c> is incremented
        /// by one after they are attached.
        /// </param>
        /// <returns>A one-element list holding the parsed page.</returns>
        public IList<ParsedContent> ParseRaw(string rawContent, FollowLinksOptions linkOpts)
        {
            ParsedContent parsed = new ParsedContent();

            // Nothing to load. An empty string would reach the same result through the
            // missing-body return below; guarding here also keeps null out of LoadHtml.
            if (string.IsNullOrEmpty(rawContent)) {
                return new ParsedContent[] { parsed };
            }

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(rawContent);

            // Metadata must be read before <title> is removed together with the other
            // non-content elements below.
            ReadMeta(doc, ref parsed);

            string[] nonContentTags = { "script", "style", "link", "object", "embed", "title" };
            foreach (string tagName in nonContentTags) {
                // Snapshot the matches first: nodes cannot be removed while Descendants() is being enumerated.
                List<HtmlNode> matches = new List<HtmlNode>(doc.DocumentNode.Descendants(tagName));
                foreach (HtmlNode node in matches) {
                    node.Remove();
                }
            }

            HtmlNode body = doc.DocumentNode.SelectSingleNode("/html/body");
            if (body == null) {
                // NOTE(review): LinkOpts is left untouched on this path - confirm callers expect that.
                return new ParsedContent[] { parsed };
            }

            // Extract links to follow.
            // NOTE: "//a" is an absolute XPath, so it matches every anchor in the document,
            // not only those below <body>.
            HtmlNodeCollection links = body.SelectNodes("//a");
            if (links != null) {
                foreach (HtmlNode a in links) {
                    parsed.Links.Add(new ParsedLink(a));
                }
            }

            // This is a plain page: keep the text of the body as its content.
            parsed.PlainContent = body.InnerText.Trim();
            parsed.LinkOpts = linkOpts;
            parsed.LinkOpts.CurrentDepth++;

            return new ParsedContent[] { parsed };
        }
        /// <summary>
        /// Parses raw XML that is either a sitemap (root element <c>urlset</c>) or an RSS
        /// feed (root element <c>rss</c>) and collects the links it lists.
        /// </summary>
        /// <param name="rawContent">XML text of the sitemap or feed.</param>
        /// <param name="linkOpts">
        /// Not read by this implementation; the result always receives new options with
        /// <c>Follow = true</c> and <c>CurrentDepth = 1</c>.
        /// </param>
        /// <returns>
        /// An empty list when the content cannot be loaded as XML (the failure is logged);
        /// otherwise a one-element list. For any other root element the result carries no
        /// links. Sitemap entries without a <c>loc</c> and RSS items without a <c>link</c>
        /// are skipped.
        /// </returns>
        public IList<ParsedContent> ParseRaw(string rawContent, FollowLinksOptions linkOpts)
        {
            // This is RSS or Sitemap
            XmlDocument xmlDoc = new XmlDocument();
            // Feeds and sitemaps are external input: do not resolve external DTDs or entities.
            xmlDoc.XmlResolver = null;
            try {
                xmlDoc.LoadXml(rawContent);
            } catch (Exception ex) {
                Logger.Error("Invalid XML!", ex);
                Logger.Debug(rawContent);
                return new List<ParsedContent>();
            }

            var parsed = new ParsedContent();
            XmlElement root = xmlDoc.DocumentElement;

            if (root.Name == "urlset") {
                // this is a sitemap
                XmlNamespaceManager mgr = new XmlNamespaceManager(xmlDoc.NameTable);
                mgr.AddNamespace("ns", "http://www.sitemaps.org/schemas/sitemap/0.9");

                foreach (XmlElement xmlUrl in root.SelectNodes("//ns:url", mgr)) {
                    string loc = ReadChildText(xmlUrl, "loc");
                    if (loc == null) {
                        continue; // malformed entry: no <loc>
                    }

                    loc = loc.Trim();
                    if (loc.Length == 0) {
                        continue;
                    }

                    parsed.Links.Add(new ParsedLink() {
                        Url = loc
                    });
                }

            } else if (root.Name == "rss") {

                // A feed without <channel> has neither metadata nor items to read.
                XmlElement channel = root["channel"];
                if (channel != null) {
                    string title = ReadChildText(channel, "title");
                    if (title != null) {
                        parsed.Title = title;
                    }

                    string description = ReadChildText(channel, "description");
                    if (description != null) {
                        parsed.Description = description;
                    }

                    string author = ReadChildText(channel, "managingEditor");
                    if (author != null) {
                        parsed.Author = author;
                    }

                    string channelLink = ReadChildText(channel, "link");
                    if (channelLink != null && parsed.Metadata != null) {
                        parsed.Metadata["link"] = channelLink;
                    }

                    foreach (XmlElement xmlItem in channel.SelectNodes("item")) {
                        string url = ReadChildText(xmlItem, "link");
                        if (url == null) {
                            continue; // an item without <link> gives nothing to follow
                        }

                        url = url.Trim();
                        if (url.Length == 0) {
                            continue;
                        }

                        var link = new ParsedLink();
                        link.Url = url;

                        string itemTitle = ReadChildText(xmlItem, "title");
                        if (itemTitle != null) {
                            link.Title = itemTitle.Trim();
                        }

                        string itemDescription = ReadChildText(xmlItem, "description");
                        if (itemDescription != null) {
                            link.Description = itemDescription.Trim();
                        }

                        parsed.Links.Add(link);
                    }
                }
            }

            parsed.LinkOpts = new FollowLinksOptions();
            parsed.LinkOpts.Follow = true;
            parsed.LinkOpts.CurrentDepth = 1;

            return new ParsedContent[] { parsed };
        }

        /// <summary>
        /// Returns the inner text of the first child element named <paramref name="childName"/>,
        /// or null when <paramref name="parent"/> is null or has no such child.
        /// </summary>
        private static string ReadChildText(XmlElement parent, string childName)
        {
            if (parent == null) {
                return null;
            }

            XmlElement child = parent[childName];
            return child == null ? null : child.InnerText;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Fills <c>Title</c> and <c>Description</c> of <paramref name="parsed"/> from the
        /// document head (<c>/html/head/title</c> and the <c>description</c> meta tag),
        /// but only for values that are still null or empty.
        /// </summary>
        /// <param name="doc">
        /// Loaded HTML document. The <c>name</c> attribute of its head meta tags is
        /// lower-cased in place as a side effect.
        /// </param>
        /// <param name="parsed">Result object to fill; values already set are left unchanged.</param>
        void ReadMeta(HtmlDocument doc, ref ParsedContent parsed)
        {
            // First, normalize the meta name attributes so the lookup below matches
            // regardless of the casing used by the page.
            var metaNodes = doc.DocumentNode.SelectNodes("/html/head/meta");
            if (metaNodes != null) {
                foreach (HtmlNode metaNode in metaNodes) {
                    // Tags such as <meta charset> or <meta http-equiv> carry no name attribute.
                    var nameAttr = metaNode.Attributes["name"];
                    if (nameAttr != null && nameAttr.Value != null) {
                        // Invariant casing: a culture-sensitive ToLower() would turn "I" into a
                        // dotless i under Turkish cultures and break the 'description' match.
                        nameAttr.Value = nameAttr.Value.ToLowerInvariant();
                    }
                }
            }

            if (string.IsNullOrEmpty(parsed.Title)) {
                HtmlNode titleNode = doc.DocumentNode.SelectSingleNode("/html/head/title");
                if (titleNode != null) {
                    string titleText = titleNode.InnerText;
                    if (titleText != null) {
                        parsed.Title = titleText.Trim();
                    }
                }
            }

            if (string.IsNullOrEmpty(parsed.Description)) {
                HtmlNode descriptionNode = doc.DocumentNode.SelectSingleNode("/html/head/meta[@name='description']");
                if (descriptionNode != null) {
                    var contentAttr = descriptionNode.Attributes["content"];
                    if (contentAttr != null && contentAttr.Value != null) {
                        parsed.Description = contentAttr.Value.Trim();
                    }
                }
            }
        }