/// <summary>
/// Parses a raw HTML page into a single <see cref="ParsedContent"/> entry:
/// reads head metadata, strips non-content elements, collects anchors and
/// extracts the body's plain text.
/// </summary>
/// <param name="rawContent">HTML markup of the page.</param>
/// <param name="linkOpts">
/// Link-following options; stored on the result and its <c>CurrentDepth</c> is
/// incremented. NOTE(review): if <c>FollowLinksOptions</c> is a reference type
/// the caller's instance is incremented as well — confirm this is intended.
/// </param>
/// <returns>
/// A one-element list. When the document has no <c>/html/body</c> node the
/// entry carries only what <c>ReadMeta</c> filled in.
/// </returns>
public IList<ParsedContent> ParseRaw(string rawContent, FollowLinksOptions linkOpts)
{
    ParsedContent parsed = new ParsedContent();

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(rawContent);

    // Metadata must be read before <title> is stripped below.
    ReadMeta(doc, ref parsed);

    // Drop elements whose text should not end up in the plain content.
    string[] strippedTags = { "script", "style", "link", "object", "embed", "title" };
    foreach (string tag in strippedTags)
    {
        // Copy the matches first: removing nodes while enumerating Descendants() is unsafe.
        List<HtmlNode> matches = new List<HtmlNode>(doc.DocumentNode.Descendants(tag));
        foreach (HtmlNode match in matches)
        {
            match.Remove();
        }
    }

    HtmlNode body = doc.DocumentNode.SelectSingleNode("/html/body");
    if (body != null)
    {
        // Extract links to follow. The XPath starts with "//", so it is an
        // absolute expression rather than one relative to <body>.
        HtmlNodeCollection anchors = body.SelectNodes("//a");
        if (anchors != null)
        {
            foreach (HtmlNode anchor in anchors)
            {
                parsed.Links.Add(new ParsedLink(anchor));
            }
        }

        // This is a plain page: keep its text content for indexing.
        parsed.PlainContent = body.InnerText.Trim();

        parsed.LinkOpts = linkOpts;
        parsed.LinkOpts.CurrentDepth++;
    }

    return new ParsedContent[] { parsed };
}
/// <summary>
/// Parses raw XML that is either a sitemap (<c>urlset</c> root) or an RSS feed
/// (<c>rss</c> root) and collects the links it lists.
/// </summary>
/// <param name="rawContent">XML text to parse.</param>
/// <param name="linkOpts">
/// Not read by this implementation; the result always gets a fresh
/// <c>FollowLinksOptions</c> with <c>Follow = true</c> and <c>CurrentDepth = 1</c>.
/// </param>
/// <returns>
/// An empty list when <paramref name="rawContent"/> cannot be loaded as XML;
/// otherwise a one-element list. For any other root element the entry has no links.
/// </returns>
public IList<ParsedContent> ParseRaw(string rawContent, FollowLinksOptions linkOpts)
{
    // This is RSS or Sitemap
    XmlDocument xmlDoc = new XmlDocument();

    // The content comes from crawled (untrusted) sources: never resolve
    // external DTDs/entities while loading it.
    xmlDoc.XmlResolver = null;

    try
    {
        xmlDoc.LoadXml(rawContent);
    }
    catch (Exception ex)
    {
        // Deliberately broad: any load failure means "nothing parsed".
        Logger.Error("Invalid XML!", ex);
        Logger.Debug(rawContent);
        return new List<ParsedContent>();
    }

    // check type
    var parsed = new ParsedContent();
    XmlElement root = xmlDoc.DocumentElement;

    if (root.Name == "urlset")
    {
        // this is a sitemap
        XmlNamespaceManager mgr = new XmlNamespaceManager(xmlDoc.NameTable);
        mgr.AddNamespace("ns", "http://www.sitemaps.org/schemas/sitemap/0.9");

        foreach (XmlElement xmlUrl in root.SelectNodes("//ns:url", mgr))
        {
            string loc = ReadChildText(xmlUrl, "loc");
            if (loc == null)
            {
                // A <url> entry without <loc> has nothing to follow.
                continue;
            }

            parsed.Links.Add(new ParsedLink() { Url = loc.Trim() });
        }
    }
    else if (root.Name == "rss")
    {
        XmlElement channel = root["channel"];
        if (channel != null)
        {
            // Channel-level fields are optional; assign only those present.
            string channelTitle = ReadChildText(channel, "title");
            if (channelTitle != null)
            {
                parsed.Title = channelTitle;
            }

            string channelDescription = ReadChildText(channel, "description");
            if (channelDescription != null)
            {
                parsed.Description = channelDescription;
            }

            string channelEditor = ReadChildText(channel, "managingEditor");
            if (channelEditor != null)
            {
                parsed.Author = channelEditor;
            }

            string channelLink = ReadChildText(channel, "link");
            if (channelLink != null && parsed.Metadata != null)
            {
                parsed.Metadata["link"] = channelLink;
            }

            foreach (XmlElement xmlItem in channel.SelectNodes("item"))
            {
                string itemUrl = ReadChildText(xmlItem, "link");
                if (itemUrl == null)
                {
                    // An <item> without <link> cannot be followed; skip it
                    // instead of failing the whole feed.
                    continue;
                }

                var link = new ParsedLink();
                link.Url = itemUrl.Trim();

                string itemTitle = ReadChildText(xmlItem, "title");
                if (itemTitle != null)
                {
                    link.Title = itemTitle.Trim();
                }

                string itemDescription = ReadChildText(xmlItem, "description");
                if (itemDescription != null)
                {
                    link.Description = itemDescription.Trim();
                }

                parsed.Links.Add(link);
            }
        }
    }

    parsed.LinkOpts = new FollowLinksOptions();
    parsed.LinkOpts.Follow = true;
    parsed.LinkOpts.CurrentDepth = 1;

    return new ParsedContent[] { parsed };
}

/// <summary>
/// Returns the inner text of the first child element named
/// <paramref name="childName"/>, or <c>null</c> when no such child exists.
/// </summary>
private static string ReadChildText(XmlElement parent, string childName)
{
    XmlElement child = parent[childName];
    return child == null ? null : child.InnerText;
}
/// <summary>
/// Fills <c>Title</c> and <c>Description</c> on <paramref name="parsed"/> from the
/// document head, leaving any value that is already non-empty untouched.
/// </summary>
/// <param name="doc">Loaded HTML document; the <c>name</c> attributes of its
/// <c>/html/head/meta</c> nodes are lower-cased in place.</param>
/// <param name="parsed">Result object to populate.</param>
void ReadMeta(HtmlDocument doc, ref ParsedContent parsed)
{
    // First, normalize values of the meta "name" attribute so the
    // case-sensitive XPath lookup below can match e.g. name="Description".
    var metaNodes = doc.DocumentNode.SelectNodes("/html/head/meta");
    if (metaNodes != null)
    {
        foreach (HtmlNode metaNode in metaNodes)
        {
            // Many meta tags (charset, http-equiv, property=...) have no
            // "name" attribute at all; skip those instead of dereferencing null.
            var nameAttr = metaNode.Attributes["name"];
            if (nameAttr != null && nameAttr.Value != null)
            {
                // Invariant casing: culture-sensitive lower-casing can map
                // 'I' to a dotless 'ı' and break the match on "description".
                nameAttr.Value = nameAttr.Value.ToLowerInvariant();
            }
        }
    }

    if (string.IsNullOrEmpty(parsed.Title))
    {
        HtmlNode titleNode = doc.DocumentNode.SelectSingleNode("/html/head/title");
        if (titleNode != null)
        {
            parsed.Title = titleNode.InnerText.Trim();
        }
    }

    if (string.IsNullOrEmpty(parsed.Description))
    {
        HtmlNode descriptionNode = doc.DocumentNode.SelectSingleNode("/html/head/meta[@name='description']");
        if (descriptionNode != null)
        {
            var contentAttr = descriptionNode.Attributes["content"];
            if (contentAttr != null && contentAttr.Value != null)
            {
                parsed.Description = contentAttr.Value.Trim();
            }
        }
    }
}