public IList<ParsedContent> ParseRaw(string rawContent, FollowLinksOptions linkOpts)
{
    // Parses an HTML page: reads <meta> data, strips non-content nodes,
    // collects outgoing links from the body, and indexes the body's plain text.
    ParsedContent parsed = new ParsedContent();
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(rawContent);
    ReadMeta(doc, ref parsed);

    // Remove nodes that carry no indexable text content.
    foreach (string invalidNode in new string[] { "script", "style", "link", "object", "embed", "title" })
    {
        // Snapshot the descendants before removing: deleting nodes while
        // enumerating Descendants() directly would invalidate the enumerator.
        foreach (HtmlNode node in new List<HtmlNode>(doc.DocumentNode.Descendants(invalidNode)))
            node.Remove();
    }

    HtmlNode body = doc.DocumentNode.SelectSingleNode("/html/body");
    if (body == null)
        return new ParsedContent[] { parsed };

    // Extract links to follow. ".//a" scopes the search to the <body>
    // subtree; the previous "//a" always searched from the document root
    // regardless of the context node, which is a well-known XPath pitfall.
    HtmlNodeCollection links = body.SelectNodes(".//a");
    if (links != null)
    {
        foreach (HtmlNode a in links)
            parsed.Links.Add(new ParsedLink(a));
    }

    // This is a plain page: extract and index as HTML.
    parsed.PlainContent = body.InnerText.Trim();
    // NOTE(review): this aliases the caller's options object, so the depth
    // increment below mutates state the caller may share across parse calls —
    // confirm this is intended before replacing it with a defensive copy.
    parsed.LinkOpts = linkOpts;
    parsed.LinkOpts.CurrentDepth++;
    return new ParsedContent[] { parsed };
}
public CrawlJob(Uri url)
{
    // Create a crawl job for the given URL with default settings:
    // a 30-second timeout and default link-following options.
    Url = url;
    TimeoutSec = 30;
    LinkOpts = new FollowLinksOptions();
}
public IList<ParsedContent> ParseRaw(string rawContent, FollowLinksOptions linkOpts)
{
    // Plain-text content: nothing to strip and no links to extract,
    // so just trim the raw text and index it as-is.
    var result = new ParsedContent
    {
        ContentTypes = new List<string>() { "text/plain", "txt" },
        PlainContent = rawContent.Trim()
    };
    return new ParsedContent[] { result };
}
public IList<ParsedContent> ParseRaw(string rawContent, FollowLinksOptions linkOpts)
{
    // Parses syndication XML: either a sitemap (<urlset>) or an RSS feed (<rss>).
    // Extracted URLs are returned as links to follow; nothing is indexed directly,
    // so the incoming linkOpts are deliberately ignored (see bottom of method).
    XmlDocument xmlDoc = new XmlDocument();
    try
    {
        xmlDoc.LoadXml(rawContent);
    }
    catch (Exception ex)
    {
        Logger.Error("Invalid XML!", ex);
        Logger.Debug(rawContent);
        return new List<ParsedContent>();
    }

    // Check the document type by its root element.
    var parsed = new ParsedContent();
    if (xmlDoc.DocumentElement.Name == "urlset")
    {
        // This is a sitemap: every <url>'s <loc> becomes a link.
        XmlNamespaceManager mgr = new XmlNamespaceManager(xmlDoc.NameTable);
        mgr.AddNamespace("ns", "http://www.sitemaps.org/schemas/sitemap/0.9");
        foreach (XmlElement xmlUrl in xmlDoc.DocumentElement.SelectNodes("//ns:url", mgr))
        {
            // Skip malformed <url> entries (missing <loc>) instead of
            // crashing the whole sitemap with a NullReferenceException.
            XmlElement loc = xmlUrl["loc"];
            if (loc != null)
                parsed.Links.Add(new ParsedLink() { Url = loc.InnerText.Trim() });
        }
    }
    else if (xmlDoc.DocumentElement.Name == "rss")
    {
        // An <rss> root without a <channel> previously threw an uncaught
        // NullReferenceException at the item loop; guard it explicitly.
        XmlElement channel = xmlDoc.DocumentElement["channel"];
        if (channel != null)
        {
            // Channel-level metadata is optional; absent elements are skipped
            // (same best-effort semantics as the old empty catch blocks,
            // without swallowing unrelated exceptions).
            if (channel["title"] != null) parsed.Title = channel["title"].InnerText;
            if (channel["description"] != null) parsed.Description = channel["description"].InnerText;
            if (channel["managingEditor"] != null) parsed.Author = channel["managingEditor"].InnerText;
            if (channel["link"] != null) parsed.Metadata["link"] = channel["link"].InnerText;

            foreach (XmlElement xmlItem in channel.SelectNodes("item"))
            {
                // An item without a <link> is useless to the crawler — skip it
                // rather than letting it abort the whole feed.
                XmlElement itemLink = xmlItem["link"];
                if (itemLink == null)
                    continue;
                var link = new ParsedLink();
                link.Url = itemLink.InnerText.Trim();
                if (xmlItem["title"] != null) link.Title = xmlItem["title"].InnerText.Trim();
                if (xmlItem["description"] != null) link.Description = xmlItem["description"].InnerText.Trim();
                parsed.Links.Add(link);
            }
        }
    }

    // Feeds and sitemaps exist solely to be followed: force link following
    // with a fresh options object at depth 1.
    parsed.LinkOpts = new FollowLinksOptions();
    parsed.LinkOpts.Follow = true;
    parsed.LinkOpts.CurrentDepth = 1;
    return new ParsedContent[] { parsed };
}
// Fetching and parsing directly from a URL is not supported by this parser;
// callers must download the content themselves and use ParseRaw/ParseFile/ParseStream.
public IList<ParsedContent> ParseUrl(Uri url, FollowLinksOptions linkOpts) { throw new NotImplementedException(); }
public IList<ParsedContent> ParseFile(string filePath, FollowLinksOptions linkOpts)
{
    // Parse content from a file on disk; a missing file yields an empty result set.
    if (!File.Exists(filePath))
        return new ParsedContent[0];
    string content = File.ReadAllText(filePath);
    return ParseRaw(content, linkOpts);
}
public IList<ParsedContent> ParseStream(Stream s, FollowLinksOptions linkOpts)
{
    // Drain the whole stream into a string and delegate to ParseRaw.
    // Disposing the reader also closes the underlying stream.
    using (StreamReader reader = new StreamReader(s))
    {
        string content = reader.ReadToEnd();
        return ParseRaw(content, linkOpts);
    }
}
public ParsedContent()
{
    // Initialize every collection and options property up front so
    // consumers can Add/index without null checks.
    Metadata = new Dictionary<string, string>();
    Links = new List<ParsedLink>();
    LinkOpts = new FollowLinksOptions();
}