/// <summary>
/// Parses raw XML that is either a sitemap (root element <c>urlset</c>) or an RSS feed
/// (root element <c>rss</c>) and collects the links it lists.
/// </summary>
/// <param name="rawContent">The XML text to parse.</param>
/// <param name="linkOpts">
/// Currently not consulted: the result always receives a fresh <see cref="FollowLinksOptions"/>
/// with <c>Follow = true</c> and <c>CurrentDepth = 1</c>.
/// </param>
/// <returns>
/// An empty list when <paramref name="rawContent"/> is not well-formed XML; otherwise a
/// single-element array holding the parsed content (with no links when the root element is
/// neither <c>urlset</c> nor <c>rss</c>).
/// </returns>
public IList<ParsedContent> ParseRaw(string rawContent, FollowLinksOptions linkOpts)
{
    // This is RSS or Sitemap
    XmlDocument xmlDoc = new XmlDocument();

    // The content comes from crawled sites; never resolve external DTDs/entities for it.
    xmlDoc.XmlResolver = null;

    try
    {
        xmlDoc.LoadXml(rawContent);
    }
    catch (Exception ex)
    {
        Logger.Error("Invalid XML!", ex);
        Logger.Debug(rawContent);
        return new List<ParsedContent>();
    }

    // check type
    var parsed = new ParsedContent();
    XmlElement root = xmlDoc.DocumentElement;

    if (root.Name == "urlset")
    {
        // this is a sitemap
        XmlNamespaceManager mgr = new XmlNamespaceManager(xmlDoc.NameTable);
        mgr.AddNamespace("ns", "http://www.sitemaps.org/schemas/sitemap/0.9");

        foreach (XmlElement xmlUrl in root.SelectNodes("//ns:url", mgr))
        {
            string loc = ReadChildText(xmlUrl, "loc");
            if (loc == null)
            {
                // A <url> entry without <loc> carries nothing to crawl; skip it instead of failing.
                continue;
            }

            parsed.Links.Add(new ParsedLink() { Url = loc.Trim() });
        }
    }
    else if (root.Name == "rss")
    {
        XmlElement channel = root["channel"];

        // Channel-level fields are optional: each one is only assigned when the element exists.
        string title = ReadChildText(channel, "title");
        if (title != null)
        {
            parsed.Title = title;
        }

        string description = ReadChildText(channel, "description");
        if (description != null)
        {
            parsed.Description = description;
        }

        string author = ReadChildText(channel, "managingEditor");
        if (author != null)
        {
            parsed.Author = author;
        }

        string channelLink = ReadChildText(channel, "link");
        if (channelLink != null && parsed.Metadata != null)
        {
            parsed.Metadata["link"] = channelLink;
        }

        // A feed without <channel> simply yields no links.
        if (channel != null)
        {
            foreach (XmlElement xmlItem in channel.SelectNodes("item"))
            {
                string itemUrl = ReadChildText(xmlItem, "link");
                if (itemUrl == null)
                {
                    // <link> is optional on RSS items; an item without it cannot be followed.
                    continue;
                }

                var link = new ParsedLink();
                link.Url = itemUrl.Trim();

                string itemTitle = ReadChildText(xmlItem, "title");
                if (itemTitle != null)
                {
                    link.Title = itemTitle.Trim();
                }

                string itemDescription = ReadChildText(xmlItem, "description");
                if (itemDescription != null)
                {
                    link.Description = itemDescription.Trim();
                }

                parsed.Links.Add(link);
            }
        }
    }

    parsed.LinkOpts = new FollowLinksOptions();
    parsed.LinkOpts.Follow = true;
    parsed.LinkOpts.CurrentDepth = 1;

    return new ParsedContent[] { parsed };
}

/// <summary>
/// Returns the inner text of the first child element of <paramref name="parent"/> whose name is
/// <paramref name="name"/>, or <c>null</c> when <paramref name="parent"/> is <c>null</c> or has
/// no such child.
/// </summary>
private static string ReadChildText(XmlElement parent, string name)
{
    if (parent == null)
    {
        return null;
    }

    XmlElement child = parent[name];
    return child == null ? null : child.InnerText;
}
/// <summary>
/// Creates a crawl job for the given link, carrying over this object's <c>LinkOpts</c>.
/// </summary>
/// <param name="forLink">
/// The link to crawl. Its <c>Url</c> may be absolute, or relative to this object's <c>Url</c>.
/// </param>
/// <returns>A new <see cref="CrawlJob"/> targeting the resolved absolute URL.</returns>
/// <exception cref="UriFormatException">
/// The link is not an absolute URL and cannot be resolved against the current URL.
/// </exception>
public CrawlJob CreateJob(ParsedLink forLink)
{
    Uri url;
    if (!Uri.TryCreate(forLink.Url, UriKind.Absolute, out url))
    {
        // Relative URL: resolve it against the current URL using standard URI resolution,
        // so root-relative ("/a"), parent ("../a") and query-bearing bases are handled correctly.
        // Url is round-tripped through ToString() because its declared type is not visible here.
        Uri baseUrl = new Uri(Url.ToString());
        url = new Uri(baseUrl, forLink.Url);
    }

    CrawlJob job = new CrawlJob(url);
    job.LinkOpts = LinkOpts;
    // TODO: rest of the params plus maybe some checkings
    return job;
}