예제 #1
0
        /// <summary>
        /// Parses raw HTML: reads the metadata via <c>ReadMeta</c>, strips non-content
        /// elements, then collects the anchors and the plain text of the body.
        /// </summary>
        /// <param name="rawContent">The HTML markup to parse.</param>
        /// <param name="linkOpts">
        /// Link-following options stored on the result when a body exists; its
        /// <c>CurrentDepth</c> is incremented through the stored reference.
        /// NOTE(review): if FollowLinksOptions is a reference type the caller's
        /// instance is modified as well - confirm that is intended.
        /// </param>
        /// <returns>
        /// A single-element array with the parsed page. When the document has no
        /// <c>/html/body</c> node only the metadata step has been applied.
        /// </returns>
        public IList<ParsedContent> ParseRaw(string rawContent, FollowLinksOptions linkOpts)
        {
            ParsedContent content = new ParsedContent();

            HtmlDocument html = new HtmlDocument();
            html.LoadHtml(rawContent);
            ReadMeta(html, ref content);

            // Drop elements that carry no indexable text. Each match list is copied
            // first because nodes are removed while walking it.
            string[] strippedTags = { "script", "style", "link", "object", "embed", "title" };
            foreach (string tagName in strippedTags) {
                List<HtmlNode> matches = new List<HtmlNode>(html.DocumentNode.Descendants(tagName));
                foreach (HtmlNode match in matches)
                    match.Remove();
            }

            HtmlNode bodyNode = html.DocumentNode.SelectSingleNode("/html/body");
            if (bodyNode != null) {
                // Extract links to follow.
                // NOTE(review): the XPath starts with "//", so it is evaluated from the
                // document root rather than only beneath the body - confirm that is intended.
                HtmlNodeCollection anchors = bodyNode.SelectNodes("//a");
                if (anchors != null) {
                    foreach (HtmlNode anchor in anchors)
                        content.Links.Add(new ParsedLink(anchor));
                }

                // Plain page: keep the trimmed body text for indexing.
                content.PlainContent = bodyNode.InnerText.Trim();
                content.LinkOpts = linkOpts;
                content.LinkOpts.CurrentDepth++;
            }

            return new ParsedContent[] { content };
        }
예제 #2
0
        /// <summary>
        /// Creates a crawl job for the given address with a timeout value of 30
        /// (seconds, going by the property name) and fresh link-following options.
        /// </summary>
        /// <param name="url">The address this job targets.</param>
        public CrawlJob(Uri url)
        {
            this.Url = url;
            this.TimeoutSec = 30;
            this.LinkOpts = new FollowLinksOptions();
        }
예제 #3
0
 /// <summary>
 /// Wraps plain text in a single parsed-content entry tagged with the
 /// "text/plain" and "txt" content types.
 /// </summary>
 /// <param name="rawContent">The text to store; leading and trailing whitespace is trimmed.</param>
 /// <param name="linkOpts">Not read by this implementation.</param>
 /// <returns>A single-element array holding the trimmed text.</returns>
 public IList<ParsedContent> ParseRaw(string rawContent, FollowLinksOptions linkOpts)
 {
     ParsedContent plainText = new ParsedContent();

     List<string> contentTypes = new List<string>();
     contentTypes.Add("text/plain");
     contentTypes.Add("txt");
     plainText.ContentTypes = contentTypes;

     plainText.PlainContent = rawContent.Trim();

     return new ParsedContent[] { plainText };
 }
        /// <summary>
        /// Parses XML content that is either a sitemap (root element <c>urlset</c>) or an
        /// RSS feed (root element <c>rss</c>) and collects the links it lists.
        /// </summary>
        /// <param name="rawContent">The XML text to parse.</param>
        /// <param name="linkOpts">
        /// Not read by this implementation; the result always carries fresh options
        /// with <c>Follow = true</c> and <c>CurrentDepth = 1</c>.
        /// </param>
        /// <returns>
        /// A single-element array holding the parsed document, or an empty list when
        /// <paramref name="rawContent"/> cannot be loaded as XML. A document with any
        /// other root element yields a result without links.
        /// </returns>
        public IList<ParsedContent> ParseRaw(string rawContent, FollowLinksOptions linkOpts)
        {
            // This is RSS or Sitemap
            XmlDocument xmlDoc = new XmlDocument();
            // The markup may be untrusted: without a resolver a DOCTYPE cannot make the
            // parser fetch external DTDs or entities (XXE, slow or dead DTD hosts).
            xmlDoc.XmlResolver = null;
            try {
                xmlDoc.LoadXml(rawContent);
            } catch (Exception ex) {
                Logger.Error("Invalid XML!", ex);
                Logger.Debug(rawContent);
                return new List<ParsedContent>();
            }

            var parsed = new ParsedContent();
            XmlElement root = xmlDoc.DocumentElement;

            if (root.Name == "urlset") {
                // this is a sitemap
                XmlNamespaceManager mgr = new XmlNamespaceManager(xmlDoc.NameTable);
                mgr.AddNamespace("ns", "http://www.sitemaps.org/schemas/sitemap/0.9");

                foreach (XmlElement xmlUrl in root.SelectNodes("//ns:url", mgr)) {
                    string loc = ChildTextOrNull(xmlUrl, "loc");
                    if (loc == null)
                        continue; // entry without <loc>: nothing to follow, skip instead of crashing

                    parsed.Links.Add(new ParsedLink() {
                        Url = loc.Trim()
                    });
                }

            } else if (root.Name == "rss") {

                XmlElement channel = root["channel"];
                if (channel != null) {
                    // Channel-level fields are optional; a property is only assigned
                    // when the corresponding element exists.
                    string title = ChildTextOrNull(channel, "title");
                    if (title != null)
                        parsed.Title = title;

                    string description = ChildTextOrNull(channel, "description");
                    if (description != null)
                        parsed.Description = description;

                    string author = ChildTextOrNull(channel, "managingEditor");
                    if (author != null)
                        parsed.Author = author;

                    string channelLink = ChildTextOrNull(channel, "link");
                    if (channelLink != null)
                        parsed.Metadata["link"] = channelLink;

                    foreach (XmlElement xmlItem in channel.SelectNodes("item")) {
                        string itemUrl = ChildTextOrNull(xmlItem, "link");
                        if (itemUrl == null)
                            continue; // item without <link>: nothing to follow, skip instead of crashing

                        var link = new ParsedLink();
                        link.Url = itemUrl.Trim();

                        string itemTitle = ChildTextOrNull(xmlItem, "title");
                        if (itemTitle != null)
                            link.Title = itemTitle.Trim();

                        string itemDescription = ChildTextOrNull(xmlItem, "description");
                        if (itemDescription != null)
                            link.Description = itemDescription.Trim();

                        parsed.Links.Add(link);
                    }
                }
            }

            parsed.LinkOpts = new FollowLinksOptions();
            parsed.LinkOpts.Follow = true;
            parsed.LinkOpts.CurrentDepth = 1;

            return new ParsedContent[] { parsed };
        }

        /// <summary>
        /// Returns the inner text of the first child element with the given name,
        /// or null when the parent or that child does not exist.
        /// </summary>
        private static string ChildTextOrNull(XmlElement parent, string childName)
        {
            if (parent == null)
                return null;

            XmlElement child = parent[childName];
            return child == null ? null : child.InnerText;
        }
예제 #5
0
 /// <summary>
 /// Placeholder: parsing directly from a URL is not implemented by this parser.
 /// </summary>
 /// <param name="url">Unused.</param>
 /// <param name="linkOpts">Unused.</param>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public IList<ParsedContent> ParseUrl(Uri url, FollowLinksOptions linkOpts)
 {
     throw new NotImplementedException();
 }
예제 #6
0
 /// <summary>
 /// Reads the whole file as text and hands it to <c>ParseRaw</c>.
 /// </summary>
 /// <param name="filePath">Path of the file to parse.</param>
 /// <param name="linkOpts">Options passed through to <c>ParseRaw</c> unchanged.</param>
 /// <returns>
 /// The result of <c>ParseRaw</c>, or an empty array when the file does not exist.
 /// </returns>
 public IList<ParsedContent> ParseFile(string filePath, FollowLinksOptions linkOpts)
 {
     if (File.Exists(filePath)) {
         string fileText = File.ReadAllText(filePath);
         return ParseRaw(fileText, linkOpts);
     }

     return new ParsedContent[0];
 }
예제 #7
0
 /// <summary>
 /// Reads the stream to its end as text and hands the text to <c>ParseRaw</c>.
 /// </summary>
 /// <param name="s">
 /// Stream to read. It is wrapped in a <c>StreamReader</c> that is disposed
 /// before this method returns.
 /// </param>
 /// <param name="linkOpts">Options passed through to <c>ParseRaw</c> unchanged.</param>
 /// <returns>The result of <c>ParseRaw</c> for the stream's text.</returns>
 public IList<ParsedContent> ParseStream(Stream s, FollowLinksOptions linkOpts)
 {
     using (StreamReader reader = new StreamReader(s)) {
         string streamText = reader.ReadToEnd();
         IList<ParsedContent> result = ParseRaw(streamText, linkOpts);
         return result;
     }
 }
 /// <summary>
 /// Creates an empty result: no metadata entries, no links, and fresh
 /// link-following options.
 /// </summary>
 public ParsedContent()
 {
     this.Metadata = new Dictionary<string, string>();
     this.Links = new List<ParsedLink>();
     this.LinkOpts = new FollowLinksOptions();
 }