/// <summary>
/// Parses XHTML markup and collects the resources it references.
/// </summary>
/// <remarks>
/// Leading and trailing spaces, carriage returns, line feeds and byte-order marks (U+FEFF)
/// are stripped before parsing. When nothing remains after stripping, a document wrapping an
/// empty <see cref="XDocument"/> is returned and its Images, Links and Scripts are not
/// assigned by this method.
/// Only elements in the XHTML namespace (http://www.w3.org/1999/xhtml) are inspected:
/// the <c>src</c> of <c>img</c>, the <c>href</c> of <c>link</c> and the <c>src</c> of
/// <c>script</c>. Elements lacking the attribute are skipped and duplicate values are removed.
/// </remarks>
/// <param name="content">The document to parse.</param>
/// <returns>The parsed document.</returns>
/// <exception cref="System.Xml.XmlException">The trimmed content is not well-formed XML.</exception>
public static XHtmlDocument Parse(string content)
{
    char[] padding = { ' ', '\uFEFF', '\r', '\n' };
    string trimmed = content.Trim(padding);

    // Test the trimmed text, not the raw input: content made only of padding
    // characters would otherwise reach XDocument.Parse and fail on a missing root.
    if (trimmed.Length == 0)
    {
        return new XHtmlDocument(new XDocument());
    }

    XDocument doc = XDocument.Parse(trimmed);
    XHtmlDocument htmDoc = new XHtmlDocument(doc);

    XElement root = doc.Root;
    if (root != null)
    {
        XNamespace xhtml = "http://www.w3.org/1999/xhtml";

        htmDoc.Images = root.Descendants(xhtml + "img")
            .Select(element => element.Attribute("src"))
            .Where(attribute => attribute != null)
            .Select(attribute => attribute.Value)
            .Distinct()
            .ToArray();

        htmDoc.Links = root.Descendants(xhtml + "link")
            .Select(element => element.Attribute("href"))
            .Where(attribute => attribute != null)
            .Select(attribute => attribute.Value)
            .Distinct()
            .ToArray();

        htmDoc.Scripts = root.Descendants(xhtml + "script")
            .Select(element => element.Attribute("src"))
            .Where(attribute => attribute != null)
            .Select(attribute => attribute.Value)
            .Distinct()
            .ToArray();
    }

    return htmDoc;
}
/// <summary>
/// Create download list for contents of HTML document.
/// </summary>
/// <remarks>
/// Every script is queued at high priority, every link at medium priority and every image
/// at low priority. Each queued name is the original path and the resource name joined by
/// a slash and lower-cased. The shared stream list is re-sorted afterwards.
/// </remarks>
/// <param name="origdir">Original path; prefixed to every resource name.</param>
/// <param name="document">The document whose Scripts, Links and Images are queued.</param>
private static void MakeDownloadList(string origdir, XHtmlDocument document)
{
    // A resource collection that was never assigned is skipped instead of
    // failing the whole listing with a NullReferenceException.
    if (document.Scripts != null)
    {
        foreach (var script in document.Scripts)
        {
            currentStreams.Add(new PriorityRecord(PriorityLevel.HighPriority, BuildDownloadPath(origdir, script)));
        }
    }

    if (document.Links != null)
    {
        foreach (var link in document.Links)
        {
            currentStreams.Add(new PriorityRecord(PriorityLevel.MediumPriority, BuildDownloadPath(origdir, link)));
        }
    }

    if (document.Images != null)
    {
        foreach (var image in document.Images)
        {
            currentStreams.Add(new PriorityRecord(PriorityLevel.LowPriority, BuildDownloadPath(origdir, image)));
        }
    }

    // sort by priority, name
    currentStreams.SortByPriority();
}

/// <summary>
/// Joins the original path and a resource name with a slash, lower-casing both parts.
/// </summary>
/// <param name="origdir">Original path.</param>
/// <param name="resource">Resource name as it appears in the document.</param>
/// <returns>The lower-cased "origdir/resource" string.</returns>
private static string BuildDownloadPath(string origdir, string resource)
{
    // Invariant casing keeps the result independent of the current culture
    // (culture-sensitive ToLower maps 'I' to a dotless i under Turkish cultures).
    return string.Format("{0}/{1}", origdir.ToLowerInvariant(), resource.ToLowerInvariant());
}
/// <summary>
/// Parses XHTML markup and records the images, links and scripts it references.
/// </summary>
/// <remarks>
/// Leading and trailing spaces, carriage returns, line feeds and byte-order marks (U+FEFF)
/// are removed before the text is handed to <see cref="XDocument.Parse(string)"/>.
/// Only elements in the XHTML namespace are inspected; elements without the relevant
/// attribute are ignored and duplicate attribute values are dropped.
/// </remarks>
/// <param name="content">The document to parse.</param>
/// <returns>The parsed document.</returns>
/// <exception cref="System.Xml.XmlException">The trimmed content is not well-formed XML.</exception>
public static XHtmlDocument Parse(string content)
{
    char[] padding = { ' ', '\uFEFF', '\r', '\n' };
    XDocument parsed = XDocument.Parse(content.Trim(padding));
    XHtmlDocument result = new XHtmlDocument(parsed);

    // Without a root element there is nothing to scan.
    if (parsed.Root == null)
    {
        return result;
    }

    XNamespace xhtml = "http://www.w3.org/1999/xhtml";

    // src of every <img>
    result.Images = parsed.Root.Descendants(xhtml + "img")
        .Select(node => node.Attribute("src"))
        .Where(source => source != null)
        .Select(source => source.Value)
        .Distinct()
        .ToArray();

    // href of every <link>
    result.Links = parsed.Root.Descendants(xhtml + "link")
        .Select(node => node.Attribute("href"))
        .Where(target => target != null)
        .Select(target => target.Value)
        .Distinct()
        .ToArray();

    // src of every <script>
    result.Scripts = parsed.Root.Descendants(xhtml + "script")
        .Select(node => node.Attribute("src"))
        .Where(source => source != null)
        .Select(source => source.Value)
        .Distinct()
        .ToArray();

    return result;
}