コード例 #1
0
        /// <summary>
        /// Parses an XHTML string and collects the resources it references.
        /// </summary>
        /// <remarks>
        /// Leading and trailing spaces, byte-order marks (U+FEFF), carriage returns and
        /// line feeds are stripped before parsing. When nothing is left after trimming,
        /// a document wrapping an empty <see cref="XDocument"/> is returned and no
        /// resource lists are assigned.
        /// </remarks>
        /// <param name="content">The document to parse.</param>
        /// <returns>The parsed document.</returns>
        /// <exception cref="System.Xml.XmlException">
        /// The trimmed content is not well-formed XML.
        /// </exception>
        public static XHtmlDocument Parse(string content)
        {
            var strContent = content.Trim(new[] { ' ', '\uFEFF', '\r', '\n' });

            // Test the trimmed text rather than the raw input: content made up only of
            // trimmed characters would otherwise reach XDocument.Parse as an empty string
            // and fail there instead of producing an empty document.
            if (strContent.Length == 0)
            {
                return new XHtmlDocument(new XDocument());
            }

            XDocument doc = XDocument.Parse(strContent);
            XHtmlDocument htmDoc = new XHtmlDocument(doc);

            if (doc.Root != null)
            {
                // Elements are looked up in the XHTML namespace; the attributes
                // themselves are unqualified.
                XNamespace ns = "http://www.w3.org/1999/xhtml";
                htmDoc.Images = CollectAttributeValues(doc.Root, ns + "img", "src");
                htmDoc.Links = CollectAttributeValues(doc.Root, ns + "link", "href");
                htmDoc.Scripts = CollectAttributeValues(doc.Root, ns + "script", "src");
            }

            return htmDoc;
        }

        /// <summary>
        /// Returns the distinct values of one attribute across all descendants of
        /// <paramref name="root"/> that have the given element name.
        /// </summary>
        /// <param name="root">Element whose descendants are searched.</param>
        /// <param name="elementName">Qualified name of the elements to inspect.</param>
        /// <param name="attributeName">Name of the attribute to read; elements without it are skipped.</param>
        /// <returns>The distinct attribute values, in order of first occurrence.</returns>
        private static string[] CollectAttributeValues(XElement root, XName elementName, XName attributeName)
        {
            return (from element in root.Descendants(elementName)
                    let attribute = element.Attribute(attributeName)
                    where attribute != null
                    select attribute.Value).Distinct().ToArray();
        }
コード例 #2
0
        /// <summary>
        /// Queues every script, link and image referenced by an HTML document for download.
        /// </summary>
        /// <remarks>
        /// Each reference is turned into a lower-cased "<c>origdir/reference</c>" path and
        /// added to <c>currentStreams</c>: scripts at high priority, links at medium
        /// priority and images at low priority. The list is sorted once all entries
        /// have been added.
        /// </remarks>
        /// <param name="origdir">Original path that the document's references are appended to.</param>
        /// <param name="document">The document whose scripts, links and images are queued.</param>
        private static void MakeDownloadList(string origdir, XHtmlDocument document)
        {
            // Paths are machine-readable identifiers, so they are lower-cased with the
            // invariant culture; the culture-sensitive ToLower() would, for example, turn
            // "I" into a dotless "ı" under a Turkish locale and yield a different path.
            foreach (var script in document.Scripts)
            {
                string path = string.Format("{0}/{1}", origdir.ToLowerInvariant(), script.ToLowerInvariant());
                currentStreams.Add(new PriorityRecord(PriorityLevel.HighPriority, path));
            }

            foreach (var link in document.Links)
            {
                string path = string.Format("{0}/{1}", origdir.ToLowerInvariant(), link.ToLowerInvariant());
                currentStreams.Add(new PriorityRecord(PriorityLevel.MediumPriority, path));
            }

            foreach (var image in document.Images)
            {
                string path = string.Format("{0}/{1}", origdir.ToLowerInvariant(), image.ToLowerInvariant());
                currentStreams.Add(new PriorityRecord(PriorityLevel.LowPriority, path));
            }

            // sort by priority, name
            currentStreams.SortByPriority();
        }
コード例 #3
0
        /// <summary>
        /// Parses an XHTML string and records the images, links and scripts it references.
        /// </summary>
        /// <remarks>
        /// Spaces, byte-order marks (U+FEFF), carriage returns and line feeds are trimmed
        /// from both ends before parsing. If the trimmed text is empty, a document
        /// wrapping an empty <see cref="XDocument"/> is returned and no resource lists
        /// are assigned.
        /// </remarks>
        /// <param name="content">The document to parse.</param>
        /// <returns>The parsed document.</returns>
        /// <exception cref="System.Xml.XmlException">
        /// The trimmed content is not well-formed XML.
        /// </exception>
        public static XHtmlDocument Parse(string content)
        {
            string trimmed = content.Trim(new[] { ' ', '\uFEFF', '\r', '\n' });

            // XDocument.Parse rejects an empty string, so hand back an empty document
            // instead of letting blank input surface as a parse failure.
            if (trimmed.Length == 0)
            {
                return new XHtmlDocument(new XDocument());
            }

            XDocument doc = XDocument.Parse(trimmed);
            XHtmlDocument htmDoc = new XHtmlDocument(doc);

            if (doc.Root != null)
            {
                // Elements are matched in the XHTML namespace; attributes are unqualified.
                XNamespace ns = "http://www.w3.org/1999/xhtml";

                htmDoc.Images = doc.Root.Descendants(ns + "img")
                    .Select(img => img.Attribute("src"))
                    .Where(src => src != null)
                    .Select(src => src.Value)
                    .Distinct()
                    .ToArray();

                htmDoc.Links = doc.Root.Descendants(ns + "link")
                    .Select(link => link.Attribute("href"))
                    .Where(href => href != null)
                    .Select(href => href.Value)
                    .Distinct()
                    .ToArray();

                htmDoc.Scripts = doc.Root.Descendants(ns + "script")
                    .Select(script => script.Attribute("src"))
                    .Where(src => src != null)
                    .Select(src => src.Value)
                    .Distinct()
                    .ToArray();
            }

            return htmDoc;
        }