Esempio n. 1
0
        /// <summary>
        /// Determine whether an element has any block-level descendant, i.e. a
        /// descendant element whose tag name is listed in <c>divToPElems</c>.
        /// </summary>
        /// <param name="element">Element to operate on; may be null</param>
        /// <returns>
        /// true when a child element, at any depth, has a tag name found in
        /// <c>divToPElems</c>; otherwise the result of the node search
        /// </returns>
        internal static bool HasChildBlockElement(IElement element)
        {
            // A null element yields a null node list, which is handed to SomeNode as-is.
            return(NodeUtility.SomeNode(element?.ChildNodes, (node) =>
            {
                // Only element nodes have a tag name to test or children worth
                // descending into; text/comment nodes can never match.
                if (!(node is IElement child))
                {
                    return(false);
                }

                return(divToPElems.Contains(child.TagName) ||
                       HasChildBlockElement(child));
            }));
        }
Esempio n. 2
0
        /// <summary>
        /// Tell whether the element wraps exactly one child element, carrying the
        /// given tag, with nothing but whitespace text around it.
        /// The answer is false when there are zero or several child elements, when
        /// the only child element has a different tag, or when any text node
        /// matches the "hasContent" expression.
        /// </summary>
        /// <param name="element">Element to operate on</param>
        /// <param name="tag">Tag name expected on the single child element</param>
        /// <returns>bool</returns>
        internal static bool HasSingleTagInsideElement(IElement element, string tag)
        {
            // Exactly one element child is allowed, and it must carry the requested tag
            bool hasLoneMatchingChild = element.Children.Length == 1 &&
                                        element.Children[0].TagName == tag;

            if (!hasLoneMatchingChild)
            {
                return(false);
            }

            // Any text node with real content disqualifies the element
            bool hasMeaningfulText = NodeUtility.SomeNode(
                element.ChildNodes,
                (child) => child.NodeType == NodeType.Text &&
                           regExps["hasContent"].IsMatch(child.TextContent));

            return(!hasMeaningfulText);
        }
Esempio n. 3
0
        /// <summary>
        /// Get the article title, starting from the document title and removing a
        /// leading or trailing separator-delimited part where that looks safe.
        /// </summary>
        /// <param name="doc">The document</param>
        /// <returns>
        /// The clean title. The trimmed original document title is returned instead
        /// when the cleaned candidate has 4 words or fewer and either no
        /// hierarchical separator was found or too many words were dropped.
        /// </returns>
        public static string GetArticleTitle(IHtmlDocument doc)
        {
            var curTitle  = "";
            var origTitle = "";

            try
            {
                // Title is statically a string, so no runtime type check is needed;
                // a missing title is treated as empty.
                curTitle = origTitle = (doc.Title ?? "").Trim();
            }
            catch (Exception) { /* ignore exceptions setting the title. */ }

            var titleHadHierarchicalSeparators = false;

            // Number of whitespace-delimited chunks; an empty string counts as 1
            int wordCount(String str)
            {
                return(Regex.Split(str, @"\s+").Length);
            }

            // If there's a separator in the title, first remove the final part.
            // A separator only counts when surrounded by spaces, which is exactly
            // what the replacement patterns below require; otherwise hyphenated
            // words or bare slashes would take this branch, change nothing, and
            // skip the colon and length fallbacks.
            if (Regex.IsMatch(curTitle, @" [\|\-\\\/>»] "))
            {
                titleHadHierarchicalSeparators = Regex.IsMatch(curTitle, @" [\\\/>»] ");
                curTitle = Regex.Replace(origTitle, @"(.*) [\|\-\\\/>»] .*", "$1", RegexOptions.IgnoreCase);

                // If the resulting title is too short (fewer than 3 words), remove
                // the first part instead:
                if (wordCount(curTitle) < 3)
                {
                    curTitle = Regex.Replace(origTitle, @"[^\|\-\\\/>»]* [\|\-\\\/>»](.*)", "$1", RegexOptions.IgnoreCase);
                }
            }
            else if (curTitle.IndexOf(": ") != -1)
            {
                // Check if we have an heading containing this exact string, so we
                // could assume it's the full title.
                var headings = NodeUtility.ConcatNodeLists(
                    doc.GetElementsByTagName("h1"),
                    doc.GetElementsByTagName("h2")
                    );
                var trimmedTitle = curTitle.Trim();
                var match        = NodeUtility.SomeNode(headings, (heading) =>
                {
                    return(heading.TextContent.Trim() == trimmedTitle);
                });

                // If we don't, let's extract the title out of the original title string.
                if (!match)
                {
                    curTitle = origTitle.Substring(origTitle.LastIndexOf(':') + 1);

                    // If the title is now too short, try the first colon instead:
                    if (wordCount(curTitle) < 3)
                    {
                        curTitle = origTitle.Substring(origTitle.IndexOf(':') + 1);
                    }
                }
            }
            else if (curTitle.Length > 150 || curTitle.Length < 15)
            {
                // Implausibly long or short title: prefer the page's only <h1>, if any
                var hOnes = doc.GetElementsByTagName("h1");

                if (hOnes.Length == 1)
                {
                    curTitle = NodeUtility.GetInnerText(hOnes[0]);
                }
            }

            curTitle = curTitle.Trim();

            // If we now have 4 words or fewer as our title, and either no
            // 'hierarchical' separators (\, /, > or ») were found in the original
            // title or we decreased the number of words by more than 1 word, use
            // the original title.
            var curTitleWordCount = wordCount(curTitle);

            if (curTitleWordCount <= 4 && (
                    !titleHadHierarchicalSeparators ||
                    curTitleWordCount != wordCount(Regex.Replace(origTitle, @"[\|\-\\\/>»: ]+", " ", RegexOptions.IgnoreCase)) - 1))
            {
                curTitle = origTitle;
            }

            return(curTitle);
        }