/// <summary>
/// Determine whether the element has any block-level element among its descendants.
/// A descendant counts as block-level when its tag name is listed in <c>divToPElems</c>.
/// </summary>
/// <param name="element">Element to operate on. May be null, in which case a null node list is handed to <c>NodeUtility.SomeNode</c>.</param>
/// <returns><c>true</c> if a descendant element with a tag name from <c>divToPElems</c> is found</returns>
internal static bool HasChildBlockElement(IElement element)
{
    return NodeUtility.SomeNode(element?.ChildNodes, node =>
    {
        // Only element nodes carry a tag name and can hold further elements,
        // so anything else (text, comments, ...) is skipped without recursing.
        if (!(node is IElement child))
        {
            return false;
        }

        // Either the child itself is a block-level tag, or one of its own descendants is.
        return divToPElems.Contains(child.TagName) || HasChildBlockElement(child);
    });
}
/// <summary>
/// Check whether the element wraps exactly one child element with the given tag
/// and otherwise holds no text with real content.
/// Returns false when the element has zero or several element children, when its
/// only element child has a different tag, or when any of its text nodes matches
/// the "hasContent" expression.
/// </summary>
/// <param name="element">Element to operate on</param>
/// <param name="tag">Tag name the single child element must have (compared with ==, so casing matters)</param>
/// <returns><c>true</c> if the element contains only the single tagged child plus content-free text</returns>
internal static bool HasSingleTagInsideElement(IElement element, string tag)
{
    // Condition 1: exactly one element child, and it carries the requested tag.
    var elementChildren = element.Children;
    bool isSingleMatchingChild = elementChildren.Length == 1 && elementChildren[0].TagName == tag;
    if (!isSingleMatchingChild)
    {
        return false;
    }

    // Condition 2: none of the direct text nodes holds real content.
    bool hasMeaningfulText = NodeUtility.SomeNode(
        element.ChildNodes,
        node => node.NodeType == NodeType.Text && regExps["hasContent"].IsMatch(node.TextContent));

    return !hasMeaningfulText;
}
/// <summary>
/// Get the article title.
/// Starts from the document title and tries to strip a site name or section
/// that is separated from the real title by " | ", " - ", " \ ", " / ", " > ",
/// " » " or a colon. Falls back to the original title when the cleaned
/// result looks too short to be trustworthy.
/// </summary>
/// <param name="doc">The document</param>
/// <returns>
/// The clean title, or an empty string when the document title could not be read
/// </returns>
public static string GetArticleTitle(IHtmlDocument doc)
{
    // A character only acts as a separator when it stands alone between two spaces.
    // The detection patterns deliberately mirror the replacement patterns below, so
    // that hyphenated words ("well-known") or in-word slashes ("AC/DC") are not
    // mistaken for separators.
    const string SeparatorPattern = @" [\|\-\\\/>»] ";
    const string HierarchicalSeparatorPattern = @" [\\\/>»] ";

    var curTitle = "";
    var origTitle = "";

    try
    {
        curTitle = origTitle = doc.Title.Trim();
    }
    catch (Exception)
    {
        // Deliberately best effort: if the title cannot be read, continue with
        // empty strings instead of failing the whole extraction.
    }

    var titleHadHierarchicalSeparators = false;

    // Number of whitespace-delimited chunks. Leading or trailing whitespace
    // yields an extra empty chunk, which is counted as well.
    int CountWords(string str)
    {
        return Regex.Split(str, @"\s+").Length;
    }

    if (Regex.IsMatch(curTitle, SeparatorPattern))
    {
        // If there's a separator in the title, first remove the final part.
        titleHadHierarchicalSeparators = Regex.IsMatch(curTitle, HierarchicalSeparatorPattern);
        curTitle = Regex.Replace(origTitle, @"(.*) [\|\-\\\/>»] .*", "$1");

        // If the resulting title is too short (fewer than 3 words), remove
        // the first part instead.
        if (CountWords(curTitle) < 3)
        {
            curTitle = Regex.Replace(origTitle, @"[^\|\-\\\/>»]* [\|\-\\\/>»](.*)", "$1");
        }
    }
    else if (curTitle.Contains(": "))
    {
        // Check if we have a heading containing this exact string, so we
        // could assume it's the full title.
        var headings = NodeUtility.ConcatNodeLists(
            doc.GetElementsByTagName("h1"),
            doc.GetElementsByTagName("h2")
        );
        var trimmedTitle = curTitle.Trim();
        var match = NodeUtility.SomeNode(headings, heading => heading.TextContent.Trim() == trimmedTitle);

        // If we don't, let's extract the title out of the original title string.
        if (!match)
        {
            curTitle = origTitle.Substring(origTitle.LastIndexOf(':') + 1);

            // If the title is now too short, try the first colon instead.
            if (CountWords(curTitle) < 3)
            {
                curTitle = origTitle.Substring(origTitle.IndexOf(':') + 1);
            }
        }
    }
    else if (curTitle.Length > 150 || curTitle.Length < 15)
    {
        // Implausibly long or short title: prefer the page's only <h1>, if there is exactly one.
        var hOnes = doc.GetElementsByTagName("h1");

        if (hOnes.Length == 1)
        {
            curTitle = NodeUtility.GetInnerText(hOnes[0]);
        }
    }

    curTitle = curTitle.Trim();

    // If we now have 4 words or fewer as our title, and either no
    // 'hierarchical' separators (\, /, > or ») were found in the original
    // title or the cleaned title is not exactly one word shorter than the
    // original with its separators collapsed to spaces, use the original title.
    var curTitleWordCount = CountWords(curTitle);
    var origTitleWordCount = CountWords(Regex.Replace(origTitle, @"[\|\-\\\/>»: ]+", " "));

    if (curTitleWordCount <= 4
        && (!titleHadHierarchicalSeparators || curTitleWordCount != origTitleWordCount - 1))
    {
        curTitle = origTitle;
    }

    return curTitle;
}