/// <summary>
/// <para>Compute the link density of an element.</para>
/// <para>This is the combined length of the text found inside the element's
/// <c>a</c> elements divided by the length of the element's total text, both as
/// reported by <c>NodeUtility.GetInnerText</c>.</para>
/// </summary>
/// <param name="element">Element to operate on</param>
/// <returns>
/// Zero when the element has no text; otherwise the ratio of link text length
/// to total text length.
/// </returns>
internal static float GetLinkDensity(IElement element)
{
    var totalChars = NodeUtility.GetInnerText(element).Length;

    // Nothing to divide by: an element without text has no link density.
    if (totalChars == 0)
    {
        return 0;
    }

    // Accumulated as a float so the final division is a floating-point one.
    float anchorChars = 0;

    // XXX implement _reduceNodeList?
    NodeUtility.ForEachNode(element.GetElementsByTagName("a"), (anchor) =>
    {
        var anchorText = NodeUtility.GetInnerText(anchor as IElement);
        anchorChars += anchorText.Length;
    });

    return anchorChars / totalChars;
}
/// <summary>
/// Get the article title
/// </summary>
/// <remarks>
/// Starts from the document's title and tries to strip site names or section
/// prefixes from it: first by cutting at a space-delimited separator
/// (|, -, \, /, &gt; or »), otherwise by cutting at a colon, otherwise by
/// falling back to the single <c>h1</c> when the title is very long or very
/// short. If the cleaned result ends up too short, the original title is
/// returned instead.
/// </remarks>
/// <param name="doc">The document</param>
/// <returns>
/// The clean title
/// </returns>
public static string GetArticleTitle(IHtmlDocument doc)
{
    // A separator only counts when it is surrounded by spaces ("Story | Site"),
    // which is exactly what the replacement patterns below require. Detecting
    // bare characters would send hyphenated titles ("Spider-Man: ...") into the
    // separator branch, where nothing is stripped and the colon / <h1>
    // fallbacks are skipped.
    const string separatorPattern = @" [\|\-\\\/>»] ";
    const string hierarchicalSeparatorPattern = @" [\\\/>»] ";

    var curTitle = "";
    var origTitle = "";

    try
    {
        var rawTitle = doc.Title;
        if (rawTitle != null)
        {
            curTitle = origTitle = rawTitle.Trim();
        }
    }
    catch (Exception)
    {
        // Best effort: if the title cannot be read, continue with an empty one.
    }

    var titleHadHierarchicalSeparators = false;

    // Number of whitespace-delimited chunks in the string (an empty string counts as 1).
    int WordCount(string str)
    {
        return Regex.Split(str, @"\s+").Length;
    }

    // If there's a separator in the title, first remove the final part
    if (Regex.IsMatch(curTitle, separatorPattern))
    {
        titleHadHierarchicalSeparators = Regex.IsMatch(curTitle, hierarchicalSeparatorPattern);
        curTitle = Regex.Replace(origTitle, @"(.*) [\|\-\\\/>»] .*", "$1");

        // If the resulting title is too short (fewer than 3 words), remove
        // the first part instead:
        if (WordCount(curTitle) < 3)
        {
            curTitle = Regex.Replace(origTitle, @"[^\|\-\\\/>»]* [\|\-\\\/>»](.*)", "$1");
        }
    }
    else if (curTitle.Contains(": "))
    {
        // Check if we have a heading containing this exact string, so we
        // could assume it's the full title.
        var headings = NodeUtility.ConcatNodeLists(
            doc.GetElementsByTagName("h1"),
            doc.GetElementsByTagName("h2")
        );
        var trimmedTitle = curTitle.Trim();
        var match = NodeUtility.SomeNode(headings, (heading) =>
        {
            return heading.TextContent.Trim() == trimmedTitle;
        });

        // If we don't, let's extract the title out of the original title string.
        if (!match)
        {
            curTitle = origTitle.Substring(origTitle.LastIndexOf(':') + 1);

            // If the title is now too short, try the first colon instead:
            if (WordCount(curTitle) < 3)
            {
                curTitle = origTitle.Substring(origTitle.IndexOf(':') + 1);
            }
        }
    }
    else if (curTitle.Length > 150 || curTitle.Length < 15)
    {
        // Implausibly long or short title: prefer the page's only <h1>, if there is exactly one.
        var hOnes = doc.GetElementsByTagName("h1");

        if (hOnes.Length == 1)
        {
            curTitle = NodeUtility.GetInnerText(hOnes[0]);
        }
    }

    curTitle = curTitle.Trim();

    // If we now have 4 words or fewer as our title, and either no
    // 'hierarchical' separators (\, /, > or ») were found in the original
    // title or the word count is not exactly one less than that of the
    // original title with its separators collapsed, use the original title.
    var curTitleWordCount = WordCount(curTitle);
    if (curTitleWordCount <= 4)
    {
        var origWithoutSeparators = Regex.Replace(origTitle, @"[\|\-\\\/>»: ]+", " ");

        if (!titleHadHierarchicalSeparators
            || curTitleWordCount != WordCount(origWithoutSeparators) - 1)
        {
            curTitle = origTitle;
        }
    }

    return curTitle;
}