示例#1
0
        /// <summary>
        /// <para>Measure how much of an element's text sits inside links.</para>
        /// <para>The result is the combined inner-text length of the <c>a</c> elements found under the element, divided by the inner-text length of the element itself.</para>
        /// </summary>
        /// <param name="element">Element to operate on</param>
        /// <returns>0 when the element has no inner text; otherwise linked text length divided by total text length</returns>
        internal static float GetLinkDensity(IElement element)
        {
            var totalChars = NodeUtility.GetInnerText(element).Length;

            // Nothing to measure against: report zero instead of dividing by zero.
            if (totalChars == 0)
            {
                return 0;
            }

            var anchors     = element.GetElementsByTagName("a");
            var linkedChars = 0f;

            // XXX implement _reduceNodeList?
            NodeUtility.ForEachNode(anchors, (anchor) =>
            {
                // Add up the text length of each anchor in turn.
                var anchorText = NodeUtility.GetInnerText(anchor as IElement);

                linkedChars += anchorText.Length;
            });

            var density = linkedChars / totalChars;

            return density;
        }
示例#2
0
        /// <summary>
        /// Get the article title
        /// </summary>
        /// <remarks>
        /// Starts from the document title and tries to strip site names and section
        /// prefixes from it: first by cutting at a space-delimited separator
        /// (|, -, \, /, &gt; or »), otherwise by cutting at a colon (unless an h1/h2
        /// carries the exact title), otherwise by using the single h1 of the document
        /// when the title is very long or very short. If the result ends up too short,
        /// the original title is returned instead.
        /// </remarks>
        /// <param name="doc">The document</param>
        /// <returns>
        /// The clean title
        /// </returns>
        public static string GetArticleTitle(IHtmlDocument doc)
        {
            var curTitle  = "";
            var origTitle = "";

            try
            {
                // A missing title is treated as an empty one.
                curTitle = origTitle = doc.Title?.Trim() ?? "";
            }
            catch (Exception) { /* best effort: ignore exceptions reading the title and continue with an empty one. */ }

            var titleHadHierarchicalSeparators = false;

            // Number of whitespace-delimited chunks in the string (an empty string counts as 1).
            int WordCount(String str)
            {
                return(Regex.Split(str, @"\s+").Length);
            }

            // If there's a separator in the title, first remove the final part.
            // A separator only counts when it is surrounded by spaces, which is what the
            // first replacement pattern below requires; a bare '-' or '/' inside a word
            // (e.g. "Wi-Fi") must not prevent the colon and h1 fallbacks from running.
            if (Regex.IsMatch(curTitle, @" [\|\-\\\/>»] "))
            {
                titleHadHierarchicalSeparators = Regex.IsMatch(curTitle, @" [\\\/>»] ");
                curTitle = Regex.Replace(origTitle, @"(.*) [\|\-\\\/>»] .*", "$1");

                // If the resulting title is too short (fewer than 3 words), remove
                // the first part instead:
                if (WordCount(curTitle) < 3)
                {
                    curTitle = Regex.Replace(origTitle, @"[^\|\-\\\/>»]* [\|\-\\\/>»](.*)", "$1");
                }
            }
            else if (curTitle.IndexOf(": ", StringComparison.Ordinal) != -1)
            {
                // Check if we have an heading containing this exact string, so we
                // could assume it's the full title.
                var headings = NodeUtility.ConcatNodeLists(
                    doc.GetElementsByTagName("h1"),
                    doc.GetElementsByTagName("h2")
                    );
                var trimmedTitle = curTitle.Trim();
                var match        = NodeUtility.SomeNode(headings, (heading) =>
                {
                    return(heading.TextContent.Trim() == trimmedTitle);
                });

                // If we don't, let's extract the title out of the original title string.
                if (!match)
                {
                    curTitle = origTitle.Substring(origTitle.LastIndexOf(':') + 1);

                    // If the title is now too short, try the first colon instead:
                    if (WordCount(curTitle) < 3)
                    {
                        curTitle = origTitle.Substring(origTitle.IndexOf(':') + 1);
                    }
                }
            }
            else if (curTitle.Length > 150 || curTitle.Length < 15)
            {
                // Suspiciously long or short title: prefer the h1, but only when
                // the document has exactly one.
                var hOnes = doc.GetElementsByTagName("h1");

                if (hOnes.Length == 1)
                {
                    curTitle = NodeUtility.GetInnerText(hOnes[0]);
                }
            }

            curTitle = curTitle.Trim();

            // If we now have 4 words or fewer as our title, and either no
            // 'hierarchical' separators (\, /, > or ») were found in the original
            // title or the word count is not exactly one less than that of the
            // original title with its separators collapsed, use the original title.
            var curTitleWordCount = WordCount(curTitle);

            if (curTitleWordCount <= 4 && (
                    !titleHadHierarchicalSeparators ||
                    curTitleWordCount != WordCount(Regex.Replace(origTitle, @"[\|\-\\\/>»: ]+", " ")) - 1))
            {
                curTitle = origTitle;
            }

            return(curTitle);
        }