Ejemplo n.º 1
0
 /// <summary>
 /// Removes every node that <c>GetElementsByTagName("img")</c> returns for the given node.
 /// </summary>
 /// <param name="articleNode">Node that is queried for <c>img</c> elements.</param>
 private static void DeleteImageNodes(HtmlNode articleNode)
 {
     foreach (var imageNode in articleNode.GetElementsByTagName("img"))
     {
         imageNode.Remove();
     }
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Computes how much of a node's text is contained in its <c>a</c> elements.
        /// </summary>
        /// <param name="node">Node to measure.</param>
        /// <returns>
        /// The summed inner-text length of the node's <c>a</c> elements divided by the
        /// inner-text length of the node itself, or 0 when the node has no inner text.
        /// </returns>
        private static double GetLinkDensity(HtmlNode node)
        {
            var textLength = GetInnerText(node).Length;
            if (textLength == 0)
            {
                // Dividing by a zero length would produce NaN (0/0) or infinity,
                // which silently poisons any later comparison or score arithmetic.
                return 0.0;
            }

            var linkLength = node.GetElementsByTagName("a").Sum(link => GetInnerText(link).Length);

            return (double)linkLength / textLength;
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Removes every node that <c>GetElementsByTagName("script")</c> returns for the given node.
 /// </summary>
 /// <param name="node">Node that is queried for <c>script</c> elements.</param>
 private static void RemoveScripts(HtmlNode node)
 {
     var scriptNodes = node.GetElementsByTagName("script");
     foreach (var scriptNode in scriptNodes)
     {
         scriptNode.Remove();
     }
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Derives an article title from the first <c>title</c> element of the given node,
        /// trying to strip site-name style prefixes or suffixes.
        /// </summary>
        /// <param name="htmlNode">Node that is queried for <c>title</c> and <c>h1</c> elements.</param>
        /// <returns>
        /// The trimmed title, or <c>null</c> when no <c>title</c> element is found.
        /// </returns>
        private static string GetArticleTitle(HtmlNode htmlNode)
        {
            var titleNode = htmlNode.GetElementsByTagName("title").FirstOrDefault();
            if (titleNode == null)
            {
                return null;
            }

            var origTitle = GetInnerText(titleNode);
            var currTitle = origTitle;

            if (Regex.IsMatch(currTitle, @" [\|\-] "))
            {
                // Greedy match: keep everything before the last "| " / "- " separator.
                currTitle = Regex.Replace(origTitle, @"(.*)[\|\-] .*", "$1");

                if (currTitle.Split(' ').Length < 3)
                {
                    // Too short, so take what follows the first separator instead.
                    // This has to be Regex.Replace: string.Replace treats the pattern
                    // as literal text, never matches, and leaves the title untouched.
                    currTitle = Regex.Replace(origTitle, @"[^\|\-]*[\|\-](.*)", "$1");
                }
            }
            else if (currTitle.Contains(": "))
            {
                // Greedy match: keep everything after the last colon.
                currTitle = Regex.Replace(origTitle, @".*:(.*)", "$1");

                if (currTitle.Split(' ').Length < 3)
                {
                    // Too short, so keep everything after the first colon instead.
                    currTitle = Regex.Replace(origTitle, @"[^:]*[:](.*)", "$1");
                }
            }
            else if (currTitle.Length > 150 || currTitle.Length < 15)
            {
                // Implausible title length: use the h1 text, but only when it is unambiguous.
                var hOnes = htmlNode.GetElementsByTagName("h1");
                if (hOnes.Count == 1)
                {
                    currTitle = GetInnerText(hOnes[0]);
                }
            }

            // A candidate that splits into four or fewer space-separated parts is
            // discarded in favor of the unmodified title text.
            if (currTitle.Split(' ').Length <= 4)
            {
                currTitle = origTitle;
            }

            return currTitle.Trim();
        }