/// <summary>
/// Calls <c>Remove()</c> on every element that <c>GetElementsByTagName("img")</c>
/// returns for the given node.
/// </summary>
/// <param name="articleNode">Node whose <c>img</c> elements are looked up and removed.</param>
private static void DeleteImageNodes(HtmlNode articleNode)
{
    // Look the elements up once, then remove them one by one.
    var images = articleNode.GetElementsByTagName("img");

    foreach (var img in images)
    {
        img.Remove();
    }
}
/// <summary>
/// Computes how much of a node's text belongs to its <c>a</c> elements.
/// </summary>
/// <param name="node">Node to measure.</param>
/// <returns>
/// The summed inner-text length of the node's <c>a</c> elements divided by the
/// inner-text length of the node itself, or <c>0</c> when the node has no inner text.
/// </returns>
private static double GetLinkDensity(HtmlNode node)
{
    var textLength = GetInnerText(node).Length;
    if (textLength == 0)
    {
        // Without this guard the division below yields NaN (0/0) or Infinity (n/0),
        // which makes every threshold comparison and score calculation unreliable.
        return 0;
    }

    var links = node.GetElementsByTagName("a");
    var linkLength = links.Sum(l => GetInnerText(l).Length);

    // Cast before dividing so the result is a fractional ratio, not an integer quotient.
    return (double)linkLength / textLength;
}
/// <summary>
/// Calls <c>Remove()</c> on every element that <c>GetElementsByTagName("script")</c>
/// returns for the given node.
/// </summary>
/// <param name="node">Node whose <c>script</c> elements are looked up and removed.</param>
private static void RemoveScripts(HtmlNode node)
{
    var scriptNodes = node.GetElementsByTagName("script");

    foreach (var scriptNode in scriptNodes)
    {
        scriptNode.Remove();
    }
}
/// <summary>
/// Derives an article title from the first <c>title</c> element found for the node,
/// attempting to strip site-name style prefixes/suffixes separated by
/// <c>|</c>, <c>-</c> or <c>:</c>.
/// </summary>
/// <param name="htmlNode">Node that is searched for <c>title</c> and <c>h1</c> elements.</param>
/// <returns>
/// The trimmed title, or <c>null</c> when no <c>title</c> element is found. If the
/// shortened candidate splits into four or fewer space-separated pieces, the original
/// title text is returned (trimmed) instead.
/// </returns>
private static string GetArticleTitle(HtmlNode htmlNode)
{
    var titleNode = htmlNode.GetElementsByTagName("title").FirstOrDefault();
    if (titleNode == null)
    {
        return null;
    }

    var origTitle = GetInnerText(titleNode);
    var currTitle = origTitle;

    if (Regex.IsMatch(currTitle, @" [\|\-] "))
    {
        // Greedy match: keep the text that precedes the last "| " / "- " separator.
        currTitle = Regex.Replace(origTitle, @"(.*)[\|\-] .*", "$1");

        if (currTitle.Split(' ').Length < 3)
        {
            // Too short: keep the text that follows the first separator instead.
            // This must be Regex.Replace; string.Replace would search for the pattern
            // as literal text, never match, and leave the title unchanged.
            currTitle = Regex.Replace(origTitle, @"[^\|\-]*[\|\-](.*)", "$1");
        }
    }
    else if (currTitle.Contains(": "))
    {
        // Ordinal substring check. Keep the text after the last colon.
        currTitle = Regex.Replace(origTitle, @".*:(.*)", "$1");

        if (currTitle.Split(' ').Length < 3)
        {
            // Too short: keep the text after the first colon instead.
            currTitle = Regex.Replace(origTitle, @"[^:]*[:](.*)", "$1");
        }
    }
    else if (currTitle.Length > 150 || currTitle.Length < 15)
    {
        // Implausibly long or short title: use the h1 text, but only when there is
        // exactly one h1 so the choice is unambiguous.
        var hOnes = htmlNode.GetElementsByTagName("h1");
        if (hOnes.Count == 1)
        {
            currTitle = GetInnerText(hOnes[0]);
        }
    }

    // NOTE(review): the count is taken on the untrimmed candidate, so leading/trailing
    // spaces contribute empty pieces to the total — confirm whether that is intended.
    if (currTitle.Split(' ').Length <= 4)
    {
        currTitle = origTitle;
    }

    return currTitle.Trim();
}