/// <summary>
/// Produces the whitespace that should precede the rendered output of <paramref name="node"/>.
/// </summary>
/// <param name="node">Node whose parent and <c>ol</c>/<c>ul</c> ancestors are inspected.</param>
/// <returns>
/// Four spaces per <c>ol</c>/<c>ul</c> ancestor when the parent is an <c>li</c> whose
/// <c>FirstChild</c> is not <paramref name="node"/>; otherwise two line breaks.
/// </returns>
private string IndentationFor(HtmlNode node)
{
    int orderedDepth = node.Ancestors("ol").Count();
    int unorderedDepth = node.Ancestors("ul").Count();
    int listDepth = orderedDepth + unorderedDepth;

    HtmlNode parent = node.ParentNode;
    bool isLaterChildOfListItem =
        parent.Name.ToLowerInvariant() == "li" && parent.FirstChild != node;

    if (!isLaterChildOfListItem)
    {
        return Environment.NewLine + Environment.NewLine;
    }

    // Four spaces of indentation for every enclosing list.
    return new string(' ', listDepth * 4);
}
/// <summary>
/// Computes text statistics for <paramref name="node"/> and renders them as one
/// tab-separated line.
/// </summary>
/// <param name="node">Node whose <c>InnerText</c> is measured.</param>
/// <returns>
/// An empty string when the text contains no words; otherwise eight tab-separated fields:
/// word length, word count, number of sentences, sentence length, "Yes"/"No" for whether
/// the node has an <c>a</c> ancestor, the link ratio formatted with the en-GB culture,
/// upper-case count, and "Yes"/"No" for whether the text contains punctuation.
/// </returns>
private string AnalyzeNode(HtmlNode node)
{
    string text = node.InnerText;
    int wordLength = Extractor.WordLength(text);
    int wordCount = Extractor.WordCount(text);
    int upperCaseCount = Extractor.UpperCaseCount(text);
    bool finishedSentence = Extractor.ContainsPunctuation(text);

    if (wordCount == 0)
    {
        return "";
    }

    bool insideAnchor = node.Ancestors("a").Any();
    double linkRatio = 0;
    int averageSentenceLength = Extractor.SentenceLength(text);
    int sentenceCount = Extractor.NrOfSentences(text);

    if (insideAnchor)
    {
        // Climb past every <a> element and every element whose class contains "x-nc-sel"
        // to reach the container whose word count the link text is compared against.
        HtmlNode container = node.ParentNode;
        while (true)
        {
            var classAttribute = container.Attributes["class"];
            string cssClass = classAttribute == null ? null : classAttribute.Value;

            if (container.Name != "a" && (cssClass == null || cssClass.IndexOf("x-nc-sel") == -1))
            {
                break;
            }

            container = container.ParentNode;
        }

        int containerWords = Extractor.WordCount(container.InnerText);
        linkRatio = (double)wordCount / (double)containerWords;
    }

    string linkFlag = insideAnchor ? "Yes" : "No";
    string ratioText = linkRatio.ToString("##0.00###", CultureInfo.CreateSpecificCulture("en-GB"));
    string sentenceFlag = finishedSentence ? "Yes" : "No";

    return String.Format(
        "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}",
        wordLength,
        wordCount,
        sentenceCount,
        averageSentenceLength,
        linkFlag,
        ratioText,
        upperCaseCount,
        sentenceFlag);
}
/// <summary>
/// Tries to find the city of a hall by XPath, starting from the first <c>td</c> ancestor
/// of <paramref name="htmlNode"/>.
/// </summary>
/// <param name="htmlNode">Node from which the <c>td</c> ancestors are searched.</param>
/// <returns>
/// The inner HTML of the <c>h5</c> element matched by the XPath, or <c>null</c> when there
/// is no matching <c>td</c> ancestor or the XPath selects nothing.
/// </returns>
private static string TryToGetCity(HtmlNode htmlNode)
{
    // NOTE(review): the fallback passed to GetAttributeValue is "", so this predicate
    // presumably never rejects a cell; kept as-is to preserve behavior - confirm whether a
    // real "hallslistwrap" check was intended.
    // FirstOrDefault (rather than First) so a node with no td ancestor yields null instead
    // of throwing InvalidOperationException.
    var td = htmlNode.Ancestors("td")
        .FirstOrDefault(t => t.GetAttributeValue("hallslistwrap", "") != null);
    if (td == null)
    {
        return null;
    }

    const string xPath1 = @"../../../../preceding::tr[1]//h5"; // XPath to the city name
    var h5 = td.SelectSingleNode(xPath1);
    return h5?.InnerHtml;
}
/// <summary>
/// Builds the indentation string for <paramref name="node"/> from its list nesting.
/// </summary>
/// <param name="node">Node whose <c>ol</c> and <c>ul</c> ancestors are counted.</param>
/// <returns>
/// A string of spaces, one fewer than the number of <c>ol</c>/<c>ul</c> ancestors,
/// and empty when there are fewer than two such ancestors.
/// </returns>
private string IndentationFor(HtmlNode node)
{
    int orderedLists = node.Ancestors("ol").Count();
    int unorderedLists = node.Ancestors("ul").Count();
    int nestingDepth = orderedLists + unorderedLists;

    // The outermost list contributes no indentation.
    int spaces = nestingDepth > 1 ? nestingDepth - 1 : 0;
    return new string(' ', spaces);
}
/// <summary>
/// Reports whether <paramref name="node"/> has an <c>i</c> or <c>em</c> ancestor.
/// </summary>
/// <param name="node">Node whose ancestors are inspected.</param>
/// <returns><c>true</c> when at least one <c>i</c> or <c>em</c> ancestor exists.</returns>
private bool AlreadyItalic(HtmlNode node)
{
    if (node.Ancestors("i").Any())
    {
        return true;
    }

    return node.Ancestors("em").Any();
}
/// <summary>
/// Builds a <c>Node</c> record for <paramref name="node"/> (text, word count, link flag and
/// link ratio) and appends it to <c>nodes</c>.
/// </summary>
/// <param name="node">HTML node whose <c>InnerText</c> is measured.</param>
private void ProcessNode(HtmlNode node)
{
    string text = node.InnerText;

    Node n = new Node();
    n.text = text;
    n.wordCount = WordCount(text);

    // "../a" selects the <a> children of this node's parent.
    HtmlNodeCollection linkNodes = node.SelectNodes("../a");

    // True when the element itself sits inside a hierarchy that contains an <a> tag.
    n.isLink = node.Ancestors("a").Any();

    double linkCount = n.isLink ? 1 : 0;
    if (linkNodes != null)
    {
        linkCount += linkNodes.Count;
    }

    // A node without words would otherwise produce NaN or Infinity from the division.
    n.linkRatio = n.wordCount > 0 ? linkCount / n.wordCount : 0.0;

    nodes.Add(n);
}
/// <summary>
/// Reports whether <paramref name="node"/> has a <c>strong</c> or <c>b</c> ancestor.
/// </summary>
/// <param name="node">Node whose ancestors are inspected.</param>
/// <returns><c>true</c> when at least one <c>strong</c> or <c>b</c> ancestor exists.</returns>
private bool AlreadyBold(HtmlNode node)
{
    bool insideStrong = node.Ancestors("strong").Any();
    if (insideStrong)
    {
        return true;
    }

    return node.Ancestors("b").Any();
}