예제 #1
0
 /// <summary>
 /// Produces the text to place in front of <paramref name="node"/>.
 /// </summary>
 /// <param name="node">Node whose list nesting and parent are inspected.</param>
 /// <returns>
 /// Four spaces per enclosing <c>ol</c>/<c>ul</c> ancestor when the parent is an
 /// <c>li</c> and the node is not that parent's first child; otherwise two line breaks.
 /// </returns>
 private string IndentationFor(HtmlNode node)
 {
     // Nesting depth: every enclosing ordered or unordered list counts once.
     int orderedLists = node.Ancestors("ol").Count();
     int unorderedLists = node.Ancestors("ul").Count();
     int depth = orderedLists + unorderedLists;

     HtmlNode parent = node.ParentNode;
     bool parentIsListItem = parent.Name.ToLowerInvariant() == "li";

     if (parentIsListItem && parent.FirstChild != node)
     {
         return new string(' ', depth * 4);
     }

     return Environment.NewLine + Environment.NewLine;
 }
예제 #2
0
 /// <summary>
 /// Builds one tab-separated row of text metrics for the inner text of <paramref name="node"/>.
 /// </summary>
 /// <param name="node">Node whose <c>InnerText</c> is measured.</param>
 /// <returns>
 /// An empty string when the text contains no words; otherwise eight tab-separated columns:
 /// word length, word count, number of sentences, sentence length, link flag ("Yes"/"No"),
 /// link ratio (formatted with the en-GB culture), upper-case count and punctuation flag ("Yes"/"No").
 /// </returns>
 private string AnalyzeNode(HtmlNode node)
 {
     string curText = node.InnerText;
     int wordCount = Extractor.WordCount(curText);
     if (wordCount == 0)
     {
         // Nothing to describe: skip the remaining metrics entirely.
         return "";
     }

     int wordLength = Extractor.WordLength(curText);
     int upperCaseCount = Extractor.UpperCaseCount(curText);
     bool finishedSentence = Extractor.ContainsPunctuation(curText);
     int averageSentenceLength = Extractor.SentenceLength(curText);
     int nrOfSentences = Extractor.NrOfSentences(curText);

     bool isLink = node.Ancestors("a").Any();
     double linkRatio = 0;
     if (isLink)
     {
         // Climb out of the enclosing <a> elements and any wrapper whose class contains
         // "x-nc-sel"; stop at the top of the tree instead of dereferencing a null parent.
         HtmlNode container = node.ParentNode;
         while (container.ParentNode != null && (container.Name == "a" || HasNcSelClass(container)))
         {
             container = container.ParentNode;
         }

         int totalWords = Extractor.WordCount(container.InnerText);
         if (totalWords > 0)
         {
             // Share of the container's words that belong to this node.
             linkRatio = (double)wordCount / totalWords;
         }
         // When the container reports no words the ratio stays 0 rather than becoming Infinity.
     }

     return String.Format("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}", wordLength, wordCount,
         nrOfSentences, averageSentenceLength, isLink ? "Yes" : "No", linkRatio.ToString("##0.00###", CultureInfo.CreateSpecificCulture("en-GB")), upperCaseCount, finishedSentence ? "Yes" : "No" );
 }

 /// <summary>
 /// Tells whether the <c>class</c> attribute of <paramref name="node"/> contains the
 /// substring "x-nc-sel" (ordinal comparison).
 /// </summary>
 private static bool HasNcSelClass(HtmlNode node)
 {
     var classAttribute = node.Attributes["class"];
     if (classAttribute == null || classAttribute.Value == null)
     {
         return false;
     }

     return classAttribute.Value.IndexOf("x-nc-sel", StringComparison.Ordinal) != -1;
 }
예제 #3
0
        /// <summary>
        /// Tries to find the city of a hall by XPath, starting from the first <c>td</c>
        /// ancestor of <paramref name="htmlNode"/> that passes the attribute filter.
        /// </summary>
        /// <param name="htmlNode">Node whose <c>td</c> ancestors are searched.</param>
        /// <returns>
        /// The inner HTML of the located <c>h5</c> element, or <c>null</c> when no
        /// <c>td</c> ancestor matches or the XPath selects nothing.
        /// </returns>
        private static string TryToGetCity(HtmlNode htmlNode)
        {
            // NOTE(review): the fallback passed to GetAttributeValue is "", so this predicate
            // also accepts td elements that lack the "hallslistwrap" attribute - confirm
            // whether the intent was to require it. Kept unchanged to preserve behavior.
            var td = htmlNode.Ancestors("td")
                .FirstOrDefault(t => t.GetAttributeValue("hallslistwrap", "") != null);

            // FirstOrDefault rather than First: with no matching ancestor we return null
            // instead of throwing InvalidOperationException.
            if (td == null) return null;

            const string xPath1 = @"../../../../preceding::tr[1]//h5"; // xpath to City Name
            var h5 = td.SelectSingleNode(xPath1);

            return h5?.InnerHtml;
        }
예제 #4
0
 /// <summary>
 /// Builds the leading spaces for <paramref name="node"/> from its list nesting.
 /// </summary>
 /// <param name="node">Node whose <c>ol</c> and <c>ul</c> ancestors are counted.</param>
 /// <returns>
 /// A run of spaces one shorter than the number of enclosing lists; empty when the
 /// node sits in at most one list.
 /// </returns>
 private string IndentationFor(HtmlNode node)
 {
     int orderedLists = node.Ancestors("ol").Count();
     int unorderedLists = node.Ancestors("ul").Count();
     int listDepth = orderedLists + unorderedLists;

     // The outermost list contributes no indentation; never go below zero.
     int width = listDepth > 0 ? listDepth - 1 : 0;
     return new string(' ', width);
 }
예제 #5
0
		/// <summary>
		/// Reports whether <paramref name="node"/> has an <c>i</c> or <c>em</c> ancestor.
		/// </summary>
		/// <param name="node">Node whose ancestors are inspected.</param>
		/// <returns><c>true</c> when at least one such ancestor exists.</returns>
		private bool AlreadyItalic(HtmlNode node)
		{
			if (node.Ancestors("i").Any())
			{
				return true;
			}

			return node.Ancestors("em").Any();
		}
예제 #6
0
 /// <summary>
 /// Records the text, word count, link flag and link ratio of <paramref name="node"/>
 /// as a new entry in <c>nodes</c>.
 /// </summary>
 /// <param name="node">Node to measure.</param>
 private void ProcessNode(HtmlNode node)
 {
     Node entry = new Node();
     entry.text = node.InnerText;
     entry.wordCount = WordCount(node.InnerText);

     // "a" elements that are children of this node's parent.
     HtmlNodeCollection siblingLinks = node.SelectNodes("../a");

     // The node counts as a link when any of its ancestors is an "a" element.
     entry.isLink = node.Ancestors("a").ToList().Count > 0;

     double linkTotal = siblingLinks == null ? 0 : siblingLinks.Count;
     if (entry.isLink)
     {
         linkTotal += 1;
     }

     entry.linkRatio = linkTotal / entry.wordCount;
     nodes.Add(entry);
 }
예제 #7
0
		/// <summary>
		/// Reports whether <paramref name="node"/> has a <c>strong</c> or <c>b</c> ancestor.
		/// </summary>
		/// <param name="node">Node whose ancestors are inspected.</param>
		/// <returns><c>true</c> when at least one such ancestor exists.</returns>
		private bool AlreadyBold(HtmlNode node)
		{
			if (node.Ancestors("strong").Any())
			{
				return true;
			}

			return node.Ancestors("b").Any();
		}