/// <summary>
/// Recursively walks <paramref name="htmlNode"/> and its element descendants, building a
/// slash-separated tag path for each element and tracking one <see cref="XPathInfo"/> per
/// distinct path whose tag is in <c>_allowedTagNames</c> and whose extracted text is not blank.
/// </summary>
/// <param name="htmlNode">The node to start from.</param>
/// <param name="xpath">The path accumulated for the ancestors of <paramref name="htmlNode"/>.</param>
/// <param name="xpathInfos">The dictionary, keyed by path, that is filled in place.</param>
/// <returns>The same <paramref name="xpathInfos"/> instance that was passed in.</returns>
private static IDictionary<string, XPathInfo> GenerateXPaths(HtmlNode htmlNode, string xpath, IDictionary<string, XPathInfo> xpathInfos)
{
    if (htmlNode.NodeType == HtmlNodeType.Element)
    {
        string tagName = htmlNode.Name;
        xpath = xpath + "/" + tagName;

        // Only the first element seen at a given path contributes the stored text.
        bool isUntrackedAllowedPath = _allowedTagNames.Contains(tagName) && !xpathInfos.ContainsKey(xpath);
        if (isUntrackedAllowedPath)
        {
            string extractedText = UserDefinedFunctions.ExtractText(htmlNode.InnerHtml).Value;
            if (extractedText.Trim().Length != 0)
            {
                xpathInfos.Add(xpath, new XPathInfo { InnerText = extractedText, Tag = tagName, XPath = xpath });
            }
        }

        // Every element that lands on a tracked path (including the one just added) bumps its count.
        XPathInfo trackedInfo;
        if (xpathInfos.TryGetValue(xpath, out trackedInfo))
        {
            trackedInfo.Count++;
        }

        Debug.Print(xpath);
    }

    // Non-element nodes (e.g. the document node) do not extend the path, but their element children are still visited.
    foreach (HtmlNode child in htmlNode.ChildNodes)
    {
        if (child.NodeType != HtmlNodeType.Element)
        {
            continue;
        }

        GenerateXPaths(child, xpath, xpathInfos);
    }

    return xpathInfos;
}
/// <summary>
/// Performs the action: for a web page crawl request, collects tag paths from the page's
/// HTML document, picks a set of candidate paths, scores the text found under them by
/// repetition and mutual containment, and shows the text of the top-scoring paths in a message box.
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
public override void PerformAction(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    //use this instead: http://code.google.com/p/boilerpipe/

    /*if (!crawlRequest.ProcessData)
     * {
     *     return;
     * }*/

    if (crawlRequest.DataType.DiscoveryType != DiscoveryType.WebPage || crawlRequest.Data == null)
    {
        return;
    }

    ManagedWebPage managedWebPage = (ManagedWebPage)crawlRequest.ManagedDiscovery;

    if (managedWebPage.HtmlDocument == null)
    {
        managedWebPage.HtmlDocument = crawlRequest.Crawl.Crawler.HtmlManager.CreateHtmlDocument(crawlRequest.Html, Encoding.Unicode);
    }

    HtmlNode documentNode = managedWebPage.HtmlDocument.DocumentNode;

    IDictionary<string, XPathInfo> xPathInfos = GenerateXPaths(documentNode, string.Empty, new Dictionary<string, XPathInfo>());

    //string dateXPath = ExtractDateXPath(htmlDocument1, xpathInfos);
    //List<string> dates = htmlDocument1.DocumentNode.SelectNodes(dateXPath).OfType<HtmlNode>().Select(h => h.InnerText).ToList();

    ProcessXPaths(xPathInfos);

    List<XPathInfo> rankedXPathInfos = xPathInfos.Values.OrderByDescending(x => x.LevenstheinDistance).ToList();

    // Walk the ranked paths, keeping each one that is strictly deeper (has more '/' separators)
    // than every path kept so far. Paths that are not deeper are skipped; because the
    // post-increment compares the skip count from before the current path, the walk stops
    // on the seventh skipped path.
    const int maximumSkippedXPaths = 5;
    int deepestSlashCount = 0;
    int skippedXPaths = 0;
    IDictionary<string, XPathInfo> candidateXPathInfos = new Dictionary<string, XPathInfo>();

    foreach (XPathInfo xPathInfo in rankedXPathInfos)
    {
        int slashCount = xPathInfo.XPath.Count(c => c == '/');

        if (slashCount > deepestSlashCount)
        {
            deepestSlashCount = slashCount;
            candidateXPathInfos.Add(xPathInfo.XPath, xPathInfo);
        }
        else if (skippedXPaths++ > maximumSkippedXPaths)
        {
            break;
        }
    }

    // Count how often each distinct non-blank text occurs under the candidate paths.
    // The first path that produced a given text is the one remembered for it.
    Dictionary<string, XPathInfo> textInfos = new Dictionary<string, XPathInfo>();

    foreach (XPathInfo xPathInfo in candidateXPathInfos.Values)
    {
        var matchingNodes = documentNode.SelectNodes(xPathInfo.XPath);

        // HtmlAgilityPack's SelectNodes returns null (not an empty collection) when nothing matches.
        if (matchingNodes == null)
        {
            continue;
        }

        foreach (HtmlNode htmlNode in matchingNodes)
        {
            string text = UserDefinedFunctions.ExtractText(htmlNode.InnerHtml).Value;

            // Blank text never takes part in the scoring below, so it is dropped here.
            if (text == null || text.Trim().Length == 0)
            {
                continue;
            }

            XPathInfo textInfo;
            if (!textInfos.TryGetValue(text, out textInfo))
            {
                textInfo = new XPathInfo { XPath = xPathInfo.XPath };
                textInfos.Add(text, textInfo);
            }

            textInfo.Count++;
        }
    }

    // Without any text there is nothing to score or display, and Max() below would throw
    // InvalidOperationException on an empty sequence.
    if (textInfos.Count == 0)
    {
        return;
    }

    // Reward texts that contain, or are contained in, other texts. Every ordered pair is
    // visited (including a text paired with itself), and both sides of a match are incremented.
    foreach (KeyValuePair<string, XPathInfo> outer in textInfos)
    {
        foreach (KeyValuePair<string, XPathInfo> inner in textInfos)
        {
            if (outer.Key.Contains(inner.Key) || inner.Key.Contains(outer.Key))
            {
                outer.Value.Count++;
                inner.Value.Count++;
            }
        }
    }

    int highestCount = textInfos.Values.Max(info => info.Count);

    // Concatenate the text of every node under each top-scoring entry's path.
    StringBuilder stringBuilder = new StringBuilder();

    foreach (XPathInfo textInfo in textInfos.Values)
    {
        if (textInfo.Count != highestCount)
        {
            continue;
        }

        var matchingNodes = documentNode.SelectNodes(textInfo.XPath);
        if (matchingNodes == null)
        {
            continue;
        }

        foreach (HtmlNode htmlNode in matchingNodes)
        {
            stringBuilder.Append(UserDefinedFunctions.ExtractText(htmlNode.InnerHtml).Value);
        }
    }

    // NOTE(review): the result is only displayed, not returned or stored — presumably
    // diagnostic output; confirm whether a UI dialog is wanted inside a crawl action.
    MessageBox.Show(stringBuilder.ToString());

    //return stringBuilder.ToString();
}