Example #1
0
        /// <summary>
        ///     Recursively walks <paramref name = "htmlNode" /> and its element descendants, building a tag-only
        ///     path ("/html/body/div", no positional indexes) for each element and recording one XPathInfo per
        ///     distinct path whose tag is in _allowedTagNames and whose extracted text is not blank.
        /// </summary>
        /// <remarks>
        ///     An entry is created the first time a path is seen with non-blank text; InnerText therefore comes
        ///     from that first qualifying element. Count is incremented on every visit to a path that has an
        ///     entry, including the visit that creates it.
        /// </remarks>
        /// <param name = "htmlNode">The node to start from. Non-element nodes contribute no path segment; only their element children are visited.</param>
        /// <param name = "xpath">The path accumulated for the ancestors of <paramref name = "htmlNode" />; string.Empty for the root call.</param>
        /// <param name = "xpathInfos">The dictionary, keyed by path, that is populated in place.</param>
        /// <returns>The same <paramref name = "xpathInfos" /> instance that was passed in.</returns>
        private static IDictionary <string, XPathInfo> GenerateXPaths(HtmlNode htmlNode, string xpath, IDictionary <string, XPathInfo> xpathInfos)
        {
            if (htmlNode.NodeType == HtmlNodeType.Element)
            {
                xpath += "/" + htmlNode.Name;

                // A single lookup replaces the ContainsKey / ContainsKey / indexer sequence.
                XPathInfo xPathInfo;

                if (!xpathInfos.TryGetValue(xpath, out xPathInfo) && _allowedTagNames.Contains(htmlNode.Name))
                {
                    string innerText = UserDefinedFunctions.ExtractText(htmlNode.InnerHtml).Value;

                    if (!string.IsNullOrEmpty(innerText.Trim()))
                    {
                        xPathInfo = new XPathInfo();

                        xPathInfo.InnerText = innerText;
                        xPathInfo.Tag       = htmlNode.Name;
                        xPathInfo.XPath     = xpath;

                        xpathInfos.Add(xpath, xPathInfo);
                    }
                }

                // Non-null here means the path has an entry (pre-existing or just created).
                if (xPathInfo != null)
                {
                    xPathInfo.Count++;
                }

                Debug.Print(xpath);
            }

            foreach (HtmlNode childNode in htmlNode.ChildNodes)
            {
                if (childNode.NodeType == HtmlNodeType.Element)
                {
                    GenerateXPaths(childNode, xpath, xpathInfos);
                }
            }

            return(xpathInfos);
        }
Example #2
0
        /// <summary>
        ///     Performs the action: for a crawled web page, scores the tag paths of its text-bearing elements,
        ///     picks the text fragments that overlap most with the other fragments, and displays the text found
        ///     at their paths in a message box.
        /// </summary>
        /// <remarks>
        ///     Does nothing when the request is not a web page, when its data is null, or when no non-blank
        ///     text could be collected from the selected paths.
        /// </remarks>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO. Not used by this method.</param>
        public override void PerformAction(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            //use this instead: http://code.google.com/p/boilerpipe/

            if (crawlRequest.DataType.DiscoveryType != DiscoveryType.WebPage || crawlRequest.Data == null)
            {
                return;
            }

            ManagedWebPage managedWebPage = ((ManagedWebPage)crawlRequest.ManagedDiscovery);

            if (managedWebPage.HtmlDocument == null)
            {
                managedWebPage.HtmlDocument = crawlRequest.Crawl.Crawler.HtmlManager.CreateHtmlDocument(crawlRequest.Html, Encoding.Unicode);
            }

            IDictionary <string, XPathInfo> xPathInfos = GenerateXPaths(managedWebPage.HtmlDocument.DocumentNode, string.Empty, new Dictionary <string, XPathInfo>());

            ProcessXPaths(xPathInfos);

            List <XPathInfo> rankedXPathInfos = xPathInfos.Values.OrderByDescending(x => x.LevenstheinDistance).ToList();

            // Walk the ranked paths and keep each one that is strictly deeper (has more '/' separators) than
            // every path kept so far. Paths that are not deeper are skipped; after more than
            // minimumNumberOfXPaths + 1 of them the scan stops.
            IDictionary <string, XPathInfo> selectedXPathInfos = new Dictionary <string, XPathInfo>();

            int deepestNumberOfSlashes = 0;
            int skippedXPaths          = 0;
            int minimumNumberOfXPaths  = 5;

            foreach (XPathInfo xPathInfo in rankedXPathInfos)
            {
                int numberOfSlashes = xPathInfo.XPath.Length - xPathInfo.XPath.Replace("/", string.Empty).Length;

                if (numberOfSlashes > deepestNumberOfSlashes)
                {
                    deepestNumberOfSlashes = numberOfSlashes;

                    selectedXPathInfos.Add(xPathInfo.XPath, xPathInfo);
                }
                else if (skippedXPaths++ > minimumNumberOfXPaths)
                {
                    break;
                }
            }

            // Collect the distinct, non-blank texts found at the selected paths. Each text remembers the first
            // path it was seen under and how many nodes produced it.
            Dictionary <string, XPathInfo> textCandidates = new Dictionary <string, XPathInfo>();

            foreach (XPathInfo xPathInfo in selectedXPathInfos.Values)
            {
                var htmlNodes = managedWebPage.HtmlDocument.DocumentNode.SelectNodes(xPathInfo.XPath);

                // HtmlAgilityPack returns null, not an empty collection, when nothing matches.
                if (htmlNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode htmlNode in htmlNodes)
                {
                    string text = UserDefinedFunctions.ExtractText(htmlNode.InnerHtml).Value;

                    // Blank texts never take part in the scoring below, so drop them here once instead of
                    // re-trimming every key inside the pairwise loop.
                    if (string.IsNullOrEmpty(text.Trim()))
                    {
                        continue;
                    }

                    XPathInfo textCandidate;

                    if (!textCandidates.TryGetValue(text, out textCandidate))
                    {
                        textCandidate = new XPathInfo();

                        textCandidate.XPath = xPathInfo.XPath;

                        textCandidates.Add(text, textCandidate);
                    }

                    textCandidate.Count++;
                }
            }

            // Nothing to score: Max() below would throw InvalidOperationException on an empty sequence.
            if (textCandidates.Count == 0)
            {
                return;
            }

            // Score every ordered pair of texts (including a text paired with itself): when either contains the
            // other, both get a point. Only Count on the values is mutated, so enumerating the dictionary while
            // doing so is safe.
            foreach (KeyValuePair <string, XPathInfo> outer in textCandidates)
            {
                foreach (KeyValuePair <string, XPathInfo> inner in textCandidates)
                {
                    if (outer.Key.Contains(inner.Key) || inner.Key.Contains(outer.Key))
                    {
                        outer.Value.Count++;
                        inner.Value.Count++;
                    }
                }
            }

            int highestCount = textCandidates.Max(d => d.Value.Count);

            StringBuilder stringBuilder = new StringBuilder();

            // NOTE(review): several top-scoring texts can share one XPath, in which case the same nodes' text is
            // appended once per such entry - confirm this duplication is intended.
            foreach (KeyValuePair <string, XPathInfo> keyValuePair in textCandidates)
            {
                if (keyValuePair.Value.Count != highestCount)
                {
                    continue;
                }

                var htmlNodes = managedWebPage.HtmlDocument.DocumentNode.SelectNodes(keyValuePair.Value.XPath);

                if (htmlNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode htmlNode in htmlNodes)
                {
                    stringBuilder.Append(UserDefinedFunctions.ExtractText(htmlNode.InnerHtml).Value);
                }
            }

            // The result is displayed rather than returned; the method is declared void.
            MessageBox.Show(stringBuilder.ToString());
        }