Example #1
0
        /// <summary>
        /// Resolves the first asynchronous product-list URL of a shop.
        /// </summary>
        /// <remarks>
        /// Loads the shop home page to read the shop name from its title, then loads
        /// <c>{url}/search.htm</c> and reads the list path from the
        /// <c>J_ShopAsynSearchURL</c> input. Writes the normalized address to
        /// <c>_shopUrl</c> and the extracted name to <c>_shopName</c>.
        /// </remarks>
        /// <param name="url">Shop address; <c>https://</c> is prepended when it does not contain "http".</param>
        /// <returns>
        /// The shop address followed by the list path, or an empty string when the page
        /// title is the "shop not found" title.
        /// </returns>
        /// <exception cref="Exception">
        /// The page title is an Alitrip title, or the search page exposes no list URL.
        /// </exception>
        private string InitFirstUrlPart(string url)
        {
            if (url.IndexOf("http", StringComparison.OrdinalIgnoreCase) < 0)
            {
                url = $"https://{url}";
            }

            var mainHtml = GetMainWebContent(_shopUrl = url, null, ref _cookies, null);

            // Full <title> text first; it is reduced to the shop name further down.
            _shopName = Regex.Match(mainHtml, @"(?<=<title>)[\s\S]*?(?=</title>)").Value.Trim();
            if (_shopName.Contains("阿里旅行·去啊Alitrip.com"))
            {
                throw new Exception("阿里旅行不支持");
            }
            if (_shopName.Equals("店铺浏览-淘宝网"))
            {
                SendLog("店铺不存在!");
                return string.Empty;
            }

            // Keep only the text between the first and the last '-' of the title.
            _shopName = Regex.Match(_shopName, "(?<=-).*(?=-)").Value.Trim();

            var categoryUrl  = $"{url}/search.htm";
            var html         = GetMainWebContent(categoryUrl, null, ref _cookies, null);
            var documentNode = HtmlAgilityPackHelper.GetDocumentNodeByHtml(html);

            // SelectSingleNode returns null when nothing matches; fail with a clear message
            // instead of a bare NullReferenceException.
            var listUrl = documentNode.SelectSingleNode("//input[@id='J_ShopAsynSearchURL']")?.Attributes["value"]?.Value;
            if (listUrl == null)
            {
                throw new InvalidOperationException($"J_ShopAsynSearchURL value not found on {categoryUrl}");
            }

            return $"{url}{listUrl}";
        }
Example #2
0
        /// <summary>
        /// Crawls the Amazon.cn site directory and stores its first- and second-level categories.
        /// </summary>
        /// <param name="url">
        /// Not used: the value is replaced by the fixed site-directory address before the page is loaded.
        /// </param>
        public static void AmazonSpider(string url)
        {
            url = "http://www.amazon.cn/gp/site-directory/ref=sa_menu_fullstore";

            WebBrowerManager.Instance.Setup(new cEXWB());
            var html = WebBrowerManager.Instance.Run(url);

            // Decode HTML entities before parsing.
            html = HttpUtility.HtmlDecode(html);
            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(html);

            var firstCategoryContainer = htmlDocument.DocumentNode.SelectNodes("//div[@class='popover-grouping']");

            if (firstCategoryContainer == null || !firstCategoryContainer.Any())
            {
                return;
            }

            foreach (HtmlNode htmlNode in firstCategoryContainer)
            {
                // ".//" keeps the query relative to this group. A leading "//" is evaluated
                // from the document root and would return the first heading of the page for
                // every group.
                var firstCategoryNode = htmlNode.SelectSingleNode(".//div[@class='popover-category-name']/h2");

                if (firstCategoryNode == null)
                {
                    continue;
                }

                // First-level category.
                var firstKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                Insert(firstKey.ToString(), firstCategoryNode.InnerText, string.Empty, "0", "8");

                // Second-level categories: child <div> elements without a class attribute that wrap a link.
                var secondCategoryNodes = htmlNode.SelectNodes("div");
                if (secondCategoryNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode node in secondCategoryNodes)
                {
                    if (node.Attributes["class"] != null)
                    {
                        continue;
                    }

                    var secondCategoryNode = node.SelectSingleNode("a");
                    var href               = secondCategoryNode?.Attributes["href"]?.Value;
                    if (string.IsNullOrEmpty(href))
                    {
                        continue;
                    }

                    var categoryUrl = "http://www.amazon.cn" + HttpUtility.UrlDecode(href);
                    var secondKey   = KeyGenerator.Instance.GetNextValue("ProductCategory2");

                    // Link the row to its first-level category (previously hard-coded to "0"),
                    // the same way the other spiders pass the parent key as the fourth argument.
                    Insert(secondKey.ToString(), secondCategoryNode.InnerText, categoryUrl, firstKey.ToString(), "8");
                }
            }
        }
        /// <summary>
        /// Creates the crawler service and reads its settings from the search section of the CMS configuration.
        /// </summary>
        /// <param name="cmsConfiguration">Configuration whose <c>Search</c> values supply the crawler settings.</param>
        public DefaultWebCrawlerService(ICmsConfiguration cmsConfiguration)
        {
            this.cmsConfiguration = cmsConfiguration;

            webServer = cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneWebSiteUrl) ?? string.Empty;

            bool.TryParse(cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneIndexPrivatePages), out indexPrivatePages);

            if (indexPrivatePages)
            {
                var authModeString = cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneAuthorizationMode);
                if (!string.IsNullOrWhiteSpace(authModeString))
                {
                    // Ordinal, case-insensitive comparison: a culture-sensitive ToLower() turns
                    // "WINDOWS" into "wındows" under the Turkish culture and would select Forms.
                    authMode = string.Equals(authModeString.Trim(), "windows", StringComparison.OrdinalIgnoreCase)
                        ? AuthMode.Windows
                        : AuthMode.Forms;
                }
            }

            HtmlAgilityPackHelper.FixMissingTagClosings();

            // Only a successfully parsed, strictly positive value replaces the current fetchTimeout.
            TimeSpan timeout;
            if (TimeSpan.TryParse(cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneIndexerPageFetchTimeout), out timeout)
                && timeout > TimeSpan.Zero)
            {
                fetchTimeout = timeout;
            }
        }
Example #4
0
        /// <summary>
        /// A document retrieved from <c>ValidUrl</c> must yield a non-null meta node collection.
        /// </summary>
        public void GetMetaNodesCollection_ValidHtml_ReturnHtmlNodeCollection()
        {
            // Arrange
            var document = HtmlAgilityPackHelper.RetrieveHtml(ValidUrl);

            // Act
            var metaNodes = HtmlAgilityPackHelper.GetMetaNodesCollection(document);

            // Assert
            Assert.IsNotNull(metaNodes);
        }
Example #5
0
        /// <summary>
        /// A freshly constructed, never-loaded document must yield a null meta node collection.
        /// </summary>
        public void GetMetaNodesCollection_InvalidHtml_ReturnNull()
        {
            // Arrange
            var emptyDocument = new HtmlDocument();

            // Act
            var metaNodes = HtmlAgilityPackHelper.GetMetaNodesCollection(emptyDocument);

            // Assert
            Assert.IsNull(metaNodes);
        }
Example #6
0
        /// <summary>
        /// Walks the Amazon.cn product-list pages of the category named by the <c>node</c>
        /// query parameter of <paramref name="url"/>.
        /// </summary>
        /// <remarks>
        /// The image and title links of each product are located but nothing is done with
        /// them yet. The walk stops at the first page without product nodes.
        /// </remarks>
        /// <param name="url">Address carrying a <c>node</c> query parameter; nothing happens when it is null, empty or has no such parameter.</param>
        public static void AmazonProductList(string url)
        {
            if (string.IsNullOrEmpty(url))
            {
                return;
            }

            // The category id is taken from the "node" query parameter.
            var query      = UrlHelper.GetQueryString(new Uri(url).Query);
            var categoryId = query["node"];
            if (string.IsNullOrEmpty(categoryId))
            {
                return;
            }

            const string urlTemplate  = "http://www.amazon.cn/s/ref=?rh=n:{0}&page={1}";
            const string productXPath = "//div[@class='result product'] | //div[@class='result lastRow product']";

            var firstRequest = new HttpClient(string.Format(urlTemplate, categoryId, 1));
            firstRequest.Timeout = 30000;
            var pageDocument = HtmlAgilityPackHelper.GetHtmlDocument(HttpUtility.HtmlDecode(firstRequest.Request()));

            // The page count is read from the "pagnDisabled" span of the first page.
            var pagerNode = pageDocument.DocumentNode.SelectSingleNode("//span[@class='pagnDisabled']");
            int pageCount;
            if (pagerNode == null || !int.TryParse(pagerNode.InnerText, out pageCount))
            {
                return;
            }

            for (var page = 1; page <= pageCount; page++)
            {
                // Page 1 is already loaded; every later page is downloaded on demand.
                if (page > 1)
                {
                    var pageRequest = new HttpClient(string.Format(urlTemplate, categoryId, page));
                    pageRequest.Timeout = 30000;
                    pageDocument = HtmlAgilityPackHelper.GetHtmlDocument(HttpUtility.HtmlDecode(pageRequest.Request()));
                }

                var products = pageDocument.DocumentNode.SelectNodes(productXPath);
                if (products == null)
                {
                    return;
                }

                foreach (HtmlNode product in products)
                {
                    var imageLink = product.SelectSingleNode("div[@class='image']/a");
                    var titleLink = product.SelectSingleNode("div[@class='data']/h3[@class='title']/a");
                }
            }
        }
Example #7
0
        /// <summary>
        /// Creates the crawler service and reads the web site URL and the private-page flag
        /// from the search section of the CMS configuration.
        /// </summary>
        /// <param name="cmsConfiguration">Configuration whose <c>Search</c> values supply the settings.</param>
        public DefaultWebCrawlerService(ICmsConfiguration cmsConfiguration)
        {
            this.cmsConfiguration = cmsConfiguration;

            // A missing site URL becomes an empty string.
            var configuredSite = cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneWebSiteUrl);
            webServer = configuredSite ?? string.Empty;

            // The TryParse result is ignored; the out argument receives whatever TryParse writes.
            var privatePagesSetting = cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneIndexPrivatePages);
            bool.TryParse(privatePagesSetting, out indexPrivatePages);

            HtmlAgilityPackHelper.FixMissingTagClosings();
        }
Example #8
0
        /// <summary>
        /// Loads the category page in the embedded browser and returns the products found.
        /// </summary>
        /// <remarks>
        /// The <c>search_result</c> element is looked up but its result is discarded, so the
        /// returned list is always empty.
        /// </remarks>
        /// <param name="spiderCategory">Category whose <c>CategoryUrl</c> is loaded.</param>
        /// <returns>A new, empty list.</returns>
        public List <ProductInfo> SpiderProductList(SpiderCategoryInfo spiderCategory)
        {
            WebBrowerManager.Instance.Setup(new cEXWB());
            WebBrowerManager.Instance.TimeOut = 15;

            var pageSource = WebBrowerManager.Instance.Run(spiderCategory.CategoryUrl);
            var document   = HtmlAgilityPackHelper.GetHtmlDocument(pageSource);

            // Lookup only; the element is not used.
            document.GetElementbyId("search_result");

            var products = new List <ProductInfo>();
            return products;
        }
Example #9
0
        /// <summary>
        /// Counts how often <paramref name="keyword"/> occurs in the text of the document
        /// retrieved from <paramref name="url"/>.
        /// </summary>
        /// <param name="keyword">Keyword passed to <c>OccurrencesOf</c>.</param>
        /// <param name="url">Address of the document; checked with <c>IsUrlValid</c> first.</param>
        /// <returns>The occurrence count, or 0 when <c>IsUrlValid</c> rejects the address.</returns>
        private int GetKeywordOccurrencesFromHtmlDocument(string keyword, string url)
        {
            return IsUrlValid(url)
                ? HtmlAgilityPackHelper.RetrieveHtml(url).Text.OccurrencesOf(keyword)
                : 0;
        }
Example #10
0
        /// <summary>
        /// Builds the addresses of all paginated index pages of the SGE listing.
        /// </summary>
        /// <remarks>
        /// The highest page number is parsed from the <c>href</c> of the fourth pager link
        /// on the first index page (the digits between "index" and the following '.').
        /// </remarks>
        /// <returns>
        /// <c>index.htm</c> followed by <c>index1.htm</c> … <c>index{N}.htm</c>; an empty list
        /// when the first page has no source or no positive page number can be parsed.
        /// </returns>
        public static List <string> GetUrls()
        {
            const string indexUrl      = "http://www.sge.sh/publish/sge/xqzx/jyxq/index.htm";
            const string pageUrlFormat = "http://www.sge.sh/publish/sge/xqzx/jyxq/index{0}.htm";
            const string marker        = "index";

            var indexUrls = new List <string>();

            var firstPage = WebBrowerManager.Instance.Brower(indexUrl);
            if (string.IsNullOrEmpty(firstPage.HtmlSource))
            {
                return indexUrls;
            }

            var firstDocument = HtmlAgilityPackHelper.GetHtmlDocument(firstPage.HtmlSource);
            var lastPageNode  = firstDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[1]/div[3]/div[1]/div[4]/a[4]");

            // SelectSingleNode returns null when the pager link is absent, so every step is null-safe.
            var href          = lastPageNode?.Attributes["href"]?.Value;
            var maxPageNumber = 0;

            if (!string.IsNullOrEmpty(href))
            {
                var markerIndex = href.IndexOf(marker, System.StringComparison.Ordinal);
                if (markerIndex >= 0)
                {
                    // Search for the dot only after "index": a dot earlier in the href
                    // (host name, "./" prefix) must not be mistaken for the extension dot.
                    var digitsStart = markerIndex + marker.Length;
                    var dotIndex    = href.IndexOf('.', digitsStart);
                    if (dotIndex > digitsStart)
                    {
                        int.TryParse(href.Substring(digitsStart, dotIndex - digitsStart), out maxPageNumber);
                    }
                }
            }

            if (maxPageNumber > 0)
            {
                // Page 0 is the unnumbered index page; pages 1..N carry their number.
                indexUrls.Add(indexUrl);
                for (var page = 1; page <= maxPageNumber; page++)
                {
                    indexUrls.Add(string.Format(pageUrlFormat, page));
                }
            }

            return indexUrls;
        }
Example #11
0
 /// <summary>
 /// <c>HtmlAgilityPackHelper.RetrieveHtml</c> must throw when called with <c>InvalidUrl</c>.
 /// </summary>
 public void RetrieveHtml_InvalidUrl_ThrowsException()
 {
     var exceptionThrown = false;

     try
     {
         HtmlAgilityPackHelper.RetrieveHtml(InvalidUrl);
     }
     catch (Exception)
     {
         exceptionThrown = true;
     }

     // Assert outside the try block: a failure raised inside it is itself an exception,
     // would be caught by the handler above, and the test could never fail.
     Assert.IsTrue(exceptionThrown, "An exception should have been thrown");
 }
Example #12
0
        /// <summary>
        /// Retrieves the document at <paramref name="url"/> and extracts keywords from its meta nodes.
        /// </summary>
        /// <param name="url">Address of the document; checked with <c>IsUrlValid</c> first.</param>
        /// <returns>
        /// The result of <c>GetKeywordsFromMetaNodes</c>, or <c>null</c> when <c>IsUrlValid</c>
        /// rejects the address.
        /// </returns>
        private List <string> GetKeywordsFromUrl(string url)
        {
            if (IsUrlValid(url))
            {
                var document = HtmlAgilityPackHelper.RetrieveHtml(url);
                return GetKeywordsFromMetaNodes(HtmlAgilityPackHelper.GetMetaNodesCollection(document));
            }

            return null;
        }
Example #13
0
        /// <summary>
        /// Crawls the DangDang category page and stores its first- and second-level categories.
        /// </summary>
        /// <remarks>
        /// Within each list item an anchor with class <c>title</c> is stored as a first-level
        /// category; every other anchor is stored as a child of the most recent title anchor
        /// of the same list item (parent 0 when none preceded it).
        /// </remarks>
        /// <param name="url">Address of the category page, e.g. http://category.dangdang.com/.</param>
        public static void DangDangSpider(string url)
        {
            WebBrowerManager.Instance.Setup(new cEXWB());
            var html = WebBrowerManager.Instance.Run(url);

            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(html);

            var container =
                htmlDocument.DocumentNode.SelectNodes("/html[1]/body[1]/div[2]/div[1]/div[2]/div[1]/div[4]/div");

            if (container == null || !container.Any())
            {
                return;
            }

            // Characters trimmed from both ends of a first-level title (the characters of "&nbsp;").
            var chars = new[] { '&', 'n', 'b', 's', 'p', ';' };

            foreach (HtmlNode htmlNode in container)
            {
                if (!htmlNode.HasAttributes || htmlNode.Attributes["Id"] == null)
                {
                    continue;
                }

                // SelectNodes returns null (not an empty collection) when nothing matches.
                var liNodes = htmlNode.SelectNodes("div[2]/ul[1]/li");
                if (liNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode liNode in liNodes)
                {
                    var aNode = liNode.SelectNodes("a");
                    if (aNode == null)
                    {
                        continue;
                    }

                    // Declared per list item, not per anchor: it must survive from the title
                    // anchor to the anchors that follow it, otherwise every child gets parent 0.
                    var firstKey = 0;

                    foreach (HtmlNode node in aNode)
                    {
                        // Anchors without an href cannot be stored as a category link.
                        var hrefAttribute = node.Attributes["href"];
                        if (hrefAttribute == null)
                        {
                            continue;
                        }

                        var classAttribute = node.Attributes["class"];
                        if (classAttribute != null && classAttribute.Value == "title")
                        {
                            // First-level category.
                            firstKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                            Insert(firstKey.ToString(), node.InnerText.Trim(chars), hrefAttribute.Value, "0", "5");
                        }
                        else
                        {
                            // Second-level category under the current first-level key.
                            var secondKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                            Insert(secondKey.ToString(), node.InnerText, hrefAttribute.Value, firstKey.ToString(), "5");
                        }
                    }
                }
            }
        }
        /// <summary>
        /// Posts <paramref name="postDataString"/> to <paramref name="url"/>, follows the case
        /// link found in the response and returns the scraped penalty details.
        /// </summary>
        /// <remarks>Every scraped key/value pair is also written to the console.</remarks>
        /// <param name="url">Address the form data is posted to; stored under "PostUrl".</param>
        /// <param name="postDataString">Body of the POST request.</param>
        /// <returns>One dictionary per case link matched in the response.</returns>
        private List <Dictionary <string, string> > GetInfo(string url, string postDataString)
        {
            var rows = new List <Dictionary <string, string> >();
            var http = new HttpHelper
            {
                Timeout      = 5 * 60 * 1000,
                HttpEncoding = _httpEncoding
            };

            var listHtml = http.GetHtmlByPost(url, postDataString);

            // Narrow the response step by step: the dataStore array literal, then its first
            // comma-free run between quotes, then the text after the last '$' of that run.
            var dataStore  = Regex.Match(listHtml, @"dataStore[\s]*=[\s]*\[.*?\]").Value;
            var firstEntry = Regex.Match(dataStore, "(?<=\")[^,]+?(?=\")").Value;
            var caseLinks  = Regex.Matches(firstEntry, @"(?<=\$)[^\$]*$");

            foreach (Match caseLink in caseLinks)
            {
                var detailHtml = http.GetHtmlByGet($"http://www.zjcredit.gov.cn{caseLink.Value}");
                var detailRoot = HtmlAgilityPackHelper.GetDocumentNodeByHtml(detailHtml);

                var penalty = new AdministrativePenaltyInfo
                {
                    CaseName            = detailRoot.SelectSingleNode("//td[@class='listf2']").InnerText,
                    CaseId              = detailRoot.SelectSingleNode("//table[2]//tr[1]/td[@class='xzcf_xx']").InnerText,
                    // Non-breaking-space entities are stripped from the penalty object text.
                    PenaltyObject       = detailRoot.SelectSingleNode("//table[2]//tr[2]/td[@class='xzcf_xx']/text()").InnerText.Replace("&nbsp;", ""),
                    // Only the text after the first ':' of the span is kept.
                    LegalRepresentative = Regex.Match(detailRoot.SelectSingleNode("//table[2]//span[@class='xzcf_mc']").InnerText, "(?<=:).*").Value.Trim(),
                    Department          = detailRoot.SelectSingleNode("//table[2]//tr[3]/td[@class='xzcf_xx']").InnerText,
                    PenaltyDate         = detailRoot.SelectSingleNode("//table[2]//tr[4]/td[@class='xzcf_xx']").InnerText,
                    PenalyText          = detailRoot.SelectSingleNode("//table[4]//td[@class='xzcf_jds']").InnerText
                };

                var row = new Dictionary <string, string>();
                row["CaseName"]            = penalty.CaseName;
                row["CaseId"]              = penalty.CaseId;
                row["PenaltyObject"]       = penalty.PenaltyObject;
                row["LegalRepresentative"] = penalty.LegalRepresentative;
                row["Department"]          = penalty.Department;
                row["PenaltyDate"]         = penalty.PenaltyDate;
                row["PenalyText"]          = penalty.PenalyText;
                row["PostUrl"]             = url;
                row["Url"]                 = caseLink.Value;
                row["ThreadId"]            = Thread.CurrentThread.ManagedThreadId.ToString();

                foreach (var pair in row)
                {
                    Console.WriteLine($"{pair.Key}:{pair.Value}");
                }

                rows.Add(row);
            }

            return rows;
        }
Example #15
0
        /// <summary>
        /// Crawls the YiHaoDian "all categories" page and stores three levels of categories.
        /// </summary>
        /// <param name="url">Address of the all-categories page, e.g. http://www.yihaodian.com/product/listAll.do.</param>
        public static void YiHaoDianSpider(string url)
        {
            HttpClient hc = new HttpClient(url);

            hc.Timeout = 30000;
            var allSortHtml            = hc.Request();
            var htmlDocument           = HtmlAgilityPackHelper.GetHtmlDocument(allSortHtml);
            var firstCategoryContainer = htmlDocument.DocumentNode.SelectNodes("//div[@class='alonesort']");

            // SelectNodes returns null (not an empty collection) when nothing matches.
            if (firstCategoryContainer == null)
            {
                return;
            }

            foreach (HtmlNode firstCategoryNode in firstCategoryContainer)
            {
                // Take the first heading link once instead of re-running the selector for each use.
                var firstCategoryLink = firstCategoryNode.CssSelect(".mt>h3>a")?.FirstOrDefault();
                if (firstCategoryLink == null)
                {
                    continue;
                }

                // First-level category.
                var firstKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                Insert(firstKey.ToString(), firstCategoryLink.InnerText, firstCategoryLink.Attributes["href"].Value, "0", "2");

                var secondCategoryContainer = firstCategoryNode.CssSelect(".mc>.fore");

                foreach (HtmlNode htmlNode in secondCategoryContainer)
                {
                    // Second-level category. The key is drawn before the check, so third-level
                    // rows keep referring to it even when the second-level row is not stored.
                    var secondCategoryNode = htmlNode.CssSelect("dt>a").FirstOrDefault();
                    var secondKey          = KeyGenerator.Instance.GetNextValue("ProductCategory2");

                    // A missing link is treated like a link without href: nothing is stored.
                    if (secondCategoryNode?.Attributes["href"] != null)
                    {
                        Insert(secondKey.ToString(), secondCategoryNode.InnerText, secondCategoryNode.Attributes["href"].Value, firstKey.ToString(), "2");
                    }

                    // Third-level categories.
                    var threeCategoryNodes = htmlNode.CssSelect("dd>em>span>a");
                    foreach (HtmlNode threeCategoryNode in threeCategoryNodes)
                    {
                        var thirdKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                        if (threeCategoryNode.Attributes["href"] != null)
                        {
                            Insert(thirdKey.ToString(), threeCategoryNode.InnerText, threeCategoryNode.Attributes["href"].Value, secondKey.ToString(), "2");
                        }
                    }
                }
            }
        }
Example #16
0
        /// <summary>
        /// Crawls the SuNing category page and stores three levels of categories.
        /// </summary>
        /// <remarks>
        /// Second-level rows are stored with an empty address. A second-level key is drawn
        /// for every <c>dl</c> block even when that block has no <c>dt&gt;a</c> link.
        /// </remarks>
        /// <param name="url">Address of the category page.</param>
        public static void SuNingSpider(string url)
        {
            WebBrowerManager.Instance.Setup(new cEXWB());
            var pageSource = WebBrowerManager.Instance.Run(url);
            var document   = HtmlAgilityPackHelper.GetHtmlDocument(pageSource);

            foreach (HtmlNode floorNode in document.DocumentNode.CssSelect("div.sFloor.clearfix"))
            {
                // First-level category: the heading link of the floor.
                var headingLink = floorNode.CssSelect("h3>a").FirstOrDefault();
                var firstKey    = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                Insert(firstKey.ToString(), headingLink.InnerText, headingLink.Attributes["href"].Value, "0", "4");

                // Second-level blocks come from the left column first, then the right column.
                var secondLevelBlocks = new List <HtmlNode>();
                foreach (var selector in new[] { ".listLeft>dl", ".listRight>dl" })
                {
                    var matches = floorNode.CssSelect(selector);
                    if (matches != null && matches.Any())
                    {
                        secondLevelBlocks.AddRange(matches);
                    }
                }

                foreach (HtmlNode block in secondLevelBlocks)
                {
                    var secondLink = block.CssSelect("dt>a").FirstOrDefault();
                    var secondKey  = KeyGenerator.Instance.GetNextValue("ProductCategory2");

                    if (secondLink != null)
                    {
                        Insert(secondKey.ToString(), secondLink.InnerText, "", firstKey.ToString(), "4");
                    }

                    var thirdLinks = block.CssSelect("dd>span>a");
                    if (thirdLinks == null)
                    {
                        continue;
                    }

                    // Third-level categories.
                    foreach (HtmlNode thirdLink in thirdLinks)
                    {
                        var thirdKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                        Insert(thirdKey.ToString(), thirdLink.InnerText, thirdLink.Attributes["href"].Value, secondKey.ToString(), "4");
                    }
                }
            }
        }
 /// <summary>
 /// Queues the "next page" link of the current page, if any, and returns the next URL to visit.
 /// </summary>
 /// <remarks>
 /// The link is only looked up while <c>_isInHtml</c> is set. An href that already starts
 /// with "http" is queued unchanged; any other non-empty href gets the "https:" prefix.
 /// </remarks>
 /// <returns>The next queued URL, or <c>null</c> when the queue is empty.</returns>
 protected override string ParseNextUrl()
 {
     if (_isInHtml)
     {
         var documentNode = HtmlAgilityPackHelper.GetDocumentNodeByHtml(HtmlSource);
         var nextUrl =
             documentNode.SelectSingleNode("//a[@class='J_SearchAsync next']")?.Attributes["href"]?.Value;

         // Previously an href containing "http" anywhere was dropped, which silently ended
         // pagination, and an empty href queued the bare string "https:".
         if (!string.IsNullOrEmpty(nextUrl))
         {
             var isAbsolute = nextUrl.StartsWith("http", System.StringComparison.OrdinalIgnoreCase);
             _urlQueue.Enqueue(isAbsolute ? nextUrl : $"https:{nextUrl}");
         }
     }

     return _urlQueue.Count == 0 ? null : _urlQueue.Dequeue();
 }
Example #18
0
        /// <summary>
        /// Reads the total page count from the first list page and stores it in <c>_totalPage</c>.
        /// </summary>
        /// <param name="listHtmlFirst">HTML of the first list page.</param>
        /// <returns>
        /// The number following '/' in the pager text, or 0 when it cannot be parsed. When no
        /// pager element is found, 0 is returned and <c>_totalPage</c> is left untouched.
        /// </returns>
        private int InitTotalPage(string listHtmlFirst)
        {
            var root = HtmlAgilityPackHelper.GetDocumentNodeByHtml(listHtmlFirst);

            // In a verbatim string "" stands for one double quote, so the class value searched
            // for literally contains \" around the name (presumably because the fragment
            // arrives with escaped quotes; confirm against a real response).
            var pagerNode = root.SelectSingleNode(@"//span[@class='\""page-info\""']");
            if (pagerNode == null)
            {
                pagerNode = root.SelectSingleNode(@"//b[@class='\""ui-page-s-len\""']");
            }
            if (pagerNode == null)
            {
                return 0;
            }

            var totalText = Regex.Match(pagerNode.InnerText, @"(?<=/)\d+").Value;

            int parsedTotal;
            var total = int.TryParse(totalText, out parsedTotal) ? parsedTotal : 0;
            _totalPage = total;
            return total;
        }
Example #19
0
        /// <summary>
        /// Handles the browser's left-button-up event: asks <c>FindNodeExt</c> for the tree
        /// node matching the clicked element's tag name and inner text, highlights it and
        /// appends its XPath and the extracted text to the rich text box.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Mouse event data; nothing happens when <c>SrcElement</c> is null.</param>
        void WebBrower_WBLButtonUp(object sender, csExWB.HTMLMouseEventArgs e)
        {
            if (e.SrcElement == null)
            {
                return;
            }

            TreeNodeEx tnRet = null;

            foreach (var tn in HtmlTree.Nodes)
            {
                // Skip nodes of any other type instead of handing null to FindNodeExt.
                var treeNodeEx = tn as TreeNodeEx;
                if (treeNodeEx == null)
                {
                    continue;
                }

                var selectedElement = new SelectedElement();
                // Invariant lower-casing: a culture-sensitive ToLower() turns "DIV" into
                // "dıv" under the Turkish culture.
                selectedElement.tagName   = e.SrcElement.tagName.ToLowerInvariant();
                selectedElement.innerText = e.SrcElement.innerText;

                tnRet = this.FindNodeExt(treeNodeEx, selectedElement);
                if (tnRet != null)
                {
                    break;
                }
            }

            if (tnRet == null)
            {
                return;
            }

            tnRet.ForeColor = Color.Red;
            tnRet.Expand();
            HtmlTree.SelectedNode = tnRet;

            var sb = new StringBuilder();
            sb.AppendLine("xpath:" + tnRet.HtmlNode.XPath);
            sb.AppendLine(HtmlAgilityPackHelper.GetStringByXPath(Html, tnRet.HtmlNode.XPath, "|"));
            richTextBox.Text += sb.ToString();
        }
        /// <summary>
        /// Reads the total page count from the first list page and stores it in <c>_totalPage</c>.
        /// </summary>
        /// <param name="listHtmlFirst">HTML of the first list page.</param>
        /// <returns>
        /// The number following '/' in the pager text, or 0 when it cannot be parsed. When no
        /// pager element is found, 0 is returned and <c>_totalPage</c> is left untouched.
        /// </returns>
        private int InitTotalPage(string listHtmlFirst)
        {
            var root = HtmlAgilityPackHelper.GetDocumentNodeByHtml(listHtmlFirst);

            // In a verbatim string "" stands for one double quote, so both expressions look
            // for a class value that literally contains \" around the name.
            var pagerNode = root.SelectSingleNode(@"//span[@class='\""page-info\""']");
            pagerNode = pagerNode ?? root.SelectSingleNode(@"//b[@class='\""ui-page-s-len\""']");

            if (pagerNode == null)
            {
                return 0;
            }

            // The pager text has the form "current/total"; only the digits after '/' matter.
            var totalDigits = Regex.Match(pagerNode.InnerText, @"(?<=/)\d+").Value;

            int parsedTotal;
            if (!int.TryParse(totalDigits, out parsedTotal))
            {
                parsedTotal = 0;
            }

            _totalPage = parsedTotal;
            return parsedTotal;
        }
Example #21
0
        /// <summary>
        /// Counts, for each keyword, how often it occurs in the text of the document
        /// retrieved from <paramref name="url"/>.
        /// </summary>
        /// <param name="keywords">Keywords to count.</param>
        /// <param name="url">Address of the document; checked with <c>IsUrlValid</c> first.</param>
        /// <returns>
        /// One <c>KeywordDto</c> per keyword, in input order, or <c>null</c> when
        /// <c>IsUrlValid</c> rejects the address.
        /// </returns>
        private List <KeywordDto> GetKeywordsOccurrencesFromHtmlDocument(IEnumerable <string> keywords, string url)
        {
            if (!IsUrlValid(url))
            {
                return null;
            }

            var document = HtmlAgilityPackHelper.RetrieveHtml(url);

            return keywords
                .Select(keyword => new { keyword, occurrences = document.Text.OccurrencesOf(keyword) })
                .Select(counted => new KeywordDto
                {
                    Keyword        = counted.keyword,
                    OccurenceCount = counted.occurrences
                })
                .ToList();
        }
        /// <summary>
        /// Resolves the first asynchronous product-list URL of a shop.
        /// </summary>
        /// <remarks>
        /// Loads the shop home page to read the shop name from its title, then loads
        /// <c>{url}/search.htm</c> and reads the list path from the
        /// <c>J_ShopAsynSearchURL</c> input. Writes the normalized address to
        /// <c>_shopUrl</c> and the extracted name to <c>_shopName</c>.
        /// </remarks>
        /// <param name="url">Shop address; <c>https://</c> is prepended when it does not contain "http".</param>
        /// <returns>
        /// The shop address followed by the list path, or an empty string when the page
        /// title is the "shop not found" title.
        /// </returns>
        /// <exception cref="Exception">
        /// The page title is an Alitrip title, or the search page exposes no list URL.
        /// </exception>
        private string InitFirstUrlPart(string url)
        {
            if (url.IndexOf("http", StringComparison.OrdinalIgnoreCase) < 0)
            {
                url = $"https://{url}";
            }

            var mainHtml = GetMainWebContent(_shopUrl = url, null, ref _cookies, null);

            // Full <title> text first; it is reduced to the shop name further down.
            _shopName = Regex.Match(mainHtml, @"(?<=<title>)[\s\S]*?(?=</title>)").Value.Trim();
            if (_shopName.Contains("阿里旅行·去啊Alitrip.com"))
            {
                throw new Exception("阿里旅行不支持");
            }
            if (_shopName.Equals("店铺浏览-淘宝网"))
            {
                SendLog("店铺不存在!");
                return string.Empty;
            }

            // Keep only the text between the first and the last '-' of the title.
            _shopName = Regex.Match(_shopName, "(?<=-).*(?=-)").Value.Trim();

            var categoryUrl  = $"{url}/search.htm";
            var html         = GetMainWebContent(categoryUrl, null, ref _cookies, null);
            var documentNode = HtmlAgilityPackHelper.GetDocumentNodeByHtml(html);

            // Example value: /i/asynSearch.htm?mid=w-1901851942-0&wid=1901851942&path=/search.htm&amp;search=y
            // SelectSingleNode returns null when nothing matches; fail with a clear message
            // instead of a bare NullReferenceException.
            var listUrl = documentNode.SelectSingleNode("//input[@id='J_ShopAsynSearchURL']")?.Attributes["value"]?.Value;
            if (listUrl == null)
            {
                throw new InvalidOperationException($"J_ShopAsynSearchURL value not found on {categoryUrl}");
            }

            return $"{url}{listUrl}";
        }
Example #23
0
        /// <summary>
        /// Parses the announcement rows of the current list page and downloads each announcement.
        /// </summary>
        /// <remarks>
        /// Parsing stops at the first row without a link or date. It also stops, and clears
        /// <c>_urlQueue</c>, at the first row older than <c>_gatherDays</c> days.
        /// </remarks>
        /// <returns>
        /// One result per downloaded announcement; an empty array when the page has no
        /// matching rows.
        /// </returns>
        protected override IResut[] ParseCurrentItems()
        {
            List <IResut>      resultList         = new List <IResut>();
            HtmlNode           htmlNode           = HtmlAgilityPackHelper.GetDocumentNodeByHtml(HtmlSource);
            HtmlNodeCollection htmlNodeCollection = htmlNode.SelectNodes("//td[@class='Font9']");

            // SelectNodes returns null (not an empty collection) when nothing matches.
            if (htmlNodeCollection == null)
            {
                return resultList.ToArray();
            }

            foreach (HtmlNode node in htmlNodeCollection)
            {
                string url            = node.SelectSingleNode("./a[@class='five']")?.Attributes["href"]?.Value;
                string dateTimeString = Regex.Match(node.InnerText, @"\d+-\d+-\d+").Value;
                if (string.IsNullOrEmpty(url) || string.IsNullOrEmpty(dateTimeString))
                {
                    break;
                }
                url = $"http://www.ccgp-shandong.gov.cn{url}";

                DateTime dateTime = Convert.ToDateTime(dateTimeString);
                int      days     = (DateTime.Now - dateTime).Days;
                if (days > _gatherDays)
                {
                    // Too old: drop the pending pages as well.
                    _urlQueue.Clear();
                    break;
                }

                string   html      = _httpHelper.GetHtmlByGet(url);
                HtmlNode htmlNode2 = HtmlAgilityPackHelper.GetDocumentNodeByHtml(html);
                string   title     = htmlNode2.SelectSingleNode("//div[@align='center']")?.InnerText;
                string   publisher = Regex.Match(html, "(?<=发布人[::]).*(?=</td>)").Value;

                // Normalize the publish time when it parses; keep the raw text otherwise
                // instead of throwing on a missing or malformed value.
                string   publishTime = Regex.Match(html, "(?<=发布时间[::]).*(?=</td>)").Value;
                DateTime parsedPublishTime;
                if (DateTime.TryParse(publishTime, out parsedPublishTime))
                {
                    publishTime = parsedPublishTime.ToString(CultureInfo.CurrentCulture);
                }

                // Null-safe like the title lookup above: a missing table yields null content.
                string content = htmlNode2.SelectSingleNode("//table//tr[2]/td[2]/table")?.OuterHtml;

                Resut resut = new Resut()
                {
                    ["url"]         = url,
                    ["title"]       = title,
                    ["content"]     = content,
                    ["publisher"]   = publisher,
                    ["publishTime"] = publishTime
                };

                resultList.Add(resut);
            }
            return resultList.ToArray();
        }
Example #24
0
        /// <summary>
        /// Type initializer: fills <c>KnownAssemblies</c> with the assemblies of the
        /// module descriptors listed below, then calls <c>CreateContainer()</c> and
        /// <c>HtmlAgilityPackHelper.FixMissingTagClosings()</c>.
        /// </summary>
        static TestBase()
        {
            // One entry per module descriptor; the order matches the original list.
            var moduleAssemblies = new List<Assembly>
            {
                typeof(RootModuleDescriptor).Assembly,
                typeof(PagesModuleDescriptor).Assembly,
                typeof(BlogModuleDescriptor).Assembly,
                typeof(NewsletterModuleDescriptor).Assembly,
                typeof(MediaManagerModuleDescriptor).Assembly,
                typeof(UsersModuleDescriptor).Assembly,
                typeof(ApiModuleDescriptor).Assembly,
                typeof(UsersApiModuleDescriptor).Assembly,
                typeof(ImagesGalleryModuleDescriptor).Assembly
            };

            KnownAssemblies = moduleAssemblies;

            CreateContainer();
            HtmlAgilityPackHelper.FixMissingTagClosings();
        }
Example #25
0
        /// <summary>
        /// Initializes the crawler service from the search settings of the CMS configuration.
        /// </summary>
        /// <param name="cmsConfiguration">
        /// CMS configuration; its <c>Search</c> values supply the web site URL, the
        /// index-private-pages flag and the page fetch timeout.
        /// </param>
        public DefaultWebCrawlerService(ICmsConfiguration cmsConfiguration)
        {
            this.cmsConfiguration = cmsConfiguration;

            // Site URL: falls back to an empty string when the setting is absent.
            var configuredSiteUrl = cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneWebSiteUrl);
            webServer = configuredSiteUrl ?? string.Empty;

            // TryParse leaves the flag false when the setting is missing or malformed.
            var privatePagesSetting = cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneIndexPrivatePages);
            bool.TryParse(privatePagesSetting, out indexPrivatePages);

            HtmlAgilityPackHelper.FixMissingTagClosings();

            // The configured timeout replaces the existing value only when it parses
            // and is strictly positive.
            var timeoutSetting = cmsConfiguration.Search.GetValue(LuceneSearchConstants.ConfigurationKeys.LuceneIndexerPageFetchTimeout);
            TimeSpan configuredTimeout;
            if (TimeSpan.TryParse(timeoutSetting, out configuredTimeout))
            {
                if (configuredTimeout > TimeSpan.Zero)
                {
                    fetchTimeout = configuredTimeout;
                }
            }
        }
Example #26
0
        /// <summary>
        /// Builds a <c>ProductInfo</c> from an already-downloaded product page. The price
        /// is taken from the price image (<c>div.p-price/img</c>) by passing its
        /// <c>src</c> to <c>ImageProcess.Recognize</c>.
        /// </summary>
        /// <param name="spiderProduct">Crawled page; its HtmlSource, ProductId and Url are read.</param>
        /// <returns>
        /// A <c>ProductInfo</c> carrying the page source, product id, url and the
        /// recognised price; the price is 0 when the price image is missing or its
        /// recognised text is not a decimal.
        /// </returns>
        public ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
        {
            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(spiderProduct.HtmlSource);

            // Title
            var title = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[6]/div[1]/div[1]/h1[1]");
            var price = htmlDocument.DocumentNode.SelectSingleNode("//div[@class='p-price']/img");

            // Textual price
            var priceText = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[6]/div[1]/div[2]/ul[1]/li[2]/script[1]");

            decimal realPrice = 0;

            // The price node is null when the page has no price image; it used to be
            // dereferenced unconditionally, before the null check further down.
            var priceImageSrc = price != null ? price.Attributes["src"] : null;
            if (priceImageSrc != null && !string.IsNullOrEmpty(priceImageSrc.Value))
            {
                decimal.TryParse(ImageProcess.Recognize(priceImageSrc.Value), out realPrice);
            }

            // Promotion info is loaded via ajax, so only the inline price text is inspected.
            if (title != null && price != null && priceText != null)
            {
                const string priceMarker = "京东价:¥";

                var scriptText = priceText.InnerText;
                var beginIndex = scriptText.IndexOf(priceMarker, System.StringComparison.Ordinal);

                // Either marker can be absent; without these guards IndexOf/Substring
                // would throw ArgumentOutOfRangeException.
                if (beginIndex >= 0)
                {
                    var valueStart = beginIndex + priceMarker.Length;
                    var endIndex   = scriptText.IndexOf("。", valueStart, System.StringComparison.Ordinal);
                    if (endIndex >= 0)
                    {
                        var     readPrice = scriptText.Substring(valueStart, endIndex - valueStart);
                        decimal textPrice;

                        // NOTE(review): the parsed text price is not consumed anywhere
                        // (the call that used it was commented out in the original);
                        // the extraction is kept so it is not lost.
                        decimal.TryParse(readPrice, out textPrice);
                    }
                }
            }

            return(new ProductInfo()
            {
                Source = spiderProduct.HtmlSource, ProductId = spiderProduct.ProductId, Url = spiderProduct.Url, Price = realPrice
            });
        }
Example #27
0
        /// <summary>
        /// Downloads the list page addressed by <c>SearchUrl</c> formatted with
        /// <paramref name="index"/> and converts it with
        /// <c>HtmlAgilityPackHelper.GetTable</c>.
        /// </summary>
        /// <param name="index">Value substituted into the <c>SearchUrl</c> format string.</param>
        /// <param name="directoryPath">Passed through unchanged to <c>HtmlAgilityPackHelper.GetTable</c>.</param>
        /// <returns>
        /// The value returned by <c>GetTable</c>; an empty list when the request or the
        /// parsing throws (best-effort: failures are traced, not rethrown).
        /// </returns>
        public static List<DouYinModel> DownLoad(int index, string directoryPath)
        {
            List<DouYinModel> result = new List<DouYinModel>();

            try
            {
                string url = string.Format(SearchUrl, index);

                // Request the search list
                var list_response = HttpHelper.HttpGetRequest(url, null, null, null, keepAlive: true);
                if (list_response == null)
                {
                    // Same outcome as before (the Close() call below used to throw and
                    // the catch returned the empty list), without raising an exception.
                    return(result);
                }

                string list_respStr;
                try
                {
                    list_respStr = HttpHelper.GetResponseStreamToStr(list_response);
                }
                finally
                {
                    // Release the response even when reading its stream fails.
                    list_response.Close();
                }

                result = HtmlAgilityPackHelper.GetTable(list_respStr, directoryPath);

                return(result);
            }
            catch (Exception ex)
            {
                // Still best-effort, but the failure is no longer invisible.
                System.Diagnostics.Trace.TraceWarning("DownLoad failed for index " + index + ": " + ex);
                return(result);
            }
        }
Example #28
0
        /// <summary>
        /// Handles the browser's DocumentComplete event. For a top-level document it
        /// marks loading as finished, captures the document source into <c>Html</c> and
        /// <c>htmlDocument</c>, rebuilds the tree via <c>BuildTree3()</c> and appends the
        /// build time to <c>richTextBox</c>. For a frame navigation it only marks loading
        /// as finished once the main document is fully loaded.
        /// </summary>
        /// <param name="sender">Expected to be the <c>cEXWB</c> control that raised the event.</param>
        /// <param name="e">Event data; <c>url</c> and <c>istoplevel</c> are read.</param>
        private void WebBrower_DocumentComplete(object sender, DocumentCompleteEventArgs e)
        {
            // Ordinal, case-insensitive and null-safe replacement for ToLower() ==.
            if (string.Equals(e.url, "about:blank", System.StringComparison.OrdinalIgnoreCase))
            {
                return;
            }

            cEXWB pWB = sender as cEXWB;

            if (e.istoplevel)
            {
                IsDocumentFinish = true;

                // The cast result used to be dereferenced unchecked here, although the
                // frame branch below does check it. Without a browser there is nothing
                // to capture.
                if (pWB == null)
                {
                    return;
                }

                // Read the source once so Html and htmlDocument are built from the same text.
                var documentSource = pWB.DocumentSource;
                Html         = documentSource;
                htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(documentSource);

                Stopwatch sw1 = Stopwatch.StartNew();
                BuildTree3();
                sw1.Stop();
                richTextBox.Text += string.Format("DOM树构建时间{0}毫秒", sw1.ElapsedMilliseconds) + "\n";
            }
            else if (pWB != null && pWB.MainDocumentFullyLoaded) // a frame navigation within a frameset
            {
                IsDocumentFinish = true;
            }
        }
Example #29
0
        /// <summary>
        /// Loads every index page in <paramref name="urls"/> through
        /// <c>WebBrowerManager</c> and collects the page links found in its list items.
        /// </summary>
        /// <param name="urls">Index page addresses to visit, in order.</param>
        /// <returns>
        /// One absolute url per list item whose anchor has a non-empty <c>href</c>,
        /// each prefixed with the site's base path.
        /// </returns>
        public static List <string> Spider(List <string> urls)
        {
            var collected = new List <string>();

            foreach (string indexUrl in urls)
            {
                var pageSource = WebBrowerManager.Instance.Brower(indexUrl).HtmlSource;
                var parsedPage = HtmlAgilityPackHelper.GetHtmlDocument(pageSource);
                var listItems  = parsedPage.DocumentNode.SelectNodes("/html[1]/body[1]/div[1]/div[3]/div[1]/div[3]/ul[1]/li");

                // Nothing to harvest on this index page.
                if (listItems == null || !listItems.Any())
                {
                    continue;
                }

                foreach (HtmlNode listItem in listItems)
                {
                    var anchor = listItem.SelectSingleNode("a");
                    if (anchor == null || !anchor.HasAttributes)
                    {
                        continue;
                    }

                    var href = anchor.Attributes["href"];
                    if (href == null || string.IsNullOrEmpty(href.Value))
                    {
                        continue;
                    }

                    collected.Add(string.Format("http://www.sge.sh/publish/sge/xqzx/jyxq/{0}", href.Value));
                }
            }

            return(collected);
        }
Example #30
0
        /// <summary>
        /// Checks that <c>HtmlAgilityPackHelper.RetrieveHtml</c> returns a non-null
        /// result for <c>ValidUrl</c>.
        /// </summary>
        public void RetrieveHtml_ValidUrl_ReturnHtmlDocument()
        {
            // Act and assert in one step: only the non-null contract is verified.
            Assert.IsNotNull(HtmlAgilityPackHelper.RetrieveHtml(ValidUrl));
        }