예제 #1
0
        /// <summary>
        /// Normalizes the shop address, loads the shop's landing page to record its name in
        /// <c>_shopName</c>, and builds the URL of the shop's asynchronous product-list endpoint.
        /// </summary>
        /// <param name="url">
        /// Shop address. <c>https://</c> is prepended when it does not start with an http/https scheme.
        /// The normalized value is also stored in <c>_shopUrl</c>.
        /// </param>
        /// <returns>
        /// The normalized shop URL followed by the value of the <c>J_ShopAsynSearchURL</c> input found on
        /// <c>/search.htm</c>, or <see cref="string.Empty"/> when the page title reports that the shop
        /// does not exist.
        /// </returns>
        /// <exception cref="Exception">The page title identifies an Alitrip shop, which is not supported.</exception>
        /// <exception cref="InvalidOperationException">
        /// The search page has no <c>J_ShopAsynSearchURL</c> input carrying a <c>value</c> attribute.
        /// </exception>
        private string InitFirstUrlPart(string url)
        {
            // Test for a real scheme prefix. A substring search for "http" would also accept a
            // scheme-less address that merely contains "http" somewhere in its host, path or query.
            var hasScheme = url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                            url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasScheme)
            {
                url = $"https://{url}";
            }

            _shopUrl = url;
            var mainHtml = GetMainWebContent(url, null, ref _cookies, null);

            // Until the checks below pass, _shopName temporarily holds the raw page title.
            _shopName = Regex.Match(mainHtml, @"(?<=<title>)[\s\S]*?(?=</title>)").Value.Trim();
            if (_shopName.Contains("阿里旅行·去啊Alitrip.com"))
            {
                // Message text: "Alitrip is not supported".
                throw new Exception("阿里旅行不支持");
            }
            if (_shopName.Equals("店铺浏览-淘宝网"))
            {
                // This title is treated as "shop does not exist": log it and let the caller see an empty URL.
                SendLog("店铺不存在!");
                return(string.Empty);
            }

            // Keep only the text between the first and the last '-' of the title.
            _shopName = Regex.Match(_shopName, "(?<=-).*(?=-)").Value.Trim();

            var categoryUrl  = $"{url}/search.htm";
            var html         = GetMainWebContent(categoryUrl, null, ref _cookies, null);
            var documentNode = HtmlAgilityPackHelper.GetDocumentNodeByHtml(html);

            // Fail with a descriptive error instead of a NullReferenceException when the page layout differs.
            var listUrl = documentNode.SelectSingleNode("//input[@id='J_ShopAsynSearchURL']")?.Attributes["value"]?.Value;
            if (listUrl == null)
            {
                throw new InvalidOperationException($"No J_ShopAsynSearchURL input with a value was found on {categoryUrl}");
            }

            return($"{url}{listUrl}");
        }
        /// <summary>
        /// Posts <paramref name="postDataString"/> to <paramref name="url"/>, extracts the case-detail
        /// links from the <c>dataStore</c> array of the response and scrapes each linked detail page
        /// into a dictionary of penalty fields.
        /// </summary>
        /// <param name="url">Address the list request is posted to; echoed into every record as <c>PostUrl</c>.</param>
        /// <param name="postDataString">Request body sent with the POST.</param>
        /// <returns>One dictionary per detail link found; empty when the response yields no link.</returns>
        private List <Dictionary <string, string> > GetInfo(string url, string postDataString)
        {
            var records = new List <Dictionary <string, string> >();
            Func <string, string> stripNbsp = text => text.Replace("&nbsp;", "");

            var client = new HttpHelper
            {
                Timeout      = 5 * 60 * 1000,
                HttpEncoding = _httpEncoding
            };

            // Narrow the response step by step: the dataStore array literal, then its first quoted
            // entry, then the trailing segment of that entry after its last '$'.
            var listHtml   = client.GetHtmlByPost(url, postDataString);
            var dataStore  = Regex.Match(listHtml, @"dataStore[\s]*=[\s]*\[.*?\]").Value;
            var firstEntry = Regex.Match(dataStore, "(?<=\")[^,]+?(?=\")").Value;
            var caseLinks  = Regex.Matches(firstEntry, @"(?<=\$)[^\$]*$");

            foreach (Match caseLink in caseLinks)
            {
                var detailHtml = client.GetHtmlByGet($"http://www.zjcredit.gov.cn{caseLink.Value}");
                var page       = HtmlAgilityPackHelper.GetDocumentNodeByHtml(detailHtml);

                var penalty = new AdministrativePenaltyInfo
                {
                    CaseName            = page.SelectSingleNode("//td[@class='listf2']").InnerText,
                    CaseId              = page.SelectSingleNode("//table[2]//tr[1]/td[@class='xzcf_xx']").InnerText,
                    PenaltyObject       = stripNbsp(page.SelectSingleNode("//table[2]//tr[2]/td[@class='xzcf_xx']/text()").InnerText),
                    LegalRepresentative = Regex.Match(page.SelectSingleNode("//table[2]//span[@class='xzcf_mc']").InnerText, "(?<=:).*").Value.Trim(),
                    Department          = page.SelectSingleNode("//table[2]//tr[3]/td[@class='xzcf_xx']").InnerText,
                    PenaltyDate         = page.SelectSingleNode("//table[2]//tr[4]/td[@class='xzcf_xx']").InnerText,
                    PenalyText          = page.SelectSingleNode("//table[4]//td[@class='xzcf_jds']").InnerText
                };

                var record = new Dictionary <string, string>
                {
                    ["CaseName"]            = penalty.CaseName,
                    ["CaseId"]              = penalty.CaseId,
                    ["PenaltyObject"]       = penalty.PenaltyObject,
                    ["LegalRepresentative"] = penalty.LegalRepresentative,
                    ["Department"]          = penalty.Department,
                    ["PenaltyDate"]         = penalty.PenaltyDate,
                    ["PenalyText"]          = penalty.PenalyText,
                    ["PostUrl"]             = url,
                    ["Url"]                 = caseLink.Value,
                    ["ThreadId"]            = Thread.CurrentThread.ManagedThreadId.ToString()
                };

                // Echo every scraped field to the console.
                foreach (var field in record)
                {
                    Console.WriteLine($"{field.Key}:{field.Value}");
                }

                records.Add(record);
            }

            return(records);
        }
예제 #3
0
 /// <summary>
 /// Returns the next URL to fetch. When <c>_isInHtml</c> is set, the "next page" link of the
 /// current <c>HtmlSource</c> is queued first.
 /// </summary>
 /// <returns>The URL at the head of <c>_urlQueue</c>, or <c>null</c> when the queue is empty.</returns>
 protected override string ParseNextUrl()
 {
     if (_isInHtml)
     {
         var documentNode = HtmlAgilityPackHelper.GetDocumentNodeByHtml(HtmlSource);
         var nextUrl =
             documentNode.SelectSingleNode("//a[@class='J_SearchAsync next']")?.Attributes["href"]?.Value;

         // An empty href carries no target, so it is skipped rather than queued as a bare "https:".
         if (!string.IsNullOrEmpty(nextUrl))
         {
             // Test for a scheme prefix, not for the substring "http": a link whose query string
             // happens to contain "http" must still get its scheme, and an already absolute link
             // is queued unchanged instead of being dropped.
             var isAbsolute = nextUrl.StartsWith("http://", System.StringComparison.OrdinalIgnoreCase) ||
                              nextUrl.StartsWith("https://", System.StringComparison.OrdinalIgnoreCase);

             // NOTE(review): the "https:" prefix presumably targets protocol-relative links ("//host/...") - confirm.
             _urlQueue.Enqueue(isAbsolute ? nextUrl : $"https:{nextUrl}");
         }
     }

     return _urlQueue.Count == 0 ? null : _urlQueue.Dequeue();
 }
예제 #4
0
        /// <summary>
        /// Reads the total page count from the pager element of the first list page and stores it
        /// in <c>_totalPage</c>.
        /// </summary>
        /// <param name="listHtmlFirst">Markup of the first list page.</param>
        /// <returns>
        /// The number following the '/' in the pager text, or 0 when it cannot be parsed. When no pager
        /// element is present, 0 is returned and <c>_totalPage</c> is left untouched.
        /// </returns>
        private int InitTotalPage(string listHtmlFirst)
        {
            var root = HtmlAgilityPackHelper.GetDocumentNodeByHtml(listHtmlFirst);

            // Verbatim strings: "" stands for a single double quote, so the class values matched
            // here literally include the surrounding \" characters.
            var pagerNode = root.SelectSingleNode(@"//span[@class='\""page-info\""']");
            if (pagerNode == null)
            {
                pagerNode = root.SelectSingleNode(@"//b[@class='\""ui-page-s-len\""']");
            }
            if (pagerNode == null)
            {
                return(0);
            }

            var digits = Regex.Match(pagerNode.InnerText, @"(?<=/)\d+").Value;

            int parsed;
            if (!int.TryParse(digits, out parsed))
            {
                parsed = 0;
            }

            _totalPage = parsed;
            return(parsed);
        }
예제 #5
0
        /// <summary>
        /// Determines how many list pages exist by parsing the pager element of the first list page;
        /// the result is also written to <c>_totalPage</c>.
        /// </summary>
        /// <param name="listHtmlFirst">Markup of the first list page.</param>
        /// <returns>
        /// The page count taken from the digits after '/' in the pager text, or 0 when those digits are
        /// missing or not a valid integer. Without a pager element the method returns 0 and does not
        /// modify <c>_totalPage</c>.
        /// </returns>
        private int InitTotalPage(string listHtmlFirst)
        {
            var page = HtmlAgilityPackHelper.GetDocumentNodeByHtml(listHtmlFirst);

            // In a verbatim string two double quotes denote one, so these XPath expressions look for
            // class values that are themselves wrapped in \" ... \".
            var pager = page.SelectSingleNode(@"//span[@class='\""page-info\""']");
            pager = pager ?? page.SelectSingleNode(@"//b[@class='\""ui-page-s-len\""']");
            if (pager == null)
            {
                return 0;
            }

            var pagerText  = pager.InnerText;
            var totalMatch = Regex.Match(pagerText, @"(?<=/)\d+");

            int total;
            var parsedOk = int.TryParse(totalMatch.Value, out total);
            var result   = parsedOk ? total : 0;

            _totalPage = result;
            return result;
        }
        /// <summary>
        /// Normalizes the shop address, fetches the shop's landing page to record its name in
        /// <c>_shopName</c>, and resolves the URL of the shop's asynchronous product-list endpoint.
        /// </summary>
        /// <param name="url">
        /// Shop address. <c>https://</c> is prepended when it does not start with an http/https scheme.
        /// The normalized value is also stored in <c>_shopUrl</c>.
        /// </param>
        /// <returns>
        /// The normalized shop URL followed by the value of the <c>J_ShopAsynSearchURL</c> input found on
        /// <c>/search.htm</c>, or <see cref="string.Empty"/> when the page title reports that the shop
        /// does not exist.
        /// </returns>
        /// <exception cref="Exception">The page title identifies an Alitrip shop, which is not supported.</exception>
        /// <exception cref="InvalidOperationException">
        /// The search page has no <c>J_ShopAsynSearchURL</c> input carrying a <c>value</c> attribute.
        /// </exception>
        private string InitFirstUrlPart(string url)
        {
            // Look for an actual scheme prefix. Searching for the substring "http" anywhere would
            // leave a scheme-less address unprefixed whenever "http" appears elsewhere in it.
            var hasScheme = url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                            url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasScheme)
            {
                url = $"https://{url}";
            }

            _shopUrl = url;
            var mainHtml = GetMainWebContent(url, null, ref _cookies, null);

            // _shopName holds the raw page title until the checks below have passed.
            _shopName = Regex.Match(mainHtml, @"(?<=<title>)[\s\S]*?(?=</title>)").Value.Trim();
            if (_shopName.Contains("阿里旅行·去啊Alitrip.com"))
            {
                // Message text: "Alitrip is not supported".
                throw new Exception("阿里旅行不支持");
            }
            if (_shopName.Equals("店铺浏览-淘宝网"))
            {
                // This title is treated as "shop does not exist": log it and hand back an empty URL.
                SendLog("店铺不存在!");
                return(string.Empty);
            }

            // Keep only the text between the first and the last '-' of the title.
            _shopName = Regex.Match(_shopName, "(?<=-).*(?=-)").Value.Trim();

            var categoryUrl  = $"{url}/search.htm";
            var html         = GetMainWebContent(categoryUrl, null, ref _cookies, null);
            var documentNode = HtmlAgilityPackHelper.GetDocumentNodeByHtml(html);

            // Example of the attribute value, taken from an earlier note in this method:
            //   /i/asynSearch.htm?mid=w-1901851942-0&wid=1901851942&path=/search.htm&amp;search=y
            // A missing input or attribute now fails with a descriptive error rather than a
            // NullReferenceException.
            var listUrl = documentNode.SelectSingleNode("//input[@id='J_ShopAsynSearchURL']")?.Attributes["value"]?.Value;
            if (listUrl == null)
            {
                throw new InvalidOperationException($"No J_ShopAsynSearchURL input with a value was found on {categoryUrl}");
            }

            return($"{url}{listUrl}");
        }
예제 #7
0
        /// <summary>
        /// Parses the announcement list in <c>HtmlSource</c>, downloads every announcement that is at most
        /// <c>_gatherDays</c> days old and returns its URL, title, content markup, publisher and publish time.
        /// </summary>
        /// <returns>
        /// One result per announcement processed; an empty array when the page contains no list cells.
        /// Processing stops at the first cell without a link or date, and at the first announcement
        /// older than <c>_gatherDays</c> (which also clears <c>_urlQueue</c>).
        /// </returns>
        protected override IResut[] ParseCurrentItems()
        {
            List <IResut>      resultList         = new List <IResut>();
            HtmlNode           htmlNode           = HtmlAgilityPackHelper.GetDocumentNodeByHtml(HtmlSource);
            HtmlNodeCollection htmlNodeCollection = htmlNode.SelectNodes("//td[@class='Font9']");

            // SelectNodes yields null, not an empty collection, when nothing matches.
            if (htmlNodeCollection == null)
            {
                return(resultList.ToArray());
            }

            foreach (HtmlNode node in htmlNodeCollection)
            {
                string url            = node.SelectSingleNode("./a[@class='five']")?.Attributes["href"]?.Value;
                string dateTimeString = Regex.Match(node.InnerText, @"\d+-\d+-\d+").Value;
                if (string.IsNullOrEmpty(url) || string.IsNullOrEmpty(dateTimeString))
                {
                    break;
                }
                url = $"http://www.ccgp-shandong.gov.cn{url}";

                DateTime dateTime = Convert.ToDateTime(dateTimeString);
                int      days     = (DateTime.Now - dateTime).Days;
                if (days > _gatherDays)
                {
                    // Clearing the queue means no further list pages are handed out after this one.
                    _urlQueue.Clear();
                    break;
                }

                string   html        = _httpHelper.GetHtmlByGet(url);
                HtmlNode htmlNode2   = HtmlAgilityPackHelper.GetDocumentNodeByHtml(html);
                string   title       = htmlNode2.SelectSingleNode("//div[@align='center']")?.InnerText;
                string   publisher   = Regex.Match(html, "(?<=发布人[::]).*(?=</td>)").Value;
                string   publishTime = Regex.Match(html, "(?<=发布时间[::]).*(?=</td>)").Value;

                // Normalize the publish time when it parses; otherwise keep the raw text (possibly empty)
                // so one page without a recognisable date does not abort the whole batch.
                DateTime publishedAt;
                if (DateTime.TryParse(publishTime, CultureInfo.CurrentCulture, DateTimeStyles.None, out publishedAt))
                {
                    publishTime = publishedAt.ToString(CultureInfo.CurrentCulture);
                }

                // Null-safe like the title lookup: a page with a different layout yields null content.
                string content = htmlNode2.SelectSingleNode("//table//tr[2]/td[2]/table")?.OuterHtml;

                Resut resut = new Resut()
                {
                    ["url"]         = url,
                    ["title"]       = title,
                    ["content"]     = content,
                    ["publisher"]   = publisher,
                    ["publishTime"] = publishTime
                };

                resultList.Add(resut);
            }
            return(resultList.ToArray());
        }
예제 #8
0
        /// <summary>
        /// Collects the products listed in <c>HtmlSource</c>. Two markup variants are handled: attribute
        /// values written normally (when <c>_isInHtml</c> is set) and attribute values wrapped in
        /// backslash-escaped quotes (otherwise).
        /// </summary>
        /// <returns>
        /// One result per product whose id is not yet in <c>_hashTable</c>; an empty array when the page
        /// contains no recognisable product container. Products lacking the expected detail link are skipped.
        /// </returns>
        protected override IResut[] ParseCurrentItems()
        {
            var resultList  = new List<IResut>();
            var docmentNode = HtmlAgilityPackHelper.GetDocumentNodeByHtml(HtmlSource);

            if (_isInHtml)
            {
                // All three queries return container <div>s, because the loop below inspects the class
                // of each container and then searches it for <dl> items. The last query is the fallback
                // for pages with neither a pagination nor a comboHd block.
                // An earlier note in this method records that XPath matches()/ends-with() were rejected
                // here with a "namespace manager or XsltContext" error, hence contains().
                var htmlNodeCollection = docmentNode.SelectNodes(@"//div[@class='pagination']/parent::div/child::div")
                                         ?? docmentNode.SelectNodes(@"//div[@class='comboHd']/parent::div/child::div")
                                         ?? docmentNode.SelectNodes(@"//div[contains(@class,'item') and contains(@class,'line1')]");

                // SelectNodes yields null when nothing matches.
                if (htmlNodeCollection == null)
                {
                    return resultList.ToArray();
                }

                foreach (var htmlNode in htmlNodeCollection)
                {
                    var attributes = htmlNode.Attributes["class"]?.Value ?? string.Empty;

                    // Stop here: the recommended products that follow the pager are not wanted.
                    if (attributes == @"pagination")
                    {
                        break;
                    }
                    if (attributes == @"comboHd")
                    {
                        // Clear the queue so no further pages are requested.
                        _urlQueue.Clear();
                        break;
                    }
                    if (!attributes.Contains(@"item") || !attributes.Contains(@"line1"))
                    {
                        continue;
                    }

                    var htmlNodeDls = htmlNode.SelectNodes(".//dl");
                    if (htmlNodeDls == null)
                    {
                        continue;
                    }

                    foreach (var htmlNodeDl in htmlNodeDls)
                    {
                        var detailNode =
                            htmlNodeDl.SelectSingleNode(
                                @".//dd[@class='detail']//a[@class='item-name J_TGoldData']");
                        var productUrl = detailNode?.Attributes["href"]?.Value;
                        if (productUrl == null)
                        {
                            // No detail link: the id cannot be derived, so skip this item.
                            continue;
                        }

                        var productId = Regex.Match(productUrl, @"(?<=id=)\d+").Value;

                        // Only products not seen before are reported.
                        if (_hashTable.ContainsKey(productId))
                        {
                            continue;
                        }
                        _hashTable.Add(productId, null);

                        resultList.Add(CreateProductResut(
                            productId,
                            detailNode.InnerText.Trim(),
                            productUrl,
                            htmlNodeDl.SelectSingleNode(@".//span[@class='c-price']")?.InnerText.Trim(),
                            null, // this markup variant carries no second price
                            htmlNodeDl.SelectSingleNode(@".//span[@class='sale-num']")?.InnerText.Trim(),
                            htmlNodeDl.SelectSingleNode(@".//h4/a/span")?.InnerText));
                    }
                }
            }
            else
            {
                // Matches the text between the first \" and the last \" of an escaped attribute value.
                const string escapedQuotedValue = @"(?<=\\"").*(?=\\"")";

                var htmlNodeCollection = docmentNode.SelectNodes(
                    @"//div[@class='\""pagination\""']/parent::div/child::div")
                                         ??
                                         docmentNode.SelectNodes(@"//div[@class='\""comboHd\""']/parent::div/child::div");

                // SelectNodes yields null when nothing matches.
                if (htmlNodeCollection == null)
                {
                    return resultList.ToArray();
                }

                foreach (var htmlNode in htmlNodeCollection)
                {
                    var attributes = htmlNode.Attributes["class"]?.Value ?? string.Empty;

                    // Stop here: the recommended products that follow the pager are not wanted.
                    if (attributes == @"\""pagination\""")
                    {
                        break;
                    }
                    if (attributes == @"\""comboHd\""")
                    {
                        // Clear the queue so no further pages are requested.
                        _urlQueue.Clear();
                        break;
                    }
                    if (!attributes.Contains(@"\""item") || !attributes.Contains(@"line1\"""))
                    {
                        continue;
                    }

                    var htmlNodeDls = htmlNode.SelectNodes(".//dl");
                    if (htmlNodeDls == null)
                    {
                        continue;
                    }

                    foreach (var htmlNodeDl in htmlNodeDls)
                    {
                        var rawProductId = htmlNodeDl.Attributes["data-id"]?.Value;
                        if (rawProductId == null)
                        {
                            continue;
                        }

                        var productId = Regex.Match(rawProductId, escapedQuotedValue).Value;

                        // Only products not seen before are reported.
                        if (_hashTable.ContainsKey(productId))
                        {
                            continue;
                        }

                        // Resolve the detail link before marking the id as seen, so an item that cannot
                        // be parsed is not recorded as already collected.
                        var detailNode =
                            htmlNodeDl.SelectSingleNode(
                                @".//dd[@class='\""detail\""']//a[@class='\""item-name']");
                        var rawProductUrl = detailNode?.Attributes["href"]?.Value;
                        if (rawProductUrl == null)
                        {
                            continue;
                        }
                        _hashTable.Add(productId, null);

                        resultList.Add(CreateProductResut(
                            productId,
                            detailNode.InnerText.Trim(),
                            $"https:{Regex.Match(rawProductUrl, escapedQuotedValue).Value}",
                            htmlNodeDl.SelectSingleNode(@".//span[@class='\""c-price\""']")?.InnerText.Trim(),
                            htmlNodeDl.SelectSingleNode(@".//span[@class='\""s-price\""']")?.InnerText.Trim(),
                            htmlNodeDl.SelectSingleNode(@".//span[@class='\""sale-num\""']")?.InnerText.Trim(),
                            htmlNodeDl.SelectSingleNode(@".//div[@class='\""title\""']")?.InnerText));
                    }
                }
            }

            return resultList.ToArray();
        }

        /// <summary>
        /// Builds the result record for one product, attaching the current <c>_shopUrl</c> (as
        /// <c>shopId</c>) and <c>_shopName</c>.
        /// </summary>
        /// <param name="commentText">Raw text holding the comment count; its first run of digits is stored, or null when the text is null.</param>
        private Resut CreateProductResut(string productId, string productName, string productUrl,
                                         string price, string maxPrice, string saleNum, string commentText)
        {
            var comment = commentText == null ? null : Regex.Match(commentText, @"\d+").Value;

            return new Resut
            {
                ["productId"] = productId,
                ["productName"] = productName,
                ["productUrl"] = productUrl,
                ["shopId"] = _shopUrl,
                ["shopName"] = _shopName,
                ["price"] = price,
                ["maxPrice"] = maxPrice,
                ["saleNum"] = saleNum,
                ["comment"] = comment
            };
        }
예제 #9
0
        /// <summary>
        /// Posts the table id to <paramref name="url"/>, then for every quoted entry of the <c>dataStore</c>
        /// array in the response requests the matching detail page and scrapes it into a dictionary.
        /// </summary>
        /// <param name="url">Address of the list request; echoed into every record as <c>PageUrl</c>.</param>
        /// <returns>One dictionary per <c>dataStore</c> entry; empty when the response contains none.</returns>
        private IEnumerable <Dictionary <string, string> > GetInfo(string url)
        {
            var records = new List <Dictionary <string, string> >();
            Func <string, string> stripNbsp = text => text.Replace("&nbsp;", "");

            var client = new HttpHelper
            {
                HttpEncoding = _httpEncoding,
                Timeout      = 5 * 60 * 1000
            };

            var listHtml  = client.GetHtmlByPost(url, $"id={_tableId}&inTime={DateTime.Now}");
            var dataStore = Regex.Match(listHtml, @"dataStore[\s]*=[\s]*\[.*?\]").Value;
            var entries   = Regex.Matches(dataStore, "(?<=\")[^,]+?(?=\")");

            foreach (Match entry in entries)
            {
                // The record id is the text between "E$" and the closing '$' of the entry.
                var idSegment = Regex.Match(entry.Value, @"[E][\$][^\$]*[\$]").Value;
                var cId       = Regex.Match(idSegment, @"(?<=E[\$])[\S]*(?=[\$])").Value;

                var content = client.GetHtmlByPost("http://www.zjcredit.gov.cn/info/promptsDetail.do", $"tableId={_tableId}&cId={cId}&inTime={DateTime.Now}");
                var page    = HtmlAgilityPackHelper.GetDocumentNodeByHtml(content);

                // The three header values are taken from the raw response text.
                var info = new SincerityInfo
                {
                    InfoName    = Regex.Match(content, "(?<=信息名称:).*?(?=')").Value,
                    DataSources = Regex.Match(content, "(?<=数据来源:).*?(?=')").Value,
                    UpdateTime  = Regex.Match(content, "(?<=更新时间:).*?(?=')").Value
                };

                // The remaining values are read by position from the xyml_t2 cells.
                var cells = page.SelectNodes("//td[@class='xyml_t2']").ToList();
                info.OrganizationCode    = stripNbsp(cells[0].InnerText);
                info.LegalRepresentative = stripNbsp(cells[1].InnerText);
                info.WorkAddress         = stripNbsp(cells[2].InnerText);
                info.Court               = stripNbsp(cells[3].InnerText);
                info.CaseNo              = stripNbsp(cells[4].InnerText);
                info.ExecutiveBasis      = stripNbsp(cells[5].InnerText);
                info.ExecutiveReason     = stripNbsp(cells[6].InnerText);
                info.ExecutiveTime       = stripNbsp(cells[7].InnerText);
                info.ExecutiveMoney      = stripNbsp(cells[8].InnerText);
                info.NotExecutiveMoney   = stripNbsp(cells[9].InnerText);
                info.ExposureTime        = stripNbsp(cells[10].InnerText);

                var record = new Dictionary <string, string>
                {
                    ["InfoName"]            = info.InfoName,
                    ["DataSources"]         = info.DataSources,
                    ["UpdateTime"]          = info.UpdateTime,
                    ["OrganizationCode"]    = info.OrganizationCode,
                    ["LegalRepresentative"] = info.LegalRepresentative,
                    ["WorkAddress"]         = info.WorkAddress,
                    ["Court"]               = info.Court,
                    ["CaseNo"]              = info.CaseNo,
                    ["ExecutiveBasis"]      = info.ExecutiveBasis,
                    ["ExecutiveReason"]     = info.ExecutiveReason,
                    ["ExecutiveTime"]       = info.ExecutiveTime,
                    ["ExecutiveMoney"]      = info.ExecutiveMoney,
                    ["NotExecutiveMoney"]   = info.NotExecutiveMoney,
                    ["ExposureTime"]        = info.ExposureTime,
                    ["PageUrl"]             = url,
                    ["CId"]                 = cId,
                    ["ThreadId"]            = Thread.CurrentThread.ManagedThreadId.ToString()
                };

                records.Add(record);

                // Echo every scraped field to the console.
                foreach (var field in record)
                {
                    Console.WriteLine($"{field.Key}:{field.Value}");
                }
            }

            return(records);
        }