/// <summary>
/// Resolves the asynchronous product-list URL of a shop and records the shop URL and shop name
/// in <c>_shopUrl</c> and <c>_shopName</c> along the way.
/// </summary>
/// <param name="url">Shop address; <c>https://</c> is prepended when the text does not contain "http".</param>
/// <returns>
/// The shop URL followed by the value of the <c>J_ShopAsynSearchURL</c> input found on the shop's
/// <c>/search.htm</c> page, or <see cref="string.Empty"/> when the page title is the "shop not found" title.
/// </returns>
/// <exception cref="NotSupportedException">The page title identifies an Alitrip shop.</exception>
/// <exception cref="InvalidOperationException">
/// The search page has no <c>J_ShopAsynSearchURL</c> input, or the input has no <c>value</c> attribute.
/// </exception>
private string InitFirstUrlPart(string url)
{
    // Ordinal, case-insensitive check; no lower-cased copy and no dependence on the current culture.
    if (url.IndexOf("http", StringComparison.OrdinalIgnoreCase) < 0)
    {
        url = $"https://{url}";
    }

    _shopUrl = url;
    var mainHtml = GetMainWebContent(url, null, ref _cookies, null);
    _shopName = Regex.Match(mainHtml, @"(?<=<title>)[\s\S]*?(?=</title>)").Value.Trim();

    if (_shopName.Contains("阿里旅行·去啊Alitrip.com"))
    {
        throw new NotSupportedException("阿里旅行不支持");
    }

    if (_shopName.Equals("店铺浏览-淘宝网"))
    {
        // Generic browse title: report it through the log and signal failure with an empty string.
        SendLog("店铺不存在!");
        return string.Empty;
    }

    // Keep only the part of the title between its first and last '-'.
    _shopName = Regex.Match(_shopName, "(?<=-).*(?=-)").Value.Trim();

    var categoryUrl = $"{url}/search.htm";
    var html = GetMainWebContent(categoryUrl, null, ref _cookies, null);
    var documentNode = HtmlAgilityPackHelper.GetDocumentNodeByHtml(html);

    // SelectSingleNode and Attributes[...] both yield null when nothing matches, so check before use
    // instead of failing with an uninformative NullReferenceException.
    var listUrl = documentNode.SelectSingleNode("//input[@id='J_ShopAsynSearchURL']")?.Attributes["value"]?.Value;
    if (listUrl == null)
    {
        throw new InvalidOperationException(
            $"The input 'J_ShopAsynSearchURL' (or its value attribute) was not found on {categoryUrl}.");
    }

    return $"{url}{listUrl}";
}
/// <summary>
/// Posts a list query, then downloads and parses the detail page of every case referenced in the
/// <c>dataStore</c> array of the response.
/// </summary>
/// <param name="url">Address the list query is posted to; stored in each record under "PostUrl".</param>
/// <param name="postDataString">Body of the POST request.</param>
/// <returns>
/// One dictionary per parsed detail page; empty when the response contains no <c>dataStore</c> entries.
/// </returns>
private List<Dictionary<string, string>> GetInfo(string url, string postDataString)
{
    Func<string, string> removeSpace = s => s.Replace(" ", "");
    var listDic = new List<Dictionary<string, string>>();
    var httpHelper = new HttpHelper { Timeout = 5 * 60 * 1000, HttpEncoding = _httpEncoding };
    var listHtml = httpHelper.GetHtmlByPost(url, postDataString);

    // Text of the form: dataStore = [ "...", "...", ... ]
    var dataStore = Regex.Match(listHtml, @"dataStore[\s]*=[\s]*\[.*?\]").Value;

    // Visit every quoted entry of the array. Previously only the first entry was inspected,
    // so at most one case per list page was ever collected.
    foreach (Match entry in Regex.Matches(dataStore, "(?<=\")[^,]+?(?=\")"))
    {
        // The detail path is whatever follows the last '$' of the entry.
        var caseUrl = Regex.Match(entry.Value, @"(?<=\$)[^\$]*$");
        if (!caseUrl.Success)
        {
            continue;
        }

        var detailHtml = httpHelper.GetHtmlByGet($"http://www.zjcredit.gov.cn{caseUrl.Value}");
        var htmlNode = HtmlAgilityPackHelper.GetDocumentNodeByHtml(detailHtml);

        var administrativePenaltyInfo = new AdministrativePenaltyInfo();
        administrativePenaltyInfo.CaseName = htmlNode.SelectSingleNode("//td[@class='listf2']").InnerText;
        administrativePenaltyInfo.CaseId = htmlNode.SelectSingleNode("//table[2]//tr[1]/td[@class='xzcf_xx']").InnerText;
        administrativePenaltyInfo.PenaltyObject = removeSpace(htmlNode.SelectSingleNode("//table[2]//tr[2]/td[@class='xzcf_xx']/text()").InnerText);
        // Keep the text after the (full-width) colon of the label.
        administrativePenaltyInfo.LegalRepresentative = Regex.Match(htmlNode.SelectSingleNode("//table[2]//span[@class='xzcf_mc']").InnerText, "(?<=:).*").Value.Trim();
        administrativePenaltyInfo.Department = htmlNode.SelectSingleNode("//table[2]//tr[3]/td[@class='xzcf_xx']").InnerText;
        administrativePenaltyInfo.PenaltyDate = htmlNode.SelectSingleNode("//table[2]//tr[4]/td[@class='xzcf_xx']").InnerText;
        administrativePenaltyInfo.PenalyText = htmlNode.SelectSingleNode("//table[4]//td[@class='xzcf_jds']").InnerText;

        var dic = new Dictionary<string, string>
        {
            ["CaseName"] = administrativePenaltyInfo.CaseName,
            ["CaseId"] = administrativePenaltyInfo.CaseId,
            ["PenaltyObject"] = administrativePenaltyInfo.PenaltyObject,
            ["LegalRepresentative"] = administrativePenaltyInfo.LegalRepresentative,
            ["Department"] = administrativePenaltyInfo.Department,
            ["PenaltyDate"] = administrativePenaltyInfo.PenaltyDate,
            ["PenalyText"] = administrativePenaltyInfo.PenalyText,
            ["PostUrl"] = url,
            ["Url"] = caseUrl.Value,
            ["ThreadId"] = Thread.CurrentThread.ManagedThreadId.ToString()
        };

        // Echo the record to the console.
        foreach (var info in dic)
        {
            Console.WriteLine($"{info.Key}:{info.Value}");
        }

        listDic.Add(dic);
    }

    return listDic;
}
/// <summary>
/// Returns the next URL to visit. When <c>_isInHtml</c> is set, the "next page" link of
/// <c>HtmlSource</c> (if any) is queued first.
/// </summary>
/// <returns>The URL dequeued from <c>_urlQueue</c>, or <c>null</c> when the queue is empty.</returns>
protected override string ParseNextUrl()
{
    if (_isInHtml)
    {
        var documentNode = HtmlAgilityPackHelper.GetDocumentNodeByHtml(HtmlSource);
        var nextUrl = documentNode.SelectSingleNode("//a[@class='J_SearchAsync next']")?.Attributes["href"]?.Value;
        if (nextUrl != null)
        {
            // An absolute link is queued unchanged; anything else gets the "https:" prefix.
            // Previously any link that merely contained "http" was dropped instead of being queued.
            var isAbsolute = nextUrl.StartsWith("http", System.StringComparison.OrdinalIgnoreCase);
            _urlQueue.Enqueue(isAbsolute ? nextUrl : $"https:{nextUrl}");
        }
    }

    return _urlQueue.Count == 0 ? null : _urlQueue.Dequeue();
}
/// <summary>
/// Reads the total page count from the first list page and stores it in <c>_totalPage</c>.
/// </summary>
/// <param name="listHtmlFirst">HTML of the first list page.</param>
/// <returns>
/// The number following '/' in the page-info element; 0 when the element is missing or holds no such number.
/// </returns>
private int InitTotalPage(string listHtmlFirst)
{
    var root = HtmlAgilityPackHelper.GetDocumentNodeByHtml(listHtmlFirst);

    // Verbatim strings: no backslash escaping, and a doubled double-quote stands for one double-quote,
    // so the class value being matched is literally \"page-info\" (backslashes and quotes included).
    var pageInfoNode = root.SelectSingleNode(@"//span[@class='\""page-info\""']");
    if (pageInfoNode == null)
    {
        pageInfoNode = root.SelectSingleNode(@"//b[@class='\""ui-page-s-len\""']");
    }

    if (pageInfoNode == null)
    {
        // _totalPage is left untouched on this path.
        return 0;
    }

    // Take the digits that follow the slash.
    var digits = Regex.Match(pageInfoNode.InnerText, @"(?<=/)\d+").Value;

    int total;
    if (!int.TryParse(digits, out total))
    {
        total = 0;
    }

    _totalPage = total;
    return total;
}
/// <summary>
/// Determines how many list pages exist, based on the first list page, and stores the count in
/// <c>_totalPage</c>.
/// </summary>
/// <param name="listHtmlFirst">HTML of the first list page.</param>
/// <returns>
/// The number following '/' in the page-info element; 0 when the element is missing or holds no such number.
/// </returns>
private int InitTotalPage(string listHtmlFirst)
{
    var root = HtmlAgilityPackHelper.GetDocumentNodeByHtml(listHtmlFirst);

    // Verbatim strings: no backslash escaping, and a doubled double-quote stands for one double-quote,
    // so the class value being matched is literally \"page-info\" (backslashes and quotes included).
    var pageInfoNode = root.SelectSingleNode(@"//span[@class='\""page-info\""']");
    if (pageInfoNode == null)
    {
        pageInfoNode = root.SelectSingleNode(@"//b[@class='\""ui-page-s-len\""']");
    }

    if (pageInfoNode == null)
    {
        // _totalPage is left untouched on this path.
        return 0;
    }

    // Take the digits that follow the slash.
    var digits = Regex.Match(pageInfoNode.InnerText, @"(?<=/)\d+").Value;

    int total;
    if (!int.TryParse(digits, out total))
    {
        total = 0;
    }

    _totalPage = total;
    return total;
}
/// <summary>
/// Resolves the asynchronous product-list URL of a shop and records the shop URL and shop name
/// in <c>_shopUrl</c> and <c>_shopName</c> along the way.
/// </summary>
/// <param name="url">Shop address; <c>https://</c> is prepended when the text does not contain "http".</param>
/// <returns>
/// The shop URL followed by the value of the <c>J_ShopAsynSearchURL</c> input found on the shop's
/// <c>/search.htm</c> page, or <see cref="string.Empty"/> when the page title is the "shop not found" title.
/// </returns>
/// <exception cref="NotSupportedException">The page title identifies an Alitrip shop.</exception>
/// <exception cref="InvalidOperationException">
/// The search page has no <c>J_ShopAsynSearchURL</c> input, or the input has no <c>value</c> attribute.
/// </exception>
private string InitFirstUrlPart(string url)
{
    // Ordinal, case-insensitive check; no lower-cased copy and no dependence on the current culture.
    if (url.IndexOf("http", StringComparison.OrdinalIgnoreCase) < 0)
    {
        url = $"https://{url}";
    }

    _shopUrl = url;
    var mainHtml = GetMainWebContent(url, null, ref _cookies, null);
    _shopName = Regex.Match(mainHtml, @"(?<=<title>)[\s\S]*?(?=</title>)").Value.Trim();

    if (_shopName.Contains("阿里旅行·去啊Alitrip.com"))
    {
        throw new NotSupportedException("阿里旅行不支持");
    }

    if (_shopName.Equals("店铺浏览-淘宝网"))
    {
        // Generic browse title: report it through the log and signal failure with an empty string.
        SendLog("店铺不存在!");
        return string.Empty;
    }

    // Keep only the part of the title between its first and last '-'.
    _shopName = Regex.Match(_shopName, "(?<=-).*(?=-)").Value.Trim();

    var categoryUrl = $"{url}/search.htm";
    var html = GetMainWebContent(categoryUrl, null, ref _cookies, null);
    var documentNode = HtmlAgilityPackHelper.GetDocumentNodeByHtml(html);

    // Example of the expected value:
    //   /i/asynSearch.htm?mid=w-1901851942-0&wid=1901851942&path=/search.htm&search=y
    // SelectSingleNode and Attributes[...] both yield null when nothing matches, so check before use
    // instead of failing with an uninformative NullReferenceException.
    var listUrl = documentNode.SelectSingleNode("//input[@id='J_ShopAsynSearchURL']")?.Attributes["value"]?.Value;
    if (listUrl == null)
    {
        throw new InvalidOperationException(
            $"The input 'J_ShopAsynSearchURL' (or its value attribute) was not found on {categoryUrl}.");
    }

    return $"{url}{listUrl}";
}
/// <summary>
/// Parses the announcement list held in <c>HtmlSource</c>, downloads each listed announcement and
/// returns one result per announcement.
/// </summary>
/// <remarks>
/// Processing stops at the first list cell that lacks a link or a date, and at the first announcement
/// older than <c>_gatherDays</c> days; in the latter case <c>_urlQueue</c> is cleared as well.
/// </remarks>
/// <returns>The parsed announcements; an empty array when the page contains no list cells.</returns>
protected override IResut[] ParseCurrentItems()
{
    List<IResut> resultList = new List<IResut>();
    HtmlNode htmlNode = HtmlAgilityPackHelper.GetDocumentNodeByHtml(HtmlSource);
    HtmlNodeCollection htmlNodeCollection = htmlNode.SelectNodes("//td[@class='Font9']");

    // SelectNodes yields null, not an empty collection, when nothing matches;
    // enumerating it unchecked would throw a NullReferenceException.
    if (htmlNodeCollection == null)
    {
        return resultList.ToArray();
    }

    foreach (HtmlNode node in htmlNodeCollection)
    {
        string url = node.SelectSingleNode("./a[@class='five']")?.Attributes["href"]?.Value;
        string dateTimeString = Regex.Match(node.InnerText, @"\d+-\d+-\d+").Value;
        if (string.IsNullOrEmpty(url) || string.IsNullOrEmpty(dateTimeString))
        {
            break;
        }

        url = $"http://www.ccgp-shandong.gov.cn{url}";
        DateTime dateTime = Convert.ToDateTime(dateTimeString);
        int days = (DateTime.Now - dateTime).Days;
        if (days > _gatherDays)
        {
            // Too old: drop every pending URL and stop.
            _urlQueue.Clear();
            break;
        }

        string html = _httpHelper.GetHtmlByGet(url);
        HtmlNode htmlNode2 = HtmlAgilityPackHelper.GetDocumentNodeByHtml(html);
        string title = htmlNode2.SelectSingleNode("//div[@align='center']")?.InnerText;
        // Both labels accept a full-width or an ASCII colon.
        string publisher = Regex.Match(html, "(?<=发布人[::]).*(?=</td>)").Value;
        string publishTime = Regex.Match(html, "(?<=发布时间[::]).*(?=</td>)").Value;
        // Normalise the timestamp to the current culture's default date/time format.
        publishTime = Convert.ToDateTime(publishTime).ToString(CultureInfo.CurrentCulture);
        string content = htmlNode2.SelectSingleNode("//table//tr[2]/td[2]/table").OuterHtml;

        Resut resut = new Resut()
        {
            ["url"] = url,
            ["title"] = title,
            ["content"] = content,
            ["publisher"] = publisher,
            ["publishTime"] = publishTime
        };
        resultList.Add(resut);
    }

    return resultList.ToArray();
}
/// <summary>
/// Extracts the products listed in <c>HtmlSource</c>, skipping every product id that is already
/// present in <c>_hashTable</c> and registering the new ones there.
/// </summary>
/// <remarks>
/// When <c>_isInHtml</c> is set the source is matched with plain attribute values; otherwise every
/// attribute value is matched with literal <c>\"</c> delimiters around it. In both cases the sibling
/// containers are walked in document order and the walk ends at the <c>pagination</c> container, or
/// at the <c>comboHd</c> container (which additionally clears <c>_urlQueue</c>).
/// </remarks>
/// <returns>One result per newly seen product; an empty array when no listing container is found.</returns>
protected override IResut[] ParseCurrentItems()
{
    // Captures the text between the first \" and the last \" of a backslash-escaped attribute value.
    const string escapedValuePattern = @"(?<=\\"").*(?=\\"")";

    var resultList = new List<IResut>();
    var docmentNode = HtmlAgilityPackHelper.GetDocumentNodeByHtml(HtmlSource);

    if (_isInHtml)
    {
        // NOTE(review): the third fallback selects <dl> nodes directly, while the loop below only
        // descends into nodes whose class contains both "item" and "line1" - confirm that this
        // fallback is expected to yield products.
        var htmlNodeCollection =
            docmentNode.SelectNodes(@"//div[@class='pagination']/parent::div/child::div")
            ?? docmentNode.SelectNodes(@"//div[@class='comboHd']/parent::div/child::div")
            ?? docmentNode.SelectNodes(@"//div[contains(@class,'item') and contains(@class,'line1')]//dl");

        // SelectNodes yields null when nothing matches; there is nothing to parse in that case.
        if (htmlNodeCollection == null)
        {
            return resultList.ToArray();
        }

        foreach (var htmlNode in htmlNodeCollection)
        {
            var attributes = htmlNode.Attributes["class"]?.Value;
            if (attributes == null)
            {
                // A node without a class attribute is none of the containers handled below.
                continue;
            }

            // Stop here: the recommended products that follow are not wanted.
            if (attributes == @"pagination")
            {
                break;
            }

            if (attributes == @"comboHd")
            {
                // Empty the URL queue.
                _urlQueue.Clear();
                break;
            }

            if (!attributes.Contains(@"item") || !attributes.Contains(@"line1"))
            {
                continue;
            }

            var htmlNodeDls = htmlNode.SelectNodes(".//dl");
            if (htmlNodeDls == null)
            {
                continue;
            }

            foreach (var htmlNodeDl in htmlNodeDls)
            {
                var detailNode = htmlNodeDl.SelectSingleNode(
                    @".//dd[@class='detail']//a[@class='item-name J_TGoldData']");
                var productName = detailNode.InnerText.Trim();
                var productUrl = detailNode.Attributes["href"].Value;
                var productId = Regex.Match(productUrl, @"(?<=id=)\d+").Value;

                // Only products that have not been seen before are reported.
                if (_hashTable.ContainsKey(productId))
                {
                    continue;
                }

                _hashTable.Add(productId, null);

                var price = htmlNodeDl.SelectSingleNode(@".//span[@class='c-price']")?.InnerText.Trim();
                string maxPrice = null;
                var saleNum = htmlNodeDl.SelectSingleNode(@".//span[@class='sale-num']")?.InnerText.Trim();
                var comment = htmlNodeDl.SelectSingleNode(@".//h4/a/span")?.InnerText;
                comment = comment == null ? null : Regex.Match(comment, @"\d+").Value;

                var resut = new Resut
                {
                    ["productId"] = productId,
                    ["productName"] = productName,
                    ["productUrl"] = productUrl,
                    ["shopId"] = _shopUrl,
                    ["shopName"] = _shopName,
                    ["price"] = price,
                    ["maxPrice"] = maxPrice,
                    ["saleNum"] = saleNum,
                    ["comment"] = comment
                };
                resultList.Add(resut);
            }
        }
    }
    else
    {
        var htmlNodeCollection =
            docmentNode.SelectNodes(@"//div[@class='\""pagination\""']/parent::div/child::div")
            ?? docmentNode.SelectNodes(@"//div[@class='\""comboHd\""']/parent::div/child::div");

        // SelectNodes yields null when nothing matches; there is nothing to parse in that case.
        if (htmlNodeCollection == null)
        {
            return resultList.ToArray();
        }

        foreach (var htmlNode in htmlNodeCollection)
        {
            var attributes = htmlNode.Attributes["class"]?.Value;
            if (attributes == null)
            {
                // A node without a class attribute is none of the containers handled below.
                continue;
            }

            // Stop here: the recommended products that follow are not wanted.
            if (attributes == @"\""pagination\""")
            {
                break;
            }

            if (attributes == @"\""comboHd\""")
            {
                // Empty the URL queue.
                _urlQueue.Clear();
                break;
            }

            if (!attributes.Contains(@"\""item") || !attributes.Contains(@"line1\"""))
            {
                continue;
            }

            var htmlNodeDls = htmlNode.SelectNodes(".//dl");
            if (htmlNodeDls == null)
            {
                continue;
            }

            foreach (var htmlNodeDl in htmlNodeDls)
            {
                var productId = Regex.Match(htmlNodeDl.Attributes["data-id"].Value, escapedValuePattern).Value;

                // Only products that have not been seen before are reported.
                if (_hashTable.ContainsKey(productId))
                {
                    continue;
                }

                _hashTable.Add(productId, null);

                var detailNode = htmlNodeDl.SelectSingleNode(
                    @".//dd[@class='\""detail\""']//a[@class='\""item-name']");
                var productName = detailNode.InnerText.Trim();
                var productUrl = $"https:{Regex.Match(detailNode.Attributes["href"].Value, escapedValuePattern).Value}";

                var price = htmlNodeDl.SelectSingleNode(@".//span[@class='\""c-price\""']")?.InnerText.Trim();
                var maxPrice = htmlNodeDl.SelectSingleNode(@".//span[@class='\""s-price\""']")?.InnerText.Trim();
                var saleNum = htmlNodeDl.SelectSingleNode(@".//span[@class='\""sale-num\""']")?.InnerText.Trim();
                var comment = htmlNodeDl.SelectSingleNode(@".//div[@class='\""title\""']")?.InnerText;
                comment = comment == null ? null : Regex.Match(comment, @"\d+").Value;

                var resut = new Resut
                {
                    ["productId"] = productId,
                    ["productName"] = productName,
                    ["productUrl"] = productUrl,
                    ["shopId"] = _shopUrl,
                    ["shopName"] = _shopName,
                    ["price"] = price,
                    ["maxPrice"] = maxPrice,
                    ["saleNum"] = saleNum,
                    ["comment"] = comment
                };
                resultList.Add(resut);
            }
        }
    }

    return resultList.ToArray();
}
/// <summary>
/// Posts the list query for <c>_tableId</c>, then posts one detail query per <c>dataStore</c> entry
/// of the response and turns every detail page into a dictionary of its fields.
/// </summary>
/// <param name="url">Address the list query is posted to; stored in each record under "PageUrl".</param>
/// <returns>One dictionary per <c>dataStore</c> entry, in the order the entries appear.</returns>
private IEnumerable<Dictionary<string, string>> GetInfo(string url)
{
    var records = new List<Dictionary<string, string>>();
    var client = new HttpHelper { HttpEncoding = _httpEncoding, Timeout = 5 * 60 * 1000 };

    var listHtml = client.GetHtmlByPost(url, $"id={_tableId}&inTime={DateTime.Now}");

    // Text of the form: dataStore = [ "...", "...", ... ]; each quoted entry is visited in turn.
    var dataStore = Regex.Match(listHtml, @"dataStore[\s]*=[\s]*\[.*?\]").Value;
    foreach (Match entry in Regex.Matches(dataStore, "(?<=\")[^,]+?(?=\")"))
    {
        // The record id sits between "E$" and the following '$'.
        var idSegment = Regex.Match(entry.Value, @"[E][\$][^\$]*[\$]").Value;
        var cId = Regex.Match(idSegment, @"(?<=E[\$])[\S]*(?=[\$])").Value;

        var detailHtml = client.GetHtmlByPost(
            "http://www.zjcredit.gov.cn/info/promptsDetail.do",
            $"tableId={_tableId}&cId={cId}&inTime={DateTime.Now}");
        var detailRoot = HtmlAgilityPackHelper.GetDocumentNodeByHtml(detailHtml);

        // Header values are taken from the raw text, up to the next apostrophe after each label.
        var sincerityInfo = new SincerityInfo
        {
            InfoName = Regex.Match(detailHtml, "(?<=信息名称:).*?(?=')").Value,
            DataSources = Regex.Match(detailHtml, "(?<=数据来源:).*?(?=')").Value,
            UpdateTime = Regex.Match(detailHtml, "(?<=更新时间:).*?(?=')").Value
        };

        // The remaining values are read by position from the 'xyml_t2' cells, with spaces removed.
        var cells = detailRoot.SelectNodes("//td[@class='xyml_t2']").ToList();
        sincerityInfo.OrganizationCode = cells[0].InnerText.Replace(" ", "");
        sincerityInfo.LegalRepresentative = cells[1].InnerText.Replace(" ", "");
        sincerityInfo.WorkAddress = cells[2].InnerText.Replace(" ", "");
        sincerityInfo.Court = cells[3].InnerText.Replace(" ", "");
        sincerityInfo.CaseNo = cells[4].InnerText.Replace(" ", "");
        sincerityInfo.ExecutiveBasis = cells[5].InnerText.Replace(" ", "");
        sincerityInfo.ExecutiveReason = cells[6].InnerText.Replace(" ", "");
        sincerityInfo.ExecutiveTime = cells[7].InnerText.Replace(" ", "");
        sincerityInfo.ExecutiveMoney = cells[8].InnerText.Replace(" ", "");
        sincerityInfo.NotExecutiveMoney = cells[9].InnerText.Replace(" ", "");
        sincerityInfo.ExposureTime = cells[10].InnerText.Replace(" ", "");

        var record = new Dictionary<string, string>
        {
            ["InfoName"] = sincerityInfo.InfoName,
            ["DataSources"] = sincerityInfo.DataSources,
            ["UpdateTime"] = sincerityInfo.UpdateTime,
            ["OrganizationCode"] = sincerityInfo.OrganizationCode,
            ["LegalRepresentative"] = sincerityInfo.LegalRepresentative,
            ["WorkAddress"] = sincerityInfo.WorkAddress,
            ["Court"] = sincerityInfo.Court,
            ["CaseNo"] = sincerityInfo.CaseNo,
            ["ExecutiveBasis"] = sincerityInfo.ExecutiveBasis,
            ["ExecutiveReason"] = sincerityInfo.ExecutiveReason,
            ["ExecutiveTime"] = sincerityInfo.ExecutiveTime,
            ["ExecutiveMoney"] = sincerityInfo.ExecutiveMoney,
            ["NotExecutiveMoney"] = sincerityInfo.NotExecutiveMoney,
            ["ExposureTime"] = sincerityInfo.ExposureTime,
            ["PageUrl"] = url,
            ["CId"] = cId,
            ["ThreadId"] = Thread.CurrentThread.ManagedThreadId.ToString()
        };
        records.Add(record);

        // Echo the record to the console.
        foreach (var field in record)
        {
            Console.WriteLine($"{field.Key}:{field.Value}");
        }
    }

    return records;
}