/// <summary>
/// Returns the province list used by the crawler.
/// The list is read from the cache file at <c>ToolFile.WebCityPath</c>; when that cache is missing
/// or empty it is downloaded from the region index endpoint, sorted by <c>RegionName</c> and
/// written back to the cache.
/// </summary>
/// <returns>
/// The cached or freshly downloaded provinces. An empty list (never <c>null</c>) is returned, and
/// nothing is cached, when the download fails or the payload does not contain the expected
/// <c>json.category.provinces</c> array.
/// </returns>
protected List<ProvInfo> GetCity()
{
    List<ProvInfo> citys = ToolFile.Deserialize<ProvInfo>(ToolFile.WebCityPath);
    if (citys != null && citys.Count > 0)
    {
        return citys;
    }

    citys = new List<ProvInfo>();
    string url = "http://jzsc.mohurd.gov.cn/asite/region/index";
    string html = string.Empty;
    try
    {
        html = ToolWeb.GetHtmlByUrl(url);
    }
    catch
    {
        // Best effort: a failed download is handled below as "no data" instead of
        // letting the JSON navigation fail with a NullReferenceException.
    }

    object[] provinces = null;
    if (!string.IsNullOrWhiteSpace(html))
    {
        try
        {
            JavaScriptSerializer serializer = new JavaScriptSerializer();
            Dictionary<string, object> root = serializer.DeserializeObject(html) as Dictionary<string, object>;
            object node = null;

            // Walk json -> category -> provinces, tolerating a missing or mistyped level.
            Dictionary<string, object> json = (root != null && root.TryGetValue("json", out node))
                ? node as Dictionary<string, object>
                : null;
            Dictionary<string, object> category = (json != null && json.TryGetValue("category", out node))
                ? node as Dictionary<string, object>
                : null;
            if (category != null && category.TryGetValue("provinces", out node))
            {
                provinces = node as object[];
            }
        }
        catch (ArgumentException)
        {
            // The response was not valid JSON (for example an HTML error page).
        }
        catch (InvalidOperationException)
        {
            // The serializer rejected the payload.
        }
    }

    if (provinces == null)
    {
        return citys;
    }

    foreach (object item in provinces)
    {
        Dictionary<string, object> dic = item as Dictionary<string, object>;
        if (dic == null)
        {
            continue;
        }

        object regionId;
        object regionName;
        object regionFullName;
        if (!dic.TryGetValue("region_id", out regionId)
            || !dic.TryGetValue("region_name", out regionName)
            || !dic.TryGetValue("region_fullname", out regionFullName))
        {
            // Skip entries that lack one of the three expected keys.
            continue;
        }

        ProvInfo info = new ProvInfo();
        info.RegionId = Convert.ToString(regionId);
        info.RegionName = Convert.ToString(regionName);
        info.RegionFullName = Convert.ToString(regionFullName);
        citys.Add(info);
    }

    if (citys.Count == 0)
    {
        // An empty cache is treated as "missing" on the next call anyway, so do not write it.
        return citys;
    }

    citys = citys.OrderBy(x => x.RegionName).ToList();
    ToolFile.Serialize<ProvInfo>(citys, ToolFile.WebCityPath);
    return citys;
}
/// <summary>
/// Extracts every anchor from <paramref name="htmltxt"/>, builds a <c>BaseAttach</c> for each
/// link through <c>ToolHtml.GetBaseAttach</c> and saves the resulting entities.
/// </summary>
/// <param name="info">Project the attachments belong to; its <c>PrjName</c> is the fallback
/// attachment name and its <c>Id</c> is the source id when <paramref name="isUpdate"/> is false.</param>
/// <param name="htmltxt">HTML fragment to scan. Nothing happens unless it contains "http".</param>
/// <param name="result">Source id used when <paramref name="isUpdate"/> is true.</param>
/// <param name="isUpdate">When true, existing <c>BaseAttach</c> rows and files for
/// <paramref name="result"/> are deleted before the new ones are saved.</param>
private void SaveAttach(BidProject info, string htmltxt, string result, bool isUpdate)
{
    List<BaseAttach> list = new List<BaseAttach>();

    // A null fragment used to throw here; treat it like "no links".
    if (!string.IsNullOrEmpty(htmltxt) && htmltxt.Contains("http"))
    {
        Parser parser = new Parser(new Lexer(htmltxt));
        NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
        int anchorCount = aNode == null ? 0 : aNode.Count;

        for (int j = 0; j < anchorCount; j++)
        {
            ATag aTag = aNode[j].GetATag();
            if (aTag == null)
            {
                // GetATag() can yield null; skip instead of failing the whole save.
                continue;
            }

            string attachName = aTag.LinkText;
            if (string.IsNullOrWhiteSpace(attachName))
            {
                attachName = info.PrjName;
            }

            // Strip escaped quotes (\") from the link before decoding it.
            string aurl = aTag.Link.GetReplace("\\\"", "");

            try
            {
                string url = System.Web.HttpUtility.UrlDecode(aurl);
                BaseAttach entity = null;
                if (isUpdate)
                {
                    entity = ToolHtml.GetBaseAttach(url, attachName, result, "SiteManage\\Files\\Attach\\");
                }
                else
                {
                    entity = ToolHtml.GetBaseAttach(url, attachName, info.Id, "SiteManage\\Files\\Attach\\");
                }

                if (entity != null)
                {
                    list.Add(entity);
                }
            }
            catch
            {
                // Best effort: one attachment that cannot be fetched must not block the others.
            }
        }
    }

    if (list.Count == 0)
    {
        return;
    }

    if (isUpdate)
    {
        // Replace, not append: drop the stored files and rows of the existing record first.
        string delSql = string.Format("delete from BaseAttach where SourceID='{0}'", result);
        ToolFile.Delete(result);
        ToolDb.ExecuteSql(delSql);
    }

    foreach (BaseAttach attach in list)
    {
        ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
    }
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, looks for the table with id
/// <c>ctl00_ContentPlaceHolder1_GridView1</c> and saves the attachment links found in it as
/// <c>BaseAttach</c> entities for <paramref name="sourceId"/>.
/// </summary>
/// <param name="url">Page that lists the attachments.</param>
/// <param name="sourceId">Id the attachments are stored under. Existing files and rows for this
/// id are deleted first, but only when at least one new attachment was collected.</param>
protected void SaveAttach(string url, string sourceId)
{
    string htmlAnnex = string.Empty;
    try
    {
        htmlAnnex = this.ToolWebSite.GetHtmlByUrl(url, Encoding.UTF8);
    }
    catch
    {
        // Best effort: a failed download simply means there is nothing to save.
    }

    if (string.IsNullOrEmpty(htmlAnnex))
    {
        return;
    }

    Parser dtparser = new Parser(new Lexer(htmlAnnex));
    NodeList dtList = dtparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_GridView1"), new TagNameFilter("table")));
    if (dtList == null || dtList.Count == 0)
    {
        return;
    }

    TableTag dttable = dtList[0] as TableTag;
    if (dttable == null)
    {
        return;
    }

    // Collect the anchors once instead of re-scanning the table for every row.
    NodeList links = dttable.SearchFor(typeof(ATag), true);
    if (links == null)
    {
        return;
    }

    // The original paired data row t (t >= 1) with link t - 1; never read past the links found.
    int linkCount = Math.Min(links.Count, dttable.RowCount - 1);
    List<BaseAttach> attach = new List<BaseAttach>();
    for (int k = 0; k < linkCount; k++)
    {
        ATag file = links[k] as ATag;
        if (file == null || !file.IsAtagAttach())
        {
            continue;
        }

        // Links are relative; strip "../" and "./" and anchor them at the site root.
        string aurl = "http://www.szjsjy.com.cn/" + file.Link.Replace("../", "").Replace("./", "");
        try
        {
            BaseAttach entity = ToolHtml.GetBaseAttach(aurl, file.LinkText, sourceId, "SiteManage\\Files\\Attach\\");
            if (entity != null)
            {
                attach.Add(entity);
            }
        }
        catch
        {
            // Best effort: skip an attachment that cannot be fetched.
        }
    }

    if (attach.Count == 0)
    {
        return;
    }

    string delSql = string.Format("delete from BaseAttach where SourceID='{0}'", sourceId);
    ToolFile.Delete(sourceId);
    ToolDb.ExecuteSql(delSql);
    ToolDb.SaveDatas(attach, string.Empty);
}
/// <summary>
/// Returns the qualification list used by the crawler.
/// The list is read from the cache file at <c>ToolFile.WebQualPath</c>; when that cache is missing
/// or empty every page of the qualification listing is downloaded, each row's hidden input JSON
/// is read for <c>apt_code</c> / <c>apt_scope</c>, and the result is sorted by <c>QualCode</c>
/// and written back to the cache.
/// </summary>
/// <returns>The cached or freshly downloaded qualifications (possibly empty, never <c>null</c>).</returns>
protected List<QualInfo> GetQual()
{
    const int pageSize = 10;

    List<QualInfo> quals = ToolFile.Deserialize<QualInfo>(ToolFile.WebQualPath);
    if (quals != null && quals.Count > 0)
    {
        return quals;
    }

    quals = new List<QualInfo>();
    int pageInt = 1;
    int totalPage = 0;
    string url = "http://jzsc.mohurd.gov.cn/asite/qualapt/aptData?apt_type=";
    string html = string.Empty;
    try
    {
        html = ToolWeb.GetHtmlByUrl(url);
    }
    catch (Exception ex)
    {
        // Keep going with an empty page: the loop below then finds no rows.
        Logger.Error(ex.ToString());
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "clearfix")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            // NOTE(review): this Replace is a no-op as written; the first argument may originally
            // have been a full-width comma - confirm against the upstream file.
            string temp = pageNode.AsString().Replace(",", ",");
            string page = temp.GetRegexBegEnd("total", ",").GetReplace("\":");
            totalPage = int.Parse(page);

            // Ceiling division; the previous "total / 10 + 1" asked for one page too many
            // whenever the total was an exact multiple of the page size.
            pageInt = Math.Max(1, (totalPage + pageSize - 1) / pageSize);
        }
        catch
        {
            // Pager missing or unparsable: fall back to the single page already downloaded.
        }
    }

    JavaScriptSerializer serializer = new JavaScriptSerializer();
    for (int i = 1; i <= pageInt; i++)
    {
        bool pageLoaded = true;
        if (i > 1)
        {
            NameValueCollection nvc = ToolWeb.GetNameValueCollection(new string[] { "$total", "$reload", "$pg", "$pgsz" }, new string[] { totalPage.ToString(), "0", i.ToString(), pageSize.ToString() });
            try
            {
                html = ToolWeb.GetHtmlByUrl(url, nvc, Encoding.UTF8);
            }
            catch (Exception ex)
            {
                // Do not re-parse the previous page's HTML: that would add its rows twice.
                pageLoaded = false;
                Logger.Error(i);
                Logger.Error(ex.ToString());
            }
        }

        if (pageLoaded)
        {
            parser = new Parser(new Lexer(html));
            NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table_box")));
            TableTag table = (listNode != null && listNode.Count > 0) ? listNode[0] as TableTag : null;

            // The last row of the table is not read (same bound as before).
            int rowLimit = table == null ? 0 : table.RowCount - 1;
            for (int j = 0; j < rowLimit; j++)
            {
                TableRow tr = table.Rows[j];
                try
                {
                    Parser rowParser = new Parser(new Lexer(tr.ToHtml()));
                    NodeList input = rowParser.ExtractAllNodesThatMatch(new TagNameFilter("input"));
                    InputTag tag = input[0] as InputTag;

                    // The row data is carried as JSON in the input's value attribute.
                    string json = tag.GetAttribute("value");
                    Dictionary<string, object> smsTypeJson = (Dictionary<string, object>)serializer.DeserializeObject(json);

                    QualInfo info = new QualInfo();
                    info.QualCode = Convert.ToString(smsTypeJson["apt_code"]);
                    info.QualName = Convert.ToString(smsTypeJson["apt_scope"]);
                    quals.Add(info);
                }
                catch (Exception ex)
                {
                    // Rows without a usable input end up here; log the page, the row and the cause.
                    Logger.Error(i);
                    Logger.Error(tr.ToHtml());
                    Logger.Error(ex.ToString());
                }
            }
        }

        // Pause between pages.
        Thread.Sleep(1000 * 1);
    }

    quals = quals.OrderBy(x => x.QualCode).ToList();
    ToolFile.Serialize<QualInfo>(quals, ToolFile.WebQualPath);
    return quals;
}
/// <summary>
/// Crawls enterprise records for the target province ("广东") from the site at <c>SiteUrl</c>:
/// for every qualification returned by <c>GetQual()</c> it pages through the search result,
/// opens each enterprise's detail page, stores the <c>CorpInfo</c> row (replacing an existing one
/// with the same name/province/city/source) and then loads the detail tabs through the
/// <c>AddCorp*</c> methods. Long sleeps between requests pace the crawl.
/// </summary>
/// <param name="crawlAll">Not read by this implementation.</param>
/// <returns>Always <c>null</c>; everything is persisted as a side effect.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string targetProvince = "广东";
    const int pageSize = 15;

    // Move the target province to the front so the province loop below reaches it first.
    List<ProvInfo> citys = GetCity();
    int targetIndex = citys.FindIndex(prov => prov.RegionName == targetProvince);
    if (targetIndex > 0)
    {
        ProvInfo target = citys[targetIndex];
        citys.RemoveAt(targetIndex);
        citys.Insert(0, target);
    }

    List<string> savedQualNames = this.SaveQuals();
    List<QualInfo> quals = GetQual().OrderByDescending(x => x.QualName).ToList();

    // Checkpoint file: one entry is appended each time a result page trips IsVailCode.
    string path = Path.Combine(System.Environment.CurrentDirectory, "ProvQual.xml");
    List<ProvQual> provQual = ToolFile.Deserialize<ProvQual>(path);
    ProvQual tempQual = null;
    if (provQual != null && provQual.Count > 1)
    {
        // NOTE(review): "> 1" ignores (and discards) a checkpoint file holding exactly one
        // entry; kept as in the original - confirm whether "> 0" was intended.
        tempQual = provQual[0];
    }
    else
    {
        provQual = new List<ProvQual>();
    }

    bool resumePending = true;
    int count = 1, totalCount = 1;

    foreach (ProvInfo city in citys)
    {
        // Only the target province is crawled; it sits at index 0, so the first other
        // province ends the loop.
        if (city.RegionName != targetProvince)
        {
            break;
        }

        if (tempQual != null && resumePending)
        {
            if (tempQual.RegionId != city.RegionId && tempQual.RegionName != city.RegionName)
            {
                continue;
            }

            resumePending = false;
        }

        int emptyResults = 0;
        foreach (QualInfo qual in quals)
        {
            // Skip qualifications whose base name is in the list returned by SaveQuals().
            if (savedQualNames.Contains(StripQualGrade(qual.QualName)))
            {
                continue;
            }

            NameValueCollection nvc = ToolWeb.GetNameValueCollection(new string[] { "qy_type", "apt_scope", "apt_code", "qy_name", "qy_fr_name", "apt_certno", "qy_reg_addr", "qy_region" }, new string[] { "", qual.QualName, qual.QualCode, "", "", "", city.RegionName, city.RegionId });
            string html;
            try
            {
                html = ToolWeb.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8);
            }
            catch
            {
                // A failed first-page request aborts the whole crawl (unchanged behaviour).
                return null;
            }

            Parser parser = new Parser(new Lexer(html));
            NodeList tempNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("class", "nodata")));
            if (tempNode != null && tempNode.Count > 0
                && tempNode[0].ToNodePlainString().Contains("暂未查询到已登记入库信息"))
            {
                // This throttle used to sit after the "continue" and was unreachable:
                // pause for 30 seconds after more than five empty searches.
                emptyResults++;
                if (emptyResults > 5)
                {
                    Thread.Sleep(30 * 1000);
                    emptyResults = 0;
                }

                continue;
            }

            // Reset per qualification so a missing or unparsable pager cannot inherit the
            // page count of the previous qualification.
            int totalPage = 0;
            int pageInt = 1;
            parser.Reset();
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "clearfix")));
            if (pageNode != null && pageNode.Count > 0)
            {
                try
                {
                    // NOTE(review): this Replace is a no-op as written; the first argument may
                    // originally have been a full-width comma - confirm against the upstream file.
                    string temp = pageNode.AsString().Replace(",", ",");
                    string page = temp.GetRegexBegEnd("total", ",").GetReplace("\":");
                    totalPage = int.Parse(page);

                    // Ceiling division with a minimum of one page; same result as the
                    // original three-way branch.
                    pageInt = Math.Max(1, (totalPage + pageSize - 1) / pageSize);
                }
                catch
                {
                    // Pager unparsable: only the page already downloaded is processed.
                }
            }

            for (int p = 1; p <= pageInt; p++)
            {
                if (p > 1)
                {
                    Logger.Error(p);
                    Logger.Error(city.RegionName);
                    Logger.Error(qual.QualName);

                    NameValueCollection pageQuery = ToolWeb.GetNameValueCollection(new string[] { "apt_code", "qy_region", "qy_fr_name", "$total", "qy_reg_addr", "$reload", "qy_type", "qy_name", "$pg", "$pgsz", "apt_scope", "apt_certno" }, new string[] { qual.QualCode, city.RegionId, "", totalPage.ToString(), city.RegionName, "0", "", "", p.ToString(), pageSize.ToString(), qual.QualName, "" });
                    string pageHtml;
                    if (!FetchWithRetry(() => ToolWeb.GetHtmlByUrl(this.SiteUrl, pageQuery, Encoding.UTF8), out pageHtml))
                    {
                        continue;
                    }

                    html = pageHtml;
                }

                if (IsVailCode(html))
                {
                    // Record where the crawl was stopped and give up on this qualification.
                    ProvQual pro = new ProvQual();
                    pro.RegionFullName = city.RegionFullName;
                    pro.QualName = qual.QualName;
                    pro.QualCode = qual.QualCode;
                    pro.RegionId = city.RegionId;
                    pro.RegionName = city.RegionName;
                    pro.PageIndex = p;
                    provQual.Add(pro);
                    ToolFile.Serialize<ProvQual>(provQual, path);
                    break;
                }

                parser = new Parser(new Lexer(html));
                NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table_box responsive personal")));
                TableTag table = (listNode != null && listNode.Count > 0) ? listNode[0] as TableTag : null;

                // Row 0 and the last row are not read (same bounds as before).
                int rowLimit = table == null ? 0 : table.RowCount - 1;
                for (int i = 1; i < rowLimit; i++)
                {
                    TableRow tr = table.Rows[i];
                    if (tr.ColumnCount <= 1)
                    {
                        break;
                    }

                    if (tr.ColumnCount < 5)
                    {
                        // Columns 1-4 are read below; skip rows that are too short.
                        continue;
                    }

                    string corpCode = tr.Columns[1].ToNodePlainString();
                    string corpName = tr.Columns[2].ToNodePlainString();
                    string linkMan = tr.Columns[3].ToNodePlainString();
                    string corpProv = tr.Columns[4].ToNodePlainString();

                    ATag aTag = tr.Columns[2].GetATag();
                    if (aTag == null)
                    {
                        continue;
                    }

                    string cUrl = "http://jzsc.mohurd.gov.cn" + aTag.Link;
                    string htmldtl;
                    if (!FetchWithRetry(() => ToolWeb.GetHtmlByUrl(cUrl).GetJsString(), out htmldtl))
                    {
                        continue;
                    }

                    if (IsVailCode(htmldtl))
                    {
                        count++;
                        totalCount++;
                        Logger.Error(p);
                        continue;
                    }

                    string businessType;
                    string corpAddress;
                    ReadCorpDetail(htmldtl, out businessType, out corpAddress);

                    // Fields this site does not provide are passed as empty strings.
                    string regDate = string.Empty, regFund = string.Empty, linkPhone = string.Empty,
                           fax = string.Empty, email = string.Empty, corpSite = string.Empty,
                           isoQualNum = string.Empty, isoEnvironNum = string.Empty, offAdr = string.Empty;

                    // The enterprise code doubles as the business code; the province column
                    // is used for both province and city.
                    CorpInfo info = ToolDb.GenCorpInfo(corpName, corpCode, corpAddress, regDate, regFund, corpCode, businessType, linkMan, linkPhone, fax, email, corpSite, "", corpProv, corpProv, "中华人民共和国住房和城乡建设部建筑市场监管司", cUrl, isoQualNum, isoEnvironNum, offAdr);

                    if (ReplaceStoredCorp(info))
                    {
                        CrawlCorpDetailTabs(info, htmldtl);
                    }

                    // Pacing: short pause per record, longer pauses every 20 and 100 records.
                    Thread.Sleep(1000 * 2);
                    count++;
                    totalCount++;
                    if (count >= 20)
                    {
                        count = 0;
                        Thread.Sleep(1000 * 60 * 2);
                    }

                    if (totalCount >= 100)
                    {
                        totalCount = 0;
                        Thread.Sleep(1000 * 60 * 10);
                    }
                }

                // Pause between result pages.
                Thread.Sleep(1000 * 60 * 3);
            }

            // Pause between qualifications.
            Thread.Sleep(1000 * 60 * 5);
        }
    }

    return null;
}

/// <summary>
/// Cuts a qualification name at "不分" or "暂定级" when present; otherwise drops its last two
/// characters (names of two characters or fewer are returned unchanged).
/// </summary>
private static string StripQualGrade(string qualName)
{
    int cut = qualName.IndexOf("不分", StringComparison.Ordinal);
    if (cut < 0)
    {
        cut = qualName.IndexOf("暂定级", StringComparison.Ordinal);
    }

    if (cut >= 0)
    {
        return qualName.Remove(cut);
    }

    return qualName.Length > 2 ? qualName.Remove(qualName.Length - 2, 2) : qualName;
}

/// <summary>
/// Runs <paramref name="fetch"/> up to three times, waiting one minute before each retry.
/// Returns false (and an empty <paramref name="content"/>) when every attempt threw.
/// </summary>
private bool FetchWithRetry(Func<string> fetch, out string content)
{
    const int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++)
    {
        try
        {
            if (attempt > 1)
            {
                Thread.Sleep(60 * 1000 * 1);
            }

            content = fetch();
            return true;
        }
        catch (Exception ex)
        {
            Logger.Error(ex.ToString());
        }
    }

    content = string.Empty;
    return false;
}

/// <summary>
/// Reads the registration type and business address from the detail page's
/// "pro_table_box datas_table" table. Both outputs stay empty when the table is absent.
/// </summary>
private static void ReadCorpDetail(string htmldtl, out string businessType, out string corpAddress)
{
    businessType = string.Empty;
    corpAddress = string.Empty;

    // The page is lower-cased and every "th" is rewritten to "td" before parsing, so header
    // cells are read as ordinary columns.
    Parser parser = new Parser(new Lexer(htmldtl.ToLower().GetReplace("th", "td")));
    NodeList tableNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "pro_table_box datas_table")));
    TableTag tableInfo = (tableNode != null && tableNode.Count > 0) ? tableNode[0] as TableTag : null;
    if (tableInfo == null)
    {
        return;
    }

    // Flatten the table to "label:value" lines: even columns are labels, odd columns values.
    StringBuilder ctx = new StringBuilder();
    for (int j = 0; j < tableInfo.RowCount; j++)
    {
        for (int c = 0; c < tableInfo.Rows[j].ColumnCount; c++)
        {
            ctx.Append(tableInfo.Rows[j].Columns[c].ToNodePlainString());
            ctx.Append(c % 2 == 0 ? ":" : "\r\n");
        }
    }

    string text = ctx.ToString();
    businessType = text.GetRegex("企业登记注册类型");
    corpAddress = text.GetRegex("企业经营地址");
}

/// <summary>
/// Saves <paramref name="info"/>. When a row with the same name, province, city and source
/// already exists, that row and its dependent rows are deleted first.
/// </summary>
/// <returns>The result of <c>ToolDb.SaveEntity</c>.</returns>
private bool ReplaceStoredCorp(CorpInfo info)
{
    // The values come from scraped pages, so quote characters are escaped before they are
    // placed inside SQL string literals.
    string sql = string.Format("select Id from CorpInfo where CorpName='{0}' and Province='{1}' and City='{2}' and InfoSource='{3}'", EscapeSqlLiteral(info.CorpName), EscapeSqlLiteral(info.Province), EscapeSqlLiteral(info.City), EscapeSqlLiteral(info.InfoSource));
    string resultId = Convert.ToString(ToolDb.ExecuteScalar(sql));

    int delResult = 0;
    if (!string.IsNullOrEmpty(resultId))
    {
        // Dependent tables first, the enterprise row last.
        ToolDb.ExecuteSql(string.Format("delete from CorpQual where CorpId='{0}'", resultId));
        ToolDb.ExecuteSql(string.Format("delete from CorpResults where CorpId='{0}'", resultId));
        ToolDb.ExecuteSql(string.Format("delete from CorpTecStaff where CorpId='{0}'", resultId));
        ToolDb.ExecuteSql(string.Format("delete from CorpPrompt where CorpId='{0}'", resultId));
        delResult = ToolDb.ExecuteSql(string.Format("delete from CorpInfo where Id='{0}'", resultId));
    }

    // After a successful delete the row is saved without compare fields; otherwise the
    // crawler's ExistCompareFields are passed.
    return delResult >= 1
        ? ToolDb.SaveEntity(info, "")
        : ToolDb.SaveEntity(info, this.ExistCompareFields);
}

/// <summary>Doubles single quotes so the value can be embedded in a SQL string literal.</summary>
private static string EscapeSqlLiteral(object value)
{
    return Convert.ToString(value).Replace("'", "''");
}

/// <summary>
/// Visits the tab links inside the detail page's "query_info_tab" div and hands each tab's
/// <c>data-url</c> to the matching <c>AddCorp*</c> method, pausing one second after each tab.
/// </summary>
private void CrawlCorpDetailTabs(CorpInfo info, string htmldtl)
{
    Parser parser = new Parser(new Lexer(htmldtl));
    NodeList aNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "query_info_tab")), true), new TagNameFilter("a")));
    int tabCount = aNodes == null ? 0 : aNodes.Count;

    for (int a = 0; a < tabCount; a++)
    {
        ATag aInfo = aNodes[a] as ATag;
        if (aInfo == null)
        {
            continue;
        }

        string url = "http://jzsc.mohurd.gov.cn" + aInfo.GetAttribute("data-url");
        string tabTitle = aInfo.LinkText;
        if (tabTitle.Contains("资质"))
        {
            AddCorpQual(info, url);
        }
        else if (tabTitle.Contains("注册人员"))
        {
            this.AddCorpTecStaff(info, url);
        }
        else if (tabTitle.Contains("工程项目"))
        {
            this.AddCorpResults(info, url);
        }
        else if (tabTitle.Contains("不良行为"))
        {
            // NOTE(review): "不良行为" goes to AddCorpPromptGood and "良好行为" to AddCorpPrompt,
            // exactly as before; the method names look swapped - confirm.
            this.AddCorpPromptGood(info, url);
        }
        else if (tabTitle.Contains("良好行为"))
        {
            this.AddCorpPrompt(info, url);
        }

        Thread.Sleep(1000 * 1);
    }
}
/// <summary>
/// Collects the attachments described by <paramref name="htmltxt"/> and saves them as
/// <c>BaseAttach</c> entities. Text containing "http" is parsed as HTML and every anchor is
/// treated as an attachment link; any other text is converted with
/// <c>ToolHtml.JsonToDataTable</c> and read row by row (<c>attachName</c>, <c>attachGuid</c>).
/// </summary>
/// <param name="info">Project the attachments belong to; its <c>PrjName</c> is the fallback
/// attachment name and its <c>Id</c> is the source id when <paramref name="isUpdate"/> is false.</param>
/// <param name="htmltxt">HTML fragment or JSON text describing the attachments.</param>
/// <param name="result">Source id used when <paramref name="isUpdate"/> is true.</param>
/// <param name="isUpdate">When true, existing <c>BaseAttach</c> rows and files for
/// <paramref name="result"/> are deleted before the new ones are saved.</param>
private void SaveAttach(BidProject info, string htmltxt, string result, bool isUpdate)
{
    // A null fragment used to throw; null or empty text has no attachments.
    if (string.IsNullOrEmpty(htmltxt))
    {
        return;
    }

    const string siteRoot = "https://www.szjsjy.com.cn:8001/";
    List<BaseAttach> list = new List<BaseAttach>();

    if (htmltxt.Contains("http"))
    {
        Parser parser = new Parser(new Lexer(htmltxt));
        NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
        int anchorCount = aNode == null ? 0 : aNode.Count;

        for (int j = 0; j < anchorCount; j++)
        {
            ATag aTag = aNode[j].GetATag();
            if (aTag == null)
            {
                // GetATag() can yield null; skip instead of failing the whole save.
                continue;
            }

            string attachName = aTag.LinkText;
            if (string.IsNullOrWhiteSpace(attachName))
            {
                attachName = info.PrjName;
            }

            // Backslashes are stripped; links without "http" are anchored at the site root.
            string cleanedLink = aTag.Link.GetReplace("\\");
            string aurl = aTag.Link.ToLower().Contains("http") ? cleanedLink : siteRoot + cleanedLink;

            try
            {
                string[] urls = System.Web.HttpUtility.UrlDecode(aurl).Split('&');
                if (urls.Length < 3)
                {
                    // The rebuild below needs three '&'-separated parts. Such links were
                    // already skipped before, via a swallowed IndexOutOfRangeException.
                    continue;
                }

                // Keep the first three parts with the second and third swapped.
                string url = (urls[0] + "&" + urls[2] + "&" + urls[1]).Replace("\"", "");

                BaseAttach entity = null;
                if (isUpdate)
                {
                    entity = ToolHtml.GetBaseAttach(url, attachName, result, "SiteManage\\Files\\Attach\\");
                }
                else
                {
                    entity = ToolHtml.GetBaseAttach(url, attachName, info.Id, "SiteManage\\Files\\Attach\\");
                }

                if (entity != null)
                {
                    list.Add(entity);
                }
            }
            catch
            {
                // Best effort: one attachment that cannot be fetched must not block the others.
            }
        }
    }
    else
    {
        System.Data.DataTable dtlDtl = ToolHtml.JsonToDataTable(htmltxt);
        int rowCount = dtlDtl == null ? 0 : dtlDtl.Rows.Count;

        for (int i = 0; i < rowCount; i++)
        {
            System.Data.DataRow row = dtlDtl.Rows[i];
            string attachName = Convert.ToString(row["attachName"]);
            if (string.IsNullOrWhiteSpace(attachName))
            {
                attachName = info.PrjName;
            }

            string attachGuid = Convert.ToString(row["attachGuid"]);
            string url = "https://www.szjsjy.com.cn:8001/file/downloadFile?fileId=" + attachGuid;

            try
            {
                BaseAttach entity = null;
                if (isUpdate)
                {
                    entity = ToolHtml.GetBaseAttachByUrl(url, attachName, result, "SiteManage\\Files\\Attach\\");
                }
                else
                {
                    entity = ToolHtml.GetBaseAttachByUrl(url, attachName, info.Id, "SiteManage\\Files\\Attach\\");
                }

                if (entity != null)
                {
                    list.Add(entity);
                }
            }
            catch
            {
                // Best effort: skip an attachment that cannot be fetched.
            }
        }
    }

    if (list.Count == 0)
    {
        return;
    }

    if (isUpdate)
    {
        // Replace, not append: drop the stored files and rows of the existing record first.
        string delSql = string.Format("delete from BaseAttach where SourceID='{0}'", result);
        ToolFile.Delete(result);
        ToolDb.ExecuteSql(delSql);
    }

    foreach (BaseAttach attach in list)
    {
        ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
    }
}