/// <summary>
/// Crawls "控制价公示" notices. The list is requested from <c>SiteUrl + MaxCount</c> and
/// parsed as JSON; for every entry the detail HTML is fetched, a <c>NoticeInfo</c> is
/// generated, and attachment links found in the detail HTML are queued in
/// <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops once <c>MaxCount</c> notices have been collected.</param>
/// <returns>
/// The collected notices; <c>null</c> when the list request itself fails; an empty list
/// when the response does not contain a JSON object.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string SiteRoot = "https://www.szjsjy.com.cn:8001/";
    const string DetailEndpoint = "https://www.szjsjy.com.cn:8001/jyw-lg/jyxx/queryOldOTDataDetail.do";

    IList list = new List<NoticeInfo>();
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + this.MaxCount);
    }
    catch
    {
        return null;
    }

    if (string.IsNullOrEmpty(html))
    {
        return list;
    }

    // Keep only the outermost {...} of the response before deserializing it.
    int startIndex = html.IndexOf('{');
    int endIndex = html.LastIndexOf('}');
    if (startIndex < 0 || endIndex < startIndex)
    {
        return list;
    }
    html = html.Substring(startIndex, (endIndex + 1) - startIndex);

    JavaScriptSerializer serializer = new JavaScriptSerializer();
    Dictionary<string, object> smsTypeJson = (Dictionary<string, object>)serializer.DeserializeObject(html);
    foreach (KeyValuePair<string, object> obj in smsTypeJson)
    {
        if (obj.Key == "total")
        {
            continue;
        }

        // Only array-valued entries carry notice rows; skip anything else instead of failing the cast.
        object[] array = obj.Value as object[];
        if (array == null)
        {
            continue;
        }

        foreach (object arrValue in array)
        {
            Dictionary<string, object> dic = arrValue as Dictionary<string, object>;
            if (dic == null)
            {
                continue;
            }

            string InfoTitle = Convert.ToString(dic["ggName"]);
            string prjCode = Convert.ToString(dic["bdBH"]);
            string InfoType = "控制价公示";
            string PublistTime = Convert.ToString(dic["fbStartTime2"]);
            string prjType = Convert.ToString(dic["gcLeiXing2"]);
            string InfoUrl = Convert.ToString(dic["detailUrl"]);
            string bgType = string.Empty;
            string htmlTxt;

            try
            {
                // The detail endpoint takes the same query string as the public detail URL.
                Uri uri = new Uri(InfoUrl);
                htmlTxt = this.ToolWebSite.GetHtmlByUrl(DetailEndpoint + uri.Query);
                htmlTxt = htmlTxt.GetReplace("\"");
            }
            catch
            {
                // Skip entries whose detail page cannot be fetched.
                continue;
            }

            string InfoCtx = htmlTxt.GetReplace("<br />", "\r\n").GetReplace("</tr>", "\r\n").ToCtxString();
            string buildUnit = InfoCtx.GetBuildRegex();
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = InfoCtx.GetRegex("标底审核单位");
            }
            string infoSource = "深圳市建设工程交易服务中心龙岗分中心";

            NoticeInfo info = ToolDb.GenNoticeInfo("广东省", "深圳龙岗区工程", "龙岗区", string.Empty, InfoTitle, InfoType, InfoCtx, PublistTime, string.Empty,
                infoSource, InfoUrl, prjCode, buildUnit, string.Empty, string.Empty, prjType, bgType, htmlTxt);
            list.Add(info);

            Parser parser = new Parser(new Lexer(htmlTxt));
            NodeList fileNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (fileNode != null && fileNode.Count > 0)
            {
                for (int f = 0; f < fileNode.Count; f++)
                {
                    ATag tag = fileNode[f] as ATag;
                    if (tag == null)
                    {
                        continue;
                    }
                    if (!(tag.IsAtagAttach() || tag.Link.ToLower().Contains("downloadfile")))
                    {
                        continue;
                    }

                    try
                    {
                        string link;
                        if (tag.Link.ToLower().Contains("http"))
                        {
                            link = tag.Link;
                            if (link.EndsWith("//"))
                            {
                                link = link.Remove(link.LastIndexOf("//"));
                            }
                            // Strip backslashes and double quotes left in the URL.
                            link = link.GetReplace("\\", "");
                            link = link.GetReplace("\"", "");
                        }
                        else
                        {
                            link = SiteRoot + tag.Link;
                        }
                        BaseAttach attach = ToolDb.GenBaseAttach(tag.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                    catch
                    {
                        // Best effort: a broken attachment link must not abort the crawl.
                        continue;
                    }
                }
            }

            // Check the cap only after the attachments of the current notice have been queued,
            // otherwise the last returned notice would never get its attachments.
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls project approval announcements ("项目核准信息") from www.hbfgw.gov.cn. The first
/// list page is <c>SiteUrl</c>; the following pages are <c>index_{n}.shtml</c>. For each
/// row of the <c>mytable</c> table the detail page is fetched, an <c>ItemPlan</c> is
/// generated, and every image in the detail content is queued in <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops once <c>MaxCount</c> items have been collected.</param>
/// <returns>The collected items, or <c>null</c> when the first list page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string ListRoot = "http://www.hbfgw.gov.cn/hqfw/xmgg/xmkzgg/";

    IList list = new List<ItemPlan>();
    string html;
    int pageInt = 27; // hard-coded number of list pages to walk
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return null;
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(ListRoot + "index_" + (i - 1).ToString() + ".shtml");
            }
            catch
            {
                continue;
            }
        }

        Parser parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "mytable")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped (the loop starts at 1).
        for (int j = 1; j < table.RowCount; j++)
        {
            // Fields that this site does not provide stay empty.
            string ItemAddress = string.Empty, BuildUnit = string.Empty, BuildNature = string.Empty,
                   TotalInvest = string.Empty, PlanInvest = string.Empty, IssuedPlan = string.Empty,
                   InvestSource = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
                   MsgUnit = string.Empty, PlanBeginDate = string.Empty, PlanEndDate = string.Empty,
                   ItemContent = string.Empty;

            TableRow tr = table.Rows[j];
            // Cells 0..3 are read below; a shorter row would otherwise abort the whole crawl.
            if (tr == null || tr.ColumnCount < 4)
            {
                continue;
            }

            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                // No detail link in this row, nothing to crawl.
                continue;
            }

            string ItemCode = tr.Columns[0].ToNodePlainString().GetReplace("('无')").GetReplace("('", "kdxx").GetReplace("')", "xxdk").GetRegexBegEnd("kdxx", "xxdk");
            string ItemName = aTag.LinkText;
            string ApprovalUnit = tr.Columns[2].ToNodePlainString();
            string PlanDate = tr.Columns[3].ToPlainTextString().GetDateRegex();
            string InfoUrl = ListRoot + aTag.Link.GetReplace("../,./");

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "appendixDiv")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            // Prefer the detail page's <h1> text over the list link text when it is not empty.
            parser = new Parser(new Lexer(htmldtl));
            NodeList hNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("h1"));
            if (hNode != null && hNode.Count > 0)
            {
                string temp = hNode[0].ToNodePlainString();
                ItemName = string.IsNullOrEmpty(temp) ? ItemName : temp;
            }
            ItemName = ItemName.GetReplace("省发改委批复,省发改委核准");

            string CtxHtml = dtlNode.AsHtml().Replace("none", "block");
            string ItemCtx = CtxHtml.ToCtxString();

            // Image URLs are resolved against the directory of the detail page.
            string imgUrl = InfoUrl.Substring(0, InfoUrl.LastIndexOf("/"));
            List<string> attach = new List<string>();
            parser = new Parser(new Lexer(CtxHtml));
            NodeList imgNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
            if (imgNode != null && imgNode.Count > 0)
            {
                for (int p = 0; p < imgNode.Count; p++)
                {
                    ImageTag img = imgNode[p] as ImageTag;
                    if (img == null || string.IsNullOrEmpty(img.ImageURL))
                    {
                        continue;
                    }
                    string link = imgUrl + "/" + img.ImageURL.GetReplace("../,./");
                    CtxHtml = CtxHtml.GetReplace(img.ImageURL, link);
                    attach.Add(link);
                }
            }

            string PlanType = "项目核准信息";
            string MsgType = "湖北省发展和改革委员会";
            ItemPlan info = ToolDb.GenItemPlan("湖北省", "湖北省及地市", "", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature, TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate, PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
            list.Add(info);

            foreach (string attachUrl in attach)
            {
                BaseAttach entity = ToolDb.GenBaseAttach(ItemName, info.Id, attachUrl);
                base.AttachList.Add(entity);
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls invitation-to-bid notices from jyzx.maoming.gov.cn. The page count is read from
/// the first list page, subsequent pages are requested with <c>?Paging=n</c>, and for each
/// list row the detail page is fetched, an <c>InviteInfo</c> is generated and attachment
/// links found in the detail content are queued in <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops once <c>MaxCount</c> notices have been collected.</param>
/// <returns>The collected notices; an empty list when the first list page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string SiteRoot = "http://jyzx.maoming.gov.cn";

    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html;
    try
    {
        // NOTE(review): the replaced character is reproduced exactly as it appears in the
        // source; confirm whether it was meant to be a non-breaking space.
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8).Replace(" ", "");
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("noWrap", "true")));
    if (sNode != null && sNode.Count > 0)
    {
        // The pager text carries the total page count between "总页数:" and "当".
        int parsedPages;
        if (int.TryParse(sNode.AsString().GetRegexBegEnd("总页数:", "当"), out parsedPages))
        {
            pageInt = parsedPages;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "?Paging=" + i);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("valign", "top")));
        if (sNode == null || sNode.Count == 0)
        {
            continue;
        }

        TableTag table = sNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // The last row of the table is not processed (loop bound is RowCount - 1).
        for (int t = 0; t < table.RowCount - 1; t++)
        {
            TableRow tr = table.Rows[t];
            ATag aTag = tr.GetATag();
            if (aTag == null || tr.ColumnCount < 3)
            {
                // Rows without a link or without the date cell cannot be crawled.
                continue;
            }

            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;
            string prjName = aTag.GetAttribute("title");
            string beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string InfoUrl = SiteRoot + aTag.Link;

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
            if (dtlNode != null && dtlNode.Count > 0)
            {
                string HtmlTxt = dtlNode.AsHtml();
                string inviteCtx = HtmlTxt.GetReplace("</p>", "\r\n").ToCtxString();
                string buildUnit = inviteCtx.GetBuildRegex();
                string prjAddress = inviteCtx.GetAddressRegex();
                string code = inviteCtx.GetCodeRegex();
                string msgType = "茂名市公共资源交易网";
                string specType = "建设工程";
                string inviteType = prjName.GetInviteBidType();

                InviteInfo info = ToolDb.GenInviteInfo("广东省", "茂名市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);

                parser = new Parser(new Lexer(HtmlTxt));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int k = 0; k < aNode.Count; k++)
                    {
                        ATag a = aNode[k] as ATag;
                        if (a == null || !a.IsAtagAttach())
                        {
                            continue;
                        }

                        string link = a.Link.Contains("http") ? a.Link : SiteRoot + "/" + a.Link;
                        // Store the resolved absolute URL; passing a.Link here would store
                        // relative paths that cannot be downloaded later.
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
                list.Add(info);
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls "政策法规" notices from market.meizhou.gov.cn and saves them directly through
/// <c>ToolDb.SaveEntity</c>. When a notice is saved, the attachment links found on its
/// detail page are downloaded and saved as well.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops once <c>MaxCount</c> notices have been processed.</param>
/// <returns>Always <c>null</c>; results are persisted instead of being returned.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string host = "http://market.meizhou.gov.cn";
    int totalPages = 1;
    int processed = 0;
    string listHtml = string.Empty;

    // Get the page count from the first list page.
    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default).GetJsString();
    }
    catch (Exception)
    {
        return null;
    }

    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagerCells = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "28")));
    if (pagerCells != null && pagerCells.Count > 0)
    {
        try
        {
            totalPages = Convert.ToInt32(pagerCells.AsString().GetRegexBegEnd(",共", "页"));
        }
        catch
        {
            totalPages = 1;
        }
    }

    for (int page = 1; page <= totalPages; page++)
    {
        if (page > 1)
        {
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&pageNum=" + page.ToString(), Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        NodeList tables = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "95%")));
        if (tables == null || tables.Count <= 1)
        {
            continue;
        }

        // The second matching table holds the notice rows.
        TableTag listTable = tables[1] as TableTag;
        for (int rowIndex = 0; rowIndex < listTable.RowCount; rowIndex++)
        {
            TableRow row = listTable.Rows[rowIndex];
            string infoScorce = string.Empty;
            string infoType = "政策法规";
            string headName = row.Columns[0].ToNodePlainString();
            string releaseTime = row.Columns[1].ToPlainTextString().GetDateRegex();
            // The detail path is taken from the anchor's onclick attribute.
            string infoUrl = host + row.Columns[0].GetATagValue("onclick").GetRegexBegEnd(",'", "',");

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList detailTables = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("align", "center")));
            if (detailTables == null || detailTables.Count <= 0)
            {
                continue;
            }

            string ctxHtml = detailTables.Count > 1 ? detailTables[1].ToHtml() : detailTables.ToHtml();
            string infoCtx = ctxHtml.ToCtxString().Replace(">", "");
            string msgType = MsgTypeCosnt.MeiZhouMsgType;
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "梅州市区", string.Empty, infoCtx, infoType);

            if (!crawlAll && processed >= this.MaxCount)
            {
                return null;
            }
            processed++;

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            // Attachments are only fetched for notices that were actually saved.
            parser = new Parser(new Lexer(detailHtml));
            NodeList anchors = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors == null || anchors.Count <= 0)
            {
                continue;
            }

            for (int a = 0; a < anchors.Count; a++)
            {
                ATag anchor = anchors[a] as ATag;
                if (!anchor.IsAtagAttach())
                {
                    continue;
                }

                try
                {
                    BaseAttach baseInfo = ToolHtml.GetBaseAttach(host + anchor.Link, anchor.LinkText, info.Id);
                    if (baseInfo != null)
                    {
                        ToolDb.SaveEntity(baseInfo, string.Empty);
                    }
                }
                catch
                {
                }
            }
        }
    }
    return null;
}
/// <summary>
/// Crawls procurement notices from renshan.huidong.gov.cn. Each list entry whose title
/// contains "中标", "成交" or "结果" becomes a <c>BidInfo</c>; every other entry becomes
/// an <c>InviteInfo</c>. The first image of the detail content is downloaded and saved
/// when no record with the same <c>InfoUrl</c> exists yet, and attachment links are
/// queued in <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops once <c>MaxCount</c> records have been collected.</param>
/// <returns>The collected records (a mix of <c>BidInfo</c> and <c>InviteInfo</c>); an empty list when the first list page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string SiteRoot = "http://renshan.huidong.gov.cn/";
    const string ImageDir = "SiteManage\\Files\\InviteAttach\\";

    IList list = new ArrayList();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "0h120")), true), new TagNameFilter("a")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            // The second-to-last pager link carries the last page number in its title.
            string temp = sNode[sNode.Count - 2].GetATagValue("title");
            pageInt = Convert.ToInt32(temp.GetReplace("第,页"));
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + i, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "0h120")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            TableTag table = viewList[j] as TableTag;
            ATag aTag = viewList[j].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string prjName = aTag.GetAttribute("title");
            string beginDate = table.ToNodePlainString().GetDateRegex();
            string InfoUrl = SiteRoot + aTag.Link;

            string htlDtl;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "fontzoom")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtl.AsHtml();
            bool isBidResult = prjName.Contains("中标") || prjName.Contains("成交") || prjName.Contains("结果");

            if (isBidResult)
            {
                string endDate = string.Empty, prjMgr = string.Empty, otherType = string.Empty;

                string bidCtx = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString();
                string code = bidCtx.GetCodeRegex().GetCodeDel();
                string buildUnit = bidCtx.GetBuildRegex();
                if (buildUnit.Contains("招标代理"))
                {
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理"));
                }
                if (buildUnit.Contains("公司"))
                {
                    // Cut everything after the first "公司".
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
                }

                string bidUnit = bidCtx.GetBidRegex();
                if (string.IsNullOrEmpty(bidUnit))
                {
                    bidUnit = bidCtx.GetRegex("中标候选公司,中标候选人");
                }

                string bidMoney = bidCtx.GetMoneyRegex();
                decimal money;
                if (decimal.TryParse(bidMoney, out money) && money > 100000)
                {
                    // Amounts above 100000 are divided by 10000 before being stored.
                    bidMoney = (money / 10000).ToString();
                }

                // Rewrite the first image to an absolute URL; the HTML is lower-cased in the process.
                string src = string.Empty;
                Parser imgParser = new Parser(new Lexer(HtmlTxt.ToLower()));
                NodeList imgNode = imgParser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                if (imgNode != null && imgNode.Count > 0)
                {
                    ImageTag firstImg = imgNode[0] as ImageTag;
                    string imgUrl = firstImg == null ? null : firstImg.GetAttribute("src");
                    if (!string.IsNullOrEmpty(imgUrl))
                    {
                        // Same rule as for attachment links below: absolute URLs are kept as they are.
                        src = imgUrl.Contains("http") ? imgUrl : SiteRoot + imgUrl;
                        HtmlTxt = HtmlTxt.ToLower().GetReplace(imgUrl, src);
                    }
                }

                string msgType = "惠东县稔山镇人民政府";
                string specType = "政府采购";
                string bidType = prjName.GetInviteBidType();
                BidInfo info = ToolDb.GenBidInfo("广东省", "惠州市区", "惠东县", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                list.Add(info);

                if (!string.IsNullOrEmpty(src))
                {
                    // InfoUrl comes from scraped HTML: double embedded single quotes so it
                    // cannot break out of the SQL string literal.
                    string sql = string.Format("select Id from BidInfo where InfoUrl='{0}'", (info.InfoUrl ?? string.Empty).Replace("'", "''"));
                    object obj = ToolDb.ExecuteScalar(sql);
                    if (obj == null || obj.ToString() == "")
                    {
                        try
                        {
                            BaseAttach attach = ToolHtml.GetBaseAttach(src, prjName, info.Id, ImageDir);
                            if (attach != null)
                            {
                                ToolDb.SaveEntity(attach, "");
                            }
                        }
                        catch
                        {
                            // Best effort: a failed image download must not abort the crawl.
                        }
                    }
                }

                parser = new Parser(new Lexer(HtmlTxt));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int k = 0; k < aNode.Count; k++)
                    {
                        ATag a = aNode[k].GetATag();
                        if (a == null || !a.IsAtagAttach())
                        {
                            continue;
                        }
                        string link = a.Link.ToLower().Contains("http") ? a.Link : SiteRoot + a.Link;
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
            else
            {
                string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

                string inviteCtx = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString();
                string inviteType = prjName.GetInviteBidType();
                string code = inviteCtx.GetCodeRegex().GetCodeDel();
                string buildUnit = inviteCtx.GetBuildRegex();
                string prjAddress = inviteCtx.GetAddressRegex();
                if (buildUnit.Contains("招标代理"))
                {
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理"));
                }
                if (buildUnit.Contains("公司"))
                {
                    // Cut everything after the first "公司".
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
                }

                // Rewrite the first image to an absolute URL; the HTML is lower-cased in the process.
                string src = string.Empty;
                Parser imgParser = new Parser(new Lexer(HtmlTxt.ToLower()));
                NodeList imgNode = imgParser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                if (imgNode != null && imgNode.Count > 0)
                {
                    ImageTag firstImg = imgNode[0] as ImageTag;
                    string imgUrl = firstImg == null ? null : firstImg.GetAttribute("src");
                    if (!string.IsNullOrEmpty(imgUrl))
                    {
                        // Same rule as for attachment links below: absolute URLs are kept as they are.
                        src = imgUrl.Contains("http") ? imgUrl : SiteRoot + imgUrl;
                        HtmlTxt = HtmlTxt.ToLower().GetReplace(imgUrl, src);
                    }
                }

                string msgType = "惠东县稔山镇人民政府";
                string specType = "政府采购";
                InviteInfo info = ToolDb.GenInviteInfo("广东省", "惠州市区", "惠东县", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl,
                    HtmlTxt);
                list.Add(info);

                if (!string.IsNullOrEmpty(src))
                {
                    // InfoUrl comes from scraped HTML: double embedded single quotes so it
                    // cannot break out of the SQL string literal.
                    string sql = string.Format("select Id from InviteInfo where InfoUrl='{0}'", (info.InfoUrl ?? string.Empty).Replace("'", "''"));
                    object obj = ToolDb.ExecuteScalar(sql);
                    if (obj == null || obj.ToString() == "")
                    {
                        try
                        {
                            BaseAttach attach = ToolHtml.GetBaseAttach(src, prjName, info.Id, ImageDir);
                            if (attach != null)
                            {
                                ToolDb.SaveEntity(attach, "");
                            }
                        }
                        catch
                        {
                            // Best effort: a failed image download must not abort the crawl.
                        }
                    }
                }

                parser = new Parser(new Lexer(HtmlTxt));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int k = 0; k < aNode.Count; k++)
                    {
                        ATag a = aNode[k].GetATag();
                        if (a == null || !a.IsAtagAttach())
                        {
                            continue;
                        }
                        string link = a.Link.ToLower().Contains("http") ? a.Link : SiteRoot + a.Link;
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls bid-opening records from the JSON list at <c>SiteUrl + MaxCount</c> and saves
/// each one directly through <c>ToolDb.SaveEntity</c>. The detail HTML is taken from
/// <c>queryOldOTDataDetail.do</c>; when that is blank the <c>kbJiLu</c> endpoints are
/// used instead. Attachments of saved records are downloaded and saved too.
/// </summary>
/// <param name="crawlAll">When false, at most <c>MaxCount</c> records are processed.</param>
/// <returns>
/// An (always empty) list, because records are persisted rather than returned;
/// <c>null</c> when the list request itself fails.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string ApiRoot = "https://www.szjsjy.com.cn:8001/jyw-ba/jyxx/";
    const string SiteRoot = "https://www.szjsjy.com.cn:8001/";
    const string AttachDir = "SiteManage\\Files\\Attach\\";
    const string AttachCompareFields = "SourceID,AttachServerPath";

    IList list = new List<BidSituation>();
    int sqlCount = 0;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + this.MaxCount);
    }
    catch
    {
        return null;
    }

    if (string.IsNullOrEmpty(html))
    {
        return list;
    }

    // Keep only the outermost {...} of the response before deserializing it.
    int startIndex = html.IndexOf('{');
    int endIndex = html.LastIndexOf('}');
    if (startIndex < 0 || endIndex < startIndex)
    {
        return list;
    }
    html = html.Substring(startIndex, (endIndex + 1) - startIndex);

    JavaScriptSerializer serializer = new JavaScriptSerializer();
    Dictionary<string, object> smsTypeJson = (Dictionary<string, object>)serializer.DeserializeObject(html);
    object[] objvalues = smsTypeJson["rows"] as object[];
    if (objvalues == null)
    {
        // "rows" is not an array: nothing to crawl.
        return list;
    }

    foreach (object objValue in objvalues)
    {
        Dictionary<string, object> dic = (Dictionary<string, object>)objValue;
        string PublicityEndDate = string.Empty, ctx = string.Empty, HtmlTxt = string.Empty;
        string code = Convert.ToString(dic["bdBH"]);
        string prjName = Convert.ToString(dic["bdName"]);
        string beginDate = Convert.ToString(dic["faBuTime2"]);
        string idt = Convert.ToString(dic["bdGuid"]);
        string InfoUrl = Convert.ToString(dic["detailUrl"]);
        string attachJson = string.Empty;

        try
        {
            string urll = ApiRoot + "queryOldOTDataDetail.do?type=5&id=" + idt;
            HtmlTxt = this.ToolWebSite.GetHtmlByUrl(urll).GetJsString().GetReplace("\\t,\\r,\\n,\"");
            if (string.IsNullOrWhiteSpace(HtmlTxt))
            {
                // Blank detail: switch to the kbJiLu view page and its JSON detail.
                string kdGuid = Convert.ToString(dic["kbJiLuGuid"]);
                InfoUrl = ApiRoot + "kbJiLu_View.do?kbJiLuGuid=" + kdGuid;
                HtmlTxt = this.ToolWebSite.GetHtmlByUrl(InfoUrl);
                string url = ApiRoot + "querykbJiLuDetail.do?ggGuid=&bdGuid=&kbJiLuGuid=" + kdGuid;
                attachJson = this.ToolWebSite.GetHtmlByUrl(url);
            }
        }
        catch
        {
            // Skip records whose detail cannot be fetched.
            continue;
        }

        string gcLeixing = string.Empty, jywTime = string.Empty, surl = string.Empty,
               attachId = string.Empty, attachFileGroupGuid = string.Empty;

        if (!string.IsNullOrWhiteSpace(attachJson))
        {
            JavaScriptSerializer newSerializer = new JavaScriptSerializer();
            Dictionary<string, object> newTypeJson = (Dictionary<string, object>)newSerializer.DeserializeObject(attachJson);
            Dictionary<string, object> kdInfo = (Dictionary<string, object>)newTypeJson["kbJiLu"];
            try
            {
                attachId = Convert.ToString(kdInfo["kbJiLuGuid"]);
                attachFileGroupGuid = Convert.ToString(kdInfo["attachFileGroupGuid"]);
            }
            catch
            {
                // Either key may be missing; attachments are then looked up in the HTML instead.
            }
            gcLeixing = Convert.ToString(kdInfo["gcLeiXing"]);
            jywTime = Convert.ToString(kdInfo["jywFaBuEndTime"]);

            // Example: https://www.szjsjy.com.cn:8001/jyw-ba/jyxx/kbJiLu_View.do?kbJiLuGuid=9cb75eb8-66b6-441c-9686-471dfa357ff5
            // NOTE(review): the kbJiLuGuid parameter is filled with attachFileGroupGuid while
            // attachId (the record's kbJiLuGuid) is read but never used; confirm which is intended.
            surl = ApiRoot + "kbJiLu_View.do?kbJiLuGuid=" + attachFileGroupGuid;
            attachJson = this.ToolWebSite.GetHtmlByUrl(surl);
            HtmlTxt = attachJson;

            Parser parserNew = new Parser(new Lexer(HtmlTxt));
            NodeList tableNode = parserNew.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "de_tab1")));
            if (tableNode != null && tableNode.Count > 0)
            {
                // Fill the empty cells of the view table with values from the JSON.
                HtmlTxt = tableNode.AsHtml();
                HtmlTxt = HtmlTxt.GetReplace("<td id=\"bdBH\"> </td>", "<td id=\"bdBH\"> " + code + "</td>");
                HtmlTxt = HtmlTxt.GetReplace("<td id=\"bdName\"> </td>", "<td id=\"bdName\"> " + prjName + "</td>");
                HtmlTxt = HtmlTxt.GetReplace("<td id=\"gcLeiXing\"> </td>", "<td id=\"gcLeiXing\"> " + gcLeixing + "</td>");
                HtmlTxt = HtmlTxt.GetReplace("<td id=\"jieZhiTime\"> </td>", "<td id=\"jieZhiTime\"> " + jywTime + "</td>");
                ctx = HtmlTxt.Replace("</tr>", "\r\n").ToCtxString();
            }
        }

        // Keep the row-separated text built above; only fall back to the plain conversion
        // when nothing was produced (previously the value above was always overwritten).
        if (string.IsNullOrEmpty(ctx))
        {
            ctx = HtmlTxt.ToCtxString();
        }

        string msgType = "深圳市建设工程交易中心宝安分中心";
        BidSituation info = ToolDb.GetBidSituation("广东省", "深圳宝安区工程", "宝安区", code, prjName, PublicityEndDate, msgType, InfoUrl, ctx, HtmlTxt, beginDate);

        // Check the cap before counting this record so that exactly MaxCount records are
        // processed; counting first dropped the MaxCount-th record without saving it.
        if (!crawlAll && sqlCount >= this.MaxCount)
        {
            return list;
        }
        sqlCount++;

        if (!ToolDb.SaveEntity(info,
                this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx))
        {
            continue;
        }

        if (!string.IsNullOrWhiteSpace(attachFileGroupGuid))
        {
            // Attachments are listed by the file-group endpoint.
            string moJson = string.Empty;
            string sUrl = ApiRoot + "filegroup/queryByGroupGuidZS.do?groupGuid=" + attachFileGroupGuid;
            try
            {
                moJson = this.ToolWebSite.GetHtmlByUrl(sUrl);
            }
            catch
            {
                // Best effort: the record is already saved, attachments are optional.
            }

            if (!string.IsNullOrWhiteSpace(moJson))
            {
                JavaScriptSerializer newSerializers = new JavaScriptSerializer();
                Dictionary<string, object> mofo = (Dictionary<string, object>)newSerializers.DeserializeObject(moJson);
                object[] objs = (object[])mofo["rows"];
                foreach (object objAttach in objs)
                {
                    Dictionary<string, object> attachs = (Dictionary<string, object>)objAttach;
                    string attachguid = Convert.ToString(attachs["attachGuid"]);
                    string attachName = Convert.ToString(attachs["attachName"]);
                    string link = SiteRoot + "file/downloadFile?fileId=" + attachguid;
                    BaseAttach attach = ToolHtml.GetBaseAttach(link, attachName, info.Id, AttachDir);
                    if (attach != null)
                    {
                        ToolDb.SaveEntity(attach, AttachCompareFields);
                    }
                }
            }
        }
        else
        {
            // No file group: every anchor of the detail HTML is tried as an attachment.
            Parser parser = new Parser(new Lexer(HtmlTxt));
            NodeList fileNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (fileNode != null && fileNode.Count > 0)
            {
                for (int f = 0; f < fileNode.Count; f++)
                {
                    ATag tag = fileNode[f] as ATag;
                    if (tag == null)
                    {
                        continue;
                    }
                    try
                    {
                        string link;
                        if (tag.Link.ToLower().Contains("http"))
                        {
                            link = tag.Link.Replace("\\", "");
                        }
                        else
                        {
                            link = SiteRoot + tag.Link;
                        }
                        BaseAttach attach = ToolHtml.GetBaseAttach(link, tag.LinkText, info.Id, AttachDir);
                        if (attach != null)
                        {
                            ToolDb.SaveEntity(attach, AttachCompareFields);
                        }
                    }
                    catch
                    {
                        // Best effort: one failed download must not stop the remaining ones.
                        continue;
                    }
                }
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls invitation-to-bid notices from www1.cqjsxx.com. Pages after the first are
/// requested by posting the ASP.NET form state back to <c>SiteUrl</c>. For each row of the
/// <c>dgData</c> table the detail page is fetched, its <c>DetailTable</c> cells are
/// flattened into text, an <c>InviteInfo</c> is generated and attachment links are queued
/// in <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops once <c>MaxCount</c> notices have been collected.</param>
/// <returns>The collected notices; an empty list when the first list page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList results = new List<InviteInfo>();
    int totalPages = 1;
    string pageHtml = string.Empty;
    string cookie = string.Empty;

    try
    {
        pageHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default, ref cookie);
    }
    catch
    {
        return results;
    }

    Parser parser = new Parser(new Lexer(pageHtml));
    NodeList pageCountNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "lblPageCount")));
    if (pageCountNode != null && pageCountNode.Count > 0)
    {
        try
        {
            totalPages = int.Parse(pageCountNode[0].ToNodePlainString());
        }
        catch
        {
        }
    }

    for (int page = 1; page <= totalPages; page++)
    {
        if (page > 1)
        {
            // Post back the form state of the previously loaded page to move to the next one.
            string viewState = this.ToolWebSite.GetAspNetViewState(pageHtml);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(pageHtml);
            string[] fieldNames = new string[]
            {
                "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "__EVENTVALIDATION",
                "textfield", "textfield", "select", "SearchName", "SearchNo", "txtSqlText", "checkPage"
            };
            string[] fieldValues = new string[]
            {
                "Linkbutton3", "", viewState, eventValidation,
                "", "", "%A1%AA%A1%AA%C6%F3%D2%B5%A1%AA%A1%AA", "", "",
                " FProjectName like ''%%'' and FTNO like ''%%''", (page - 1).ToString()
            };
            NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
            try
            {
                pageHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, form, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(pageHtml));
        NodeList gridNode = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "dgData")));
        if (gridNode == null || gridNode.Count <= 0)
        {
            continue;
        }

        TableTag grid = gridNode[0] as TableTag;
        // Row 0 is skipped (the loop starts at 1).
        for (int rowIndex = 1; rowIndex < grid.RowCount; rowIndex++)
        {
            string remark = string.Empty;
            string otherType = string.Empty;
            string inviteCtx = string.Empty;

            TableRow row = grid.Rows[rowIndex];
            string prjName = row.Columns[0].ToNodePlainString();
            string city = row.Columns[1].ToNodePlainString();
            string endDate = row.Columns[2].ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://www1.cqjsxx.com/webcqjg/GcxxFolder/" + row.Columns[0].GetATagHref();

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList detailNode = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "DetailTable")));
            if (detailNode == null || detailNode.Count <= 0)
            {
                continue;
            }

            string HtmlTxt = detailNode.AsHtml();
            TableTag detailTable = detailNode[0] as TableTag;
            for (int r = 0; r < detailTable.RowCount; r++)
            {
                for (int c = 0; c < detailTable.Rows[r].ColumnCount; c++)
                {
                    // Cells alternate: an even-indexed cell is followed by ":", an
                    // odd-indexed cell ends the line.
                    string cellText = detailTable.Rows[r].Columns[c].ToPlainTextString().ToNodeString();
                    string separator = (c % 2 == 1) ? "\r\n" : ":";
                    inviteCtx += cellText + separator;
                }
            }

            // Publication date: try "备案日期" first, then "开始日期", then today's date.
            string beginDate = inviteCtx.GetRegex("备案日期").GetDateRegex();
            if (string.IsNullOrEmpty(beginDate))
            {
                beginDate = inviteCtx.GetRegex("备案日期");
            }
            if (string.IsNullOrEmpty(beginDate))
            {
                beginDate = inviteCtx.GetRegex("开始日期").GetDateRegex();
            }
            if (string.IsNullOrEmpty(beginDate))
            {
                beginDate = inviteCtx.GetRegex("开始日期");
            }
            if (string.IsNullOrEmpty(beginDate))
            {
                beginDate = DateTime.Now.ToString("yyyy-MM-dd");
            }

            string buildUnit = inviteCtx.GetBuildRegex();
            string prjAddress = inviteCtx.GetAddressRegex();
            string code = inviteCtx.GetCodeRegex();
            string specType = "建设工程";
            string inviteType = prjName.GetInviteBidType();
            string msgType = "重庆市工程建设招标投标交易中心";

            InviteInfo info = ToolDb.GenInviteInfo("重庆市", "重庆市及区县", city, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType,
                InfoUrl, HtmlTxt);
            results.Add(info);

            parser = new Parser(new Lexer(HtmlTxt));
            NodeList anchors = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors != null && anchors.Count > 0)
            {
                for (int k = 0; k < anchors.Count; k++)
                {
                    ATag anchor = anchors[k] as ATag;
                    if (!anchor.IsAtagAttach())
                    {
                        continue;
                    }

                    string link;
                    if (anchor.Link.ToLower().Contains("http"))
                    {
                        link = anchor.Link;
                    }
                    else
                    {
                        link = "http://www1.cqjsxx.com/" + anchor.Link;
                    }
                    BaseAttach attach = ToolDb.GenBaseAttach(anchor.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && results.Count >= this.MaxCount)
            {
                return results;
            }
        }
    }
    return results;
}
/// <summary>
/// Crawls the paged notice list at <c>SiteUrl</c>. For every list row whose detail page contains the
/// <c>newsinfo</c> div, a <c>NotifyInfo</c> is generated and saved through <c>ToolDb</c>; when the save
/// reports success, the images and attachment links found in the detail HTML are saved as well.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, crawling stops after <c>MaxCount</c> notices have been processed.</param>
/// <returns>Always <c>null</c>; this crawler saves its entities itself instead of returning them.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://www.szpark.com.cn";

    int pageInt = 1, sqlCount = 0;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        // Without the first list page there is nothing to crawl.
        return null;
    }

    // Get the page count from the pager link.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("id", "PageDataList__ctl7_LinkButton1")));
    if (pageList != null && pageList.Count > 0)
    {
        try
        {
            int parsedPages;
            // Only accept a positive number; anything else keeps the single page already downloaded.
            if (int.TryParse(pageList.AsString().GetRegexBegEnd("共", "页"), out parsedPages) && parsedPages > 0)
            {
                pageInt = parsedPages;
            }
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Paging is an ASP.NET postback: it needs the view state of the page loaded last.
                string viewState = this.ToolWebSite.GetAspNetViewState(html);
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                    new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "head1:username", "head1:Password", "head1:rbLoginType", "Tb_keyword", "ddlNewsType", "ddlistaddnewsdate" },
                    new string[] { "PageDataList$_ctl" + (i + 1).ToString() + "$LinkButton1", "", viewState, "", "", "unit", "", "20", "" });
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
            }
            catch
            {
                // Skip a page that cannot be loaded and try the next one.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", " tb_list")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 3)
            {
                // Columns 1 (title/link) and 2 (date) are read below; shorter rows cannot be notices.
                continue;
            }

            string infoType = "通知公告";
            string infoScorce = string.Empty;
            string releaseTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string headName = tr.Columns[1].ToNodePlainString();
            string infoUrl = siteRoot + tr.Columns[1].GetATagHref();

            string htldtl;
            try
            {
                htldtl = ToolHtml.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                // No detail page means no content node, so the row is skipped.
                continue;
            }

            parser = new Parser(new Lexer(htldtl));
            NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "newsinfo")));
            if (noList == null || noList.Count == 0)
            {
                continue;
            }

            string ctxHtml = noList.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");
            string infoCtx = ctxHtml.ToCtxString().Replace(" ", "").Replace(" ", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
            infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            string msgType = MsgTypeCosnt.ShenZhenFJYLMsgType;

            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType);
            sqlCount++;

            if (ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                // Save the inline images of the notice (GIFs are skipped).
                parser = new Parser(new Lexer(ctxHtml));
                NodeList imgList = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                if (imgList != null && imgList.Count > 0)
                {
                    for (int m = 0; m < imgList.Count; m++)
                    {
                        try
                        {
                            ImageTag img = imgList[m] as ImageTag;
                            string src = img == null ? null : img.GetAttribute("src");
                            if (string.IsNullOrEmpty(src) || src.ToLower().Contains(".gif"))
                            {
                                continue;
                            }

                            BaseAttach obj = src.Contains("http")
                                ? ToolHtml.GetBaseAttach(src, headName, info.Id)
                                : ToolHtml.GetBaseAttach(siteRoot + src.Replace("../", "/").Replace("./", "/"), headName, info.Id);
                            if (obj != null)
                            {
                                ToolDb.SaveEntity(obj, string.Empty);
                            }
                        }
                        catch
                        {
                            // Best effort: one broken image must not abort the notice.
                        }
                    }
                }

                // Save the attachment links of the notice.
                parser = new Parser(new Lexer(ctxHtml));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int a = 0; a < aNode.Count; a++)
                    {
                        ATag aTag = aNode[a] as ATag;
                        if (aTag == null || !aTag.IsAtagAttach())
                        {
                            continue;
                        }

                        try
                        {
                            string href = aTag.GetATagHref();
                            BaseAttach obj = href.Contains("http")
                                ? ToolHtml.GetBaseAttach(href, aTag.LinkText, info.Id)
                                : ToolHtml.GetBaseAttach(siteRoot + href.Replace("../", "/").Replace("./", "/"), aTag.LinkText, info.Id);
                            if (obj != null)
                            {
                                ToolDb.SaveEntity(obj, string.Empty);
                            }
                        }
                        catch
                        {
                            // Best effort: one broken attachment must not abort the notice.
                        }
                    }
                }
            }

            // Check the limit only after the current notice has been handled; checking before the
            // save dropped the MaxCount-th notice (and saved nothing at all when MaxCount was 1).
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
        }
    }

    return null;
}
/// <summary>
/// Downloads the JSON list at <c>SiteUrl + MaxCount</c>, and for every entry of its <c>rows</c> array
/// fetches the detail HTML, builds an <c>InviteInfo</c> and saves it through <c>ToolDb</c>. When the save
/// reports success, attachment links with an absolute URL are downloaded and saved as well.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, crawling stops after <c>MaxCount</c> rows have been processed.</param>
/// <returns>
/// <c>null</c> when the list cannot be downloaded; otherwise a list to which nothing is added here,
/// because entities are saved directly.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string detailUrlPrefix = "https://www.szjsjy.com.cn:8001/jyw-ba/jyxx/queryOldOTDataDetail.do?type=1&id=";
    const string noticeUrlPrefix = "https://www.szjsjy.com.cn:8001/jyw-ba/jyxx/showGongGao.do?ggGuid=";

    IList list = new List<InviteInfo>();
    int sqlCount = 0;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + this.MaxCount);
    }
    catch
    {
        return null;
    }

    if (string.IsNullOrEmpty(html))
    {
        return list;
    }

    // Keep only the outermost {...}; a response without a JSON object has nothing to crawl.
    int startIndex = html.IndexOf("{");
    int endIndex = html.LastIndexOf("}");
    if (startIndex < 0 || endIndex < startIndex)
    {
        return list;
    }

    html = html.Substring(startIndex, (endIndex + 1) - startIndex);
    JavaScriptSerializer serializer = new JavaScriptSerializer();
    Dictionary<string, object> smsTypeJson = serializer.DeserializeObject(html) as Dictionary<string, object>;
    object rowsValue;
    if (smsTypeJson == null || !smsTypeJson.TryGetValue("rows", out rowsValue))
    {
        return list;
    }

    object[] objvalues = rowsValue as object[];
    if (objvalues == null)
    {
        return list;
    }

    foreach (object objValue in objvalues)
    {
        Dictionary<string, object> dic = objValue as Dictionary<string, object>;
        if (dic == null)
        {
            continue;
        }

        string remark = string.Empty, otherType = string.Empty, endDate = string.Empty, HtmlTxt = string.Empty;
        string code = Convert.ToString(dic["gcBH"]);
        string prjName = Convert.ToString(dic["gcName"]);
        string beginDate = Convert.ToString(dic["ggStartTime2"]).GetDateRegex();
        string end = Convert.ToString(dic["ggEndTime"]);
        try
        {
            endDate = ToolHtml.GetDateTimeByLong(Convert.ToInt64(end)).ToString();
        }
        catch
        {
            // The end time is optional: an unparsable value leaves endDate empty.
        }

        string inviteType = Convert.ToString(dic["gcLeiXing2"]);
        string InfoUrl = Convert.ToString(dic["detailUrl"]);

        try
        {
            string primaryUrl = detailUrlPrefix + dic["gcGuid"];
            try
            {
                HtmlTxt = this.ToolWebSite.GetHtmlByUrl(primaryUrl).GetJsString().GetReplace("\\t,\\r,\\n,\"");
            }
            catch
            {
                // A failure here is tolerated; the notice URL below is tried instead.
            }

            // Only go to the notice URL when the primary URL produced nothing. Previously the
            // primary URL was downloaded a second time even after it had succeeded.
            if (string.IsNullOrWhiteSpace(HtmlTxt))
            {
                string fallbackUrl = noticeUrlPrefix + dic["ggGuid"];
                HtmlTxt = this.ToolWebSite.GetHtmlByUrl(fallbackUrl).GetJsString().GetReplace("\\t,\\r,\\n,\"");
            }

            HtmlTxt = HtmlTxt.GetReplace("},{,maoDian:,html:");
            if (string.IsNullOrWhiteSpace(HtmlTxt))
            {
                // Last attempt: the primary URL once more.
                HtmlTxt = this.ToolWebSite.GetHtmlByUrl(primaryUrl).GetJsString().GetReplace("\\t,\\r,\\n,\"");
            }
        }
        catch
        {
            // No detail content could be obtained for this row; move on to the next one.
            continue;
        }

        string inviteCtx = HtmlTxt.Replace("</span>", "\r\n").Replace("<br />", "\r\n").Replace("<BR>", "\r\n").Replace("<br/>", "\r\n").ToCtxString();
        string prjAddress = inviteCtx.GetAddressRegex();
        string buildUnit = inviteCtx.GetBuildRegex();
        if (string.IsNullOrEmpty(code))
        {
            code = inviteCtx.GetCodeRegex();
        }

        string msgType = "深圳市建设工程交易中心宝安分中心";
        string specType = "建设工程";
        InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳宝安区工程", "宝安区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
        sqlCount++;

        if (ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx))
        {
            Parser parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a == null || !a.IsAtagAttach())
                    {
                        continue;
                    }

                    // Only absolute links are downloaded; relative ones are ignored.
                    if (!a.Link.ToLower().Contains("http"))
                    {
                        continue;
                    }

                    string link = a.Link.Replace("\\", "");
                    BaseAttach attach = null;
                    try
                    {
                        attach = ToolHtml.GetBaseAttach(link, a.LinkText, info.Id, "SiteManage\\Files\\InviteAttach\\");
                    }
                    catch
                    {
                        // Best effort: a failed download leaves attach null and is skipped.
                    }

                    if (attach != null)
                    {
                        ToolDb.SaveEntity(attach, "");
                    }
                }
            }
        }

        if (!crawlAll && sqlCount >= this.MaxCount)
        {
            return list;
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged supplementary-notice grid at <c>SiteUrl</c>. For each row with an absolute detail link
/// whose page contains the <c>spnShow</c> span, a <c>NoticeInfo</c> is added to the result and its
/// attachment links are queued in <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, crawling stops once the result holds <c>MaxCount</c> notices.</param>
/// <returns>The generated notices; empty when the first list page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<NoticeInfo>();
    int pageInt = 1;
    string html;
    string cookiestr = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookiestr);
    }
    catch
    {
        // Nothing can be parsed without the first page.
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "BcwjInfoList1_Pager")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            int parsedPages;
            if (int.TryParse(pageNode[0].ToNodePlainString().GetRegexBegEnd("1/", "页"), out parsedPages))
            {
                pageInt = parsedPages;
            }
        }
        catch
        {
            // Pager text could not be read; only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Paging is an ASP.NET postback carrying the view state of the page loaded last.
                string viewState = this.ToolWebSite.GetAspNetViewState(html);
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                    new string[] { "__VIEWSTATE", "BcwjInfoList1:KeyWord", "__EVENTTARGET", "__EVENTARGUMENT" },
                    new string[] { viewState, "", "BcwjInfoList1:Pager", i.ToString() });
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "BcwjInfoList1_DataGrid1")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 3)
            {
                // Columns 1 (title/link) and 2 (date) are read below.
                continue;
            }

            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                // Rows without a link (for example a header row) are not notices.
                continue;
            }

            string InfoType = "补充通知";
            string buildUnit = string.Empty;
            string InfoTitle = aTag.GetAttribute("title").GetReplace(";");
            string prjCode = tr.Columns[1].ToNodePlainString().GetReplace("[", "【").GetReplace("]", "】").GetRegexBegEnd("【", "】");
            string PublistTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string InfoUrl = aTag.Link;
            if (!InfoUrl.Contains("http"))
            {
                continue;
            }

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "spnShow")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string htmlTxt = dtlNode.AsHtml();
            string InfoCtx = htmlTxt.GetReplace("</p>,<br />,<br/>", "\r\n").ToCtxString();
            NoticeInfo info = ToolDb.GenNoticeInfo("浙江省", "浙江省及地市", "", string.Empty, InfoTitle, InfoType, InfoCtx, PublistTime, string.Empty, "浙江省公共资源交易中心", InfoUrl, prjCode, buildUnit, string.Empty, string.Empty, "政府采购", "建设工程", htmlTxt);
            list.Add(info);

            // Queue the attachment links of the notice body.
            parser = new Parser(new Lexer(htmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a == null || !a.IsAtagAttach())
                    {
                        continue;
                    }

                    string link = a.Link.ToLower().Contains("http") ? a.Link : "http://downc.zmctc.com/" + a.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged news list at <c>SiteUrl</c>. An entry whose title contains 中标, 成交, 结果 or
/// 候选人公示 becomes a <c>BidInfo</c>; every other entry becomes an <c>InviteInfo</c>. Attachment links
/// found in the detail HTML are queued in <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, crawling stops once the result holds <c>MaxCount</c> entities.</param>
/// <returns>The generated entities (a mix of <c>BidInfo</c> and <c>InviteInfo</c>).</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://www.zcjsglj.gov.cn";

    IList list = new ArrayList();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pageBtn")));
    if (pageNode != null && pageNode.Count > 0)
    {
        int parsedPages;
        if (int.TryParse(pageNode[0].ToPlainTextString().GetRegexBegEnd("共", "页"), out parsedPages))
        {
            pageInt = parsedPages;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&PageNo=" + i, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "news")), true), new TagNameFilter("li")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            ATag aTag = viewList[j].GetATag();
            if (aTag == null)
            {
                continue;
            }

            // A link without a title attribute would otherwise crash on the keyword checks below.
            string prjName = aTag.GetAttribute("title") ?? string.Empty;
            string beginDate = viewList[j].ToNodePlainString().GetDateRegex();
            string InfoUrl = siteRoot + aTag.Link;

            string htlDtl;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "newscontent")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtl.AsHtml();
            string endDate = string.Empty, otherType = string.Empty;
            string msgType = "广州市增城区住房和建设局";
            string specType = "政府采购";
            bool isBidResult = prjName.Contains("中标") || prjName.Contains("成交") || prjName.Contains("结果") || prjName.Contains("候选人公示");

            if (isBidResult)
            {
                string bidCtx = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString();
                if (prjName.Length == 4)
                {
                    // NOTE(review): a 4-character title is presumably a generic heading, so the
                    // project name is taken from the body instead - confirm.
                    string tempName = bidCtx.GetRegex("工程名称,项目名称");
                    if (!string.IsNullOrEmpty(tempName))
                    {
                        prjName = tempName;
                    }
                }

                string code = bidCtx.GetCodeRegex().GetCodeDel();
                string buildUnit = bidCtx.GetBuildRegex();
                if (buildUnit.Contains("招标代理"))
                {
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理"));
                }
                if (buildUnit.Contains("公司"))
                {
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
                }

                string bidUnit = bidCtx.GetBidRegex();
                if (string.IsNullOrEmpty(bidUnit))
                {
                    bidUnit = bidCtx.GetRegex("中标候选人为,中标候选公司,中标候选人");
                }

                string bidMoney = bidCtx.GetMoneyRegex();
                string prjMgr = bidCtx.GetMgrRegex();

                // Amounts above 100000 are divided by 10000; unparsable amounts are left untouched.
                decimal money;
                if (decimal.TryParse(bidMoney, out money) && money > 100000)
                {
                    bidMoney = (money / 10000).ToString();
                }

                string bidType = prjName.GetInviteBidType();
                BidInfo info = ToolDb.GenBidInfo("广东省", "广州政府采购", "增城区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                list.Add(info);

                parser = new Parser(new Lexer(HtmlTxt));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int k = 0; k < aNode.Count; k++)
                    {
                        ATag a = aNode[k].GetATag();
                        if (a == null || !a.IsAtagAttach())
                        {
                            continue;
                        }

                        string link = a.Link.ToLower().Contains("http") ? a.Link : siteRoot + "/" + a.Link;
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }
            else
            {
                string remark = string.Empty;
                string inviteCtx = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString();
                if (prjName.Length == 4)
                {
                    // Same generic-heading fallback as in the bid branch.
                    string tempName = inviteCtx.GetRegex("工程名称,项目名称");
                    if (!string.IsNullOrEmpty(tempName))
                    {
                        prjName = tempName;
                    }
                }

                string inviteType = prjName.GetInviteBidType();
                string code = inviteCtx.GetCodeRegex().GetCodeDel();
                string buildUnit = inviteCtx.GetBuildRegex();
                string prjAddress = inviteCtx.GetAddressRegex();
                if (buildUnit.Contains("招标代理"))
                {
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理"));
                }
                if (buildUnit.Contains("公司"))
                {
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
                }

                InviteInfo info = ToolDb.GenInviteInfo("广东省", "广州政府采购", "增城区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                list.Add(info);

                parser = new Parser(new Lexer(HtmlTxt));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int k = 0; k < aNode.Count; k++)
                    {
                        ATag a = aNode[k].GetATag();
                        if (a == null || !a.IsAtagAttach())
                        {
                            continue;
                        }

                        string link = a.Link.ToLower().Contains("http") ? a.Link : siteRoot + "/" + a.Link;
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged bid-result list at <c>SiteUrl + pageNumber</c>. For each entry the detail page body is
/// turned into a <c>BidInfo</c>; when no winning bidder is found in the plain text, the first table of the
/// page is read as "label:value" rows and searched instead. Attachment links are queued in
/// <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, crawling stops once the result holds <c>MaxCount</c> entries.</param>
/// <returns>The generated <c>BidInfo</c> entities.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();

    // Get the page count.
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "1", Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "paging"), new TagNameFilter("div")));
    if (sNode != null && sNode.Count > 0)
    {
        string temp = sNode[0].ToNodePlainString();
        try
        {
            int parsedPages;
            if (int.TryParse(temp.GetRegexBegEnd("/", "转到"), out parsedPages))
            {
                pageInt = parsedPages;
            }
        }
        catch
        {
            // Pager text could not be read; only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + i, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasAttributeFilter("class", "column-info-list"), new TagNameFilter("div")), true), new TagNameFilter("li")));
        if (sNode == null || sNode.Count == 0)
        {
            continue;
        }

        for (int t = 0; t < sNode.Count; t++)
        {
            ATag aTag = sNode[t].GetATag();
            if (aTag == null)
            {
                // A list item without a link has no detail page.
                continue;
            }

            string endDate = string.Empty, otherType = string.Empty, prjMgr = string.Empty;
            string prjName = aTag.LinkText.ToNodeString();
            string InfoUrl = "http://ggzy.zhaoqing.gov.cn" + aTag.Link;
            string beginDate = sNode[t].ToPlainTextString().GetDateRegex();

            string htmldetail;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtnode.AsHtml();
            string bidCtx = HtmlTxt.ToCtxString();
            string bidUnit = bidCtx.GetBidRegex();
            string bidMoney = bidCtx.GetMoneyRegex();

            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                // Fall back to the result table: prefer one with border="1", else the first table.
                dtlparser = new Parser(new Lexer(HtmlTxt));
                NodeList tableNode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("border", "1")));
                if (tableNode == null || tableNode.Count < 1)
                {
                    dtlparser.Reset();
                    tableNode = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                }

                if (tableNode != null && tableNode.Count > 0)
                {
                    TableTag table = tableNode[0] as TableTag;
                    if (table != null && table.RowCount > 0 && table.Rows[0].ColumnCount >= 2)
                    {
                        string ctx = string.Empty;
                        for (int j = 1; j < table.RowCount; j++)
                        {
                            if (table.Rows[j].ColumnCount < 2)
                            {
                                // Rows with fewer than two cells cannot form a label/value pair.
                                continue;
                            }

                            ctx += table.Rows[j].Columns[0].ToNodePlainString() + ":";
                            ctx += table.Rows[j].Columns[1].ToNodePlainString() + "\r\n";
                        }

                        bidUnit = ctx.GetBidRegex();
                        if (string.IsNullOrWhiteSpace(bidUnit))
                        {
                            bidUnit = ctx.GetRegex("单位名称,第一中标候选人");
                        }

                        bidMoney = ctx.GetMoneyRegex();
                        prjMgr = ctx.GetMgrRegex();
                    }
                }
            }

            string buildUnit = bidCtx.GetBuildRegex();
            string prjAddress = bidCtx.GetAddressRegex();
            string code = bidCtx.GetCodeRegex();
            string msgType = "肇庆市公共资源交易中心";
            string specType = "建设工程";
            prjName = ToolDb.GetPrjName(prjName);
            string bidType = ToolHtml.GetInviteTypes(prjName);
            BidInfo info = ToolDb.GenBidInfo("广东省", "肇庆市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            dtlparser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int a = 0; a < aNode.Count; a++)
                {
                    ATag fileTag = aNode[a] as ATag;
                    if (fileTag == null || !fileTag.IsAtagAttach())
                    {
                        continue;
                    }

                    string url;
                    if (fileTag.Link.Contains("http"))
                    {
                        url = fileTag.Link;
                    }
                    else
                    {
                        // Relative links are placed under the first six digits of the publish date.
                        // When no date was matched the segment is shorter or empty instead of
                        // Substring(0, 6) throwing and aborting the whole crawl.
                        string datePart = beginDate.GetReplace("-") ?? string.Empty;
                        if (datePart.Length > 6)
                        {
                            datePart = datePart.Substring(0, 6);
                        }

                        url = this.SiteUrl + datePart + fileTag.Link.GetReplace("./", "/");
                    }

                    BaseAttach item = ToolDb.GenBaseAttach(fileTag.LinkText, info.Id, url);
                    base.AttachList.Add(item);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Posts the list form (<c>classId</c>, <c>key</c>, <c>page</c>) to <c>SiteUrl</c> page by page. For each
/// <c>li</c> with a link whose detail page contains the <c>news_article</c> div, an <c>InviteInfo</c> is
/// added to the result and its attachment links are queued in <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, crawling stops once the result holds <c>MaxCount</c> entries.</param>
/// <returns>The entries collected so far; a failed list request ends the crawl and returns them.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://bid.aited.cn/";

    IList list = new List<InviteInfo>();
    // NOTE(review): the page bound is hard-coded and exclusive (pages 1..294 are requested) - confirm
    // whether it should be read from the site instead.
    int pageInt = 295;
    string html;
    string cookiestr = string.Empty;

    for (int i = 1; i < pageInt; i++)
    {
        NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
            new string[] { "classId", "key", "page" },
            new string[] { "151", "-1", i.ToString() });
        try
        {
            html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
        }
        catch
        {
            return list;
        }

        Parser parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("li"));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            ATag aTag = listNode[j].GetATag();
            if (aTag == null)
            {
                // Every <li> of the response is visited; those without a link are not entries.
                continue;
            }

            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;
            string prjName = aTag.GetAttribute("title").GetReplace("\\\"");
            string beginDate = listNode[j].ToPlainTextString().GetDateRegex();
            string InfoUrl = siteRoot + aTag.Link.GetReplace("../,\\\"");

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "news_article")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            string inviteCtx = HtmlTxt.ToCtxString();
            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            string buildUnit = inviteCtx.GetBuildRegex();
            string prjAddress = inviteCtx.GetAddressRegex().GetCodeDel();
            string msgType = "中航技国际经贸发展有限公司";
            string specType = "建设工程";
            string inviteType = prjName.GetInviteBidType();
            InviteInfo info = ToolDb.GenInviteInfo("北京市", "北京市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a == null)
                    {
                        continue;
                    }

                    if (a.IsAtagAttach() || a.Link.Contains("DownloadServlet"))
                    {
                        string link = a.Link.ToLower().Contains("http") ? a.Link : siteRoot + a.Link;
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged notice grid (<c>GridView1</c>) at <c>SiteUrl</c>. For each data row whose detail page
/// contains the table with <c>width="96%"</c>, image URLs in the content are rewritten to absolute URLs,
/// a <c>NotifyInfo</c> is saved through <c>ToolDb</c>, and - when the save reports success - the images and
/// download links are saved as attachments.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, crawling stops after <c>MaxCount</c> notices have been processed.</param>
/// <returns>Always <c>null</c>; this crawler saves its entities itself instead of returning them.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://www.sdzb.gov.cn";

    int pageInt = 1, sqlCount = 0;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return null;
    }

    // The page count is read from the href of the last pager link.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "AspNetPager1")), true), new TagNameFilter("a")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            int parsedPages;
            if (int.TryParse(pageNode[pageNode.Count - 1].GetATagHref().GetRegexBegEnd(",'", "'"), out parsedPages))
            {
                pageInt = parsedPages;
            }
        }
        catch
        {
            // Pager could not be read; only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Paging is an ASP.NET postback carrying the state of the page loaded last.
                string viewState = this.ToolWebSite.GetAspNetViewState(html);
                string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                    new string[] { "__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTTARGET", "__EVENTARGUMENT", "__EVENTVALIDATION", "TBKey", "AspNetPager1_input" },
                    new string[] { viewState, "E997B95C", "AspNetPager1", i.ToString(), eventValidation, "", (i - 1).ToString() });
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GridView1")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped by the loop start index.
        for (int j = 1; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 3)
            {
                // Columns 1 (title/link) and 2 (date) are read below.
                continue;
            }

            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string infoScorce = string.Empty;
            string headName = aTag.LinkText;
            string infoType = "通知公告";
            string releaseTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string infoUrl = siteRoot + "/" + aTag.Link;

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "96%")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string ctxHtml = dtlNode.AsHtml();
            string infoCtx = ctxHtml.ToCtxString();
            string msgType = "山东省建设工程招标投标管理办公室";

            // Rewrite image URLs to absolute ones and remember them for download after the save.
            List<string> attach = new List<string>();
            parser = new Parser(new Lexer(ctxHtml));
            NodeList imgNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
            if (imgNode != null && imgNode.Count > 0)
            {
                for (int p = 0; p < imgNode.Count; p++)
                {
                    ImageTag img = imgNode[p] as ImageTag;
                    if (img == null)
                    {
                        continue;
                    }

                    string link = siteRoot + img.ImageURL.GetReplace("../,./");
                    ctxHtml = ctxHtml.GetReplace(img.ImageURL, link);
                    attach.Add(link);
                }
            }

            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "山东省", "山东省及地市", "", infoCtx, infoType);
            sqlCount++;

            if (ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate))
            {
                for (int a = 0; a < attach.Count; a++)
                {
                    try
                    {
                        BaseAttach entity = ToolHtml.GetBaseAttachByUrl(attach[a], headName, info.Id);
                        if (entity != null)
                        {
                            ToolDb.SaveEntity(entity, "SourceID,AttachServerPath");
                        }
                    }
                    catch
                    {
                        // Best effort: one broken image must not abort the notice.
                    }
                }

                parser = new Parser(new Lexer(ctxHtml));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int k = 0; k < aNode.Count; k++)
                    {
                        ATag a = aNode[k] as ATag;
                        if (a == null)
                        {
                            continue;
                        }

                        if (!(a.Link.ToLower().Contains("download") || a.IsAtagAttach()))
                        {
                            continue;
                        }

                        string link = a.Link.ToLower().Contains("http") ? a.Link : siteRoot + a.Link.GetReplace("../,./");
                        if (Encoding.Default.GetByteCount(link) > 500)
                        {
                            // Links longer than 500 bytes are skipped.
                            continue;
                        }

                        try
                        {
                            BaseAttach entity = ToolHtml.GetBaseAttachByUrl(link, a.LinkText, info.Id);
                            if (entity != null)
                            {
                                ToolDb.SaveEntity(entity, "SourceID,AttachServerPath");
                            }
                        }
                        catch
                        {
                            // Best effort: one broken attachment must not abort the notice.
                        }
                    }
                }
            }

            // Check the limit only after the current notice has been handled; checking before the
            // save dropped the MaxCount-th notice (and saved nothing at all when MaxCount was 1).
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
        }
    }

    return null;
}
/// <summary>
/// Crawls the paged tender list at <c>SiteUrl</c> (later pages come from the <c>index_N.htm</c> URLs). For
/// each list item with a link whose detail page contains the <c>m_r m_r_g</c> div, an <c>InviteInfo</c> is
/// generated, its attachment links are queued in <c>base.AttachList</c>, and the entity is added to the result.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, crawling stops once the result holds <c>MaxCount</c> entries.</param>
/// <returns>The generated <c>InviteInfo</c> entities; empty when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList results = new List<InviteInfo>();
    int totalPages = 1;
    string listHtml = string.Empty;

    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        return results;
    }

    // Read the page count from the pager div, keeping 1 when it cannot be parsed.
    Parser pagerParser = new Parser(new Lexer(listHtml));
    NodeList pager = pagerParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "page")));
    if (pager != null && pager.Count > 0)
    {
        try
        {
            string pageText = pager[0].ToPlainTextString().GetRegexBegEnd("共", "页");
            totalPages = int.Parse(pageText);
        }
        catch
        {
        }
    }

    for (int page = 1; page <= totalPages; page++)
    {
        if (page > 1)
        {
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl("http://ggzy.zhuhai.gov.cn//zbgg/index_" + page + ".htm", Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        Parser listParser = new Parser(new Lexer(listHtml));
        NodeList items = listParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "news")), true), new TagNameFilter("li")));
        if (items == null || items.Count <= 0)
        {
            continue;
        }

        for (int idx = 0; idx < items.Count; idx++)
        {
            INode item = items[idx];
            ATag titleLink = item.GetATag();
            if (titleLink == null)
            {
                continue;
            }

            string prjName = titleLink.GetAttribute("title");
            string beginDate = item.ToPlainTextString().GetDateRegex();
            string InfoUrl = titleLink.Link;

            string detailHtml = string.Empty;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            Parser detailParser = new Parser(new Lexer(detailHtml));
            NodeList detail = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "m_r m_r_g")));
            if (detail == null || detail.Count <= 0)
            {
                continue;
            }

            string HtmlTxt = detail.AsHtml();
            string inviteCtx = HtmlTxt.ToCtxString();

            // The field regexes run on the content with spaces removed.
            string compactCtx = inviteCtx.GetReplace(" ");
            string buildUnit = compactCtx.GetBuildRegex();
            string code = compactCtx.GetCodeRegex().GetCodeDel();
            string prjAddress = compactCtx.GetAddressRegex();

            // Cut trailing text off the build unit: keep up to "管理局", drop from "联系" / "价格".
            if (buildUnit.Contains("管理局"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("管理局")) + "管理局";
            }
            if (buildUnit.Contains("联系"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("联系"));
            }
            if (buildUnit.Contains("价格"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("价格"));
            }

            string inviteType = prjName.GetInviteBidType();
            InviteInfo info = ToolDb.GenInviteInfo("广东省", "珠海市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, string.Empty, inviteCtx, string.Empty, "珠海市公共资源交易中心", inviteType, "建设工程", string.Empty, InfoUrl, HtmlTxt);

            Parser linkParser = new Parser(new Lexer(HtmlTxt));
            NodeList anchors = linkParser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors != null && anchors.Count > 0)
            {
                for (int n = 0; n < anchors.Count; n++)
                {
                    ATag anchor = anchors[n] as ATag;
                    if (!anchor.IsAtagAttach())
                    {
                        continue;
                    }

                    string link;
                    if (anchor.Link.ToLower().Contains("http"))
                    {
                        link = anchor.Link;
                    }
                    else
                    {
                        link = "http://ggzy.zhuhai.gov.cn/" + anchor.Link;
                    }

                    base.AttachList.Add(ToolDb.GenBaseAttach(anchor.LinkText, info.Id, link));
                }
            }

            results.Add(info);
            if (!crawlAll && results.Count >= this.MaxCount)
            {
                return results;
            }
        }
    }

    return results;
}
/// <summary>
/// Walks the paged notice table at <c>SiteUrl</c>, downloads each linked detail page and builds one
/// <c>NoticeInfo</c> per row. Attachment links found in the detail HTML are queued on <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records have been collected.</param>
/// <returns>The collected records; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<NoticeInfo>();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // The page count is the text between "总页数" and "当前页"; it stays at 1 when that cannot be parsed.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("nowrap", "true")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("总页数", "当前页").Replace(":", "");
            pageInt = int.Parse(temp);
        }
        catch
        {
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "?Paging=" + i, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "99%")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // The last row of the table is not read as a data row.
        for (int j = 0; j < table.RowCount - 1; j++)
        {
            string area = string.Empty;
            string prjCode = string.Empty;

            TableRow tr = table.Rows[j];
            // Skip rows that do not have the title and date cells, or whose title cell has no link,
            // instead of letting one malformed row abort the whole crawl.
            if (tr.ColumnCount < 3)
            {
                continue;
            }
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string InfoTitle = aTag.GetAttribute("title");
            string PublistTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://www.gxzbtb.cn" + aTag.Link;
            string InfoType = "澄清变更";

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string htmlTxt = dtlNode.AsHtml().GetJsString();
            string InfoCtx = htmlTxt.ToCtxString();
            string buildUnit = InfoCtx.GetBuildRegex();
            NoticeInfo info = ToolDb.GenNoticeInfo("广西壮族自治区", "广西壮族自治区及地市", area, string.Empty, InfoTitle, InfoType, InfoCtx, PublistTime, string.Empty, "广西壮族自治区公共资源交易中心", InfoUrl, prjCode, buildUnit, string.Empty, string.Empty, "交通工程", string.Empty, htmlTxt);

            parser = new Parser(new Lexer(htmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k].GetATag();
                    if (a == null || !a.IsAtagAttach())
                    {
                        continue;
                    }

                    // Links that do not contain "http" are prefixed with the site root.
                    string link = a.Link.ToLower().Contains("http")
                        ? a.Link
                        : "http://www.gxzbtb.cn" + a.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Requests the JSON tender listing (asking for <c>MaxCount</c> rows), downloads the announcement body of
/// every row and builds one <c>InviteInfo</c> per row. Attachment links found in the announcement HTML are
/// queued on <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records have been collected.</param>
/// <returns>
/// The collected records. Returns <c>null</c> when the listing request itself fails, and an empty list when
/// the response contains no JSON object or no "rows" array.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    string urlList = "http://www.ezztb.gov.cn/jiaoyixinxi/queryJiaoYiXinXiPagination.do?bianHao=&gongChengLeiBie=&gongChengType=&gongShiType=10&page=1&title=&type=10&rows=";
    IList list = new List<InviteInfo>();
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(urlList + this.MaxCount);
    }
    catch
    {
        return null;
    }

    if (string.IsNullOrEmpty(html))
    {
        return list;
    }

    // Keep only the outermost {...} of the response; without it there is nothing to deserialize.
    int startIndex = html.IndexOf("{");
    int endIndex = html.LastIndexOf("}");
    if (startIndex < 0 || endIndex < startIndex)
    {
        return list;
    }
    html = html.Substring(startIndex, (endIndex + 1) - startIndex);

    JavaScriptSerializer serializer = new JavaScriptSerializer();
    Dictionary<string, object> smsTypeJson = (Dictionary<string, object>)serializer.DeserializeObject(html);
    object rowsValue;
    if (smsTypeJson == null || !smsTypeJson.TryGetValue("rows", out rowsValue))
    {
        return list;
    }
    object[] objvalues = rowsValue as object[];
    if (objvalues == null)
    {
        return list;
    }

    foreach (object objValue in objvalues)
    {
        Dictionary<string, object> dic = (Dictionary<string, object>)objValue;
        string endDate = string.Empty;
        string remark = string.Empty;
        string otherType = string.Empty;
        string HtmlTxt = string.Empty;

        string code = Convert.ToString(dic["bianHao"]);
        string prjName = Convert.ToString(dic["title"]);
        string beginDate = Convert.ToString(dic["faBuStartTimeText"]).GetDateRegex();
        string inviteType = Convert.ToString(dic["gongChengTypeText"]);
        // Entries whose title contains "测试" are skipped.
        if (prjName.Contains("测试"))
        {
            continue;
        }

        string InfoUrl = "http://www.ezztb.gov.cn/jyw/jyw/showGongGao.do?ggGuid=" + dic["yuanXiTongId"];
        try
        {
            // The detail endpoint answers with JSON whose "html" member carries the announcement body.
            HtmlTxt = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
            JavaScriptSerializer detailSerializer = new JavaScriptSerializer();
            Dictionary<string, object> newTypeJson = (Dictionary<string, object>)detailSerializer.DeserializeObject(HtmlTxt);
            HtmlTxt = Convert.ToString(newTypeJson["html"]);
            if (string.IsNullOrWhiteSpace(HtmlTxt))
            {
                // Empty body: fall back to the static view page and actually use what it returns.
                // NOTE(review): the original fetched this page but discarded the result - confirm the page
                // content is suitable as the announcement body.
                string url = "http://www.ezztb.gov.cn/jiaoyixingxi/zbgg_view.html?guid=" + dic["yuanXiTongId"];
                HtmlTxt = this.ToolWebSite.GetHtmlByUrl(url) ?? string.Empty;
            }
        }
        catch (Exception)
        {
            continue;
        }

        string inviteCtx = HtmlTxt.Replace("</span>", "\r\n").ToCtxString();
        string prjAddress = inviteCtx.GetAddressRegex();
        string buildUnit = inviteCtx.GetBuildRegex();
        if (string.IsNullOrEmpty(code))
        {
            code = inviteCtx.GetCodeRegex();
        }
        string msgType = "鄂州市公共资源交易中心";
        string specType = "建设工程";
        if (string.IsNullOrWhiteSpace(inviteType))
        {
            inviteType = prjName.GetInviteBidType();
        }
        buildUnit = buildUnit.Replace(" ", "");

        InviteInfo info = ToolDb.GenInviteInfo("湖北省", "湖北省及地市", "鄂州市", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
        list.Add(info);

        // Collect attachments before the MaxCount check so the last returned record keeps its attachments.
        Parser parser = new Parser(new Lexer(HtmlTxt));
        NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
        if (aNode != null && aNode.Count > 0)
        {
            for (int k = 0; k < aNode.Count; k++)
            {
                ATag a = aNode[k] as ATag;
                if (a == null || !a.IsAtagAttach())
                {
                    continue;
                }

                // Links that do not contain "http" are prefixed with the site root.
                string link = a.Link.ToLower().Contains("http")
                    ? a.Link
                    : "http://www.ezztb.gov.cn/" + a.Link;
                BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                base.AttachList.Add(attach);
            }
        }

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return list;
        }
    }

    return list;
}
/// <summary>
/// Walks the ASP.NET-paged grid at <c>SiteUrl</c>, downloads each linked detail page and builds one
/// <c>InviteInfo</c> per grid row. Two detail layouts are handled: a table with <c>cellSpacing="1"</c>
/// (from which attachment links are also queued on <c>base.AttachList</c>) and a <c>span</c> with id "nr".
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records have been collected.</param>
/// <returns>The collected records; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    string viewState = string.Empty;
    string eventValidation = string.Empty;
    int page = 1;
    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
        Regex regexScript = new Regex(@"<script[^<]*</script>");
        htl = regexScript.Replace(htl, "");
    }
    catch (Exception)
    {
        return list;
    }

    // The pager cell contains "共N页"; keep the default of one page when that text is missing or unparsable.
    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("colSpan", "6")));
    if (nodeList != null && nodeList.Count > 0)
    {
        Regex regexPage = new Regex(@"共\d+页");
        string pageText = regexPage.Match(nodeList.AsString()).Value.Trim(new char[] { '共', '页' });
        int parsedPage;
        if (int.TryParse(pageText, out parsedPage))
        {
            page = parsedPage;
        }
    }

    // Inclusive upper bound: with "i < page" the last page was never visited and a single-page
    // listing produced no records at all.
    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            // Later pages are requested by posting the form back with the state taken from the previous page.
            viewState = this.ToolWebSite.GetAspNetViewState(htl);
            eventValidation = this.ToolWebSite.GetAspNetEventValidation(htl);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "key", "AxGridView1$ctl23$ctl07", "AxGridView1$ctl23$pageList", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION" },
                new string[] { "AxGridView1$ctl23$ctl03", string.Empty, viewState, string.Empty, "20", (i - 1).ToString(), string.Empty, eventValidation });
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "AxGridView1")));
        if (tableNodeList == null || tableNodeList.Count == 0)
        {
            continue;
        }

        TableTag table = (TableTag)tableNodeList[0];
        // The first and the last grid rows are not read as data rows.
        for (int j = 1; j < table.RowCount - 1; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 6)
            {
                continue;
            }
            code = tr.Columns[2].ToPlainTextString().Trim();
            prjName = tr.Columns[3].ToPlainTextString().Trim();

            // Skip rows whose sixth cell has no link instead of failing on the [0] index.
            NodeList rowLinks = tr.Columns[5].SearchFor(typeof(ATag), true);
            ATag aTag = (rowLinks != null && rowLinks.Count > 0) ? rowLinks[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }
            InfoUrl = "http://www.yjgcjy.cn/" + aTag.Link;

            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default).Replace(" ", "");
            }
            catch (Exception)
            {
                Logger.Error("InviteYJYXJS");
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellSpacing", "1")));
            if (dtnode != null && dtnode.Count > 0)
            {
                // Layout 1: the announcement is a table. Its text is rebuilt row by row, skipping row 0.
                HtmlTxt = dtnode.AsHtml();
                TableTag detailTable = (TableTag)dtnode[0];
                for (int k = 1; k < detailTable.RowCount; k++)
                {
                    TableRow trow = detailTable.Rows[k];
                    for (int c = 0; c < trow.ColumnCount; c++)
                    {
                        inviteCtx += trow.Columns[c].ToPlainTextString().Trim();
                    }
                    inviteCtx += "\r\n";
                }

                Regex regPrjAddr = new Regex(@"工程建设地址:[^\r\n]+\r\n");
                try
                {
                    prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程建设地址", "").Replace(":", "").Replace("。", "").Replace("、", "").Replace(";", "").Replace(",", "").Trim();
                    // Empty or over-long addresses (more than 200 bytes) are replaced by a fixed placeholder.
                    if (Encoding.Default.GetByteCount(prjAddress) > 200 || prjAddress == "")
                    {
                        prjAddress = "见招标详细信息";
                    }
                }
                catch (Exception)
                {
                    prjAddress = "见招标详细信息";
                }

                Regex regBegin = new Regex(@"公告发布时间:[^\r\n]+[\r\n]{1}");
                beginDate = regBegin.Match(inviteCtx).Value.Replace("公告发布时间:", "").Trim();
                string date = beginDate.Replace(" ", "").Trim();
                Regex regDate = new Regex(@"\d{4}年\d{1,2}月\d{1,2}日");
                beginDate = regDate.Match(date).Value.Trim();
                if (beginDate == "")
                {
                    // Second attempt against the whole text. The pattern is kept exactly as it was.
                    // NOTE(review): "[u4e00-u9fa5]" has no backslashes, so it is not a \uXXXX range - confirm intent.
                    Regex regDateT = new Regex(@"[u4e00-u9fa5]{4}年[u4e00-u9fa5]{1,2}月[u4e00-u9fa5]{1,2}日");
                    beginDate = regDateT.Match(inviteCtx).Value.Replace("公告发布时间:", "").Trim();
                }

                Regex regBuildUnit = new Regex(@"建设单位:[^\r\n]+[\r\n]{1}");
                buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("建设单位:", "").Trim();

                msgType = "阳江市建设工程交易中心";
                specType = "建设工程";
                inviteType = ToolHtml.GetInviteTypes(prjName);

                // Strip Office XML-namespace remnants and the "leave a message" link text.
                inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = o ns = ", "").Replace("urn:schemas-microsoft-com:office:office", "").Replace("/>", "").Trim();
                inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = ns0 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Trim();
                inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = st1 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Trim();
                inviteCtx = inviteCtx.Replace("xml:namespace prefix = st1", "").Trim();
                inviteCtx = inviteCtx.Replace("点击进入留言", "").Trim();
                code = code.Replace(";", "").Replace(":", "").Trim();

                InviteInfo info = ToolDb.GenInviteInfo("广东省", "阳江市区", "阳西县", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                list.Add(info);

                // Attachments: from row index 10 onward, a row whose first cell carries one of the download
                // labels has its links in the second cell.
                parserdetail.Reset();
                NodeList fileNode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellSpacing", "1")));
                if (fileNode != null && fileNode.Count > 0 && fileNode[0] is TableTag)
                {
                    TableTag fileTable = fileNode[0] as TableTag;
                    for (int f = 10; f < fileTable.RowCount; f++)
                    {
                        TableRow trowFile = fileTable.Rows[f];
                        if (trowFile.ColumnCount < 2)
                        {
                            continue;
                        }

                        string label = trowFile.Columns[0].ToPlainTextString().Trim();
                        bool isDownloadRow = label.Contains("下载招标文件:") || label.Contains("下载工程量清单:") || label.Contains("下载图纸:");
                        if (!isDownloadRow || trowFile.Columns[1].ToPlainTextString().Trim() == "")
                        {
                            continue;
                        }

                        NodeList fileLinks = trowFile.Columns[1].SearchFor(typeof(ATag), true);
                        for (int ii = 0; ii < fileLinks.Count; ii++)
                        {
                            ATag aTagCh = fileLinks[ii] as ATag;
                            if (aTagCh == null)
                            {
                                continue;
                            }

                            string fileName = aTagCh.ToPlainTextString().Trim();
                            // Links that already contain the site root are used as they are.
                            string urlValues = aTagCh.Link.Contains("http://www.yjgcjy.cn")
                                ? aTagCh.Link
                                : "http://www.yjgcjy.cn" + aTagCh.Link;
                            if (fileName != "")
                            {
                                BaseAttach attach = ToolDb.GenBaseAttach(fileName, info.Id, urlValues);
                                base.AttachList.Add(attach);
                            }
                        }
                    }
                }

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
            else
            {
                // Layout 2: the announcement is a span with id "nr"; the project code is left empty here.
                code = "";
                Parser parserdetailtwo = new Parser(new Lexer(htmldetail));
                NodeList dtnodetwo = parserdetailtwo.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "nr")));
                if (dtnodetwo == null || dtnodetwo.Count == 0)
                {
                    continue;
                }

                HtmlTxt = dtnodetwo.AsHtml();
                inviteCtx = dtnodetwo.AsString().Replace("。", "").Trim();
                Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
                // Latin "O" is turned into "〇" before the date is matched and back into "0" further down.
                inviteCtx = regexHtml.Replace(inviteCtx, "").Replace("O", "〇");

                Regex regPrjAddr = new Regex(@"(工程建设地点|工程地点):[^\r\n]+\r\n");
                prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程建设地点", "").Replace("工程地点", "").Replace(":", "").Trim();
                if (prjAddress == "")
                {
                    prjAddress = "见招标详细信息";
                }

                // Pattern kept exactly as it was; the matched text is converted by returnS.
                Regex regDateT = new Regex(@"[^u4e00-u9fa5]{4}年[^u4e00-u9fa5]{1,3}月[^u4e00-u9fa5]{1,3}日");
                beginDate = regDateT.Match(inviteCtx).Value.Trim();
                beginDate = returnS(beginDate);

                Regex regBuildUnit = new Regex(@"发包人:[^\r\n]+[\r\n]{1}");
                buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("发包人:", "").Trim();

                msgType = "阳江市建设工程交易中心";
                specType = "建设工程";
                inviteType = ToolHtml.GetInviteTypes(prjName);

                inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = o ns = ", "").Replace("urn:schemas-microsoft-com:office:office", "").Replace("/>", "").Trim();
                inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = ns0 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Trim();
                inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = st1 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Trim();
                inviteCtx = inviteCtx.Replace("xml:namespace prefix = st1", "").Trim();
                inviteCtx = inviteCtx.Replace("点击进入留言", "").Trim();
                inviteCtx = inviteCtx.Replace("〇", "0");

                InviteInfo info = ToolDb.GenInviteInfo("广东省", "阳江市区", "阳西县", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                list.Add(info);
                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
        }
    }

    // Return what was collected; the original returned null here and so discarded every record
    // whenever the MaxCount early return was not taken.
    return list;
}
/// <summary>
/// Walks the paged project table reached through <c>SiteUrl</c>, downloads each linked detail page and builds
/// one <c>ItemPlan</c> per row. Attachment links found in the detail HTML are queued on <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records have been collected.</param>
/// <returns>The collected records; <c>null</c> when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageInt = 1;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "-1");
    }
    catch
    {
        return null;
    }

    // The page count is the text between "/" and ")" in the m_COUNT cell; it stays at 1 when unparsable.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "m_COUNT")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode[0].ToNodePlainString().GetRegexBegEnd("/", ")");
            pageInt = int.Parse(temp);
        }
        catch
        {
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Later pages are addressed by an offset that grows by 24 per page.
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + ((i - 1) * 24));
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "m_TAB")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            string ItemCode = string.Empty, BuildUnit = string.Empty, BuildNature = string.Empty,
                   PlanInvest = string.Empty, IssuedPlan = string.Empty, InvestSource = string.Empty,
                   MsgUnit = string.Empty, PlanEndDate = string.Empty, ItemCtx = string.Empty;

            TableRow tr = table.Rows[j];
            // Rows without the name and date cells, or without a link in the name cell, are skipped.
            if (tr.ColumnCount < 3)
            {
                continue;
            }
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string ItemName = tr.Columns[1].ToNodePlainString();
            if (ItemName.Contains("..."))
            {
                // The visible text is truncated: take the full name from the link's title attribute.
                // (The original read the attribute but threw the value away.)
                string fullTitle = aTag.GetAttribute("title");
                if (!string.IsNullOrEmpty(fullTitle))
                {
                    ItemName = fullTitle;
                }
            }

            // The list shows a two-digit year, so the century is prepended.
            string PlanDate = "20" + tr.Columns[2].ToPlainTextString().GetDateRegex("yy-MM-dd");
            string InfoUrl = "http://www.scdrc.gov.cn" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            // When the detail page only hosts an m_FRAME iframe, the content is fetched from the "_1.htm" variant.
            parser = new Parser(new Lexer(htmldtl));
            NodeList frameNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("iframe"), new HasAttributeFilter("id", "m_FRAME")));
            if (frameNode != null && frameNode.Count > 0)
            {
                try
                {
                    InfoUrl = "http://www.scdrc.gov.cn" + aTag.Link.GetReplace(".htm", "_1.htm");
                    htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
                }
                catch
                {
                    continue;
                }
            }

            // Prefer the m_TEXT cell; fall back to the whole body when it is absent.
            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "m_TEXT")));
            if (dtlNode == null || dtlNode.Count < 1)
            {
                parser.Reset();
                dtlNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
            }
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string CtxHtml = dtlNode.AsHtml();
            parser = new Parser(new Lexer(CtxHtml));
            NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
            if (tableNode != null && tableNode.Count > 0)
            {
                // Flatten the first table into text: odd-positioned cells are followed by ":",
                // even-positioned cells end the line.
                TableTag tag = tableNode[0] as TableTag;
                StringBuilder ctxBuilder = new StringBuilder();
                for (int r = 0; r < tag.RowCount; r++)
                {
                    for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                    {
                        string temp = tag.Rows[r].Columns[c].ToNodePlainString();
                        ctxBuilder.Append(temp.GetReplace(":,:"));
                        ctxBuilder.Append((c + 1) % 2 == 0 ? "\r\n" : ":");
                    }
                }
                ItemCtx = ctxBuilder.ToString();
            }
            else
            {
                ItemCtx = CtxHtml.ToCtxString();
            }

            string ItemContent = ItemCtx.GetRegex("内容", true, 1000);
            string ApprovalUnit = ItemCtx.GetRegex("批复单位");
            string ApprovalDate = ItemCtx.GetRegex("批复日期,批复时间");
            string ApprovalCode = ItemCtx.GetRegex("批复文号(备案号)");
            string TotalInvest = ItemCtx.GetRegex("总投资").GetMoney();
            string PlanBeginDate = ItemCtx.GetRegex("开工时间");
            string ItemAddress = ItemCtx.GetRegex("所属地区");
            string PlanType = ItemCtx.GetRegex("项目类型");
            string MsgType = "四川省发展和改革委员会";
            ItemName = ItemName.GetReplace("四川省发展和改革委员会");

            ItemPlan info = ToolDb.GenItemPlan("四川省", "四川省及地市", "", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature, TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate, PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);

            parser = new Parser(new Lexer(CtxHtml));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a == null || !a.IsAtagAttach())
                    {
                        continue;
                    }

                    // Links that do not contain "http" are prefixed with the site's "dir1111" folder.
                    string link = a.Link.ToLower().Contains("http")
                        ? a.Link
                        : "http://www.scdrc.gov.cn/dir1111/" + a.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Walks the paged procurement table that starts at <c>SiteUrl</c>, downloads each linked document and builds
/// one <c>InviteInfo</c> per row. Attachment links found in the document HTML are queued on <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records have been collected.</param>
/// <returns>The collected records; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // The page count is taken from the onclick attribute of the last pager link: the text between the
    // first "(" and the first ",". Any failure falls back to a single page.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "clearfix")), true), new TagNameFilter("a")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode[sNode.Count - 1].GetATag().GetAttribute("onclick").Replace("(", "kdxx").Replace(",", "xxdk");
            pageInt = int.Parse(temp.GetRegexBegEnd("kdxx", "xxdk"));
        }
        catch
        {
            pageInt = 1;
        }
    }

    // Extracts the "id=..." part of a list link; hoisted because it does not change between rows.
    Regex regexLink = new Regex(@"id=[^-]+");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://ps.szzfcg.cn/portal/topicView.do?method=view1&id=2887106&siteId=9&underwayFlag=undefined&tstmp=17%3A40%3A43%20GMT%2B0800&page=" + i, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "fixed")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            string endDate = string.Empty;
            string remark = string.Empty;
            string otherType = string.Empty;

            TableRow tr = table.Rows[j];
            // Rows without both cells, or without a link in the first cell, are skipped instead of
            // aborting the whole crawl with a null dereference.
            if (tr.ColumnCount < 2)
            {
                continue;
            }
            ATag aTag = tr.Columns[0].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string beginDate = tr.Columns[1].ToNodePlainString().GetDateRegex("yyyy/MM/dd");
            string prjName = aTag.GetAttribute("title");
            string id = regexLink.Match(aTag.Link).Value;
            string InfoUrl = "http://ps.szzfcg.cn/portal/documentView.do?method=view&" + id;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            string inviteCtx = HtmlTxt.ToCtxString();
            string prjAddress = inviteCtx.GetAddressRegex();
            string buildUnit = inviteCtx.GetBuildRegex();
            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            string msgType = "深圳市坪山新区公共资源交易中心";
            string specType = "政府采购";
            string inviteType = "服务";

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳政府采购", "坪山新区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aTagNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aTagNode != null && aTagNode.Count > 0)
            {
                for (int k = 0; k < aTagNode.Count; k++)
                {
                    ATag aFile = aTagNode[k].GetATag();
                    if (aFile == null)
                    {
                        continue;
                    }

                    // Besides recognised attachment links, any link whose URL contains "down" is kept.
                    string lowerLink = aFile.Link.ToLower();
                    if (!aFile.IsAtagAttach() && !lowerLink.Contains("down"))
                    {
                        continue;
                    }

                    // Links that do not contain "http" (in any letter case) are prefixed with the site root.
                    string link = lowerLink.Contains("http")
                        ? aFile.Link
                        : "http://ps.szzfcg.cn/" + aFile.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(aFile.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Walks the ASP.NET-paged grid at <c>SiteUrl</c>, downloads each linked detail page and builds one
/// <c>InviteInfo</c> per row. Attachment links found in the detail HTML are queued on <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records have been collected.</param>
/// <returns>The collected records; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html = string.Empty;
    string viewState = string.Empty;
    string cookiestr = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookiestr);
    }
    catch
    {
        return list;
    }

    // The page count is the text between "总页数" and "当前页"; it stays at 1 when that cannot be parsed.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("nowrap", "true")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("总页数", "当前页").Replace(":", "");
            pageInt = int.Parse(temp);
        }
        catch
        {
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Later pages are requested by posting the pager event back with the previous page's view state.
            viewState = this.ToolWebSite.GetAspNetViewState(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT" },
                new string[] { viewState, "MoreInfoList1$Pager", i.ToString() });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            string endDate = string.Empty;
            string remark = string.Empty;
            string otherType = string.Empty;
            string area = string.Empty;

            TableRow tr = table.Rows[j];
            // Rows without the title and date cells, or without a link in the title cell, are skipped
            // instead of aborting the whole crawl with a null dereference.
            if (tr.ColumnCount < 3)
            {
                continue;
            }
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string prjName = aTag.GetAttribute("title");
            string beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://www.hbggzy.cn" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            string inviteCtx = HtmlTxt.ToCtxString();
            string prjAddress = inviteCtx.GetAddressRegex();
            string buildUnit = inviteCtx.GetBuildRegex();

            // Keep the builder name up to and including "公司".
            if (buildUnit.Contains("公司"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
            }
            // "地址" is the label of the following field, so everything from it onward is dropped.
            // (The original re-appended "地址", leaving the label stuck to the end of the name.)
            if (buildUnit.Contains("地址"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
            }

            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            string msgType = "湖北省公共资源交易中心";
            string specType = "政府采购";
            string inviteType = "水利工程";
            buildUnit = buildUnit.Replace(" ", "");

            InviteInfo info = ToolDb.GenInviteInfo("湖北省", "湖北省及地市", area, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a == null || !a.IsAtagAttach())
                    {
                        continue;
                    }

                    // Links that do not contain "http" are prefixed with the site root.
                    string link = a.Link.ToLower().Contains("http")
                        ? a.Link
                        : "http://www.hbggzy.cn/" + a.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Posts the home-page form to obtain a session cookie, then walks the ASP.NET-paged grid at <c>SiteUrl</c>,
/// downloads each linked detail page and saves one <c>BidSituation</c> per row through
/// <c>ToolDb.SaveEntity</c>. For newly saved records the attachment links in the detail HTML are downloaded
/// and saved as well.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>MaxCount</c> detail pages have been processed.</param>
/// <returns>
/// The list created at the start of the method. Records are saved directly and are never added to it,
/// so it is returned empty.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidSituation>();
    string html = string.Empty;
    string cookiestr = string.Empty;
    string viewState = string.Empty;
    string eventValidation = string.Empty;
    int pageInt = 1, sqlCount = 0;
    try
    {
        // Step 1: load the home page and post its form back with the fixed field values below.
        html = this.ToolWebSite.GetHtmlByUrl("http://www.szjsjy.com.cn/HomePage.aspx", Encoding.UTF8, ref cookiestr);
        viewState = this.ToolWebSite.GetAspNetViewState(html);
        eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
        NameValueCollection n = this.ToolWebSite.GetNameValueCollection(
            new string[] { "__VIEWSTATE", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION", "TextBox1", "ddl", "DDL_Govt", "DDL_Trade", "txtText", "hdnSN", "ImageButton2.x", "ImageButton2.y" },
            new string[] { viewState, "", eventValidation, "请输入关键字", "0", "0", "0", "CN=年度施工投标人7,OU=1007,L=深圳市,ST=广东省,C=CN", "241EDFC1BA276AA7", "19", "13" });

        // Step 2: strip cookie attributes from the returned cookie and reuse it for all later requests.
        string tempCookie = string.Empty;
        html = this.ToolWebSite.GetHtmlByUrl("http://www.szjsjy.com.cn/HomePage.aspx", n, Encoding.UTF8, ref tempCookie);
        cookiestr = tempCookie.Replace("path=/;", "").Replace("HttpOnly,", "").Replace("HttpOnly", "").Replace(" ", "");

        // Step 3: load the first list page.
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        return list;
    }

    // The page count is the text between ",共" and "页" in the grid's last row; it stays at 1 when unparsable.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_Content_GridView1")));
    if (pageNode != null && pageNode.Count > 0)
    {
        TableTag pagerTable = pageNode[0] as TableTag;
        try
        {
            string temp = pagerTable.Rows[pagerTable.RowCount - 1].ToNodePlainString().GetRegexBegEnd(",共", "页");
            pageInt = int.Parse(temp);
        }
        catch
        {
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Later pages are requested by posting the grid's paging event with the previous page's state.
            viewState = this.ToolWebSite.GetAspNetViewState(html);
            eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION", "ctl00$Content$drpSearchType", "ctl00$Content$txtQymc", "ctl00$Content$hdnOperate", "ctl00$hdnPageCount" },
                new string[] { "ctl00$Content$GridView1", "Page$" + i, viewState, "", eventValidation, "0", "", "", pageInt.ToString() });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_Content_GridView1")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        // The first and the last grid rows are not read as data rows.
        for (int j = 1; j < table.RowCount - 1; j++)
        {
            TableRow tr = table.Rows[j];
            string code = tr.Columns[1].ToNodePlainString();
            string prjName = tr.Columns[2].ToNodePlainString();
            string PublicityEndDate = tr.Columns[3].ToPlainTextString();
            // The begin date is set to the time of the crawl.
            string beginDate = DateTime.Now.ToString();
            string InfoUrl = "http://www.szjsjy.com.cn/BusinessInfo/" + tr.Columns[4].GetATagHref();

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8, ref cookiestr).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "ContentContainer")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            string ctx = HtmlTxt.ToCtxString();
            string msgType = "深圳市建设工程交易中心";
            BidSituation info = ToolDb.GetBidSituation("广东省", "深圳市工程", "", code, prjName, PublicityEndDate, msgType, InfoUrl, ctx, HtmlTxt, beginDate);

            // The counter is checked before saving, exactly as before.
            sqlCount++;
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return list;
            }

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx))
            {
                continue;
            }

            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode == null || aNode.Count == 0)
            {
                continue;
            }

            for (int d = 0; d < aNode.Count; d++)
            {
                // Visit every anchor in turn; the original always read aNode[0], so only the first
                // link was ever considered (once per anchor).
                ATag aTag = aNode[d] as ATag;
                if (aTag == null || !aTag.IsAtagAttach())
                {
                    continue;
                }

                string url = "http://www.szjsjy.com.cn/" + aTag.Link.Replace("../", "");
                BaseAttach attach = null;
                try
                {
                    attach = ToolHtml.GetBaseAttach(url, aTag.LinkText, info.Id, "SiteManage\\Files\\Attach\\");
                    if (attach == null)
                    {
                        // One retry when the first attempt yields nothing.
                        attach = ToolHtml.GetBaseAttach(url, aTag.LinkText, info.Id, "SiteManage\\Files\\Attach\\");
                    }
                }
                catch
                {
                    // Best effort: a failed attachment download must not stop the crawl.
                }

                if (attach != null)
                {
                    ToolDb.SaveEntity(attach, string.Empty);
                }
            }
        }
    }

    return list;
}
/// <summary>
/// Parses one listing page, downloads the detail page of every listed notice and appends
/// one <c>InviteInfo</c> record per row to <paramref name="list"/>. Download links found
/// on the detail page are queued on <c>AttachList</c>.
/// </summary>
/// <param name="list">Result list that receives the generated records.</param>
/// <param name="html">HTML of the listing page; rows are read from the table with class <c>lefttable</c>.</param>
/// <param name="crawlAll">When false, processing stops as soon as <paramref name="list"/> holds <c>MaxCount</c> items.</param>
public void DealHtml(IList list, string html, bool crawlAll)
{
    Parser parserDtl = new Parser(new Lexer(html));
    NodeList aNodes = parserDtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
    if (aNodes == null || aNodes.Count == 0)
    {
        return;
    }

    TableTag table = aNodes[0] as TableTag;
    if (table == null)
    {
        return;
    }

    // The patterns do not depend on the row, so they are built once per call.
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
    Regex regexTime = new Regex(@"时间:\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}");
    Regex regexCode = new Regex("(工程编号|项目编号|招标编号):[^\r\n]+[\r\n]{1}");
    Regex regexBuildUnit = new Regex("(招标人|建设单位|招标采购单位):[^\r\n]+[\r\n]{1}");
    Regex regexAddress = new Regex("(建设地点|项目地点|工程地点):[^\r\n]+[\r\n]{1}");
    Regex regexComment = new Regex("<!--[^<]+-->");
    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");

    // The first and the last table row are not treated as data rows (loop runs 1 .. RowCount - 2).
    for (int t = 1; t < table.RowCount - 1; t++)
    {
        string code = string.Empty, buildUnit = string.Empty, prjAddress = string.Empty,
               ctx = string.Empty, HtmlTxt = string.Empty, remark = string.Empty;

        TableRow tr = table.Rows[t] as TableRow;

        // A row without a link cannot be followed; skip it instead of failing on index 0.
        NodeList rowLinks = tr.SearchFor(typeof(ATag), true);
        ATag aTag = (rowLinks != null && rowLinks.Count > 0) ? rowLinks[0] as ATag : null;
        if (aTag == null)
        {
            continue;
        }

        string InfoUrl = aTag.Link;
        string prjName = tr.Columns[1].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
        string endDate = tr.Columns[2].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

        string htmlDtl;
        try
        {
            htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
        }
        catch (Exception)
        {
            // Detail page not reachable: skip this notice, keep crawling the rest.
            continue;
        }

        // Strip <script> blocks and <?xml .../> declarations before parsing.
        htmlDtl = regexHtml.Replace(htmlDtl, "");

        Parser parserCtx = new Parser(new Lexer(htmlDtl));
        NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "printTb lefttable")));
        if (ctxNode != null && ctxNode.Count > 0)
        {
            Parser parserdiv = new Parser(new Lexer(htmlDtl));
            NodeList aNodesdiv = parserdiv.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "biuuu_button")));

            // Remove the button div from the stored HTML. string.Replace throws when the
            // old value is empty, so the removal only runs when the div was actually found.
            HtmlTxt = ctxNode.AsHtml();
            string buttonHtml = aNodesdiv != null ? aNodesdiv.AsHtml() : string.Empty;
            if (!string.IsNullOrEmpty(buttonHtml))
            {
                HtmlTxt = HtmlTxt.Replace(buttonHtml, "");
            }
            HtmlTxt = HtmlTxt.Trim();

            TableTag tabTag = ctxNode[0] as TableTag;
            string tabText = tabTag.ToPlainTextString();

            string startTime = tabTag.Rows[1].Columns[0].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
            string noticeTime = regexTime.Match(startTime).Value.Replace("时间:", "").Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

            // For each labelled field keep the text after the first ':' of the matched line.
            string codeLine = regexCode.Match(tabText).Value;
            code = codeLine.Substring(codeLine.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

            string buildUnitLine = regexBuildUnit.Match(tabText).Value;
            buildUnit = buildUnitLine.Substring(buildUnitLine.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

            string addressLine = regexAddress.Match(tabText).Value;
            prjAddress = addressLine.Substring(addressLine.IndexOf(":") + 1).Trim();

            ctx = tabTag.Rows[2].Columns[0].ToPlainTextString().Replace(" ", " ").Replace("\r\n\r\n\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            if (ctx.Length > 0)
            {
                ctx = regexComment.Replace(ctx, "");
            }

            // Field size limits are expressed in bytes of the default encoding.
            if (Encoding.Default.GetByteCount(code) > 50)
            {
                code = "";
            }
            if (Encoding.Default.GetByteCount(buildUnit) > 150)
            {
                // The limit is in bytes but Substring counts characters: a multi-byte string
                // can exceed 150 bytes with fewer than 150 characters, which used to throw.
                // Cap the character count first, then trim until the byte limit is met.
                if (buildUnit.Length > 150)
                {
                    buildUnit = buildUnit.Substring(0, 150);
                }
                while (buildUnit.Length > 0 && Encoding.Default.GetByteCount(buildUnit) > 150)
                {
                    buildUnit = buildUnit.Substring(0, buildUnit.Length - 1);
                }
            }
            if (Encoding.Default.GetByteCount(prjAddress) > 200)
            {
                prjAddress = "见招标公告内容";
            }

            // Drop an end date that lies before the notice time. An unparsable end date
            // stays at DateTime.MinValue and is therefore dropped as well; an unparsable
            // notice time leaves both values at MinValue and keeps the end date.
            if (noticeTime.Length > 0 && endDate.Length > 0)
            {
                DateTime begin;
                DateTime end = new DateTime();
                if (DateTime.TryParse(noticeTime, out begin))
                {
                    DateTime.TryParse(endDate, out end);
                }
                if (begin > end)
                {
                    endDate = string.Empty;
                }
            }
        }

        // The publication date is taken from the "toptd_bai" cell of the detail page.
        parserCtx.Reset();
        ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "toptd_bai")));
        string beginDate = regDate.Match(ctxNode.AsString()).Value.Trim();

        string inviteType = ToolHtml.GetInviteTypes(prjName);
        InviteInfo info = ToolDb.GenInviteInfo("广东省", "惠州市区", "惠东县", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, ctx, remark, "惠州市建设工程交易中心", inviteType, "建设工程", string.Empty, InfoUrl, HtmlTxt);
        list.Add(info);

        // The parser was consumed by the previous extraction; rewind it (as is done before
        // the "toptd_bai" lookup above) so the attachment links can actually be found.
        parserCtx.Reset();
        ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank")));
        NodeList aTagNodes = ctxNode.SearchFor(typeof(ATag), true);
        for (int a = 0; a < aTagNodes.Count; a++)
        {
            ATag fileTage = aTagNodes[a] as ATag;
            if (fileTage != null && fileTage.Link != null && fileTage.Link.Contains("http://www.ebc.huizhou.gov.cn/index/loadNewsFile"))
            {
                string downloadURL = fileTage.Link;
                BaseAttach attach = ToolDb.GenBaseAttach(fileTage.ToPlainTextString(), info.Id, downloadURL);
                base.AttachList.Add(attach);
            }
        }

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return;
        }
    }
}
/// <summary>
/// Walks every page of the listing at <c>SiteUrl</c>, opens each linked detail page and
/// turns its <c>div#content</c> block into an <c>ItemPlan</c> record. Attachment links
/// found inside that block are queued on <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl ends as soon as <c>MaxCount</c> records were collected.</param>
/// <returns>The collected records; an empty list when the first listing page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList plans = new List<ItemPlan>();

    string listingHtml;
    try
    {
        listingHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return plans;
    }

    // Page count comes from the pager div; it stays at 1 whenever it cannot be read.
    int totalPages = 1;
    Parser htmlParser = new Parser(new Lexer(listingHtml));
    NodeList pager = htmlParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "page")));
    if (pager != null && pager.Count > 0)
    {
        try
        {
            string pagerText = pager.AsString().GetRegexBegEnd("/", "页").GetReplace("(");
            totalPages = int.Parse(pagerText);
        }
        catch
        {
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        // Page 1 was already downloaded above; later pages live under "/p/{n}.html".
        if (pageNo > 1)
        {
            try
            {
                listingHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "/p/" + pageNo + ".html");
            }
            catch
            {
                continue;
            }
        }

        htmlParser = new Parser(new Lexer(listingHtml));
        NodeList entries = htmlParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "list")), true), new TagNameFilter("li")));
        if (entries == null || entries.Count == 0)
        {
            continue;
        }

        for (int idx = 0; idx < entries.Count; idx++)
        {
            INode entry = entries[idx];
            ATag titleLink = entry.GetATag();
            if (titleLink == null)
            {
                continue;
            }

            string itemName = titleLink.GetAttribute("title").GetReplace("甘肃省发展和改革委员会");
            string planDate = entry.ToPlainTextString().GetDateRegex();
            string infoUrl = "http://www.gsdrc.gov.cn" + titleLink.Link;

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            htmlParser = new Parser(new Lexer(detailHtml));
            NodeList content = htmlParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "content")));
            if (content == null || content.Count == 0)
            {
                continue;
            }

            string ctxHtml = content.AsHtml();
            string itemCtx = ctxHtml.ToCtxString();
            string totalInvest = itemCtx.GetRegexBegEnd("总投资", "万元");
            string itemCode = itemCtx.GetRegex("项目编码");

            // Fields this crawler does not extract are passed as empty strings.
            ItemPlan info = ToolDb.GenItemPlan(
                "甘肃省",
                "甘肃省及地市",
                "",
                itemCode,
                itemName,
                string.Empty,               // ItemAddress
                string.Empty,               // BuildUnit
                string.Empty,               // BuildNature
                totalInvest,
                string.Empty,               // PlanInvest
                string.Empty,               // IssuedPlan
                string.Empty,               // InvestSource
                string.Empty,               // ApprovalUnit
                string.Empty,               // ApprovalDate
                string.Empty,               // ApprovalCode
                string.Empty,               // MsgUnit
                planDate,
                "项目审批与核准",
                string.Empty,               // PlanBeginDate
                string.Empty,               // PlanEndDate
                ctxHtml,
                itemCtx,
                string.Empty,               // ItemContent
                "甘肃省发展和改革委员会",
                infoUrl);
            plans.Add(info);

            // Collect attachment links from the content block.
            htmlParser = new Parser(new Lexer(ctxHtml));
            NodeList anchors = htmlParser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            int anchorCount = (anchors != null) ? anchors.Count : 0;
            for (int k = 0; k < anchorCount; k++)
            {
                ATag anchor = anchors[k] as ATag;
                if (!anchor.IsAtagAttach())
                {
                    continue;
                }

                // Links that already contain "http" are used as-is; others are rooted at the site.
                string link = anchor.Link.ToLower().Contains("http")
                    ? anchor.Link
                    : "http://www.gsdrc.gov.cn/" + anchor.Link.GetReplace("../,./");
                base.AttachList.Add(ToolDb.GenBaseAttach(anchor.LinkText, info.Id, link));
            }

            if (!crawlAll && plans.Count >= this.MaxCount)
            {
                return plans;
            }
        }
    }

    return plans;
}
/// <summary>
/// Crawls the paged JSON listing served by
/// <c>https://www.dgzb.com.cn/ggzy/website/WebPagesManagement/findListByPage</c>, downloads the
/// detail page of every entry and converts it into an <c>InviteInfo</c> record. Attachment links
/// inside the detail block are queued on <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops as soon as <c>MaxCount</c> records were collected.</param>
/// <returns>
/// The collected records. The list is returned in every case; the previous implementation
/// returned <c>null</c> after a complete run and thereby discarded everything it had gathered.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8, ref cookiestr);
    }
    catch (Exception)
    {
        return list;
    }

    // The landing page is only used to read the total page count out of its inline script.
    Parser parser = new Parser(new Lexer(htl));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("script"), new HasAttributeFilter("type", "text/javascript")));
    if (pageNode != null && pageNode.Count > 0)
    {
        // "('" and "')" are swapped for sentinel strings before the value between them is extracted.
        string b = pageNode.AsString().GetCtxBr();
        string c = b.Replace("('", "徐鑫").Replace("')", "凯德");
        try
        {
            string temp = c.GetRegexBegEnd("徐鑫", "凯德");
            page = int.Parse(temp);
        }
        catch
        {
            // Page count not readable: fall back to a single page.
        }
    }

    JavaScriptSerializer serializer = new JavaScriptSerializer();
    Regex regoType = new Regex(@"工程类型(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= page; i++)
    {
        // Every page, including the first, is requested from the JSON endpoint.
        NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
            new string[] { "fcInfotitle", "currentPage" },
            new string[] { "", i.ToString() });
        try
        {
            htl = this.ToolWebSite.GetHtmlByUrl("https://www.dgzb.com.cn/ggzy/website/WebPagesManagement/findListByPage?fcInfotype=1&tenderkind=A&projecttendersite=SS&orderFiled=fcInfoenddate&orderValue=desc", nvc, Encoding.UTF8);
        }
        catch
        {
            continue;
        }

        // A response that is not a JSON object is handled like a failed download of this page.
        Dictionary<string, object> smsTypeJson;
        try
        {
            smsTypeJson = serializer.DeserializeObject(htl) as Dictionary<string, object>;
        }
        catch (ArgumentException)
        {
            continue;
        }
        catch (InvalidOperationException)
        {
            continue;
        }
        if (smsTypeJson == null)
        {
            continue;
        }

        foreach (KeyValuePair<string, object> obj in smsTypeJson)
        {
            // Only array-valued members carry entries; scalar members are skipped.
            object[] array = obj.Value as object[];
            if (array == null)
            {
                continue;
            }

            foreach (object arrValue in array)
            {
                Dictionary<string, object> dic = arrValue as Dictionary<string, object>;
                if (dic == null)
                {
                    continue;
                }

                string endDate = string.Empty, remark = string.Empty;
                string code = Convert.ToString(dic["fcTendersn"]);
                string prjName = Convert.ToString(dic["fcInfotitle"]);
                string beginDate = Convert.ToString(dic["fcInfostartdate"]).GetDateRegex("yyyy-MM-dd");
                string xu = Convert.ToString(dic["id"]);
                string InfoUrl = "https://www.dgzb.com.cn/ggzy/website/WebPagesManagement/jsdetail?publishId=" + xu + "&fcInfotype=1";

                string htmldetail;
                try
                {
                    htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8).Replace(" ", "");
                }
                catch (Exception)
                {
                    continue;
                }

                Parser parserdetail = new Parser(new Lexer(htmldetail));
                NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "detail")));
                if (dtnode == null || dtnode.Count == 0)
                {
                    continue;
                }

                string HtmlTxt = dtnode.AsHtml();
                string inviteCtx = HtmlTxt.Replace("</p>", "\r\n").ToCtxString();
                string prjAddress = inviteCtx.GetRegexBegEnd("工程地址:", "\r");
                string buildUnit = inviteCtx.GetRegexBegEnd("建设单位:", "\r");
                string msgType = "东莞市建设工程交易中心";
                string specType = "建设工程";
                string otherType = regoType.Match(inviteCtx).Value.Replace("工程类型:", "").Trim();
                inviteCtx = inviteCtx.Replace("ctl00_cph_context_span_MetContent", "").Replace("<span id=", "").Replace("</span>", "").Replace(">", "").Trim();
                if (string.IsNullOrEmpty(buildUnit))
                {
                    buildUnit = "见招标信息";
                }
                string inviteType = ToolHtml.GetInviteTypes(prjName);

                InviteInfo info = ToolDb.GenInviteInfo("广东省", "东莞市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                list.Add(info);

                // Attachment search inside the detail block.
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList aTagNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aTagNode != null && aTagNode.Count > 0)
                {
                    for (int k = 0; k < aTagNode.Count; k++)
                    {
                        ATag aTag = aTagNode[k].GetATag();
                        if (aTag == null || !aTag.IsAtagAttach())
                        {
                            continue;
                        }

                        string linkurl = aTag.Link;
                        // NOTE(review): as written this replaces "&" with "&" and changes nothing;
                        // presumably it was meant to decode "&amp;" - confirm against the site.
                        linkurl = linkurl.Replace("&", "&");

                        // Drop the part GetRegexBegEnd finds between "&" and "id", when there is one.
                        string aa = linkurl.GetRegexBegEnd("&", "id");
                        string cc = string.IsNullOrEmpty(aa) ? linkurl : linkurl.Replace(aa, "");

                        BaseAttach attach = ToolDb.GenBaseAttach(aTag.LinkText, info.Id, cc);
                        base.AttachList.Add(attach);
                    }
                }

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
        }

        // The former ASP.NET GridView based implementation that was kept here as
        // commented-out code has been removed.
    }

    return list;
}
/// <summary>
/// Crawls the paged listing table at <c>SiteUrl</c>, follows the link in each row and builds an
/// <c>InviteInfo</c> record from the detail page's <c>table#Table2</c>. Attachment links inside
/// that table are queued on <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl ends as soon as <c>MaxCount</c> records were collected.</param>
/// <returns>The collected records; an empty list when the first listing page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList invites = new List<InviteInfo>();

    string listingHtml;
    try
    {
        listingHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return invites;
    }

    // The page count is read from the href of the second-to-last pager link; default is 1.
    int totalPages = 1;
    Parser htmlParser = new Parser(new Lexer(listingHtml));
    NodeList pagerLinks = htmlParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "mvcPager")), true), new TagNameFilter("a")));
    if (pagerLinks != null && pagerLinks.Count > 0)
    {
        try
        {
            string lastPage = pagerLinks[pagerLinks.Count - 2].GetATagHref().GetReplace("/Front/Zbgg/System.Web.Mvc.UrlParameter/");
            totalPages = int.Parse(lastPage);
        }
        catch
        {
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        // Page 1 is already loaded; later pages are addressed by appending the page number.
        if (pageNo > 1)
        {
            try
            {
                listingHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "/System.Web.Mvc.UrlParameter/" + pageNo);
            }
            catch
            {
                continue;
            }
        }

        htmlParser = new Parser(new Lexer(listingHtml));
        NodeList tables = htmlParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "div_Li1")), true), new TagNameFilter("table")));
        if (tables == null || tables.Count == 0)
        {
            continue;
        }

        TableTag grid = tables[0] as TableTag;

        // Row 0 is not processed (loop starts at index 1).
        for (int rowNo = 1; rowNo < grid.RowCount; rowNo++)
        {
            TableRow row = grid.Rows[rowNo];
            ATag titleLink = row.Columns[0].GetATag();
            string prjName = titleLink.GetAttribute("title");
            string buildUnit = row.Columns[1].GetATagValue("title");
            string beginDate = row.Columns[3].ToPlainTextString().GetDateRegex();
            string infoUrl = "http://www.fjjsjy.com" + titleLink.Link;

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            htmlParser = new Parser(new Lexer(detailHtml));
            NodeList detail = htmlParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "Table2")));
            if (detail == null || detail.Count == 0)
            {
                continue;
            }

            string htmlTxt = detail.AsHtml();
            string inviteCtx = htmlTxt.GetReplace("<br/>,<br />,<br>", "\r\n").ToCtxString();
            string prjAddress = inviteCtx.GetAddressRegex().GetCodeDel();
            string code = inviteCtx.GetCodeRegex().GetCodeDel();

            // Fields this crawler does not extract are passed as empty strings.
            InviteInfo info = ToolDb.GenInviteInfo(
                "福建省",
                "福建省及地市",
                "",
                string.Empty,
                code,
                prjName,
                prjAddress,
                buildUnit,
                beginDate,
                string.Empty,               // endDate
                inviteCtx,
                string.Empty,               // remark
                "福建省建设工程交易中心",   // msgType
                "建设工程",                 // inviteType
                "建设工程",                 // specType
                string.Empty,               // otherType
                infoUrl,
                htmlTxt);
            invites.Add(info);

            // Collect attachment links from the detail table.
            htmlParser = new Parser(new Lexer(htmlTxt));
            NodeList anchors = htmlParser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            int anchorCount = (anchors != null) ? anchors.Count : 0;
            for (int k = 0; k < anchorCount; k++)
            {
                ATag anchor = anchors[k] as ATag;
                if (!anchor.IsAtagAttach())
                {
                    continue;
                }

                // Links that already contain "http" are used as-is; others are rooted at the site.
                string link = anchor.Link.ToLower().Contains("http")
                    ? anchor.Link
                    : "http://www.fjjsjy.com/" + anchor.Link.GetReplace("../,./");
                base.AttachList.Add(ToolDb.GenBaseAttach(anchor.LinkText, info.Id, link));
            }

            if (!crawlAll && invites.Count >= this.MaxCount)
            {
                return invites;
            }
        }
    }

    return invites;
}
/// <summary>
/// Crawls the ASP.NET GridView listing at <c>SiteUrl</c> page by page (paging is done by posting
/// the view state back), downloads the detail page of each row and builds a <c>BidInfo</c> record.
/// Links found in the <c>tr#trFujian</c> row of the detail page are queued on <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops as soon as <c>MaxCount</c> records were collected.</param>
/// <returns>The collected records; an empty list when the first listing page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    string viewState = string.Empty;
    string eventValidation = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // Read the page count from the pager table; it stays at 1 when it cannot be parsed.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("cellspacing", "2"), new TagNameFilter("table")));
    string pageString = (sNode != null ? sNode.AsString() : null) ?? string.Empty;
    Regex regexPage = new Regex(@",共[^页]+页,");
    Match pageMatch = regexPage.Match(pageString);
    int parsedPages;
    if (int.TryParse(pageMatch.Value.Replace(",共", "").Replace("页,", "").Trim(), out parsedPages))
    {
        pageInt = parsedPages;
    }

    Regex regprjMgr = new Regex(@"(项目经理|项目负责人|项目总监|建造师|监理师|项目经理姓名)(:|:)[^\s]+[\s]{1}");
    Regex regInvType = new Regex(@"[^\r\n]+[\r\n]{1}");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Later pages are requested by posting the GridView paging event with the
            // view state taken from the most recently loaded page.
            viewState = this.ToolWebSite.GetAspNetViewState(html);
            eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION", "ctl00$hdnPageCount" },
                new string[] { "ctl00$Content$GridView1", "Page$" + i.ToString(), viewState, "", eventValidation, pageInt.ToString() });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "ctl00_Content_GridView1"), new TagNameFilter("table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // The first and the last table row are not treated as data rows.
        for (int j = 1; j < table.RowCount - 1; j++)
        {
            TableRow tr = table.Rows[j] as TableRow;
            // Columns 1..6 are read below; a shorter row cannot be a data row.
            if (tr == null || tr.ColumnCount < 7)
            {
                continue;
            }

            string code = tr.Columns[1].ToPlainTextString().Trim();
            string prjName = tr.Columns[2].ToPlainTextString().Trim();
            string buildUnit = tr.Columns[4].ToPlainTextString().Trim();
            string bidUnit = tr.Columns[5].ToPlainTextString().Trim();
            string bidMoney = tr.Columns[6].ToPlainTextString().Replace("万元", "").Trim();

            // Column 3 holds "<begin>至<end>"; a cell without the separator yields an empty
            // end date instead of an index-out-of-range failure.
            string[] period = tr.Columns[3].ToPlainTextString().Split('至');
            string beginDate = period[0].Replace("年", "-").Replace("月", "-").Replace("日", " ").Replace("时", "").Trim();
            string endDate = period.Length > 1
                ? period[1].Replace("年", "-").Replace("月", "-").Replace("日", " ").Replace("时", "").Trim()
                : string.Empty;

            // The detail link is expected to be the first child of column 2; skip rows without one.
            NodeList titleChildren = tr.Columns[2].Children;
            ATag aTag = (titleChildren != null && titleChildren.Count > 0) ? titleChildren[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }
            string InfoUrl = "http://www.szjsjy.com.cn/BusinessInfo/" + aTag.Link;

            string HtmlTxt;
            string htmldetail;
            try
            {
                // The page is downloaded once. The stored HTML is taken from the version without
                // line-break substitution, the text content from the version with <br> tags
                // turned into CRLF (previously this required two identical requests).
                string rawDetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace(" ", "");
                Parser dtlparserHTML = new Parser(new Lexer(rawDetail.Trim()));
                NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "lblXXNR"), new TagNameFilter("span")));
                HtmlTxt = dtnodeHTML.AsHtml();
                htmldetail = rawDetail.Replace("</br>", "\r\n").Replace("<br>", "\r\n");
            }
            catch (Exception)
            {
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "lblXXNR"), new TagNameFilter("span")));
            string bidCtx = dtnode.AsString().Replace(" ", "");

            // NOTE(review): the project address used to be extracted here but was never passed
            // to GenBidInfo; that dead computation was removed.
            string msgType = "深圳市建设工程交易中心";
            string specType = "建设工程";

            string prjMgr = regprjMgr.Match(bidCtx).Value.Replace("项目经理姓名", "").Replace("项目经理", "").Replace("项目总监", "").Replace("建造师", "").Replace("项目负责人", "").Replace(":", "").Replace(":", "").Replace("监理师", "").Trim();

            // A bidder found in the detail text takes precedence over the listing column.
            string bidUnitInfo = bidCtx.GetBidRegex();
            if (!string.IsNullOrEmpty(bidUnitInfo))
            {
                bidUnit = bidUnitInfo;
            }

            // The first line of the detail text is used to classify the bid type.
            string InvType = regInvType.Match(bidCtx).Value;
            prjName = ToolDb.GetPrjName(prjName);
            if (!string.IsNullOrEmpty(bidUnit))
            {
                bidUnit = ToolDb.GetBidUnit(bidUnit);
                if (bidUnit.Contains("报价"))
                {
                    bidUnit = bidUnit.Remove(bidUnit.IndexOf("报价"));
                }
            }
            string bidType = ToolHtml.GetInviteTypes(InvType);

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳市工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, string.Empty, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            // Attachments are listed in the "trFujian" row of the detail page.
            dtlparser.Reset();
            NodeList dlNodes = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "trFujian"), new TagNameFilter("tr")));
            if (dlNodes != null && dlNodes.Count > 0)
            {
                TableRow attr = dlNodes[0] as TableRow;
                NodeList fileNodes = attr != null ? attr.SearchFor(typeof(ATag), true) : null;
                if (fileNodes != null && fileNodes.Count > 0)
                {
                    for (int f = 0; f < fileNodes.Count; f++)
                    {
                        ATag fileTag = fileNodes[f] as ATag;
                        if (fileTag != null && !string.IsNullOrEmpty(fileTag.Link))
                        {
                            BaseAttach attach = ToolDb.GenBaseAttach(fileTag.StringText, info.Id, fileTag.Link.Replace("..", "http://www.szjsjy.com.cn"));
                            base.AttachList.Add(attach);
                        }
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged result listing (first page at <c>SiteUrl</c>, later pages at
/// <c>http://www.bidding.csg.cn/zbhxrgs/index_{n}.jhtml</c>), opens every linked detail page and
/// builds a <c>BidInfo</c> record from its <c>div.Center.W1000</c> block. When the running text
/// yields no bidder, the first table of the block is flattened into "label:value" lines and
/// searched instead. Attachment links are queued on <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl ends as soon as <c>MaxCount</c> records were collected.</param>
/// <returns>The collected records; an empty list when the first listing page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList bids = new List<BidInfo>();

    string listingHtml;
    try
    {
        listingHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl);
    }
    catch (Exception)
    {
        return bids;
    }

    // Page count: taken from the pager div, otherwise 10 pages are assumed.
    int totalPages;
    Parser htmlParser = new Parser(new Lexer(listingHtml));
    NodeList pager = htmlParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "Top10 TxtCenter")));
    if (pager == null || pager.Count == 0)
    {
        totalPages = 10;
    }
    else
    {
        string pagerText = pager.AsString().GetRegexBegEnd("/", "页");
        try
        {
            totalPages = Convert.ToInt32(pagerText);
        }
        catch
        {
            totalPages = 10;
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            try
            {
                listingHtml = this.ToolWebSite.GetHtmlByUrl("http://www.bidding.csg.cn/zbhxrgs/index_" + pageNo.ToString() + ".jhtml", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        htmlParser = new Parser(new Lexer(listingHtml));
        NodeList entries = htmlParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "W750 Right")), true), new TagNameFilter("li")));
        if (entries == null || entries.Count == 0)
        {
            continue;
        }

        // The first <li> (index 0) is not processed.
        for (int idx = 1; idx < entries.Count; idx++)
        {
            ATag titleLink = entries[idx].GetATag();
            string prjName = titleLink.LinkText;
            string beginDate = entries[idx].ToPlainTextString().GetDateRegex();
            string infoUrl = "http://www.bidding.csg.cn" + titleLink.Link;

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            htmlParser = new Parser(new Lexer(detailHtml));
            NodeList detail = htmlParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "Center W1000")));
            if (detail == null || detail.Count == 0)
            {
                continue;
            }

            string htmlTxt = detail.AsHtml();
            string bidCtx = htmlTxt.ToCtxString();
            string prjMgr = string.Empty;

            // Prefer the headline of the detail page over the listing's link text.
            htmlParser = new Parser(new Lexer(htmlTxt));
            NodeList headline = htmlParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("h1"), new HasAttributeFilter("class", "TxtCenter Padding10")));
            if (headline != null && headline.Count > 0)
            {
                prjName = headline[0].ToNodePlainString();
            }

            string bidType = prjName.GetInviteBidType();
            string buildUnit = bidCtx.GetBuildRegex();
            string prjAddress = bidCtx.GetAddressRegex();
            string code = bidCtx.GetCodeRegex().GetCodeDel();
            string bidUnit = bidCtx.GetBidRegex();
            string bidMoney = bidCtx.GetMoneyRegex();

            // Successive fallbacks for the winning bidder, each tried only while still blank.
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                bidUnit = bidCtx.GetRegexBegEnd("公开询价确定", "成交单位");
            }
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                bidUnit = bidCtx.GetRegex("成交人,拟定采购单位,成交候选人,第一推荐成交候选人,第一");
            }
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                bidUnit = bidCtx.GetRegexBegEnd("签约单位为", "。");
            }
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                bidUnit = bidCtx.GetRegexBegEnd("第一入围候选人", ",");
            }

            if (string.IsNullOrEmpty(bidUnit))
            {
                htmlParser = new Parser(new Lexer(htmlTxt));
                NodeList tables = htmlParser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (tables != null && tables.Count > 0)
                {
                    TableTag resultTable = tables[0] as TableTag;

                    // Pass 1: each row from index 1 on is read as "column0:column1".
                    // A failing cell leaves whatever was appended before it in place.
                    StringBuilder flat = new StringBuilder();
                    for (int r = 1; r < resultTable.RowCount; r++)
                    {
                        try
                        {
                            flat.Append(resultTable.Rows[r].Columns[0].ToNodePlainString().Replace("单位名称", "中标单位").Replace("中标候选人", "中标单位")).Append(":");
                            flat.Append(resultTable.Rows[r].Columns[1].ToNodePlainString()).Append("\r\n");
                        }
                        catch
                        {
                        }
                    }

                    string ctx = flat.ToString();
                    bidUnit = ctx.GetReplace("中标单位:第一").GetBidRegex();
                    if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
                    {
                        bidMoney = ctx.GetMoneyRegex();
                    }
                    prjMgr = ctx.GetRegex("项目经理姓名及资质证书编号");
                    int slashPos = prjMgr.IndexOf("/");
                    if (slashPos > 0)
                    {
                        prjMgr = prjMgr.Remove(slashPos);
                    }

                    bool stillIncomplete = string.IsNullOrEmpty(bidUnit) || string.IsNullOrEmpty(bidMoney) || bidMoney == "0";
                    if (stillIncomplete)
                    {
                        // Pass 2: every cell is paired with the cell below it in the next row.
                        // An error abandons the rest of the current row only.
                        flat = new StringBuilder();
                        for (int r = 0; r < resultTable.RowCount; r++)
                        {
                            try
                            {
                                for (int col = 0; col < resultTable.Rows[r].ColumnCount; col++)
                                {
                                    flat.Append(resultTable.Rows[r].Columns[col].ToNodePlainString().Replace("单位名称", "中标单位").Replace("中标侯选人", "中标单位")).Append(":");
                                    flat.Append(resultTable.Rows[r + 1].Columns[col].ToNodePlainString()).Append("\r\n");
                                }
                            }
                            catch
                            {
                            }
                        }

                        ctx = flat.ToString();
                        bidUnit = ctx.GetBidRegex();
                        if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
                        {
                            bidMoney = ctx.GetMoneyRegex();
                        }
                        prjMgr = ctx.GetRegex("项目经理姓名及资质证书编号");
                    }
                }
            }

            if (string.IsNullOrEmpty(prjAddress))
            {
                prjAddress = "见中标信息";
            }
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "中国南方电网有限责任公司招标服务中心";
            }
            bidUnit = bidUnit.GetReplace(":");

            // Fields this crawler does not extract are passed as empty strings.
            BidInfo info = ToolDb.GenBidInfo(
                "广东省",
                "电网专项工程",
                "",
                string.Empty,
                code,
                prjName,
                buildUnit,
                beginDate,
                bidUnit,
                beginDate,
                string.Empty,                               // endDate
                bidCtx,
                string.Empty,
                "中国南方电网有限责任公司招标服务中心",     // msgType
                bidType,
                "其他",                                     // specType
                string.Empty,                               // otherType
                bidMoney,
                infoUrl,
                prjMgr,
                htmlTxt);
            bids.Add(info);

            // Collect attachment links from the detail block.
            htmlParser = new Parser(new Lexer(htmlTxt));
            NodeList anchors = htmlParser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            int anchorCount = (anchors != null) ? anchors.Count : 0;
            for (int c = 0; c < anchorCount; c++)
            {
                ATag anchor = anchors[c] as ATag;
                if (!anchor.Link.IsAtagAttach())
                {
                    continue;
                }

                string attachUrl = "http://www.bidding.csg.cn/" + anchor.Link;
                try
                {
                    base.AttachList.Add(ToolDb.GenBaseAttach(anchor.LinkText.Replace(" ", ""), info.Id, attachUrl));
                }
                catch
                {
                }
            }

            if (!crawlAll && bids.Count >= this.MaxCount)
            {
                return bids;
            }
        }
    }

    return bids;
}
/// <summary>
/// Crawls the paged bid-result list at <c>SiteUrl</c>. For every list row it follows the row link,
/// loads the page referenced by the first iframe, and, when that page contains a
/// <c>table.newtalbe_c</c>, builds a <c>BidInfo</c> record plus attachment records for the
/// anchors whose link text is flagged by <c>IsAtagAttach</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the method returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected <c>BidInfo</c> records; an empty list when the first list page cannot be fetched.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 15;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    // Determine the page count from the href of the last pager link (div.scott > a);
    // presumably the number sits between '(' and ')'. Falls back to 15 pages on any failure.
    Parser parser = new Parser(new Lexer(html));
    NodeList aNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "scott"))), new TagNameFilter("a")));
    if (aNodes != null && aNodes.Count > 0)
    {
        try
        {
            string temp = aNodes.GetATagHref(aNodes.Count - 1);
            pageInt = Convert.ToInt32(temp.GetRegexBegEnd("(", ")"));
        }
        catch
        {
            pageInt = 15;
        }
    }

    // Read the list page by page.
    for (int page = 1; page <= pageInt; page++)
    {
        try
        {
            if (page > 1)
            {
                // Later pages are requested by posting the hidden form fields of the current page.
                string typeId = html.GetInputValue("typeId");
                string boardId = html.GetInputValue("boardId");
                string totalRows = html.GetInputValue("totalRows");
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                    new string[] { "typeId", "boardId", "newstitle", "sTime", "eTime", "totalRows", "pageNO" },
                    new string[] { typeId, boardId, string.Empty, string.Empty, string.Empty, totalRows, page.ToString() });
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default);
            }
        }
        catch
        {
            continue;
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // The first and the last row of the list table are skipped.
        for (int j = 1; j < table.RowCount - 1; j++)
        {
            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty,
                   specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjAddress = string.Empty, prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            TableRow tr = table.Rows[j];
            // A row with fewer than three cells used to throw and abort the whole crawl; skip it instead.
            if (tr == null || tr.ColumnCount < 3)
            {
                continue;
            }

            prjName = tr.Columns[1].ToNodePlainString();
            beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
            InfoUrl = tr.GetATagHref();

            string htlDtl = string.Empty;
            try
            {
                // The detail page embeds the real content in an iframe; load the iframe source.
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                parser = new Parser(new Lexer(htlDtl));
                NodeList ifrm = parser.ExtractAllNodesThatMatch(new TagNameFilter("iframe"));
                IFrameTag iframe = ifrm.SearchFor(typeof(IFrameTag), true)[0] as IFrameTag;
                htlDtl = this.ToolWebSite.GetHtmlByUrl(iframe.GetAttribute("src").Replace("/zsweb/..", ""), Encoding.Default);
            }
            catch
            {
                Logger.Error("BidZhongshan");
                continue;
            }

            // Turn header cells into ordinary cells so they show up as table columns.
            // Only the tag names are rewritten: a blanket Replace("th", "td") also corrupted
            // attribute names and URLs ("width" -> "widtd", "http" -> "tdtp") in the stored HTML.
            string normalizedDtl = System.Text.RegularExpressions.Regex.Replace(
                htlDtl,
                @"<(/?)th(?=[\s/>])",
                "<$1td",
                System.Text.RegularExpressions.RegexOptions.IgnoreCase);
            parser = new Parser(new Lexer(normalizedDtl));
            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "newtalbe_c")));
            if (dtlList == null || dtlList.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtlList.AsHtml();
            bidCtx = HtmlTxt.ToCtxString();
            TableTag tab = dtlList[0] as TableTag;
            if (tab == null)
            {
                continue;
            }

            // Flatten the detail table into "label:value" lines: cells at even column
            // indexes are followed by ':', cells at odd column indexes by a line break.
            StringBuilder ctxBuilder = new StringBuilder();
            for (int k = 0; k < tab.RowCount; k++)
            {
                for (int d = 0; d < tab.Rows[k].ColumnCount; d++)
                {
                    ctxBuilder.Append(tab.Rows[k].Columns[d].ToNodePlainString());
                    ctxBuilder.Append(d % 2 == 1 ? "\r\n" : ":");
                }
            }
            string ctx = ctxBuilder.ToString();

            code = htlDtl.ToCtxString().GetCodeRegex().Replace("[", "").Replace("]", "");
            buildUnit = ctx.GetBuildRegex();
            // NOTE(review): prjAddress is extracted but not passed to GenBidInfo below — confirm intent.
            prjAddress = ctx.GetAddressRegex();
            bidUnit = ctx.GetBidRegex();
            bidMoney = ctx.GetMoneyRegex();
            bidType = prjName.GetInviteBidType();
            msgType = "中山市住房和城乡建设局";
            specType = "建设工程";

            BidInfo info = ToolDb.GenBidInfo("广东省", "中山市区", string.Empty, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            // Attachments are collected from the unmodified iframe page.
            parser = new Parser(new Lexer(htlDtl));
            NodeList aList = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aList != null && aList.Count > 0)
            {
                for (int c = 0; c < aList.Count; c++)
                {
                    ATag a = aList[c] as ATag;
                    if (a == null)
                    {
                        continue;
                    }

                    if (a.LinkText.IsAtagAttach())
                    {
                        string alink = a.Link;
                        // NOTE(review): the two ';' replacements are identical as written; one may
                        // have been meant for a different character — confirm against the site.
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText.Replace(" ", "").Replace(";", "").Replace(";", ""), info.Id, alink);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged notice list that starts at <c>SiteUrl</c>. Each list entry's detail page is
/// fetched and classified by its <c>h3</c> title: titles containing "招标公告" or "补充公告"
/// produce an <c>InviteInfo</c>; titles containing "中标结果", "结果公示" or "中标公示" produce a
/// <c>BidInfo</c>; every other entry is skipped. Attachment records are added to
/// <c>AttachList</c> for anchors in the detail body that are flagged by <c>IsAtagAttach</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the method returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// A list mixing <c>InviteInfo</c> and <c>BidInfo</c> records; empty when the first list page
/// cannot be fetched.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string html = string.Empty;
    int pageInt = 1;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // The page count is the text of the second-to-last pagination link; default to one page.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "ecms_pagination")), true), new TagNameFilter("a")));
    if (pageList != null && pageList.Count > 0)
    {
        try
        {
            ATag atag = pageList[pageList.Count - 2] as ATag;
            pageInt = int.Parse(atag.LinkText);
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.shcac.edu.cn:80/html/xxdt/tzgg/" + i.ToString() + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list_main_content")), true), new TagNameFilter("ul")), true), new TagNameFilter("li")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < nodeList.Count; j++)
        {
            // An entry without a link used to raise a NullReferenceException that aborted the crawl.
            ATag aTag = nodeList[j].GetATag();
            if (aTag == null)
            {
                continue;
            }

            // Only the date is taken from the list entry; the title comes from the detail page.
            // (The old code also stripped the date from the list text via Replace(btTime, ""),
            // which throws when no date is found and whose result was never used.)
            string btTime = nodeList[j].ToNodePlainString().GetDateRegex();
            string btUrl = aTag.Link;

            string htldtl = string.Empty;
            try
            {
                htldtl = this.ToolWebSite.GetHtmlByUrl(btUrl, Encoding.UTF8);
                htldtl = htldtl.GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htldtl));
            NodeList dtlBt = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "detail_main_content")), true), new TagNameFilter("h3")));
            if (dtlBt == null || dtlBt.Count == 0)
            {
                continue;
            }

            string btName = dtlBt.AsString();
            bool isInvite = btName.Contains("招标公告") || btName.Contains("补充公告");
            bool isBid = !isInvite && (btName.Contains("中标结果") || btName.Contains("结果公示") || btName.Contains("中标公示"));
            if (!isInvite && !isBid)
            {
                continue;
            }

            // Both record kinds read their body from the same styled div of the detail page.
            parser.Reset();
            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("style", "line-height:22px;")));
            if (dtlList == null || dtlList.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlList.ToHtml();
            string bodyCtx = HtmlTxt.Replace("</p>", "\r\n").ToCtxString().Replace("\r\n\t", "\r\n").Replace("\r\n\r\n", "\r\n");
            string prjName = btName;
            string beginDate = btTime;
            string InfoUrl = btUrl;
            string msgType = "上海民航职业技术学院";
            string specType = "";
            string endDate = string.Empty, otherType = string.Empty;

            if (isInvite)
            {
                string code = string.Empty, remark = string.Empty, inviteType = string.Empty;
                string inviteCtx = bodyCtx;
                string buildUnit = inviteCtx.GetBuildRegex();
                string prjAddress = inviteCtx.GetAddressRegex();

                InviteInfo info = ToolDb.GenInviteInfo("上海市", "上海市区", string.Empty, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                list.Add(info);

                parser = new Parser(new Lexer(HtmlTxt));
                NodeList aNodes = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNodes != null && aNodes.Count > 0)
                {
                    for (int a = 0; a < aNodes.Count; a++)
                    {
                        ATag aFile = aNodes[a] as ATag;
                        if (aFile == null || !aFile.IsAtagAttach())
                        {
                            continue;
                        }

                        string link = ResolveAttachLink(btUrl, aFile.Link);
                        if (string.IsNullOrEmpty(link))
                        {
                            continue;
                        }

                        BaseAttach attach = ToolDb.GenBaseAttach(aFile.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }
            else
            {
                string prjMgr = string.Empty;
                string bidCtx = bodyCtx;

                string bidUnit = bidCtx.GetBidRegex();
                if (string.IsNullOrWhiteSpace(bidUnit))
                {
                    bidUnit = bidCtx.GetRegex("中标人");
                }

                string bidMoney = bidCtx.GetMoneyRegex();

                string buildUnit = bidCtx.GetBuildRegex();
                if (string.IsNullOrWhiteSpace(buildUnit))
                {
                    buildUnit = bidCtx.GetRegex("招标人");
                }

                // A non-blank code is kept only when it ends with '号'.
                string code = bidCtx.GetCodeRegex().GetCodeDel();
                if (!string.IsNullOrWhiteSpace(code) && code[code.Length - 1] != '号')
                {
                    code = "";
                }

                // Cut the winner name right after the first "公司".
                if (bidUnit.Contains("公司"))
                {
                    bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
                }

                string bidType = ToolHtml.GetInviteTypes(prjName);

                BidInfo info = ToolDb.GenBidInfo("上海市", "上海市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                list.Add(info);

                parser = new Parser(new Lexer(HtmlTxt));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int k = 0; k < aNode.Count; k++)
                    {
                        ATag a = aNode[k] as ATag;
                        if (a == null || !a.IsAtagAttach())
                        {
                            continue;
                        }

                        // Relative links used to be stored as an empty URL in this branch.
                        string link = ResolveAttachLink(btUrl, a.Link);
                        if (string.IsNullOrEmpty(link))
                        {
                            continue;
                        }

                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}

/// <summary>
/// Returns an attachment URL for <paramref name="link"/>: links that already contain "http" are
/// returned unchanged, other links are resolved against <paramref name="pageUrl"/>.
/// </summary>
/// <param name="pageUrl">URL of the page the link was found on.</param>
/// <param name="link">Raw href value of the anchor; may be null or empty.</param>
/// <returns>
/// The resolved URL; the raw link when it cannot be resolved; an empty string for a null or
/// empty link.
/// </returns>
private static string ResolveAttachLink(string pageUrl, string link)
{
    if (string.IsNullOrEmpty(link))
    {
        return string.Empty;
    }

    if (link.IndexOf("http", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        return link;
    }

    Uri pageUri;
    Uri resolved;
    if (Uri.TryCreate(pageUrl, UriKind.Absolute, out pageUri) && Uri.TryCreate(pageUri, link, out resolved))
    {
        return resolved.AbsoluteUri;
    }

    return link;
}