/// <summary>
/// Downloads the listing at <c>SiteUrl</c>, follows every list entry to its detail page and
/// builds one <c>BidInfo</c> per detail page. Attachment links found in the detail HTML are
/// added to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the method returns as soon as the result list holds <c>MaxCount</c> items.
/// </param>
/// <returns>
/// The generated <c>BidInfo</c> objects; an empty list if the first listing request throws.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://www.wzgcjsx2.gx.cn/";
    const string moneyUnit = "万元";

    IList records = new List<BidInfo>();

    string listHtml;
    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return records;
    }

    // The page count is read from the pager label; any failure keeps the single-page default.
    int totalPages = 1;
    NodeList pagerLabel = new Parser(new Lexer(listHtml)).ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "PagingControl_lblPage")));
    if (pagerLabel != null && pagerLabel.Count > 0)
    {
        try
        {
            totalPages = int.Parse(pagerLabel[0].ToNodePlainString());
        }
        catch
        {
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            // Later pages are requested by posting back the hidden form fields of the page loaded last.
            string viewState = this.ToolWebSite.GetAspNetViewState(listHtml);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(listHtml);
            string viewStateGenerator = ToolHtml.GetHtmlInputValue(listHtml, "__VIEWSTATEGENERATOR");
            NameValueCollection form = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION", "PagingControl$tbxpidex", "PagingControl$btnGo" },
                new string[] { "", "", viewState, viewStateGenerator, eventValidation, pageNo.ToString(), "go" });
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, form, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        NodeList entries = new Parser(new Lexer(listHtml)).ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "right")), true),
                new TagNameFilter("li")));
        if (entries == null || entries.Count < 1)
        {
            continue;
        }

        for (int entryIdx = 0; entryIdx < entries.Count; entryIdx++)
        {
            INode entry = entries[entryIdx];
            ATag anchor = entry.GetATag();
            if (anchor == null)
            {
                continue;
            }

            string title = anchor.GetAttribute("title").Replace(" ", "");
            string publishDate = entry.ToPlainTextString().GetDateRegex();
            string detailUrl = siteRoot + anchor.Link;

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(detailUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            NodeList content = new Parser(new Lexer(detailHtml)).ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "Center_Introduction")));
            if (content == null || content.Count < 1)
            {
                continue;
            }

            string rawHtml = content.AsHtml();
            string plainText = rawHtml.GetReplace("</p>,</br>", "\r\n").ToCtxString();
            string winner = plainText.GetBidRegex();
            string amount = plainText.GetMoneyRegex(null, false, moneyUnit);
            string manager = plainText.GetMgrRegex();

            // Code and build unit are read from the flattened first table only when the
            // plain text gave no winner AND a table exists; otherwise from the plain text.
            string fieldSource = plainText;
            if (string.IsNullOrWhiteSpace(winner))
            {
                NodeList tables = new Parser(new Lexer(rawHtml)).ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (tables != null && tables.Count > 0)
                {
                    TableTag firstTable = tables[0] as TableTag;
                    string tableText = string.Empty;
                    for (int row = 0; row < firstTable.RowCount; row++)
                    {
                        for (int col = 0; col < firstTable.Rows[row].ColumnCount; col++)
                        {
                            // Even columns end with ":" and odd columns end the line.
                            string cell = firstTable.Rows[row].Columns[col].ToNodePlainString();
                            tableText += cell + (col % 2 == 0 ? ":" : "\r\n");
                        }
                    }

                    winner = tableText.GetBidRegex();
                    if (string.IsNullOrWhiteSpace(amount) || amount == "0")
                    {
                        amount = tableText.GetMoneyRegex(null, false, moneyUnit);
                    }
                    if (string.IsNullOrWhiteSpace(manager))
                    {
                        manager = tableText.GetMgrRegex();
                    }
                    fieldSource = tableText;
                }
            }
            string projectCode = fieldSource.GetCodeRegex().GetCodeDel();
            string owner = fieldSource.GetBuildRegex();

            // Amounts above 10000 are divided by 10000; unparsable amounts are left as they are.
            decimal parsedAmount;
            if (decimal.TryParse(amount, out parsedAmount) && parsedAmount > 10000)
            {
                amount = (parsedAmount / 10000).ToString();
            }

            string source = "梧州市公共资源交易中心";
            string speciality = "建设工程";
            string category = title.GetInviteBidType();
            BidInfo info = ToolDb.GenBidInfo("广西壮族自治区", "广西壮族自治区及地市", "梧州市", string.Empty, projectCode, title, owner, publishDate, winner, publishDate, string.Empty, plainText, string.Empty, source, category, speciality, string.Empty, amount, detailUrl, manager, rawHtml);
            records.Add(info);

            NodeList anchors = new Parser(new Lexer(rawHtml)).ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors != null && anchors.Count > 0)
            {
                for (int k = 0; k < anchors.Count; k++)
                {
                    ATag candidate = anchors[k] as ATag;
                    if (!candidate.IsAtagAttach())
                    {
                        continue;
                    }

                    string href = candidate.Link.ToLower().Contains("http")
                        ? candidate.Link
                        : siteRoot + candidate.Link.GetReplace("../,./");
                    if (Encoding.Default.GetByteCount(href) > 500)
                    {
                        continue;
                    }

                    base.AttachList.Add(ToolDb.GenBaseAttach(candidate.LinkText, info.Id, href));
                }
            }

            if (!crawlAll && records.Count >= this.MaxCount)
            {
                return records;
            }
        }
    }

    return records;
}
/// <summary>
/// Walks the paged listing at <c>SiteUrl</c>, builds a <c>NotifyInfo</c> for every row whose
/// detail page contains a <c>context_div</c> element, and saves it through <c>ToolDb.SaveEntity</c>.
/// For each newly saved entity the images and attachment links in its content are saved as well.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the method returns once <c>MaxCount</c> entities have been submitted for saving.
/// </param>
/// <returns>Always <c>null</c>; results are persisted directly instead of being returned.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int submitted = 0;

    string listHtml;
    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return null;
    }

    // Determine the page count from the text between "(" and ")" of the last pager link.
    int pageCount = 1;
    NodeList pagerLinks = new Parser(new Lexer(listHtml)).ExtractAllNodesThatMatch(
        new AndFilter(
            new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "scott")), true),
            new TagNameFilter("a")));
    if (pagerLinks != null && pagerLinks.Count > 0)
    {
        try
        {
            pageCount = Convert.ToInt32(
                pagerLinks[pagerLinks.Count - 1].GetATagValue()
                    .Replace("(", "kdxx")
                    .Replace(")", "xxdk")
                    .GetRegexBegEnd("kdxx", "xxdk"));
        }
        catch
        {
            pageCount = 1;
        }
    }

    for (int pageNo = 1; pageNo <= pageCount; pageNo++)
    {
        if (pageNo > 1)
        {
            try
            {
                // The paging form is rebuilt from the hidden inputs of the page loaded last.
                string typeId = ToolHtml.GetHtmlInputValue(listHtml, "typeId");
                string boardId = ToolHtml.GetHtmlInputValue(listHtml, "boardId");
                string totalRows = ToolHtml.GetHtmlInputValue(listHtml, "totalRows");
                NameValueCollection form = this.ToolWebSite.GetNameValueCollection(
                    new string[] { "typeId", "boardId", "totalRows", "pageNO" },
                    new string[] { typeId, boardId, totalRows, pageNo.ToString() });
                listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, form, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        NodeList grids = new Parser(new Lexer(listHtml)).ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
        if (grids == null || grids.Count < 1)
        {
            continue;
        }

        TableTag grid = grids[0] as TableTag;
        // The first and the last table row are not visited.
        for (int rowIdx = 1; rowIdx < grid.RowCount - 1; rowIdx++)
        {
            TableRow row = grid.Rows[rowIdx];
            string category = "政策法规";
            string title = row.Columns[1].ToNodePlainString();
            string published = row.Columns[2].ToPlainTextString().GetDateRegex();
            string detailUrl = row.Columns[1].GetATagHref();

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(detailUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            NodeList content = new Parser(new Lexer(detailHtml)).ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "context_div")));
            if (content == null || content.Count < 1)
            {
                continue;
            }

            string contentHtml = content.AsHtml();
            string contentText = contentHtml.ToCtxString();
            string source = MsgTypeCosnt.ZhongShanMsgType;
            NotifyInfo info = ToolDb.GenNotifyInfo(title, published, string.Empty, source, detailUrl, contentHtml, "广东省", "中山市区", string.Empty, contentText, category);

            if (!crawlAll && submitted >= this.MaxCount)
            {
                return null;
            }

            submitted++;
            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            // Images embedded in the content.
            NodeList images = new Parser(new Lexer(contentHtml)).ExtractAllNodesThatMatch(new TagNameFilter("img"));
            if (images != null && images.Count > 0)
            {
                for (int imgIdx = 0; imgIdx < images.Count; imgIdx++)
                {
                    ImageTag image = images[imgIdx] as ImageTag;
                    BaseAttach imageAttach = ToolHtml.GetBaseAttachByUrl(image.GetAttribute("src"), title, info.Id);
                    if (imageAttach != null)
                    {
                        ToolDb.SaveEntity(imageAttach, string.Empty);
                    }
                }
            }

            // Attachment links in the content; a failure on one link does not stop the others.
            NodeList links = new Parser(new Lexer(contentHtml)).ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (links != null && links.Count > 0)
            {
                for (int linkIdx = 0; linkIdx < links.Count; linkIdx++)
                {
                    ATag link = links[linkIdx] as ATag;
                    if (!link.IsAtagAttach())
                    {
                        continue;
                    }

                    try
                    {
                        BaseAttach fileAttach = ToolHtml.GetBaseAttachByUrl(link.Link, link.LinkText, info.Id);
                        if (fileAttach != null)
                        {
                            ToolDb.SaveEntity(fileAttach, string.Empty);
                        }
                    }
                    catch
                    {
                    }
                }
            }
        }
    }

    return null;
}
/// <summary>
/// Walks the paged listing at <c>SiteUrl</c> and builds one <c>NoticeInfo</c> for every row
/// whose detail page contains a <c>context_div</c> element.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the method returns as soon as the result list holds <c>MaxCount</c> items.
/// </param>
/// <returns>
/// The generated <c>NoticeInfo</c> objects; an empty list if the first listing request throws.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList notices = new ArrayList();

    string listHtml;
    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default).GetJsString();
    }
    catch (Exception)
    {
        return notices;
    }

    // The page count is taken from the last pager link, whose value has the form "javascript:goPage(N)".
    int pageCount = 1;
    NodeList pagerLinks = new Parser(new Lexer(listHtml)).ExtractAllNodesThatMatch(
        new AndFilter(
            new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "toptd1")), true),
            new TagNameFilter("a")));
    if (pagerLinks != null && pagerLinks.Count > 0)
    {
        try
        {
            string lastValue = pagerLinks[pagerLinks.Count - 1].GetATagValue();
            pageCount = Convert.ToInt32(lastValue.Replace("javascript:goPage(", "").Replace(")", ""));
        }
        catch
        {
            pageCount = 1;
        }
    }

    for (int pageNo = 1; pageNo <= pageCount; pageNo++)
    {
        if (pageNo > 1)
        {
            try
            {
                // The paging form is rebuilt from the hidden inputs of the page loaded last.
                string typeId = ToolHtml.GetHtmlInputValue(listHtml, "typeId");
                string boardId = ToolHtml.GetHtmlInputValue(listHtml, "boardId");
                string totalRows = ToolHtml.GetHtmlInputValue(listHtml, "totalRows");
                NameValueCollection form = this.ToolWebSite.GetNameValueCollection(
                    new string[] { "typeId", "boardId", "newstitle", "sTime", "eTime", "totalRows", "pageNO" },
                    new string[] { typeId, boardId, "", "", "", totalRows, pageNo.ToString() });
                listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, form, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        NodeList grids = new Parser(new Lexer(listHtml)).ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
        if (grids == null || grids.Count < 1)
        {
            continue;
        }

        TableTag grid = grids[0] as TableTag;
        // The first and the last table row are not visited.
        for (int rowIdx = 1; rowIdx < grid.RowCount - 1; rowIdx++)
        {
            TableRow row = grid.Rows[rowIdx];
            string title = row.Columns[1].ToNodePlainString();
            string rowDate = row.Columns[2].ToPlainTextString().GetDateRegex();
            string category = "资格预审";
            string detailUrl = row.Columns[1].GetATagHref();

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(detailUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            NodeList content = new Parser(new Lexer(detailHtml)).ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "context_div")));
            if (content == null || content.Count < 1)
            {
                continue;
            }

            string contentHtml = content.ToHtml();
            string contentText = contentHtml.ToCtxString().Replace("<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />", "");

            // Publish date: a "yyyy年MM月dd日" date from the text, then any date in the text, then the row's date.
            string published = contentText.GetDateRegex("yyyy年MM月dd日").Replace("年", "-").Replace("月", "-").Replace("日", "");
            if (string.IsNullOrEmpty(published))
            {
                published = contentText.GetDateRegex();
            }
            if (string.IsNullOrEmpty(published))
            {
                published = rowDate;
            }

            NoticeInfo info = ToolDb.GenNoticeInfo("广东省", "惠州市区", string.Empty, string.Empty, title, category, contentText, published, string.Empty, MsgTypeCosnt.HuiZhouMsgType, detailUrl, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, contentHtml);
            notices.Add(info);

            if (!crawlAll && notices.Count >= this.MaxCount)
            {
                return notices;
            }
        }
    }

    return notices;
}
/// <summary>
/// For every city returned by <c>GetCitys()</c>, walks that city's paged listing, opens each
/// row's detail page and builds one <c>InviteInfo</c> per detail page that contains a
/// <c>TDContent</c> cell. Attachment links found in the detail HTML are added to
/// <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling of a city stops once <c>MaxCount</c> records have been collected
/// for that city; the next city is then processed.
/// </param>
/// <returns>The generated <c>InviteInfo</c> objects of all cities (possibly empty).</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://www.gxzbtb.cn";

    IList list = new List<InviteInfo>();
    Dictionary<string, string> citys = this.GetCitys();
    foreach (KeyValuePair<string, string> city in citys)
    {
        string area = city.Key;
        string cityUrl = city.Value;
        int count = 0;
        int pageInt = 1;
        string html;
        string cookiestr = string.Empty;
        try
        {
            html = this.ToolWebSite.GetHtmlByUrl(cityUrl, Encoding.UTF8, ref cookiestr);
        }
        catch
        {
            // A city whose first page cannot be loaded is skipped; the remaining cities are still crawled.
            continue;
        }

        Parser parser = new Parser(new Lexer(html));
        NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("nowrap", "true")));
        if (pageNode != null && pageNode.Count > 0)
        {
            try
            {
                string temp = pageNode.AsString().GetRegexBegEnd("总页数", "当前页").Replace(":", "");
                pageInt = int.Parse(temp);
            }
            catch
            {
                // The pager text could not be read as a number: keep the single-page default.
            }
        }

        // Set when the per-city quota is reached; ends both the row loop and the page loop.
        bool quotaReached = false;
        for (int i = 1; i <= pageInt && !quotaReached; i++)
        {
            if (i > 1)
            {
                // Later pages are requested by posting back the hidden form fields of the page loaded last.
                string viewState = this.ToolWebSite.GetAspNetViewState(html);
                string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
                string viewStateGenerator = ToolHtml.GetHtmlInputValue(html, "__VIEWSTATEGENERATOR");
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                    new string[] { "__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTTARGET", "__EVENTARGUMENT", "__EVENTVALIDATION", "MoreInfoList1$txtTitle" },
                    new string[] { viewState, viewStateGenerator, "MoreInfoList1$Pager", i.ToString(), eventValidation, "" });
                try
                {
                    html = this.ToolWebSite.GetHtmlByUrl(cityUrl, nvc, Encoding.UTF8, ref cookiestr);
                }
                catch
                {
                    continue;
                }
            }

            parser = new Parser(new Lexer(html));
            NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
            if (listNode == null || listNode.Count < 1)
            {
                continue;
            }

            TableTag table = listNode[0] as TableTag;
            if (table == null)
            {
                continue;
            }

            // Row 0 is not visited.
            for (int j = 1; j < table.RowCount; j++)
            {
                TableRow tr = table.Rows[j];
                if (tr.ColumnCount < 3)
                {
                    // Columns 1 and 2 are read below; shorter rows cannot be listing entries.
                    continue;
                }

                ATag aTag = tr.Columns[1].GetATag();
                if (aTag == null)
                {
                    // A row without a link would otherwise abort the whole crawl with a NullReferenceException.
                    continue;
                }

                string prjName = aTag.GetAttribute("title").GetReplace("【正在报名】,【报名结束】");
                string beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
                string InfoUrl = siteRoot + aTag.Link;

                string htmldtl;
                try
                {
                    htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
                }
                catch
                {
                    continue;
                }

                parser = new Parser(new Lexer(htmldtl));
                NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
                if (dtlNode == null || dtlNode.Count < 1)
                {
                    continue;
                }

                string HtmlTxt = dtlNode.AsHtml();
                string inviteCtx = HtmlTxt.ToCtxString();
                string prjAddress = inviteCtx.GetAddressRegex().GetReplace(" ");
                string buildUnit = inviteCtx.GetBuildRegex().GetReplace(" ");
                if (buildUnit.Contains("公司"))
                {
                    // Keep everything up to and including the first "公司".
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
                }
                if (buildUnit.Contains("地址"))
                {
                    // Cut the trailing "地址..." part off. The previous code re-appended "地址",
                    // which left that label attached to the end of the unit name.
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
                }
                string code = inviteCtx.GetCodeRegex().GetCodeDel().GetReplace(" ");
                string msgType = "广西壮族自治区公共资源交易中心";
                string specType = "建设工程";
                string inviteType = "水利水电";
                buildUnit = buildUnit.Replace(" ", "");

                InviteInfo info = ToolDb.GenInviteInfo("广西壮族自治区", "广西壮族自治区及地市", area, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, string.Empty, inviteCtx, string.Empty, msgType, inviteType, specType, string.Empty, InfoUrl, HtmlTxt);
                list.Add(info);
                count++;

                parser = new Parser(new Lexer(HtmlTxt));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int k = 0; k < aNode.Count; k++)
                    {
                        ATag a = aNode[k] as ATag;
                        if (a == null || !a.IsAtagAttach())
                        {
                            continue;
                        }

                        string link;
                        if (a.Link.ToLower().Contains("http"))
                        {
                            link = a.Link;
                        }
                        else
                        {
                            link = siteRoot + "/" + a.Link.GetReplace("../,./");
                        }

                        // Links longer than 500 bytes are not recorded.
                        if (Encoding.Default.GetByteCount(link) > 500)
                        {
                            continue;
                        }

                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }

                if (!crawlAll && count >= this.MaxCount)
                {
                    quotaReached = true;
                    break;
                }
            }
        }
    }

    return list;
}
/// <summary>
/// Walks the paged listing at <c>SiteUrl</c>, opens each row's detail page and builds one
/// <c>NoticeInfo</c> per detail page whose content element (<c>td#TDContent</c>, or
/// <c>div#ivs_content</c> as a fallback) is found. Attachment links found in the content are
/// added to <c>base.AttachList</c>; detail pages without content are logged.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the method returns as soon as the result list holds <c>MaxCount</c> items.
/// </param>
/// <returns>
/// The generated <c>NoticeInfo</c> objects; an empty list if the first listing request throws.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://www.spprec.com";

    IList list = new List<NoticeInfo>();
    int pageInt = 1;
    string html;
    string cookiestr = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("vAlign", "bottom")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("总页数:", "当前");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // The pager text could not be read as a number: keep the single-page default.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Later pages are requested by posting back the hidden form fields of the page loaded last.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string csrfToken = ToolHtml.GetHtmlInputValue(html, "__CSRFTOKEN");
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__CSRFTOKEN", "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT" },
                new string[] { csrfToken, viewState, "MoreInfoList1$Pager", i.ToString() });
            try
            {
                cookiestr = cookiestr.GetReplace(new string[] { "path=/;", "HttpOnly", "," });
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
        if (listNode == null || listNode.Count < 1)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 3)
            {
                // Columns 1 and 2 are read below; shorter rows cannot be listing entries.
                continue;
            }

            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                // A row without a link would otherwise abort the whole crawl with a NullReferenceException.
                continue;
            }

            // Prefer the title attribute. Fall back to the link text when it is missing
            // (GetByteCount would throw on null) or longer than 150 bytes.
            string InfoTitle = aTag.GetAttribute("title");
            if (string.IsNullOrEmpty(InfoTitle) || Encoding.Default.GetByteCount(InfoTitle) > 150)
            {
                InfoTitle = aTag.LinkText;
            }
            string PublistTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string InfoUrl = siteRoot + aTag.Link;
            string InfoType = "变更公告";

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
            if (dtlNode == null || dtlNode.Count < 1)
            {
                // The first extraction has already run the parser through the document, so the
                // fallback lookup uses a fresh parser, as every other extraction in this method does.
                parser = new Parser(new Lexer(htmldtl));
                dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "ivs_content")));
            }

            if (dtlNode == null || dtlNode.Count < 1)
            {
                Logger.Error("无内容" + InfoUrl);
                continue;
            }

            string htmlTxt = dtlNode.AsHtml();
            string InfoCtx = htmlTxt.GetReplace("<br />,<br/>,<br>,</p>", "\r\n").ToCtxString();
            NoticeInfo info = ToolDb.GenNoticeInfo("四川省", "四川省及地市", string.Empty, string.Empty, InfoTitle, InfoType, InfoCtx, PublistTime, string.Empty, "四川省公共资源交易中心", InfoUrl, string.Empty, string.Empty, string.Empty, string.Empty, "政府采购", string.Empty, htmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(htmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag tag = aNode[k] as ATag;
                    if (tag == null || !tag.IsAtagAttach())
                    {
                        continue;
                    }

                    string link;
                    if (tag.Link.ToLower().Contains("http"))
                    {
                        link = tag.Link;
                    }
                    else
                    {
                        link = siteRoot + tag.Link;
                    }

                    BaseAttach attach = ToolDb.GenBaseAttach(tag.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Walks the paged bulletin listing at <c>SiteUrl</c>, downloads each row's detail page and
/// builds one <c>BidInfo</c> per detail page that has a body element. Attachment links found
/// in the detail HTML are added to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the method returns as soon as <c>list</c> holds <c>MaxCount</c> items.
/// </param>
/// <returns>
/// The generated <c>BidInfo</c> objects; an empty list if the first listing request throws.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List <BidInfo>();
    int pageInt = 1;
    string html = string.Empty;
    // NOTE(review): viewState and eventValidation are never used in this method.
    string viewState = string.Empty;
    string eventValidation = string.Empty;
    string cookiestr = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookiestr);
    }
    catch
    {
        return(list);
    }
    Parser parser = new Parser(new Lexer(html));
    // The page count is the text between "/" and "页" in the toolbar table; on any failure pageInt stays 1.
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "bulletininfotable_toolbarTable")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("/", "页");
            pageInt = int.Parse(temp);
        }
        catch
        {
        }
    }
    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Pages after the first are requested with a form post; the totals are read
            // from the hidden inputs of the page loaded last.
            string bulletininfotable_totalpages = ToolHtml.GetHtmlInputValue(html, "bulletininfotable_totalpages");
            string bulletininfotable_totalrows = ToolHtml.GetHtmlInputValue(html, "bulletininfotable_totalrows");
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "ec_i", "bulletininfotable_efn", "bulletininfotable_crd", "bulletininfotable_p", "bulletininfotable_s_bulletintitle", "bulletininfotable_s_finishday", "hySort", "findAjaxZoneAtClient", "method", "bulletinclass", "bulletininfotable_totalpages", "bulletininfotable_totalrows", "bulletininfotable_pg", "bulletininfotable_rd" },
                new string[] { "bulletininfotable", "", "20", i.ToString(), "", "", "2", "false", "bulletinMore", "01", bulletininfotable_totalpages, bulletininfotable_totalrows, (i - 1).ToString(), "5" });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch
            {
                // A page that cannot be loaded is skipped.
                continue;
            }
        }
        // Every "tbody" in the markup is rewritten to "table" before parsing, so the listing
        // body can be matched by the table filter below.
        parser = new Parser(new Lexer(html.Replace("tbody", "table")));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "bulletininfotable_table_body")));
        if (listNode != null && listNode.Count > 0)
        {
            TableTag table = listNode[0] as TableTag;
            for (int j = 0; j < table.RowCount; j++)
            {
                // NOTE(review): bidDate and remark are never used; endDate, otherType and area
                // are never assigned and reach GenBidInfo as empty strings.
                string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty,
                    bidMoney = string.Empty, code = string.Empty, bidDate = string.Empty,
                    beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty,
                    specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                    bidCtx = string.Empty, prjAddress = string.Empty, remark = string.Empty,
                    prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty,
                    area = string.Empty;
                TableRow tr = table.Rows[j];
                prjName = tr.Columns[0].ToNodePlainString();
                beginDate = tr.Columns[1].ToPlainTextString().GetDateRegex();
                // The detail URL is built from the row's "id" attribute.
                InfoUrl = "http://www.ynggzy.net/bulletin.do?method=showbulletin&bulletin_id=" + tr.GetAttribute("id");
                string htmldtl = string.Empty;
                try
                {
                    htmldtl = ToolHtml.GetHtmlByUrl(this.SiteUrl, InfoUrl, Encoding.Default);
                }
                catch
                {
                    // A detail page that cannot be loaded is skipped.
                    continue;
                }
                parser = new Parser(new Lexer(htmldtl));
                NodeList dtlNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
                if (dtlNode != null && dtlNode.Count > 0)
                {
                    HtmlTxt = dtlNode.AsHtml();
                    // The text used for field extraction is derived from the lower-cased HTML.
                    bidCtx = HtmlTxt.ToLower().GetReplace("</p>,<br />,<br/>,<br>", "\r\n").ToCtxString();
                    buildUnit = bidCtx.GetBuildRegex();
                    // NOTE(review): prjAddress is computed here but not used afterwards.
                    prjAddress = bidCtx.GetAddressRegex();
                    code = bidCtx.GetCodeRegex();
                    bidType = prjName.GetInviteBidType();
                    // Winning unit, fallback chain: GetBidRegex -> "成交人,成交供应商" ->
                    // text after "确定中标供应商为" -> first table of the page (below).
                    bidUnit = bidCtx.GetBidRegex();
                    if (string.IsNullOrWhiteSpace(bidUnit))
                    {
                        bidUnit = bidCtx.GetRegex("成交人,成交供应商");
                    }
                    bidMoney = bidCtx.GetMoneyRegex();
                    prjMgr = bidCtx.GetMgrRegex();
                    if (string.IsNullOrWhiteSpace(bidUnit))
                    {
                        bidUnit = bidCtx.GetRegexBegEnd("确定中标供应商为", ",");
                        if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
                        {
                            bidMoney = bidCtx.GetRegexBegEnd("投标报价为", "万元");
                        }
                        if (string.IsNullOrWhiteSpace(bidUnit))
                        {
                            parser = new Parser(new Lexer(HtmlTxt));
                            NodeList bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                            if (bidNode != null && bidNode.Count > 0)
                            {
                                // Flatten the first table into text: cells at even index are followed
                                // by ":" and cells at odd index by a line break.
                                string ctx = string.Empty;
                                TableTag tag = bidNode[0] as TableTag;
                                for (int r = 0; r < tag.RowCount; r++)
                                {
                                    for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                                    {
                                        string temp = tag.Rows[r].Columns[c].ToNodePlainString();
                                        if ((c + 1) % 2 == 0)
                                        {
                                            ctx += temp + "\r\n";
                                        }
                                        else
                                        {
                                            ctx += temp + ":";
                                        }
                                    }
                                }
                                bidUnit = ctx.GetBidRegex();
                                if (string.IsNullOrEmpty(bidUnit))
                                {
                                    bidUnit = ctx.GetRegex("入围供应商,成交人,单位名称");
                                }
                                if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
                                {
                                    bidMoney = ctx.GetMoneyRegex();
                                }
                                if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
                                {
                                    bidMoney = ctx.GetMoneyString().GetMoney();
                                }
                            }
                        }
                    }
                    // Trim the extracted unit names. The order of these cuts matters: each one
                    // works on the result of the previous one.
                    if (buildUnit.Contains("联系"))
                    {
                        buildUnit = buildUnit.Remove(buildUnit.IndexOf("联系"));
                    }
                    if (buildUnit.Contains("公司"))
                    {
                        // Keep everything up to and including the first "公司".
                        buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
                    }
                    if (buildUnit.Contains("地址"))
                    {
                        buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
                    }
                    if (bidUnit.Contains("地址"))
                    {
                        bidUnit = bidUnit.Remove(bidUnit.IndexOf("地址"));
                    }
                    if (bidUnit.Contains("公司"))
                    {
                        bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
                    }
                    bidUnit = bidUnit.GetReplace("第一,1");
                    // A winning unit that still contains one of these words is discarded.
                    if (bidUnit.Contains("综合") || bidUnit.Contains("报价") || bidUnit.Contains("联系") || bidUnit.Contains("投标单位") || bidUnit.Contains("得分") || bidUnit.Contains("中标价"))
                    {
                        bidUnit = string.Empty;
                    }
                    // Amounts above 100000 are divided by 10000; an unparsable amount is left unchanged.
                    // NOTE(review): presumably converts yuan to 万元 - confirm.
                    try
                    {
                        if (decimal.Parse(bidMoney) > 100000)
                        {
                            bidMoney = (decimal.Parse(bidMoney) / 10000).ToString();
                        }
                    }
                    catch
                    {
                    }
                    specType = "建设工程";
                    msgType = "云南省公共资源交易中心";
                    // beginDate is passed twice to GenBidInfo.
                    BidInfo info = ToolDb.GenBidInfo("云南省", "云南省及地市", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    list.Add(info);
                    parser = new Parser(new Lexer(HtmlTxt));
                    NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (aNode != null && aNode.Count > 0)
                    {
                        for (int k = 0; k < aNode.Count; k++)
                        {
                            ATag a = aNode[k] as ATag;
                            if (a.IsAtagAttach())
                            {
                                // The download URL is assembled from two values cut out of the link:
                                // the text between "(" and "," and the text between "," and ")".
                                // NOTE(review): as written, GetReplace("(", "(") and GetReplace(")", ")")
                                // pass the same character twice; possibly a full-width parenthesis was
                                // intended as the first argument - confirm against the original file.
                                string obj = a.Link.GetReplace("(", "(").GetRegexBegEnd("(", ",").GetReplace("(").GetReplace("'").Replace(",", "");
                                string name = a.Link.GetReplace(")", ")").GetRegexBegEnd(",", ")").GetReplace(")").GetReplace("'").Replace(",", "");
                                string link = "http://www.ynggzy.net/resource/bulletin.do?method=mdownloadFile&file_id=" + obj + "&file_name=" + name;
                                BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                                base.AttachList.Add(attach);
                            }
                        }
                    }
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }
        }
    }
    return(list);
}
/// <summary>
/// Walks the paged bulletin listing at <c>SiteUrl</c>, downloads each row's detail page and
/// builds one <c>InviteInfo</c> per detail page that has a body element. Attachment links
/// found in the detail HTML are added to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the method returns as soon as the result list holds <c>MaxCount</c> items.
/// </param>
/// <returns>
/// The generated <c>InviteInfo</c> objects; an empty list if the first listing request throws.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://www.ynggzy.net/";

    IList invites = new List<InviteInfo>();
    string cookies = string.Empty;

    string listHtml;
    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookies);
    }
    catch
    {
        return invites;
    }

    // The page count is the text between "/" and "页" in the toolbar table; failures keep the default of 1.
    int pageCount = 1;
    NodeList toolbar = new Parser(new Lexer(listHtml)).ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "bulletininfotable_toolbarTable")));
    if (toolbar != null && toolbar.Count > 0)
    {
        try
        {
            pageCount = int.Parse(toolbar.AsString().GetRegexBegEnd("/", "页"));
        }
        catch
        {
        }
    }

    for (int pageNo = 1; pageNo <= pageCount; pageNo++)
    {
        if (pageNo > 1)
        {
            // Pages after the first are requested with a form post; the totals are read
            // from the hidden inputs of the page loaded last.
            string totalPages = ToolHtml.GetHtmlInputValue(listHtml, "bulletininfotable_totalpages");
            string totalRows = ToolHtml.GetHtmlInputValue(listHtml, "bulletininfotable_totalrows");
            NameValueCollection form = this.ToolWebSite.GetNameValueCollection(
                new string[] { "ec_i", "bulletininfotable_efn", "bulletininfotable_crd", "bulletininfotable_p", "bulletininfotable_s_bulletintitle", "bulletininfotable_s_finishday", "hySort", "findAjaxZoneAtClient", "method", "bulletinclass", "bulletininfotable_totalpages", "bulletininfotable_totalrows", "bulletininfotable_pg", "bulletininfotable_rd" },
                new string[] { "bulletininfotable", "", "20", pageNo.ToString(), "", "", "1", "false", "bulletinMore", "01", totalPages, totalRows, (pageNo - 1).ToString(), "5" });
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, form, Encoding.Default, ref cookies);
            }
            catch
            {
                continue;
            }
        }

        // Every "tbody" in the markup is rewritten to "table" before parsing, so the listing
        // body can be matched by the table filter.
        NodeList grids = new Parser(new Lexer(listHtml.Replace("tbody", "table"))).ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "bulletininfotable_table_body")));
        if (grids == null || grids.Count < 1)
        {
            continue;
        }

        TableTag grid = grids[0] as TableTag;
        for (int rowIdx = 0; rowIdx < grid.RowCount; rowIdx++)
        {
            TableRow row = grid.Rows[rowIdx];
            string title = row.Columns[0].ToNodePlainString();
            string publishDate = row.Columns[1].ToPlainTextString().GetDateRegex();
            string detailUrl = "http://www.ynggzy.net/bulletin.do?method=showbulletin&bulletin_id=" + row.GetAttribute("id");

            string detailHtml;
            try
            {
                detailHtml = ToolHtml.GetHtmlByUrl(this.SiteUrl, detailUrl, Encoding.Default);
            }
            catch
            {
                continue;
            }

            NodeList body = new Parser(new Lexer(detailHtml)).ExtractAllNodesThatMatch(new TagNameFilter("body"));
            if (body == null || body.Count < 1)
            {
                continue;
            }

            string rawHtml = body.AsHtml();
            // The text used for field extraction is derived from the lower-cased HTML.
            string plainText = rawHtml.ToLower().GetReplace("</p>,<br />,<br/>,<br>", "\r\n").ToCtxString();
            string owner = plainText.GetBuildRegex();
            string address = plainText.GetAddressRegex();
            string projectCode = plainText.GetCodeRegex().GetCodeDel();
            string category = title.GetInviteBidType();
            string speciality = "政府采购";
            string source = "云南省公共资源交易中心";

            InviteInfo info = ToolDb.GenInviteInfo("云南省", "云南省及地市", string.Empty, string.Empty, projectCode, title, address, owner, publishDate, string.Empty, plainText, string.Empty, source, category, speciality, string.Empty, detailUrl, rawHtml);
            invites.Add(info);

            NodeList anchors = new Parser(new Lexer(rawHtml)).ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors != null && anchors.Count > 0)
            {
                for (int k = 0; k < anchors.Count; k++)
                {
                    ATag candidate = anchors[k] as ATag;
                    if (!candidate.IsAtagAttach())
                    {
                        continue;
                    }

                    // Links containing "http" are used as they are; others are prefixed with the site root.
                    string href = candidate.Link.ToLower().Contains("http")
                        ? candidate.Link
                        : siteRoot + candidate.Link;
                    base.AttachList.Add(ToolDb.GenBaseAttach(candidate.LinkText, info.Id, href));
                }
            }

            if (!crawlAll && invites.Count >= this.MaxCount)
            {
                return invites;
            }
        }
    }

    return invites;
}
/// <summary>
/// Crawls the paged enterprise list exposed by the <c>web-enterprise!list.do</c> endpoint,
/// loads each enterprise's detail page, and stores one <c>CorpInfo</c> per enterprise
/// followed by calls to <c>AddCorpQual</c>, <c>AddCorpLeader</c> and <c>AddCorpStaff</c>.
/// When a record with the same name, type and source already exists, it and its
/// dependent rows are deleted and re-created.
/// </summary>
/// <param name="crawlAll">Not read by this crawler; every page reported by the endpoint is processed.</param>
/// <returns>Always <c>null</c>; results are written through <c>ToolDb</c> instead of being returned.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    string newUrl = "http://202.104.65.182:8081/G2/gfmweb/web-enterprise!list.do?data&filter_params_=enterpriseId,rowNum,enterpriseBaseId,enterpriseName,organizationCode&defined_operations_=&nocheck_operations_=&";
    string[] formKeys = new string[] { "gridSearch", "nd", "PAGESIZE", "PAGE", "sortField", "sortDirection", "searchVal", "_enterpriseName_like", "entTypeCodes" };
    string gridSearch = "true";
    string nd = ToolHtml.GetDateTimeLong(DateTime.Now).ToString();
    string PAGESIZE = "100";
    string PAGE = "1";
    string sortField = "";
    string sortDirection = "asc";
    string searchVal = "1";
    string _enterpriseName_like = "公司";
    string entTypeCodes = "";

    NameValueCollection nvc = ToolWeb.GetNameValueCollection(formKeys, new string[] { gridSearch, nd, PAGESIZE, PAGE, sortField, sortDirection, searchVal, _enterpriseName_like, entTypeCodes });
    string html = string.Empty;
    try
    {
        html = ToolWeb.GetHtmlByUrl(newUrl, nvc, Encoding.UTF8);
    }
    catch
    {
        return null;
    }

    JavaScriptSerializer serializer = new JavaScriptSerializer();
    Dictionary<string, object> smsTypeJson = (Dictionary<string, object>)serializer.DeserializeObject(html);
    // The "total" field of the response is used as the number of pages to request.
    int pageInt = int.Parse(smsTypeJson["total"].ToString());

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            PAGE = i.ToString();
            nvc = ToolWeb.GetNameValueCollection(formKeys, new string[] { gridSearch, nd, PAGESIZE, PAGE, sortField, sortDirection, searchVal, _enterpriseName_like, entTypeCodes });
            try
            {
                html = ToolWeb.GetHtmlByUrl(newUrl, nvc, Encoding.UTF8);
                smsTypeJson = (Dictionary<string, object>)serializer.DeserializeObject(html);
            }
            catch
            {
                continue;
            }
        }

        // Skip a page whose payload has no usable "data" array instead of aborting the whole crawl.
        object rawData;
        if (smsTypeJson == null || !smsTypeJson.TryGetValue("data", out rawData))
        {
            continue;
        }
        object[] objList = rawData as object[];
        if (objList == null)
        {
            continue;
        }

        foreach (object obj in objList)
        {
            Dictionary<string, object> dic = obj as Dictionary<string, object>;
            if (dic == null)
            {
                continue;
            }

            string BusinessType = string.Empty, ISOQualNum = string.Empty, ISOEnvironNum = string.Empty, corpType = string.Empty;
            string CorpName = Convert.ToString(dic["enterpriseName"]);
            string CorpCode = Convert.ToString(dic["organizationCode"]);
            string enterpriseId = Convert.ToString(dic["enterpriseId"]);
            string cUrl = "http://202.104.65.182:8081/G2/webdrive/web-enterprise!view.do?enterpriseId=" + enterpriseId;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = ToolWeb.GetHtmlByUrl(cUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            string CorpAddress = ToolHtml.GetHtmlInputValue(htmldtl, "_M.registerAddress");
            string RegDate = ToolHtml.GetHtmlInputValue(htmldtl, "_M.registerTime");
            string RegFund = ToolHtml.GetHtmlInputValue(htmldtl, "_M.licenseCapital");
            if (!string.IsNullOrEmpty(RegFund))
            {
                RegFund += "万元";
            }
            string BusinessCode = ToolHtml.GetHtmlInputValue(htmldtl, "_M.licenseRegistrationCode");
            string CorpSite = ToolHtml.GetHtmlInputValue(htmldtl, "_M.firmWebsite");
            string LinkMan = ToolHtml.GetHtmlInputValue(htmldtl, "_M.name");
            string Email = ToolHtml.GetHtmlInputValue(htmldtl, "_M.email");
            string LinkPhone = ToolHtml.GetHtmlInputValue(htmldtl, "_M.tel");
            string Fax = ToolHtml.GetHtmlInputValue(htmldtl, "_M.fax");

            Parser parser = new Parser(new Lexer(htmldtl));
            NodeList typeNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "g2-cell col-sm-6")));
            if (typeNode != null && typeNode.Count > 0)
            {
                // From the third matching cell on, every non-empty cell that is not a date
                // is collected into the comma-separated corpType value.
                List<string> typeNames = new List<string>();
                for (int j = 2; j < typeNode.Count; j++)
                {
                    string semp = typeNode[j].ToNodePlainString();
                    if (string.IsNullOrEmpty(semp))
                    {
                        continue;
                    }
                    DateTime ignoredDate;
                    if (DateTime.TryParse(semp, out ignoredDate))
                    {
                        continue;
                    }
                    typeNames.Add(semp);
                }
                corpType = string.Join(",", typeNames.ToArray());
            }

            CorpInfo info = ToolDb.GenCorpInfo(CorpName, CorpCode, CorpAddress, RegDate, RegFund, BusinessCode, BusinessType, LinkMan, LinkPhone, Fax, Email, CorpSite, corpType, "广东省", "广东地区", "广东省住房和城乡建设厅", cUrl, ISOQualNum, ISOEnvironNum, string.Empty);

            // The values below come from scraped pages: double any single quote so a name such as
            // "O'Neil" cannot terminate the SQL string literal.
            string exisSql = string.Format(
                "select Id from CorpInfo where CorpName='{0}' and CorpType='{1}' and InfoSource='{2}'",
                Convert.ToString((object)info.CorpName).Replace("'", "''"),
                Convert.ToString((object)info.CorpType).Replace("'", "''"),
                Convert.ToString((object)info.InfoSource).Replace("'", "''"));
            string corpId = Convert.ToString(ToolDb.ExecuteScalar(exisSql));

            if (!string.IsNullOrEmpty(corpId))
            {
                string corpSql = string.Format("delete from CorpInfo where Id='{0}'", corpId);
                string delCorpQual = string.Format("delete from CorpQual where CorpId='{0}'", corpId);
                string delCorpLeader = string.Format("delete from CorpLeader where CorpId='{0}'", corpId);
                string delCorpSecLicStaff = string.Format("delete from CorpSecLicStaff where CorpId='{0}'", corpId);

                int infoCount = ToolDb.ExecuteSql(corpSql);
                int qualCount = ToolDb.ExecuteSql(delCorpQual);
                int leaderCount = ToolDb.ExecuteSql(delCorpLeader);
                int tecstaffCount = ToolDb.ExecuteSql(delCorpSecLicStaff);

                if (infoCount > 0)
                {
                    ToolDb.SaveEntity(info, "");
                }
                if (qualCount >= 0)
                {
                    try { AddCorpQual(info, enterpriseId); } catch (Exception ex) { Logger.Error(ex); }
                }
                if (leaderCount >= 0)
                {
                    try { AddCorpLeader(info, enterpriseId); } catch (Exception ex) { Logger.Error(ex); }
                }
                if (tecstaffCount >= 0)
                {
                    try { AddCorpStaff(info, enterpriseId); } catch (Exception ex) { Logger.Error(ex); }
                }
            }
            else if (ToolDb.SaveEntity(info, ""))
            {
                try { AddCorpLeader(info, enterpriseId); } catch (Exception ex) { Logger.Error(ex); }
                try { AddCorpQual(info, enterpriseId); } catch (Exception ex) { Logger.Error(ex); }
                try { AddCorpStaff(info, enterpriseId); } catch (Exception ex) { Logger.Error(ex); }
            }
        }
    }

    ToolCoreDb.ExecuteProcedure();
    return null;
}
/// <summary>
/// Crawls the paged grid <c>MoreInfoList1_DataGrid1</c> at <c>SiteUrl</c>. For every row it
/// fetches the linked detail page and, when that page contains <c>div#ivs_content</c>,
/// adds an <c>InviteInfo</c> to the result and registers attachment links found in the
/// content in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, crawling stops once the result holds <c>MaxCount</c> entries.</param>
/// <returns>The collected records; an empty list when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html = string.Empty;
    string cookiestr = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("vAlign", "bottom")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("总页数:", "当前");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: keep the default of one page when the page count cannot be read.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Later pages are requested by posting back the pager event with the
            // hidden fields taken from the most recently fetched page.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string __CSRFTOKEN = ToolHtml.GetHtmlInputValue(html, "__CSRFTOKEN");
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__CSRFTOKEN", "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT" },
                new string[] { __CSRFTOKEN, viewState, "MoreInfoList1$Pager", i.ToString() });
            try
            {
                cookiestr = cookiestr.GetReplace(new string[] { "path=/;", "HttpOnly", "," });
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }
        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

            TableRow tr = table.Rows[j];
            // A row needs the title cell (index 1) and the date cell (index 2); rows that are
            // too short or carry no link are skipped instead of aborting the crawl.
            if (tr.ColumnCount < 3)
            {
                continue;
            }
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string prjName = aTag.GetAttribute("title");
            string beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://www.spprec.com" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "ivs_content")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            string inviteCtx = HtmlTxt.GetReplace("<br />,<br/>,<br>,</p>", "\r\n").ToCtxString();

            // Cut the extracted unit name at the first "联系" and then at the first "地址".
            string buildUnit = inviteCtx.GetBuildRegex();
            if (buildUnit.Contains("联系"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("联系"));
            }
            if (buildUnit.Contains("地址"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
            }

            string prjAddress = inviteCtx.GetAddressRegex();
            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            string inviteType = "政府采购";
            string specType = inviteType;
            string msgType = "四川省公共资源交易中心";

            InviteInfo info = ToolDb.GenInviteInfo("四川省", "四川省及地市", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag tag = aNode[k] as ATag;
                    if (tag == null || !tag.IsAtagAttach())
                    {
                        continue;
                    }
                    // Links that do not contain "http" are treated as site-relative.
                    string link = tag.Link.ToLower().Contains("http") ? tag.Link : "http://www.spprec.com" + tag.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(tag.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged grid <c>MoreInfoList1_DataGrid1</c> at <c>SiteUrl</c>. For every row it
/// derives the detail URL from the link's <c>onclick</c> attribute, fetches it and, when the
/// page contains <c>span#zygg_Text_23</c>, adds an <c>InviteInfo</c> to the result and
/// registers attachment links in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, crawling stops once the result holds <c>MaxCount</c> entries.</param>
/// <returns>The collected records; an empty list when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html = string.Empty;
    string cookiestr = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookiestr);
    }
    catch
    {
        // Without the first page there is nothing to parse.
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "MoreInfoList1_Pager")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode[0].ToNodePlainString().GetRegexBegEnd("1/", "页");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: keep the default of one page when the page count cannot be read.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            string __CSRFTOKEN = ToolHtml.GetHtmlInputValue(html, "__CSRFTOKEN");
            // Prefer the generator value the page actually sent; fall back to the
            // previously hard-coded value when the field is absent.
            string viewStateGenerator = ToolHtml.GetHtmlInputValue(html, "__VIEWSTATEGENERATOR");
            if (string.IsNullOrEmpty(viewStateGenerator))
            {
                viewStateGenerator = "76D0A3AC";
            }
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__CSRFTOKEN", "__EVENTTARGET", "__EVENTARGUMENT", "__LASTFOCUS", "__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION", "MoreInfoList1$txtProjectName", "MoreInfoList1$txtBiaoDuanName", "MoreInfoList1$txtBiaoDuanNo", "MoreInfoList1$txtJSDW", "MoreInfoList1$StartDate", "MoreInfoList1$EndDate", "MoreInfoList1$jpdDi", "MoreInfoList1$jpdXian" },
                new string[] { __CSRFTOKEN, "MoreInfoList1$Pager", i.ToString(), "", viewState, viewStateGenerator, eventValidation, "", "", "", "", "", "", "-1", "-1" });
            try
            {
                cookiestr = cookiestr.GetReplace("path=/; HttpOnly").Replace(",", "");
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }
        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

            TableRow tr = table.Rows[j];
            // The title link is read from cell 1 and the date from cell 3; rows that are too
            // short or carry no link are skipped instead of aborting the crawl.
            if (tr.ColumnCount < 4)
            {
                continue;
            }
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string prjName = aTag.GetAttribute("title").GetReplace(";");
            // The area is whatever the title carries between brackets, e.g. "[area]title".
            string area = prjName.GetReplace("[", "【").GetReplace("]", "】").GetRegexBegEnd("【", "】");
            if (!string.IsNullOrEmpty(area))
            {
                prjName = prjName.GetReplace("[" + area + "]");
            }
            string beginDate = tr.Columns[3].ToPlainTextString().GetDateRegex();
            // NOTE(review): Replace("(", "(") is a no-op as written; presumably one side was
            // meant to be a full-width parenthesis - confirm against the original file encoding.
            string InfoUrl = "http://www.jszb.com.cn/jszb/YW_info/" + aTag.GetAttribute("onclick").Replace("(", "(").GetRegexBegEnd("(", ",").GetReplace("\",../,./");

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "zygg_Text_23")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            string inviteCtx = HtmlTxt.ToCtxString();
            string prjAddress = inviteCtx.GetAddressRegex();

            // Keep the unit name up to and including the first "公司", then cut at "地址".
            string buildUnit = inviteCtx.GetBuildRegex();
            if (buildUnit.Contains("公司"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
            }
            if (buildUnit.Contains("地址"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
            }

            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            string msgType = "江苏省建设工程招标投标办公室";
            string specType = "建设工程";
            string inviteType = "建设工程";

            InviteInfo info = ToolDb.GenInviteInfo("江苏省", "江苏省及地市", area, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a == null || !a.IsAtagAttach())
                    {
                        continue;
                    }
                    // Links that do not contain "http" are treated as site-relative.
                    string link = a.Link.ToLower().Contains("http") ? a.Link : "http://www.jszb.com.cn/" + a.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the bid-result listing of every city returned by <c>GetCitys()</c> and collects
/// one <c>BidInfo</c> per announcement whose detail page contains <c>td#TDContent</c>.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, at most <c>MaxCount</c> records are collected per city.</param>
/// <returns>The records collected across all cities.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    Dictionary<string, string> citys = this.GetCitys();
    foreach (KeyValuePair<string, string> city in citys)
    {
        this.CrawlCityBids(city.Key, city.Value, crawlAll, list);
    }
    return list;
}

/// <summary>
/// Crawls the paged grid <c>MoreInfoList1_DataGrid1</c> of a single city and appends the
/// resulting records to <paramref name="list"/>. A city whose first page cannot be fetched
/// is skipped so the remaining cities are still crawled.
/// </summary>
/// <param name="area">City name, stored on every record produced for this city.</param>
/// <param name="cityUrl">URL of the city's listing page.</param>
/// <param name="crawlAll">When <c>false</c>, stop after <c>MaxCount</c> records for this city.</param>
/// <param name="list">Result list the records are appended to.</param>
private void CrawlCityBids(string area, string cityUrl, bool crawlAll, IList list)
{
    int count = 0;
    int pageInt = 1;
    string html = string.Empty;
    string cookiestr = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(cityUrl, Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        return;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("nowrap", "true")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("总页数", "当前页").Replace(":", "");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: keep the default of one page when the page count cannot be read.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            string viewSTATEGENERATOR = ToolHtml.GetHtmlInputValue(html, "__VIEWSTATEGENERATOR");
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTTARGET", "__EVENTARGUMENT", "__EVENTVALIDATION", "MoreInfoList1$txtTitle" },
                new string[] { viewState, viewSTATEGENERATOR, "MoreInfoList1$Pager", i.ToString(), eventValidation, "" });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(cityUrl, nvc, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }
        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped, as in the original crawler.
        for (int j = 1; j < table.RowCount; j++)
        {
            string endDate = string.Empty, otherType = string.Empty;

            TableRow tr = table.Rows[j];
            // The title link is read from cell 1 and the date from cell 2; rows that are too
            // short or carry no link are skipped instead of aborting the crawl.
            if (tr.ColumnCount < 3)
            {
                continue;
            }
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string prjName = aTag.GetAttribute("title").GetReplace("【正在报名】,【报名结束】");
            string beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://www.gxzbtb.cn" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            string bidCtx = HtmlTxt.GetReplace(new string[] { "<br/>", "<br />", "<br>" }, "\r\n").ToCtxString();
            string prjAddress = bidCtx.GetAddressRegex();
            string buildUnit = bidCtx.GetBuildRegex();
            string bidUnit = bidCtx.GetBidRegex();
            string bidMoney = bidCtx.GetMoneyRegex();
            string prjMgr = bidCtx.GetMgrRegex();
            string code = bidCtx.GetCodeRegex().GetCodeDel();

            if (string.IsNullOrEmpty(bidUnit))
            {
                // No bidder in the running text: retry on the first table of the content,
                // flattened so that cell pairs read "label:value" one pair per line.
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (bidNode != null && bidNode.Count > 0)
                {
                    string ctx = string.Empty;
                    TableTag bidTable = bidNode[0] as TableTag;
                    for (int r = 0; r < bidTable.RowCount; r++)
                    {
                        for (int c = 0; c < bidTable.Rows[r].ColumnCount; c++)
                        {
                            if ((c + 1) % 2 == 0)
                            {
                                ctx += bidTable.Rows[r].Columns[c].ToNodePlainString() + "\r\n";
                            }
                            else
                            {
                                ctx += bidTable.Rows[r].Columns[c].ToNodePlainString() + ":";
                            }
                        }
                    }

                    bidUnit = ctx.GetBidRegex();
                    if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
                    {
                        bidMoney = ctx.GetMoneyString().GetMoney("万元");
                    }
                    if (string.IsNullOrEmpty(prjAddress))
                    {
                        prjAddress = ctx.GetAddressRegex();
                    }
                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        buildUnit = ctx.GetBuildRegex();
                    }
                    if (string.IsNullOrEmpty(code))
                    {
                        code = ctx.GetCodeRegex().GetCodeDel();
                    }
                    // A match containing one of these words is discarded as not being a bidder name.
                    if (bidUnit.Contains("推荐") || bidUnit.Contains("中标") || bidUnit.Contains("地址"))
                    {
                        bidUnit = string.Empty;
                    }

                    if (string.IsNullOrEmpty(bidUnit) && bidTable.RowCount > 1)
                    {
                        // Second attempt: pair each cell of row 0 with the cell below it in row 1.
                        ctx = string.Empty;
                        for (int d = 0; d < bidTable.Rows[0].ColumnCount; d++)
                        {
                            ctx += bidTable.Rows[0].Columns[d].ToNodePlainString() + ":";
                            try
                            {
                                ctx += bidTable.Rows[1].Columns[d].ToNodePlainString() + "\r\n";
                            }
                            catch
                            {
                                // Row 1 may have fewer cells than row 0; the label is then left without a value.
                            }
                        }

                        bidUnit = ctx.GetBidRegex();
                        if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
                        {
                            bidMoney = ctx.GetMoneyString().GetMoney();
                        }
                        if (string.IsNullOrEmpty(prjAddress))
                        {
                            prjAddress = ctx.GetAddressRegex();
                        }
                        if (string.IsNullOrEmpty(buildUnit))
                        {
                            buildUnit = ctx.GetBuildRegex();
                        }
                        if (string.IsNullOrEmpty(code))
                        {
                            code = ctx.GetCodeRegex().GetCodeDel();
                        }
                    }
                }
            }

            // Amounts above 10000 are divided by 10000; unparsable amounts are left untouched.
            decimal amount;
            if (decimal.TryParse(bidMoney, out amount) && amount > 10000)
            {
                bidMoney = (amount / 10000).ToString();
            }

            bidUnit = bidUnit.Replace("名称", "").Replace("单位", "").Replace("№", "").Replace("1", "").Replace("2", "").Replace("联合体", "").Replace("(", "");
            if (bidUnit.Contains("公司"))
            {
                bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
            }
            if (bidUnit.Contains("研究院"))
            {
                bidUnit = bidUnit.Remove(bidUnit.IndexOf("研究院")) + "研究院";
            }
            if (bidUnit.Contains("研究所"))
            {
                bidUnit = bidUnit.Remove(bidUnit.IndexOf("研究所")) + "研究所";
            }

            string bidType = "房建市政";
            string specType = "建设工程";
            string msgType = "广西壮族自治区公共资源交易中心";
            BidInfo info = ToolDb.GenBidInfo("广西壮族自治区", "广西壮族自治区及地市", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);
            count++;

            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a == null || !a.IsAtagAttach())
                    {
                        continue;
                    }
                    string link = a.Link.ToLower().Contains("http") ? a.Link : "http://www.gxzbtb.cn/" + a.Link.GetReplace("../,./");
                    // Links longer than 500 bytes are not stored.
                    if (Encoding.Default.GetByteCount(link) > 500)
                    {
                        continue;
                    }
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && count >= this.MaxCount)
            {
                // Per-city limit reached: the caller moves on to the next city.
                return;
            }
        }
    }
}
/// <summary>
/// Gets the value of an input element from page HTML by forwarding to
/// <c>ToolHtml.GetHtmlInputValue</c>.
/// </summary>
/// <param name="htl">The page HTML to search.</param>
/// <param name="inputId">Identifier of the input, passed through unchanged.</param>
/// <returns>Whatever <c>ToolHtml.GetHtmlInputValue</c> returns for the given arguments.</returns>
public static string GetInputValue(this string htl, string inputId)
{
    string inputValue = ToolHtml.GetHtmlInputValue(htl, inputId);
    return inputValue;
}
/// <summary>
/// Crawls the paged table <c>topicChrList_20070702_table</c> at <c>SiteUrl</c>. For every
/// data row it fetches the linked detail page, builds an <c>ItemInfo</c> from the
/// <c>table#tab</c> it contains and saves it through <c>ToolDb</c>; when that save succeeds
/// a matching <c>BaseProject</c> is saved as well.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, crawling stops after <c>MaxCount</c> records have been submitted for saving.</param>
/// <returns>An empty list: records are saved directly and never added to the returned list.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemInfo>();
    string html = string.Empty;
    int pageInt = 1;
    // Number of records submitted for saving so far. Starts at zero and is checked before it
    // is incremented, so exactly MaxCount records are processed when crawlAll is false.
    int sqlCount = 0;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "statusBar")));
    if (pageList != null && pageList.Count > 0)
    {
        try
        {
            // The status bar reports a record count; pages are requested 20 records at a time,
            // so the page count is that number divided by 20, rounded up.
            string temp = pageList.AsString().GetRegexBegEnd("找到", "条");
            pageInt = (Convert.ToInt32(temp) + 20 - 1) / 20;
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            string id = ToolHtml.GetHtmlInputValue(html, "id");
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "ec_i", "topicChrList_20070702_crd", "topicChrList_20070702_f_a", "topicChrList_20070702_p", "topicChrList_20070702_s_name", "topicChrList_20070702_s_topName", "id", "method", "__ec_pages", "topicChrList_20070702_rd", "topicChrList_20070702_f_name", "topicChrList_20070702_f_topName", "topicChrList_20070702_f_ldate" },
                new string[] { "topicChrList_20070702", "20", "", i.ToString(), "", "", id, "view", i.ToString(), "20", "", "", "" });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "topicChrList_20070702_table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }
        TableTag table = nodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // The first three rows are skipped, as in the original crawler.
        for (int j = 3; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            // The name is read from cell 1 and the date from cell 3; shorter rows are skipped
            // instead of aborting the crawl with an index error.
            if (tr.ColumnCount < 4)
            {
                continue;
            }

            string itemCode = string.Empty, address = string.Empty, buildKind = string.Empty, investKink = string.Empty,
                   linkMan = string.Empty, linkmanTel = string.Empty, itemDesc = string.Empty, apprNo = string.Empty,
                   apprDate = string.Empty, apprUnit = string.Empty, apprResult = string.Empty, landapprNo = string.Empty,
                   landplanNo = string.Empty, textCode = string.Empty, licCode = string.Empty;

            string listName = tr.Columns[1].ToNodePlainString();
            string buildDate = tr.Columns[3].ToNodePlainString().GetDateRegex();
            string url = "http://www.szzfcg.cn" + tr.Columns[1].GetATagHref();

            string htlDtl = string.Empty;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(url, Encoding.UTF8);
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "tab")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string ctxHtml = dtlNode.AsHtml();
            string infoSource = ctxHtml.ToCtxString();

            // Flatten the detail table so that cell pairs read "label:value", one pair per line.
            StringBuilder flattened = new StringBuilder();
            TableTag dtlTable = dtlNode[0] as TableTag;
            for (int k = 0; k < dtlTable.RowCount; k++)
            {
                for (int d = 0; d < dtlTable.Rows[k].ColumnCount; d++)
                {
                    flattened.Append(dtlTable.Rows[k].Columns[d].ToNodePlainString());
                    flattened.Append((d + 1) % 2 == 0 ? "\r\n" : ":");
                }
            }
            string ctx = flattened.ToString();

            string itemName = ctx.GetRegex("项目名称,工程名称,名称");
            if (string.IsNullOrEmpty(itemName))
            {
                itemName = listName;
            }
            string buildUnit = ctx.GetRegex("采购人名称");
            string investMent = ctx.GetRegex("财政预算限额(元)");
            investMent = investMent.GetMoney();
            string msgType = "深圳政府采购";

            ItemInfo info = ToolDb.GenItemInfo(itemCode, itemName, buildUnit, address, investMent, buildKind, investKink, linkMan, linkmanTel, itemDesc, apprNo, apprDate, apprUnit, apprResult, landapprNo, landplanNo, buildDate, "广东省", "深圳市区", infoSource, url, textCode, licCode, msgType, ctxHtml);

            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return list;
            }
            sqlCount++;

            if (ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate))
            {
                BaseProject prj = new BaseProject();
                prj.Id = ToolDb.NewGuid;
                prj.PrjCode = info.ItemCode;
                prj.PrjName = info.ItemName;
                prj.BuildUnit = info.BuildUnit;
                prj.BuildTime = info.BuildDate;
                prj.Createtime = info.CreateTime;
                prj.PrjAddress = info.Address;
                prj.InfoSource = info.InfoSource;
                prj.MsgType = info.MsgType;
                prj.Province = info.Province;
                prj.City = info.City;
                prj.Url = info.Url;
                ToolDb.SaveEntity(prj, "Url", this.ExistsUpdate);
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged grid <c>MoreInfoList1_DataGrid1</c> at <c>SiteUrl</c>. For every row it
/// derives the detail URL from the link's <c>onclick</c> attribute, fetches it and, when the
/// page contains <c>table#Table1</c>, adds a <c>NoticeInfo</c> of type "最高限价公示" to the
/// result and registers attachment links in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, crawling stops once the result holds <c>MaxCount</c> entries.</param>
/// <returns>The collected records; an empty list when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<NoticeInfo>();
    int pageInt = 1;
    string html = string.Empty;
    string cookiestr = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookiestr);
    }
    catch
    {
        // Without the first page there is nothing to parse.
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "MoreInfoList1_Pager")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode[0].ToNodePlainString().GetRegexBegEnd("1/", "页");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: keep the default of one page when the page count cannot be read.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            string __CSRFTOKEN = ToolHtml.GetHtmlInputValue(html, "__CSRFTOKEN");
            // Prefer the generator value the page actually sent; fall back to the
            // previously hard-coded value when the field is absent.
            string viewStateGenerator = ToolHtml.GetHtmlInputValue(html, "__VIEWSTATEGENERATOR");
            if (string.IsNullOrEmpty(viewStateGenerator))
            {
                viewStateGenerator = "76D0A3AC";
            }
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__CSRFTOKEN", "__EVENTTARGET", "__EVENTARGUMENT", "__LASTFOCUS", "__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION", "MoreInfoList1$txtProjectName", "MoreInfoList1$txtBiaoDuanName", "MoreInfoList1$txtBiaoDuanNo", "MoreInfoList1$txtJSDW", "MoreInfoList1$StartDate", "MoreInfoList1$EndDate", "MoreInfoList1$jpdDi", "MoreInfoList1$jpdXian" },
                new string[] { __CSRFTOKEN, "MoreInfoList1$Pager", i.ToString(), "", viewState, viewStateGenerator, eventValidation, "", "", "", "", "", "", "-1", "-1" });
            try
            {
                cookiestr = cookiestr.GetReplace("path=/; HttpOnly").Replace(",", "");
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }
        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            string InfoCtx = string.Empty;
            string InfoType = "最高限价公示";

            TableRow tr = table.Rows[j];
            // Cells 1 to 3 are read below; rows that are too short or carry no link are
            // skipped instead of aborting the crawl.
            if (tr.ColumnCount < 4)
            {
                continue;
            }
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string InfoTitle = aTag.GetAttribute("title").GetReplace(";");
            // The area is whatever the title carries between brackets, e.g. "[area]title".
            string area = InfoTitle.GetReplace("[", "【").GetReplace("]", "】").GetRegexBegEnd("【", "】");
            if (!string.IsNullOrEmpty(area))
            {
                InfoTitle = InfoTitle.GetReplace("[" + area + "]");
            }
            string bgType = tr.Columns[2].ToNodePlainString();
            string PublistTime = tr.Columns[3].ToPlainTextString().GetDateRegex();
            // NOTE(review): Replace("(", "(") is a no-op as written; presumably one side was
            // meant to be a full-width parenthesis - confirm against the original file encoding.
            string InfoUrl = "http://www.jszb.com.cn/jszb/YW_info/" + aTag.GetAttribute("onclick").Replace("(", "(").GetRegexBegEnd("(", ",").GetReplace("\",../,./");

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "Table1")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string htmlTxt = dtlNode.AsHtml();
            // Flatten the detail table so that cell pairs read "label:value", one pair per line.
            TableTag tag = dtlNode[0] as TableTag;
            for (int r = 0; r < tag.RowCount; r++)
            {
                for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                {
                    string temp = tag.Rows[r].Columns[c].ToNodePlainString();
                    if ((c + 1) % 2 == 0)
                    {
                        InfoCtx += temp + "\r\n";
                    }
                    else
                    {
                        InfoCtx += temp.GetReplace(":,:") + ":";
                    }
                }
            }

            string prjCode = InfoCtx.GetCodeRegex();
            string buildUnit = InfoCtx.GetBuildRegex();
            NoticeInfo info = ToolDb.GenNoticeInfo("江苏省", "江苏省及地市", area, string.Empty, InfoTitle, InfoType, InfoCtx, PublistTime, string.Empty, "江苏省建设工程招标投标办公室", InfoUrl, prjCode, buildUnit, string.Empty, string.Empty, "建设工程", bgType, htmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(htmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a == null)
                    {
                        continue;
                    }
                    // Besides regular attachments, links to retrieveimagedata.aspx are stored too.
                    if (a.IsAtagAttach() || a.Link.ToLower().Contains("retrieveimagedata.aspx"))
                    {
                        string link = a.Link.ToLower().Contains("http") ? a.Link : "http://www.jszb.com.cn/jszb/YW_info/ZuiGaoXJ/" + a.Link;
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged notice list at <c>SiteUrl</c>, opens the detail page of every listed notice
/// and saves one notify record per notice, plus any attachment links found on the detail page.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> notices have been submitted for saving;
/// when <c>true</c>, every list page is processed.
/// </param>
/// <returns>
/// Always <c>null</c>: records are persisted through <c>ToolDb.SaveEntity</c> instead of being returned.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    // Root that relative detail/attachment links are resolved against.
    const string siteRoot = "http://www.sgjsj.gov.cn/sgwebims/";

    // Get the page count
    int pageInt = 1, sqlCount = 0;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default).GetJsString();
    }
    catch
    {
        // The first list page could not be downloaded: nothing to crawl.
        return null;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "dataPager")));
    if (pageList != null && pageList.Count > 0)
    {
        try
        {
            string temp = pageList.AsString().GetRegexBegEnd("共有:", "页");
            // TryParse (rather than Convert.ToInt32) so that a null/garbled value cannot
            // yield 0 pages and silently skip the whole crawl.
            if (!int.TryParse(temp, out pageInt) || pageInt < 1)
            {
                pageInt = 1;
            }
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Pages after the first are requested by posting the pager form back,
            // using the view state of the most recently downloaded list page.
            try
            {
                string viewState = this.ToolWebSite.GetAspNetViewState(html);
                string dataPager_input = ToolHtml.GetHtmlInputValue(html, "dataPager_input");
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                    new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "searcher:txtKeyWord", "searcher:tcInputDateTime:txtDateTime1", "searcher:tcInputDateTime:txtDateTime2", "searcher:ddlProvince", "searcher:ddlCity1", "searcher:ddlCity2", "dataPager_input" },
                    new string[] { "dataPager", i.ToString(), viewState, "", "", "", "-1", "-1", "-1", dataPager_input });
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default);
            }
            catch
            {
                // This page failed to load; move on to the next one.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "p3")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        // When several "p3" tables match, the second one is used; otherwise the only one.
        TableTag table = (nodeList.Count > 1 ? nodeList[1] : nodeList[0]) as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            // Checked before any download so that no further detail page is fetched
            // once the limit has been reached.
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 2)
            {
                // Title and date are read from columns 0 and 1; shorter rows cannot be notices.
                continue;
            }

            string infoType = "通知公告";
            string headName = tr.Columns[0].ToNodePlainString();
            string releaseTime = tr.Columns[1].ToPlainTextString().GetDateRegex();

            string onclick = tr.Columns[0].GetATagValue("onclick");
            if (string.IsNullOrEmpty(onclick))
            {
                // No click handler means no detail URL can be derived for this row.
                continue;
            }

            // The detail path is the text between the parentheses of the onclick handler,
            // with the surrounding double quotes stripped.
            string infoUrl = siteRoot + onclick.Replace("(", "kdxx").Replace(")", "xxdk").GetRegexBegEnd("kdxx", "xxdk").Replace("\"", "");

            string htldtl = string.Empty;
            try
            {
                htldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htldtl));
            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "Table4")));
            if (dtlList == null || dtlList.Count == 0)
            {
                continue;
            }

            string ctxHtml = dtlList.AsHtml();
            string infoCtx = ctxHtml.ToCtxString();
            string msgType = MsgTypeCosnt.ShaoGuanMsgType;
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, string.Empty, msgType, infoUrl, ctxHtml, "广东省", "韶关市区", string.Empty, infoCtx, infoType);

            sqlCount++;
            // Attachments are only collected when SaveEntity reports success
            // (presumably false means the record already exists - confirm against ToolDb).
            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            parser = new Parser(new Lexer(htldtl));
            NodeList tabNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "Table1")));
            NodeList aNode = null;
            if (tabNode != null && tabNode.Count > 1)
            {
                // Several "Table1" matches: only the second one is searched for links.
                parser = new Parser(new Lexer(tabNode[1].ToHtml()));
                aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            }
            else if (tabNode != null && tabNode.Count > 0)
            {
                parser = new Parser(new Lexer(tabNode.AsHtml()));
                aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            }

            if (aNode == null)
            {
                continue;
            }

            for (int a = 0; a < aNode.Count; a++)
            {
                ATag aTag = aNode[a] as ATag;
                if (aTag == null || !aTag.IsAtagAttach())
                {
                    continue;
                }

                try
                {
                    BaseAttach obj = ToolHtml.GetBaseAttach(siteRoot + aTag.Link.Replace("../", "").Replace("./", ""), aTag.LinkText, info.Id);
                    if (obj != null)
                    {
                        ToolDb.SaveEntity(obj, string.Empty);
                    }
                }
                catch
                {
                    // Attachments are best-effort: a failing one must not abort the crawl.
                }
            }
        }
    }

    return null;
}