/// <summary>
/// Crawls the tender-invitation listing at <c>SiteUrl</c> page by page, opens every entry's
/// detail page and turns it into an <c>InviteInfo</c> record. Attachment links found in the
/// detail content are appended to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the first listing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // Determine the number of listing pages from the pager block.
    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagin")));
    if (tdNodes != null && tdNodes.Count > 0)
    {
        // NOTE(review): both replaced literals read as a plain space here; one of them may
        // originally have been a different whitespace entity - confirm against the repository.
        string pageTemp = tdNodes.AsString().Replace(" ", "").Replace(" ", "");
        try
        {
            pageInt = int.Parse(pageTemp.GetRegexBegEnd("/", "页"));
        }
        catch
        {
            // Best effort: when the pager text cannot be parsed only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&page=" + i.ToString()), Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "wz")), true), new TagNameFilter("table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < nodeList.Count; j++)
        {
            // The entry link lives in the second row of each table; skip anything that does
            // not have that shape instead of letting one odd table abort the whole crawl.
            TableTag table = nodeList[j] as TableTag;
            if (table == null || table.RowCount < 2)
            {
                continue;
            }

            TableRow tr = table.Rows[1];
            ATag aTag = tr.GetATag();
            if (aTag == null)
            {
                continue;
            }

            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;
            string beginDate = tr.ToPlainTextString().GetDateRegex();
            string prjName = aTag.LinkText.Trim();
            string InfoUrl = "http://www.sfcx.cn/" + aTag.Link;

            string htmldetail;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "wz"), new TagNameFilter("td")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtnode.AsHtml();
            string inviteCtx = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString();
            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            string buildUnit = inviteCtx.GetBuildRegex();
            string prjAddress = inviteCtx.GetAddressRegex();
            string specType = "其他";
            string msgType = "深圳市三方诚信招标有限公司";
            string inviteType = prjName.GetInviteBidType();

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Collect attachment links from the detail content.
            dtlparser = new Parser(new Lexer(HtmlTxt));
            NodeList FileTag = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (FileTag != null && FileTag.Count > 0)
            {
                for (int k = 0; k < FileTag.Count; k++)
                {
                    ATag a = FileTag[k].GetATag();
                    if (a == null || !a.IsAtagAttach())
                    {
                        continue;
                    }

                    // Links without "http" are treated as relative to the site root.
                    string link = a.Link.ToLower().Contains("http") ? a.Link : "http://www.sfcx.cn/" + a.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the bid-result listing at <c>SiteUrl</c>, paging through ASP.NET postbacks, and turns
/// each entry's detail page into a <c>BidInfo</c> record. Attachment links found in the detail
/// content are appended to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records; an empty list (never null) when the first listing page cannot be
/// downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    string html = string.Empty;
    string cookiestr = string.Empty;
    int pageInt = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        // Return the empty list rather than null so callers can treat a failed crawl the
        // same way as an empty one.
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "zbjgmore2_Pager")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("总页数:", "当前");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: when the pager text cannot be parsed only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Later pages are requested by posting back the state of the page currently held.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT", "__EVENTVALIDATION" },
                new string[] { viewState, "zbjgmore2$Pager", i.ToString(), eventValidation });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "zbjgmore2_DataGrid1")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];

            // The entry link is expected in the second cell; skip rows without it.
            if (tr.ColumnCount < 2)
            {
                continue;
            }

            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, endDate = string.Empty, otherType = string.Empty,
                   prjMgr = string.Empty;

            string prjName = aTag.GetAttribute("title");
            string area = tr.ToNodePlainString().GetRegexBegEnd("【", "】");
            string InfoUrl = "http://www.lnzb.cn" + aTag.Link;

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "tblInfo")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            string bidCtx = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString();

            // The "_Sheet1" table, when present, is flattened into "label:value" lines that
            // the field-extraction helpers are then run against.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList tableNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "_Sheet1")));
            if (tableNode != null && tableNode.Count > 0)
            {
                TableTag tag = tableNode[0] as TableTag;
                StringBuilder ctxBuilder = new StringBuilder();
                if (tag != null)
                {
                    for (int r = 0; r < tag.RowCount; r++)
                    {
                        for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                        {
                            string temp = tag.Rows[r].Columns[c].ToNodePlainString();
                            // Every second cell ends a "label:value" line.
                            ctxBuilder.Append(temp.GetReplace(":,:"));
                            ctxBuilder.Append((c + 1) % 2 == 0 ? "\r\n" : ":");
                        }
                    }
                }

                string ctx = ctxBuilder.ToString();
                buildUnit = ctx.GetBuildRegex();
                bidUnit = ctx.GetBidRegex();
                bidMoney = ctx.GetMoneyRegex();
                code = ctx.GetCodeRegex();
                prjMgr = ctx.GetMgrRegex();
                if (string.IsNullOrEmpty(prjMgr))
                {
                    prjMgr = ctx.GetRegex("项目负责人姓名");
                }
            }

            string beginDate = bidCtx.GetRegex("发布时间").GetDateRegex("yyyy/MM/dd");
            string msgType = "辽宁省建设厅招标投标管理处";
            string specType = "建设工程";
            string bidType = "勘察设计";

            BidInfo info = ToolDb.GenBidInfo("辽宁省", "辽宁省及地市", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            // Collect attachment links from the detail content.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a == null || !a.IsAtagAttach())
                    {
                        continue;
                    }

                    string link = a.Link.ToLower().Contains("http")
                        ? a.Link
                        : "http://www.lnzb.cn/" + a.Link.GetReplace("../,./");

                    // Links longer than 500 bytes are skipped.
                    // NOTE(review): presumably a storage column limit - confirm.
                    if (Encoding.Default.GetByteCount(link) > 500)
                    {
                        continue;
                    }

                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the announcement listing at <c>SiteUrl</c>. Entries whose text contains "结果" or
/// "中标" are stored as <c>BidInfo</c> records; all other entries are stored as
/// <c>InviteInfo</c> records. Both kinds go into the same returned list.
/// </summary>
/// <param name="crawlAll">
/// When false, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the first listing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
    }
    catch
    {
        return list;
    }

    // The pager text contains "共N页"; N is the total number of pages.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "downdown")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode.AsString().ToNodeString();
            Regex reg = new Regex(@"共[^页]+页");
            string page = reg.Match(temp).Value.Replace("共", "").Replace("页", "");
            pageInt = Convert.ToInt32(page);
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://lz.liannan.gov.cn/gcjs/dept.jsp?deptId=0240&pageNo=" + i.ToString(), Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "infoListDiv")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        parser = new Parser(new Lexer(nodeList.ToHtml()));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "totle_info")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string prjType = viewList[j].ToPlainTextString().ToNodeString();
            bool isBidResult = prjType.Contains("结果") || prjType.Contains("中标");

            // The entry text is "<title><date>". string.Replace throws on an empty or null
            // search value, so the date is only stripped when one was actually found.
            string beginDate = prjType.GetDateRegex();
            string prjName = string.IsNullOrEmpty(beginDate) ? prjType : prjType.Replace(beginDate, "");
            string typeName = prjName.GetInviteBidType();
            string InfoUrl = "http://lz.liannan.gov.cn" + viewList[j].GetATagHref();

            string htlDtl;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                htlDtl = htlDtl.GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "downContent")));
            if (dtlList == null || dtlList.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlList.ToHtml();
            string ctx = HtmlTxt.ToCtxString();
            string endDate = string.Empty, otherType = string.Empty;
            string msgType = "连南县住房和城乡规划建设局";
            string specType = "建设工程";

            if (isBidResult)
            {
                string prjMgr = string.Empty;
                string bidUnit = ctx.GetBidRegex();
                string buildUnit = ctx.GetBuildRegex();
                string code = ctx.GetCodeRegex();
                string bidMoney = ctx.GetMoneyRegex();

                BidInfo info = ToolDb.GenBidInfo("广东省", "清远市区", "连南县", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, ctx, string.Empty, msgType, typeName, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                list.Add(info);
            }
            else
            {
                string remark = string.Empty;
                string buildUnit = ctx.GetBuildRegex();
                string code = ctx.GetCodeRegex();
                string prjAddress = ctx.GetAddressRegex();

                InviteInfo info = ToolDb.GenInviteInfo("广东省", "清远市区", "连南县", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, ctx, remark, msgType, typeName, specType, otherType, InfoUrl, HtmlTxt);
                list.Add(info);
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the tender-invitation listing at <c>SiteUrl</c>, paging through ASP.NET postbacks,
/// and turns each entry's detail page into an <c>InviteInfo</c> record. Links in the detail
/// page that contain ".doc" (any case) are appended to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the first listing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const int PageSize = 15; // the listing's page size used to derive the page count

    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    string cookiestr = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "PageDataList")));
    if (tdNodes == null)
    {
        return list;
    }

    // The pager text contains "共N条" (total record count); derive the page count from it.
    // NOTE(review): both replaced literals read as a plain space here; one of them may
    // originally have been a different whitespace entity - confirm against the repository.
    string pageTemp = tdNodes.AsString().Replace(" ", "").Replace(" ", "").Trim();
    Regex regpage = new Regex(@"共[0-9]+条");
    try
    {
        int pageCount = int.Parse(regpage.Match(pageTemp).Value.Replace("共", "").Replace("条", "").Trim());
        pageInt = pageCount % PageSize > 0 ? (pageCount / PageSize) + 1 : pageCount / PageSize;
    }
    catch
    {
        // Best effort: when the total cannot be parsed only the first page is crawled.
    }

    // Strips <script> blocks and <?xml .../> declarations from the detail page.
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Later pages are requested by posting back the state of the page currently held.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "cataId", "find_yn", "key_word", "typeId", "__EVENTARGUMENT", "__EVENTTARGET", "__EVENTVALIDATION", "__VIEWSTATE" },
                new string[] { "1,2,3,4,5,6,7,8,", string.Empty, string.Empty, "1,2,3,4,5,6,7,8,", string.Empty, "PageDataList$ctl12$LinkButton1", eventValidation, viewState });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "StaffList"), new TagNameFilter("table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped (the loop has always started at 1).
        for (int j = 1; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];

            // Code, name and date are read from the first three cells; skip shorter rows.
            if (tr.ColumnCount < 3)
            {
                continue;
            }

            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty,
                   HtmlTxt = string.Empty;

            string code = tr.Columns[0].ToPlainTextString().Trim();
            string prjName = tr.Columns[1].ToPlainTextString().Trim();
            string beginDate = tr.Columns[2].ToPlainTextString().Trim();

            NodeList anchors = tr.Columns[1].SearchFor(typeof(ATag), true);
            ATag aTag = (anchors != null && anchors.Count > 0) ? anchors[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }

            string InfoUrl = "http://www.cobo91.com/project/" + aTag.Link;
            string htmldetail;
            try
            {
                // Download the detail page once and derive both variants from it: the raw
                // markup kept as HtmlTxt and the cleaned text used for field extraction.
                string stripped = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).Replace(" ", "");

                Parser dtlparserHTML = new Parser(new Lexer(stripped.Trim()));
                NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "info"), new TagNameFilter("table")));
                HtmlTxt = dtnodeHTML.AsHtml();

                htmldetail = stripped.Replace("</br>", "\r\n").Replace("<br>", "\r\n");
                htmldetail = regexHtml.Replace(htmldetail, "");
            }
            catch
            {
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "info"), new TagNameFilter("table")));
            if (dtnode == null)
            {
                continue;
            }

            string inviteCtx = System.Web.HttpUtility.HtmlDecode(dtnode.AsString().Replace("【打印本页】", "").Replace("【关闭窗口】", "").Replace("版权所有:中邦国际招标&邦迪工程顾问", ""));
            string buildUnit = inviteCtx.GetBuildRegex();
            string prjAddress = inviteCtx.GetAddressRegex();
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = inviteCtx.GetRegex("采购单位,招标代理");
            }
            if (string.IsNullOrEmpty(prjAddress))
            {
                prjAddress = inviteCtx.GetRegex("开标地点");
            }

            string specType = "其他";
            string msgType = "中邦国际招标&邦迪工程顾问";
            string inviteType = ToolHtml.GetInviteTypes(prjName);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Attachments are searched for in the whole cleaned detail page.
            dtlparser.Reset();
            NodeList FileTag = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (FileTag != null && FileTag.Count > 0)
            {
                for (int f = 0; f < FileTag.Count; f++)
                {
                    ATag file = FileTag[f] as ATag;
                    if (file == null || file.Link == null)
                    {
                        continue;
                    }

                    if (file.Link.IndexOf(".DOC", StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        BaseAttach attach = ToolDb.GenBaseAttach(file.ToPlainTextString(), info.Id, "http://www.cobo91.com" + file.Link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls government-procurement bid results. The page at <c>SiteUrl</c> is only used to read
/// the page count; the entries themselves come from a JSON listing endpoint, and each entry's
/// detail page is turned into a <c>BidInfo</c> record. Attachment links found in the detail
/// content are appended to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the page at <c>SiteUrl</c> cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    string html = string.Empty;
    int pageInt = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // The page count is taken from the text between "('" and "')" in the page's scripts.
    // The two markers are swapped for placeholder words so GetRegexBegEnd can cut between them.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("script"), new HasAttributeFilter("type", "text/javascript")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string scriptText = pageNode.AsString().GetCtxBr();
            string marked = scriptText.Replace("('", "心情").Replace("')", "你猜");
            pageInt = int.Parse(marked.GetRegexBegEnd("心情", "你猜"));
        }
        catch
        {
            // Best effort: when the count cannot be parsed only the first page is crawled.
        }
    }

    JavaScriptSerializer serializer = new JavaScriptSerializer();

    for (int i = 1; i <= pageInt; i++)
    {
        NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
            new string[] { "fcInfotitle", "currentPage" },
            new string[] { "", i.ToString() });

        Dictionary<string, object> smsTypeJson;
        try
        {
            html = this.ToolWebSite.GetHtmlByUrl("https://www.dgzb.com.cn/ggzy/website/WebPagesManagement/TradeInfo/GovProcurement/findListByPage?fcInfotype=7&openbidbelong=ZJ", nvc, Encoding.UTF8);
            smsTypeJson = serializer.DeserializeObject(html) as Dictionary<string, object>;
        }
        catch
        {
            // Download failed or the response was not valid JSON: skip this page only.
            continue;
        }

        if (smsTypeJson == null)
        {
            continue;
        }

        foreach (KeyValuePair<string, object> obj in smsTypeJson)
        {
            // Only array-valued members carry entries; other members are ignored.
            object[] array = obj.Value as object[];
            if (array == null)
            {
                continue;
            }

            foreach (object arrValue in array)
            {
                Dictionary<string, object> dic = arrValue as Dictionary<string, object>;
                if (dic == null)
                {
                    continue;
                }

                string bidUnit = string.Empty, bidMoney = string.Empty, endDate = string.Empty,
                       bidCtx = string.Empty, prjMgr = string.Empty, otherType = string.Empty;

                string code = Convert.ToString(dic["fcTendersn"]);
                string prjName = Convert.ToString(dic["fcInfotitle"]);
                string beginDate = Convert.ToString(dic["fcInfostartdate"]).GetDateRegex("yyyy-MM-dd");
                string xu = Convert.ToString(dic["publishinfoid"]);
                string InfoUrl = "https://www.dgzb.com.cn/ggzy/website/WebPagesManagement/TradeInfo/GovProcurement/govdetail?publishinfoid=" + xu + "&fcInfotype=7";

                string htmldtl;
                try
                {
                    htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
                }
                catch
                {
                    continue;
                }

                // Locate the detail content: a centered table first, then div.content,
                // and finally the whole body.
                bool isTable = true;
                parser = new Parser(new Lexer(htmldtl));
                NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("align", "center")));
                if (dtlNode == null || dtlNode.Count < 1)
                {
                    isTable = false;
                    parser.Reset();
                    dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "content")));
                }
                if (dtlNode == null || dtlNode.Count < 1)
                {
                    isTable = false;
                    parser = new Parser(new Lexer(htmldtl));
                    dtlNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
                }
                if (dtlNode == null || dtlNode.Count < 1)
                {
                    continue;
                }

                string HtmlTxt = dtlNode.ToHtml();
                if (isTable)
                {
                    // Flatten the table into "label:value" lines, one per row.
                    TableTag dtlTable = dtlNode[0] as TableTag;
                    int rowCount = dtlTable == null ? 0 : dtlTable.RowCount;
                    for (int d = 0; d < rowCount; d++)
                    {
                        try
                        {
                            bidCtx += dtlTable.Rows[d].Columns[0].ToPlainTextString().Replace(":", "").Replace(":", "") + ":";
                            bidCtx += dtlTable.Rows[d].Columns[1].ToPlainTextString() + "\r\n";
                        }
                        catch
                        {
                            // Rows with fewer than two cells are skipped.
                        }
                    }
                }
                if (string.IsNullOrEmpty(bidCtx))
                {
                    bidCtx = HtmlTxt.ToCtxString();
                }

                bidCtx = bidCtx.GetCtxBr();
                string buildUnit = bidCtx.GetBuildRegex();
                string bidType = prjName.GetInviteBidType();
                // A value still containing "&#" is discarded.
                // NOTE(review): presumably an undecoded HTML character reference - confirm.
                if (buildUnit.Contains("&#"))
                {
                    buildUnit = string.Empty;
                }

                // The "caigou_table" table pairs header cells (row 0) with values (row 1).
                string ctx = string.Empty;
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList bidUnitNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "caigou_table")));
                if (bidUnitNode != null && bidUnitNode.Count > 0)
                {
                    TableTag unitTable = bidUnitNode[0] as TableTag;
                    try
                    {
                        for (int c = 0; c < unitTable.Rows[0].ColumnCount; c++)
                        {
                            ctx += unitTable.Rows[0].Columns[c].ToNodePlainString() + ":";
                            ctx += unitTable.Rows[1].Columns[c].ToNodePlainString() + "\r\n";
                        }
                    }
                    catch
                    {
                        // Keep whatever was gathered before the table ran out of rows or cells.
                    }
                    bidUnit = ctx.GetBidRegex();
                    bidMoney = ctx.GetMoneyRegex();
                }
                if (bidUnit.Contains("&#"))
                {
                    bidUnit = string.Empty;
                }

                string msgType = "东莞市政府采购";
                string specType = "政府采购";

                BidInfo info = ToolDb.GenBidInfo("广东省", "东莞市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                list.Add(info);

                // Collect attachment links from the detail content.
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList aTagNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aTagNode != null && aTagNode.Count > 0)
                {
                    for (int k = 0; k < aTagNode.Count; k++)
                    {
                        ATag aTag = aTagNode[k].GetATag();
                        if (aTag != null && aTag.IsAtagAttach())
                        {
                            BaseAttach attach = ToolDb.GenBaseAttach(aTag.LinkText, info.Id, aTag.Link);
                            base.AttachList.Add(attach);
                        }
                    }
                }

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the bid-result grid at <c>SiteUrl</c>, paging through ASP.NET postbacks, and turns
/// each row's detail page into a <c>BidInfo</c> record. Details are read from the first table
/// inside the "nr" span when there is one, otherwise from the page's "MsoNormal" paragraphs.
/// Entries without a recognisable date in the detail text are skipped.
/// </summary>
/// <param name="crawlAll">
/// When false, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the first listing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
    }
    catch
    {
        return list;
    }

    // The pager cell contains "共N页"; N is the total number of pages.
    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("colSpan", "6")));
    if (nodeList != null && nodeList.Count > 0)
    {
        Regex regexPage = new Regex(@"共\d+页");
        int parsedPage;
        // TryParse keeps the single-page default when the pager text does not match.
        if (int.TryParse(regexPage.Match(nodeList.AsString()).Value.Trim(new char[] { '共', '页' }), out parsedPage))
        {
            page = parsedPage;
        }
    }

    // Date formats tried, in this order, on the detail text.
    Regex regDate = new Regex(@"\d{4}年\d{1,2}月\d{1,2}日");
    Regex regDate1 = new Regex(@"\d{4}年\d{1,2}月\d{1,2}、\d{1,2}日");
    Regex regDate3 = new Regex(@"\d{4}年\d{1,2}月\d{1,2}至\d{1,2}日");
    Regex regDate2 = new Regex(@"\d{4} 年 \d{1,2} 月 \d{1,2} 日");
    // NOTE: this pattern needs at least two digits, so a lone single digit does not match.
    Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            // Later pages are requested by posting back the state of the page currently held.
            string viewState = this.ToolWebSite.GetAspNetViewState(htl);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(htl);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "key", "AxGridView1$ctl23$ctl07", "AxGridView1$ctl23$pageList", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION" },
                new string[] { "AxGridView1$ctl23$ctl03", string.Empty, viewState, string.Empty, "20", (i - 1).ToString(), string.Empty, eventValidation });
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "AxGridView1")));
        if (tableNodeList == null || tableNodeList.Count == 0)
        {
            continue;
        }

        TableTag table = (TableTag)tableNodeList[0];

        // The first and the last row are not data rows and are skipped.
        for (int j = 1; j < table.RowCount - 1; j++)
        {
            TableRow tr = table.Rows[j];

            // Columns 2..5 are read below; skip rows that are too short.
            if (tr.ColumnCount < 6)
            {
                continue;
            }

            string buildUnit = string.Empty, bidMoney = string.Empty, endDate = string.Empty,
                   bidType = string.Empty, bidCtx = string.Empty, prjMgr = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            string code = tr.Columns[2].ToPlainTextString().Trim();
            string prjName = tr.Columns[3].ToPlainTextString().Trim();
            string bidUnit = tr.Columns[4].ToPlainTextString().Trim();

            NodeList anchors = tr.Columns[5].SearchFor(typeof(ATag), true);
            ATag aTag = (anchors != null && anchors.Count > 0) ? anchors[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }

            string InfoUrl = "http://www.yjgcjy.cn/" + aTag.Link;
            string htmldetail;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default).Replace(" ", "");
            }
            catch (Exception)
            {
                Logger.Error("BidYJHLJS");
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "nr")));
            if (dtnode == null || dtnode.Count == 0)
            {
                // Without the content span no date can be found, so the entry is skipped.
                continue;
            }

            // Find the publish date: plain "Y年M月D日" first, then the "D、D日" and "D至D日"
            // ranges (cut before the separator), then the spaced variant.
            string regd = dtnode.AsString().Replace(":", "").Replace("。", "").Trim();
            string beginDate = regDate.Match(regd).ToString();
            if (beginDate == "")
            {
                Match rangeMatch = regDate1.Match(regd);
                if (rangeMatch.Success)
                {
                    beginDate = rangeMatch.Value.Remove(rangeMatch.Value.IndexOf("、")).Trim();
                }
            }
            if (beginDate == "")
            {
                Match rangeMatch = regDate3.Match(regd);
                if (rangeMatch.Success)
                {
                    beginDate = rangeMatch.Value.Remove(rangeMatch.Value.IndexOf("至")).Trim();
                }
            }
            if (beginDate == "")
            {
                beginDate = regDate2.Match(regd).ToString().Trim();
            }
            if (beginDate == "")
            {
                continue;
            }

            HtmlTxt = dtnode.AsHtml();
            NodeList detailTables = dtnode.SearchFor(typeof(TableTag), true);
            TableTag tabledetail = (detailTables != null && detailTables.Count > 0) ? detailTables[0] as TableTag : null;

            if (tabledetail != null && tabledetail.RowCount > 0)
            {
                // Flatten the table: cells are consumed in (label, value) pairs, each pair
                // written as "label:value:"; a trailing unpaired cell gets an empty value.
                StringBuilder ctxBuilder = new StringBuilder();
                for (int r = 0; r < tabledetail.RowCount; r++)
                {
                    TableRow trdetail = tabledetail.Rows[r];
                    for (int c = 0; c < trdetail.ColumnCount; c += 2)
                    {
                        string tr1 = trdetail.Columns[c].ToPlainTextString().Trim();
                        string tr2 = c + 1 < trdetail.ColumnCount
                            ? trdetail.Columns[c + 1].ToPlainTextString().Trim()
                            : string.Empty;
                        ctxBuilder.Append(tr1).Append(":").Append(tr2).Append(":");
                    }
                    ctxBuilder.Append("\r\n");
                }
                bidCtx = ctxBuilder.ToString().Replace("(盖章)", "").Replace("(元)", "").Replace("(元)", "").Replace("(盖章)", "").Trim();

                // Tendering party.
                Regex bildUnit = new Regex(@"(招标人|承包人):[^\r\n]+[\r\n]{1}");
                buildUnit = bildUnit.Match(bidCtx).Value.Replace("招标人:", "").Replace(":", "").Replace("承包人", "").Trim();
                if (buildUnit == "")
                {
                    Regex bildUnittwo = new Regex(@"招标人\r\n\r\n\r\n[^\r\n]+[\r\n]{1}");
                    buildUnit = bildUnittwo.Match(bidCtx).Value.Replace("招标人\r\n\r\n\r\n", "").Replace(":", "").Trim();
                }

                // Bid amount; values not marked "万元" are divided by 10000.
                Regex regMoney = new Regex(@"(中标价|发包价|总投资):[^\r\n]+\r\n");
                bidMoney = regMoney.Match(bidCtx).Value.Replace("中标价:", "").Replace("发包价:", "").Replace("总投资:", "").Trim();
                if (bidMoney.Contains(":"))
                {
                    bidMoney = bidMoney.Remove(bidMoney.IndexOf(":")).Trim();
                }
                if (bidMoney.Contains("万元"))
                {
                    bidMoney = bidMoney.Remove(bidMoney.IndexOf("万元")).Trim();
                    bidMoney = regBidMoney.Match(bidMoney).Value;
                }
                else
                {
                    try
                    {
                        bidMoney = (decimal.Parse(regBidMoney.Match(bidMoney).Value) / 10000).ToString();
                        if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                        {
                            bidMoney = "0";
                        }
                    }
                    catch (Exception)
                    {
                        bidMoney = "0";
                    }
                }

                // Project manager; the value ends at the next ":" separator when there is one.
                Regex regprjMgr = new Regex(@"(项目负责人|项目总监|项目经理):[^\r\n]+\r\n");
                prjMgr = regprjMgr.Match(bidCtx).Value.Replace("项目负责人:", "").Replace("项目总监:", "").Replace("项目经理:", "").Trim();
                if (string.IsNullOrEmpty(prjMgr))
                {
                    prjMgr = string.Empty;
                }
                else
                {
                    int separator = prjMgr.IndexOf(":");
                    if (separator >= 0)
                    {
                        prjMgr = prjMgr.Remove(separator);
                    }
                    prjMgr = prjMgr.Trim();
                    // Empty or longer than 12 bytes: point at the detail text instead.
                    if (Encoding.Default.GetByteCount(prjMgr) > 12 || prjMgr == "")
                    {
                        prjMgr = "见中标详细信息";
                    }
                }

                string msgType = "阳江市建设工程交易中心";
                string specType = "建设工程";

                // Fallback when no amount was found: read winner and amount from the
                // second row of the detail table.
                if (bidMoney == "0" && tabledetail.RowCount >= 2)
                {
                    TableRow trdetail = tabledetail.Rows[1];
                    if (trdetail.ChildCount > 2)
                    {
                        try
                        {
                            bidMoney = trdetail.Columns[2].ToPlainTextString().Trim();
                            bidMoney = (decimal.Parse(regBidMoney.Match(bidMoney).Value) / 10000).ToString();
                            bidUnit = trdetail.Columns[1].ToPlainTextString().Trim();
                        }
                        catch (Exception)
                        {
                            bidMoney = "0";
                            bidUnit = "";
                        }
                    }
                }

                // Strip leftover Office XML namespace fragments from the text.
                bidCtx = bidCtx.Replace("<?", "").Replace("xml:namespace prefix = st1 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Replace(":", "").Trim();
                bidCtx = bidCtx.Replace("<?", "").Replace("xml:namespace prefix = o ns = ", "").Replace("urn:schemas-microsoft-com:office:office", "").Replace("/>", "").Replace(":", "").Trim();
                bidCtx = bidCtx.Replace(" xml:namespace prefix = st1 ns = ", "").Trim();

                bidType = ToolHtml.GetInviteTypes(prjName);
                BidInfo info = ToolDb.GenBidInfo("广东省", "阳江市区", "海陵区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                list.Add(info);
            }
            else
            {
                // No detail table: fall back to the Word-style paragraphs of the page.
                Parser parserdetailtwo = new Parser(new Lexer(htmldetail));
                NodeList dtnodetwo = parserdetailtwo.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("P"), new HasAttributeFilter("class", "MsoNormal")));
                HtmlTxt = dtnodetwo.AsHtml();
                bidCtx = dtnodetwo.AsString().Trim();
                bidCtx = bidCtx.Replace("。", "\r\n").Trim();
                bidCtx = System.Web.HttpUtility.HtmlDecode(bidCtx);

                // A label paragraph is followed by its value paragraph.
                StringBuilder textBuilder = new StringBuilder();
                for (int r = 0; r < dtnodetwo.Count; r++)
                {
                    string paragraph = dtnodetwo[r].ToPlainTextString().Trim();
                    bool hasNext = r + 1 < dtnodetwo.Count;
                    if (paragraph == "单位名称" && hasNext)
                    {
                        bidUnit = dtnodetwo[r + 1].ToPlainTextString().Trim();
                    }
                    if (paragraph == "总投资额" && hasNext)
                    {
                        bidMoney = dtnodetwo[r + 1].ToPlainTextString().Trim();
                    }
                    textBuilder.Append(paragraph).Append("\r\n");
                }
                string text = textBuilder.ToString();

                // NOTE(review): this overwrites the amount read after "总投资额" above, even
                // when the pattern does not match - confirm whether that value was meant to
                // serve as a fallback.
                Regex regMoney = new Regex(@"(投标报价|发包价):[^\r\n]+\r\n");
                bidMoney = regMoney.Match(text).Value.Replace("投标报价:", "").Trim();
                if (bidMoney.Contains("万元"))
                {
                    bidMoney = bidMoney.Remove(bidMoney.IndexOf("万元")).Trim();
                    bidMoney = regBidMoney.Match(bidMoney).Value;
                }
                else
                {
                    try
                    {
                        bidMoney = (decimal.Parse(regBidMoney.Match(bidMoney).Value) / 10000).ToString();
                        if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                        {
                            bidMoney = "0";
                        }
                    }
                    catch (Exception)
                    {
                        bidMoney = "0";
                    }
                }

                string msgType = "阳江市建设工程交易中心";
                string specType = "建设工程";

                // Strip leftover Office XML namespace fragments from the text.
                bidCtx = bidCtx.Replace("<?", "").Replace("xml:namespace prefix = st1 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Replace(":", "").Trim();
                bidCtx = bidCtx.Replace("<?", "").Replace("xml:namespace prefix = o ns = ", "").Replace("urn:schemas-microsoft-com:office:office", "").Replace("/>", "").Replace(":", "").Trim();
                bidCtx = bidCtx.Replace(" xml:namespace prefix = st1 ns = ", "").Trim();

                prjName = ToolDb.GetPrjName(prjName);
                BidInfo info = ToolDb.GenBidInfo("广东省", "阳江市区", "海陵区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                list.Add(info);
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the tender announcement listing of www.gzjyfw.gov.cn page by page, downloads the
/// detail page of every listed announcement and turns it into an <c>InviteInfo</c> record.
/// Attachment links found in the detail content are appended to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected;
/// when <c>true</c>, every listing page is processed.
/// </param>
/// <returns>
/// The collected <c>InviteInfo</c> records; an empty list when the first listing page
/// cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        // Without the first listing page there is nothing to crawl.
        return list;
    }

    // The page count is read from the pager text between "/" and "页".
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "pages-list")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            pageInt = int.Parse(pageNode.AsString().GetRegexBegEnd("/", "页"));
        }
        catch
        {
            // Pager text not parseable: keep the default of a single page.
        }
    }

    // Suffixes after which the extracted owner name is cut off, applied in this order.
    string[] unitSuffixes = { "运输局", "管理局", "公司" };

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            string pageUrl = "http://www.gzjyfw.gov.cn/gcms/queryZjt_" + i + ".jspx?title=&businessCatalog=&businessType=JYGG&inDates=0&ext=&origin=ALL";
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(pageUrl);
            }
            catch
            {
                // Skip a listing page that cannot be downloaded.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(
                    new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("id", "news_list1")),
                    true),
                new TagNameFilter("li")));
        if (listNode == null)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            ATag aTag = listNode[j].GetATag();
            if (aTag == null)
            {
                // A list item without a link has no detail page to follow.
                continue;
            }

            string code = string.Empty, buildUnit = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty,
                   endDate = string.Empty, remark = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            string prjName = aTag.GetAttribute("title");
            string beginDate = listNode[j].ToPlainTextString().GetDateRegex();
            string area = listNode[j].GetSpan().ToNodePlainString();
            string InfoUrl = aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                // Skip an announcement whose detail page cannot be downloaded.
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "contents")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            HtmlTxt = System.Web.HttpUtility.HtmlDecode(dtlNode.AsHtml()).Replace(" ", "");
            inviteCtx = HtmlTxt.ToCtxString().Replace(" ", "");
            code = inviteCtx.GetCodeRegex();
            buildUnit = inviteCtx.GetBuildRegex();
            prjAddress = inviteCtx.GetAddressRegex();
            specType = "建设工程";
            inviteType = prjName.GetInviteBidType();
            msgType = "贵州省住房和城乡建设厅";

            // Drop whatever trails a known organisation suffix in the extracted owner name.
            foreach (string suffix in unitSuffixes)
            {
                int suffixAt = buildUnit.IndexOf(suffix, System.StringComparison.Ordinal);
                if (suffixAt >= 0)
                {
                    buildUnit = buildUnit.Remove(suffixAt) + suffix;
                }
            }
            // Everything from "招标代理" onwards is removed, including the marker itself.
            int agentAt = buildUnit.IndexOf("招标代理", System.StringComparison.Ordinal);
            if (agentAt >= 0)
            {
                buildUnit = buildUnit.Remove(agentAt);
            }

            InviteInfo info = ToolDb.GenInviteInfo("贵州省", "贵州省及地市", area, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Register every attachment link found inside the announcement body.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null)
            {
                for (int a = 0; a < aNode.Count; a++)
                {
                    ATag fileTag = aNode[a].GetATag();
                    if (fileTag == null || !fileTag.IsAtagAttach())
                    {
                        continue;
                    }

                    string link = fileTag.Link.Contains("http")
                        ? fileTag.Link
                        : "http://www.gzjyfw.gov.cn/" + fileTag.Link;
                    base.AttachList.Add(ToolDb.GenBaseAttach(fileTag.LinkText, info.Id, link));
                }
            }

            // ">=" so that exactly MaxCount records are returned (the former ">" returned one extra).
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the tender list found in the first <c>div.f-l</c> block of the site's start page
/// (a single page, no paging), downloads each announcement from www.sec.com.cn and turns it
/// into an <c>InviteInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected <c>InviteInfo</c> records; an empty list when the start page cannot be
/// downloaded or contains no list container.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookiestr);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(htl));
    NodeList ulNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "f-l")));
    if (ulNode == null || ulNode.Count < 1)
    {
        // Same result as a failed download: an empty list instead of the former null.
        return list;
    }

    parser = new Parser(new Lexer(ulNode[0].ToHtml()));
    NodeList tableNodeList = parser.ExtractAllNodesThatMatch(
        new AndFilter(new HasParentFilter(new TagNameFilter("ul"), true), new TagNameFilter("li")));
    if (tableNodeList == null)
    {
        return list;
    }

    for (int j = 0; j < tableNodeList.Count; j++)
    {
        string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
               prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
               specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
               remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
               otherType = string.Empty, HtmlTxt = string.Empty;

        ATag aTag = tableNodeList[j].GetATag();
        if (aTag == null)
        {
            // A list item without a link has no detail page to follow.
            continue;
        }

        // The link text carries both the title and the publication date; split them.
        prjName = aTag.LinkText.ToNodeString().Replace(" ", "");
        beginDate = prjName.GetDateRegex();
        if (!string.IsNullOrEmpty(beginDate))
        {
            // String.Replace throws when the search string is empty, so only strip a date that was found.
            prjName = prjName.Replace(beginDate, "");
        }
        InfoUrl = "http://www.sec.com.cn/" + aTag.Link.Trim();

        string htmldetail = string.Empty;
        try
        {
            htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace(" ", "");
        }
        catch (Exception)
        {
            // Skip an announcement whose detail page cannot be downloaded.
            continue;
        }

        Parser parserdetail = new Parser(new Lexer(htmldetail));
        NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "g-n-con")));
        HtmlTxt = dtnode.AsHtml();
        inviteCtx = dtnode.AsString().Replace("\t", "").Trim();

        // Drop everything before the first blank line.
        int firstBlankLine = inviteCtx.IndexOf("\r\n\r\n");
        if (firstBlankLine >= 0)
        {
            inviteCtx = inviteCtx.Substring(firstBlankLine).Replace("&", "").Trim();
        }

        // The previous code also chained Replace("", ""), which always throws ArgumentException
        // because the search string is empty; that call is dropped.
        // NOTE(review): if a non-printing glyph was meant there, restore it as an explicit \u escape.
        inviteCtx = inviteCtx.Replace("Ø", "").Trim();

        code = inviteCtx.GetCodeRegex().Replace("能源大厦施工总承包", "").Replace("?", "");
        buildUnit = inviteCtx.GetBuildRegex();
        if (string.IsNullOrEmpty(buildUnit))
        {
            buildUnit = inviteCtx.GetRegex("招 标 人");
        }

        msgType = "深圳能源集团公司";
        specType = "建设工程";

        // NOTE(review): the address was previously extracted from the text and then always
        // overwritten with this placeholder; the dead extraction is removed and the stored
        // value is unchanged. Confirm whether the placeholder was meant as a fallback only.
        prjAddress = "见招标信息";

        // Hard-coded tender number for one announcement whose number is not extracted.
        if (prjName == "宝安区老虎坑垃圾焚烧发电厂二期项目飞灰固化车间屋面墙面材料采购" && code == "")
        {
            code = "0708-124003ZXY205";
        }

        inviteType = ToolHtml.GetInviteTypes(prjName);
        InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
        list.Add(info);

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return list;
        }
    }

    return list;
}
/// <summary>
/// Walks the project-approval listing below <c>SiteUrl</c> (first page plus
/// <c>index_N.html</c> follow-up pages), opens every listed document and builds one
/// <c>ItemPlan</c> record per document whose detail page contains at least two
/// <c>table[width=1000]</c> elements.
/// </summary>
/// <param name="crawlAll">
/// <c>false</c> limits the result to <c>MaxCount</c> records; <c>true</c> processes every page.
/// </param>
/// <returns>
/// The list of <c>ItemPlan</c> records, or <c>null</c> when the first listing page cannot
/// be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList plans = new List<ItemPlan>();

    string listingHtml = string.Empty;
    try
    {
        listingHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return null;
    }

    // Number of listing pages, read from the pager element; stays 1 when it cannot be parsed.
    int totalPages = 1;
    Parser parser = new Parser(new Lexer(listingHtml));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("dt"), new HasAttributeFilter("class", "ny_my")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        try
        {
            string pageText = pagerNodes.AsString().Replace("(", "(").GetRegexBegEnd("(", ",");
            totalPages = int.Parse(pageText);
        }
        catch
        {
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        // Page 1 is already loaded; page N (N > 1) lives at index_(N-1).html.
        if (pageNo > 1)
        {
            try
            {
                listingHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "index_" + (pageNo - 1) + ".html");
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listingHtml));
        NodeList entries = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(
                    new AndFilter(new TagNameFilter("dt"), new HasAttributeFilter("class", "ny_news")),
                    true),
                new TagNameFilter("li")));
        if (entries == null)
        {
            continue;
        }

        for (int idx = 0; idx < entries.Count; idx++)
        {
            // Fields that this site does not provide stay empty.
            string itemAddress = string.Empty;
            string buildUnit = string.Empty;
            string buildNature = string.Empty;
            string totalInvest = string.Empty;
            string planInvest = string.Empty;
            string issuedPlan = string.Empty;
            string investSource = string.Empty;
            string msgUnit = string.Empty;
            string planBeginDate = string.Empty;
            string planEndDate = string.Empty;
            string itemContent = string.Empty;

            INode entry = entries[idx];
            ATag link = entry.GetATag();
            string itemName = link.LinkText;
            string planDate = entry.ToPlainTextString().GetDateRegex();

            // Relative links are resolved against the approval section of the site.
            string infoUrl;
            if (link.Link.ToLower().Contains("http"))
            {
                infoUrl = link.Link;
            }
            else
            {
                infoUrl = "http://plan.hainan.gov.cn/fzggzl/xmsp/" + link.Link.GetReplace("../,./");
            }

            string detailHtml = string.Empty;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList detailTables = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "1000")));
            if (detailTables == null || detailTables.Count <= 1)
            {
                continue;
            }

            // The record body is the first two matching tables concatenated.
            string ctxHtml = detailTables[0].ToHtml() + detailTables[1].ToHtml();
            string itemCtx = ctxHtml.ToCtxString();
            string approvalUnit = itemCtx.GetRegex("发文机构");
            string itemCode = itemCtx.GetRegex("索引号");
            string approvalCode = itemCtx.GetRegex("文号");
            string approvalDate = itemCtx.GetDateRegex("yyyy年MM月dd日");
            string planType = "项目审批信息";
            string msgType = "海南省发展和改革委员会";

            ItemPlan plan = ToolDb.GenItemPlan("海南省", "海南省及地市", "", itemCode, itemName, itemAddress, buildUnit, buildNature, totalInvest, planInvest, issuedPlan, investSource, approvalUnit, approvalDate, approvalCode, msgUnit, planDate, planType, planBeginDate, planEndDate, ctxHtml, itemCtx, itemContent, msgType, infoUrl);
            plans.Add(plan);

            if (!crawlAll && plans.Count >= this.MaxCount)
            {
                return plans;
            }
        }
    }

    return plans;
}
/// <summary>
/// Crawls the tender listing at <c>SiteUrl</c>: the first page is fetched with a plain
/// request, further pages by posting the search form with an increasing
/// <c>currentPage</c> value. Each linked announcement is downloaded and turned into an
/// <c>InviteInfo</c> record; attachment links are appended to <c>base.AttachList</c>.
/// </summary>
/// <remarks>
/// The total page count is not read from the page. Paging therefore continues until a
/// page cannot be downloaded or yields no linked entry. The former loop condition
/// (<c>i &gt;= pageInt</c> with <c>pageInt</c> fixed at 1) could never become false, so a
/// full crawl did not terminate.
/// </remarks>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected <c>InviteInfo</c> records, or <c>null</c> when the first listing page
/// cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return null;
    }

    for (int i = 1; ; i++)
    {
        if (i > 1)
        {
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "fullText", "pubDate", "infoClassCodes", "normIndustry", "zoneCode", "fundSourceCodes", "poClass", "rangeType", "currentPage" },
                new string[] { "", "", "0105", "", "", "", "", "", i.ToString() });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8);
            }
            catch
            {
                // No page count is known, so a page that cannot be fetched ends the crawl
                // instead of being skipped (skipping would retry without bound).
                break;
            }
        }

        Parser parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "as-pager")));

        // Becomes true once the page yields at least one entry that carries a link.
        bool pageHadEntries = false;

        if (listNode != null)
        {
            for (int j = 0; j < listNode.Count; j++)
            {
                INode node = listNode[j];
                ATag aTag = node.GetATag();
                if (aTag == null)
                {
                    continue;
                }
                pageHadEntries = true;

                string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                       prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                       specType = string.Empty, endDate = string.Empty, remark = string.Empty,
                       msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                string beginDate = node.ToPlainTextString().GetDateRegex();
                string InfoUrl = aTag.Link;

                string htmldtl = string.Empty;
                try
                {
                    htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
                }
                catch
                {
                    // Skip an announcement whose detail page cannot be downloaded.
                    continue;
                }

                // The same parser is used twice: body container first, then (after Reset) the title.
                parser = new Parser(new Lexer(htmldtl));
                NodeList dtlNode = parser.ExtractAllNodesThatMatch(
                    new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "as-floor-normal")));
                parser.Reset();
                NodeList btNode = parser.ExtractAllNodesThatMatch(
                    new AndFilter(
                        new HasParentFilter(
                            new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "as-article")),
                            true),
                        new TagNameFilter("h3")));

                prjName = btNode.AsString();
                HtmlTxt = dtlNode.ToHtml();

                // Owner and tender number are extracted from a variant of the text in which
                // table cell and row ends are turned into line breaks.
                string cellSeparatedCtx = HtmlTxt.Replace("</td>", "\r\n").Replace("</tr>", "\r\n").ToCtxString().Replace("\r\n\t", "\r\n").Replace("\r\n\r\n", "\r\n");
                buildUnit = cellSeparatedCtx.GetBuildRegex();
                code = cellSeparatedCtx.GetCodeRegex().GetCodeDel();

                // The stored text and the address come from the plain conversion.
                inviteCtx = HtmlTxt.ToCtxString();
                prjAddress = inviteCtx.GetAddressRegex();

                msgType = "中国国际招标网";
                specType = "建设工程";
                inviteType = prjName.GetInviteBidType();

                InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                list.Add(info);

                // Register attachment links (recognised attachments and "downloadfile" URLs).
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null)
                {
                    for (int k = 0; k < aNode.Count; k++)
                    {
                        ATag a = aNode[k] as ATag;
                        if (a == null)
                        {
                            continue;
                        }
                        if (a.IsAtagAttach() || a.Link.Contains("downloadfile"))
                        {
                            string link = a.Link.ToLower().Contains("http")
                                ? a.Link
                                : "http://183.63.34.189/" + a.Link;
                            BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                            base.AttachList.Add(attach);
                        }
                    }
                }

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
        }

        if (!pageHadEntries)
        {
            // A page without any linked entry is treated as the end of the listing.
            break;
        }
    }

    return list;
}
/// <summary>
/// Crawls every listing configured in <c>AllSiteUrl</c> (key = tender type, value = path
/// appended to <c>SiteUrl</c>). For each listing all pages are visited, every linked
/// announcement on jyzx.maoming.gov.cn is downloaded and stored as an <c>InviteInfo</c>
/// record, and attachment links are appended to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// <c>false</c> limits each listing to <c>MaxCount</c> records before moving on to the next
/// listing; <c>true</c> processes every page of every listing.
/// </param>
/// <returns>
/// All records collected so far; a listing whose first page cannot be downloaded ends the
/// whole crawl and returns what has been gathered up to that point.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList invites = new List<InviteInfo>();

    foreach (string tenderType in AllSiteUrl.Keys)
    {
        int collectedForListing = 0;
        bool listingLimitReached = false;
        string listingUrl = this.SiteUrl + AllSiteUrl[tenderType];
        string cookie = string.Empty;

        string pageHtml = string.Empty;
        try
        {
            pageHtml = this.ToolWebSite.GetHtmlByUrl(listingUrl, Encoding.UTF8, ref cookie);
        }
        catch
        {
            return invites;
        }

        // Page count as announced in the paging cell; one page when it cannot be parsed.
        int totalPages = 1;
        Parser parser = new Parser(new Lexer(pageHtml));
        NodeList pagingCells = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "Paging")));
        if (pagingCells != null && pagingCells.Count > 0)
        {
            string pageText = pagingCells.AsString().GetRegexBegEnd("总页数:", "当前");
            try
            {
                totalPages = int.Parse(pageText);
            }
            catch
            {
            }
        }

        for (int pageNo = 1; pageNo <= totalPages && !listingLimitReached; pageNo++)
        {
            if (pageNo > 1)
            {
                try
                {
                    pageHtml = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(listingUrl + "?Paging=" + pageNo.ToString()));
                }
                catch
                {
                    continue;
                }
            }

            parser = new Parser(new Lexer(pageHtml));
            NodeList listTables = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("valign", "top")));
            if (listTables == null || listTables.Count <= 0)
            {
                continue;
            }

            TableTag listTable = (TableTag)listTables[0];
            // The last two rows of the table are never read.
            for (int rowIdx = 0; rowIdx < listTable.RowCount - 2; rowIdx++)
            {
                TableRow row = listTable.Rows[rowIdx];
                ATag titleLink = row.GetATag();
                if (titleLink == null)
                {
                    continue;
                }

                string endDate = string.Empty;
                string remark = string.Empty;
                string otherType = string.Empty;

                string prjName = titleLink.GetAttribute("title");
                string infoUrl = "http://jyzx.maoming.gov.cn" + titleLink.Link;
                string beginDate = row.Columns[2].ToPlainTextString().GetDateRegex();

                string detailHtml = string.Empty;
                try
                {
                    detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
                }
                catch
                {
                    continue;
                }

                parser = new Parser(new Lexer(detailHtml));
                NodeList contentCells = parser.ExtractAllNodesThatMatch(
                    new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
                if (contentCells == null || contentCells.Count <= 0)
                {
                    continue;
                }

                string htmlTxt = contentCells.AsHtml();
                string inviteCtx = htmlTxt.GetReplace("</p>", "\r\n").ToCtxString();
                string buildUnit = inviteCtx.GetBuildRegex();
                string prjAddress = inviteCtx.GetAddressRegex();
                string code = inviteCtx.GetCodeRegex().GetCodeDel();
                // The dictionary key doubles as the tender type of the record.
                string inviteType = tenderType;
                string msgType = "茂名市公共资源交易中心";
                string specType = "建设工程";

                InviteInfo invite = ToolDb.GenInviteInfo("广东省", "茂名市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, infoUrl, htmlTxt);
                invites.Add(invite);
                collectedForListing++;

                // Collect attachment links contained in the announcement body.
                parser = new Parser(new Lexer(htmlTxt));
                NodeList anchors = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (anchors != null && anchors.Count > 0)
                {
                    for (int n = 0; n < anchors.Count; n++)
                    {
                        ATag fileLink = anchors[n] as ATag;
                        if (!fileLink.IsAtagAttach())
                        {
                            continue;
                        }

                        string fileUrl;
                        if (fileLink.Link.Contains("http"))
                        {
                            fileUrl = fileLink.Link;
                        }
                        else
                        {
                            fileUrl = "http://jyzx.maoming.gov.cn/" + fileLink.Link;
                        }
                        base.AttachList.Add(ToolDb.GenBaseAttach(fileLink.LinkText, invite.Id, fileUrl));
                    }
                }

                // Limit reached for this listing: leave both loops and go on with the next listing.
                if (collectedForListing >= this.MaxCount && !crawlAll)
                {
                    listingLimitReached = true;
                    break;
                }
            }
        }
    }

    return invites;
}
/// <summary>
/// Crawls the award-notice listing of zbcg.sziit.edu.cn. The first page comes from
/// <c>SiteUrl</c>; page <c>i</c> (i &gt; 1) is fetched from <c>zbxx1/(pageCount + 1 - i).htm</c>.
/// Every listed notice is downloaded and turned into a <c>BidInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected <c>BidInfo</c> records; an empty list when the first listing page cannot
/// be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    string htl = string.Empty;
    int page = 1;
    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // Page count: text between "/" and "&" in the pager cell.
    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "fanye66953")));
    if (nodeList != null && nodeList.Count > 0)
    {
        try
        {
            string temp = nodeList[0].ToPlainTextString().GetRegexBegEnd("/", "&").ToLower().Replace(" ", "");
            page = int.Parse(temp);
        }
        catch
        {
            // Pager text not parseable: keep the default of a single page.
        }
    }

    // Built once instead of once per notice.
    Regex regeximg = new Regex(@"<IMG[^>]*>");   // strips images from the stored HTML
    Regex regcode = new Regex(@"\w{4}-\w{7}");   // shape of the tender number

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                string url = "http://zbcg.sziit.edu.cn/zbxx1/" + (page + 1 - i).ToString() + ".htm";
                htl = this.ToolWebSite.GetHtmlByUrl(url, Encoding.UTF8);
            }
            catch
            {
                // Skip a listing page that cannot be downloaded.
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table-l")));
        if (tableNodeList == null || tableNodeList.Count == 0)
        {
            continue;
        }

        TableTag table = tableNodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            // Columns[1] (title) and Columns[2] (date) are read below, so three columns are needed.
            if (tr.ColumnCount < 3)
            {
                continue;
            }

            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty,
                   bidMoney = string.Empty, code = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, bidType = string.Empty, specType = string.Empty,
                   InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            prjName = tr.Columns[1].ToNodePlainString();
            if (prjName.Contains("暂停公告"))
            {
                // Suspension notices are not award notices.
                continue;
            }

            beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex("yyyy/MM/dd");
            ATag aTag = tr.GetATag();
            if (aTag == null)
            {
                // A row without a link has no detail page to follow.
                continue;
            }
            InfoUrl = "http://zbcg.sziit.edu.cn/" + aTag.Link;

            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                // Skip a notice whose detail page cannot be downloaded.
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "vsb_newscontent")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            HtmlTxt = regeximg.Replace(dtnode.AsHtml(), "");
            bidCtx = dtnode.AsString().Replace(" ", "").Replace("EndFragment", "").Trim();

            // The tender number is searched only in the text that follows "招标编号".
            int codeLabelAt = bidCtx.IndexOf("招标编号");
            if (codeLabelAt >= 0)
            {
                code = regcode.Match(bidCtx.Substring(codeLabelAt)).Value;
            }

            string normalizedCtx = bidCtx.Replace("\n", "\r\n");
            bidUnit = normalizedCtx.GetBidRegex();
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                bidUnit = normalizedCtx.GetBidRegex(null, false);
            }

            // Classify by project name BEFORE deriving specType. Previously bidType was still
            // empty when compared (so specType was always "其他") and the classifier itself
            // was called with that empty string instead of the project name.
            bidType = ToolHtml.GetInviteTypes(prjName);
            if (bidType == "设备材料" || bidType == "小型施工" || bidType == "专业分包" || bidType == "劳务分包" || bidType == "服务" || bidType == "勘察" || bidType == "设计" || bidType == "监理" || bidType == "施工")
            {
                specType = "建设工程";
            }
            else
            {
                specType = "其他";
            }

            buildUnit = "";
            msgType = "深圳信息职业技术学院";
            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Reads the grid <c>ctl00_Content_GridView1</c> on the page at <c>SiteUrl</c> and creates
/// one <c>NoticeInfo</c> record of type "抽签轮空公示" per data row (enterprise number,
/// enterprise name, suspension start and end).
/// </summary>
/// <remarks>
/// Only the first page is ever downloaded; no request for further pages is implemented.
/// The former loop over the page count parsed that same first page once per page and
/// added every row repeatedly, so the grid is now processed exactly once.
/// </remarks>
/// <param name="crawlAll">
/// When <c>false</c>, processing stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected <c>NoticeInfo</c> records; an empty list when the page cannot be
/// downloaded or contains no grid.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string html = string.Empty;
    int pageInt = 1;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // The page count is only needed to know whether the grid ends with an extra row (see below).
    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("colspan", "7")));
    if (pageList != null && pageList.Count > 0)
    {
        try
        {
            string temp = pageList.AsString().GetRegexBegEnd(",共", "页,");
            pageInt = int.Parse(temp);
        }
        catch
        {
            pageInt = 1;
        }
    }

    parser = new Parser(new Lexer(html));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_Content_GridView1")));
    if (nodeList == null || nodeList.Count == 0)
    {
        return list;
    }

    TableTag table = nodeList[0] as TableTag;
    if (table == null)
    {
        return list;
    }

    // Row 0 is never read; when more than one page exists the last row is skipped as well
    // (presumably the pager row - confirm against the live page).
    int result = pageInt > 1 ? table.RowCount - 1 : table.RowCount;
    for (int j = 1; j < result; j++)
    {
        TableRow tr = table.Rows[j];
        if (tr.ColumnCount < 5)
        {
            // Columns 1-4 are read below; skip rows that do not have them.
            continue;
        }

        string buildUnit = string.Empty;
        string InfoType = "抽签轮空公示";
        string InfoTitle = tr.Columns[2].ToNodePlainString();
        string prjCode = tr.Columns[1].ToNodePlainString();
        string PublistTime = tr.Columns[3].ToPlainTextString();
        // There is no detail page per row; the listing URL is stored as the record URL.
        string InfoUrl = SiteUrl;
        string InfoCtx = "企业编号:" + prjCode + "\r\n企业名称:" + InfoTitle + "\r\n开始暂停时间:" + PublistTime + "\r\n结束暂停时间:" + tr.Columns[4].ToPlainTextString();
        string htmlTxt = InfoCtx;

        NoticeInfo info = ToolDb.GenNoticeInfo("广东省", "深圳市工程", string.Empty, string.Empty, InfoTitle, InfoType, InfoCtx, PublistTime, string.Empty, MsgTypeCosnt.ShenZhenMsgType, InfoUrl, prjCode, buildUnit, string.Empty, string.Empty, string.Empty, string.Empty, htmlTxt);
        list.Add(info);

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return list;
        }
    }

    return list;
}
/// <summary>
/// Walks the project listing of www.ynbidding.net, follows the link of every table row
/// (the first row of the table is not read) and stores the content cell of each detail page
/// as an <c>ItemPlan</c> record of type "项目信息".
/// </summary>
/// <param name="crawlAll">
/// <c>false</c> limits the result to <c>MaxCount</c> records; <c>true</c> processes every page.
/// </param>
/// <returns>
/// The list of <c>ItemPlan</c> records; empty when the first listing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList plans = new List<ItemPlan>();

    string listingHtml = string.Empty;
    try
    {
        listingHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return plans;
    }

    // Number of listing pages, taken from the text between "共" and "页" in the page box.
    int totalPages = 1;
    Parser parser = new Parser(new Lexer(listingHtml));
    NodeList pageBox = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagebox")));
    if (pageBox != null && pageBox.Count > 0)
    {
        try
        {
            string pageText = pageBox.AsString().GetRegexBegEnd("共", "页");
            totalPages = int.Parse(pageText);
        }
        catch
        {
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            try
            {
                listingHtml = this.ToolWebSite.GetHtmlByUrl("http://www.ynbidding.net/classlist.aspx?no-cache=0.04312339340010729&id=685790278180&id=://www.ynbidding.net/list&page=" + pageNo + "&_=");
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listingHtml));
        NodeList listTables = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%")));
        if (listTables == null || listTables.Count <= 0)
        {
            continue;
        }

        TableTag listTable = listTables[0] as TableTag;
        for (int rowIdx = 1; rowIdx < listTable.RowCount; rowIdx++)
        {
            TableRow row = listTable.Rows[rowIdx];
            ATag titleLink = row.GetATag();
            if (titleLink == null)
            {
                continue;
            }

            // Fields that this site does not provide stay empty.
            string itemCode = string.Empty;
            string itemAddress = string.Empty;
            string buildUnit = string.Empty;
            string buildNature = string.Empty;
            string totalInvest = string.Empty;
            string planInvest = string.Empty;
            string issuedPlan = string.Empty;
            string investSource = string.Empty;
            string approvalUnit = string.Empty;
            string approvalDate = string.Empty;
            string approvalCode = string.Empty;
            string msgUnit = string.Empty;
            string planBeginDate = string.Empty;
            string planEndDate = string.Empty;
            string itemContent = string.Empty;

            string itemName = titleLink.LinkText;
            string planDate = row.Columns[0].ToNodePlainString().GetDateRegex("yyyy/MM/dd");
            string infoUrl = "http://www.ynbidding.net" + titleLink.Link;

            string detailHtml = string.Empty;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentCells = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "Content")));
            if (contentCells == null || contentCells.Count <= 0)
            {
                continue;
            }

            string ctxHtml = contentCells.AsHtml();
            string itemCtx = ctxHtml.ToCtxString();
            string planType = "项目信息";
            string msgType = "云南省发展和改革委员会";

            ItemPlan plan = ToolDb.GenItemPlan("云南省", "云南省及地市", "", itemCode, itemName, itemAddress, buildUnit, buildNature, totalInvest, planInvest, issuedPlan, investSource, approvalUnit, approvalDate, approvalCode, msgUnit, planDate, planType, planBeginDate, planEndDate, ctxHtml, itemCtx, itemContent, msgType, infoUrl);
            plans.Add(plan);

            if (!crawlAll && plans.Count >= this.MaxCount)
            {
                return plans;
            }
        }
    }

    return plans;
}
/// <summary>
/// Crawls the tender listing of www.szbhyy.com. Every <c>table[width=730]</c> on a listing
/// page is one entry whose first row carries the title link (column 2) and the date
/// (column 3). The detail page of each entry is downloaded and turned into an
/// <c>InviteInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected <c>InviteInfo</c> records; an empty list when the first listing page
/// cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;
    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
    }
    catch (Exception)
    {
        return list;
    }

    // Page count: the number in the "/<n>页" part of the pager cell.
    Parser parser = new Parser(new Lexer(htl));
    NodeList tableNodeList = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "30")));
    if (tableNodeList != null && tableNodeList.Count > 0)
    {
        try
        {
            Regex regexPage = new Regex(@"/\d+页");
            page = Convert.ToInt32(regexPage.Match(tableNodeList.AsString()).Value.Replace("/", "").Replace("页", "").Trim());
        }
        catch
        {
            page = 1;
        }
    }

    // Built once instead of once per entry.
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
    Regex regCode = new Regex(@"(工 程 编 号|招标编号):[^\r\n]+\r\n");
    Regex regBuidUnit = new Regex(@"(招标人|招标单位|招 标 人)(:|:)[^\r\n]+\r\n");
    Regex regprjAddress = new Regex(@"工 程 地 点(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&sub_type=&page=" + i.ToString()), Encoding.Default);
            }
            catch (Exception)
            {
                // Skip a listing page that cannot be downloaded.
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList entryTables = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "730")));
        if (entryTables == null)
        {
            continue;
        }

        for (int j = 0; j < entryTables.Count; j++)
        {
            // Skip nodes that are not tables or do not have the expected row layout.
            TableTag table = entryTables[j] as TableTag;
            if (table == null || table.RowCount < 1)
            {
                continue;
            }
            TableRow tr = table.Rows[0];
            if (tr.ColumnCount < 4)
            {
                continue;
            }

            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                   prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                   specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                   remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            prjName = tr.Columns[2].ToPlainTextString().Trim();
            beginDate = tr.Columns[3].ToPlainTextString().Trim();

            NodeList titleLinks = tr.Columns[2].SearchFor(typeof(ATag), true);
            if (titleLinks == null || titleLinks.Count == 0)
            {
                // No link in the title cell: nothing to follow.
                continue;
            }
            ATag aTag = titleLinks[0] as ATag;
            if (aTag == null)
            {
                continue;
            }
            InfoUrl = "http://www.szbhyy.com/" + aTag.Link.Trim();

            string htmldetail = string.Empty;
            try
            {
                // First download: the stored HTML of the content block.
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).Replace(" ", "").Trim();
                Parser dtlparserHTML = new Parser(new Lexer(htmldetail));
                NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(
                    new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "content_info")));
                HtmlTxt = dtnodeHTML.AsHtml();

                // Second download: line breaks are turned into "\r\n" before the text is extracted.
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default).Replace(" ", "").Replace("<br/>", "\r\n").Replace("<br />", "\r\n");
            }
            catch (Exception)
            {
                // Skip an entry whose detail page cannot be downloaded.
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "content_info")));
            inviteCtx = regexHtml.Replace(dtnode.AsString(), "");

            code = regCode.Match(inviteCtx).Value.Replace("工 程 编 号:", "").Replace("招标编号:", "").Trim();
            buildUnit = regBuidUnit.Match(inviteCtx).Value.Replace("招标单位:", "").Replace("招标人:", "").Replace("招 标 人:", "").Trim();
            prjAddress = regprjAddress.Match(inviteCtx).Value.Replace("工 程 地 点:", "").Trim();
            msgType = "香港大学深圳医院";

            // Classify by project name BEFORE deriving specType. Previously inviteType was
            // still empty at the comparison below, so specType was always "其他".
            inviteType = ToolHtml.GetInviteTypes(prjName);
            if (inviteType == "设备材料" || inviteType == "小型施工" || inviteType == "专业分包" || inviteType == "劳务分包" || inviteType == "服务" || inviteType == "勘察" || inviteType == "设计" || inviteType == "监理" || inviteType == "施工")
            {
                specType = "建设工程";
            }
            else
            {
                specType = "其他";
            }

            // Placeholders for values that could not be extracted.
            if (prjAddress == "")
            {
                prjAddress = "见招标信息";
            }
            if (buildUnit == "")
            {
                buildUnit = "香港大学深圳医院";
            }

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged notice list at <c>SiteUrl</c>. For every row whose detail page contains the
/// <c>Table4</c> content table a <c>NotifyInfo</c> is generated and saved through <c>ToolDb</c>;
/// when the save succeeds, attachment links found in the detail page's <c>Table1</c> are saved too.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>MaxCount</c> records have been processed.</param>
/// <returns>Always <c>null</c>; records are persisted directly instead of being returned.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int pageInt = 1, sqlCount = 0;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return null;
    }

    // Determine the page count from the pager element; fall back to a single page.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "dataPager")));
    if (pageList != null && pageList.Count > 0)
    {
        try
        {
            string temp = pageList.AsString().GetRegexBegEnd("共有:", "页");
            pageInt = Convert.ToInt32(temp);
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Later pages are requested with a form post that carries the view state of the current page.
            try
            {
                string viewState = this.ToolWebSite.GetAspNetViewState(html);
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                    new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "searcher:txtKeyWord", "searcher:tcInputDateTime:txtDateTime1", "searcher:tcInputDateTime:txtDateTime2", "searcher:ddlProvince", "searcher:ddlCity1", "searcher:ddlCity2" },
                    new string[] { "dataPager", i.ToString(), viewState, "", "", "", "-1", "-1", "-1" }
                );
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "p3")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        // When several "p3" tables are present the second one is used, otherwise the only one.
        TableTag table = (nodeList.Count > 1 ? nodeList[1] : nodeList[0]) as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            string infoScorce = string.Empty;
            string infoType = "政策法规";
            TableRow tr = table.Rows[j];
            string headName = tr.Columns[0].ToNodePlainString();
            string releaseTime = tr.Columns[1].ToPlainTextString().GetDateRegex();
            // The relative detail URL is the quoted argument between the parentheses of the onclick handler.
            string infoUrl = "http://www.sgjsj.gov.cn/sgwebims/" + tr.Columns[0].GetATagValue("onclick").Replace("(", "kdxx").Replace(")", "xxdk").GetRegexBegEnd("kdxx", "xxdk").Replace("\"", "");

            string htldtl = string.Empty;
            try
            {
                htldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htldtl));
            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "Table4")));
            if (dtlList == null || dtlList.Count == 0)
            {
                continue;
            }

            string ctxHtml = dtlList.AsHtml();
            string infoCtx = ctxHtml.ToCtxString();
            string msgType = MsgTypeCosnt.ShaoGuanMsgType;
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "韶关市区", string.Empty, infoCtx, infoType);
            sqlCount++;

            if (ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                parser = new Parser(new Lexer(htldtl));
                NodeList tabNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "Table1")));
                NodeList aNode = null;
                if (tabNode != null && tabNode.Count > 1)
                {
                    parser = new Parser(new Lexer(tabNode[1].ToHtml()));
                    aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                }
                else if (tabNode != null && tabNode.Count > 0)
                {
                    parser = new Parser(new Lexer(tabNode.AsHtml()));
                    aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                }

                if (aNode != null && aNode.Count > 0)
                {
                    for (int a = 0; a < aNode.Count; a++)
                    {
                        ATag aTag = aNode[a] as ATag;
                        if (aTag == null || !aTag.IsAtagAttach())
                        {
                            continue;
                        }
                        try
                        {
                            BaseAttach obj = ToolHtml.GetBaseAttach("http://www.sgjsj.gov.cn/sgwebims/" + aTag.Link.Replace("../", "").Replace("./", ""), aTag.LinkText, info.Id);
                            if (obj != null)
                            {
                                ToolDb.SaveEntity(obj, string.Empty);
                            }
                        }
                        catch
                        {
                            // Best effort: a failed attachment must not abort the crawl.
                        }
                    }
                }
            }

            // The limit is checked after the save so that the MaxCount-th record is persisted as well
            // (checking before the save dropped it and stored only MaxCount - 1 records).
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
        }
    }
    return null;
}
/// <summary>
/// Parses one list page of bid results: for every data row of the <c>lefttable</c> table the
/// detail page is downloaded, its fields are extracted, a <c>BidInfo</c> is appended to
/// <paramref name="list"/> and matching download links are added to <c>AttachList</c>.
/// </summary>
/// <param name="list">Receives the generated <c>BidInfo</c> entries.</param>
/// <param name="html">HTML of the list page.</param>
/// <param name="crawlAll">When false, processing stops once <paramref name="list"/> holds <c>MaxCount</c> entries.</param>
public void DealHtml(IList list, string html, bool crawlAll)
{
    Parser parserDtl = new Parser(new Lexer(html));
    NodeList aNodes = parserDtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
    if (aNodes == null || aNodes.Count == 0)
    {
        return;
    }
    TableTag table = aNodes[0] as TableTag;
    if (table == null)
    {
        return;
    }

    // The first and the last row of the table are not data rows and are skipped.
    for (int t = 1; t < table.RowCount - 1; t++)
    {
        string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, bidType = string.Empty,
               beginDate = string.Empty, endDate = string.Empty, InfoUrl = string.Empty, ctx = string.Empty,
               bidUnit = string.Empty, bidMoney = string.Empty, HtmlTxt = string.Empty;

        TableRow tr = table.Rows[t] as TableRow;
        NodeList rowLinks = tr.SearchFor(typeof(ATag), true);
        ATag aTag = (rowLinks != null && rowLinks.Count > 0) ? rowLinks[0] as ATag : null;
        if (aTag == null)
        {
            // A row without a link has no detail page to crawl.
            continue;
        }
        InfoUrl = aTag.Link;
        prjName = CleanCellText(tr.Columns[1].ToPlainTextString());
        endDate = CleanCellText(tr.Columns[2].ToPlainTextString());

        string htmlDtl = string.Empty;
        try
        {
            htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
        }
        catch (Exception)
        {
            continue;
        }

        // Strip script blocks and xml processing instructions before parsing.
        Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
        htmlDtl = regexHtml.Replace(htmlDtl, "");
        Parser parserCtx = new Parser(new Lexer(htmlDtl));
        NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "printTb lefttable")));
        if (ctxNode != null && ctxNode.Count > 0)
        {
            Parser parserdiv = new Parser(new Lexer(htmlDtl));
            NodeList aNodesdiv = parserdiv.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "biuuu_button")));
            HtmlTxt = ctxNode.AsHtml();
            string buttonHtml = aNodesdiv != null ? aNodesdiv.AsHtml() : string.Empty;
            if (!string.IsNullOrEmpty(buttonHtml))
            {
                // string.Replace throws ArgumentException for an empty search string,
                // so the button block is only removed when it is actually present.
                HtmlTxt = HtmlTxt.Replace(buttonHtml, "");
            }
            HtmlTxt = HtmlTxt.Trim();

            TableTag tabTag = ctxNode[0] as TableTag;
            string tableText = tabTag.ToPlainTextString();

            string startTime = CleanCellText(tabTag.Rows[1].Columns[0].ToPlainTextString());
            Regex regex = new Regex(@"时间:\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}");
            beginDate = regex.Match(startTime).Value.Replace("时间:", "");

            Regex regexcode = new Regex("(工程编号|项目编号):[^\r\n]+[\r\n]{1}");
            Match match = regexcode.Match(tableText);
            if (match.Value.Length > 0)
            {
                code = CleanCellText(match.Value.Substring(match.Value.IndexOf(":") + 1));
            }

            Regex regexBuildUnit = new Regex("(中标人|中标单位):[^\r\n]+[\r\n]{1}");
            Match matchBuildUnit = regexBuildUnit.Match(tableText);
            if (matchBuildUnit.Value.Length > 0)
            {
                buildUnit = CleanCellText(matchBuildUnit.Value.Substring(matchBuildUnit.Value.IndexOf(":") + 1));
            }

            Regex regexbidUnit = new Regex("(招标人|建设单位):[^\r\n]+[\r\n]{1}");
            Match matchbidUnit = regexbidUnit.Match(tableText);
            if (matchbidUnit.Value.Length > 0)
            {
                bidUnit = CleanCellText(matchbidUnit.Value.Replace("招标人:", "").Replace("建设单位:", ""));
            }

            Regex regexMoney = new Regex("(中标价|其中标价为|中标价格):[^\r\n]+[\r\n]{1}");
            Match matchMoney = regexMoney.Match(tableText);
            if (matchMoney.Value.Length > 0)
            {
                bidMoney = matchMoney.Value.Replace("中标价:", "").Replace("其中标价为:", "").Replace("中标价格:", "").Replace("\r", "");
            }

            // Integer part with an optional fraction. The previous pattern required at least
            // two digits and therefore produced an empty amount for values such as "5万".
            Regex regBidMoney = new Regex(@"[0-9]+([.][0-9]+)?");
            if (bidMoney.Contains("万"))
            {
                // Amount is already expressed in units of 10,000.
                bidMoney = bidMoney.Remove(bidMoney.IndexOf("万")).Trim();
                bidMoney = regBidMoney.Match(bidMoney).Value;
            }
            else
            {
                // Otherwise convert to units of 10,000; anything below 0.1 is stored as "0".
                try
                {
                    bidMoney = (decimal.Parse(regBidMoney.Match(bidMoney).Value) / 10000).ToString();
                    if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                    {
                        bidMoney = "0";
                    }
                }
                catch (Exception)
                {
                    bidMoney = "0";
                }
            }

            // A code longer than 50 bytes is discarded; unit names are cut to 150 bytes.
            if (Encoding.Default.GetByteCount(code) > 50)
            {
                code = "";
            }
            buildUnit = TruncateToByteLimit(buildUnit, 150);
            bidUnit = TruncateToByteLimit(bidUnit, 150);

            ctx = tabTag.Rows[2].Columns[0].ToPlainTextString().Replace(" ", " ").Replace("\r\n\r\n\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            if (ctx.Length > 0)
            {
                Regex regexCtx = new Regex("<!--[^<]+-->");
                ctx = regexCtx.Replace(ctx, "");
            }
        }

        parserCtx.Reset();
        ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "toptd_bai")));
        Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
        if (ctx.Contains("公示开始时间"))
        {
            // Prefer the publication start date stated in the body text.
            beginDate = ctx.Substring(ctx.IndexOf("公示开始时间"));
            Regex regBeDate = new Regex(@"\d{4}年\d{1,2}月\d{1,2}日");
            beginDate = regBeDate.Match(beginDate).Value.Trim();
        }
        if (beginDate == "")
        {
            beginDate = regDate.Match(ctxNode.AsString()).Value.Trim();
        }

        prjName = ToolDb.GetPrjName(prjName);
        bidType = ToolHtml.GetInviteTypes(prjName);
        BidInfo info = ToolDb.GenBidInfo("广东省", "惠州市区", "龙门县", string.Empty, code, prjName, bidUnit, beginDate, buildUnit, beginDate, endDate, ctx, string.Empty, "惠州市建设工程交易中心", bidType, "建设工程", string.Empty, bidMoney, InfoUrl, string.Empty, HtmlTxt);
        list.Add(info);

        // NOTE(review): the parser is rewound before it is reused, mirroring the Reset() above;
        // without it the second extraction presumably starts at the end of the page and finds nothing.
        parserCtx.Reset();
        ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank")));
        NodeList aTagNodes = ctxNode.SearchFor(typeof(ATag), true);
        for (int a = 0; a < aTagNodes.Count; a++)
        {
            ATag fileTage = aTagNodes[a] as ATag;
            if (fileTage != null && fileTage.Link.Contains("http://www.ebc.huizhou.gov.cn/index/loadNewsFile"))
            {
                BaseAttach attach = ToolDb.GenBaseAttach(fileTage.ToPlainTextString(), info.Id, fileTage.Link);
                base.AttachList.Add(attach);
            }
        }

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return;
        }
    }
}

/// <summary>Removes line breaks and tabs from a table cell's text and trims it.</summary>
private static string CleanCellText(string text)
{
    return text.Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
}

/// <summary>
/// Cuts <paramref name="value"/> so that its <c>Encoding.Default</c> byte count does not exceed
/// <paramref name="maxBytes"/>; a null value becomes an empty string.
/// </summary>
private static string TruncateToByteLimit(string value, int maxBytes)
{
    if (string.IsNullOrEmpty(value))
    {
        return string.Empty;
    }
    Encoding encoding = Encoding.Default;
    if (encoding.GetByteCount(value) <= maxBytes)
    {
        return value;
    }
    // Start from the largest possible character count and shrink until the bytes fit; this never
    // asks Substring for more characters than the string has (multi-byte text used to throw here).
    int length = Math.Min(value.Length, maxBytes);
    while (length > 0 && encoding.GetByteCount(value.Substring(0, length)) > maxBytes)
    {
        length--;
    }
    return value.Substring(0, length);
}
/// <summary>
/// Crawls the paged notice grid at <c>SiteUrl</c>. Each row's detail page is downloaded, a
/// <c>NotifyInfo</c> is generated and saved through <c>ToolDb</c>, and - when the save succeeds -
/// every embedded image and every attachment link of the notice body is saved as a <c>BaseAttach</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>MaxCount</c> records have been processed.</param>
/// <returns>
/// The (never populated) list when all pages were visited or the first request failed;
/// <c>null</c> when the <c>MaxCount</c> limit ended the crawl early.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<NotifyInfo>();
    int pageInt = 1, sqlCount = 0;
    string html = string.Empty;
    string cookiestr = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default, ref cookiestr);
    }
    catch
    {
        return list;
    }

    // Determine the page count from the pager row; keep 1 when it cannot be parsed.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "gridview_PagerRow")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("/", " ");
            pageInt = int.Parse(temp);
        }
        catch
        {
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Every following page is obtained by posting the pager's image-button fields together
            // with the hidden form fields of the page currently held in html.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__LASTFOCUS", "__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION", "keyTextBox", "PagerControl1:_ctl4", "PagerControl1:_ctl2.x", "PagerControl1:_ctl2.y" },
                new string[] { "", "", "", viewState, "7CE136E4", eventValidation, "", "", "3", "5" }
            );
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MyGridView1")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }
        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped (processing starts at the second row).
        for (int j = 1; j < table.RowCount; j++)
        {
            string infoScorce = string.Empty;
            TableRow tr = table.Rows[j];
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                // No link in the title cell: nothing to crawl for this row.
                continue;
            }
            string headName = aTag.LinkText;
            string releaseTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string infoUrl = aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                Logger.Error(headName);
                Logger.Error(pageInt);
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "PopupBody_context")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            if (Encoding.Default.GetByteCount(headName) > 200)
            {
                // Math.Min keeps Substring in range when fewer than 100 characters already exceed 200 bytes.
                headName = headName.Substring(0, Math.Min(100, headName.Length));
            }
            string ctxHtml = dtlNode.AsHtml();
            string infoCtx = ctxHtml.ToCtxString();

            // Make image URLs absolute inside the stored HTML and remember them for download.
            List<string> listImg = new List<string>();
            parser = new Parser(new Lexer(ctxHtml));
            NodeList imgNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
            if (imgNode != null && imgNode.Count > 0)
            {
                for (int m = 0; m < imgNode.Count; m++)
                {
                    ImageTag img = imgNode[m] as ImageTag;
                    if (img == null)
                    {
                        continue;
                    }
                    string link = "http://publish.bcactc.com" + img.ImageURL;
                    listImg.Add(link);
                    ctxHtml = ctxHtml.GetReplace(img.ImageURL, link);
                }
            }

            string msgType = "北京市建设工程发包承包交易中心";
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "北京市", "北京市区", "", infoCtx, "通知公告");
            sqlCount++;
            if (ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate))
            {
                for (int a = 0; a < listImg.Count; a++)
                {
                    try
                    {
                        // Each image is saved by its own URL (the first URL used to be saved repeatedly).
                        BaseAttach entity = ToolHtml.GetBaseAttach(listImg[a], headName, info.Id);
                        if (entity != null)
                        {
                            ToolDb.SaveEntity(entity, string.Empty);
                        }
                    }
                    catch
                    {
                        // Best effort: a failed image must not abort the crawl.
                    }
                }

                parser = new Parser(new Lexer(ctxHtml));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int k = 0; k < aNode.Count; k++)
                    {
                        ATag fileATag = aNode[k].GetATag();
                        if (fileATag == null || !fileATag.IsAtagAttach())
                        {
                            continue;
                        }
                        BaseAttach obj = null;
                        try
                        {
                            // Relative links are resolved against the publishing host.
                            string fileUrl = fileATag.Link.ToLower().Contains("http")
                                ? fileATag.Link
                                : "http://publish.bcactc.com/" + fileATag.Link;
                            obj = ToolHtml.GetBaseAttach(fileUrl, headName, info.Id);
                        }
                        catch
                        {
                            // Best effort: a failed attachment must not abort the crawl.
                        }
                        if (obj != null)
                        {
                            ToolDb.SaveEntity(obj, string.Empty);
                        }
                    }
                }
            }

            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the paged notice grid at <c>SiteUrl</c>. For each row the detail page is downloaded,
/// a <c>NotifyInfo</c> is generated from its <c>text_contend</c> block and saved through
/// <c>ToolDb</c>; when the save succeeds, links inside the upload-file grid are passed to
/// <c>AddBaseFile</c>.
/// </summary>
/// <param name="crawlAll">When false, at most <c>MaxCount</c> records are processed.</param>
/// <returns>Always <c>null</c>; records are persisted directly instead of being returned.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int pageInt = 1, sqlCount = 0;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return null;
    }

    // Determine the page count from the pager description; keep 1 when it cannot be parsed.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_cph_context_RightList1_GridViewPaging1_lblGridViewPagingDesc")));
    if (pageList != null && pageList.Count > 0)
    {
        string pageStr = pageList[0].ToPlainTextString().Trim();
        try
        {
            Regex regexPage = new Regex(@"页,共[^页]+页");
            Match pageMatch = regexPage.Match(pageStr);
            pageInt = int.Parse(pageMatch.Value.Replace("页,共", "").Replace("页", "").Trim());
        }
        catch
        {
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        // Once the limit is reached nothing more would be saved, so stop before any further
        // request is made (the check used to run only after the next detail page was downloaded).
        if (!crawlAll && sqlCount >= this.MaxCount)
        {
            return null;
        }

        if (i > 1)
        {
            // Jump to page i by posting the pager's "forward to page" form.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "ctl00$ScriptManager1", "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "ctl00$cph_context$RightList1$search", "ctl00$cph_context$RightList1$txtTitle", "ctl00$cph_context$RightList1$GridViewPaging1$txtGridViewPagingForwardTo", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION", "ctl00$cph_context$RightList1$GridViewPaging1$btnForwardToPage", }, new string[] { "ctl00$cph_context$RightList1$update1|ctl00$cph_context$RightList1$GridViewPaging1$btnForwardToPage", "", "", viewState, "标题", "", i.ToString(), "", eventValidation, "GO" });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_RightList1_GridView1")));
        if (dtList == null || dtList.Count == 0)
        {
            continue;
        }
        TableTag table = dtList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped (processing starts at the second row).
        for (int j = 1; j < table.RowCount; j++)
        {
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }

            TableRow tr = table.Rows[j];
            string headName = tr.Columns[1].ToPlainTextString().Trim();
            string releaseTime = tr.Columns[3].ToPlainTextString().Trim();
            string infoScorce = tr.Columns[2].ToPlainTextString().Trim();
            string infoType = "通知公告";

            NodeList titleLinks = tr.Columns[1].SearchFor(typeof(ATag), true);
            ATag aTag = (titleLinks != null && titleLinks.Count > 0) ? titleLinks[0] as ATag : null;
            if (aTag == null)
            {
                // No link in the title cell: nothing to crawl for this row.
                continue;
            }
            string infoUrl = "http://jyzx.cb.gov.cn/LGjyzxWeb/SiteManage/" + aTag.Link;
            if (infoUrl.Contains("%25"))
            {
                // Undo double encoding of the percent sign.
                infoUrl = infoUrl.Replace("%25", "%");
            }

            string htmldetailtxt = string.Empty;
            try
            {
                htmldetailtxt = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8);
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldetailtxt));
            NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "text_contend")));
            if (noList == null || noList.Count == 0)
            {
                continue;
            }

            string ctxHtml = noList.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");
            string infoCtx = noList.AsString().Replace(" ", "").Replace(" ", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
            infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "");
            string msgType = "深圳市龙岗建设工程交易中心";
            infoScorce = infoScorce.Replace(" ", "");
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳龙岗区工程", string.Empty, infoCtx, infoType);

            sqlCount++;
            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            parser = new Parser(new Lexer(ctxHtml));
            NodeList fileList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_JyxxUploadFile1_GridView1")));
            if (fileList == null || fileList.Count == 0)
            {
                continue;
            }

            parser = new Parser(new Lexer(fileList.AsHtml()));
            NodeList aList = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aList == null || aList.Count == 0)
            {
                continue;
            }

            // The anchor search is done once instead of once per loop iteration.
            NodeList fileAnchors = aList.SearchFor(typeof(ATag), true);
            for (int k = 0; k < aList.Count && k < fileAnchors.Count; k++)
            {
                ATag a = fileAnchors[k] as ATag;
                if (a != null)
                {
                    AddBaseFile("http://jyzx.cb.gov.cn/LGjyzxWeb/" + a.Link.Replace("../", ""), a.LinkText, info);
                }
            }
        }
    }
    return null;
}
/// <summary>
/// Crawls the paged tender list (<c>ul.zhaobiao_list</c>) starting at <c>SiteUrl</c>. For every
/// list item the detail page is downloaded, its fields are extracted with regular expressions
/// and an <c>InviteInfo</c> is appended to the returned list.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once 20 entries have been collected.</param>
/// <returns>The collected <c>InviteInfo</c> entries (empty when the first request fails).</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    // Determine the page count from the pager; keep 1 when it is missing or cannot be parsed.
    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "fenye123")));
    if (tdNodes != null && tdNodes.Count > 0)
    {
        string pageTemp = tdNodes.AsString().Replace(" ", "").Trim();
        try
        {
            pageInt = int.Parse(ToolHtml.GetRegexString(pageTemp, "共", "页"));
        }
        catch (Exception)
        {
        }
    }

    // The page loop runs even when no pager was found, so the first page is always crawled
    // (it used to sit inside the pager check and was skipped entirely in that case).
    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode("http://www.sz-otc.com/zhaobiao/index_" + i.ToString()) + ".html", Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "zhaobiao_list")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < nodeList.Count; j++)
        {
            Parser ul = new Parser(new Lexer(nodeList[j].ToHtml()));
            NodeList liList = ul.ExtractAllNodesThatMatch(new TagNameFilter("li"));
            if (liList == null || liList.Count == 0)
            {
                continue;
            }

            // Anchors are looked up once per list; the k-th anchor is taken for the k-th item,
            // which assumes one link per <li>.
            NodeList itemLinks = liList.SearchFor(typeof(ATag), true);
            for (int k = 0; k < liList.Count; k++)
            {
                string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                       inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                       endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                       otherType = string.Empty, HtmlTxt = string.Empty;

                ATag aTag = (itemLinks != null && k < itemLinks.Count) ? itemLinks[k] as ATag : null;
                if (aTag == null)
                {
                    continue;
                }
                InfoUrl = "http://www.sz-otc.com" + aTag.Link;
                prjName = aTag.LinkText.Replace("[新]", "").Replace(" ", "");
                if (prjName.Contains("]"))
                {
                    // Drop a leading "[...]" tag and keep what follows the first closing bracket.
                    prjName = prjName.Substring(prjName.IndexOf(']') + 1);
                }

                string htmldetail = string.Empty;
                try
                {
                    htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                }
                catch
                {
                    // One unreachable detail page only skips this item; returning null here
                    // threw away every entry collected so far.
                    continue;
                }

                Parser dtlparser = new Parser(new Lexer(htmldetail));
                NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "right_content"), new TagNameFilter("div")));
                if (dtnode == null || dtnode.Count == 0)
                {
                    continue;
                }

                HtmlTxt = dtnode.ToHtml();
                inviteCtx = dtnode.AsString().Replace(" ", "").Replace(" ", "");
                // Variant of the text with extra line breaks so that line-based patterns end
                // before the "点击" / "发布人" trailers.
                string invite = inviteCtx.Replace("点击", "\r\n").Replace("发布人", "\r\n");

                if (string.IsNullOrEmpty(prjName))
                {
                    Regex regexName = new Regex(@"(工程名称|项目名称)(:|:)[^\r\n]+\r\n");
                    prjName = regexName.Match(inviteCtx).Value.Replace("工程名称", "").Replace("项目名称", "").Replace(":", "").Replace(":", "").Trim();
                }
                Regex regex = new Regex(@"(工程编号|招标编号)(:|:)[^\r\n]+\r\n");
                code = regex.Match(invite).Value.Replace("工程编号", "").Replace("招标编号", "").Replace(":", "").Replace(":", "").Trim();
                Regex regexAddress = new Regex(@"(地址|项目地址)(:|:)[^\r\n]+\r\n");
                prjAddress = regexAddress.Match(inviteCtx).Value.Replace("地址", "").Replace("项目地址", "").Replace(":", "").Replace(":", "").Trim();
                Regex regexUnit = new Regex(@"(招标单位|招标机构)(:|:)[^\r\n]+\r\n");
                buildUnit = regexUnit.Match(inviteCtx).Value.Replace("招标单位", "").Replace("招标机构", "").Replace(":", "").Replace(":", "").Trim();
                Regex regexCar = new Regex(@"(开始日期|发布日期)(:|:)[^\r\n]+\r\n");
                beginDate = regexCar.Match(invite).Value.Replace("开始日期", "").Replace("发布日期", "").Replace(":", "").Replace(":", "").Trim();
                if (beginDate.Length > 10)
                {
                    // Keep only the first ten characters of the matched date text.
                    beginDate = beginDate.Substring(0, 10);
                }

                specType = "其他";
                msgType = "深圳市东方招标有限公司";
                inviteType = ToolHtml.GetInviteTypes(prjName);
                InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                list.Add(info);

                // NOTE(review): the limit is a literal 20 here, not this.MaxCount - confirm whether that is intended.
                if (!crawlAll && list.Count >= 20)
                {
                    return list;
                }
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the tender list (<c>div.jwRercon &gt; ul &gt; li</c>) at <c>SiteUrl</c>. For every item
/// the detail page is downloaded, its fields are extracted with regular expressions and an
/// <c>InviteInfo</c> is appended to the returned list.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>MaxCount</c> entries have been collected.</param>
/// <returns>The collected <c>InviteInfo</c> entries (empty when the first request fails).</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // Determine the page count from the pager; fall back to a single page.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "jwpage")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode.AsString();
            Regex reg = new Regex(@"/共[^页]+页");
            pageInt = Convert.ToInt32(reg.Match(temp).Value.Replace("/共", "").Replace("页", ""));
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // TODO: fetching page i was never implemented. Stop here instead of parsing the
            // first page again for every page, which only produced duplicate entries and
            // repeated downloads of the same detail pages.
            break;
        }

        parser = new Parser(new Lexer(html));
        NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "jwRercon"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
        if (dtlList == null || dtlList.Count == 0)
        {
            continue;
        }

        // Anchors are looked up once; the j-th anchor is taken for the j-th item,
        // which assumes one link per <li>.
        NodeList itemLinks = dtlList.SearchFor(typeof(ATag), true);
        Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
        for (int j = 0; j < dtlList.Count; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            string itemText = dtlList[j].ToPlainTextString().Trim();
            // The project name is the text before the first "["; without a bracket the whole
            // text is used (Remove(-1) used to throw in that case).
            int bracketAt = itemText.IndexOf('[');
            prjName = bracketAt >= 0 ? itemText.Remove(bracketAt) : itemText;
            beginDate = regDate.Match(itemText).Value;

            ATag aTag = (itemLinks != null && j < itemLinks.Count) ? itemLinks[j] as ATag : null;
            if (aTag == null)
            {
                continue;
            }
            InfoUrl = "http://www.szns.gov.cn" + aTag.Link;

            string htmDtl = string.Empty;
            try
            {
                htmDtl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8);
                // Strip script, style and xml blocks before parsing.
                Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
                htmDtl = regexHtml.Replace(htmDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "hyxzf2")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtl.AsHtml();
            inviteCtx = dtl.AsString().Replace(" ", "").Replace("\n", "\r\n");
            // Text without spaces, used for all field patterns below.
            string compactCtx = inviteCtx.Replace(" ", "");

            Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址)(:|:)[^\r\n]+\r\n");
            prjAddress = regPrjAddr.Match(compactCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace(":", "").Replace(":", "").Trim();
            Regex regBuildUnit = new Regex(@"(招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
            buildUnit = regBuildUnit.Match(compactCtx).Value.Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
            Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");
            code = regPrjCode.Match(compactCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();

            msgType = "深圳市南山区粤海街道办事处";
            if (string.IsNullOrEmpty(prjAddress))
            {
                prjAddress = "见招标信息";
            }
            code = ToolHtml.GetSubString(code, 50);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            specType = "建设工程";
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市南山区粤海街道办事处";
            }
            // The type is always derived from the fixed category below; the former keyword
            // checks on the project name were overwritten by it and have been removed.
            inviteType = ToolHtml.GetInviteType("小型工程");

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "南山区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the paged announcement list at <c>SiteUrl</c>. Items whose title contains "中标" become
/// <c>BidInfo</c> entries, items whose title contains "招标" become <c>InviteInfo</c> entries; all
/// other items are ignored. Attachment links found in the detail HTML are added to <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>MaxCount</c> entries have been collected.</param>
/// <returns>The collected entries (empty when the first request fails).</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl);
    }
    catch
    {
        return list;
    }

    // The page count is read from the text following "page_div'," in the list container.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "content-right fr")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode.AsString().GetRegexBegEnd("page_div',", ",");
            pageInt = int.Parse(temp);
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.qingxi.gov.cn/qingxi/zbxx/list2" + "_" + i + ".shtml", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "content-right fr")), true), new TagNameFilter("li")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            ATag aTag = viewList[j].GetATag();
            if (aTag == null)
            {
                // A list item without a link cannot be crawled.
                continue;
            }
            string beginDate = viewList[j].ToPlainTextString().GetDateRegex();
            string tempName = aTag.LinkText;
            bool isBidResult = tempName.Contains("中标");
            if (!isBidResult && !tempName.Contains("招标"))
            {
                continue;
            }

            string InfoUrl = "http://www.qingxi.gov.cn/" + aTag.Link.GetReplace("./");
            string htmDtl = string.Empty;
            try
            {
                htmDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                // Strip script, style and xml blocks before parsing.
                Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
                htmDtl = regexHtml.Replace(htmDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "zoom")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            string endDate = string.Empty, otherType = string.Empty;
            string msgType = "东莞市清溪镇政府";
            string specType = "政府采购";
            string HtmlTxt = dtl.AsHtml();

            if (isBidResult)
            {
                // Only the text used for field extraction is lower-cased. The stored HTML keeps
                // its original casing so that content and attachment URLs are not altered.
                string bidCtx = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString();
                string prjName = ResolveProjectName(bidCtx, aTag.LinkText, htmDtl);
                string code = bidCtx.GetCodeRegex().GetCodeDel();
                string buildUnit = bidCtx.GetBuildRegex();
                string bidUnit = bidCtx.GetBidRegex();
                string bidMoney = bidCtx.GetMoneyRegex(new string[] { "中标值" });
                if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
                {
                    bidMoney = bidCtx.GetMoneyRegex();
                }
                string prjMgr = bidCtx.GetMgrRegex();
                string bidDate = bidCtx.GetRegex("中标时间").GetDateRegex();
                if (string.IsNullOrWhiteSpace(bidDate))
                {
                    bidDate = beginDate;
                }
                string bidType = prjName.GetInviteBidType();

                BidInfo info = ToolDb.GenBidInfo("广东省", "东莞市区", "清溪镇", string.Empty, code, prjName, buildUnit, bidDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                list.Add(info);
                foreach (ATag a in FindAttachmentAnchors(HtmlTxt))
                {
                    base.AttachList.Add(ToolDb.GenBaseAttach(a.LinkText, info.Id, ToAbsoluteQingxiUrl(a.Link)));
                }
            }
            else
            {
                string remark = string.Empty;
                string inviteCtx = HtmlTxt.GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString();
                string prjName = ResolveProjectName(inviteCtx, aTag.LinkText, htmDtl);
                string inviteType = prjName.GetInviteBidType();
                string prjAddress = inviteCtx.GetAddressRegex();
                string buildUnit = inviteCtx.GetBuildRegex();
                if (string.IsNullOrWhiteSpace(buildUnit))
                {
                    buildUnit = inviteCtx.GetRegex("招标人");
                }
                if (buildUnit.Contains("招标代理"))
                {
                    // Cut off a trailing agency section that was captured with the unit name.
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理"));
                }
                string code = inviteCtx.GetCodeRegex().GetCodeDel();

                InviteInfo info = ToolDb.GenInviteInfo("广东省", "东莞市区", "清溪镇", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                list.Add(info);
                foreach (ATag a in FindAttachmentAnchors(HtmlTxt))
                {
                    base.AttachList.Add(ToolDb.GenBaseAttach(a.LinkText, info.Id, ToAbsoluteQingxiUrl(a.Link)));
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}

/// <summary>
/// Picks the project name: first from the body text, then from the bracketed part of the link
/// text, then from the detail page's title block, and finally the plain link text.
/// </summary>
private static string ResolveProjectName(string ctx, string linkText, string detailHtml)
{
    string name = ctx.GetRegex("项目名称,工程名称");
    if (string.IsNullOrEmpty(name))
    {
        name = linkText.GetRegexBegEnd("【", "】");
    }
    if (string.IsNullOrWhiteSpace(name))
    {
        Parser titleParser = new Parser(new Lexer(detailHtml));
        NodeList nameNode = titleParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "text red font20 con-title padd-t")));
        if (nameNode != null && nameNode.Count > 0)
        {
            name = nameNode[0].ToNodePlainString();
        }
    }
    if (string.IsNullOrWhiteSpace(name))
    {
        name = linkText;
    }
    return name;
}

/// <summary>Returns the anchors in <paramref name="htmlTxt"/> that <c>IsAtagAttach</c> accepts as attachments.</summary>
private static IList FindAttachmentAnchors(string htmlTxt)
{
    IList found = new ArrayList();
    Parser attachParser = new Parser(new Lexer(htmlTxt));
    NodeList anchors = attachParser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
    if (anchors == null)
    {
        return found;
    }
    for (int k = 0; k < anchors.Count; k++)
    {
        ATag a = anchors[k].GetATag();
        if (a != null && a.IsAtagAttach())
        {
            found.Add(a);
        }
    }
    return found;
}

/// <summary>Prefixes the site root unless the link already contains "http".</summary>
private static string ToAbsoluteQingxiUrl(string link)
{
    return link.ToLower().Contains("http") ? link : "http://www.qingxi.gov.cn/" + link;
}
/// <summary>
/// Crawls the paginated notice list at <c>SiteUrl</c> (host zyjy.huizhou.gov.cn) and
/// builds one <c>NoticeInfo</c> per list entry whose detail page contains a
/// <c>div#divZoom</c> content block.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records. An empty list (never null) is returned when the first
/// list page cannot be downloaded, matching the other crawlers in this file.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<NoticeInfo>();

    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        // Previously returned null here; an empty list keeps callers from
        // having to special-case a failed first request.
        return list;
    }

    // Determine the page count from the pager text ("x/N页"); fall back to one page.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination")));
    if (pageList != null && pageList.Count > 0)
    {
        try
        {
            string temp = pageList.AsString();
            pageInt = Convert.ToInt32(temp.GetRegexBegEnd("/", "页"));
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        // Page 1 was already downloaded above.
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&pageNo=" + i.ToString(), Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(
                    new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "div_list")),
                    true),
                new TagNameFilter("ul")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < nodeList.Count; j++)
        {
            string InfoTitle = nodeList[j].GetATagValue("title");
            string PublistTime = nodeList[j].ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://zyjy.huizhou.gov.cn" + nodeList[j].GetATagHref();
            string InfoType = "资审公示";

            string htldtl;
            try
            {
                htldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
            }
            catch
            {
                // Skip entries whose detail page cannot be fetched.
                continue;
            }

            Parser detailParser = new Parser(new Lexer(htldtl));
            NodeList dtList = detailParser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "divZoom")));
            if (dtList == null || dtList.Count == 0)
            {
                continue;
            }

            string htmlTxt = dtList.AsHtml();
            string InfoCtx = htmlTxt.ToCtxString();

            // Project code and build unit are not extracted for this site; they are passed empty.
            NoticeInfo info = ToolDb.GenNoticeInfo("广东省", "惠州市区", string.Empty, string.Empty, InfoTitle, InfoType, InfoCtx, PublistTime, string.Empty, "惠州市公共资源交易中心", InfoUrl, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, htmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Walks the paginated list at <c>SiteUrl</c> (host www.hljztb.com), opens every entry's
/// detail page and turns its <c>table.bidtable</c> into a <c>NoticeInfo</c>; attachment
/// links found in that table are appended to <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, the method returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>The collected records; empty when the first list page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<NoticeInfo>();

    string pageHtml;
    try
    {
        pageHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return list;
    }

    // Total page count is read from the pager text ("x/N页"); defaults to a single page.
    int totalPages = 1;
    Parser parser = new Parser(new Lexer(pageHtml));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "pageZone")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        try
        {
            totalPages = int.Parse(pagerNodes.AsString().GetRegexBegEnd("/", "页"));
        }
        catch
        {
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            try
            {
                pageHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + pageNo);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(pageHtml));
        NodeList entries = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(
                    new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "listZone")),
                    true),
                new TagNameFilter("li")));
        if (entries == null || entries.Count == 0)
        {
            continue;
        }

        for (int idx = 0; idx < entries.Count; idx++)
        {
            INode entry = entries[idx];
            ATag anchor = entry.GetATag();

            string InfoType = "资格预审";
            string InfoTitle = anchor.GetAttribute("title");
            string PublistTime = entry.ToPlainTextString().GetDateRegex();
            // The text between the brackets of the link text is used as the area.
            string area = anchor.LinkText.GetRegexBegEnd("【", "】");
            string InfoUrl = "http://www.hljztb.com/" + anchor.Link;

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList detailTables = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "bidtable")));
            if (detailTables == null || detailTables.Count == 0)
            {
                Logger.Error("无内容");
                Logger.Error(InfoUrl);
                continue;
            }

            string htmlTxt = detailTables.AsHtml();

            // Flatten the table into "label:value" lines: odd-indexed cells end a line,
            // even-indexed cells are followed by a colon.
            string InfoCtx = string.Empty;
            TableTag grid = detailTables[0] as TableTag;
            for (int r = 0; r < grid.RowCount; r++)
            {
                for (int c = 0; c < grid.Rows[r].ColumnCount; c++)
                {
                    string cellText = grid.Rows[r].Columns[c].ToNodePlainString();
                    string separator = (c % 2 == 1) ? "\r\n" : ":";
                    InfoCtx += cellText.GetReplace(":,:") + separator;
                }
            }

            string buildUnit = InfoCtx.GetBuildRegex();
            string prjCode = InfoCtx.GetRegex("编码");

            NoticeInfo info = ToolDb.GenNoticeInfo("黑龙江省", "黑龙江省及地市", area, string.Empty, InfoTitle, InfoType, InfoCtx, PublistTime, string.Empty, "黑龙江住房和城乡建设厅", InfoUrl, prjCode, buildUnit, string.Empty, string.Empty, "建设工程", string.Empty, htmlTxt);
            list.Add(info);

            // Collect attachment links from the detail table.
            parser = new Parser(new Lexer(htmlTxt));
            NodeList anchors = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors != null && anchors.Count > 0)
            {
                for (int k = 0; k < anchors.Count; k++)
                {
                    ATag a = anchors[k] as ATag;
                    if (!a.IsAtagAttach())
                    {
                        continue;
                    }

                    string link;
                    if (a.Link.ToLower().Contains("http"))
                    {
                        link = a.Link;
                    }
                    else
                    {
                        link = "http://www.hljztb.com/" + a.Link.GetReplace("../,./");
                    }

                    // Links longer than 500 bytes (default encoding) are ignored.
                    if (Encoding.Default.GetByteCount(link) > 500)
                    {
                        continue;
                    }

                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the bulletin table at <c>SiteUrl</c> (detail pages on www.ynggzy.net). Pages after
/// the first are requested by posting the table's paging form fields. Each table row becomes
/// an <c>InviteInfo</c>, and attachment links in the detail body go to <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, the method returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>The collected records; empty when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    string cookiestr = string.Empty;

    string pageHtml;
    try
    {
        pageHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookiestr);
    }
    catch
    {
        return list;
    }

    // Page count comes from the toolbar text ("x/N页"); one page is assumed otherwise.
    int totalPages = 1;
    Parser parser = new Parser(new Lexer(pageHtml));
    NodeList toolbar = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "bulletininfotable_toolbarTable")));
    if (toolbar != null && toolbar.Count > 0)
    {
        try
        {
            totalPages = int.Parse(toolbar.AsString().GetRegexBegEnd("/", "页"));
        }
        catch
        {
        }
    }

    string[] formKeys = new string[]
    {
        "ec_i", "bulletininfotable_efn", "bulletininfotable_crd", "bulletininfotable_p",
        "bulletininfotable_s_bulletintitle", "bulletininfotable_s_finishday", "hySort",
        "findAjaxZoneAtClient", "method", "bulletinclass", "bulletininfotable_totalpages",
        "bulletininfotable_totalrows", "bulletininfotable_pg", "bulletininfotable_rd"
    };

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            // The totals are read from the hidden inputs of the most recently loaded page.
            string totalPagesField = ToolHtml.GetHtmlInputValue(pageHtml, "bulletininfotable_totalpages");
            string totalRowsField = ToolHtml.GetHtmlInputValue(pageHtml, "bulletininfotable_totalrows");
            string[] formValues = new string[]
            {
                "bulletininfotable", "", "20", pageNo.ToString(),
                "", "", "1",
                "false", "bulletinMore", "01", totalPagesField,
                totalRowsField, (pageNo - 1).ToString(), "5"
            };
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(formKeys, formValues);
            try
            {
                pageHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        // "tbody" is rewritten to "table" before parsing so the body can be matched as a table tag.
        parser = new Parser(new Lexer(pageHtml.Replace("tbody", "table")));
        NodeList bodyNodes = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "bulletininfotable_table_body")));
        if (bodyNodes == null || bodyNodes.Count == 0)
        {
            continue;
        }

        TableTag grid = bodyNodes[0] as TableTag;
        for (int rowNo = 0; rowNo < grid.RowCount; rowNo++)
        {
            TableRow row = grid.Rows[rowNo];
            string prjName = row.Columns[0].ToNodePlainString();
            string beginDate = row.Columns[1].ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://www.ynggzy.net/bulletin.do?method=showbulletin&bulletin_id=" + row.GetAttribute("id");

            string detailHtml;
            try
            {
                detailHtml = ToolHtml.GetHtmlByUrl(this.SiteUrl, InfoUrl, Encoding.Default);
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList bodyTag = parser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
            if (bodyTag == null || bodyTag.Count == 0)
            {
                continue;
            }

            string HtmlTxt = bodyTag.AsHtml();
            string inviteCtx = HtmlTxt.ToLower().GetReplace("</p>,<br />,<br/>,<br>", "\r\n").ToCtxString();
            string buildUnit = inviteCtx.GetBuildRegex();
            string prjAddress = inviteCtx.GetAddressRegex();
            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            string inviteType = prjName.GetInviteBidType();
            string specType = "政府采购";
            string msgType = "云南省公共资源交易中心";

            // City, end date, remark and "other type" are not extracted here and stay empty.
            InviteInfo info = ToolDb.GenInviteInfo("云南省", "云南省及地市", string.Empty, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, string.Empty, inviteCtx, string.Empty, msgType, inviteType, specType, string.Empty, InfoUrl, HtmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(HtmlTxt));
            NodeList anchors = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors != null && anchors.Count > 0)
            {
                for (int k = 0; k < anchors.Count; k++)
                {
                    ATag a = anchors[k] as ATag;
                    if (!a.IsAtagAttach())
                    {
                        continue;
                    }

                    string link = a.Link.ToLower().Contains("http")
                        ? a.Link
                        : "http://www.ynggzy.net/" + a.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the bid-result list on www.jxsggzy.cn and builds one <c>BidInfo</c> per list entry
/// whose detail page contains a <c>div.article-info</c> block. When that block holds a
/// <c>table[cellpadding=0]</c>, the fields are extracted from the flattened table text;
/// otherwise they are extracted from the plain text of the whole block.
/// </summary>
/// <param name="crawlAll">
/// When false, the method returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>The collected records; empty when the first page cannot be downloaded.</returns>
/// <remarks>
/// NOTE(review): <c>SiteUrl</c> is overwritten with each page URL while looping, so it holds
/// the last visited page after the call. This is kept as in the original; confirm whether
/// callers rely on it.
/// </remarks>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html = string.Empty;
    string cookiestr = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        return list;
    }

    // The pager item reads "x/N" followed by a line break; N is the page count.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("li"), new HasAttributeFilter("class", "wb-page-li")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("/", "\r");
            pageInt = int.Parse(temp);
        }
        catch
        {
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        SiteUrl = "http://www.jxsggzy.cn/web/jyxx/002005/002005004/" + i + ".html";
        try
        {
            html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr);
        }
        catch
        {
            continue;
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("li"), new HasAttributeFilter("class", "ewb-list-node clearfix")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, prjAddress = string.Empty, prjMgr = string.Empty,
                   area = string.Empty;

            ATag aTag = listNode[j].GetATag();
            string prjName = aTag.GetAttribute("title");
            if (string.IsNullOrWhiteSpace(prjName))
            {
                prjName = aTag.LinkText;
            }
            string beginDate = listNode[j].ToPlainTextString().GetDateRegex();

            // A title whose third character is 县/区/市 starts with a three-character area name.
            // The length check prevents an exception on null or very short titles,
            // which previously aborted the whole crawl.
            if (!string.IsNullOrEmpty(prjName) && prjName.Length >= 3)
            {
                char third = prjName[2];
                if (third == '县' || third == '区' || third == '市')
                {
                    area = prjName.Substring(0, 3);
                }
            }

            string InfoUrl = "http://www.jxsggzy.cn" + aTag.Link;
            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "article-info")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            string bidCtx = HtmlTxt.ToCtxString();

            parser = new Parser(new Lexer(HtmlTxt));
            NodeList dtlBidNode = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellpadding", "0")));
            if (dtlBidNode != null && dtlBidNode.Count > 0)
            {
                // Flatten the table into "label:value" lines, skipping empty cells.
                TableTag bidTable = dtlBidNode[0] as TableTag;
                string ctx = string.Empty;
                for (int r = 0; r < bidTable.RowCount; r++)
                {
                    for (int c = 0; c < bidTable.Rows[r].ColumnCount; c++)
                    {
                        string temp = bidTable.Rows[r].Columns[c].ToNodePlainString();
                        if (string.IsNullOrEmpty(temp))
                        {
                            continue;
                        }
                        if ((c + 1) % 2 == 0)
                        {
                            ctx += temp + "\r\n";
                        }
                        else
                        {
                            ctx += temp + ":";
                        }
                    }
                }

                prjAddress = ctx.GetAddressRegex();
                buildUnit = ctx.GetBuildRegex();
                bidUnit = ctx.GetBidRegex(new string[] { "第一中标排序单位名称" });
                bidMoney = ctx.GetMoneyRegex();
                prjMgr = ctx.GetMgrRegex(new string[] { "建造师姓名" });
                code = ctx.GetCodeRegex();
                bidCtx = ctx;
            }
            else
            {
                prjAddress = bidCtx.GetAddressRegex();
                buildUnit = bidCtx.GetBuildRegex();
                bidUnit = bidCtx.GetBidRegex();
                if (string.IsNullOrEmpty(bidUnit))
                {
                    bidUnit = bidCtx.GetRegex("第一中标排序人");
                }
                bidMoney = bidCtx.GetMoneyRegex();
                prjMgr = bidCtx.GetMgrRegex();
                if (string.IsNullOrEmpty(prjMgr))
                {
                    prjMgr = bidCtx.GetRegex("注册监理工程师");
                }
                code = bidCtx.GetCodeRegex();
            }

            buildUnit = buildUnit.Replace(" ", "");
            bidUnit = bidUnit.Replace(" ", "");
            code = code.Replace(" ", "");
            prjMgr = prjMgr.Replace(" ", "");
            prjAddress = prjAddress.Replace(" ", "");

            string bidType = "重点工程";
            string specType = "政府采购";
            string msgType = "江西省公共资源交易中心";

            // End date and "other type" are not extracted for this site and stay empty.
            BidInfo info = ToolDb.GenBidInfo("江西省", "江西省及地市", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, string.Empty, bidCtx, string.Empty, msgType, bidType, specType, string.Empty, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the bid-result list at <c>SiteUrl</c> (detail pages on www.yjjs.gov.cn).
/// A detail page whose <c>div#content</c> contains a table yields one <c>BidInfo</c> per
/// data row, with the first row used as column titles. Otherwise a single
/// <c>BidInfo</c> is built from the plain text of <c>div#content</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, the method returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>The collected records; empty when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("align", "center")));
    if (nodeList != null && nodeList.Count > 0)
    {
        // Take the first "<digits>页" occurrence as the page count. When nothing matches
        // the default of one page is kept instead of throwing a FormatException.
        Regex regexPage = new Regex(@"\d+页");
        Match pageMatch = regexPage.Match(nodeList.AsString());
        int parsedPage;
        if (pageMatch.Success && int.TryParse(pageMatch.Value.Trim(new char[] { '页' }), out parsedPage))
        {
            page = parsedPage;
        }
    }

    // Inclusive upper bound: the original "i < page" skipped the last page and
    // crawled nothing at all when only one page exists.
    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&otype=&page=" + i.ToString()), Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("height", "23")));
        if (tableNodeList == null || tableNodeList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < tableNodeList.Count; j++)
        {
            TableTag table = tableNodeList.SearchFor(typeof(TableTag), true)[j] as TableTag;

            // Only the first row of each list table carries the entry.
            TableRow tr = table.Rows[0];
            ATag aTag = tr.Columns[1].GetATag();

            // The detail id is embedded in the onclick handler: titlelinks(<id>,'' ...).
            string url = "http://www.yjjs.gov.cn/news_Info.asp?rs_id=" + aTag.GetAttribute("onclick").Replace("titlelinks(", "");
            int cut = url.LastIndexOf("''");
            if (cut < 0)
            {
                // Unexpected onclick format: skip this entry rather than let
                // String.Remove(-1) abort the whole crawl.
                continue;
            }

            string tempName = aTag.LinkText.ToNodeString();
            string tempDate = tr.Columns[2].ToNodePlainString().GetReplace(".", "-").GetDateRegex();
            string InfoUrl = url.Remove(cut).Replace(",", "").Replace("'", "").Replace("javascript:", "").Trim();

            string htmldetail;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(
                new AndFilter(
                    new HasParentFilter(
                        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "content")),
                        true),
                    new TagNameFilter("table")));

            if (dtnode != null && dtnode.Count > 0)
            {
                // Tabular detail page: row 0 holds the column titles, every further row is one record.
                TableTag dtlTable = dtnode[0] as TableTag;
                for (int r = 1; r < dtlTable.RowCount; r++)
                {
                    string bidCtx = string.Empty;
                    string HtmlTxt = string.Empty;

                    // Column 0 is skipped, as in the original.
                    for (int c = 1; c < dtlTable.Rows[r].ColumnCount; c++)
                    {
                        try
                        {
                            string temp = dtlTable.Rows[r].Columns[c].ToNodePlainString();
                            string title = dtlTable.Rows[0].Columns[c].ToNodePlainString();
                            HtmlTxt += title + ":" + temp + "</br>";
                            bidCtx += title + ":" + temp + "\r\n";
                        }
                        catch
                        {
                            // A data row may have more cells than the title row; ignore those cells.
                            continue;
                        }
                    }

                    string prjName = bidCtx.GetRegex("工程项目名称,项目名称,工程名称", true, 200);
                    string buildUnit = bidCtx.GetRegex("建设单位");
                    string beginDate = bidCtx.GetRegex("中标日期");
                    string bidMoney = bidCtx.GetMoneyRegex();
                    string bidUnit = bidCtx.GetRegex("中标单位名称");
                    string prjMgr = bidCtx.GetMgrRegex();
                    string prjAddress = bidCtx.GetAddressRegex();
                    string bidType = bidCtx.GetRegex("中标单位资质类别");
                    string msgType = "阳江市建设工程交易中心";
                    string specType = "建设工程";

                    BidInfo info = ToolDb.GenBidInfo("广东省", "阳江市区", "", string.Empty, string.Empty, prjName, buildUnit, beginDate, bidUnit, beginDate, string.Empty, bidCtx, string.Empty, msgType, bidType, specType, string.Empty, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    list.Add(info);

                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }
            else
            {
                // Free-text detail page: one record, named and dated from the list entry.
                parserdetail.Reset();
                NodeList dtlNode = parserdetail.ExtractAllNodesThatMatch(
                    new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "content")));
                if (dtlNode != null && dtlNode.Count > 0)
                {
                    string HtmlTxt = dtlNode.AsHtml();
                    string bidCtx = HtmlTxt.ToCtxString();
                    string buildUnit = bidCtx.GetBuildRegex();
                    string bidMoney = bidCtx.GetMoneyRegex();
                    string bidUnit = bidCtx.GetBidRegex();
                    string prjMgr = bidCtx.GetMgrRegex();
                    string prjAddress = bidCtx.GetAddressRegex();
                    string bidType = tempName.GetInviteBidType();
                    string msgType = "阳江市建设工程交易中心";
                    string specType = "建设工程";

                    BidInfo info = ToolDb.GenBidInfo("广东省", "阳江市区", "", string.Empty, string.Empty, tempName, buildUnit, tempDate, bidUnit, tempDate, string.Empty, bidCtx, string.Empty, msgType, bidType, specType, string.Empty, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    list.Add(info);

                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the bid-result list at <c>SiteUrl</c> (further pages and detail pages on
/// syjdb.baoan.gov.cn) and builds one <c>BidInfo</c> per entry whose detail page contains
/// a <c>div#DivContent</c> block. Field values (code, build unit, amount, winning bidder,
/// project manager) are pulled from "label:value" lines with regular expressions.
/// </summary>
/// <param name="crawlAll">
/// When false, the method returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>The collected records; empty when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // The pager is emitted by a createPageHTML(...) script call; stripping everything
    // except the digits is expected to leave the page count.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "bmdt_fy")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode.AsString().Replace("createPageHTML(", "").Replace("index", "").Replace("html", "").Replace(", 0,", "").Replace(");", "").Replace(",", "").Replace(";", "").Replace(")", "").Replace("\"", "").Replace(" ", "");
            pageInt = Convert.ToInt32(temp);
        }
        catch
        {
            pageInt = 1;
        }
    }

    // The patterns do not depend on the item, so they are built once.
    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|招标编号|中标编号|编号)(:|:)[^\r\n]+\r\n");
    Regex regBuidUnit = new Regex(@"(建设单位|招标人|承包人|招标单位|招标方|招标代理机构)(:|:)[^\r\n]+\r\n");
    Regex regMoney = new Regex(@"(中标价|投标价|总投资|发包价|投标报价|价格|金额)(:|:|)[^\r\n]+\r\n");
    Regex regBidUnit = new Regex(@"(第一候选人|中标候选人|中标人名称|中标单位|中标人|中标方)(:|:)[^\r\n]+\r\n");
    Regex regprjMgr = new Regex(@"(项目经理姓名|项目经理|项目负责人|项目总监|建造师|总工程师|监理师)(:|:)[^\r\n]+\r\n");
    // Integer or decimal number. The original pattern "[0-9]+[.]{0,1}[0-9]+" needed at
    // least two digits, so a single-digit amount such as "5万" was lost.
    Regex regBidMoney = new Regex(@"[0-9]+(?:[.][0-9]+)?");
    // Currency markers are stripped in this order. The first entry appears twice, as in
    // the original, so a second occurrence of that symbol is stripped as well.
    string[] currencyMarkers = new string[] { "¥", "¥", "$", "人民币" };

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Page n (n > 1) is served as index_<n-1>.html.
                html = this.ToolWebSite.GetHtmlByUrl("http://syjdb.baoan.gov.cn/xxgk_12101/ywxx/zbcg/zbxxgs_48718/index_" + (i - 1).ToString() + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(
                    new AndFilter(
                        new HasParentFilter(
                            new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "new_list01"))),
                        new TagNameFilter("ul"))),
                new TagNameFilter("li")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string prjName = string.Empty, InfoUrl = string.Empty, bidCtx = string.Empty;

            string itemText = viewList[j].ToPlainTextString().Trim();
            string beginDate = regDate.Match(itemText).Value;
            // String.Replace throws on an empty search value, so the date is only
            // removed when one was actually found.
            string temp = beginDate.Length > 0 ? itemText.Replace(beginDate, "") : itemText;

            try
            {
                // The item text contains script source; the anchor markup after the last
                // "else" is cut out and parsed. Any missing marker makes Substring throw,
                // which skips the item.
                int beg = temp.IndexOf("else"), end = temp.Length;
                temp = temp.Substring(beg, end - beg);
                beg = temp.LastIndexOf("<a");
                end = temp.LastIndexOf("/a>");
                temp = temp.Substring(beg, (end - beg) + 3);
                beg = temp.IndexOf(">");
                end = temp.IndexOf("</");
                prjName = temp.Substring(beg + 1, end - beg - 1);
                Parser p = new Parser(new Lexer(temp));
                NodeList l = p.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                ATag aTag = l.SearchFor(typeof(ATag), true)[0] as ATag;
                InfoUrl = "http://syjdb.baoan.gov.cn/xxgk_12101/ywxx/zbcg/zbxxgs_48718/" + aTag.Link.Replace("../", "").Replace("./", "");
            }
            catch
            {
                continue;
            }

            string htlDtl;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                htlDtl = regexHtml.Replace(htlDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "DivContent")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtl.AsHtml();

            // Build "label:value" lines used only for field extraction below.
            parser = new Parser(new Lexer(HtmlTxt.ToLower().Replace("th", "td")));
            NodeList dtlTab = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
            if (dtlTab != null && dtlTab.Count > 0)
            {
                bidCtx = "";
                TableTag table = dtlTab[0] as TableTag;
                for (int k = 0; k < table.RowCount; k++)
                {
                    for (int c = 0; c < table.Rows[k].ColumnCount; c++)
                    {
                        string strCtx = table.Rows[k].Columns[c].ToPlainTextString().Replace(" ", "").Replace(" ", "").Replace("\n", "");
                        if (strCtx == "工程类型")
                        {
                            // The rest of this row is ignored.
                            break;
                        }
                        if (c % 2 == 0)
                        {
                            bidCtx += strCtx + ":";
                        }
                        else
                        {
                            bidCtx += strCtx + "\r\n";
                        }
                    }
                }
                bidCtx = bidCtx.Replace("\n", "").Replace("\r\n\r\n", "\r\n").Replace("\r", "\r\n") + "\r\n";
            }
            else
            {
                bidCtx = Regex.Replace(HtmlTxt, "(<script)[\\s\\S]*?(</script>)", "");
                bidCtx = Regex.Replace(bidCtx.Replace("<BR/>", "\r\n").Replace("<br/>", "\r\n").Replace("<BR>", "\r\n").Replace("<br>", "\r\n"), "<[^>]*>", "").Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\r\n", "\r\n").Replace("\r\n\r\n", "\r\n") + "\r\n";
            }

            bidCtx = bidCtx.Replace(" ", "");
            string fieldText = bidCtx.Replace(" ", "");

            string code = regPrjCode.Match(fieldText).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("招标编号", "").Replace("中标编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
            string buildUnit = regBuidUnit.Match(fieldText).Value.Replace("招标代理机构", "").Replace("建设单位", "").Replace("招标人", "").Replace("承包人", "").Replace("招标单位", "").Replace("招标方", "").Replace(":", "").Replace(":", "").Trim();
            string bidMoney = regMoney.Match(fieldText).Value.Replace("中标价", "").Replace(",", "").Replace(",", "").Replace(" ", "").Replace("总投资", "").Replace("发包价", "").Replace("投标报价", "").Replace("投标价", "").Replace("价格", "").Replace("金额", "").Replace(":", "").Replace(":", "").Trim();
            string bidUnit = regBidUnit.Match(fieldText).Value.Replace("¥", "").Replace("中标人名称", "").Replace("中标候选人", "").Replace("第一候选人", "").Replace("中标单位", "").Replace("中标人", "").Replace("中标方", "").Replace(":", "").Replace(":", "").Trim();
            string prjMgr = regprjMgr.Match(fieldText).Value.Replace("项目经理姓名", "").Replace("总工程师", "").Replace("项目经理", "").Replace("项目总监", "").Replace("建造师", "").Replace("监理师", "").Replace("项目负责人", "").Replace(":", "").Replace(":", "").Trim();

            // Drop everything up to and including each currency marker so that digits
            // appearing before the marker are not mistaken for the amount.
            foreach (string marker in currencyMarkers)
            {
                int at = bidMoney.IndexOf(marker, StringComparison.Ordinal);
                if (at >= 0)
                {
                    bidMoney = bidMoney.Substring(at + marker.Length);
                }
            }

            if (bidMoney.Contains("万"))
            {
                // Amount contains "万": keep the number as written.
                bidMoney = regBidMoney.Match(bidMoney).Value;
            }
            else
            {
                // Otherwise divide the number by 10000; results below 0.1 and
                // unparsable values are stored as "0".
                try
                {
                    bidMoney = (decimal.Parse(regBidMoney.Match(bidMoney).Value) / 10000).ToString();
                    if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                    {
                        bidMoney = "0";
                    }
                }
                catch (Exception)
                {
                    bidMoney = "0";
                }
            }

            if (prjMgr.Contains("资格"))
            {
                prjMgr = prjMgr.Remove(prjMgr.IndexOf("资格"));
            }

            // The stored content is always rebuilt from the raw detail HTML, regardless of
            // which branch produced the extraction text above.
            bidCtx = Regex.Replace(HtmlTxt, "(<script)[\\s\\S]*?(</script>)", "");
            bidCtx = Regex.Replace(bidCtx.Replace("<BR/>", "\r\n").Replace("<br/>", "\r\n").Replace("<BR>", "\r\n").Replace("<br>", "\r\n"), "<[^>]*>", "").Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n") + "\r\n";
            bidCtx = bidCtx.Replace(" ", "");

            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            bidUnit = ToolHtml.GetSubString(bidUnit, 150);
            code = ToolHtml.GetSubString(code, 50);
            prjMgr = ToolHtml.GetSubString(prjMgr, 50);

            string msgType = "深圳市宝安区石岩街道办事处";
            string specType = "建设工程";
            // The original derived a bid type from keywords in the project name and then
            // overwrote it unconditionally with this value; only the effective value is kept.
            string bidType = "小型工程";
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市宝安区石岩街道办事处";
            }
            prjName = ToolDb.GetPrjName(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, string.Empty, bidCtx, string.Empty, msgType, bidType, specType, string.Empty, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the invitation list at <c>SiteUrl</c> (detail pages on www.szldzb.com). The list
/// rows are read from the fifth <c>table[width=100%]</c> on each page. Every row becomes an
/// <c>InviteInfo</c>, and attachment links found in the detail content are added to
/// <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, the method returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>The collected records; empty when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    // Determine the page count.
    int pageInt = 1;
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "digg")));
    if (tdNodes != null && tdNodes.Count > 0)
    {
        string pageTemp = tdNodes.AsString().Replace(" ", "").Trim();
        Regex regpage = new Regex(@"共\d+页");
        try
        {
            pageInt = int.Parse(regpage.Match(pageTemp).Value.Replace("共", "").Replace("页", "").Trim());
        }
        catch (Exception)
        {
            // No readable pager: keep the single-page default.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&page=" + i.ToString(), Encoding.UTF8);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(
            new AndFilter(new HasAttributeFilter("width", "100%"), new TagNameFilter("table")));
        // Index 4 is read below, so at least five matching tables are required
        // (the original only checked for a non-empty result).
        if (nodeList == null || nodeList.Count <= 4)
        {
            continue;
        }

        TableTag table = (TableTag)nodeList[4];
        for (int j = 0; j < table.RowCount; j++)
        {
            string beginDate = string.Empty, HtmlTxt = string.Empty;

            string code = table.Rows[j].Columns[0].ToNodePlainString();
            string prjName = table.Rows[j].Columns[1].ToNodePlainString().Replace("(New!)", "").Replace(".", "");
            ATag aTag = table.Rows[j].SearchFor(typeof(ATag), true)[0] as ATag;
            string InfoUrl = "http://www.szldzb.com/" + aTag.Link;

            try
            {
                string htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace(" ", "").GetJsString().Trim();
                Parser dtlparserHTML = new Parser(new Lexer(htmldetail));
                NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(
                    new AndFilter(new HasAttributeFilter("width", "620"), new TagNameFilter("table")));
                HtmlTxt = dtnodeHTML.AsHtml();
            }
            catch (Exception)
            {
                continue;
            }

            string inviteCtx = HtmlTxt.ToCtxString().Trim().Replace("(new!)", "").Replace(" ", "").Replace("endfragment", "");

            // The publish date is looked for in the last 80 characters of the content;
            // shorter content makes Substring throw, which is ignored on purpose.
            try
            {
                string ctx = inviteCtx.Substring(inviteCtx.Length - 80, 80);
                beginDate = ctx.GetChinaTime();
            }
            catch
            {
            }
            if (string.IsNullOrEmpty(beginDate))
            {
                // Fall back to the crawl time when no date could be extracted.
                beginDate = DateTime.Now.ToString();
            }

            string buildUnit = inviteCtx.GetBuildRegex();
            string prjAddress = inviteCtx.GetAddressRegex();
            string inviteType = prjName.GetInviteBidType();
            string specType = "其他";
            string msgType = "深圳龙达招标有限公司";

            // End date, remark and "other type" are not extracted for this site and stay empty.
            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, string.Empty, inviteCtx, string.Empty, msgType, inviteType, specType, string.Empty, InfoUrl, HtmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(HtmlTxt));
            NodeList nodeTag = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (nodeTag != null && nodeTag.Count > 0)
            {
                for (int k = 0; k < nodeTag.Count; k++)
                {
                    ATag fileTag = nodeTag[k].GetATag();
                    if (!fileTag.IsAtagAttach())
                    {
                        continue;
                    }

                    // Absolute links are used as-is, as in the sibling crawlers;
                    // relative ones are resolved against the site root.
                    string link = fileTag.Link.ToLower().Contains("http")
                        ? fileTag.Link
                        : "http://www.szldzb.com/" + fileTag.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(fileTag.LinkText, info.Id, link);
                    // The original built the attachment but never registered it.
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged notice list at <c>SiteUrl</c>, downloads the detail page of each list row
/// and builds one <c>NoticeInfo</c> (type "澄清公告") per row whose detail page contains a
/// <c>td</c> with id <c>TDContent</c>. Attachment links found in that content are added to
/// <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected <c>NoticeInfo</c> records; an empty list when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<NoticeInfo>();

    string pageHtml;
    try
    {
        pageHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // Read the total page count from the pager cell; keep a single page when it cannot be read.
    int pageInt = 1;
    Parser pagerParser = new Parser(new Lexer(pageHtml));
    NodeList pagerCells = pagerParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("nowrap", "true")));
    if (pagerCells != null && pagerCells.Count > 0)
    {
        try
        {
            string pageText = pagerCells.AsString().GetRegexBegEnd("总页数", "当前页").Replace(":", "");
            pageInt = int.Parse(pageText);
        }
        catch
        {
        }
    }

    for (int page = 1; page <= pageInt; page++)
    {
        if (page > 1)
        {
            try
            {
                pageHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "?Paging=" + page.ToString(), Encoding.UTF8);
            }
            catch
            {
                // A list page that fails to download is skipped.
                continue;
            }
        }

        Parser listParser = new Parser(new Lexer(pageHtml));
        NodeList listTables = listParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "99%")));
        if (listTables == null || listTables.Count <= 0)
        {
            continue;
        }

        TableTag table = listTables[0] as TableTag;

        // The last table row is not processed (presumably a pager/footer row; confirm against the site).
        int rowLimit = table.RowCount - 1;
        for (int rowIndex = 0; rowIndex < rowLimit; rowIndex++)
        {
            TableRow row = table.Rows[rowIndex];
            ATag titleLink = row.Columns[1].GetATag();

            string title = titleLink.GetAttribute("title");
            string publishTime = row.Columns[2].ToPlainTextString().GetDateRegex();
            string detailUrl = "http://www.gxzbtb.cn" + titleLink.Link;
            string noticeType = "澄清公告";

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(detailUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                // A detail page that fails to download is skipped.
                continue;
            }

            Parser detailParser = new Parser(new Lexer(detailHtml));
            NodeList contentCells = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
            if (contentCells == null || contentCells.Count <= 0)
            {
                continue;
            }

            string contentHtml = contentCells.AsHtml().GetJsString();
            string contentText = contentHtml.ToCtxString();
            string builder = contentText.GetBuildRegex();

            NoticeInfo info = ToolDb.GenNoticeInfo("广西壮族自治区", "广西壮族自治区及地市", string.Empty, string.Empty, title, noticeType, contentText, publishTime, string.Empty, "广西壮族自治区公共资源交易中心", detailUrl, string.Empty, builder, string.Empty, string.Empty, "建设工程", string.Empty, contentHtml);

            // Collect attachment links from the detail content before storing the record.
            Parser linkParser = new Parser(new Lexer(contentHtml));
            NodeList anchors = linkParser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors != null && anchors.Count > 0)
            {
                for (int n = 0; n < anchors.Count; n++)
                {
                    ATag anchor = anchors[n].GetATag();
                    if (!anchor.IsAtagAttach())
                    {
                        continue;
                    }

                    // Links that do not contain "http" get the site root prepended.
                    string link = anchor.Link.ToLower().Contains("http")
                        ? anchor.Link
                        : "http://www.gxzbtb.cn" + anchor.Link;

                    BaseAttach attach = ToolDb.GenBaseAttach(anchor.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}