/// <summary>
/// Crawls the bid-result listing at <c>SiteUrl</c>, follows every list entry to its detail page
/// and builds one <c>BidInfo</c> per entry whose detail page contains a <c>div.ny_wz</c> block.
/// Attachment links found inside that block are appended to <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, the method returns as soon as the result list holds <c>MaxCount</c> entries.
/// </param>
/// <returns>
/// The <c>BidInfo</c> objects built so far; an empty list when the first page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        // Without the first listing page there is nothing to crawl.
        return list;
    }

    // The page count is the text between "/" and "页" inside the left-aligned <td>.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("align", "left")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("/", "页");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: keep the single-page default when the pager text cannot be parsed.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Only "BigClass" and "page" are posted. The view-state and event-validation values the
            // previous version computed here were never sent, so they are no longer extracted.
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "BigClass", "page" }, new string[] { "", i.ToString() });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("dt"), new HasAttributeFilter("class", "ny_news")), true), new TagNameFilter("li")));
        if (listNode == null || listNode.Count < 1)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            // These three are never filled by this crawler but are still passed to GenBidInfo.
            string buildUnit = string.Empty, endDate = string.Empty, otherType = string.Empty;

            ATag aTag = listNode[j].GetATag();
            if (aTag == null || aTag.Link == null)
            {
                // A list item without a usable link cannot be followed; skip it instead of
                // failing the whole crawl with a NullReferenceException.
                continue;
            }

            string prjName = aTag.GetAttribute("title");
            if (string.IsNullOrWhiteSpace(prjName))
            {
                prjName = aTag.LinkText;
            }
            string beginDate = listNode[j].ToPlainTextString().GetDateRegex();
            string InfoUrl = aTag.Link.Trim();

            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "ny_wz")));
            if (dtnode == null || dtnode.Count < 1)
            {
                continue;
            }

            string HtmlTxt = dtnode.AsHtml();
            string bidCtx = HtmlTxt.GetReplace("</p>,</br>", "\r\n").GetReplace("<br />", "\r\n").ToCtxString();

            string bidUnit = bidCtx.GetBidRegex();
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                bidUnit = bidCtx.GetRegex("中标人");
            }

            string bidMoney = bidCtx.GetMoneyRegex();
            if (string.IsNullOrWhiteSpace(bidMoney))
            {
                bidMoney = bidCtx.GetRegex("中标价").GetMoney("万元");
            }

            // A code is only kept when its last character is '号'.
            string code = bidCtx.GetCodeRegex().GetCodeDel();
            if (!string.IsNullOrWhiteSpace(code) && code[code.Length - 1] != '号')
            {
                code = "";
            }

            string prjMgr = bidCtx.GetMgrRegex();

            // Cut everything after the first "公司" from the winning bidder's name.
            if (!string.IsNullOrEmpty(bidUnit))
            {
                int companyIndex = bidUnit.IndexOf("公司", StringComparison.Ordinal);
                if (companyIndex >= 0)
                {
                    bidUnit = bidUnit.Remove(companyIndex) + "公司";
                }
            }

            string msgType = "海南省发展和改革委员会";
            string specType = "建设工程";
            string bidType = ToolHtml.GetInviteTypes(prjName);
            BidInfo info = ToolDb.GenBidInfo("海南省", "海南省及地市", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            // Collect attachment links that appear inside the detail block.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a == null || a.Link == null || !a.IsAtagAttach())
                    {
                        continue;
                    }

                    // Links that do not contain "http" are prefixed with the site root.
                    string link = a.Link.ToLower().Contains("http") ? a.Link : "http://ztb.hainan.gov.cn/" + a.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the bid-result listing at <c>SiteUrl</c> (pages are addressed with <c>?page=N</c>),
/// downloads each entry's detail page, extracts the bid fields from the first table of the
/// content block and saves a <c>BidInfo</c> through <c>ToolDb.SaveEntity</c>. When the entity is
/// saved, every image in the content block is stored as an attachment.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops once <c>MaxCount</c> entries have been processed.
/// </param>
/// <returns>Always null; results are persisted instead of being returned.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int sqlCount = 0;
    int page = 1;
    string htl = string.Empty;
    string cookiestr = string.Empty;
    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        return null;
    }

    // The page count is taken from the href of the last link inside div.pagination.
    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination")), true), new TagNameFilter("a")));
    if (nodeList != null && nodeList.Count > 0)
    {
        try
        {
            string temp = nodeList[nodeList.Count - 1].GetATagHref();
            string pageCount = temp.Replace(temp.Remove(temp.IndexOf("=")), "").Replace("=", "");
            page = int.Parse(pageCount);
        }
        catch
        {
            // Best effort: keep the single-page default when the pager link cannot be parsed.
        }
    }

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "?page=" + i.ToString(), Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList liNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "list")), true), new TagNameFilter("li")));
        if (liNode == null || liNode.Count < 1)
        {
            continue;
        }

        for (int j = 0; j < liNode.Count; j++)
        {
            string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty,
                   endDate = string.Empty, prjMgr = string.Empty, otherType = string.Empty;

            string itemText = liNode[j].ToPlainTextString();
            string beginDate = itemText.GetDateRegex("yyyy/MM/dd");
            if (beginDate == null)
            {
                beginDate = string.Empty;
            }

            // string.Replace throws when the text to remove is empty, so the date is only
            // stripped from the title when one was actually found.
            string titleText = beginDate.Length > 0 ? itemText.Replace(beginDate, "") : itemText;
            string prjName = titleText.ToNodeString().Replace(" ", "").Replace("·", "");

            ATag aTag = liNode[j].GetATag();
            if (aTag == null)
            {
                // No link to follow for this list item.
                continue;
            }
            string InfoUrl = "http://www.yjggzy.cn" + aTag.Link;

            string htmldetail = string.Empty;
            try
            {
                // NOTE(review): as written this removes every " " from the downloaded markup — confirm
                // that the literal is the intended character.
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace(" ", "");
            }
            catch
            {
                continue;
            }

            // The content block is dl.acticlecontent, falling back to div#nr.
            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtlNode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("dl"), new HasAttributeFilter("class", "acticlecontent")));
            if (dtlNode == null || dtlNode.Count < 1)
            {
                parserdetail.Reset();
                dtlNode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "nr")));
            }
            if (dtlNode == null || dtlNode.Count < 1)
            {
                continue;
            }

            string HtmlTxt = dtlNode.ToHtml();

            // Collapse repeated blank lines. The passes run in exactly the order and number used
            // by the previous copy-pasted statements, so the resulting text is unchanged.
            string bidCtx = HtmlTxt.ToCtxString();
            for (int n = 0; n < 5; n++)
            {
                bidCtx = bidCtx.Replace("\r\n\t\r\n\t", "\r\n\t");
            }
            for (int n = 0; n < 4; n++)
            {
                bidCtx = bidCtx.Replace("\r\n\r\n\t", "\r\n\t");
            }
            for (int n = 0; n < 5; n++)
            {
                bidCtx = bidCtx.Replace("\r\n\r\n", "\r\n");
            }
            bidCtx = bidCtx.Replace("\r\n\t\r\n\t", "\r\n\t");
            bidCtx = bidCtx.Replace("\r\n\r\n", "\r\n");
            bidCtx = bidCtx.Replace("\r\n\t\r\n\t", "\r\n\t");

            parser = new Parser(new Lexer(HtmlTxt));
            NodeList dtlNodeList = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
            if (dtlNodeList != null && dtlNodeList.Count > 0)
            {
                // Flatten the first table into "label:value" lines for the field regexes below.
                string ctx = string.Empty;
                TableTag tableTag = dtlNodeList[0] as TableTag;
                if (tableTag != null)
                {
                    foreach (TableRow row in tableTag.Rows)
                    {
                        int colIndex = 0;
                        int columnCount = row.Columns.Length;
                        foreach (TableColumn col in row.Columns)
                        {
                            if (columnCount == 2)
                            {
                                // Two cells: label, value.
                                if (colIndex == 0)
                                {
                                    ctx += col.ToNodePlainString() + ":";
                                }
                                else if (colIndex == 1)
                                {
                                    ctx += col.ToNodePlainString() + "\r\n";
                                }
                            }
                            else
                            {
                                string colspan = col.GetAttribute("colspan");
                                bool spansTwo = colspan == "2";
                                if (columnCount == 3)
                                {
                                    // Three cells: the first is skipped, the second is the label
                                    // (unless it carries a colspan other than "2"), the third the value.
                                    if (colIndex == 1 && (spansTwo || string.IsNullOrEmpty(colspan)))
                                    {
                                        ctx += col.ToNodePlainString() + ":";
                                    }
                                    else if (colIndex == 2)
                                    {
                                        ctx += col.ToNodePlainString() + "\r\n";
                                    }
                                }
                                else if (colIndex == 0)
                                {
                                    // Other widths: a first cell with colspan="2" is itself the label,
                                    // otherwise it is skipped.
                                    if (spansTwo)
                                    {
                                        ctx += col.ToNodePlainString() + ":";
                                    }
                                }
                                else if (colIndex == 1)
                                {
                                    // The second cell is the value when it spans two columns, else the label.
                                    ctx += col.ToNodePlainString() + (spansTwo ? "\r\n" : ":");
                                }
                                else if (colIndex == 2 && !spansTwo)
                                {
                                    ctx += col.ToNodePlainString() + "\r\n";
                                }
                            }
                            colIndex++;
                        }
                    }
                }

                buildUnit = ctx.GetBuildRegex();
                bidUnit = ctx.GetBidRegex();
                code = ctx.GetCodeRegex();
                prjMgr = ctx.GetMgrRegex();
                if (string.IsNullOrEmpty(prjMgr))
                {
                    prjMgr = ctx.GetRegex("项目负责人姓名", true, 50);
                }
                bidMoney = ctx.GetMoneyRegex();
            }
            else
            {
                // No table: rewrite the first image's src to an absolute URL inside the stored HTML.
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList imgNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                if (imgNode != null && imgNode.Count > 0)
                {
                    ImageTag firstImage = imgNode[0] as ImageTag;
                    string imageSrc = firstImage == null ? null : firstImage.GetAttribute("src");
                    if (!string.IsNullOrEmpty(imageSrc))
                    {
                        HtmlTxt = HtmlTxt.GetReplace(imageSrc, "http://www.yjggzy.cn" + imageSrc);
                    }
                }
            }

            string msgType = "阳江市建设工程交易中心";
            string specType = "建设工程";
            string bidType = ToolHtml.GetInviteTypes(prjName);
            BidInfo info = ToolDb.GenBidInfo("广东省", "阳江市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            sqlCount++;

            if (ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx))
            {
                // Store every image of the content block as an attachment of the saved entity.
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList imageNodes = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                if (imageNodes != null && imageNodes.Count > 0)
                {
                    for (int a = 0; a < imageNodes.Count; a++)
                    {
                        ImageTag img = imageNodes[a] as ImageTag;
                        if (img == null)
                        {
                            continue;
                        }
                        try
                        {
                            BaseAttach attach = ToolHtml.GetBaseAttach(img.GetAttribute("src"), prjName, info.Id, "SiteManage\\Files\\InviteAttach\\");
                            if (attach != null)
                            {
                                ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
                            }
                        }
                        catch
                        {
                            // Best effort: a failed attachment must not stop the crawl.
                        }
                    }
                }
            }

            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
        }
    }
    return null;
}
/// <summary>
/// Walks the notice listing at <c>SiteUrl</c> page by page (later pages are requested by posting
/// the <c>dataPager</c> event together with the current view state), downloads each entry's
/// detail page and saves it as a <c>NotifyInfo</c>. After a successful save, attachment links
/// found in the <c>Table1</c> table(s) of the detail page are saved as well.
/// </summary>
/// <param name="crawlAll">
/// When false, the method returns once <c>MaxCount</c> entries have been counted.
/// </param>
/// <returns>Always null; entities are persisted through <c>ToolDb.SaveEntity</c>.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    // Get the page count
    int totalPages = 1;
    int handled = 0;
    string listingHtml = string.Empty;
    string viewState = string.Empty;
    try
    {
        listingHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return null;
    }

    NodeList pagerNodes = new Parser(new Lexer(listingHtml)).ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "dataPager")));
    bool pagerFound = pagerNodes != null && pagerNodes.Count > 0;
    if (pagerFound)
    {
        try
        {
            totalPages = Convert.ToInt32(pagerNodes.AsString().GetRegexBegEnd("共有:", "页"));
        }
        catch
        {
            totalPages = 1;
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            // Post back the pager event; the view state comes from the page fetched last.
            try
            {
                viewState = this.ToolWebSite.GetAspNetViewState(listingHtml);
                string[] fieldNames = new string[]
                {
                    "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "searcher:txtKeyWord",
                    "searcher:tcInputDateTime:txtDateTime1", "searcher:tcInputDateTime:txtDateTime2",
                    "searcher:ddlProvince", "searcher:ddlCity1", "searcher:ddlCity2"
                };
                string[] fieldValues = new string[]
                {
                    "dataPager", pageNo.ToString(), viewState, "", "", "", "-1", "-1", "-1"
                };
                NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
                listingHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, form, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        NodeList entries = new Parser(new Lexer(listingHtml)).ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "list")));
        if (entries == null || entries.Count <= 0)
        {
            continue;
        }

        for (int entryIndex = 0; entryIndex < entries.Count; entryIndex++)
        {
            string infoScorce = string.Empty;
            string infoType = "办事指南";
            // The title read here is replaced below by the one found in the detail text.
            string headName = entries[entryIndex].GetATagValue("Txt");
            string releaseTime = entries[entryIndex].ToPlainTextString().GetDateRegex();
            string infoUrl = entries[entryIndex].GetATagHref();

            string detailHtml = string.Empty;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            NodeList contentNodes = new Parser(new Lexer(detailHtml)).ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "crt fr")));
            if (contentNodes == null || contentNodes.Count <= 0)
            {
                continue;
            }

            string ctxHtml = contentNodes.AsHtml();
            string infoCtx = ctxHtml.ToCtxString();
            string msgType = MsgTypeCosnt.ShaoGuanMsgType;
            headName = infoCtx.GetRegexBegEnd("列表\r\n", "\r\n");
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "韶关市区", string.Empty, infoCtx, infoType);

            // The limit is checked before the entry is counted and saved.
            if (!crawlAll && handled >= this.MaxCount)
            {
                return null;
            }
            handled++;
            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            // Attachment links: taken from the second Table1 match when there is more than one,
            // otherwise from the combined HTML of the matches.
            NodeList attachTables = new Parser(new Lexer(detailHtml)).ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "Table1")));
            NodeList attachLinks = null;
            if (attachTables != null)
            {
                if (attachTables.Count > 1)
                {
                    attachLinks = new Parser(new Lexer(attachTables[1].ToHtml())).ExtractAllNodesThatMatch(new TagNameFilter("a"));
                }
                else if (attachTables.Count > 0)
                {
                    attachLinks = new Parser(new Lexer(attachTables.AsHtml())).ExtractAllNodesThatMatch(new TagNameFilter("a"));
                }
            }
            if (attachLinks == null || attachLinks.Count <= 0)
            {
                continue;
            }

            for (int linkIndex = 0; linkIndex < attachLinks.Count; linkIndex++)
            {
                ATag attachLink = attachLinks[linkIndex] as ATag;
                if (!attachLink.IsAtagAttach())
                {
                    continue;
                }
                try
                {
                    string attachUrl = "http://www.sgjsj.gov.cn/sgwebims/" + attachLink.Link.Replace("../", "").Replace("./", "");
                    BaseAttach attachment = ToolHtml.GetBaseAttach(attachUrl, attachLink.LinkText, info.Id);
                    if (attachment != null)
                    {
                        ToolDb.SaveEntity(attachment, string.Empty);
                    }
                }
                catch
                {
                    // Attachment failures are ignored.
                }
            }
        }
    }
    return null;
}
/// <summary>
/// Walks the <c>GridView1</c> listing at <c>SiteUrl</c> page by page (later pages are requested
/// with a <c>Page$N</c> postback carrying the current view state and event validation) and saves
/// one <c>NotifyInfo</c> per data row. The notice body is composed from the row's title, source
/// and date; after a successful save the row's link is stored as an attachment.
/// </summary>
/// <param name="crawlAll">
/// When false, the method returns once <c>MaxCount</c> rows have been counted.
/// </param>
/// <returns>Always null; entities are persisted through <c>ToolDb.SaveEntity</c>.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    // Get the page count
    int totalPages = 1;
    int handled = 0;
    string listingHtml = string.Empty;
    string viewState = string.Empty;
    string eventValidation = string.Empty;
    try
    {
        listingHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return null;
    }

    // The pager is the table nested in a <tr valign="top">; its first row has one cell per page.
    NodeList pagerTables = new Parser(new Lexer(listingHtml)).ExtractAllNodesThatMatch(
        new AndFilter(
            new HasParentFilter(new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("valign", "top")), true),
            new TagNameFilter("table")));
    if (pagerTables != null && pagerTables.Count > 0)
    {
        try
        {
            TableTag pagerTable = pagerTables[0] as TableTag;
            totalPages = pagerTable.Rows[0].ColumnCount;
        }
        catch
        {
            totalPages = 1;
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            try
            {
                viewState = this.ToolWebSite.GetAspNetViewState(listingHtml);
                eventValidation = this.ToolWebSite.GetAspNetEventValidation(listingHtml);
                string[] fieldNames = new string[]
                {
                    "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "__EVENTVALIDATION",
                    "sel", "beginDate", "endDate", "infotitle"
                };
                string[] fieldValues = new string[]
                {
                    "GridView1", "Page$" + pageNo.ToString(), viewState, eventValidation,
                    "1", "", "", ""
                };
                NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
                listingHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, form, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        NodeList gridNodes = new Parser(new Lexer(listingHtml)).ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GridView1")));
        if (gridNodes == null || gridNodes.Count <= 0)
        {
            continue;
        }

        TableTag grid = gridNodes[0] as TableTag;
        // The first and the last row of the grid are not visited.
        for (int rowIndex = 1; rowIndex < grid.RowCount - 1; rowIndex++)
        {
            TableRow row = grid.Rows[rowIndex];
            string infoType = "办事指南";
            string headName = row.Columns[1].ToNodePlainString();
            string infoScorce = row.Columns[2].ToNodePlainString();
            string releaseTime = row.Columns[3].ToPlainTextString().GetDateRegex();
            string infoUrl = "http://www.szjsjy.com.cn/" + row.Columns[1].GetATagHref().Replace("../", "");

            // The notice content is built from the row itself; no detail page is downloaded.
            string ctxHtml = string.Concat("<p>信息标题:", headName, "<br/>信息来源:", infoScorce)
                + string.Concat("<br/>发布时间:", releaseTime, "</p>");
            string infoCtx = string.Concat("信息标题:", headName, "\r\n信息来源:", infoScorce)
                + string.Concat("\r\n发布时间:", releaseTime, "\r\n");
            string msgType = MsgTypeCosnt.ShenZhenMsgType;

            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType);

            // The limit is checked before the row is counted and saved.
            if (!crawlAll && handled >= this.MaxCount)
            {
                return null;
            }
            handled++;
            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            try
            {
                BaseAttach attachment = ToolHtml.GetBaseAttach(infoUrl, headName, info.Id);
                if (attachment != null)
                {
                    ToolDb.SaveEntity(attachment, string.Empty);
                }
            }
            catch
            {
                // Attachment failures are ignored.
            }
        }
    }
    return null;
}
/// <summary>
/// Crawls the bid-result listing at <c>SiteUrl</c> page by page (later pages are requested by
/// posting the view state, event validation and <c>cNavBar:cPageIndex</c>), downloads each
/// entry's detail page and builds one <c>BidInfo</c> from the second <c>li</c> of
/// <c>div.news_view_main</c>. Bid fields come from the first table of that block when there is
/// one, otherwise from the plain text. Attachment links are appended to <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, the method returns as soon as the result list holds <c>MaxCount</c> entries.
/// </param>
/// <returns>
/// The <c>BidInfo</c> objects built so far; an empty list when the first page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://xzedu.zhuhai.gov.cn/";

    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "cNavBar_cTotalPages")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            pageInt = int.Parse(sNode[0].ToNodePlainString());
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION", "cSortField", "cSortDirection", "cID", "cParentID", "cLeft:cParentID", "cLeft:cID", "cNavBar:cPageIndex" },
                new string[] { viewState, "8A9C3F4D", eventValidation, "", "", "1080200", "1080000", "1080000", "1080200", i.ToString() });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list")), true), new TagNameFilter("li")));
        if (viewList == null || viewList.Count < 1)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string bidUnit = string.Empty, bidMoney = string.Empty, prjMgr = string.Empty,
                   endDate = string.Empty, otherType = string.Empty;

            ATag aTag = viewList[j].GetATag();
            if (aTag == null || aTag.Link == null)
            {
                // A list item without a usable link cannot be followed; skip it instead of
                // failing the whole crawl with a NullReferenceException.
                continue;
            }
            string beginDate = viewList[j].ToPlainTextString().GetDateRegex();
            string prjName = aTag.GetAttribute("title");
            string InfoUrl = siteRoot + aTag.Link.GetReplace("./");

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "news_view_main")), true), new TagNameFilter("li")));
            if (dtl == null || dtl.Count <= 1)
            {
                continue;
            }

            // The block is lower-cased once; everything below works on that lower-cased HTML.
            string HtmlTxt = dtl[1].ToHtml().ToLower();
            string bidCtx = HtmlTxt.GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString();
            string src = string.Empty;

            parser = new Parser(new Lexer(HtmlTxt));
            NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
            if (tableNode != null && tableNode.Count > 0)
            {
                TableTag tag = tableNode[0] as TableTag;
                if (tag != null && tag.RowCount > 1)
                {
                    // Pair each header cell of row 0 with the cell below it as "header:value".
                    string ctx = string.Empty;
                    try
                    {
                        for (int r = 0; r < tag.Rows[0].ColumnCount; r++)
                        {
                            ctx += tag.Rows[0].Columns[r].ToNodePlainString() + ":";
                            ctx += tag.Rows[1].Columns[r].ToNodePlainString() + "\r\n";
                        }
                    }
                    catch
                    {
                        // Best effort: rows of different width leave ctx with the pairs read so far.
                    }
                    bidUnit = ctx.GetBidRegex().GetReplace("中标(成交)");
                    if (string.IsNullOrEmpty(bidUnit))
                    {
                        bidUnit = ctx.GetRegex("投标单位");
                    }
                    bidMoney = ctx.GetMoneyRegex();
                    prjMgr = ctx.GetMgrRegex();
                }
            }
            else
            {
                // No table: make the first image's src absolute and read the fields from the text.
                Parser imgParser = new Parser(new Lexer(HtmlTxt));
                NodeList imgNode = imgParser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                if (imgNode != null && imgNode.Count > 0)
                {
                    ImageTag firstImage = imgNode[0] as ImageTag;
                    string imgUrl = firstImage == null ? null : firstImage.GetAttribute("src");
                    if (!string.IsNullOrEmpty(imgUrl))
                    {
                        src = siteRoot + imgUrl;
                        HtmlTxt = HtmlTxt.GetReplace(imgUrl, src);
                    }
                }
                bidUnit = bidCtx.GetBidRegex().GetReplace("中标(成交)");
                if (string.IsNullOrEmpty(bidUnit))
                {
                    bidUnit = bidCtx.GetRegex("中标(成交)供应商名称");
                }
                bidMoney = bidCtx.GetMoneyRegex(new string[] { "中标(成交)候选人投标报价" });
                if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
                {
                    bidMoney = bidCtx.GetMoneyRegex();
                }
                prjMgr = bidCtx.GetMgrRegex();
            }

            // Cut trailing address / agency text off the builder name.
            string buildUnit = bidCtx.GetBuildRegex();
            if (!string.IsNullOrEmpty(buildUnit))
            {
                if (buildUnit.Contains("地址"))
                {
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
                }
                if (buildUnit.Contains("招标代理"))
                {
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理"));
                }
            }
            string code = bidCtx.GetCodeRegex().GetCodeDel();

            // Amounts below 1 are stored as "0"; amounts above 100000 are divided by 10000.
            // Text that is not a number is left untouched.
            decimal money;
            if (decimal.TryParse(bidMoney, out money))
            {
                if (money < 1)
                {
                    bidMoney = "0";
                }
                else if (money > 100000)
                {
                    bidMoney = (money / 10000).ToString();
                }
            }

            if (!string.IsNullOrEmpty(prjMgr) && prjMgr.Contains("资格"))
            {
                prjMgr = prjMgr.Remove(prjMgr.IndexOf("资格"));
            }

            string specType = "政府采购";
            string bidType = prjName.GetInviteBidType();
            string msgType = "珠海市香洲区教育局";
            BidInfo info = ToolDb.GenBidInfo("广东省", "珠海市区", "香洲区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            if (!string.IsNullOrEmpty(src))
            {
                // The URL comes from a scraped page, so single quotes are doubled before it is
                // embedded in the SQL text.
                string escapedUrl = (info.InfoUrl ?? string.Empty).Replace("'", "''");
                string sql = string.Format("select Id from BidInfo where InfoUrl='{0}'", escapedUrl);
                object obj = ToolDb.ExecuteScalar(sql);
                if (obj == null || obj.ToString() == "")
                {
                    // No stored row with this URL yet: save the image as an attachment.
                    try
                    {
                        BaseAttach attach = ToolHtml.GetBaseAttach(src, prjName, info.Id, "SiteManage\\Files\\InviteAttach\\");
                        if (attach != null)
                        {
                            ToolDb.SaveEntity(attach, "");
                        }
                    }
                    catch
                    {
                        // Best effort: a failed attachment must not stop the crawl.
                    }
                }
            }

            // Collect attachment links that appear inside the detail block.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k].GetATag();
                    if (a == null || a.Link == null || !a.IsAtagAttach())
                    {
                        continue;
                    }
                    string link = a.Link.ToLower().Contains("http") ? a.Link : siteRoot + a.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the <c>ctl00_cph_context_GridView1</c> listing downloaded from <c>SiteUrl</c>, fetches
/// each row's detail page and saves it as a <c>NotifyInfo</c>. After a successful save, the links
/// in the detail page's download table are saved as attachments.
/// </summary>
/// <remarks>
/// NOTE(review): only the page returned by <c>SiteUrl</c> is crawled. The previous version read
/// a page count from <c>ctl00_cph_context_GridViewPaingTwo1_lblGridViewPagingDesc</c> and looped
/// that many times, but never requested another page, so it re-parsed the same HTML and
/// downloaded every detail page again on each pass. Real paging needs the site's postback
/// fields, which are not visible here.
/// </remarks>
/// <param name="crawlAll">
/// When false, the method returns once <c>MaxCount</c> entries have been counted.
/// </param>
/// <returns>Always null; entities are persisted through <c>ToolDb.SaveEntity</c>.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://www.dgzb.com.cn/DGJYWEB/SiteManage/";

    int sqlCount = 0;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return null;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_GridView1")));
    if (nodeList == null || nodeList.Count < 1)
    {
        return null;
    }
    TableTag table = nodeList[0] as TableTag;
    if (table == null)
    {
        return null;
    }

    // Row 0 is skipped; data rows start at index 1.
    for (int j = 1; j < table.RowCount; j++)
    {
        TableRow tr = table.Rows[j];
        if (tr.ColumnCount < 3)
        {
            // The title is read from cell 1 and the date from cell 2.
            continue;
        }

        string infoScorce = string.Empty;
        string infoType = "办事指南";
        string headName = tr.Columns[1].ToNodePlainString();
        string releaseTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
        string infoUrl = siteRoot + tr.Columns[1].GetATagHref();

        string htldtl = string.Empty;
        try
        {
            htldtl = ToolHtml.GetHtmlByUrlEncode(infoUrl, Encoding.UTF8);
        }
        catch
        {
            // Without the detail page there is no content to store for this row.
            continue;
        }

        parser = new Parser(new Lexer(htldtl));
        NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "line")));
        if (dtlList == null || dtlList.Count < 1)
        {
            continue;
        }

        string ctxHtml = dtlList.AsHtml();
        string infoCtx = dtlList.AsString();
        string msgType = MsgTypeCosnt.DongGuanMsgType;
        NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "东莞市区", string.Empty, infoCtx, infoType);

        // The limit is checked before the entry is counted and saved.
        if (!crawlAll && sqlCount >= this.MaxCount)
        {
            return null;
        }
        sqlCount++;
        if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
        {
            continue;
        }

        // Attachments are listed in the download grid of the detail page (row 0 is skipped).
        parser = new Parser(new Lexer(htldtl));
        NodeList aNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_DownLoadFiles1_GridView1")));
        if (aNode == null || aNode.Count < 1)
        {
            continue;
        }
        TableTag tab = aNode[0] as TableTag;
        if (tab == null)
        {
            continue;
        }

        for (int a = 1; a < tab.RowCount; a++)
        {
            TableRow dr = tab.Rows[a];
            if (dr.ColumnCount < 2)
            {
                continue;
            }
            ATag aTag = dr.Columns[1].GetATag();
            if (aTag == null || !aTag.IsAtagAttach())
            {
                continue;
            }
            try
            {
                BaseAttach obj = ToolHtml.GetBaseAttach(siteRoot + aTag.Link, aTag.LinkText, info.Id);
                if (obj != null)
                {
                    ToolDb.SaveEntity(obj, string.Empty);
                }
            }
            catch
            {
                // Best effort: a failed attachment must not stop the crawl.
            }
        }
    }
    return null;
}
/// <summary>
/// Crawls the tender-invitation listing at <c>SiteUrl</c> (later pages come from the fixed
/// <c>artList.html?...&amp;pageNo=N</c> URL), downloads each entry's detail page and builds one
/// <c>InviteInfo</c> from the <c>div#divZoom</c> block, extracting address, tendering unit and
/// project code from the plain text with regular expressions.
/// </summary>
/// <param name="crawlAll">
/// When false, the method returns as soon as the result list holds <c>MaxCount</c> entries.
/// </param>
/// <returns>
/// The <c>InviteInfo</c> objects built so far; an empty list when the first page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // The patterns are built once here instead of once per list entry.
    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
    Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|详细地址|地点|地址)(:|:)[^\r\n]+\r\n");
    Regex regBuildUnit = new Regex(@"(招标代理机构|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");

    // The page count is the text between "/" and "页" inside div.pagination.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode.AsString().Replace(" ", "");
            Regex reg = new Regex(@"/[^页]+页");
            pageInt = Convert.ToInt32(reg.Match(temp).Value.Replace("/", "").Replace("页", ""));
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://zyjy.huizhou.gov.cn/pages/cms/hzggzyjyzx/html/artList.html?cataId=54f6d9f3580843d59b9dd64918e7ae4f&pageNo=" + i.ToString(), Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "div_list"))), new TagNameFilter("ul")));
        if (viewList == null || viewList.Count < 1)
        {
            continue;
        }

        // The j-th anchor found under the matched <ul> nodes is used as the link of the j-th
        // entry. The search result does not change inside the loop, so it is computed once per page.
        NodeList anchors = viewList.SearchFor(typeof(ATag), true);

        for (int j = 0; j < viewList.Count; j++)
        {
            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

            string itemText = viewList[j].ToPlainTextString();
            string beginDate = regDate.Match(itemText).Value;

            // string.Replace throws when the text to remove is empty, so the date is only
            // stripped from the title when one was actually found.
            string prjName = itemText.Replace("\r", "").Replace("\n", "");
            if (beginDate.Length > 0)
            {
                prjName = prjName.Replace(beginDate, "");
            }

            ATag aTag = (anchors != null && j < anchors.Count) ? anchors[j] as ATag : null;
            if (aTag == null)
            {
                // No link to follow for this entry.
                continue;
            }
            string InfoUrl = "http://zyjy.huizhou.gov.cn" + aTag.Link;

            string htmDtl = string.Empty;
            try
            {
                htmDtl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8);
                htmDtl = regexHtml.Replace(htmDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "divZoom")));
            if (dtl == null || dtl.Count < 1)
            {
                continue;
            }

            string HtmlTxt = Regex.Replace(dtl.ToHtml(), "(<script)[\\s\\S]*?(</script>)", "");

            // Plain text: strip scripts and tags, then normalise line breaks. The replacement
            // chain is order-sensitive and is kept exactly as it was.
            string inviteCtx = Regex.Replace(HtmlTxt, "(<script)[\\s\\S]*?(</script>)", "");
            inviteCtx = Regex.Replace(inviteCtx, "<[^>]*>", "")
                .Replace(" ", "").Replace(" ", "")
                .Replace("\n", "\r\n")
                .Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n")
                .Replace("\r\r\n", "\r\n").Replace("\r\r\n", "\r\n")
                .Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n")
                .Replace("\t", "")
                .Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            // Each field is the first matching "label: value" line with the label removed.
            string prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace("详细地址", "").Replace("地点", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();
            string buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
            if (buildUnit.Contains("资质"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("资质"));
            }
            prjAddress = ToolHtml.GetSubString(prjAddress, 150);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            string code = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();

            string msgType = "惠州市公共资源交易中心";
            string specType = "建设工程";
            string inviteType = ToolHtml.GetInviteTypes(prjName);

            // A missing or over-long address becomes a fixed placeholder; an over-long code is dropped.
            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
            {
                prjAddress = "见招标信息";
            }
            if (Encoding.Default.GetByteCount(code) > 50)
            {
                code = "";
            }
            inviteType = ToolHtml.GetInviteType(inviteType);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "惠州市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the paged article list at <c>SiteUrl</c>, downloads every linked detail page and turns
/// each page that contains the article content span into an <c>InviteInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the first listing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("id", "dnn_ctr513_ArticleList_cboPages")));
    if (nodeList != null && nodeList.Count > 0)
    {
        // The text after the last "第" in the pager drop-down is parsed as the page count.
        // An unexpected pager layout keeps the default of one page instead of aborting the crawl.
        string pagerText = nodeList.AsString().Trim();
        int marker = pagerText.LastIndexOf("第");
        int parsedPage;
        if (marker >= 0
            && int.TryParse(pagerText.Substring(marker).Replace("第", "").Replace("页", "").Trim(), out parsedPage)
            && parsedPage > 0)
        {
            page = parsedPage;
        }
    }

    // The patterns do not depend on the row being processed, so they are built once.
    Regex regBuildUnit = new Regex(@"(招标单位|招标人|招 标 单 位):[^\r\n]+\r\n");
    Regex regBuildUnitSpaced = new Regex(@"招 标 单 位: [^\r\n]+\r\n");
    Regex regPrjAddr = new Regex(@"(工程地点|工程地址|地 址|工 程 地 点)(:|:)[^\r\n]+\r\n");
    Regex regPrjAddrSpaced = new Regex(@"工 程 地 点: [^\r\n]+\r\n");

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            // Later pages are requested by posting the pager drop-down back with the
            // state fields taken from the previously downloaded page.
            string viewState = this.ToolWebSite.GetAspNetViewState(htl);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(htl);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__LASTFOCUS", "__VIEWSTATE", "dnn$ctr513$ArticleList$cboPages", "ScrollTop", "__dnnVariable" },
                new string[] { "dnn$ctr513$ArticleList$cboPages", string.Empty, string.Empty, viewState, (i - 1).ToString(), "716", eventValidation });
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "dnn_ctr513_ArticleList_PanelA")));
        if (tableNodeList == null || tableNodeList.Count <= 0)
        {
            continue;
        }

        // Skip the page when the list panel holds no table instead of dereferencing null.
        NodeList tables = tableNodeList.SearchFor(typeof(TableTag), true);
        TableTag table = (tables != null && tables.Count > 0) ? tables[0] as TableTag : null;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 2)
            {
                continue;
            }

            string code = string.Empty, buildUnit = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty,
                   endDate = string.Empty, remark = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            string prjName = tr.Columns[0].ToPlainTextString().Trim();
            string beginDate = tr.Columns[1].ToPlainTextString().Trim();

            // A row without a link cannot be followed; skip it rather than crash the whole crawl.
            NodeList links = tr.Columns[0].SearchFor(typeof(ATag), true);
            ATag aTag = (links != null && links.Count > 0) ? links[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }

            string InfoUrl = "http://zb.zjcic.net" + aTag.Link.Replace("amp;", "").Trim();
            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default).Replace(" ", "");
            }
            catch (Exception)
            {
                Logger.Error("InviteZhanJiangJSTwo");
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "dnn_ctr377_ArticleShow_lblContent")));
            if (dtnode.Count <= 0)
            {
                continue;
            }

            HtmlTxt = dtnode.AsHtml();
            inviteCtx = dtnode.AsString().Trim().Replace(" ", "").Trim();

            // "招标人:" is matched by the pattern, so its label has to be stripped as well;
            // previously it was left in front of the extracted unit name.
            buildUnit = regBuildUnit.Match(inviteCtx).Value
                .Replace("招标单位:", "")
                .Replace("招标人:", "")
                .Replace("招 标 单 位:", "")
                .Replace(":", "")
                .Replace(" ", "")
                .Trim();
            if (buildUnit == "")
            {
                buildUnit = regBuildUnitSpaced.Match(inviteCtx).Value.Replace("招 标 单 位: ", "").Replace(" ", "").Trim();
            }

            prjAddress = regPrjAddr.Match(inviteCtx).Value
                .Replace("工程地点:", "")
                .Replace("工程地址", "")
                .Replace("地 址", "")
                .Replace("工 程 地 点:", "")
                .Replace(":", "")
                .Trim();
            if (prjAddress == "")
            {
                prjAddress = regPrjAddrSpaced.Match(inviteCtx).Value.Replace("工 程 地 点: ", "").Trim();
            }

            msgType = "湛江市建设工程交易中心";
            specType = "建设工程";
            if (prjAddress == "")
            {
                prjAddress = "见招标信息";
            }

            prjName = prjName.Replace("·", "");
            inviteType = ToolHtml.GetInviteTypes(prjName);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "湛江市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged table with class "cnewslist" at <c>SiteUrl</c>, downloads every linked detail
/// page and turns each page that yields a content container into a <c>BidInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the first listing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "scott")), true), new TagNameFilter("a")));
    try
    {
        // The page count is taken from the first number found in the href of the last pager link.
        // NOTE(review): the pattern needs at least two digits, so a single-digit page count
        // falls through to the catch below and only the first page is crawled - confirm intent.
        Regex numpage = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
        ATag link = (ATag)nodeList[nodeList.Count - 1];
        page = Convert.ToInt32(numpage.Match(link.Link).Value.Trim());
    }
    catch (Exception)
    {
        // Best effort: keep the default of a single page.
    }

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "newtitle", "totalRows", "pageNO" },
                new string[] { string.Empty, "0", i.ToString() });
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default, ref cookiestr).Replace("<th", "<td").Replace("</th>", "</td>").Replace(" ", "");
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "cnewslist")));
        if (tableNodeList == null || tableNodeList.Count <= 0)
        {
            continue;
        }

        TableTag table = (TableTag)tableNodeList[0];
        // The first row and the last two rows of the list table are not read as entries.
        for (int j = 1; j < table.RowCount - 2; j++)
        {
            string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, endDate = string.Empty, bidType = string.Empty,
                   specType = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 2)
            {
                // Name and date are read from the first two cells.
                continue;
            }

            string prjName = tr.Columns[0].ToPlainTextString().Trim();
            string beginDate = tr.Columns[1].ToPlainTextString().Trim();

            // A row without a link cannot be followed; skip it rather than crash the whole crawl.
            NodeList links = tr.Columns[0].SearchFor(typeof(ATag), true);
            ATag aTag = (links != null && links.Count > 0) ? links[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }

            string InfoUrl = aTag.Link;
            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default).Replace("<th", "<td").Replace("</th>", "</td>").Replace("</TH>", "</td>").Replace("<TH", "<td").Replace(" ", "");
            }
            catch (Exception)
            {
                Logger.Error("BidZhuHaiJS");
                continue;
            }

            // Try the known detail-page layouts in order; only the first two yield a table.
            bool contentIsTable = true;
            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "maintable")));
            if (dtnode.Count <= 0)
            {
                parserdetail = new Parser(new Lexer(htmldetail));
                dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "printTb")), true), new TagNameFilter("table")));
            }
            if (dtnode.Count <= 0)
            {
                parserdetail = new Parser(new Lexer(htmldetail));
                dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "printTb")), true), new TagNameFilter("p")));
                contentIsTable = false;
            }
            if (dtnode.Count <= 0)
            {
                parserdetail = new Parser(new Lexer(htmldetail));
                dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "fwinProjectForHand"), new TagNameFilter("div")));
            }
            if (dtnode.Count <= 0)
            {
                continue;
            }

            HtmlTxt = dtnode.AsHtml();
            if (contentIsTable)
            {
                // Cells are consumed in pairs and flattened to "first:second" lines.
                TableTag detailTable = (TableTag)dtnode[0];
                for (int row = 0; row < detailTable.RowCount; row++)
                {
                    TableRow r = detailTable.Rows[row];
                    for (int k = 0; k < r.ColumnCount; k += 2)
                    {
                        string label = r.Columns[k].ToPlainTextString().Trim();
                        string value = string.Empty;
                        if (k + 1 < r.ColumnCount)
                        {
                            value = r.Columns[k + 1].ToPlainTextString().Trim();
                        }
                        bidCtx += label + ":" + value + "\r\n";
                    }
                }
            }
            else
            {
                for (int k = 0; k < dtnode.Count; k++)
                {
                    bidCtx += dtnode[k].ToPlainTextString() + "\r\n";
                }
            }
            bidCtx = bidCtx.Replace("(单价)", "").Trim();

            // End date: taken from the publication line, trying several separator/date formats.
            Regex regendDate = new Regex(@"(公告发布时间|公示日期):[^\r\n]+[\r\n]{1}");
            string date = regendDate.Match(bidCtx).Value.Replace("公告发布时间:", "").Replace("公示日期:", "").Trim().Replace(" ", "").Trim();
            endDate = new Regex(@"至\d{4}-\d{1,2}-\d{1,2}").Match(date).Value.Replace("至", "").Trim();
            if (endDate == "")
            {
                endDate = new Regex(@"--\d{4}-\d{1,2}-\d{1,2}").Match(date).Value.Replace("--", "").Trim();
            }
            if (endDate == "")
            {
                // Strip the "至" prefix that this pattern matches (it used to strip "--",
                // which left the prefix inside the date).
                endDate = new Regex(@"至\d{4}年\d{1,2}月\d{1,2}日").Match(date).Value.Replace("至", "").Trim();
            }
            if (endDate == "")
            {
                endDate = new Regex(@"--\d{4}年\d{1,2}月\d{1,2}日").Match(date).Value.Replace("--", "").Trim();
            }
            if (endDate == "")
            {
                endDate = new Regex(@"-\d{4}年\d{1,2}月\d{1,2}日").Match(date).Value.Replace("-", "").Trim();
            }

            Regex regBidUnit = new Regex(@"(第一中标候选人|中标人|中标单位)(:|:)[^\r\n]+\r\n");
            bidUnit = regBidUnit.Match(bidCtx).Value.Replace("第一中标候选人", "").Replace("中标人:", "").Replace("中标单位:", "").Replace(":", "").Replace(":", "").Trim();
            if (string.IsNullOrEmpty(bidUnit))
            {
                Regex regBidUnitNoColon = new Regex(@"(第一中标候选人|中标人|中标单位)[^\r\n]+\r\n");
                bidUnit = regBidUnitNoColon.Match(bidCtx).Value.Replace("第一中标候选人", "").Replace("中标人", "").Replace("中标单位", "").Trim();
            }

            Regex regMoneyLine = new Regex(@"中标价(:|:)[^\r\n]+\r\n");
            bidMoney = regMoneyLine.Match(bidCtx).Value.Trim();
            if (string.IsNullOrEmpty(bidMoney))
            {
                Regex regMoneyLineNoColon = new Regex(@"中标价[^\r\n]+\r\n");
                bidMoney = regMoneyLineNoColon.Match(bidCtx).Value.Trim();
            }

            // Amounts containing "万" keep their number as is; all other amounts are divided by 10000.
            Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
            if (bidMoney.Contains(","))
            {
                bidMoney = bidMoney.Replace(",", "").Trim();
            }
            if (bidMoney.Contains("万"))
            {
                bidMoney = bidMoney.Remove(bidMoney.IndexOf("万")).Trim();
                bidMoney = regBidMoney.Match(bidMoney).Value;
            }
            else
            {
                try
                {
                    bidMoney = (decimal.Parse(regBidMoney.Match(bidMoney).Value) / 10000).ToString();
                    if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                    {
                        bidMoney = "0";
                    }
                }
                catch (Exception)
                {
                    bidMoney = "0";
                }
            }

            Regex regprjMgr = new Regex(@"(项目负责人|项目经理|项目总监)(:|:)[^\r\n]+\r\n");
            prjMgr = regprjMgr.Match(bidCtx).Value.Replace("项目负责人:", "").Replace("项目经理:", "").Replace("项目总监:", "").Trim();

            Regex regcode = new Regex(@"项目编号(:|:)[^\r\n]+\r\n");
            code = regcode.Match(bidCtx).Value.Replace("项目编号:", "").Replace(":", "").Trim();

            msgType = "珠海市建设工程交易中心";
            specType = "建设工程";

            // Remove leftovers of Office XML namespace declarations from the plain text.
            bidCtx = bidCtx.Replace("<?", "").Replace("xml:namespace prefix = st1 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Trim();
            bidCtx = bidCtx.Replace("<?", "").Replace("xml:namespace prefix = o ns = ", "").Replace("urn:schemas-microsoft-com:office:office", "").Replace("/>", "").Trim();

            prjName = ToolDb.GetPrjName(prjName);
            bidType = ToolHtml.GetInviteTypes(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "珠海市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    // Return what was collected; the method used to return null here, which dropped
    // every record whenever the MaxCount early exit was not taken.
    return list;
}
/// <summary>
/// Crawls the paged article list at <c>SiteUrl</c>, downloads every linked detail page and turns
/// each page that contains the article content span into a <c>BidInfo</c> record. The winning
/// bidder, price and project manager are extracted from the free text with keyword heuristics.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the first listing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("id", "dnn_ctr476_ArticleList_cboPages")));
    if (nodeList != null && nodeList.Count > 0)
    {
        // The text after the last "第" in the pager drop-down is parsed as the page count.
        // An unexpected pager layout keeps the default of one page instead of aborting the crawl.
        string pagerText = nodeList.AsString().Trim();
        int marker = pagerText.LastIndexOf("第");
        int parsedPage;
        if (marker >= 0
            && int.TryParse(pagerText.Substring(marker).Replace("第", "").Replace("页", "").Trim(), out parsedPage)
            && parsedPage > 0)
        {
            page = parsedPage;
        }
    }

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            // Later pages are requested by posting the "next" command back with the
            // state fields taken from the previously downloaded page.
            string viewState = this.ToolWebSite.GetAspNetViewState(htl);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(htl);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__LASTFOCUS", "__VIEWSTATE", "dnn$ctr476$ArticleList$cboPages", "ScrollTop", "__dnnVariable" },
                new string[] { "dnn$ctr476$ArticleList$cmdNext", string.Empty, string.Empty, viewState, (i - 2).ToString(), "716", eventValidation });
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "dnn_ctr476_ArticleList_PanelA")));
        if (tableNodeList == null || tableNodeList.Count <= 0)
        {
            continue;
        }

        // Skip the page when the list panel holds no table instead of dereferencing null.
        NodeList tables = tableNodeList.SearchFor(typeof(TableTag), true);
        TableTag table = (tables != null && tables.Count > 0) ? tables[0] as TableTag : null;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 2)
            {
                continue;
            }

            string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, endDate = string.Empty, bidType = string.Empty,
                   specType = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            string prjName = tr.Columns[0].ToPlainTextString().Trim();
            string beginDate = tr.Columns[1].ToPlainTextString().Trim();

            // A row without a link cannot be followed; skip it rather than crash the whole crawl.
            NodeList links = tr.Columns[0].SearchFor(typeof(ATag), true);
            ATag aTag = (links != null && links.Count > 0) ? links[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }

            string InfoUrl = "http://zb.zjcic.net" + aTag.Link.Replace("amp;", "").Trim();
            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default).Replace(" ", "");
            }
            catch (Exception)
            {
                Logger.Error("BidZhanJiangJS");
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "dnn_ctr377_ArticleShow_lblContent")));
            if (dtnode.Count <= 0)
            {
                continue;
            }

            HtmlTxt = dtnode.AsHtml();
            bidCtx = dtnode.AsString().Trim().Replace(" ", "").Trim();

            // Winning bidder: the text between a lead-in keyword and the next "中标".
            // Each later keyword is only tried when the earlier ones produced the stated value.
            if (bidCtx.Contains("推荐"))
            {
                bidUnit = ExtractBidUnit(bidCtx, "推荐", false, false);
            }
            if (bidCtx.Contains("中标原则") && bidUnit == "")
            {
                bidUnit = ExtractBidUnit(bidCtx, "中标原则", true, true);
            }
            if (bidCtx.Contains("定标原则") && bidUnit == "")
            {
                bidUnit = ExtractBidUnit(bidCtx, "定标原则", true, true);
            }
            if (bidCtx.Contains("评标办法") && bidUnit == "以")
            {
                bidUnit = ExtractBidUnit(bidCtx, "评标办法", true, true);
            }

            if (bidCtx.Contains("中标价:"))
            {
                bidMoney = bidCtx.Substring(bidCtx.IndexOf("中标价:")).Replace("中标价:", "").Trim();
                int yuan = bidMoney.IndexOf("元", StringComparison.Ordinal);
                if (yuan >= 0)
                {
                    // Substring is used because Remove(startIndex) rejects startIndex == Length.
                    bidMoney = bidMoney.Substring(0, yuan + 1).Trim();
                }
            }

            if (bidCtx.Contains("项目负责人:"))
            {
                prjMgr = bidCtx.Substring(bidCtx.IndexOf("项目负责人:")).Replace("项目负责人:", "").Trim();
                // Only the first four characters are kept; shorter text is taken whole.
                prjMgr = prjMgr.Substring(0, Math.Min(4, prjMgr.Length)).Replace(")", "").Replace("。", "").Replace(",", "").Replace(";", "").Trim();
            }

            Regex regBuidUnit = new Regex(@"(招标人|招标单位)(:|:)[^\r\n]+\r\n");
            buildUnit = regBuidUnit.Match(bidCtx).Value.Replace("招标人:", "").Replace("招标单位:", "").Trim();

            bidMoney = NormalizeBidMoney(bidMoney);

            msgType = "湛江市建设工程交易中心";
            specType = "建设工程";

            if (bidUnit == "的第一" && bidCtx.Contains("候选人"))
            {
                bidUnit = ExtractBidUnit(bidCtx, "候选人", false, true);
            }

            if (bidUnit == "了")
            {
                // Fallback: read bidder and price from the first Word-style table on the page.
                parserdetail.Reset();
                NodeList dtnodeF = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "MsoNormalTable")));
                if (dtnodeF.Count <= 0)
                {
                    parserdetail.Reset();
                    dtnodeF = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "MsoTableGrid")));
                }
                if (dtnodeF.Count > 0)
                {
                    // Cells are consumed in pairs and flattened to "first:second" lines.
                    string bitext = string.Empty;
                    TableTag tableone = (TableTag)dtnodeF[0];
                    for (int row = 0; row < tableone.RowCount; row++)
                    {
                        TableRow r = tableone.Rows[row];
                        for (int k = 0; k < r.ColumnCount; k += 2)
                        {
                            string label = r.Columns[k].ToPlainTextString().Trim();
                            string value = string.Empty;
                            if (k + 1 < r.ColumnCount)
                            {
                                value = r.Columns[k + 1].ToPlainTextString().Trim();
                            }
                            bitext += label + ":" + value + "\r\n";
                        }
                    }
                    bitext = bitext.Replace("(", "").Replace(")", "").Trim();

                    // The pattern matches the "单位名称" label, so that label is stripped too
                    // (only "中标单位:" was stripped before, leaving the label in the name).
                    Regex regBidUnit = new Regex(@"单位名称(:|:)[^\r\n]+\r\n");
                    bidUnit = regBidUnit.Match(bitext).Value.Replace("单位名称:", "").Replace("中标单位:", "").Trim();

                    Regex regMoney = new Regex(@"(中标价|中标价格)(:|:)[^\r\n]+\r\n");
                    bidMoney = regMoney.Match(bitext).Value.Replace("中标价:", "").Replace("中标价格:", "").Replace(",", "").Trim();
                    bidMoney = NormalizeBidMoney(bidMoney);
                }
            }

            bidUnit = bidUnit.Replace("为", "").Replace(": ", "").Trim();
            // These values are leftovers of the keyword heuristics, not real bidder names.
            if (bidUnit == "了" || bidUnit == "以" || bidUnit == "的第一")
            {
                bidUnit = "";
            }

            prjName = ToolDb.GetPrjName(prjName);
            prjName = prjName.Replace("·", "");
            bidType = ToolHtml.GetInviteTypes(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "湛江市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    // Return what was collected; the method used to return null here, which dropped
    // every record whenever the MaxCount early exit was not taken.
    return list;
}

/// <summary>
/// Returns the text that follows <paramref name="keyword"/> up to the next "中标", cut after the
/// first "公司" or "设计院" when present. Returns an empty string when the keyword or the
/// "中标" terminator is missing.
/// </summary>
private static string ExtractBidUnit(string text, string keyword, bool stripCommas, bool stripWei)
{
    int start = text.IndexOf(keyword, StringComparison.Ordinal);
    if (start < 0)
    {
        return string.Empty;
    }

    string unit = text.Substring(start).Replace(keyword, "");
    if (stripCommas)
    {
        unit = unit.Replace(",", "");
    }
    unit = unit.Trim();

    int end = unit.IndexOf("中标", StringComparison.Ordinal);
    if (end < 0)
    {
        // No terminator: nothing reliable to extract (this used to throw).
        return string.Empty;
    }
    unit = unit.Substring(0, end).Trim();

    int company = unit.IndexOf("公司", StringComparison.Ordinal);
    if (company >= 0)
    {
        unit = unit.Substring(0, company + 2).Replace(":", "");
        if (stripCommas)
        {
            unit = unit.Replace(",", "");
        }
        unit = unit.Trim();
    }

    // Only cut when "设计院" is really present; a bare "院" must not truncate the name.
    int institute = unit.IndexOf("设计院", StringComparison.Ordinal);
    if (institute >= 0)
    {
        unit = unit.Substring(0, institute + 3);
        if (stripWei)
        {
            unit = unit.Replace("为", "");
        }
        unit = unit.Replace(":", "").Replace(":", "").Trim();
    }

    return unit;
}

/// <summary>
/// Reduces a raw price text to a number string: text containing "万" keeps the number in front
/// of it, any other text is parsed and divided by 10000 ("0" when unparsable or below 0.1).
/// </summary>
private static string NormalizeBidMoney(string raw)
{
    Regex number = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
    if (raw.Contains("万"))
    {
        return number.Match(raw.Remove(raw.IndexOf("万")).Trim()).Value;
    }

    try
    {
        string scaled = (decimal.Parse(number.Match(raw).Value) / 10000).ToString();
        return decimal.Parse(scaled) < decimal.Parse("0.1") ? "0" : scaled;
    }
    catch (Exception)
    {
        return "0";
    }
}
/// <summary>
/// Downloads the notice list at <c>SiteUrl</c>, builds a <c>NotifyInfo</c> for every entry whose
/// detail page yields a content container, stores it through <c>ToolDb.SaveEntity</c> and, when
/// that call returns true, also stores the attachment links found on the detail page.
/// </summary>
/// <param name="crawlAll">
/// When false, processing stops once <c>MaxCount</c> notices have been handled.
/// </param>
/// <returns>Always null; records are persisted directly instead of being returned.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    // Page count: the listing is read as a single page.
    int totalPages = 1;
    int handled = 0;
    string listHtml = string.Empty;

    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return null;
    }

    // Candidate containers for the notice body, tried in this order.
    string[] contentClasses = new string[] { "content", "Custom_UnionStyle", "contentWrap" };

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        Parser listParser = new Parser(new Lexer(listHtml));
        NodeList entries = listParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list")), true), new TagNameFilter("div")));
        if (entries == null || entries.Count <= 0)
        {
            continue;
        }

        for (int idx = 0; idx < entries.Count; idx++)
        {
            string headName = entries[idx].GetATagValue("title");
            string releaseTime = entries[idx].ToPlainTextString().GetDateRegex();
            string infoType = "通知公告";
            string href = entries[idx].GetATagHref();

            // Relative links are resolved against the notice folder; links that already
            // point at www.sz.gov.cn are used unchanged.
            string infoUrl = "http://www.szjs.gov.cn/ztfw/gcjs/gzgg/" + href.Replace("../", "").Replace("./", "");
            if (infoUrl.Contains("http://www.sz.gov.cn/"))
            {
                infoUrl = href;
            }

            string detailHtml = string.Empty;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            Parser detailParser = new Parser(new Lexer(detailHtml));
            NodeList content = null;
            for (int c = 0; c < contentClasses.Length && (content == null || content.Count <= 0); c++)
            {
                if (c > 0)
                {
                    detailParser.Reset();
                }
                content = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", contentClasses[c])));
            }
            if (content == null || content.Count <= 0)
            {
                continue;
            }

            string ctxHtml = content.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");
            string infoCtx = content.AsString().Replace(" ", "").Replace(" ", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
            infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase)
                .Replace(" ", "")
                .Replace("\t", "")
                .Replace("\r\n\r\n", "\r\n")
                .Replace("\r\n\r\n", "\r\n")
                .Replace("\r\n\r\n", "\r\n");
            string msgType = MsgTypeCosnt.ShenZhenZJJMsgType;
            // No source is extracted for these notices, so the field stays empty.
            string infoSource = string.Empty;

            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoSource, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType);

            if (!crawlAll && handled >= this.MaxCount)
            {
                return null;
            }
            handled++;

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            // Attachments: anchors inside div.same, falling back to anchors inside div.contentWrap.
            Parser attachParser = new Parser(new Lexer(detailHtml));
            NodeList anchors = attachParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "same")), true), new TagNameFilter("a")));
            if (anchors == null || anchors.Count <= 0)
            {
                attachParser.Reset();
                anchors = attachParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "contentWrap")), true), new TagNameFilter("a")));
            }
            if (anchors == null || anchors.Count <= 0)
            {
                continue;
            }

            for (int k = 0; k < anchors.Count; k++)
            {
                ATag anchor = anchors[k].GetATag();
                if (!anchor.IsAtagAttach())
                {
                    continue;
                }

                try
                {
                    string link = string.Empty;
                    if (href.Contains("http"))
                    {
                        string folder = href.GetRegexBegEnd("tzgg/", "/");
                        link = "http://www.sz.gov.cn/jsj/qt/tzgg/" + folder + "/" + anchor.Link.Replace("./", "");
                    }
                    else
                    {
                        string folder = infoUrl.GetRegexBegEnd("gzgg/", "/");
                        link = "http://www.szjs.gov.cn/ztfw/gcjs/gzgg/" + folder + "/" + anchor.Link.Replace("./", "");
                    }

                    BaseAttach attach = ToolHtml.GetBaseAttach(link, anchor.LinkText, info.Id);
                    if (attach != null)
                    {
                        ToolDb.SaveEntity(attach, string.Empty);
                    }
                }
                catch
                {
                    // A failing attachment is skipped; the remaining ones are still processed.
                }
            }
        }
    }

    return null;
}
/// <summary>
/// Crawls the first three pages of the table with class "table_title3" at <c>SiteUrl</c>,
/// downloads every linked detail page and turns it into a <c>BidInfo</c> record. Detail pages
/// with a full-width table are read row by row; all others are read from the "font01" cell.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the first listing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 3;

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8, ref cookiestr);
    }
    catch (Exception)
    {
        return list;
    }

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&page.number=" + i.ToString()), Encoding.UTF8);
            }
            catch (Exception)
            {
                continue;
            }
        }

        Parser parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table_title3")));
        if (tableNodeList.Count <= 0)
        {
            continue;
        }

        TableTag table = (TableTag)tableNodeList[0];
        // Row 0 is not read as an entry.
        for (int j = 1; j < table.RowCount; j++)
        {
            string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, bidType = string.Empty, specType = string.Empty,
                   msgType = string.Empty, bidCtx = string.Empty, prjAddress = string.Empty,
                   remark = string.Empty, prjMgr = string.Empty, otherType = string.Empty,
                   HtmlTxt = string.Empty;

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 9)
            {
                // Columns 4, 6 and 8 are read below; shorter rows cannot be entries.
                continue;
            }

            string prjName = tr.Columns[4].ToPlainTextString().Trim();

            // A row without a link cannot be followed; skip it rather than crash the whole crawl.
            NodeList links = tr.Columns[4].SearchFor(typeof(ATag), true);
            ATag aTag = (links != null && links.Count > 0) ? links[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }

            string beginDate = tr.Columns[6].ToPlainTextString().Trim();
            string endDate = tr.Columns[8].ToPlainTextString().Trim();
            string InfoUrl = "http://bidding.cnpec.com.cn/member/" + aTag.Link.Replace("amp;", "").Trim();

            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8).Replace(" ", "");
            }
            catch (Exception)
            {
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%")));

            if (dtnode.Count <= 0)
            {
                // No detail table: take the text of the "font01" cell as the announcement body.
                Parser parserdetailDiv = new Parser(new Lexer(htmldetail));
                NodeList dtnodelDiv = parserdetailDiv.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "font01")));
                HtmlTxt = dtnodelDiv.AsHtml();
                bidCtx = dtnodelDiv.AsString().Trim();

                Regex regBidUnitLine = new Regex(@"中标单位(:|:)[^\r\n]+\r\n");
                bidUnit = regBidUnitLine.Match(bidCtx).Value.Replace("中标单位:", "").Trim();

                buildUnit = "中广核工程有限公司";
                bidMoney = "0";
                msgType = "中广核工程有限公司";
                specType = "建设工程";
                prjAddress = "见中标信息";
                remark = "国际中标";
                bidType = ToolHtml.GetInviteTypes(prjName);
            }
            else
            {
                TableTag tableNode = (TableTag)dtnode[0];
                HtmlTxt = dtnode.AsHtml();

                // Flatten the table (from row 1 on) into one line per cell. The first cell of the
                // row following a "中标候选人" cell is taken as the winning bidder.
                for (int k = 1; k < tableNode.RowCount; k++)
                {
                    TableRow trow = tableNode.Rows[k];
                    for (int c = 0; c < trow.ColumnCount; c++)
                    {
                        string cellText = trow.Columns[c].ToPlainTextString().Trim();
                        if (cellText.Contains("中标候选人") && k + 1 < tableNode.RowCount)
                        {
                            bidUnit = tableNode.Rows[k + 1].Columns[0].ToPlainTextString().Trim();
                        }
                        bidCtx += "\r\n" + cellText;
                    }
                }
                bidCtx += "\r\n";

                Regex regCode = new Regex(@"招标编号(:|:)[^\r\n]+\r\n");
                code = regCode.Match(bidCtx).Value.Replace("招标编号:", "").Trim();

                Regex regBuidUnit = new Regex(@"(招标人|建设单位)(:|:)[^\r\n]+\r\n");
                buildUnit = regBuidUnit.Match(bidCtx).Value.Replace("招标人:", "").Replace("建设单位:", "").Trim();

                Regex regMoney = new Regex(@"中标价(:|:)[^\r\n]+\r\n");
                bidMoney = regMoney.Match(bidCtx).Value.Replace("中标价:", "").Replace(",", "").Replace("RMB", "").Trim();
                if (bidMoney.Contains("EUR"))
                {
                    bidMoney = "0";
                }

                // Amounts containing "万" keep their number as is; all other amounts are divided by 10000.
                Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
                if (!string.IsNullOrEmpty(regBidMoney.Match(bidMoney).Value))
                {
                    if (bidMoney.Contains("万元") || bidMoney.Contains("万美元") || bidMoney.Contains("万"))
                    {
                        bidMoney = regBidMoney.Match(bidMoney).Value;
                    }
                    else
                    {
                        try
                        {
                            bidMoney = (decimal.Parse(regBidMoney.Match(bidMoney).Value) / 10000).ToString();
                            if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                            {
                                bidMoney = "0";
                            }
                        }
                        catch (Exception)
                        {
                            bidMoney = "0";
                        }
                    }
                }

                msgType = "中广核工程有限公司";
                specType = "建设工程";

                // An over-long bidder text is cut down to its first line.
                if (Encoding.Default.GetByteCount(bidUnit) > 50)
                {
                    Regex regFirstLine = new Regex(@"[^\r\n]+\r\n");
                    bidUnit = regFirstLine.Match(bidUnit).Value.Trim();
                }

                InfoUrl = InfoUrl.Replace("filter_EQ_isinternational=0", "filter_EQ_isinternational=1");
                prjAddress = "见中标信息";
                remark = "国际中标";
                prjName = ToolDb.GetPrjName(prjName);
                // The bid type was left empty on this path although the other path sets it.
                bidType = ToolHtml.GetInviteTypes(prjName);
            }

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, remark, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged bid-result list at <c>SiteUrl</c> and converts every list row into a
/// <see cref="BidInfo"/>. Detail pages are resolved against http://www.szldzb.com/.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records are collected.</param>
/// <returns>The collected records; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // The total page count is read from the "共N页" text inside the pager div.
    Parser parser = new Parser(new Lexer(html));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "digg")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        string pagerText = pagerNodes.AsString().Replace(" ", "").Trim();
        string pageCountText = new Regex(@"共\d+页").Match(pagerText).Value.Replace("共", "").Replace("页", "").Trim();
        int parsedPages;
        if (int.TryParse(pageCountText, out parsedPages))
        {
            pageInt = parsedPages;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&page=" + i.ToString()), Encoding.UTF8);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("width", "100%"), new TagNameFilter("table")));

        // The listing is the fifth full-width table; make sure it really exists before indexing it.
        if (nodeList == null || nodeList.Count <= 4)
        {
            continue;
        }
        TableTag table = nodeList[4] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            string buildUnit = string.Empty, endDate = string.Empty, prjMgr = string.Empty, otherType = string.Empty;

            TableRow row = table.Rows[j];
            if (row.ColumnCount < 3)
            {
                // Not a data row (code / name / date): skip it instead of aborting the crawl.
                continue;
            }
            NodeList anchors = row.SearchFor(typeof(ATag), true);
            ATag aTag = (anchors != null && anchors.Count > 0) ? anchors[0] as ATag : null;
            if (aTag == null)
            {
                // No detail link in this row.
                continue;
            }

            string code = row.Columns[0].ToPlainTextString().Trim();
            string prjName = row.Columns[1].ToPlainTextString().Trim();
            string beginDate = row.Columns[2].ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://www.szldzb.com/" + aTag.Link;

            string HtmlTxt;
            NodeList dtnode;
            try
            {
                string htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString().Replace(" ", "").Trim();
                Parser dtlparser = new Parser(new Lexer(htmldetail));
                // One parse serves both the HTML snapshot and the plain-text context.
                dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("width", "620"), new TagNameFilter("table")));
                HtmlTxt = dtnode.AsHtml();
            }
            catch (Exception)
            {
                continue;
            }

            string bidCtx = dtnode.AsString().Trim().ToLower().Replace(" ", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "").Replace("<?xml:namespaceprefix=st1/>", "").Replace("startfragment", "").Replace("endfragment", "");

            string bidUnit = bidCtx.GetBidRegex();
            if (string.IsNullOrEmpty(bidUnit))
            {
                bidUnit = bidCtx.GetBidRegex(new string[] { "成交人" });
            }
            string bidMoney = bidCtx.GetMoneyRegex(null, false, "万元整,万元");

            // Fallback: nothing found in the running text, so try the first Word-style table.
            if (string.IsNullOrEmpty(bidUnit) && (bidMoney == "0" || string.IsNullOrEmpty(bidMoney)))
            {
                Parser par = new Parser(new Lexer(HtmlTxt));
                NodeList listCon = par.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "MsoNormalTable")));
                if (listCon != null && listCon.Count > 0)
                {
                    TableTag tab = listCon[0] as TableTag;
                    string pairs = string.Empty;
                    try
                    {
                        // Row 0 holds the captions and row 1 the values; join them as "caption:value" lines.
                        for (int d = 0; d < tab.Rows[0].ColumnCount; d++)
                        {
                            pairs += tab.Rows[0].Columns[d].ToPlainTextString().Trim() + ":" + tab.Rows[1].Columns[d].ToPlainTextString().Trim() + "\r\n";
                        }
                    }
                    catch (Exception)
                    {
                        // Best effort: keep whatever pairs were collected before the table shape broke.
                    }
                    bidUnit = pairs.GetBidRegex();
                    bidMoney = pairs.GetMoneyRegex();
                }
            }

            // Amounts above 100000 are divided by 10000 (same rule as before); an unparseable amount
            // becomes "0" instead of throwing out of the crawl.
            if (!string.IsNullOrEmpty(bidMoney) && bidMoney != "0")
            {
                decimal amount;
                if (!decimal.TryParse(bidMoney, out amount))
                {
                    bidMoney = "0";
                }
                else if (amount > 100000)
                {
                    bidMoney = (amount / 10000).ToString();
                }
            }

            string specType = "其他";
            string msgType = "深圳龙达招标有限公司";
            prjName = ToolDb.GetPrjName(prjName);
            string bidType = ToolHtml.GetInviteTypes(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the paged list in table <c>MoreInfoList1_DataGrid1</c> at <c>SiteUrl</c>, builds a
/// <see cref="BidInfo"/> per row from its detail page on http://www.spprec.com, and registers
/// attachment links found in the detail content in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records are collected.</param>
/// <returns>The collected records; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html;
    string cookiestr = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        return list;
    }

    // Page count sits between "总页数:" and "当前" in the pager cell.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("vAlign", "bottom")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("总页数:", "当前");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Keep the single-page default when the pager text is not numeric.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Later pages are fetched by posting the pager event back with the previous page's tokens.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string csrfToken = ToolHtml.GetHtmlInputValue(html, "__CSRFTOKEN");
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__CSRFTOKEN", "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT" },
                new string[] { csrfToken, viewState, "MoreInfoList1$Pager", i.ToString() });
            try
            {
                cookiestr = cookiestr.GetReplace(new string[] { "path=/;", "HttpOnly", "," });
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
        if (listNode == null || listNode.Count <= 0)
        {
            continue;
        }
        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            string endDate = string.Empty, prjMgr = string.Empty, otherType = string.Empty, bidMoney = string.Empty;

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 3)
            {
                // Not a data row (title / date columns missing).
                continue;
            }
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string prjName = aTag.GetAttribute("title");
            string beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://www.spprec.com" + aTag.Link;

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
            if (dtlNode == null || dtlNode.Count < 1)
            {
                // A parser that has been run once is already at end-of-input, so the fallback
                // lookup needs a fresh parser; reusing the old one could never match anything.
                parser = new Parser(new Lexer(htmldtl));
                dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "ivs_content")));
            }

            if (dtlNode == null || dtlNode.Count <= 0)
            {
                Logger.Error("无内容" + InfoUrl);
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            string bidCtx = HtmlTxt.GetReplace("<br />,<br/>,<br>,</p>", "\r\n").ToCtxString();

            // When the content holds a table, rebuild the context from its cells as "label:value" lines.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
            if (tableNode != null && tableNode.Count > 0)
            {
                bidCtx = string.Empty;
                TableTag htmlTable = tableNode[0] as TableTag;
                for (int r = 0; r < htmlTable.RowCount; r++)
                {
                    for (int c = 0; c < htmlTable.Rows[r].ColumnCount; c++)
                    {
                        string cellText = htmlTable.Rows[r].Columns[c].ToNodePlainString();
                        if (string.IsNullOrEmpty(cellText))
                        {
                            continue;
                        }
                        // Odd-positioned cells are labels, even-positioned cells end the line.
                        if ((c + 1) % 2 == 0)
                        {
                            bidCtx += cellText + "\r\n";
                        }
                        else
                        {
                            bidCtx += cellText + ":";
                        }
                    }
                }
            }

            string buildUnit = bidCtx.GetBuildRegex().GetReplace("/");
            string prjAddress = bidCtx.GetAddressRegex();
            string code = bidCtx.GetCodeRegex().GetChina().GetCodeDel();
            string bidUnit = bidCtx.GetRegexBegEnd("成交供应商及报价:", ",");
            if (string.IsNullOrEmpty(bidUnit))
            {
                bidUnit = bidCtx.GetBidRegex().GetReplace("/");
            }
            if (bidUnit.Contains("公司"))
            {
                // Cut everything after the first "公司".
                bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
            }
            if (!bidCtx.Contains("废标"))
            {
                bidMoney = bidCtx.GetMoneyRegex();
            }
            bidUnit = bidUnit.GetReplace("第一包,1,、,:");

            string msgType = "四川省公共资源交易中心";
            string bidType = "政府采购";
            string specType = bidType;

            BidInfo info = ToolDb.GenBidInfo("四川省", "四川省及地市", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            // Register attachment links found inside the detail content.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag tag = aNode[k] as ATag;
                    if (tag != null && tag.IsAtagAttach())
                    {
                        string link = tag.Link.ToLower().Contains("http") ? tag.Link : "http://www.spprec.com" + tag.Link;
                        BaseAttach attach = ToolDb.GenBaseAttach(tag.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Walks the paged table <c>tblPrjConstBid</c> at <c>SiteUrl</c> and emits one
/// <see cref="ProjectFinish"/> per data row (project code, name, build unit, end date).
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records are collected.</param>
/// <returns>The collected records; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string RecordUnit = "深圳市住房和建设局";

    IList list = new List<ProjectFinish>();
    string pageHtml;
    int totalPages = 1;
    int sinceLastPause = 1;

    try
    {
        pageHtml = ToolHtml.GetHtmlByUrlEncode(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // Page count is the text between "总页数" and "页" in the pager cell.
    Parser parser = new Parser(new Lexer(pageHtml));
    NodeList pagerCells = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "pageLinkTd")));
    if (pagerCells != null && pagerCells.Count > 0)
    {
        try
        {
            string pagerText = pagerCells.AsString().ToNodeString();
            totalPages = int.Parse(pagerText.GetRegexBegEnd("总页数", "页").Replace(":", ""));
        }
        catch (Exception)
        {
            // Non-numeric pager text: stay with a single page.
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            NameValueCollection form = this.ToolWebSite.GetNameValueCollection(
                new string[] { "page", "qymc", "ann_serial", "pro_name" },
                new string[] { pageNo.ToString(), "", "", "" });
            try
            {
                pageHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, form, Encoding.UTF8);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(pageHtml));
        NodeList tables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "tblPrjConstBid")));
        if (tables == null || tables.Count <= 0)
        {
            continue;
        }

        TableTag grid = (TableTag)tables[0];
        int lastDataRow = grid.RowCount - 1;

        // The first and the last row of the grid are not data rows.
        for (int rowNo = 1; rowNo < lastDataRow; rowNo++)
        {
            TableRow row = grid.Rows[rowNo];
            string projectCode = row.Columns[0].ToNodePlainString();
            string projectName = row.Columns[1].ToNodePlainString();
            string buildUnit = row.Columns[2].ToNodePlainString();
            string endDate = row.Columns[3].ToNodePlainString().GetDateRegex();

            // Only code, name, build unit and end date come from the list; every other field is empty.
            ProjectFinish info = ToolDb.GenProjectFinish("广东省", string.Empty, "深圳市区", string.Empty, endDate, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, buildUnit, projectCode, projectName, RecordUnit, string.Empty, "深圳市住房和建设局", string.Empty);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }

            // Pause for ten minutes each time the counter reaches 200, then restart it at 1.
            sinceLastPause++;
            if (sinceLastPause >= 200)
            {
                sinceLastPause = 1;
                Thread.Sleep(600 * 1000);
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the paged tender-notice list at <c>SiteUrl</c> and builds one <see cref="InviteInfo"/>
/// per row from its detail page on market.meizhou.gov.cn.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records are collected.</param>
/// <returns>
/// The collected records (never null); an empty list when the first page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl;
    string cookiestr = string.Empty;
    int page = 1;
    Regex scriptRegex = new Regex(@"<script[^<]*</script>");

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
        htl = scriptRegex.Replace(htl, "");
    }
    catch (Exception)
    {
        return list;
    }

    // Total page count comes from the "共N页" text in the right-aligned pager div.
    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("align", "right")));
    Regex regexPage = new Regex(@"共\d+页");
    try
    {
        page = int.Parse(regexPage.Match(nodeList.AsString()).Value.Trim(new char[] { '共', '页' }));
    }
    catch (Exception)
    {
        // No pager or non-numeric text: stay with a single page.
    }

    // Inclusive upper bound: the last page (and a single-page list) must be crawled too.
    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&otype=&pageNum=" + i.ToString()), Encoding.Default);
                htl = scriptRegex.Replace(htl, "");
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellpadding", "1")));
        if (tableNodeList == null || tableNodeList.Count <= 0)
        {
            continue;
        }
        TableTag table = tableNodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, bidType = string.Empty, endDate = string.Empty, remark = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 2)
            {
                // Not a data row (title / date columns missing).
                continue;
            }

            string prjName = tr.Columns[0].ToPlainTextString().Trim();

            // Keep only the leading date part; shorter text is kept as is instead of throwing.
            string dateText = tr.Columns[1].ToPlainTextString().Replace(" ", "").Trim();
            string beginDate = dateText.Length > 10 ? dateText.Substring(0, 10) : dateText;

            NodeList anchors = tr.Columns[0].SearchFor(typeof(ATag), true);
            ATag aTag = (anchors != null && anchors.Count > 0) ? anchors[0] as ATag : null;
            string onclick = aTag == null ? null : aTag.GetAttribute("onclick");
            if (string.IsNullOrEmpty(onclick))
            {
                // Without the onclick script there is no file path to build the detail URL from.
                continue;
            }

            // The file path is the second argument of showDeptContent('1925','<path>', ...).
            string InfoUrl = "http://market.meizhou.gov.cn/website/deptwebsite/1925/Content.jsp?issueId=15488&msgType=00&filePath=" + onclick.Replace("showDeptContent('1925','", "");
            int quoteIndex = InfoUrl.IndexOf("'");
            string oo = (quoteIndex >= 0 ? InfoUrl.Remove(quoteIndex) : InfoUrl).Trim();

            string htmldetail;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(oo), Encoding.Default).Replace(" ", "");
                htmldetail = scriptRegex.Replace(htmldetail, "");
            }
            catch (Exception)
            {
                Logger.Error("InviteMeiZhouCityJS");
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("P"), new HasAttributeFilter("class", "MsoNormal")));
            if (dtnode == null || dtnode.Count <= 0)
            {
                continue;
            }

            HtmlTxt = dtnode.AsHtml();
            for (int k = 0; k < dtnode.Count; k++)
            {
                string paragraph = dtnode[k].ToPlainTextString().Replace(" ", "").Trim();
                if (k == 0)
                {
                    // The first paragraph is used to classify the invitation.
                    bidType = ToolHtml.GetInviteTypes(paragraph);
                }
                inviteCtx += paragraph + ":" + "\r\n";
            }

            Regex regPrjAddr = new Regex(@"(工程地点|建设地点):[^\r\n]+\r\n");
            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程地点:", "").Replace("建设地点", "").Replace(":", "").Replace(";", "").Trim();

            Regex bildUnit = new Regex(@"(招标人|招标人(盖章)|招标人):[^\r\n]+[\r\n]{1}");
            buildUnit = bildUnit.Match(inviteCtx).Value.Replace("招 标人:", "").Replace("招标人(盖章):", "").Replace("招标人:", "").Trim();
            if (buildUnit != "")
            {
                // Cut at the first remaining colon; leave the value untouched when there is none.
                int colonIndex = buildUnit.IndexOf(":");
                if (colonIndex >= 0)
                {
                    buildUnit = buildUnit.Remove(colonIndex);
                }
            }

            Regex regcode = new Regex(@"(招标项目编号|项目编号)(:|:)[^\r\n]+[\r\n]{1}");
            code = regcode.Match(inviteCtx).Value.Replace("招标项目编号", "").Replace("项目编号", "").Replace(":", "").Replace(":", "").Trim();

            Regex regoType = new Regex(@"工程类型:[^\r\n]+\r\n");
            string oType = regoType.Match(inviteCtx).Value.Replace("工程类型:", "").Trim();
            if (oType.Contains("房建"))
            {
                otherType = "房建及工业民用建筑";
            }
            else if (oType.Contains("市政"))
            {
                otherType = "市政工程";
            }
            else if (oType.Contains("园林绿化"))
            {
                otherType = "园林绿化工程";
            }
            else if (oType.Contains("装饰") || oType.Contains("装修"))
            {
                otherType = "装饰装修工程";
            }
            else if (oType.Contains("电力"))
            {
                otherType = "电力工程";
            }
            else if (oType.Contains("水利"))
            {
                otherType = "水利工程";
            }
            // Checked separately on purpose: "环保" overrides any category chosen above.
            if (oType.Contains("环保"))
            {
                otherType = "环保工程";
            }

            if (Encoding.Default.GetByteCount(code) > 50)
            {
                code = string.Empty;
            }

            string msgType = "梅州市建设工程交易中心";
            string specType = "建设工程";
            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
            inviteCtx = regexHtml.Replace(inviteCtx, "");

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "梅州市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, bidType, specType, otherType, oo, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    // Return what was collected; returning null here used to drop every record of a full crawl.
    return list;
}
/// <summary>
/// Crawls the tender-notice list under http://syjdb.baoan.gov.cn/xxgk_12101/ywxx/zbcg/zbxxgs/
/// and builds one <see cref="InviteInfo"/> per list item from its detail page.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records are collected.</param>
/// <returns>The collected records; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // The pager is a createPageHTML(...) script call; strip everything but the page count.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "bmdt_fy")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode.AsString().Replace("createPageHTML(", "").Replace("index", "").Replace("html", "").Replace(", 0,", "").Replace(");", "").Replace(",", "").Replace(";", "").Replace(")", "").Replace("\"", "").Replace(" ", "");
            pageInt = Convert.ToInt32(temp);
        }
        catch
        {
            pageInt = 1;
        }
    }

    // Loop-invariant patterns, built once instead of once per list item.
    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
    Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|详细地址|地点|地址)(:|:)[^\r\n]+\r\n");
    Regex regBuildUnit = new Regex(@"(招标代理机构|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Page 2 is index_1.html, page 3 is index_2.html, and so on.
                html = this.ToolWebSite.GetHtmlByUrl("http://syjdb.baoan.gov.cn/xxgk_12101/ywxx/zbcg/zbxxgs/index_" + (i - 1).ToString() + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "new_list01"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
        if (viewList == null || viewList.Count <= 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;
            string prjName, InfoUrl;

            string itemText = viewList[j].ToPlainTextString().Trim();
            string beginDate = regDate.Match(itemText).Value;

            // string.Replace rejects an empty search string, so only strip the date when one was found.
            string temp = beginDate.Length > 0 ? itemText.Replace(beginDate, "") : itemText;

            try
            {
                // The item text contains script source; the anchor of interest follows the "else".
                int beg = temp.IndexOf("else"), end = temp.Length;
                temp = temp.Substring(beg, end - beg);
                beg = temp.LastIndexOf("<a");
                end = temp.LastIndexOf("/a>");
                temp = temp.Substring(beg, (end - beg) + 3);
                beg = temp.IndexOf(">");
                end = temp.IndexOf("</");
                prjName = temp.Substring(beg + 1, end - beg - 1);

                Parser p = new Parser(new Lexer(temp));
                NodeList l = p.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                ATag aTag = l.SearchFor(typeof(ATag), true)[0] as ATag;
                InfoUrl = "http://syjdb.baoan.gov.cn/xxgk_12101/ywxx/zbcg/zbxxgs/" + aTag.Link.Replace("../", "").Replace("./", "");
            }
            catch
            {
                // Item does not have the expected shape: skip it.
                continue;
            }

            string htlDtl;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                htlDtl = regexHtml.Replace(htlDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "DivContent")));
            if (dtl == null || dtl.Count <= 0)
            {
                continue;
            }

            string HtmlTxt = dtl.AsHtml();
            string inviteCtx = dtl.AsString().Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            string prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace("详细地址", "").Replace("地点", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();
            string buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
            string code = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();

            string msgType = "深圳市宝安区石岩街道办事处";
            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
            {
                prjAddress = "见招标信息";
            }
            code = ToolHtml.GetSubString(code, 50);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市宝安区石岩街道办事处";
            }

            string specType = "建设工程";
            // Every notice from this source is filed as "小型工程". The former keyword classification
            // of the title was always overwritten by this value, so it has been removed as dead code.
            string inviteType = ToolHtml.GetInviteType("小型工程");

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Reads the paged table <c>table1</c> (first page at <c>SiteUrl</c>, later pages from the
/// enterpriseAchievementServlet URL on www.gzzb.gd.cn) and emits one <see cref="NoticeInfo"/>
/// of type "业绩公示" per data row, built from the row's own cells.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records are collected.</param>
/// <returns>The collected records; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string listHtml;
    int totalPages = 1;

    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    // The second-to-last pager link carries the last page number as "goPage(N)".
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagerLinks = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "tzgg_right_page")), true), new TagNameFilter("a")));
    if (pagerLinks != null && pagerLinks.Count > 0)
    {
        try
        {
            ATag lastPageLink = pagerLinks[pagerLinks.Count - 2] as ATag;
            string linkText = lastPageLink.LinkText;
            totalPages = Convert.ToInt32(linkText.Replace("goPage(", "").Replace(")", ""));
        }
        catch
        {
            totalPages = 1;
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            try
            {
                // NOTE(review): the offset sent is pageNo followed by "0" (page 2 -> 20); confirm
                // against the site's page size that this is the intended offset.
                string pageUrl = "http://www.gzzb.gd.cn/cms/wz/view/tzygg/enterpriseAchievementServlet?name=&number=&projectName=&projectNumber=&siteId=1&channelId=19&pager.offset=" + pageNo.ToString() + "0";
                listHtml = this.ToolWebSite.GetHtmlByUrl(pageUrl, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        NodeList grids = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "table1")));
        if (grids == null || grids.Count <= 0)
        {
            continue;
        }

        TableTag grid = grids[0] as TableTag;
        // Row 0 is skipped; data rows start at index 1.
        for (int rowNo = 1; rowNo < grid.RowCount; rowNo++)
        {
            TableRow row = grid.Rows[rowNo];
            string prjCode = row.Columns[1].ToNodePlainString();
            string title = row.Columns[2].ToNodePlainString();
            string buildUnit = row.Columns[4].ToNodePlainString();
            string publishTime = row.Columns[5].ToPlainTextString();
            string infoType = "业绩公示";
            string infoUrl = "http://www.gzzb.gd.cn" + row.Columns[2].GetATagHref();

            // The detail page is still downloaded and parsed: a row whose page cannot be fetched is
            // skipped. Its content is not used; the record text is built from the list row instead.
            string detailHtml;
            try
            {
                detailHtml = ToolHtml.GetHtmlByUrl(infoUrl, Encoding.Default);
                detailHtml = detailHtml.GetJsString();
            }
            catch
            {
                continue;
            }
            parser = new Parser(new Lexer(detailHtml));
            parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "block-body")));

            string infoCtx = "项目编号:" + prjCode + "\r\n项目名称:" + title + "\r\n单位编号:" + row.Columns[3].ToNodePlainString() + "\r\n单位名称:" + buildUnit + "\r\n审核时间:" + publishTime;
            string htmlTxt = infoCtx;

            NoticeInfo info = ToolDb.GenNoticeInfo("广东省", "广州市区", string.Empty, string.Empty, title, infoType, infoCtx, publishTime, string.Empty, MsgTypeCosnt.GuangZhouMsgType, infoUrl, prjCode, buildUnit, string.Empty, string.Empty, string.Empty, string.Empty, htmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the paged bid-result list at <c>SiteUrl</c> and builds one <see cref="BidInfo"/> per
/// row from the paragraphs of its detail page on http://zhaobiao.szpt.edu.cn/.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records are collected.</param>
/// <returns>
/// The collected records (never null); an empty list when the first page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl;
    string cookiestr = string.Empty;
    int page = 1;
    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
    }
    catch (Exception)
    {
        return list;
    }

    // Total page count comes from the "N页" text in the right-aligned pager cell.
    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("align", "right")));
    Regex regexPage = new Regex(@"\d+页");
    try
    {
        page = Convert.ToInt32(regexPage.Match(nodeList.AsString()).Value.Replace("页", "").Trim());
    }
    catch (Exception)
    {
        // No pager or non-numeric text: stay with a single page.
    }

    // Loop-invariant patterns.
    Regex regeximg = new Regex(@"<IMG[^>]*>");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
    Regex regcode = new Regex(@"(项目编号|招标编号)(:|:)[^\r\n]+\r\n");
    Regex regBidUnit = new Regex(@"(成交单位|中标单位)(:|:)[^\r\n]+\r\n");
    Regex regMoney = new Regex(@"(中标价|中标价格)(:|:)[^\r\n]+\r\n");
    Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&page=" + i.ToString()), Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("align", "center")));

        // The listing is the fourth centred table; make sure it exists before indexing it.
        if (tableNodeList == null || tableNodeList.Count <= 3)
        {
            continue;
        }
        TableTag table = tableNodeList[3] as TableTag;
        if (table == null)
        {
            continue;
        }

        // The last row of the table is not a data row.
        for (int j = 0; j < table.RowCount - 1; j++)
        {
            string buildUnit = string.Empty, endDate = string.Empty, prjAddress = string.Empty, prjMgr = string.Empty, otherType = string.Empty, bidCtx = string.Empty;

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 2)
            {
                continue;
            }
            // The title link is the second anchor of the first cell; skip rows that lack it.
            NodeList anchors = tr.Columns[0].SearchFor(typeof(ATag), true);
            ATag aTag = (anchors != null && anchors.Count > 1) ? anchors[1] as ATag : null;
            if (aTag == null)
            {
                continue;
            }

            string prjName = aTag.LinkText;
            string beginDate = tr.Columns[1].ToPlainTextString().Trim();
            string InfoUrl = "http://zhaobiao.szpt.edu.cn/" + aTag.Link;

            string htmldetail;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new TagNameFilter("p"));
            if (dtnode == null || dtnode.Count <= 0)
            {
                continue;
            }

            // Keep the paragraph HTML without images.
            string HtmlTxt = regeximg.Replace(dtnode.AsHtml(), "");
            for (int z = 0; z < dtnode.Count; z++)
            {
                bidCtx += dtnode[z].ToPlainTextString().Replace(" ", "").Trim() + "\r\n";
            }
            bidCtx = regexHtml.Replace(bidCtx, "");

            string code = regcode.Match(bidCtx).Value.Replace("项目编号:", "").Replace("招标编号:", "").Replace(":", "").Trim();
            string bidUnit = regBidUnit.Match(bidCtx).Value.Replace("成交单位:", "").Replace("中标单位:", "").Replace("中标折扣率:72.5%", "").Trim();
            string bidMoney = regMoney.Match(bidCtx).Value.Replace("中标价:", "").Replace("中标价格:", "").Replace(",", "").Trim();

            if (bidMoney.Contains("万"))
            {
                // The amount already carries the "万" unit: keep the number in front of it.
                bidMoney = bidMoney.Remove(bidMoney.IndexOf("万")).Trim();
                bidMoney = regBidMoney.Match(bidMoney).Value;
            }
            else
            {
                try
                {
                    // No "万" unit: divide by 10000; results below 0.1 are stored as "0".
                    bidMoney = (decimal.Parse(regBidMoney.Match(bidMoney).Value) / 10000).ToString();
                    if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                    {
                        bidMoney = "0";
                    }
                }
                catch (Exception)
                {
                    bidMoney = "0";
                }
            }

            // Classify from the project title BEFORE deriving specType. Previously bidType was still
            // empty at this point, so the construction branch below could never be taken and
            // GetInviteTypes was called with an empty string.
            string bidType = ToolHtml.GetInviteTypes(prjName);
            string specType;
            if (bidType == "设备材料" || bidType == "小型施工" || bidType == "专业分包" || bidType == "劳务分包" || bidType == "服务" || bidType == "勘察" || bidType == "设计" || bidType == "监理" || bidType == "施工")
            {
                specType = "建设工程";
            }
            else
            {
                specType = "其他";
            }

            string msgType = "深职院";
            prjName = ToolDb.GetPrjName(prjName);

            // Values whose byte length exceeds the limit are blanked rather than truncated.
            if (Encoding.Default.GetByteCount(code) > 50)
            {
                code = string.Empty;
            }
            if (Encoding.Default.GetByteCount(buildUnit) > 150)
            {
                buildUnit = string.Empty;
            }
            if (Encoding.Default.GetByteCount(bidUnit) > 150)
            {
                bidUnit = string.Empty;
            }
            if (Encoding.Default.GetByteCount(prjAddress) > 150)
            {
                prjAddress = string.Empty;
            }

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    // Return what was collected; returning null here used to drop every record of a full crawl.
    return list;
}
/// <summary>
/// Walks every page of the notice grid at <c>SiteUrl</c>, builds a
/// <c>NotifyInfo</c> for each row whose detail page exposes the content span,
/// saves it, and then saves the files listed in the detail page's download grid.
/// </summary>
/// <param name="crawlAll">
/// When false, the method returns once <c>MaxCount</c> records have been handled.
/// </param>
/// <returns>Always null; records are persisted directly through <c>ToolDb</c>.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://www.dgzb.com.cn/DGJYWEB/SiteManage/";

    int totalPages = 1;
    int handled = 0;
    string listHtml = string.Empty;

    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return null;
    }

    // Read the page count from the pager description span ("共 N 页").
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_cph_context_GridViewPaingTwo1_lblGridViewPagingDesc")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        try
        {
            totalPages = Convert.ToInt32(pagerNodes.AsString().GetRegexBegEnd("共", "页"));
        }
        catch
        {
            totalPages = 1;
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            // Post the "go to page" form back, using the hidden fields of the page currently held.
            try
            {
                string viewState = this.ToolWebSite.GetAspNetViewState(listHtml);
                string eventValidation = this.ToolWebSite.GetAspNetEventValidation(listHtml);
                string[] fieldNames = new string[] { "__VIEWSTATE", "__EVENTVALIDATION", "ctl00$cph_context$GridViewPaingTwo1$txtGridViewPagingForwardTo", "ctl00$cph_context$GridViewPaingTwo1$btnForwardToPage" };
                string[] fieldValues = new string[] { viewState, eventValidation, pageNo.ToString(), "GO" };
                NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
                listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, form, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        NodeList gridNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_GridView1")));
        if (gridNodes == null || gridNodes.Count <= 0)
        {
            continue;
        }

        TableTag grid = gridNodes[0] as TableTag;
        // Row 0 is skipped (the loop starts at 1).
        for (int rowIndex = 1; rowIndex < grid.RowCount; rowIndex++)
        {
            TableRow row = grid.Rows[rowIndex];
            string title = row.Columns[1].ToNodePlainString();
            string published = row.Columns[2].ToNodePlainString();
            string detailUrl = siteRoot + row.Columns[1].GetATagHref();

            // A failed detail download leaves the HTML empty, so no content span is found below.
            string detailHtml = string.Empty;
            try
            {
                detailHtml = ToolHtml.GetHtmlByUrl(detailUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_cph_context_span_MetContent")));
            if (contentNodes == null || contentNodes.Count <= 0)
            {
                continue;
            }

            string contentHtml = contentNodes.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");
            string contentText = contentNodes.AsString().Replace(" ", "").Replace(" ", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
            contentText = Regex.Replace(contentText, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            // The information source is never populated for this site, so it stays empty.
            string source = string.Empty;
            NotifyInfo info = ToolDb.GenNotifyInfo(title, published, source, MsgTypeCosnt.DongGuanMsgType, detailUrl, contentHtml, "广东省", "东莞市区", string.Empty, contentText, "政策法规");

            if (!crawlAll && handled >= this.MaxCount)
            {
                return null;
            }

            handled++;
            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            // Attachments are listed in a separate download grid on the detail page.
            parser = new Parser(new Lexer(detailHtml));
            NodeList fileGrids = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_DownLoadFiles1_GridView1")));
            if (fileGrids == null || fileGrids.Count <= 0)
            {
                continue;
            }

            TableTag fileGrid = fileGrids[0] as TableTag;
            for (int fileIndex = 1; fileIndex < fileGrid.RowCount; fileIndex++)
            {
                TableRow fileRow = fileGrid.Rows[fileIndex];
                try
                {
                    // Fall back to the notice title when the file cell has no text.
                    string fileName = fileRow.Columns[1].ToNodePlainString();
                    if (string.IsNullOrEmpty(fileName))
                    {
                        fileName = title;
                    }

                    BaseAttach attachment = ToolHtml.GetBaseAttachByUrl(siteRoot + fileRow.Columns[1].GetATagHref(), fileName, info.Id);
                    if (attachment != null)
                    {
                        ToolDb.SaveEntity(attachment, string.Empty);
                    }
                }
                catch
                {
                    // Best effort: one failed attachment must not stop the rest.
                }
            }
        }
    }

    return null;
}
/// <summary>
/// Crawls the paged tender list of the Dalang sub-district site and returns one
/// <c>InviteInfo</c> per list entry whose detail page contains the content div.
/// </summary>
/// <param name="crawlAll">
/// When false, the method returns as soon as the list holds <c>MaxCount</c> items.
/// </param>
/// <returns>The collected <c>InviteInfo</c> items (empty if the first page cannot be fetched).</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteHost = "http://dalang.szlhxq.gov.cn";
    const string unitName = "深圳市龙华新区大浪街道办事处";

    IList results = new List<InviteInfo>();
    Regex datePattern = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex noisePattern = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");

    string listHtml = string.Empty;
    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return results;
    }

    // The pager div carries the page count between "/" and "跳".
    int totalPages = 1;
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "dlbsc-feiyeR")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        try
        {
            totalPages = int.Parse(pagerNodes.AsString().ToRegString().GetRegexBegEnd("/", "跳"));
        }
        catch
        {
            totalPages = 1;
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl("http://dalang.szlhxq.gov.cn/dlbsc/zwgk73/cgzb10/zbgz/13891-" + pageNo + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        NodeList entries = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "dlbsc_contUl")), true), new TagNameFilter("li")));
        if (entries == null || entries.Count <= 0)
        {
            continue;
        }

        for (int entryIndex = 0; entryIndex < entries.Count; entryIndex++)
        {
            string publishDate = datePattern.Match(entries[entryIndex].ToNodePlainString()).Value;
            string projectName = entries[entryIndex].GetATag().LinkText;
            string detailUrl = siteHost + entries[entryIndex].GetATagHref(0).Replace("./", "/");

            string detailHtml = string.Empty;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(detailUrl, Encoding.UTF8);
                detailHtml = noisePattern.Replace(detailHtml, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "dlbsc-content")));
            if (contentNodes == null || contentNodes.Count <= 0)
            {
                continue;
            }

            string contentHtml = System.Text.RegularExpressions.Regex.Replace(contentNodes.ToHtml(), "(<script)[\\s\\S]*?(</script>)", "");
            string contentText = contentHtml.ToCtxString();

            // The derived type is superseded by the fixed "小型工程" below;
            // the call is retained so behaviour stays identical.
            string inviteType = projectName.GetInviteBidType();

            string address = contentText.GetAddressRegex();
            string builder = contentText.GetBuildRegex();
            string projectCode = contentText.GetCodeRegex().GetCodeDel();

            if (string.IsNullOrEmpty(address))
            {
                address = "见招标信息";
            }

            if (string.IsNullOrEmpty(builder))
            {
                builder = unitName;
            }

            inviteType = ToolHtml.GetInviteType("小型工程");

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, projectCode, projectName, address, builder, publishDate, string.Empty, contentText, string.Empty, unitName, inviteType, "建设工程", string.Empty, detailUrl, contentHtml);
            results.Add(info);

            if (!crawlAll && results.Count >= this.MaxCount)
            {
                return results;
            }
        }
    }

    return results;
}
/// <summary>
/// Crawls the paged notice list of the Bao'an procurement centre, saves one
/// <c>NotifyInfo</c> per entry whose detail page contains the <c>Scroller-1</c> div,
/// and then saves every attachment-style link found in the notice body.
/// </summary>
/// <param name="crawlAll">
/// When false, the method returns once <c>MaxCount</c> records have been handled.
/// </param>
/// <returns>Always null; records are persisted directly through <c>ToolDb</c>.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int pageInt = 1, sqlCount = 0;
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return null;
    }

    // The pager div carries the page count between "共有" and "页".
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "Pages")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            // The extraction is inside the guard too, so an unexpected pager
            // layout falls back to a single page instead of aborting the crawl.
            string temp = pageNode.AsString().GetRegexBegEnd("共有", "页");
            int parsedPages;
            if (int.TryParse(temp, out parsedPages))
            {
                pageInt = parsedPages;
            }
        }
        catch
        {
            // Keep the single-page default.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "?page=" + i);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "News_list")), true), new TagNameFilter("li")));
        if (listNode == null || listNode.Count <= 0)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            string infoScorce = string.Empty;
            INode node = listNode[j];
            string releaseTime = node.ToPlainTextString().GetDateRegex();

            ATag aTag = node.GetATag();
            if (aTag == null)
            {
                // Defensive: a list item without a link cannot be followed.
                continue;
            }

            string headName = aTag.GetAttribute("title");
            string infoUrl = "http://cgzx.baoan.gov.cn" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "Scroller-1")));
            if (dtlNode == null || dtlNode.Count <= 0)
            {
                continue;
            }

            string ctxHtml = dtlNode.AsHtml();
            string infoCtx = ctxHtml.ToCtxString();
            string msgType = "深圳市宝安区政府采购中心";
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳政府采购", "宝安区", infoCtx, "通知公告");

            // Check the limit BEFORE counting this record. The previous order
            // (increment, then check) returned while handling the MaxCount-th
            // record, so only MaxCount - 1 records were ever saved.
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }

            sqlCount++;
            if (!ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate))
            {
                continue;
            }

            // Attachments are ordinary links inside the notice body.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode == null || aNode.Count <= 0)
            {
                continue;
            }

            for (int k = 0; k < aNode.Count; k++)
            {
                ATag fileATag = aNode[k].GetATag();
                if (fileATag == null || fileATag.Link == null || !fileATag.IsAtagAttach())
                {
                    continue;
                }

                BaseAttach obj = null;
                if (fileATag.Link.ToLower().Contains("http"))
                {
                    obj = ToolHtml.GetBaseAttach(fileATag.Link, headName, info.Id);
                }
                else
                {
                    // NOTE(review): relative links are resolved against ba.szzfcg.cn,
                    // not the cgzx.baoan.gov.cn host used for detail pages. Confirm this is intended.
                    obj = ToolHtml.GetBaseAttach("http://ba.szzfcg.cn" + fileATag.Link, headName, info.Id);
                }

                if (obj != null)
                {
                    ToolDb.SaveEntity(obj, string.Empty);
                }
            }
        }
    }

    return null;
}
/// <summary>
/// Reads the award table (the table with <c>width="749"</c>) on the page at
/// <c>SiteUrl</c> and turns each data row into a <c>BidInfo</c>. The record's HTML
/// and plain-text bodies are synthesised from the row's cells, since every record
/// points at the same page.
/// </summary>
/// <param name="crawlAll">
/// When false, the method returns as soon as the list holds <c>MaxCount</c> items.
/// </param>
/// <returns>The collected <c>BidInfo</c> items (empty if the page or table is unavailable).</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    // Highest cell index read below is 7, so a usable row needs at least 8 cells.
    const int requiredColumns = 8;

    IList list = new ArrayList();
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "749")));
    if (nodeList == null || nodeList.Count <= 0)
    {
        return list;
    }

    TableTag table = nodeList[0] as TableTag;
    if (table == null)
    {
        return list;
    }

    // Data rows start at index 13 and the last three rows are excluded.
    // NOTE(review): these offsets are hard-coded against the page layout.
    for (int j = 13; j < table.RowCount - 3; j++)
    {
        TableRow tr = table.Rows[j];
        string rowText = tr.ToPlainTextString();

        if (rowText.Contains("注:"))
        {
            continue;
        }

        if (rowText.Contains("中标通知书"))
        {
            // Skip this heading row and the row directly after it.
            j++;
            continue;
        }

        if (tr.ColumnCount < requiredColumns)
        {
            // A short row would otherwise raise an index error and abort
            // the whole crawl, discarding everything collected so far.
            continue;
        }

        string buildUnit = string.Empty, bidMoney = string.Empty, endDate = string.Empty,
               prjMgr = string.Empty, otherType = string.Empty;

        string code = tr.Columns[2].ToPlainTextString().Trim();
        string prjName = tr.Columns[3].ToPlainTextString().Trim();
        string bidUnit = tr.Columns[4].ToPlainTextString().Trim();
        string bid = tr.Columns[5].ToPlainTextString().Trim();
        string beginDate = tr.Columns[7].ToPlainTextString().Trim();
        string InfoUrl = "http://www.ymcw.com/message2.htm";

        // Both bodies use the raw project name, before ToolDb.GetPrjName is applied.
        string HtmlTxt = string.Format("<p>招标编号:{0}<br/>项目名称:{1}<br/>中标单位:{2}<br/>中标项目:{3}<br/>中标时间:{4}<br/></p>", code, prjName, bidUnit, bid, beginDate);
        string bidCtx = string.Format("招标编号:{0}\r\n项目名称:{1}\r\n中标单位:{2}\r\n中标时间:{3}\r\n", code, prjName, bidUnit, beginDate);

        string specType = "其他";
        string msgType = "深圳市裕明财务咨询有限公司";
        string bidType = ToolHtml.GetInviteTypes(prjName);
        prjName = ToolDb.GetPrjName(prjName);

        // This one project is explicitly excluded.
        if (prjName.Contains("深圳市人民检察院电子物证设备"))
        {
            continue;
        }

        BidInfo info = ToolDb.GenBidInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
        list.Add(info);

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return list;
        }
    }

    return list;
}
/// <summary>
/// Downloads and stores the attachments of a bid project. When
/// <paramref name="htmltxt"/> contains "http" it is parsed as HTML and its links
/// are used; otherwise it is converted with <c>ToolHtml.JsonToDataTable</c> and
/// each row's <c>attachGuid</c> is turned into a download URL.
/// </summary>
/// <param name="info">Project the attachments belong to; supplies the fallback name and, when not updating, the owner id.</param>
/// <param name="htmltxt">Attachment description, either HTML containing links or JSON text.</param>
/// <param name="result">Owner id used when <paramref name="isUpdate"/> is true; existing attachments with this <c>SourceID</c> are replaced.</param>
/// <param name="isUpdate">True to attach to <paramref name="result"/> and delete its old attachments first.</param>
private void SaveAttach(BidProject info, string htmltxt, string result, bool isUpdate)
{
    const string siteRoot = "https://www.szjsjy.com.cn:8001/";
    const string attachDir = "SiteManage\\Files\\Attach\\";

    if (string.IsNullOrWhiteSpace(htmltxt))
    {
        // Nothing to parse; previously a null value raised a NullReferenceException here.
        return;
    }

    List<BaseAttach> list = new List<BaseAttach>();

    if (htmltxt.Contains("http"))
    {
        Parser parser = new Parser(new Lexer(htmltxt));
        NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
        if (aNode != null && aNode.Count > 0)
        {
            for (int j = 0; j < aNode.Count; j++)
            {
                ATag aTag = aNode[j].GetATag();
                if (aTag == null || aTag.Link == null)
                {
                    continue;
                }

                string attachName = aTag.LinkText;
                if (string.IsNullOrWhiteSpace(attachName))
                {
                    attachName = info.PrjName;
                }

                // Relative links are resolved against the site root.
                string aurl = aTag.Link.ToLower().Contains("http")
                    ? aTag.Link.GetReplace("\\")
                    : siteRoot + aTag.Link.GetReplace("\\");

                try
                {
                    string url = System.Web.HttpUtility.UrlDecode(aurl);
                    string[] urls = url.Split('&');
                    if (urls.Length < 3)
                    {
                        // Only links with at least three '&'-separated parts are handled.
                        // Shorter links used to be dropped via a swallowed index exception.
                        continue;
                    }

                    // Swap the second and third parts of the link before downloading.
                    url = urls[0] + "&" + urls[2] + "&" + urls[1];

                    BaseAttach entity = null;
                    if (isUpdate)
                    {
                        entity = ToolHtml.GetBaseAttach(url.Replace("\"", ""), attachName, result, attachDir);
                    }
                    else
                    {
                        entity = ToolHtml.GetBaseAttach(url.Replace("\"", ""), attachName, info.Id, attachDir);
                    }

                    if (entity != null)
                    {
                        list.Add(entity);
                    }
                }
                catch
                {
                    // Best effort: one failed attachment must not stop the rest.
                }
            }
        }
    }
    else
    {
        System.Data.DataTable dtlDtl = ToolHtml.JsonToDataTable(htmltxt);
        if (dtlDtl != null && dtlDtl.Rows.Count > 0)
        {
            for (int i = 0; i < dtlDtl.Rows.Count; i++)
            {
                System.Data.DataRow row = dtlDtl.Rows[i];
                string attachName = Convert.ToString(row["attachName"]);
                if (string.IsNullOrWhiteSpace(attachName))
                {
                    attachName = info.PrjName;
                }

                string attachGuid = Convert.ToString(row["attachGuid"]);
                string url = "https://www.szjsjy.com.cn:8001/file/downloadFile?fileId=" + attachGuid;

                try
                {
                    BaseAttach entity = null;
                    if (isUpdate)
                    {
                        entity = ToolHtml.GetBaseAttachByUrl(url, attachName, result, attachDir);
                    }
                    else
                    {
                        entity = ToolHtml.GetBaseAttachByUrl(url, attachName, info.Id, attachDir);
                    }

                    if (entity != null)
                    {
                        list.Add(entity);
                    }
                }
                catch
                {
                    // Best effort: one failed attachment must not stop the rest.
                }
            }
        }
    }

    if (list.Count == 0)
    {
        // Old attachments are kept when nothing new could be fetched.
        return;
    }

    if (isUpdate)
    {
        // Escape single quotes so the id cannot break out of the SQL literal.
        // NOTE(review): a parameterised command would be preferable, but no
        // parameterised ToolDb overload is visible from this block.
        string safeId = (result ?? string.Empty).Replace("'", "''");
        string delSql = string.Format("delete from BaseAttach where SourceID='{0}'", safeId);
        ToolFile.Delete(result);
        ToolDb.ExecuteSql(delSql);
    }

    foreach (BaseAttach attach in list)
    {
        ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
    }
}
/// <summary>
/// Crawls the service-guide list at <c>SiteUrl</c>: for every data row of the
/// <c>lefttable</c> table it fetches the detail page, saves a <c>NotifyInfo</c>,
/// and then saves the images and attachment-style links found in the body.
/// </summary>
/// <remarks>
/// Only the first list page is processed. The previous version looped over the
/// page count read from the <c>div.scott</c> pager but never requested any page
/// after the first, so the same page and all its detail pages were crawled once
/// per page in the count.
/// NOTE(review): to crawl further pages, add the site's paging request here; it
/// is not visible from this block.
/// </remarks>
/// <param name="crawlAll">
/// When false, the method returns once <c>MaxCount</c> records have been handled.
/// </param>
/// <returns>Always null; records are persisted directly through <c>ToolDb</c>.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int sqlCount = 0;
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return null;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
    if (nodeList == null || nodeList.Count <= 0)
    {
        return null;
    }

    TableTag table = nodeList[0] as TableTag;
    if (table == null)
    {
        return null;
    }

    // The first and last rows are not data rows (the loop skips both).
    for (int j = 1; j < table.RowCount - 1; j++)
    {
        TableRow tr = table.Rows[j];
        if (tr.ColumnCount < 3)
        {
            // Cells 1 and 2 are read below; a short row would abort the crawl.
            continue;
        }

        string infoScorce = string.Empty;
        string infoType = "办事指南";
        string headName = tr.Columns[1].ToNodePlainString();
        string releaseTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
        string infoUrl = tr.Columns[1].GetATagHref();

        string htldtl = string.Empty;
        try
        {
            htldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString().Replace("<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />", "");
        }
        catch
        {
            continue;
        }

        parser = new Parser(new Lexer(htldtl));
        NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "context_div")));
        if (dtlList == null || dtlList.Count <= 0)
        {
            continue;
        }

        string ctxHtml = dtlList.AsHtml();
        string infoCtx = ctxHtml.ToCtxString();
        string msgType = MsgTypeCosnt.ZhongShanMsgType;
        NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "中山市区", string.Empty, infoCtx, infoType);

        if (!crawlAll && sqlCount >= this.MaxCount)
        {
            return null;
        }

        sqlCount++;
        if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
        {
            continue;
        }

        // Images embedded in the body are stored as attachments.
        parser = new Parser(new Lexer(ctxHtml));
        NodeList imgList = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
        if (imgList != null && imgList.Count > 0)
        {
            for (int img = 0; img < imgList.Count; img++)
            {
                ImageTag imgTag = imgList[img] as ImageTag;
                if (imgTag == null)
                {
                    continue;
                }

                BaseAttach baseInfo = ToolHtml.GetBaseAttachByUrl(imgTag.GetAttribute("src"), headName, info.Id);
                if (baseInfo != null)
                {
                    ToolDb.SaveEntity(baseInfo, string.Empty);
                }
            }
        }

        // Links that IsAtagAttach accepts are stored as attachments too.
        parser = new Parser(new Lexer(ctxHtml));
        NodeList attachList = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
        if (attachList != null && attachList.Count > 0)
        {
            for (int a = 0; a < attachList.Count; a++)
            {
                ATag aTag = attachList[a] as ATag;
                if (aTag == null || !aTag.IsAtagAttach())
                {
                    continue;
                }

                BaseAttach obj = ToolHtml.GetBaseAttachByUrl(aTag.Link, aTag.LinkText, info.Id);
                if (obj != null)
                {
                    ToolDb.SaveEntity(obj, string.Empty);
                }
            }
        }
    }

    return null;
}
/// <summary>
/// Crawls the paged project grid of the Zhanjiang construction trading centre
/// and returns one <c>InviteInfo</c> per row whose detail page contains the
/// instruction span.
/// </summary>
/// <param name="crawlAll">
/// When false, the method returns as soon as the list holds <c>MaxCount</c> items.
/// </param>
/// <returns>The collected <c>InviteInfo</c> items (empty if the first page cannot be fetched).</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "dnn_ctr395_ProjectList_pager")));
    if (nodeList != null && nodeList.Count > 0)
    {
        // The digit directly before "下一页" is taken as the page count.
        // NOTE(review): the pattern matches a single digit only, so at most 9
        // pages are ever crawled. It is left as-is because the pager's text
        // layout is not visible here, and widening it could read concatenated
        // page links as one large number.
        Regex regPage = new Regex(@"\d下一页");
        string pageText = regPage.Match(nodeList.AsString().Trim()).ToString().Replace("下一页", "").Trim();

        // TryParse keeps the single-page default when the pager has no
        // "N下一页" text; Convert.ToInt32 threw on the empty match and aborted the crawl.
        int parsedPage;
        if (int.TryParse(pageText, out parsedPage) && parsedPage > 0)
        {
            page = parsedPage;
        }
    }

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&page=" + i.ToString()), Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "dnn_ctr395_ProjectList_grdData")));
        if (tableNodeList == null || tableNodeList.Count <= 0)
        {
            continue;
        }

        TableTag table = (TableTag)tableNodeList[0];
        // Row 0 is skipped (the loop starts at 1).
        for (int j = 1; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 5)
            {
                // Cells 0, 1, 3 and 4 are read below; a short row would abort the crawl.
                continue;
            }

            string remark = string.Empty, otherType = string.Empty;

            string code = tr.Columns[0].ToPlainTextString().Trim();
            string prjName = tr.Columns[1].ToPlainTextString().Trim();

            // Keep the first 10 characters of each date cell, tolerating shorter cells.
            string endDate = tr.Columns[4].ToPlainTextString().Replace(" ", "").Trim();
            if (endDate.Length > 10)
            {
                endDate = endDate.Substring(0, 10);
            }

            string beginDate = tr.Columns[3].ToPlainTextString().Replace(" ", "").Trim();
            if (beginDate.Length > 10)
            {
                beginDate = beginDate.Substring(0, 10);
            }

            NodeList anchors = tr.Columns[1].SearchFor(typeof(ATag), true);
            ATag aTag = (anchors != null && anchors.Count > 0) ? anchors[0] as ATag : null;
            if (aTag == null || aTag.Link == null)
            {
                // No detail link in this row.
                continue;
            }

            string InfoUrl = "http://zb.zjcic.net" + aTag.Link.Replace("amp;", "").Trim();

            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default).Replace(" ", "");
            }
            catch (Exception)
            {
                Logger.Error("InviteZhJianJS");
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "dnn_ctr408_ProjectView_INSTRUCTION")));
            if (dtnode == null || dtnode.Count <= 0)
            {
                continue;
            }

            string HtmlTxt = dtnode.AsHtml();
            string inviteCtx = dtnode.AsString().Replace(" ", "").Trim();

            // Strip either label the pattern can match; previously only
            // "招标单位:" was removed, so a "招标人" label leaked into the value.
            Regex regBuidUnit = new Regex(@"(招标单位|招标人):[^\r\n]+\r\n");
            string buildUnit = regBuidUnit.Match(inviteCtx).Value.Replace("招标单位:", "").Replace("招标人:", "").Replace(":", "").Replace(" ", "").Trim();

            Regex regPrjAddr = new Regex(@"(工程地点|工程地址|工程建设地点)(:|:)[^\r\n]+\r\n");
            string prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程地点:", "").Replace("工程地址", "").Replace("工程建设地点", "").Replace(":", "").Trim();
            if (prjAddress == "")
            {
                prjAddress = "见招标信息";
            }

            string msgType = "湛江市建设工程交易中心";
            string specType = "建设工程";
            string inviteType = ToolHtml.GetInviteTypes(prjName);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "湛江市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// For every entry returned by <c>GetCitys()</c> (area name mapped to list URL), crawls
/// the paged result grid, builds one <c>BidInfo</c> per row whose detail page
/// contains the <c>TDContent</c> cell, and queues attachment links on
/// <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling of an area stops once <c>MaxCount</c> items have been
/// collected for that area (the counter is per area, not global).
/// </param>
/// <returns>The collected <c>BidInfo</c> items across all areas processed.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List <BidInfo>();
    Dictionary <string, string> citys = this.GetCitys();
    foreach (string area in citys.Keys)
    {
        // Items collected for the current area only.
        int count = 0;
        int pageInt = 1;
        string html = string.Empty;
        string viewState = string.Empty;
        string eventValidation = string.Empty;
        string cookiestr = string.Empty;
        // NOTE(review): a failure on any area's first page returns immediately, so the
        // remaining areas are never crawled. Confirm this is intended rather than `continue`.
        try { html = this.ToolWebSite.GetHtmlByUrl(citys[area], Encoding.UTF8, ref cookiestr); } catch { return(list); }
        Parser parser = new Parser(new Lexer(html));
        // The page count sits between "总页数" and "当前页" in the pager cell.
        NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("nowrap", "true")));
        if (pageNode != null && pageNode.Count > 0)
        {
            try
            {
                string temp = pageNode.AsString().GetRegexBegEnd("总页数", "当前页").Replace(":", "");
                pageInt = int.Parse(temp);
            }
            catch { }
        }
        for (int i = 1; i <= pageInt; i++)
        {
            if (i > 1)
            {
                // Post the pager event back, using the hidden fields of the page currently held.
                viewState = this.ToolWebSite.GetAspNetViewState(html);
                eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
                string viewSTATEGENERATOR = ToolHtml.GetHtmlInputValue(html, "__VIEWSTATEGENERATOR");
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTTARGET", "__EVENTARGUMENT", "__EVENTVALIDATION", "MoreInfoList1$txtTitle" }, new string[] { viewState, viewSTATEGENERATOR, "MoreInfoList1$Pager", i.ToString(), eventValidation, "" });
                try { html = this.ToolWebSite.GetHtmlByUrl(citys[area], nvc, Encoding.UTF8, ref cookiestr); } catch { continue; }
            }
            parser = new Parser(new Lexer(html));
            NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
            if (listNode != null && listNode.Count > 0)
            {
                TableTag table = listNode[0] as TableTag;
                // Row 0 is skipped (the loop starts at 1).
                for (int j = 1; j < table.RowCount; j++)
                {
                    string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty, bidDate = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty, specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty, prjAddress = string.Empty, remark = string.Empty, prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                    TableRow tr = table.Rows[j];
                    ATag aTag = tr.Columns[1].GetATag();
                    // presumably strips the two status markers from the title; verify GetReplace's list semantics
                    prjName = aTag.GetAttribute("title").GetReplace("【正在报名】,【报名结束】");
                    beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
                    InfoUrl = "http://www.gxzbtb.cn" + aTag.Link;
                    string htmldtl = string.Empty;
                    try { htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString(); } catch { continue; }
                    parser = new Parser(new Lexer(htmldtl));
                    NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
                    if (dtlNode != null && dtlNode.Count > 0)
                    {
                        HtmlTxt = dtlNode.AsHtml();
                        // First attempt: extract every field from the body text.
                        bidCtx = HtmlTxt.GetReplace(new string[] { "<br/>", "<br />", "<br>" }, "\r\n").ToCtxString();
                        prjAddress = bidCtx.GetAddressRegex();
                        buildUnit = bidCtx.GetBuildRegex();
                        bidUnit = bidCtx.GetBidRegex();
                        bidMoney = bidCtx.GetMoneyRegex();
                        prjMgr = bidCtx.GetMgrRegex();
                        code = bidCtx.GetCodeRegex().GetCodeDel();
                        if (string.IsNullOrEmpty(bidUnit))
                        {
                            // Fallback 1: flatten the first table in the body into "cell:cell" lines
                            // (each even-positioned cell ends a line), then re-run the extractors,
                            // filling only the fields that are still empty.
                            parser = new Parser(new Lexer(HtmlTxt));
                            NodeList bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                            if (bidNode != null && bidNode.Count > 0)
                            {
                                string ctx = string.Empty;
                                TableTag bidTable = bidNode[0] as TableTag;
                                for (int r = 0; r < bidTable.RowCount; r++)
                                {
                                    for (int c = 0; c < bidTable.Rows[r].ColumnCount; c++)
                                    {
                                        if ((c + 1) % 2 == 0)
                                        {
                                            ctx += bidTable.Rows[r].Columns[c].ToNodePlainString() + "\r\n";
                                        }
                                        else
                                        {
                                            ctx += bidTable.Rows[r].Columns[c].ToNodePlainString() + ":";
                                        }
                                    }
                                }
                                bidUnit = ctx.GetBidRegex();
                                if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0") { bidMoney = ctx.GetMoneyString().GetMoney("万元"); }
                                if (string.IsNullOrEmpty(prjAddress)) { prjAddress = ctx.GetAddressRegex(); }
                                if (string.IsNullOrEmpty(buildUnit)) { buildUnit = ctx.GetBuildRegex(); }
                                if (string.IsNullOrEmpty(code)) { code = ctx.GetCodeRegex().GetCodeDel(); }
                                // A value containing any of these words is discarded as not being a real bidder name.
                                if (bidUnit.Contains("推荐") || bidUnit.Contains("中标") || bidUnit.Contains("地址")) { bidUnit = string.Empty; }
                                if (string.IsNullOrEmpty(bidUnit))
                                {
                                    if (bidTable.RowCount > 1)
                                    {
                                        // Fallback 2: pair each cell of row 0 with the cell below it in row 1
                                        // ("row0:row1" per line).
                                        ctx = string.Empty;
                                        for (int d = 0; d < bidTable.Rows[0].ColumnCount; d++)
                                        {
                                            ctx += bidTable.Rows[0].Columns[d].ToNodePlainString() + ":";
                                            // Row 1 may have fewer cells than row 0; a missing cell is ignored.
                                            try { ctx += bidTable.Rows[1].Columns[d].ToNodePlainString() + "\r\n"; } catch { }
                                        }
                                        bidUnit = ctx.GetBidRegex();
                                        if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0") { bidMoney = ctx.GetMoneyString().GetMoney(); }
                                        if (string.IsNullOrEmpty(prjAddress)) { prjAddress = ctx.GetAddressRegex(); }
                                        if (string.IsNullOrEmpty(buildUnit)) { buildUnit = ctx.GetBuildRegex(); }
                                        if (string.IsNullOrEmpty(code)) { code = ctx.GetCodeRegex().GetCodeDel(); }
                                    }
                                }
                            }
                        }
                        // Amounts above 10000 are divided by 10000; unparsable amounts are left untouched.
                        // NOTE(review): presumably converts yuan to ten-thousand-yuan units. Confirm.
                        try { if (decimal.Parse(bidMoney) > 10000) { bidMoney = (decimal.Parse(bidMoney) / 10000).ToString(); } } catch { }
                        // Remove label fragments and ranking marks from the bidder name.
                        // NOTE(review): this also deletes every "1" and "2" character inside a genuine name.
                        bidUnit = bidUnit.Replace("名称", "").Replace("单位", "").Replace("№", "").Replace("1", "").Replace("2", "").Replace("联合体", "").Replace("(", "");
                        // Cut the name right after the first organisation suffix found.
                        if (bidUnit.Contains("公司")) { bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司"; }
                        if (bidUnit.Contains("研究院")) { bidUnit = bidUnit.Remove(bidUnit.IndexOf("研究院")) + "研究院"; }
                        if (bidUnit.Contains("研究所")) { bidUnit = bidUnit.Remove(bidUnit.IndexOf("研究所")) + "研究所"; }
                        bidType = "水利工程";
                        specType = "建设工程";
                        msgType = "广西壮族自治区公共资源交易中心";
                        BidInfo info = ToolDb.GenBidInfo("广西壮族自治区", "广西壮族自治区及地市", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                        list.Add(info);
                        count++;
                        // Queue attachment links found in the body; relative links are resolved against the site root.
                        parser = new Parser(new Lexer(HtmlTxt));
                        NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                        if (aNode != null && aNode.Count > 0)
                        {
                            for (int k = 0; k < aNode.Count; k++)
                            {
                                ATag a = aNode[k] as ATag;
                                if (a.IsAtagAttach())
                                {
                                    string link = string.Empty;
                                    if (a.Link.ToLower().Contains("http")) { link = a.Link; }
                                    else { link = "http://www.gxzbtb.cn/" + a.Link.GetReplace("../,./"); }
                                    // Links longer than 500 bytes are skipped.
                                    if (Encoding.Default.GetByteCount(link) > 500) { continue; }
                                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                                    base.AttachList.Add(attach);
                                }
                            }
                        }
                        // Per-area limit reached: jump to the end of this area's iteration.
                        if (!crawlAll && count >= this.MaxCount) { goto Funcs; }
                    }
                }
            }
        }
        // Jump target for the per-area limit; execution continues with the next area.
        Funcs :;
    }
    return(list);
}
/// <summary>
/// Crawls the paged pre-qualification list at <c>SiteUrl</c> and returns one
/// <c>NoticeInfo</c> per data row whose detail page contains the
/// <c>context_div</c> block.
/// </summary>
/// <param name="crawlAll">
/// When false, the method returns as soon as the list holds <c>MaxCount</c> items.
/// </param>
/// <returns>The collected <c>NoticeInfo</c> items (empty if the first page cannot be fetched).</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList results = new ArrayList();
    string listHtml = string.Empty;

    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default).GetJsString();
    }
    catch (Exception)
    {
        return results;
    }

    // The last pager link carries the page count as "javascript:goPage(N)".
    int totalPages = 1;
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagerLinks = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "toptd1")), true), new TagNameFilter("a")));
    if (pagerLinks != null && pagerLinks.Count > 0)
    {
        try
        {
            string lastLink = pagerLinks[pagerLinks.Count - 1].GetATagValue();
            totalPages = Convert.ToInt32(lastLink.Replace("javascript:goPage(", "").Replace(")", ""));
        }
        catch
        {
            totalPages = 1;
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            // Re-post the search form with the requested page number, reusing
            // the hidden fields of the page currently held.
            try
            {
                string typeId = ToolHtml.GetHtmlInputValue(listHtml, "typeId");
                string boardId = ToolHtml.GetHtmlInputValue(listHtml, "boardId");
                string totalRows = ToolHtml.GetHtmlInputValue(listHtml, "totalRows");
                string[] fieldNames = new string[] { "typeId", "boardId", "newstitle", "sTime", "eTime", "totalRows", "pageNO" };
                string[] fieldValues = new string[] { typeId, boardId, "", "", "", totalRows, pageNo.ToString() };
                NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
                listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, form, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        NodeList tables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
        if (tables == null || tables.Count <= 0)
        {
            continue;
        }

        TableTag grid = tables[0] as TableTag;
        // The first and last rows are not data rows (the loop skips both).
        for (int rowIndex = 1; rowIndex < grid.RowCount - 1; rowIndex++)
        {
            TableRow row = grid.Rows[rowIndex];
            string title = row.Columns[1].ToNodePlainString();
            string listDate = row.Columns[2].ToPlainTextString().GetDateRegex();
            string detailUrl = row.Columns[1].GetATagHref();

            string detailHtml = string.Empty;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(detailUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "context_div")));
            if (contentNodes == null || contentNodes.Count <= 0)
            {
                continue;
            }

            string contentHtml = contentNodes.ToHtml();
            string contentText = contentHtml.ToCtxString().Replace("<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />", "");

            // Publish date: a Chinese-format date in the body first, then any
            // date in the body, then the date shown in the list row.
            string published = contentText.GetDateRegex("yyyy年MM月dd日").Replace("年", "-").Replace("月", "-").Replace("日", "");
            if (string.IsNullOrEmpty(published))
            {
                published = contentText.GetDateRegex();
            }

            if (string.IsNullOrEmpty(published))
            {
                published = listDate;
            }

            NoticeInfo info = ToolDb.GenNoticeInfo("广东省", "惠州市区", "龙门县", string.Empty, title, "资格预审", contentText, published, string.Empty, MsgTypeCosnt.HuiZhouMsgType, detailUrl, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, contentHtml);
            results.Add(info);

            if (!crawlAll && results.Count >= this.MaxCount)
            {
                return results;
            }
        }
    }

    return results;
}
/// <summary>
/// Crawls the paginated list at <c>SiteUrl</c> (paged through an ASP.NET postback on the
/// <c>anp1</c> pager), downloads each linked detail page and turns it into an
/// <c>InviteInfo</c> record via <c>ToolDb.GenInviteInfo</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected;
/// when <c>true</c>, every page reported by the pager is processed.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    // Determine the number of pages.
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
    }
    catch (Exception)
    {
        // Without the first list page there is nothing to crawl.
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "anp1")));
    if (tdNodes == null)
    {
        return list;
    }

    string pageTemp = tdNodes.AsString().Replace(" ", "").Trim();
    Regex pageRegex = new Regex(@"当前第[^页]+页");
    try
    {
        // The matched pager text is split on '/'; the part after the slash is the page count.
        pageInt = int.Parse(pageRegex.Match(pageTemp).Value.Split('/')[1].Replace("页", "").Trim());
    }
    catch (Exception)
    {
        // Unparseable pager: fall back to crawling the first page only.
        pageInt = 1;
    }

    // Loop-invariant patterns, built once instead of once per page / per row.
    Regex strayTextAfterOpenTd = new Regex(@"<td>[^<]+<td>");
    Regex strayTextAfterCloseTd = new Regex(@"</td>[^<]+</td>");
    Regex detailNoiseRegex = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
    Regex codeRegex = new Regex(@"(招标编号:[^)]+)");

    string cookiestr = string.Empty;
    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Re-post the form with the view state of the page fetched last to move the pager.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[]
                {
                    "anp1_input",
                    "Columnlist2_DepartmentTreeView_CheckedList",
                    "Columnlist2_DepartmentTreeView_EditEvents",
                    "Columnlist2_DepartmentTreeView_ExpandedList",
                    "Columnlist2_DepartmentTreeView_MoveEvents",
                    "Columnlist2_DepartmentTreeView_MultipleSelectedList",
                    "Columnlist2_DepartmentTreeView_ScrollData",
                    "Columnlist2_DepartmentTreeView_SelectedNode",
                    "Columnlist2_DepartmentTreeView_ValueChangeEvents",
                    "Login2:txtPassword",
                    "Login2:txtUserName",
                    "__EVENTARGUMENT",
                    "__EVENTTARGET",
                    "__VIEWSTATE"
                },
                new string[]
                {
                    (i - 1).ToString(),
                    string.Empty,
                    string.Empty,
                    string.Empty,
                    string.Empty,
                    string.Empty,
                    "0,0",
                    "p_379",
                    string.Empty,
                    string.Empty,
                    string.Empty,
                    i.ToString(),
                    "anp1",
                    viewState
                });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch (Exception)
            {
                // Best effort: a page that cannot be fetched is skipped.
                continue;
            }
        }

        // Collapse "<td>text<td>" and "</td>text</td>" sequences before parsing the list table.
        html = strayTextAfterCloseTd.Replace(strayTextAfterOpenTd.Replace(html, "<td>"), "</td>");
        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "dlstNews")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                   prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                   specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                   remark = string.Empty, infoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, htmlTxt = string.Empty;

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 2)
            {
                // Columns 0 and 1 are read below; a shorter row would otherwise abort the whole crawl.
                continue;
            }

            beginDate = tr.Columns[1].ToPlainTextString().Trim();

            NodeList anchors = tr.Columns[0].SearchFor(typeof(ATag), true);
            if (anchors == null || anchors.Count == 0)
            {
                // Row without a link: nothing to follow.
                continue;
            }
            ATag aTag = anchors[0] as ATag;
            if (aTag == null)
            {
                continue;
            }
            infoUrl = "http://www.szgxzb.com/zbgg/" + aTag.Link;

            string htmldetail;
            try
            {
                // The detail page is downloaded once; both derived strings come from this response.
                string compactDetail = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).Replace(" ", "");

                Parser contentHtmlParser = new Parser(new Lexer(compactDetail.Trim()));
                NodeList contentHtmlNodes = contentHtmlParser.ExtractAllNodesThatMatch(
                    new AndFilter(new HasAttributeFilter("id", "newsContent"), new TagNameFilter("div")));
                htmlTxt = contentHtmlNodes.AsHtml();

                htmldetail = compactDetail.Replace("</br>", "\r\n").Replace("<br>", "\r\n");
                htmldetail = detailNoiseRegex.Replace(htmldetail, "");
            }
            catch (Exception)
            {
                // Best effort: an entry whose detail page cannot be fetched or parsed is skipped.
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList prjNameNode = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("title"));
            prjName = prjNameNode.AsString().Replace("国信招标--", "");

            dtlparser.Reset();
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(
                new AndFilter(new HasAttributeFilter("id", "newsContent"), new TagNameFilter("div")));
            inviteCtx = dtnode.AsString();

            code = codeRegex.Match(inviteCtx).Value
                .Replace("招标编号", "").Replace("(", "").Replace(")", "").Replace(":", "").Trim();
            if (Encoding.Default.GetByteCount(code) > 50)
            {
                // A code longer than 50 bytes is discarded rather than stored.
                code = "";
            }

            specType = "其他";
            msgType = "深圳市国信招标有限公司";
            inviteType = ToolHtml.GetInviteTypes(prjName);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, infoUrl, htmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paginated list at <c>SiteUrl</c> (follow-up pages are <c>index_N.html</c>),
/// downloads each linked detail page, extracts the bid fields from its text and turns it
/// into a <c>BidInfo</c> record via <c>ToolDb.GenBidInfo</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected;
/// when <c>true</c>, every page reported by the pager is processed.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        // Without the first list page there is nothing to crawl.
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list_page")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            // The page count is the text between "HTML" and the first comma of the pager text,
            // with the opening parenthesis removed.
            string pagerText = sNode.AsString().Replace(",", "kdcc");
            string pageCountText = pagerText.GetRegexBegEnd("HTML", "kdcc").Replace("(", "");
            pageInt = Convert.ToInt32(pageCountText);
        }
        catch
        {
            // Unparseable pager: fall back to crawling the first page only.
            pageInt = 1;
        }
    }

    // Loop-invariant patterns, built once instead of once per entry.
    Regex dateRegex = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex scriptBlockRegex = new Regex("(<script)[\\s\\S]*?(</script>)");
    Regex detailNoiseRegex = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
    Regex prjCodeRegex = new Regex(@"(工程编号|项目编号|招标编号|中标编号|编号)(:|:)[^\r\n]+\r\n");
    Regex buildUnitRegex = new Regex(@"(建设单位|招标人|承包人|招标单位|招标方|招标代理机构)(:|:)[^\r\n]+\r\n");
    Regex moneyRegex = new Regex(@"(中标价|投标价|总投资|发包价|投标报价|价格|金额)(:|:|)[^\r\n]+\r\n");
    Regex bidUnitRegex = new Regex(@"(第一候选人|中标候选人|中标单位|中标人|中标方)(:|:)[^\r\n]+\r\n");
    Regex prjMgrRegex = new Regex(@"(项目经理姓名|项目经理|项目负责人|项目总监|建造师|总工程师|监理师)(:|:)[^\r\n]+\r\n");
    Regex amountRegex = new Regex(@"[0-9]+[.]{0,1}[0-9]+");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://xajdb.baoan.gov.cn/xxgk_11984/ywxx/zbcg/zbxxgg/index_" + (i - 1).ToString() + ".html", Encoding.UTF8);
            }
            catch
            {
                // Best effort: a page that cannot be fetched is skipped.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(
                    new AndFilter(
                        new HasParentFilter(
                            new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "right_list"))),
                        new TagNameFilter("ul"))),
                new TagNameFilter("li")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty,
                   bidMoney = string.Empty, code = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, bidType = string.Empty, specType = string.Empty,
                   infoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjMgr = string.Empty, otherType = string.Empty, htmlTxt = string.Empty;

            string itemText = viewList[j].ToPlainTextString().Trim();
            beginDate = dateRegex.Match(itemText).Value;

            // string.Replace throws ArgumentException for an empty search string; without this
            // guard a single entry that carries no date would abort the whole crawl.
            string temp = beginDate.Length > 0 ? itemText.Replace(beginDate, "") : itemText;

            try
            {
                // The values derived from ahtml are not used further down; the statements are
                // kept in their original order because any exception raised here skips the entry.
                string ahtml = viewList[j].ToHtml();
                string aStr = ahtml.Replace(";", "kdxx").GetRegexBegEnd("write", "kdxx").GetReplace("(,)");
                ATag firstAnchor = ahtml.GetATag();
                if (string.IsNullOrWhiteSpace(firstAnchor.Link))
                {
                    ahtml = ahtml.Replace(aStr, "");
                    // NOTE(review): string.Replace rejects an empty search string, so as written
                    // this call throws and the entry is skipped by the catch below. The literal
                    // may have lost a character - confirm the intended text.
                    ahtml = ahtml.Replace("", "kdxx").GetRegexBegEnd("write", "kdxx").GetReplace("(,)");
                }

                // Cut the anchor markup that follows "else" out of the entry's plain text.
                int beg = temp.IndexOf("else"), end = temp.Length;
                temp = temp.Substring(beg, end - beg);
                beg = temp.IndexOf("<a");
                end = temp.IndexOf("/a>");
                temp = temp.Substring(beg, (end - beg) + 3);
                beg = temp.IndexOf(">");
                end = temp.IndexOf("</");
                prjName = temp.Substring(beg + 1, end - beg - 1);

                Parser anchorParser = new Parser(new Lexer(temp));
                NodeList anchorNodes = anchorParser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                ATag aTag = anchorNodes.SearchFor(typeof(ATag), true)[0] as ATag;
                infoUrl = "http://xajdb.baoan.gov.cn/xxgk_11984/ywxx/zbcg/zbxxgg/" + aTag.Link.Replace("../", "").Replace("./", "");
            }
            catch
            {
                // Best effort: an entry whose link cannot be extracted is skipped.
                continue;
            }

            string htmDtl;
            try
            {
                htmDtl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(infoUrl), Encoding.UTF8);
                htmDtl = scriptBlockRegex.Replace(htmDtl, "");
                htmDtl = detailNoiseRegex.Replace(htmDtl, "");
            }
            catch
            {
                // Best effort: an entry whose detail page cannot be fetched is skipped.
                continue;
            }

            parser = new Parser(new Lexer(htmDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "DivContent")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            htmlTxt = scriptBlockRegex.Replace(dtl.AsHtml(), "");

            // Preferred source of the text: the first table of the content ("th" is rewritten
            // to "td" in the lower-cased markup before parsing). Otherwise the tag-stripped
            // content is used.
            parser = new Parser(new Lexer(htmlTxt.ToLower().Replace("th", "td")));
            NodeList dtlTab = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
            if (dtlTab != null && dtlTab.Count > 0)
            {
                bidCtx = FlattenBidDetailTable(dtlTab[0] as TableTag, false);
            }
            else
            {
                bidCtx = scriptBlockRegex.Replace(htmlTxt, "");
                bidCtx = Regex.Replace(bidCtx.Replace("<BR/>", "\r\n").Replace("<br/>", "\r\n").Replace("<BR>", "\r\n").Replace("<br>", "\r\n"), "<[^>]*>", "").Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\r\n", "\r\n") + "\r\n";
            }

            // All field patterns run against the text with spaces removed.
            string compactCtx = bidCtx.Replace(" ", "");
            code = ExtractBidLabeledValue(prjCodeRegex, compactCtx, "工程编号", "项目编号", "招标编号", "中标编号", "编号");
            buildUnit = ExtractBidLabeledValue(buildUnitRegex, compactCtx, "招标代理机构", "建设单位", "招标人", "承包人", "招标单位", "招标方");
            bidMoney = ExtractBidLabeledValue(moneyRegex, compactCtx, "中标价", "总投资", "发包价", "投标报价", "投标价", "价格", "金额");
            bidUnit = ExtractBidLabeledValue(bidUnitRegex, compactCtx, "中标候选人", "第一候选人", "中标单位", "中标人", "中标方");
            prjMgr = ExtractBidLabeledValue(prjMgrRegex, compactCtx, "项目经理姓名", "总工程师", "项目经理", "项目总监", "建造师", "监理师", "项目负责人");

            if (bidMoney.Contains("万"))
            {
                // Amount already carries the "万" unit: keep the number in front of it.
                bidMoney = bidMoney.Remove(bidMoney.IndexOf("万")).Trim();
                bidMoney = amountRegex.Match(bidMoney).Value;
            }
            else
            {
                try
                {
                    // No "万" unit: divide by 10000; results below 0.1 are stored as "0".
                    bidMoney = (decimal.Parse(amountRegex.Match(bidMoney).Value) / 10000).ToString();
                    if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                    {
                        bidMoney = "0";
                    }
                }
                catch (Exception)
                {
                    // No parseable amount.
                    bidMoney = "0";
                }
            }

            if (prjMgr.Contains("资格"))
            {
                prjMgr = prjMgr.Remove(prjMgr.IndexOf("资格"));
            }

            if (string.IsNullOrEmpty(bidUnit) && string.IsNullOrEmpty(buildUnit))
            {
                // Second attempt: re-parse with paragraph and line breaks turned into newlines,
                // then look for the two unit fields again.
                parser = new Parser(new Lexer(htmlTxt.ToLower().Replace("th", "td").Replace("</p>", "\r\n").Replace("<br/>", "\r\n").Replace("<br>", "\r\n")));
                dtlTab = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (dtlTab != null && dtlTab.Count > 0)
                {
                    bidCtx = FlattenBidDetailTable(dtlTab[0] as TableTag, true);
                }

                compactCtx = bidCtx.Replace(" ", "");
                buildUnit = ExtractBidLabeledValue(buildUnitRegex, compactCtx, "招标代理机构", "建设单位", "招标人", "承包人", "招标单位", "招标方");
                bidUnit = ExtractBidLabeledValue(bidUnitRegex, compactCtx, "中标候选人", "第一候选人", "中标单位", "中标人", "中标方");
            }

            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            bidUnit = ToolHtml.GetSubString(bidUnit, 150);
            code = ToolHtml.GetSubString(code, 50);
            prjMgr = ToolHtml.GetSubString(prjMgr, 50);
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市宝安区新安街道办事处";
            }

            msgType = "深圳市宝安区新安街道办事处";
            specType = "建设工程";
            // The bid type is fixed for this source.
            bidType = "小型工程";
            prjName = ToolDb.GetPrjName(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, infoUrl, prjMgr, htmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}

/// <summary>
/// Flattens a detail table into text: cells at even column indexes are followed by ":" and
/// cells at odd column indexes by a line break; a cell equal to "工程类型" ends its row.
/// </summary>
/// <param name="table">The table to flatten; <c>null</c> is treated like a table without rows.</param>
/// <param name="expandCellNewlines">
/// When <c>true</c>, "\n" inside a cell is first expanded to "\r\n" before the cell is appended.
/// </param>
/// <returns>The flattened text after newline normalisation, always ending in "\r\n".</returns>
private static string FlattenBidDetailTable(TableTag table, bool expandCellNewlines)
{
    StringBuilder text = new StringBuilder();
    if (table != null)
    {
        for (int k = 0; k < table.RowCount; k++)
        {
            TableRow row = table.Rows[k];
            for (int c = 0; c < row.ColumnCount; c++)
            {
                string cell = row.Columns[c].ToPlainTextString().Replace(" ", "").Replace(" ", "");
                if (expandCellNewlines)
                {
                    cell = cell.Replace("\n", "\r\n");
                }
                if (cell == "工程类型")
                {
                    break;
                }
                text.Append(cell).Append(c % 2 == 0 ? ":" : "\r\n");
            }
        }
    }

    return text.ToString().Replace("\n", "").Replace("\r\n\r\n", "\r\n").Replace("\r", "\r\n") + "\r\n";
}

/// <summary>
/// Returns the first match of <paramref name="pattern"/> in <paramref name="text"/> with the
/// given labels (removed in the order supplied) and the colon separators stripped, trimmed.
/// Returns an empty string when the pattern does not match.
/// </summary>
private static string ExtractBidLabeledValue(Regex pattern, string text, params string[] labels)
{
    string value = pattern.Match(text).Value;
    foreach (string label in labels)
    {
        value = value.Replace(label, "");
    }

    return value.Replace(":", "").Replace(":", "").Trim();
}