/// <summary>
/// Checks whether <paramref name="table"/> looks like a bid-result table: its HTML must not match
/// more than one table tag, and at least one of its rows must mention a winning-bidder label.
/// </summary>
/// <param name="table">The table to inspect; <c>null</c> is reported as "not a bid table".</param>
/// <returns>
/// <c>true</c> when the plain text of some row contains "中标供应商", "成交供应商" or "中标单位";
/// otherwise <c>false</c>.
/// </returns>
private bool IsTableBid(TableTag table)
{
    if (table == null)
    {
        return false;
    }

    // Re-parse the table's own HTML: more than one <table> match means it holds further tables
    // (presumably nested layout tables), and such tables are rejected.
    Parser tableparser = new Parser(new Lexer(table.ToHtml()));
    NodeList nodeList = tableparser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
    if (nodeList != null && nodeList.Count > 1)
    {
        return false;
    }

    for (int i = 0; i < table.RowCount; i++)
    {
        // Extract the row text once instead of once per keyword.
        string rowText = table.Rows[i].ToNodePlainString();
        if (rowText == null)
        {
            continue;
        }

        if (rowText.Contains("中标供应商") || rowText.Contains("成交供应商") || rowText.Contains("中标单位"))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Downloads the page at <c>SiteUrl</c> and follows three groups of links found on it, turning every
/// usable row of the linked award tables into a <c>CorpMerit</c> record.
/// </summary>
/// <param name="crawlAll">Not consulted by this crawler; every page of every linked list is read.</param>
/// <returns>
/// The records collected from all three groups; an empty list when the landing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<CorpMerit>();
    string html = string.Empty;
    try
    {
        html = ToolWeb.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        return list;
    }

    CrawlSpecialtyMerits(html, list);   // 优质专业工程
    CrawlOtherMerits(html, list);       // 其它工程
    CrawlAreaMerits(html, list);        // 深圳地区
    return list;
}

/// <summary>
/// Reads the page count from the text between "/" and "页" of the given pager nodes.
/// Returns 1 when there are no nodes or the text cannot be parsed as an integer.
/// </summary>
private static int ReadMeritPageCount(NodeList pageNode)
{
    if (pageNode == null || pageNode.Count == 0)
    {
        return 1;
    }

    try
    {
        return int.Parse(pageNode.AsString().GetRegexBegEnd("/", "页"));
    }
    catch
    {
        return 1;
    }
}

/// <summary>
/// Region "优质专业工程": follows every link of the first table nested in a td with height="29" and
/// records the rows of the width="844" table found on each result page.
/// </summary>
private void CrawlSpecialtyMerits(string html, IList list)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "29")), true), new TagNameFilter("table")));
    if (nodeList == null || nodeList.Count == 0)
    {
        return;
    }

    TableTag table = nodeList[0] as TableTag;
    if (table == null)
    {
        return;
    }

    parser = new Parser(new Lexer(table.ToHtml()));
    NodeList aTagNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
    if (aTagNode == null)
    {
        return;
    }

    for (int j = 0; j < aTagNode.Count; j++)
    {
        ATag aTag = aTagNode[j].GetATag();
        string name = "优质专业工程";
        string typename = aTag.LinkText.Replace("·", "");
        string url = "http://www.jianzhuxh.com/excellence/" + aTag.Link;
        string htlList = string.Empty;
        try
        {
            htlList = ToolWeb.GetHtmlByUrl(url, Encoding.Default);
        }
        catch
        {
            continue;
        }

        parser = new Parser(new Lexer(htlList));
        NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("align", "center")));
        int page = ReadMeritPageCount(pageNode);

        for (int d = 1; d <= page; d++)
        {
            if (d > 1)
            {
                try
                {
                    htlList = ToolWeb.GetHtmlByUrl(url + "&page=" + d, Encoding.Default);
                }
                catch
                {
                    continue;
                }
            }

            parser = new Parser(new Lexer(htlList));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "text")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            parser = new Parser(new Lexer(dtlNode.ToHtml()));
            NodeList dtlNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "844")));
            if (dtlNodeList == null || dtlNodeList.Count == 0)
            {
                continue;
            }

            TableTag tableTag = dtlNodeList[0] as TableTag;
            if (tableTag == null)
            {
                continue;
            }

            for (int k = 0; k < tableTag.RowCount; k++)
            {
                TableRow tr = tableTag.Rows[k];
                // Columns 1..8 are read below; a shorter row would otherwise throw and abort the crawl.
                if (tr.ColumnCount < 9)
                {
                    continue;
                }

                string CorpCode = string.Empty, MeritDate = string.Empty, MeritLevel = string.Empty,
                       MeritRegion = string.Empty, MeritSector = string.Empty, PrjSupporter = string.Empty,
                       Source = string.Empty, Remark = string.Empty, Details = string.Empty,
                       PileConsUnit = string.Empty, BuildingType = string.Empty;

                string MeritName = name;
                string MeritType = typename;
                string MeritPrjName = tr.Columns[1].ToNodePlainString();
                string CorpName = tr.Columns[2].ToNodePlainString();
                string PrjMgr = tr.Columns[3].ToNodePlainString();
                string SupUnit = tr.Columns[4].ToNodePlainString();
                string SupMgr = tr.Columns[5].ToNodePlainString();
                string ManCost = tr.Columns[6].ToNodePlainString();
                if (ManCost.Contains("吨"))
                {
                    // A value containing "吨" is discarded rather than stored as a cost.
                    ManCost = string.Empty;
                }
                string ProArea = tr.Columns[7].ToNodePlainString();
                string MeritYear = tr.Columns[8].ToNodePlainString();

                CorpMerit info = ToolDb.GenCorpMerit("广东省", "深圳市", "", CorpCode, CorpName, MeritYear, MeritName, MeritDate, MeritLevel, MeritRegion, MeritSector, MeritPrjName, PrjSupporter, Source, url, Remark, Details, MeritType, PrjMgr, SupMgr, ManCost, ProArea, SupUnit, PileConsUnit, BuildingType);
                list.Add(info);
            }
        }
    }
}

/// <summary>
/// Region "其它工程": follows every link of the third align="center" table nested in a td with
/// height="32" and records the rows of the class="text16" table on each result page. The column
/// layout depends on the link text.
/// </summary>
private void CrawlOtherMerits(string html, IList list)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList theNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "32")), true), new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("align", "center"))));
    if (theNode == null || theNode.Count <= 2)
    {
        return;
    }

    TableTag table = theNode[2] as TableTag;
    if (table == null)
    {
        return;
    }

    parser = new Parser(new Lexer(table.ToHtml()));
    NodeList atagNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
    if (atagNode == null)
    {
        return;
    }

    for (int j = 0; j < atagNode.Count; j++)
    {
        ATag aTag = atagNode[j].GetATag();
        string typename = aTag.LinkText;
        string url = "http://www.jianzhuxh.com/excellence/" + aTag.Link;
        string htmlList = string.Empty;
        try
        {
            htmlList = ToolWeb.GetHtmlByUrl(url, Encoding.Default);
        }
        catch
        {
            continue;
        }

        parser = new Parser(new Lexer(htmlList));
        NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("align", "center")));
        int page = ReadMeritPageCount(pageNode);

        for (int k = 1; k <= page; k++)
        {
            if (k > 1)
            {
                try
                {
                    htmlList = ToolWeb.GetHtmlByUrl(url + "&page=" + k.ToString(), Encoding.Default);
                }
                catch
                {
                    // Skip the page: falling through would parse the previous page again and
                    // add its rows a second time.
                    continue;
                }
            }

            parser = new Parser(new Lexer(htmlList));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "text16")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            TableTag tableTag = dtlNode[0] as TableTag;
            if (tableTag == null)
            {
                continue;
            }

            for (int t = 0; t < tableTag.RowCount; t++)
            {
                TableRow tr = tableTag.Rows[t];

                string CorpCode = string.Empty, CorpName = string.Empty, MeritYear = string.Empty,
                       MeritDate = string.Empty, MeritLevel = string.Empty, MeritRegion = string.Empty,
                       MeritSector = string.Empty, MeritPrjName = string.Empty, PrjSupporter = string.Empty,
                       Source = string.Empty, Remark = string.Empty, Details = string.Empty,
                       PrjMgr = string.Empty, SupMgr = string.Empty, ManCost = string.Empty,
                       ProArea = string.Empty, SupUnit = string.Empty, PileConsUnit = string.Empty,
                       BuildingType = string.Empty;
                string MeritName = typename;
                string MeritType = typename;

                // Each branch first checks that the row has every column it reads; shorter rows
                // are skipped instead of throwing.
                if (typename.Contains("优质工程"))
                {
                    if (tr.ColumnCount < 10)
                    {
                        continue;
                    }
                    MeritName = MeritType = "深圳市" + typename;
                    MeritPrjName = tr.Columns[2].ToNodePlainString();
                    CorpName = tr.Columns[3].ToNodePlainString();
                    PrjMgr = tr.Columns[4].ToNodePlainString();
                    SupUnit = tr.Columns[5].ToNodePlainString();
                    SupMgr = tr.Columns[6].ToNodePlainString();
                    PrjSupporter = tr.Columns[7].ToNodePlainString();
                    string temp = tr.Columns[8].ToNodePlainString();
                    // The same column carries either a cost (contains "元") or an area.
                    if (temp.Contains("元"))
                    {
                        ManCost = temp;
                    }
                    else
                    {
                        ProArea = temp;
                    }
                    MeritYear = tr.Columns[9].ToNodePlainString();
                }
                else if (typename.Contains("优质结构工程"))
                {
                    if (tr.ColumnCount < 11)
                    {
                        continue;
                    }
                    MeritName = MeritType = "深圳市" + typename;
                    MeritPrjName = tr.Columns[1].ToNodePlainString();
                    CorpName = tr.Columns[2].ToNodePlainString();
                    PrjMgr = tr.Columns[3].ToNodePlainString();
                    PileConsUnit = tr.Columns[4].ToNodePlainString();
                    SupUnit = tr.Columns[5].ToNodePlainString();
                    SupMgr = tr.Columns[6].ToNodePlainString();
                    string temp = tr.Columns[8].ToNodePlainString();
                    if (temp.Contains("元"))
                    {
                        ManCost = temp;
                    }
                    else
                    {
                        ProArea = temp;
                    }
                    MeritYear = tr.Columns[10].ToNodePlainString();
                }
                else if (typename.Contains("用户满意工程"))
                {
                    if (tr.ColumnCount < 7)
                    {
                        continue;
                    }
                    MeritName = MeritType = "深圳市" + typename;
                    MeritPrjName = tr.Columns[1].ToNodePlainString();
                    CorpName = tr.Columns[2].ToNodePlainString();
                    SupUnit = tr.Columns[3].ToNodePlainString();
                    BuildingType = tr.Columns[4].ToNodePlainString();
                    ProArea = tr.Columns[5].ToNodePlainString();
                    MeritYear = tr.Columns[6].ToNodePlainString();
                }
                else if (typename.Contains("绿色施工示范工程"))
                {
                    if (tr.ColumnCount < 11)
                    {
                        continue;
                    }
                    MeritName = MeritType = "深圳市" + typename;
                    MeritPrjName = tr.Columns[2].ToNodePlainString();
                    CorpName = tr.Columns[3].ToNodePlainString();
                    PrjMgr = tr.Columns[4].ToNodePlainString();
                    SupUnit = tr.Columns[5].ToNodePlainString();
                    SupMgr = tr.Columns[6].ToNodePlainString();
                    PrjSupporter = tr.Columns[8].ToNodePlainString();
                    MeritYear = tr.Columns[10].ToNodePlainString();
                }
                else if (typename.Contains("文明工地") || typename.Contains("双优工地") || typename.Contains("双优样板工地"))
                {
                    if (tr.ColumnCount < 8)
                    {
                        continue;
                    }
                    MeritPrjName = tr.Columns[1].ToNodePlainString();
                    CorpName = tr.Columns[2].ToNodePlainString();
                    PrjMgr = tr.Columns[3].ToNodePlainString();
                    SupUnit = tr.Columns[4].ToNodePlainString();
                    SupMgr = tr.Columns[5].ToNodePlainString();
                    string temp = tr.Columns[6].ToNodePlainString();
                    if (temp.Contains("元"))
                    {
                        ManCost = temp;
                    }
                    else
                    {
                        ProArea = temp;
                    }
                    MeritYear = tr.Columns[7].ToNodePlainString();
                }

                // Link texts matching none of the branches still produce a record that carries
                // only the type name and the list URL.
                CorpMerit info = ToolDb.GenCorpMerit("广东省", "深圳市", "", CorpCode, CorpName, MeritYear, MeritName, MeritDate, MeritLevel, MeritRegion, MeritSector, MeritPrjName, PrjSupporter, Source, url, Remark, Details, MeritType, PrjMgr, SupMgr, ManCost, ProArea, SupUnit, PileConsUnit, BuildingType);
                list.Add(info);
            }
        }
    }
}

/// <summary>
/// Region "深圳地区": follows every link of the second table nested in a td with height="29" and
/// records rows 1..n of the result table (class "py_tbl" when the link text contains "鲁班奖",
/// otherwise class "text18").
/// </summary>
private void CrawlAreaMerits(string html, IList list)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList areaNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "29")), true), new TagNameFilter("table")));
    // Index 1 is used below, so at least two matches are required.
    if (areaNode == null || areaNode.Count < 2)
    {
        return;
    }

    TableTag table = areaNode[1] as TableTag;
    if (table == null)
    {
        return;
    }

    parser = new Parser(new Lexer(table.ToHtml()));
    NodeList listNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
    if (listNode == null)
    {
        return;
    }

    for (int j = 0; j < listNode.Count; j++)
    {
        ATag aTag = listNode[j].GetATag();
        string typename = aTag.LinkText.Replace("·", "");
        string url = "http://www.jianzhuxh.com/excellence/" + aTag.Link;
        string htmlList = string.Empty;
        try
        {
            htmlList = ToolWeb.GetHtmlByUrl(url, Encoding.Default);
        }
        catch
        {
            continue;
        }

        parser = new Parser(new Lexer(htmlList));
        NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("form"), new HasAttributeFilter("name", "gopage")));
        int page = ReadMeritPageCount(pageNode);

        for (int k = 1; k <= page; k++)
        {
            if (k > 1)
            {
                try
                {
                    htmlList = ToolWeb.GetHtmlByUrl(url + "?page=" + k.ToString(), Encoding.Default);
                }
                catch
                {
                    continue;
                }
            }

            parser = new Parser(new Lexer(htmlList));
            string tableClass = typename.Contains("鲁班奖") ? "py_tbl" : "text18";
            NodeList tableNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", tableClass)));
            if (tableNode == null || tableNode.Count == 0)
            {
                continue;
            }

            TableTag tableTag = tableNode[0] as TableTag;
            if (tableTag == null)
            {
                continue;
            }

            // Row 0 is skipped here, unlike in the other two regions.
            for (int t = 1; t < tableTag.RowCount; t++)
            {
                TableRow tr = tableTag.Rows[t];
                // Columns 1..6 are read below.
                if (tr.ColumnCount < 7)
                {
                    continue;
                }

                string CorpCode = string.Empty, MeritDate = string.Empty, MeritLevel = string.Empty,
                       MeritRegion = string.Empty, MeritSector = string.Empty, Source = string.Empty,
                       Remark = string.Empty, Details = string.Empty, SupMgr = string.Empty,
                       ManCost = string.Empty, ProArea = string.Empty, PileConsUnit = string.Empty,
                       BuildingType = string.Empty;
                string MeritName = typename;
                string MeritType = typename;

                string MeritPrjName = tr.Columns[1].ToNodePlainString();
                string CorpName = tr.Columns[2].ToNodePlainString();
                string PrjSupporter = tr.Columns[3].ToNodePlainString().Replace("参建单位", "").Replace(":", "").Replace(":", "");
                string SupUnit = tr.Columns[4].ToNodePlainString();
                string PrjMgr = tr.Columns[5].ToNodePlainString();
                string MeritYear = tr.Columns[6].ToNodePlainString();

                CorpMerit info = ToolDb.GenCorpMerit("广东省", "深圳市", "", CorpCode, CorpName, MeritYear, MeritName, MeritDate, MeritLevel, MeritRegion, MeritSector, MeritPrjName, PrjSupporter, Source, url, Remark, Details, MeritType, PrjMgr, SupMgr, ManCost, ProArea, SupUnit, PileConsUnit, BuildingType);
                list.Add(info);
            }
        }
    }
}
/// <summary>
/// Reads the announcement table "topicChrList_20070702_table" from <c>SiteUrl</c>, opens the detail
/// document behind every linked row and builds a <c>BidInfo</c> record from the detail text.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as the list holds <c>MaxCount</c> records.
/// </param>
/// <returns>The records collected so far; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    string html = string.Empty;
    int pageInt = 1;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "statusBar")));
    if (pageNode != null && pageNode.Count > 0)
    {
        string pageos = pageNode.AsString().GetRegexBegEnd("找到", "条");
        try
        {
            pageInt = int.Parse(pageos.Replace(",", ""));
        }
        catch
        {
            // Keep the default of one pass when the number cannot be read.
        }
    }

    // NOTE(review): the loop bound is the number found between "找到" and "条", and the address
    // requested for every pass after the first is constant, so later passes appear to re-read
    // the same listing. Confirm the intended paging before relying on crawlAll.
    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://dp.szzfcg.cn/portal/topicView.do?method=view&siteId=10&id=2014", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "topicChrList_20070702_table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // The first three rows are skipped by the original loop start.
        for (int j = 3; j < table.RowCount; j++)
        {
            string endDate = string.Empty, otherType = string.Empty, prjMgr = string.Empty;

            TableRow tr = table.Rows[j];
            ATag aTag = tr.GetATag();
            if (aTag == null)
            {
                continue;
            }

            string beginDate = tr.ToString().GetDateRegex();
            string prjName = aTag.LinkText.ToNodeString();
            string itemName = aTag.Link.ToString().Replace("/viewer.do?id=", "");
            string InfoUrl = "http://dp.szzfcg.cn/portal/documentView.do?method=view&id=" + itemName;

            string htmlDtl = string.Empty;
            try
            {
                htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmlDtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "98%")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();

            // Collapse runs of blank lines, using the same pass counts and order as the original
            // chained Replace calls.
            string bidCtx = HtmlTxt.ToCtxString();
            bidCtx = ReplaceUpTo(bidCtx, "\r\n\t\r\n\t", "\r\n\t", 10);
            bidCtx = ReplaceUpTo(bidCtx, "\r\n\r\n", "\r\n", 11);
            bidCtx = ReplaceUpTo(bidCtx, "\r\n\t\r\n\t", "\r\n\t", 8);
            bidCtx = ReplaceUpTo(bidCtx, "\r\n\r\n", "\r\n", 7);

            string code = bidCtx.GetCodeRegex().GetCodeDel();
            string buildUnit = bidCtx.GetBuildRegex();
            string bidType = prjName.GetInviteBidType();
            string bidUnit = bidCtx.GetBidRegex();
            string bidMoney = bidCtx.GetMoneyRegex();

            if (string.IsNullOrEmpty(bidUnit))
            {
                // The text gave no winner: fall back to the tables inside the detail HTML.
                FillBidFromDetailTables(HtmlTxt, ref bidUnit, ref bidMoney);
            }

            if (bidUnit != null && bidUnit.Contains("没有"))
            {
                bidUnit = "没有中标商";
            }
            if (buildUnit != null && buildUnit.Contains("没有"))
            {
                buildUnit = "";
            }

            string specType = "建设工程";
            string msgType = "大鹏新区公共资源交易中心";
            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳政府采购", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}

/// <summary>
/// Applies <c>text.Replace(oldValue, newValue)</c> at most <paramref name="passes"/> times,
/// stopping early once a pass changes nothing.
/// </summary>
private static string ReplaceUpTo(string text, string oldValue, string newValue, int passes)
{
    for (int pass = 0; pass < passes; pass++)
    {
        string next = text.Replace(oldValue, newValue);
        if (next == text)
        {
            break;
        }
        text = next;
    }
    return text;
}

/// <summary>
/// Tries to read the winning bidder and amount from the tables of a detail page. Only pages whose
/// HTML matches 2, 3 or 5 table tags are handled; other pages leave both values untouched.
/// </summary>
private static void FillBidFromDetailTables(string detailHtml, ref string bidUnit, ref string bidMoney)
{
    Parser parser = new Parser(new Lexer(detailHtml));
    NodeList bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
    if (bidNode == null || bidNode.Count == 0)
    {
        return;
    }

    if (bidNode.Count == 2 || bidNode.Count == 3)
    {
        string unitCtx = string.Empty;
        // Use a dedicated local: reusing the caller's listing table here made the caller's row
        // loop continue over this detail table.
        TableTag unitTable = bidNode[bidNode.Count - 1] as TableTag;
        // Rows 3 and 4 are paired as "header:value", so both must exist.
        if (unitTable != null && unitTable.RowCount > 4)
        {
            for (int k = 0; k < unitTable.Rows[3].ColumnCount; k++)
            {
                unitCtx += unitTable.Rows[3].Columns[k].ToNodePlainString() + ":";
                unitCtx += unitTable.Rows[4].Columns[k].ToNodePlainString() + "\r\n";
            }
        }
        bidUnit = unitCtx.GetBidRegex();
        bidMoney = unitCtx.GetMoneyRegex();
    }

    if (bidNode.Count == 5)
    {
        string unitCtx = string.Empty;
        TableTag table1 = bidNode[3] as TableTag;
        TableTag table2 = bidNode[1] as TableTag;
        if (table1 != null && table1.RowCount > 1)
        {
            for (int k = 0; k < table1.Rows[0].ColumnCount; k++)
            {
                unitCtx += table1.Rows[0].Columns[k].ToNodePlainString() + ":";
                unitCtx += table1.Rows[1].Columns[k].ToNodePlainString() + "\r\n";
            }
        }

        bidUnit = unitCtx.GetRegex("中标(成交)供应商");
        if (string.IsNullOrEmpty(bidUnit))
        {
            // The original evaluated this expression but dropped the result.
            bidUnit = unitCtx.GetBidRegex();
        }
        if (string.IsNullOrEmpty(bidUnit))
        {
            bidUnit = unitCtx.GetRegex("供应商");
        }

        if (table2 != null && table2.RowCount > 1)
        {
            // Find the first row holding a cell equal to the bidder name; its last cell is the amount.
            bool isOk = false;
            for (int h = 0; h < table2.RowCount && !isOk; h++)
            {
                for (int k = 0; k < table2.Rows[h].ColumnCount; k++)
                {
                    if (bidUnit == table2.Rows[h].Columns[k].ToNodePlainString())
                    {
                        bidMoney = table2.Rows[h].Columns[table2.Rows[h].ColumnCount - 1].ToNodePlainString();
                        isOk = true;
                        break;
                    }
                }
            }
        }

        // When the amount table mentions "万" anywhere, the unit is appended before conversion.
        if (table2 != null && table2.ToHtml().Contains("万"))
        {
            bidMoney = (bidMoney + "万").GetMoney();
        }
        else
        {
            bidMoney = bidMoney.GetMoney();
        }
    }
}
/// <summary>
/// Walks the paged grid "MoreInfoListGG_DataGrid1" at <c>SiteUrl</c>, opens each linked detail page
/// and builds a <c>BidInfo</c> record. Bidder, amount, builder and code are taken from the detail
/// text first and, where still empty, from a series of "_Sheet1…" tables on the detail page.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as the list holds <c>MaxCount</c> records.
/// </param>
/// <returns>The records collected so far; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "MoreInfoListGG_Pager")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("页数:", "当前");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Keep the default of one page when the pager text cannot be read.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Post back to the pager with the view state of the page fetched last.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT" },
                new string[] { viewState, "MoreInfoListGG$Pager", i.ToString() });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoListGG_DataGrid1")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, endDate = string.Empty, otherType = string.Empty,
                   prjMgr = string.Empty, HtmlTxt = string.Empty, bidCtx = string.Empty;

            // The text between "【" and "】" of the link is passed on as the third argument of GenBidInfo.
            string xian = aTag.LinkText.GetRegexBegEnd("【", "】");
            string prjName = aTag.GetAttribute("title");
            string beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://www.zjbid.cn" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtnode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "infodetail")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtnode.AsHtml();
            bidCtx = HtmlTxt.GetReplace("</p>,</br>", "\r\n").GetReplace("<br />", "\r\n").ToCtxString();

            // Winner: plain-text patterns first.
            bidUnit = bidCtx.GetBidRegex();
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                bidUnit = bidCtx.GetRegex("中标人");
            }
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                bidUnit = bidCtx.GetRegex("预中标单位(第一名)");
            }

            // Fallback 1: table "_Sheet1" is flattened to "key:value" lines. It replaces HtmlTxt
            // and supplies code and builder. Failures are swallowed on purpose (best effort).
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                try
                {
                    parser = new Parser(new Lexer(htmldtl));
                    NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "_Sheet1")));
                    if (dtlNode != null && dtlNode.Count > 0)
                    {
                        TableTag dtlTable = dtlNode[0] as TableTag;
                        HtmlTxt = dtlTable.ToHtml();
                        string ctx = "";
                        for (int r = 1; r < dtlTable.RowCount; r++)
                        {
                            for (int c = 0; c < dtlTable.Rows[r].ColumnCount; c++)
                            {
                                string temp = dtlTable.Rows[r].Columns[c].ToHtml().GetReplace("<br>,<br/>", "\r\n").ToCtxString();
                                if (!temp.Contains("\r\n"))
                                {
                                    temp = dtlTable.Rows[r].Columns[c].ToNodePlainString();
                                }

                                if (!IsTable(dtlTable.Rows[r].ToHtml()))
                                {
                                    // Odd-positioned cells are keys, even-positioned cells are values.
                                    if ((c + 1) % 2 == 0)
                                    {
                                        ctx += temp + "\r\n";
                                    }
                                    else
                                    {
                                        ctx += temp.GetReplace(":,:") + ":";
                                    }
                                }
                                else
                                {
                                    ctx += GetTableBid(dtlTable.Rows[r].ToHtml());
                                }
                            }
                        }

                        ctx = ctx.GetReplace(":\r\n", ":");
                        code = ctx.GetCodeRegex();
                        if (string.IsNullOrWhiteSpace(code))
                        {
                            code = ctx.GetRegex("工程编码");
                        }
                        buildUnit = ctx.GetBuildRegex();
                        if (string.IsNullOrEmpty(buildUnit))
                        {
                            buildUnit = ctx.GetRegex("建设单位");
                        }
                        if (string.IsNullOrWhiteSpace(buildUnit))
                        {
                            buildUnit = ctx.GetRegex("采购人名称");
                        }
                    }
                }
                catch
                {
                }
            }

            // Fallback 2: table "_Sheet1_6_1" pairs row 1 (headers) with row 2 (values).
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                try
                {
                    parser = new Parser(new Lexer(htmldtl));
                    NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "_Sheet1_6_1")));
                    if (dtlNode != null && dtlNode.Count > 0)
                    {
                        TableTag dtlTable = dtlNode[0] as TableTag;
                        string bidCtxt = string.Empty;
                        for (int c = 0; c < dtlTable.Rows[0].ColumnCount; c++)
                        {
                            bidCtxt += dtlTable.Rows[1].Columns[c].ToNodePlainString() + ":";
                            bidCtxt += dtlTable.Rows[2].Columns[c].ToNodePlainString() + "\r\n";
                        }
                        bidCtxt = bidCtxt.GetReplace(":\r\n", ":");
                        bidCtxt = bidCtxt.Replace("%", "");

                        bidUnit = bidCtxt.GetBidRegex();
                        if (string.IsNullOrEmpty(bidUnit))
                        {
                            bidUnit = bidCtxt.GetRegex("拟中标单位");
                        }
                        bidMoney = bidCtxt.GetMoneyRegex();
                        if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                        {
                            bidMoney = bidCtxt.GetRegex("中标价:").GetMoney();
                        }
                        prjMgr = bidCtxt.GetMgrRegex();
                        if (string.IsNullOrWhiteSpace(prjMgr))
                        {
                            prjMgr = bidCtxt.GetRegex("项目经理");
                        }
                    }
                }
                catch
                {
                }
            }

            if (string.IsNullOrWhiteSpace(bidMoney))
            {
                bidMoney = bidCtx.GetMoneyRegex();
            }

            // Fallback 3: table "_Sheet1_13_0" of the downloaded page, for a missing amount.
            if (string.IsNullOrWhiteSpace(bidMoney))
            {
                try
                {
                    parser = new Parser(new Lexer(htmldtl));
                    NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "_Sheet1_13_0")));
                    if (dtlNode != null && dtlNode.Count > 0)
                    {
                        TableTag dtlTable = dtlNode[0] as TableTag;
                        string bidCtxt = string.Empty;
                        for (int c = 0; c < dtlTable.Rows[0].ColumnCount; c++)
                        {
                            bidCtxt += dtlTable.Rows[1].Columns[c].ToNodePlainString() + ":";
                            bidCtxt += dtlTable.Rows[2].Columns[c].ToNodePlainString() + "\r\n";
                        }
                        if (string.IsNullOrWhiteSpace(bidUnit))
                        {
                            bidUnit = bidCtxt.GetRegex("中标供应商");
                        }
                        if (string.IsNullOrWhiteSpace(bidMoney))
                        {
                            bidMoney = bidCtxt.GetRegex("价格(元)");
                        }
                    }
                }
                catch
                {
                }
            }

            // Fallback 4: the same table id, this time looked up inside the current HtmlTxt.
            if (string.IsNullOrEmpty(bidUnit))
            {
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList node = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "_Sheet1_13_0")));
                if (node != null && node.Count > 0)
                {
                    TableTag bidTable = node[0] as TableTag;
                    string ctx = string.Empty;
                    if (bidTable != null && bidTable.RowCount >= 3)
                    {
                        for (int r = 0; r < bidTable.Rows[1].ColumnCount; r++)
                        {
                            try
                            {
                                ctx += bidTable.Rows[1].Columns[r].ToNodePlainString() + ":";
                                ctx += bidTable.Rows[2].Columns[r].ToNodePlainString() + "\r\n";
                            }
                            catch
                            {
                                // Row 2 may have fewer cells than row 1; keep what was read.
                            }
                        }
                        bidUnit = ctx.GetBidRegex();
                        bidMoney = ctx.GetMoneyRegex(null, false, "万元");
                    }
                }
            }

            if (string.IsNullOrWhiteSpace(buildUnit))
            {
                buildUnit = bidCtx.GetBuildRegex();
            }
            if (string.IsNullOrWhiteSpace(buildUnit))
            {
                buildUnit = bidCtx.GetRegex("招标人");
            }
            if (string.IsNullOrWhiteSpace(buildUnit))
            {
                buildUnit = bidCtx.GetRegex("采购人名称");
            }

            if (string.IsNullOrWhiteSpace(code))
            {
                code = bidCtx.GetCodeRegex().GetCodeDel();
            }
            // A code is only kept when it ends with "号".
            if (!string.IsNullOrWhiteSpace(code) && code[code.Length - 1] != '号')
            {
                code = "";
            }
            if (string.IsNullOrWhiteSpace(code))
            {
                code = bidCtx.GetRegex("采购项目编号");
            }

            // Cut the bidder after the first "公司" and the builder before "开标".
            if (bidUnit != null && bidUnit.Contains("公司"))
            {
                bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
            }
            if (buildUnit != null && buildUnit.Contains("开标"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("开标"));
            }

            // Amounts above 100000 are divided by 10000; text that is not a number is left as is.
            decimal amount;
            if (decimal.TryParse(bidMoney, out amount) && amount > 100000)
            {
                bidMoney = (amount / 10000).ToString();
            }

            string msgType = "浙江省招标投标办公室";
            string specType = "建设工程";
            string bidType = ToolHtml.GetInviteTypes(prjName);
            BidInfo info = ToolDb.GenBidInfo("浙江省", "浙江省及地市", xian, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Walks the paged "list" table at <c>SiteUrl</c>, opens each linked detail page and builds an
/// <c>InviteInfo</c> record from its "tabcon" table and, when present, from the document loaded
/// through the "icontent" iframe.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as the list holds <c>MaxCount</c> records.
/// </param>
/// <returns>The records collected so far; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html = string.Empty;
    string cookiestr = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        return list;
    }

    try
    {
        string temp = html.ToCtxString().GetRegexBegEnd("第1/", "页");
        pageInt = int.Parse(temp);
    }
    catch
    {
        // Keep the default of one page when the pager text cannot be read.
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "pageno", "mode", "linkname" },
                new string[] { i.ToString(), "query", "currinfo" });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        Parser parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "list")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        // The last matching table is the one that is read.
        TableTag table = listNode[listNode.Count - 1] as TableTag;
        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            ATag aTag = tr.Columns[0].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string code = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty,
                   endDate = string.Empty, remark = string.Empty, otherType = string.Empty,
                   HtmlTxt = string.Empty;

            string prjName = aTag.GetAttribute("title");
            string InfoUrl = "http://www.nxzb.com.cn/" + aTag.Link;
            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            // Header cells are turned into ordinary cells so the key/value walk below sees them.
            // Only the tag openers are rewritten; replacing every "th" also mangled attributes
            // and text such as "width" or "http".
            string normalizedDtl = htmldtl.ToLower().Replace("</th", "</td").Replace("<th", "<td");
            parser = new Parser(new Lexer(normalizedDtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "tabcon")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            TableTag dtlTable = dtlNode[0] as TableTag;
            HtmlTxt = dtlTable.ToHtml();
            for (int r = 0; r < dtlTable.RowCount; r++)
            {
                for (int c = 0; c < dtlTable.Rows[r].ColumnCount; c++)
                {
                    // Odd-positioned cells are keys, even-positioned cells are values.
                    if ((c + 1) % 2 == 0)
                    {
                        inviteCtx += dtlTable.Rows[r].Columns[c].ToNodePlainString() + "\r\n";
                    }
                    else
                    {
                        inviteCtx += dtlTable.Rows[r].Columns[c].ToNodePlainString().GetReplace(":,:") + ":";
                    }
                }
            }

            string beginDate = inviteCtx.GetRegex("发布日期,发布时间").GetDateRegex();
            string buildUnit = inviteCtx.GetBuildRegex();

            parser = new Parser(new Lexer(htmldtl));
            NodeList iframeNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("iframe"), new HasAttributeFilter("id", "icontent")));
            if (iframeNode != null && iframeNode.Count > 0)
            {
                // From here on the record points at the framed document.
                InfoUrl = (iframeNode[0] as IFrameTag).FrameLocation;
                try
                {
                    htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
                }
                catch
                {
                    // NOTE(review): on failure htmldtl still holds the outer detail page, so the
                    // body parsed below is that page's body. Confirm this fallback is intended.
                }

                parser = new Parser(new Lexer(htmldtl));
                NodeList htmlDtlNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
                if (htmlDtlNode != null && htmlDtlNode.Count > 0)
                {
                    HtmlTxt = htmlDtlNode.AsHtml();
                    inviteCtx = HtmlTxt.ToLower().GetReplace("<br/>,<br>", "\r\n").ToCtxString();
                    prjAddress = inviteCtx.GetAddressRegex();
                    code = inviteCtx.GetCodeRegex().GetCodeDel();
                }
            }

            string msgType = "宁夏建设工程招标投标管理中心";
            string inviteType = "建设工程";
            string specType = inviteType;
            InviteInfo info = ToolDb.GenInviteInfo("宁夏回族自治区", "宁夏回族自治区及地市", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Walks the paged grid "DataGrid1" at <c>SiteUrl</c>, opens each linked detail page and converts
/// its "hei_text" table into an <c>InviteInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as the list holds <c>MaxCount</c> records.
/// </param>
/// <returns>The records collected so far; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList results = new List<InviteInfo>();
    string cookies = string.Empty;
    string listHtml = string.Empty;
    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookies);
    }
    catch
    {
        return results;
    }

    // Total page count comes from the span "lblPageCount"; it stays 1 when absent or unreadable.
    int totalPages = 1;
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "lblPageCount")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        try
        {
            totalPages = int.Parse(pagerNodes[0].ToNodePlainString());
        }
        catch
        {
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            // Post the form back with the state of the page fetched last and the coordinates of
            // "ImageButton3" (presumably the next-page button — verify against the site).
            string viewState = this.ToolWebSite.GetAspNetViewState(listHtml);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(listHtml);
            string[] fieldNames =
            {
                "__EVENTTARGET", "__EVENTARGUMENT", "__LASTFOCUS", "__VIEWSTATE", "__VIEWSTATEGENERATOR",
                "__EVENTVALIDATION", "gcbh_Text_Box", "gcmc_TextBox", "num_TextBox", "ImageButton3.x", "ImageButton3.y"
            };
            string[] fieldValues =
            {
                "", "", "", viewState, "B0108473",
                eventValidation, "", "", "", "5", "12"
            };
            NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, form, Encoding.Default, ref cookies);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        NodeList gridNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "DataGrid1")));
        if (gridNodes == null || gridNodes.Count == 0)
        {
            continue;
        }

        TableTag grid = gridNodes[0] as TableTag;
        for (int rowIndex = 0; rowIndex < grid.RowCount; rowIndex++)
        {
            TableRow row = grid.Rows[rowIndex];
            ATag link = row.Columns[1].GetATag();
            if (link == null)
            {
                continue;
            }

            string remark = string.Empty;
            string otherType = string.Empty;

            string code = row.Columns[0].ToNodePlainString();
            string prjName = link.LinkText.GetReplace(" ");
            string endDate = row.Columns[2].ToPlainTextString().GetDateRegex();
            string infoUrl = "http://www.bcactc.com/home/gcxx/" + link.Link;

            string detailHtml = string.Empty;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList detailNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "hei_text")));
            if (detailNodes == null || detailNodes.Count == 0)
            {
                continue;
            }

            TableTag detailTable = detailNodes[0] as TableTag;
            string htmlTxt = detailTable.ToHtml();

            // Flatten the table into "key:value" lines: cells at even indexes are keys, cells at
            // odd indexes are values.
            StringBuilder text = new StringBuilder();
            for (int r = 0; r < detailTable.RowCount; r++)
            {
                for (int c = 0; c < detailTable.Rows[r].ColumnCount; c++)
                {
                    bool isValueCell = c % 2 == 1;
                    if (isValueCell)
                    {
                        text.Append(detailTable.Rows[r].Columns[c].ToNodePlainString()).Append("\r\n");
                    }
                    else
                    {
                        text.Append(detailTable.Rows[r].Columns[c].ToNodePlainString().GetReplace(":,:")).Append(":");
                    }
                }
            }
            string inviteCtx = text.ToString();

            string prjAddress = inviteCtx.GetAddressRegex().GetCodeDel();
            string buildUnit = inviteCtx.GetBuildRegex();
            string beginDate = inviteCtx.GetRegex("登记日期,登记时间").GetDateRegex();

            const string msgType = "北京市建设工程发包承包交易中心";
            const string specType = "建设工程";
            const string inviteType = "园林";

            InviteInfo info = ToolDb.GenInviteInfo("北京市", "北京市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, infoUrl, htmlTxt);
            results.Add(info);

            bool limitReached = !crawlAll && results.Count >= this.MaxCount;
            if (limitReached)
            {
                return results;
            }
        }
    }

    return results;
}
/// <summary>
/// Crawls the paged notice list at <c>SiteUrl</c> (pages addressed with <c>?page=N</c>), downloads
/// each notice under http://www.hnsztb.com.cn/gsgg/, saves it as a <c>NotifyInfo</c> through
/// <c>ToolDb.SaveEntity</c> and, when the save succeeds, saves the attachments linked from the
/// notice body.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops once <c>MaxCount</c> notices have been processed;
/// when <c>true</c>, all pages are walked.
/// </param>
/// <returns>
/// The list created at the start of the method. Records are persisted directly and never added to
/// it, so it is always empty; it is never <c>null</c>.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll) {
    IList list = new List<NotifyInfo>();
    int pageInt = 1, sqlCount = 0;
    string html = string.Empty;
    try {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    } catch {
        return list;
    }

    // The pager table (class "style1") contains text of the form ".../<total>页".
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "style1")));
    if (pageNode != null && pageNode.Count > 0) {
        try {
            string temp = pageNode[0].ToNodePlainString().GetRegexBegEnd("/", "页");
            pageInt = int.Parse(temp);
        } catch {
            // Best effort: keep the default of one page.
        }
    }

    for (int i = 1; i <= pageInt; i++) {
        if (i > 1) {
            try {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "?page=" + i, Encoding.Default);
            } catch {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "99%")), true), new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%"))));
        if (listNode == null || listNode.Count == 0) {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++) {
            // Each matched table is one list entry; its first row carries the link.
            TableTag entryTable = listNode[j] as TableTag;
            if (entryTable == null || entryTable.RowCount == 0) {
                continue;
            }
            ATag aTag = entryTable.Rows[0].GetATag();
            if (aTag == null) {
                continue;
            }

            string headName = aTag.LinkText;
            // Titles longer than 200 bytes in the default encoding are cut to 99 characters.
            // The length check keeps Substring from throwing when the encoding uses more
            // than two bytes per character.
            if (Encoding.Default.GetByteCount(headName) > 200 && headName.Length > 99) {
                headName = headName.Substring(0, 99);
            }
            string infoUrl = "http://www.hnsztb.com.cn/gsgg/" + aTag.Link;

            string htmldtl = string.Empty;
            try {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            } catch {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "800")));
            if (dtlNode == null || dtlNode.Count == 0) {
                continue;
            }
            TableTag table = dtlNode[0] as TableTag;
            if (table == null) {
                continue;
            }

            // Use the second row as the notice body when the table has one, else the whole table.
            string ctxHtml = table.RowCount > 1 ? table.Rows[1].ToHtml() : table.ToHtml();
            string infoCtx = ctxHtml.ToCtxString();

            // Try progressively looser date formats until one matches.
            string releaseTime = infoCtx.GetDateRegex();
            if (string.IsNullOrEmpty(releaseTime)) {
                releaseTime = infoCtx.GetDateRegex("yyyy年MM月dd日");
            }
            if (string.IsNullOrEmpty(releaseTime)) {
                releaseTime = infoCtx.GetDateRegex("yyyy/MM/dd");
            }
            if (string.IsNullOrEmpty(releaseTime)) {
                releaseTime = infoCtx.GetChinaTime();
            }

            string infoScorce = string.Empty;
            string msgType = "河南省建设工程招标投标协会";
            string infoType = "通知公告";
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "河南省", "河南省及地市", string.Empty, infoCtx, infoType);
            sqlCount++;

            if (ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate)) {
                parser = new Parser(new Lexer(ctxHtml));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0) {
                    for (int k = 0; k < aNode.Count; k++) {
                        ATag a = aNode[k].GetATag();
                        if (a == null || !a.IsAtagAttach()) {
                            continue;
                        }
                        // Absolute links are used as they are; relative ones are resolved
                        // against the site root. The attachment's own href is used in both cases.
                        string link = a.Link.ToLower().Contains("http")
                            ? a.Link
                            : "http://www.hnsztb.com.cn/" + a.Link;
                        try {
                            BaseAttach entity = ToolHtml.GetBaseAttach(link, a.LinkText, info.Id);
                            if (entity == null) {
                                entity = ToolHtml.GetBaseAttachByUrl(link, a.LinkText, info.Id);
                            }
                            if (entity != null) {
                                ToolDb.SaveEntity(entity, string.Empty);
                            }
                        } catch {
                            // Best effort: a failed attachment must not abort the crawl.
                        }
                    }
                }
            }

            // The limit applies only to incremental runs and is checked after the current
            // notice has been handled, so exactly MaxCount notices are processed.
            if (!crawlAll && sqlCount >= this.MaxCount) {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the paged award-notice list (first page from <c>SiteUrl</c>, later pages from
/// http://61.144.240.26:58080/news/publicnews/12131?pageIndex=N), downloads every notice and
/// builds a <c>BidInfo</c> record from its text and result tables.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the method returns as soon as <c>MaxCount</c> records have been collected;
/// when <c>true</c>, all pages are walked.
/// </param>
/// <returns>The collected records; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll) {
    IList list = new List<BidInfo>();
    string html = string.Empty;
    int pageInt = 1;
    try {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    } catch {
        return list;
    }

    // The pager <div class="paging"> contains text of the form ".../<total>页".
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "paging")));
    if (pageNode != null && pageNode.Count > 0) {
        string pageos = pageNode.AsString().GetRegexBegEnd("/", "页");
        try {
            pageInt = int.Parse(pageos);
        } catch {
            // Best effort: keep the default of one page.
        }
    }

    for (int i = 1; i <= pageInt; i++) {
        if (i > 1) {
            try {
                html = this.ToolWebSite.GetHtmlByUrl("http://61.144.240.26:58080/news/publicnews/12131?pageIndex=" + i, Encoding.UTF8);
            } catch {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("id", "newsList")), true), new TagNameFilter("li")));
        if (nodeList == null || nodeList.Count == 0) {
            continue;
        }

        for (int j = 0; j < nodeList.Count; j++) {
            ATag aTag = nodeList[j].GetATag();
            if (aTag == null) {
                continue;
            }

            string code = string.Empty;
            string prjName = aTag.LinkText.GetReplace("· ", "");
            if (prjName.Contains("FTCG")) {
                // Titles containing "FTCG" yield the code as the text before the first "-".
                int dashIndex = prjName.IndexOf("-");
                if (dashIndex >= 0) {
                    code = prjName.Remove(dashIndex);
                }
            }
            string infoUrl = "http://61.144.240.26:58080" + aTag.Link;
            string beginDate = nodeList[j].ToPlainTextString().GetDateRegex();

            string htmlDtl = string.Empty;
            try {
                htmlDtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            } catch {
                continue;
            }

            parser = new Parser(new Lexer(htmlDtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "98%")));
            if (dtlNode == null || dtlNode.Count == 0) {
                continue;
            }

            string htmlTxt = dtlNode.AsHtml();

            // Collapse runs of empty lines ("\r\n\t" repeats, then "\r\n" repeats). The second
            // round handles tab-indented repeats that only become adjacent after the first
            // round removed the blank lines between them.
            string bidCtx = htmlTxt.ToCtxString();
            for (int round = 0; round < 2; round++) {
                while (bidCtx.Contains("\r\n\t\r\n\t")) {
                    bidCtx = bidCtx.Replace("\r\n\t\r\n\t", "\r\n\t");
                }
                while (bidCtx.Contains("\r\n\r\n")) {
                    bidCtx = bidCtx.Replace("\r\n\r\n", "\r\n");
                }
            }

            if (string.IsNullOrWhiteSpace(code)) {
                code = bidCtx.GetCodeRegex().GetCodeDel();
            }
            string buildUnit = bidCtx.GetBuildRegex();
            string bidType = prjName.GetInviteBidType();
            string bidUnit = bidCtx.GetBidRegex();
            string bidMoney = bidCtx.GetMoneyRegex();

            if (string.IsNullOrEmpty(bidUnit)) {
                // The winner was not found in the running text; look at the tables of the notice.
                parser = new Parser(new Lexer(htmlTxt));
                NodeList bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (bidNode != null && bidNode.Count > 0) {
                    if (bidNode.Count == 2 || bidNode.Count == 3) {
                        // Second table: row 0 is read as headers, row 1 as the matching values.
                        TableTag table = bidNode[1] as TableTag;
                        if (table != null && table.RowCount > 0) {
                            string ctx = string.Empty;
                            for (int r = 0; r < table.Rows[0].ColumnCount; r++) {
                                try {
                                    ctx += table.Rows[0].Columns[r].ToNodePlainString() + ":";
                                    ctx += table.Rows[1].Columns[r].ToNodePlainString() + "\r\n";
                                } catch {
                                    // A missing value row/cell leaves the header without a value.
                                }
                            }
                            bidUnit = ctx.GetBidRegex();
                            bidMoney = ctx.GetMoneyRegex();
                        }
                    }
                    if (bidNode.Count == 6) {
                        string unitCtx = string.Empty;
                        TableTag table1 = bidNode[3] as TableTag;
                        TableTag table2 = bidNode[1] as TableTag;
                        if (table1.RowCount > 1) {
                            for (int k = 0; k < table1.Rows[0].ColumnCount; k++) {
                                unitCtx += table1.Rows[0].Columns[k].ToNodePlainString() + ":";
                                unitCtx += table1.Rows[1].Columns[k].ToNodePlainString() + "\r\n";
                            }
                        }

                        // Try the specific label first, then the generic winner regex, then "供应商".
                        bidUnit = unitCtx.GetRegex("中标(成交)供应商");
                        if (string.IsNullOrEmpty(bidUnit)) {
                            bidUnit = unitCtx.GetBidRegex();
                        }
                        if (string.IsNullOrEmpty(bidUnit)) {
                            bidUnit = unitCtx.GetRegex("供应商");
                        }

                        // The amount is the last cell of the first row of table2 that has a
                        // cell equal to the winner's name.
                        if (table2.RowCount > 1) {
                            bool found = false;
                            for (int h = 0; h < table2.RowCount && !found; h++) {
                                TableRow moneyRow = table2.Rows[h];
                                for (int k = 0; k < moneyRow.ColumnCount; k++) {
                                    if (bidUnit == moneyRow.Columns[k].ToNodePlainString()) {
                                        bidMoney = moneyRow.Columns[moneyRow.ColumnCount - 1].ToNodePlainString();
                                        found = true;
                                        break;
                                    }
                                }
                            }
                        }
                        if (table2.ToHtml().Contains("万")) {
                            bidMoney = (bidMoney + "万").GetMoney();
                        } else {
                            bidMoney = bidMoney.GetMoney();
                        }
                    }
                }
            }

            if (string.IsNullOrWhiteSpace(code)) {
                // Last resort: the first <span id="holder"> whose text contains "FTCG".
                parser = new Parser(new Lexer(htmlDtl));
                NodeList codeNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "holder")));
                if (codeNode != null) {
                    for (int r = 0; r < codeNode.Count; r++) {
                        string holderText = codeNode[r].ToNodePlainString();
                        if (holderText.Contains("FTCG")) {
                            code = holderText;
                            break;
                        }
                    }
                }
            } else {
                // NOTE(review): this replaces a code already taken from the title (or already
                // cleaned with GetCodeDel) by the raw regex result — confirm this is intended.
                code = bidCtx.GetCodeRegex();
            }

            if (code.Contains(")")) {
                code = code.GetReplace(")", "");
            }
            if (bidUnit.Contains("没有")) {
                bidUnit = "没有中标商";
            }
            if (buildUnit.Contains("没有")) {
                buildUnit = "";
            }
            if (code.Length > 50) {
                code = "";
            }

            string specType = "政府采购";
            string msgType = "深圳政府采购";
            string endDate = string.Empty;
            string otherType = string.Empty;
            string prjMgr = string.Empty;
            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳政府采购", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, infoUrl, prjMgr, htmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount) {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the paged award list at <c>SiteUrl</c> (an ASP.NET page that is advanced by form
/// post-back), opens each row's detail page under http://www.bcactc.com/home/gcxx/, builds a
/// <c>BidInfo</c> record from it and queues the attachments linked from the detail table in
/// <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the method returns as soon as <c>MaxCount</c> records have been collected;
/// when <c>true</c>, every page reported by the <c>lblPageCount</c> span is walked.
/// </param>
/// <returns>The collected records; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll) {
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html = string.Empty;
    string cookiestr = string.Empty;
    try {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookiestr);
    } catch {
        return list;
    }

    // Total page count is read from <span id="lblPageCount">; on any failure a single page is assumed.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "lblPageCount")));
    if (pageNode != null && pageNode.Count > 0) {
        try {
            string temp = pageNode[0].ToNodePlainString();
            pageInt = int.Parse(temp);
        } catch {
            // Best effort: keep the default of one page.
        }
    }

    for (int i = 1; i <= pageInt; i++) {
        if (i > 1) {
            // Post back the view state of the page currently held in `html`, with the
            // ImageButton3 click coordinates, to obtain the following page.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__LASTFOCUS", "__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION", "gcbh_Text_Box", "gcmc_TextBox", "num_TextBox", "ImageButton3.x", "ImageButton3.y" },
                new string[] { "", "", "", viewState, "B0108473", eventValidation, "", "", "", "5", "12" });
            try {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr);
            } catch {
                // `html` still holds the previous page, so the next iteration re-posts from it.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "DataGrid1")));
        if (listNode == null || listNode.Count == 0) {
            continue;
        }
        TableTag table = listNode[0] as TableTag;
        if (table == null) {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++) {
            TableRow tr = table.Rows[j];
            // Columns 0..2 are read below; rows with fewer cells cannot be data rows and
            // would otherwise raise IndexOutOfRangeException.
            if (tr.ColumnCount < 3) {
                continue;
            }
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null) {
                continue;
            }

            string code = tr.Columns[0].ToNodePlainString();
            string prjName = aTag.LinkText.GetReplace(" ");
            string beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string infoUrl = "http://www.bcactc.com/home/gcxx/" + aTag.Link;

            string htmldtl = string.Empty;
            try {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            } catch {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "hei_text")));
            if (dtlNode == null || dtlNode.Count == 0) {
                continue;
            }
            TableTag dtlTable = dtlNode[0] as TableTag;
            if (dtlTable == null) {
                continue;
            }
            string htmlTxt = dtlTable.ToHtml();

            // Flatten the detail table into "label:value" lines.
            StringBuilder ctxBuilder = new StringBuilder();
            for (int r = 0; r < dtlTable.RowCount; r++) {
                TableRow detailRow = dtlTable.Rows[r];
                string rowHtml = detailRow.ToHtml();
                // The row-level check does not depend on the column, so it is evaluated once per row.
                bool rowIsTable = IsTable(rowHtml);
                for (int c = 0; c < detailRow.ColumnCount; c++) {
                    if (rowIsTable) {
                        // NOTE(review): GetTableBid(rowHtml) is appended once per column of such
                        // a row — confirm whether once per row was intended.
                        ctxBuilder.Append(GetTableBid(rowHtml));
                        continue;
                    }

                    // Keep explicit line breaks of a cell when it has any; otherwise use its plain text.
                    string cellText = detailRow.Columns[c].ToHtml().GetReplace("<br>,<br/>", "\r\n").ToCtxString();
                    if (!cellText.Contains("\r\n")) {
                        cellText = detailRow.Columns[c].ToNodePlainString();
                    }
                    // Cells at odd positions are emitted as labels, cells at even positions as values.
                    if ((c + 1) % 2 == 0) {
                        ctxBuilder.Append(cellText).Append("\r\n");
                    } else {
                        ctxBuilder.Append(cellText.GetReplace(":,:")).Append(":");
                    }
                }
            }
            string bidCtx = ctxBuilder.ToString().GetReplace(":\r\n", ":");

            // Prefer the code found in the detail text; keep the list-row code when none is found.
            string detailCode = bidCtx.GetCodeRegex();
            if (!string.IsNullOrEmpty(detailCode)) {
                code = detailCode;
            }

            string buildUnit = bidCtx.GetBuildRegex();
            if (string.IsNullOrEmpty(buildUnit)) {
                buildUnit = bidCtx.GetRegex("建设单位名称");
            }
            string bidUnit = bidCtx.GetBidRegex();
            if (string.IsNullOrEmpty(bidUnit)) {
                bidUnit = bidCtx.GetRegex("中标侯选人");
            }
            string bidMoney = bidCtx.GetMoneyRegex();
            if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney)) {
                bidMoney = bidCtx.GetRegex("中标合同额").GetMoney();
            }
            string prjMgr = bidCtx.GetMgrRegex();
            // A "winner" that still contains "中标" is discarded.
            if (bidUnit.Contains("中标")) {
                bidUnit = "";
            }

            string msgType = "北京市建设工程发包承包交易中心";
            string specType = "建设工程";
            string bidType = "园林";
            string endDate = string.Empty;
            string otherType = string.Empty;
            BidInfo info = ToolDb.GenBidInfo("北京市", "北京市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, infoUrl, prjMgr, htmlTxt);
            list.Add(info);

            // Queue every attachment-style link found in the detail table.
            parser = new Parser(new Lexer(htmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0) {
                for (int k = 0; k < aNode.Count; k++) {
                    ATag a = aNode[k] as ATag;
                    if (a == null || !a.IsAtagAttach()) {
                        continue;
                    }
                    // Absolute links are used as they are; relative ones are resolved against the site root.
                    string link = a.Link.ToLower().Contains("http")
                        ? a.Link
                        : "http://www.bcactc.com/" + a.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount) {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the paged tender list at <c>SiteUrl</c> (an ASP.NET page that is advanced by form
/// post-back), opens each row's detail page at
/// https://www.ciac.sh.cn/NetInterBidweb/GKTB/DefaultV2011.aspx and turns it into an
/// <c>InviteInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the method returns as soon as <c>MaxCount</c> records have been collected;
/// when <c>true</c>, every page listed in the <c>DropDownList_page</c> select is walked.
/// </param>
/// <returns>The collected records; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll) {
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html = string.Empty;
    string cookiestr = string.Empty;
    try {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr);
    } catch {
        return list;
    }

    // The page count is the value of the last <option> of the page selector.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("id", "DropDownList_page")));
    if (pageNode != null && pageNode.Count > 0) {
        try {
            SelectTag tag = pageNode[0] as SelectTag;
            string temp = tag.OptionTags[tag.OptionTags.Length - 1].Value;
            pageInt = int.Parse(temp);
        } catch {
            // Best effort: keep the default of one page.
        }
    }

    for (int i = 1; i <= pageInt; i++) {
        if (i > 1) {
            // Post back the view state of the page currently held in `html` together with
            // the "next page" button value.
            // NOTE(review): DropDownList_page and hdInputNum are always posted as "1" — confirm
            // the server advances from the view state rather than from these fields.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "__EVENTVALIDATION", "dr_gglb", "txt_beginTime", "txt_endTime", "nextPages", "DropDownList_page", "hdInputNum", "hdPageCount", "hdState" },
                new string[] { "", "", viewState, eventValidation, "0", "", "", "下一页", "1", "1", pageInt.ToString(), "" });
            try {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
            } catch {
                // `html` still holds the previous page, so the next iteration re-posts from it.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("style", "text-align: center;")));
        if (listNode == null || listNode.Count == 0) {
            continue;
        }
        TableTag table = listNode[0] as TableTag;
        if (table == null) {
            continue;
        }

        // Row 0 is skipped; data rows start at index 1.
        for (int j = 1; j < table.RowCount; j++) {
            TableRow tr = table.Rows[j];
            // Columns 1 and 2 are read below; shorter rows would raise IndexOutOfRangeException.
            if (tr.ColumnCount < 3) {
                continue;
            }
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null) {
                continue;
            }

            // The detail id is the first single-quoted value inside the link's onclick handler.
            string onclick = aTag.GetAttribute("onclick");
            if (string.IsNullOrEmpty(onclick)) {
                continue;
            }

            string prjName = aTag.LinkText.ToNodeString().GetReplace(" ");
            string beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex("yyyy/MM/dd");
            string infoUrl = "https://www.ciac.sh.cn/NetInterBidweb/GKTB/DefaultV2011.aspx?gkzbxh=" + onclick.GetRegexBegEnd("'", "'");

            string htmldtl = string.Empty;
            try {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
            } catch {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table_css")));
            if (dtlNode == null || dtlNode.Count == 0) {
                continue;
            }
            TableTag dtlTable = dtlNode[0] as TableTag;
            if (dtlTable == null) {
                continue;
            }
            string htmlTxt = dtlTable.ToHtml();

            // Flatten the detail table into "label:value" lines: cells at odd positions
            // (1st, 3rd, ...) are emitted as labels, cells at even positions as values.
            StringBuilder ctxBuilder = new StringBuilder();
            for (int r = 0; r < dtlTable.RowCount; r++) {
                TableRow detailRow = dtlTable.Rows[r];
                for (int c = 0; c < detailRow.ColumnCount; c++) {
                    string cellText = detailRow.Columns[c].ToNodePlainString();
                    if ((c + 1) % 2 == 0) {
                        ctxBuilder.Append(cellText).Append("\r\n");
                    } else {
                        ctxBuilder.Append(cellText.GetReplace(":,:")).Append(":");
                    }
                }
            }
            string inviteCtx = ctxBuilder.ToString();

            string prjAddress = inviteCtx.GetAddressRegex().GetCodeDel();
            string buildUnit = inviteCtx.GetBuildRegex();
            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            string msgType = "上海市建筑业管理办公室";
            string inviteType = "建设工程";
            string specType = inviteType;
            string endDate = string.Empty;
            string remark = string.Empty;
            string otherType = string.Empty;

            InviteInfo info = ToolDb.GenInviteInfo("上海市", "上海市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, infoUrl, htmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount) {
                return list;
            }
        }
    }
    return list;
}