/// <summary>
/// Downloads the list page at <c>SiteUrl</c>, reads the page count from the ">>" pager link,
/// then POSTs the paging form once per page and hands each response to <c>DealHtml</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>list.Count</c> reaches <c>MaxCount</c>.</param>
/// <returns>The collected items; empty when the landing page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    // Determine the page count.
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "scott"))), new HasChildFilter(new TagNameFilter("a")))).SearchFor(typeof(ATag), true);
    for (int i = 0; i < sNodes.Count; i++)
    {
        ATag aTag = sNodes[i] as ATag;
        if (aTag == null)
        {
            continue;
        }

        if (aTag.ToPlainTextString().Contains(">>"))
        {
            // The ">>" link is expected to look like "gopage(N)". A link in any other
            // shape leaves the page count untouched instead of aborting the crawl.
            int lastPage;
            if (int.TryParse(aTag.Link.ToLower().Replace("gopage(", "").Replace(")", ""), out lastPage))
            {
                pageInt = lastPage;
            }
        }
    }

    // Process every result page. The loop also runs when only one page exists;
    // previously it was skipped unless pageInt > 1, so single-page sites produced no items.
    string cookiestr = string.Empty;
    for (int i = 1; i <= pageInt; i++)
    {
        NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
            new string[] { "boardId", "eTime", "newstitle", "pageNO", "sTime", "totalRows", "typeId" },
            new string[] { "000000000201", string.Empty, string.Empty, i.ToString(), string.Empty, "0", "000000000002" });
        try
        {
            html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default, ref cookiestr);
            DealHtml(list, html, crawlAll);
        }
        catch (Exception)
        {
            // Best effort: a page that fails to download or parse is skipped.
            continue;
        }

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return list;
        }
    }

    return list;
}
/// <summary>
/// Parses the table with id <c>ctl00_Content_GridView1</c> from a list page, downloads the detail
/// page linked from each row, and appends one notice per row to <paramref name="list"/>.
/// Attachment links found in the detail page's <c>trFujian</c> row are added to <c>AttachList</c>.
/// </summary>
/// <param name="list">Receives the generated notice objects.</param>
/// <param name="html">HTML of the list page.</param>
/// <param name="crawlAll">When false, processing stops once <c>list.Count</c> reaches <c>MaxCount</c>.</param>
public void DealHtml(IList list, string html, bool crawlAll)
{
    Parser parserDtl = new Parser(new Lexer(html));
    NodeList aNodes = parserDtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_Content_GridView1")));
    if (aNodes == null || aNodes.Count == 0)
    {
        return;
    }

    TableTag table = aNodes[0] as TableTag;
    if (table == null)
    {
        return;
    }

    // The first and the last row are not data rows (presumably header and pager — confirm against the site).
    for (int t = 1; t < table.RowCount - 1; t++)
    {
        string InfoTitle = string.Empty, InfoType = string.Empty, PublistTime = string.Empty, InfoCtx = string.Empty, InfoUrl = string.Empty, htmlTxt = string.Empty, prjCode = string.Empty;

        TableRow tr = table.Rows[t] as TableRow;

        // A row without a link cannot be followed; skip it instead of failing the whole page.
        NodeList rowLinks = tr.SearchFor(typeof(ATag), true);
        if (rowLinks == null || rowLinks.Count == 0)
        {
            continue;
        }
        ATag aTag = rowLinks[0] as ATag;
        if (aTag == null)
        {
            continue;
        }

        InfoUrl = "http://www.szjsjy.com.cn/BusinessInfo/" + aTag.Link;
        prjCode = tr.Columns[1].ToNodePlainString().Replace(" ", "");
        InfoTitle = tr.Columns[2].ToPlainTextString();
        PublistTime = tr.Columns[5].ToPlainTextString();
        InfoType = "标底公示";

        string htmlDtl = string.Empty;
        try
        {
            htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace("<br>", "\r\n").Replace(" ", "");
        }
        catch (Exception)
        {
            // Best effort: rows whose detail page cannot be downloaded are skipped.
            continue;
        }

        Parser parserCtx = new Parser(new Lexer(htmlDtl));
        NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "lblXXNR")));
        htmlTxt = ctxNode.AsHtml();
        InfoCtx = ctxNode.AsString().Replace(" ", "");

        // Rewind so the same parser can be used for the attachment row below.
        parserCtx.Reset();

        NoticeInfo info = ToolDb.GenNoticeInfo("广东省", "深圳市工程", string.Empty, string.Empty, InfoTitle, InfoType, InfoCtx, PublistTime, string.Empty, "深圳市建设工程交易中心", InfoUrl, prjCode, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, htmlTxt);
        list.Add(info);

        ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("id", "trFujian")));
        NodeList aTagNodes = ctxNode.SearchFor(typeof(ATag), true);
        for (int a = 0; a < aTagNodes.Count; a++)
        {
            ATag fileTage = aTagNodes[a] as ATag;
            if (fileTage == null)
            {
                continue;
            }

            string downloadURL = "http://www.szjsjy.com.cn/" + fileTage.Link.Replace("../", "");
            BaseAttach attach = ToolDb.GenBaseAttach(fileTage.ToPlainTextString(), info.Id, downloadURL);
            base.AttachList.Add(attach);
        }

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return;
        }
    }
}
/// <summary>
/// Crawls the paged invitation list starting at <c>SiteUrl</c>. For every list row the detail page is
/// downloaded, an <c>InviteInfo</c> is generated from it, and links containing ".DOC" in the
/// detail page's attachment table are added to <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>list.Count</c> reaches <c>MaxCount</c>.</param>
/// <returns>The collected items; empty when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();

    // Determine the page count.
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("bgColor", "#EEF4F9")));
    if (tdNodes != null && tdNodes.Count > 0)
    {
        string pageTemp = tdNodes.AsString().Replace(" ", "").Replace(" ", "").Trim();
        Regex regpage = new Regex(@"1/[0-9]+页");
        Match pageMatch = regpage.Match(pageTemp);
        if (pageMatch.Success)
        {
            // The match has the form "1/N页"; keep the default of one page when N is not a valid int.
            int lastPage;
            if (int.TryParse(pageMatch.Value.Split('/')[1].Replace("页", "").Trim(), out lastPage))
            {
                pageInt = lastPage;
            }
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.szzdzb.cn/Product-index-id-8-p-" + i + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "hangao27"))), new TagNameFilter("table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped (presumably the header row — confirm against the site).
        for (int j = 1; j < table.RowCount; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            TableRow tr = table.Rows[j];
            code = tr.Columns[0].ToPlainTextString().Trim();
            prjName = tr.Columns[1].ToPlainTextString().Trim();

            // A row without a link has no detail page; skip it instead of aborting the crawl.
            NodeList rowLinks = tr.Columns[1].SearchFor(typeof(ATag), true);
            if (rowLinks == null || rowLinks.Count == 0)
            {
                continue;
            }
            ATag aTag = rowLinks[0] as ATag;
            if (aTag == null)
            {
                continue;
            }
            InfoUrl = "http://www.szzdzb.cn" + aTag.Link;

            string htmldetail = string.Empty;
            try
            {
                // Download the detail page once; both variants below are derived from the same
                // response (it used to be requested twice).
                string rawDetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);

                // Variant 1: markup kept as-is, used for the stored HTML fragment.
                string markupHtml = rawDetail.Replace(" ", "").Trim();
                Parser dtlparserHTML = new Parser(new Lexer(markupHtml));
                NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "hangao27"))), new TagNameFilter("table")));
                HtmlTxt = dtnodeHTML.AsHtml();

                // Variant 2: line breaks converted and scripts/xml declarations removed, used for the plain text.
                htmldetail = rawDetail.Replace(" ", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n");
                Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
                htmldetail = regexHtml.Replace(htmldetail, "");
            }
            catch (Exception)
            {
                // Best effort: rows whose detail page cannot be downloaded or parsed are skipped.
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "hangao27"))), new TagNameFilter("table")));
            inviteCtx = System.Web.HttpUtility.HtmlDecode(dtnode.AsString().Replace("打印本页 || 关闭窗口", ""));

            // Collapse runs of line breaks or tabs into a single CRLF.
            Regex regCtx = new Regex(@"([\r\n]+)|([\t]+)");
            inviteCtx = regCtx.Replace(inviteCtx, "\r\n");

            Regex regBeginDate = new Regex(@"发布时间:[^\r\n]+\r\n");
            beginDate = regBeginDate.Match(inviteCtx).Value.Replace("发布时间", "").Replace(":", "").Trim();

            specType = "其他";
            msgType = "深圳市振东招标代理有限公司";
            inviteType = ToolHtml.GetInviteTypes(prjName);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Rewind the detail parser before looking for the attachment table.
            dtlparser.Reset();
            dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("bgColor", "#CCCCCC")));
            NodeList FileTag = dtnode.SearchFor(typeof(ATag), true);
            if (FileTag != null && FileTag.Count > 0)
            {
                for (int f = 0; f < FileTag.Count; f++)
                {
                    ATag file = FileTag[f] as ATag;
                    if (file == null)
                    {
                        continue;
                    }

                    if (file.Link.ToUpper().Contains(".DOC"))
                    {
                        BaseAttach attach = ToolDb.GenBaseAttach(file.ToPlainTextString(), info.Id, "http://www.szzdzb.cn" + file.Link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Parses the <c>lefttable</c> table of a list page, downloads the detail page linked from each row,
/// extracts code / tenderer / address / content from the detail table, and appends one
/// <c>InviteInfo</c> per row to <paramref name="list"/>. Links to the site's <c>loadNewsFile</c>
/// endpoint are added to <c>AttachList</c>.
/// </summary>
/// <param name="list">Receives the generated invitation objects.</param>
/// <param name="html">HTML of the list page.</param>
/// <param name="crawlAll">When false, processing stops once <c>list.Count</c> reaches <c>MaxCount</c>.</param>
public void DealHtml(IList list, string html, bool crawlAll)
{
    Parser parserDtl = new Parser(new Lexer(html));
    NodeList aNodes = parserDtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
    if (aNodes == null || aNodes.Count == 0)
    {
        return;
    }

    TableTag table = aNodes[0] as TableTag;
    if (table == null)
    {
        return;
    }

    // The first and the last row are not data rows (presumably header and pager — confirm against the site).
    for (int t = 1; t < table.RowCount - 1; t++)
    {
        string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, ctx = string.Empty, HtmlTxt = string.Empty;

        TableRow tr = table.Rows[t] as TableRow;

        // A row without a link cannot be followed; skip it instead of failing the whole page.
        NodeList rowLinks = tr.SearchFor(typeof(ATag), true);
        if (rowLinks == null || rowLinks.Count == 0)
        {
            continue;
        }
        ATag aTag = rowLinks[0] as ATag;
        if (aTag == null)
        {
            continue;
        }

        InfoUrl = aTag.Link;
        prjName = table.Rows[t].Columns[1].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
        endDate = table.Rows[t].Columns[2].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

        string htmlDtl = string.Empty;
        try
        {
            htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
        }
        catch (Exception)
        {
            // Best effort: rows whose detail page cannot be downloaded are skipped.
            continue;
        }

        Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
        htmlDtl = regexHtml.Replace(htmlDtl, "");

        Parser parserCtx = new Parser(new Lexer(htmlDtl));
        NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "printTb lefttable")));
        if (ctxNode != null && ctxNode.Count > 0)
        {
            // Stored HTML is the detail table minus the "biuuu_button" div.
            Parser parserdiv = new Parser(new Lexer(htmlDtl));
            NodeList aNodesdiv = parserdiv.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "biuuu_button")));
            HtmlTxt = ctxNode.AsHtml().Replace(aNodesdiv.AsHtml(), "").Trim();

            TableTag tabTag = ctxNode[0] as TableTag;
            string tableText = tabTag.ToPlainTextString();

            // This timestamp is only used for the begin/end sanity check below;
            // the begin date that gets stored is taken from the "toptd_bai" cell further down.
            string startTime = tabTag.Rows[1].Columns[0].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
            Regex regex = new Regex(@"时间:\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}");
            Match math = regex.Match(startTime);
            beginDate = math.Value.Replace("时间:", "").Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

            // For each labelled field: take the text after the first colon of the matched line.
            // An unmatched regex yields "", for which Substring(0) is again "".
            Regex regexcode = new Regex("(工程编号|项目编号|招标编号):[^\r\n]+[\r\n]{1}");
            Match match = regexcode.Match(tableText);
            code = match.Value.Substring(match.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

            Regex regexBuildUnit = new Regex("(招标人|建设单位|招标采购单位):[^\r\n]+[\r\n]{1}");
            Match matchBuildUnit = regexBuildUnit.Match(tableText);
            buildUnit = matchBuildUnit.Value.Substring(matchBuildUnit.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

            Regex regexAddress = new Regex("(建设地点|项目地点|工程地点):[^\r\n]+[\r\n]{1}");
            Match matchAddress = regexAddress.Match(tableText);
            prjAddress = matchAddress.Value.Substring(matchAddress.Value.IndexOf(":") + 1).Trim();

            ctx = tabTag.Rows[2].Columns[0].ToPlainTextString().Replace(" ", " ").Replace("\r\n\r\n\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            if (ctx.Length > 0)
            {
                Regex regexCtx = new Regex("<!--[^<]+-->");
                ctx = regexCtx.Replace(ctx, "");
            }

            if (Encoding.Default.GetByteCount(code) > 50)
            {
                code = "";
            }

            // The limit is expressed in bytes, so trim by bytes. The previous Substring(0, 150)
            // threw whenever the value was over 150 bytes but shorter than 150 characters.
            while (buildUnit.Length > 0 && Encoding.Default.GetByteCount(buildUnit) > 150)
            {
                buildUnit = buildUnit.Substring(0, buildUnit.Length - 1);
            }

            if (Encoding.Default.GetByteCount(prjAddress) > 200)
            {
                prjAddress = "见招标公告内容";
            }

            if (beginDate.Length > 0 && endDate.Length > 0)
            {
                // Drop the end date when it lies before the begin date. The end date is only
                // parsed when the begin date parsed, mirroring the sequential parse it replaces.
                DateTime begin;
                DateTime end = new DateTime();
                if (DateTime.TryParse(beginDate, out begin))
                {
                    DateTime.TryParse(endDate, out end);
                }
                if (begin > end)
                {
                    endDate = string.Empty;
                }
            }
        }

        parserCtx.Reset();
        ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "toptd_bai")));
        Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
        beginDate = regDate.Match(ctxNode.AsString()).Value.Trim();

        inviteType = ToolHtml.GetInviteTypes(prjName);
        InviteInfo info = ToolDb.GenInviteInfo("广东省", "惠州市区", "惠东县", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, ctx, remark, "惠州市建设工程交易中心", inviteType, "建设工程", string.Empty, InfoUrl, HtmlTxt);
        list.Add(info);

        // The previous extraction consumed the parser's input; rewind it, otherwise the
        // attachment search below runs on an exhausted parser and finds nothing.
        parserCtx.Reset();
        ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank")));
        NodeList aTagNodes = ctxNode.SearchFor(typeof(ATag), true);
        for (int a = 0; a < aTagNodes.Count; a++)
        {
            ATag fileTage = aTagNodes[a] as ATag;
            if (fileTage == null || fileTage.Link == null)
            {
                continue;
            }

            if (fileTage.Link.Contains("http://www.ebc.huizhou.gov.cn/index/loadNewsFile"))
            {
                string downloadURL = fileTage.Link;
                BaseAttach attach = ToolDb.GenBaseAttach(fileTage.ToPlainTextString(), info.Id, downloadURL);
                base.AttachList.Add(attach);
            }
        }

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return;
        }
    }
}
/// <summary>
/// Walks the paged bid-result list at <c>SiteUrl</c>. Each list entry's detail page is downloaded and,
/// when it contains a <c>div.content</c> block, turned into a <c>BidInfo</c>; attachment links inside
/// that block are added to <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>list.Count</c> reaches <c>MaxCount</c>.</param>
/// <returns>The collected items; empty when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();

    // Determine the page count.
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagePanel")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        string pagerText = pagerNodes.AsString().Replace(" ", "").Replace(" ", "").Trim();
        try
        {
            pageInt = int.Parse(pagerText.GetRegexBegEnd("总", "页"));
        }
        catch (Exception)
        {
            // Keep the default of a single page when the pager text cannot be parsed.
        }
    }

    for (int pageNo = 1; pageNo <= pageInt; pageNo++)
    {
        if (pageNo > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&page=" + pageNo);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList entries = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "ST12")), true), new TagNameFilter("li")));
        if (entries == null || entries.Count == 0)
        {
            continue;
        }

        for (int idx = 0; idx < entries.Count; idx++)
        {
            // These two are never filled in by this crawler and are passed on empty.
            string endDate = string.Empty;
            string otherType = string.Empty;

            ATag entryLink = entries[idx].GetATag();
            string prjName = entryLink.GetAttribute("title");
            if (prjName.Contains("声明"))
            {
                continue;
            }

            string beginDate = entries[idx].ToPlainTextString().GetDateRegex();
            string InfoUrl = entryLink.Link;

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            Parser detailParser = new Parser(new Lexer(detailHtml));
            NodeList contentNodes = detailParser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "content"), new TagNameFilter("div")));
            if (contentNodes == null || contentNodes.Count == 0)
            {
                // No content block: nothing is recorded for this entry.
                continue;
            }

            string HtmlTxt = contentNodes.AsHtml().GetReplace("<!--[if !supportLists]-->,<!--[endif]-->");
            string bidCtx = HtmlTxt.ToCtxString();
            string code = bidCtx.GetCodeRegex().GetCodeDel();

            string buildUnit = bidCtx.GetBuildRegex();
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = bidCtx.GetRegex("采购单位,招标代理");
            }

            // Cut the bidder name right after the first "公司".
            string bidUnit = bidCtx.GetBidRegex();
            if (bidUnit.Contains("公司"))
            {
                bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
            }

            string bidMoney = bidCtx.GetMoneyRegex();
            if (bidMoney == "0" || string.IsNullOrWhiteSpace(bidMoney))
            {
                bidMoney = bidCtx.GetMoneyRegex(new string[] { "中标金额" }, false, "万元");
            }

            string prjMgr = bidCtx.GetMgrRegex();

            try
            {
                // Amounts of 100000 or more are divided by 10000 before being stored.
                decimal amount = decimal.Parse(bidMoney);
                if (amount >= 100000)
                {
                    bidMoney = (amount / 10000).ToString();
                }
            }
            catch
            {
                // Non-numeric amounts are stored unchanged.
            }

            string specType = "政府采购";
            string msgType = "中国远东国际招标公司";
            string bidType = ToolHtml.GetInviteTypes(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            // Attachments are looked up inside the cleaned content fragment only.
            detailParser = new Parser(new Lexer(HtmlTxt));
            NodeList anchors = detailParser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors != null && anchors.Count > 0)
            {
                for (int f = 0; f < anchors.Count; f++)
                {
                    ATag file = anchors[f] as ATag;
                    if (!file.IsAtagAttach())
                    {
                        continue;
                    }

                    // Links that do not contain "http" are resolved against the site root.
                    string link = file.Link.ToLower().Contains("http")
                        ? file.Link
                        : "http://www.cfet.com.cn/" + file.Link;

                    BaseAttach attach = ToolDb.GenBaseAttach(file.ToPlainTextString(), info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged bid-result list at <c>SiteUrl</c> (pages 2..N are "index_{n-1}.html").
/// Each entry's detail page is downloaded, its first table is scanned row by row for code,
/// tenderer, bidder, manager and price, and a <c>BidInfo</c> is appended per entry.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>list.Count</c> reaches <c>MaxCount</c>.</param>
/// <returns>The collected items; empty when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    // Determine the page count.
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default).Replace(" ", "");
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "ny_21 tc")));
    if (sNode != null && sNode.Count > 0)
    {
        string pageString = sNode.AsString().Trim();
        Regex regexPage = new Regex(@"createPageHTML\([^\)]+\)");
        Match pageMatch = regexPage.Match(pageString);

        // The first argument of createPageHTML(...) is taken as the page count;
        // the default of one page is kept when it is missing or not a valid int.
        int lastPage;
        if (int.TryParse(pageMatch.Value.Replace("createPageHTML(", "").Replace(")", "").Split(',')[0].Trim(), out lastPage))
        {
            pageInt = lastPage;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "index_" + (i - 1).ToString() + ".html", Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "ny_22"))), new TagNameFilter("li")));
        if (sNode == null || sNode.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < sNode.Count; j++)
        {
            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty, specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty, prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            INode node = sNode[j];

            // An entry needs a link and a second div (the date). Entries lacking either are
            // skipped; indexing them unchecked used to abort the entire crawl.
            if (node.Children == null)
            {
                continue;
            }
            NodeList entryLinks = node.Children.SearchFor(typeof(ATag), true);
            NodeList entryDivs = node.Children.SearchFor(typeof(Div), true);
            if (entryLinks == null || entryLinks.Count < 1 || entryDivs == null || entryDivs.Count < 2)
            {
                continue;
            }
            ATag aTag = entryLinks[0] as ATag;
            Div divTag = entryDivs[1] as Div;
            if (aTag == null || divTag == null)
            {
                continue;
            }

            prjName = aTag.ToPlainTextString().Trim();
            beginDate = divTag.ToPlainTextString().Trim(new char[] { '[', ']', ' ' });
            InfoUrl = aTag.Link.Replace("./", "http://ztb.gaoming.gov.cn/jsgc/zbjg/");

            string htmldetail = string.Empty;
            try
            {
                // Download the detail page once; both variants below are derived from the same
                // response (it used to be requested twice).
                string rawDetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);

                // Variant 1: markup kept as-is, used for the stored HTML fragment.
                string markupHtml = rawDetail.Replace(" ", "").Trim();
                Parser dtlparserHTML = new Parser(new Lexer(markupHtml));
                NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(new HasParentFilter(new AndFilter(new HasAttributeFilter("class", "con_10 tl"), new TagNameFilter("div"))));
                HtmlTxt = dtnodeHTML.AsHtml();

                // Variant 2: <br> variants converted to CRLF, used for the plain text.
                htmldetail = rawDetail.Replace(" ", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n").Replace("<br/>", "\r\n");
            }
            catch (Exception)
            {
                // Best effort: entries whose detail page cannot be downloaded or parsed are skipped.
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new HasParentFilter(new AndFilter(new HasAttributeFilter("class", "con_10 tl"), new TagNameFilter("div"))));
            if (dtnode != null && dtnode.Count > 0)
            {
                Regex regCtx = new Regex(@"[\n]+");
                bidCtx = regCtx.Replace(dtnode.AsString().Replace(" ", "").Trim(), "\r\n");

                // A detail page without a table still yields a record (text only).
                NodeList detailTables = dtnode.SearchFor(typeof(TableTag), true);
                TableTag table = (detailTables != null && detailTables.Count > 0) ? detailTables[0] as TableTag : null;
                int rowCount = (table != null) ? table.RowCount : 0;

                for (int dl = 0; dl < rowCount; dl++)
                {
                    TableRow tr = table.Rows[dl];

                    // Each field is read as "label cell, value cell"; rows with fewer than two cells carry no field.
                    if (tr.ColumnCount < 2)
                    {
                        continue;
                    }

                    string label = tr.Columns[0].ToPlainTextString();
                    if (label.Contains("编号"))
                    {
                        code = tr.Columns[1].ToPlainTextString().Trim();
                    }
                    else if (label.Contains("招标单位"))
                    {
                        buildUnit = tr.Columns[1].ToPlainTextString().Trim();
                    }
                    else if (label.Contains("中标单位"))
                    {
                        bidUnit = tr.Columns[1].ToPlainTextString().Trim();
                    }
                    else if (label.Contains("建造师") || label.Contains("负责人") || label.Contains("法定代表人"))
                    {
                        prjMgr = tr.Columns[1].ToPlainTextString().Replace(" ", "").Trim();
                    }
                    else if (label.Contains("中标价"))
                    {
                        Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
                        Regex regmoneyctx = new Regex(@"[0-9]+[\%]");

                        // Percentages are removed first so they are not mistaken for the amount.
                        string priceText = tr.Columns[1].ToPlainTextString();
                        string bidMoneyctx = regmoneyctx.Replace(priceText, "");
                        if (!string.IsNullOrEmpty(bidMoneyctx))
                        {
                            if (priceText.Contains("万元"))
                            {
                                bidMoney = regBidMoney.Match(bidMoneyctx).Value;
                            }
                            else
                            {
                                // Amounts without "万元" are divided by 10000; results below 0.1
                                // and unparsable values are stored as "0".
                                try
                                {
                                    bidMoney = (decimal.Parse(regBidMoney.Match(bidMoneyctx).Value) / 10000).ToString();
                                    if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                                    {
                                        bidMoney = "0";
                                    }
                                }
                                catch (Exception)
                                {
                                    bidMoney = "0";
                                }
                            }
                        }
                    }
                }
            }

            // An over-long bidder that lists several lots is reduced to its first line.
            if (Encoding.Default.GetByteCount(bidUnit) > 150 && bidUnit.Contains("第二标段"))
            {
                int firstLineEnd = bidUnit.IndexOf("\n");
                if (firstLineEnd >= 0)
                {
                    bidUnit = bidUnit.Remove(firstLineEnd).Replace("第一标段", "").Replace(":", "").Replace(":", "");
                }
            }

            msgType = "佛山市高明区建设工程交易中心";
            specType = "建设工程";
            prjName = ToolDb.GetPrjName(prjName);
            bidType = ToolHtml.GetInviteTypes(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "佛山市区", "高明区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the ASP.NET-paged invitation list at <c>SiteUrl</c>. Pages after the first are requested by
/// posting the view state of the current page together with the <c>PageDataList$ctl12$LinkButton1</c>
/// event target. Each list row's detail page is downloaded and turned into an <c>InviteInfo</c>;
/// links containing ".DOC" are added to <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>list.Count</c> reaches <c>MaxCount</c>.</param>
/// <returns>The collected items; empty when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    // Determine the page count.
    int pageInt = 1;
    string html = string.Empty;
    string viewState = string.Empty;
    string eventValidation = string.Empty;
    string cookiestr = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "PageDataList")));
    if (tdNodes == null)
    {
        return list;
    }

    string pageTemp = tdNodes.AsString().Replace(" ", "").Replace(" ", "").Trim();
    Regex regpage = new Regex(@"共[0-9]+条");

    // The pager shows the total record count; 15 records per page. The default of one page
    // is kept when the count is missing or not a valid int.
    int pageCount;
    if (int.TryParse(regpage.Match(pageTemp).Value.Replace("共", "").Replace("条", "").Trim(), out pageCount))
    {
        if (pageCount % 15 > 0)
        {
            pageInt = (pageCount / 15) + 1;
        }
        else
        {
            pageInt = pageCount / 15;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Post back using the hidden fields of the page currently held in html.
            viewState = this.ToolWebSite.GetAspNetViewState(html);
            eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "cataId", "find_yn", "key_word", "typeId", "__EVENTARGUMENT", "__EVENTTARGET", "__EVENTVALIDATION", "__VIEWSTATE" },
                new string[] { "1,2,3,4,5,6,7,8,", string.Empty, string.Empty, "1,2,3,4,5,6,7,8,", string.Empty, "PageDataList$ctl12$LinkButton1", eventValidation, viewState });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "StaffList"), new TagNameFilter("table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped (presumably the header row — confirm against the site).
        for (int j = 1; j < table.RowCount; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            TableRow tr = table.Rows[j];
            code = tr.Columns[0].ToPlainTextString().Trim();
            prjName = tr.Columns[1].ToPlainTextString().Trim();
            beginDate = tr.Columns[2].ToPlainTextString().Trim();

            // A row without a link has no detail page; skip it instead of aborting the crawl.
            NodeList rowLinks = tr.Columns[1].SearchFor(typeof(ATag), true);
            if (rowLinks == null || rowLinks.Count == 0)
            {
                continue;
            }
            ATag aTag = rowLinks[0] as ATag;
            if (aTag == null)
            {
                continue;
            }
            InfoUrl = "http://www.cobo91.com/project/" + aTag.Link;

            string htmldetail = string.Empty;
            try
            {
                // Download the detail page once; both variants below are derived from the same
                // response (it used to be requested twice).
                string rawDetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);

                // Variant 1: markup kept as-is, used for the stored HTML fragment.
                string markupHtml = rawDetail.Replace(" ", "").Trim();
                Parser dtlparserHTML = new Parser(new Lexer(markupHtml));
                NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "info"), new TagNameFilter("table")));
                HtmlTxt = dtnodeHTML.AsHtml();

                // Variant 2: line breaks converted and scripts/xml declarations removed, used for the plain text.
                htmldetail = rawDetail.Replace(" ", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n");
                Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
                htmldetail = regexHtml.Replace(htmldetail, "");
            }
            catch (Exception)
            {
                // Best effort: rows whose detail page cannot be downloaded or parsed are skipped.
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "info"), new TagNameFilter("table")));
            inviteCtx = System.Web.HttpUtility.HtmlDecode(dtnode.AsString().Replace("【打印本页】", "").Replace("【关闭窗口】", "").Replace("版权所有:中邦国际招标&邦迪工程顾问", ""));

            buildUnit = inviteCtx.GetBuildRegex();
            prjAddress = inviteCtx.GetAddressRegex();
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = inviteCtx.GetRegex("采购单位,招标代理");
            }
            if (string.IsNullOrEmpty(prjAddress))
            {
                prjAddress = inviteCtx.GetRegex("开标地点");
            }

            specType = "其他";
            msgType = "中邦国际招标&邦迪工程顾问";
            inviteType = ToolHtml.GetInviteTypes(prjName);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Rewind the detail parser before collecting the links.
            dtlparser.Reset();
            NodeList FileTag = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (FileTag != null && FileTag.Count > 0)
            {
                for (int f = 0; f < FileTag.Count; f++)
                {
                    ATag file = FileTag[f] as ATag;
                    if (file == null)
                    {
                        continue;
                    }

                    if (file.Link.ToUpper().Contains(".DOC"))
                    {
                        BaseAttach attach = ToolDb.GenBaseAttach(file.ToPlainTextString(), info.Id, "http://www.cobo91.com" + file.Link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Parses the <c>lefttable</c> table of a list page, downloads the detail page linked from each row,
/// extracts code, the two party names, price and content from the detail table, and appends one
/// <c>BidInfo</c> per row to <paramref name="list"/>. Links to the site's <c>loadNewsFile</c>
/// endpoint are added to <c>AttachList</c>.
/// </summary>
/// <param name="list">Receives the generated bid-result objects.</param>
/// <param name="html">HTML of the list page.</param>
/// <param name="crawlAll">When false, processing stops once <c>list.Count</c> reaches <c>MaxCount</c>.</param>
public void DealHtml(IList list, string html, bool crawlAll)
{
    Parser parserDtl = new Parser(new Lexer(html));
    NodeList aNodes = parserDtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
    if (aNodes == null || aNodes.Count == 0)
    {
        return;
    }

    TableTag table = aNodes[0] as TableTag;
    if (table == null)
    {
        return;
    }

    // The first and the last row are not data rows (presumably header and pager — confirm against the site).
    for (int t = 1; t < table.RowCount - 1; t++)
    {
        string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, bidType = string.Empty, beginDate = string.Empty, endDate = string.Empty, InfoUrl = string.Empty, ctx = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, HtmlTxt = string.Empty;

        TableRow tr = table.Rows[t] as TableRow;

        // A row without a link cannot be followed; skip it instead of failing the whole page.
        NodeList rowLinks = tr.SearchFor(typeof(ATag), true);
        if (rowLinks == null || rowLinks.Count == 0)
        {
            continue;
        }
        ATag aTag = rowLinks[0] as ATag;
        if (aTag == null)
        {
            continue;
        }

        InfoUrl = aTag.Link;
        prjName = table.Rows[t].Columns[1].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
        endDate = table.Rows[t].Columns[2].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

        string htmlDtl = string.Empty;
        try
        {
            htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
        }
        catch (Exception)
        {
            // Best effort: rows whose detail page cannot be downloaded are skipped.
            continue;
        }

        Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
        htmlDtl = regexHtml.Replace(htmlDtl, "");

        Parser parserCtx = new Parser(new Lexer(htmlDtl));
        NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "printTb lefttable")));
        if (ctxNode != null && ctxNode.Count > 0)
        {
            // Stored HTML is the detail table minus the "biuuu_button" div.
            Parser parserdiv = new Parser(new Lexer(htmlDtl));
            NodeList aNodesdiv = parserdiv.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "biuuu_button")));
            HtmlTxt = ctxNode.AsHtml().Replace(aNodesdiv.AsHtml(), "").Trim();

            TableTag tabTag = ctxNode[0] as TableTag;
            string tableText = tabTag.ToPlainTextString();

            Regex regexcode = new Regex("(工程编号|项目编号):[^\r\n]+[\r\n]{1}");
            Match match = regexcode.Match(tableText);
            if (match.Value.Length > 0)
            {
                code = match.Value.Substring(match.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
            }

            // NOTE(review): the local names are swapped relative to the labels they match
            // (buildUnit <- winning-bidder labels, bidUnit <- tenderer labels) and they are passed
            // to GenBidInfo below in that same swapped order. The order is preserved as found —
            // confirm against GenBidInfo's parameter list before renaming.
            Regex regexBuildUnit = new Regex("(中标人|中标单位):[^\r\n]+[\r\n]{1}");
            Match matchBuildUnit = regexBuildUnit.Match(tableText);
            if (matchBuildUnit.Value.Length > 0)
            {
                buildUnit = matchBuildUnit.Value.Substring(matchBuildUnit.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
            }

            Regex regexbidUnit = new Regex("(招标人|建设单位|第一中标候选人):[^\r\n]+[\r\n]{1}");
            Match matchbidUnit = regexbidUnit.Match(tableText);
            if (matchbidUnit.Value.Length > 0)
            {
                bidUnit = matchbidUnit.Value.Replace("第一中标候选人:", "").Replace("招标人:", "").Replace("建设单位:", "").Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
                if (bidUnit.Contains(":"))
                {
                    // Anything from the next colon on belongs to a following label.
                    bidUnit = bidUnit.Remove(bidUnit.IndexOf(":")).ToString().Trim();
                }
            }

            Regex regexMoney = new Regex("(中标价|其中标价为|中标价格):[^\r\n]+[\r\n]{1}");
            Match matchMoney = regexMoney.Match(tableText);
            if (matchMoney.Value.Length > 0)
            {
                bidMoney = matchMoney.Value.Replace("中标价:", "").Replace("其中标价为:", "").Replace("中标价格:", "").Replace("\r", "");
            }

            Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
            if (bidMoney.Contains("万"))
            {
                bidMoney = bidMoney.Remove(bidMoney.IndexOf("万")).Trim();
                bidMoney = regBidMoney.Match(bidMoney).Value;
            }
            else
            {
                // Amounts without "万" are divided by 10000; results below 0.1 and
                // unparsable (including missing) values are stored as "0".
                try
                {
                    bidMoney = (decimal.Parse(regBidMoney.Match(bidMoney).Value) / 10000).ToString();
                    if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                    {
                        bidMoney = "0";
                    }
                }
                catch (Exception)
                {
                    bidMoney = "0";
                }
            }

            if (Encoding.Default.GetByteCount(code) > 50)
            {
                code = "";
            }

            // The limits are expressed in bytes, so trim by bytes. The previous Substring(0, 150)
            // threw whenever a value was over 150 bytes but shorter than 150 characters.
            while (buildUnit.Length > 0 && Encoding.Default.GetByteCount(buildUnit) > 150)
            {
                buildUnit = buildUnit.Substring(0, buildUnit.Length - 1);
            }
            while (bidUnit.Length > 0 && Encoding.Default.GetByteCount(bidUnit) > 150)
            {
                bidUnit = bidUnit.Substring(0, bidUnit.Length - 1);
            }

            ctx = tabTag.Rows[2].Columns[0].ToPlainTextString().Replace(" ", " ").Replace("\r\n\r\n\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            if (ctx.Length > 0)
            {
                Regex regexCtx = new Regex("<!--[^<]+-->");
                ctx = regexCtx.Replace(ctx, "");
            }
        }

        parserCtx.Reset();
        ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "toptd_bai")));
        Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
        string headerDate = regDate.Match(ctxNode.AsString()).Value.Trim();

        // Prefer the date following "公示开始时间" in the content; fall back to the header date.
        beginDate = headerDate;
        if (ctx.Contains("公示开始时间"))
        {
            Regex regBeDate = new Regex(@"\d{4}年\d{1,2}月\d{1,2}日");
            beginDate = regBeDate.Match(ctx.Substring(ctx.IndexOf("公示开始时间"))).Value.Trim();
        }
        if (beginDate == "")
        {
            beginDate = headerDate;
        }

        prjName = ToolDb.GetPrjName(prjName);
        bidType = ToolHtml.GetInviteTypes(prjName);
        BidInfo info = ToolDb.GenBidInfo("广东省", "惠州市区", "惠阳区", string.Empty, code, prjName, bidUnit, beginDate, buildUnit, beginDate, endDate, ctx, string.Empty, "惠州市建设工程交易中心", bidType, "建设工程", string.Empty, bidMoney, InfoUrl, string.Empty, HtmlTxt);
        list.Add(info);

        // The previous extraction consumed the parser's input; rewind it, otherwise the
        // attachment search below runs on an exhausted parser and finds nothing.
        parserCtx.Reset();
        ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank")));
        NodeList aTagNodes = ctxNode.SearchFor(typeof(ATag), true);
        for (int a = 0; a < aTagNodes.Count; a++)
        {
            ATag fileTage = aTagNodes[a] as ATag;
            if (fileTage == null || fileTage.Link == null)
            {
                continue;
            }

            if (fileTage.Link.Contains("http://www.ebc.huizhou.gov.cn/index/loadNewsFile"))
            {
                string downloadURL = fileTage.Link;
                BaseAttach attach = ToolDb.GenBaseAttach(fileTage.ToPlainTextString(), info.Id, downloadURL);
                base.AttachList.Add(attach);
            }
        }

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return;
        }
    }
}
/// <summary>
/// Parses the POI list contained in <c>currentHtml</c> and appends one <c>POI</c>
/// to <c>poiList</c> for every <c>DD</c> entry matched by <c>poiFilter</c>.
/// </summary>
/// <param name="currentPage">Page number stored on every POI extracted from this page.</param>
/// <remarks>
/// For each entry the ratings, star rank, average cost, comment count, name,
/// address/phone and tags are read. When a field group does not match exactly one
/// node, a diagnostic line is written to the console and that field is left unset.
/// Numeric fields are parsed with <c>Int32.Parse</c>, so non-numeric text still
/// raises <c>FormatException</c>.
/// </remarks>
public void GetInfoFromHtml(int currentPage)
{
    // The last characters of the address text node are stored as the phone number.
    const int phoneLength = 8;
    const string commentSuffix = "封点评";

    Lexer lexer = new Lexer(currentHtml);
    Parser parser = new Parser(lexer);
    NodeList poiHeadList = parser.Parse(poiListFilter);
    if (poiHeadList.Count != 1)
    {
        Console.WriteLine("获取POI列表出错");
        return;
    }

    NodeList poiNodeList = poiHeadList[0].Children.ExtractAllNodesThatMatch(poiFilter, false);
    int numCount = 0;
    for (int i = 0; i < poiNodeList.Count; i++)
    {
        POI poi = new POI();
        DefinitionListBullet poiNode = (DefinitionListBullet)poiNodeList[i];
        if (!poiNode.TagName.Equals("DD"))
        {
            continue;
        }

        numCount++;
        poi.Page = currentPage;
        poi.Number = numCount;

        // ---- Taste / environment / service scores and the star rank ----
        NodeList tasteNodeList = poiNode.Children.ExtractAllNodesThatMatch(tasteFilter, true);
        NodeList environmentNodeList = poiNode.Children.ExtractAllNodesThatMatch(environmentFilter, true);
        NodeList serviceNodeList = poiNode.Children.ExtractAllNodesThatMatch(serviceFilter, true);
        if (tasteNodeList.Count == 1 && environmentNodeList.Count == 1 && serviceNodeList.Count == 1)
        {
            // A score of "-" means "not rated"; the property keeps its default then.
            Span spanNode = (Span)tasteNodeList[0];
            if (!spanNode.ToPlainTextString().Equals("-"))
            {
                poi.TasteRemark = Int32.Parse(spanNode.ToPlainTextString());
            }

            spanNode = (Span)environmentNodeList[0];
            if (!spanNode.ToPlainTextString().Equals("-"))
            {
                poi.EnvironmentRemark = Int32.Parse(spanNode.ToPlainTextString());
            }

            spanNode = (Span)serviceNodeList[0];
            if (!spanNode.ToPlainTextString().Equals("-"))
            {
                poi.ServiceRemark = Int32.Parse(spanNode.ToPlainTextString());
            }

            // Star rank: second sibling after the service score's parent.
            // Every hop is null-checked so a shorter sibling chain skips the rank
            // instead of throwing NullReferenceException.
            INode rankNodeOfParent = spanNode.Parent;
            if (rankNodeOfParent != null)
            {
                rankNodeOfParent = rankNodeOfParent.NextSibling;
            }
            if (rankNodeOfParent != null)
            {
                rankNodeOfParent = rankNodeOfParent.NextSibling;
            }

            if (rankNodeOfParent != null && rankNodeOfParent.Children != null && rankNodeOfParent.Children.Count >= 1)
            {
                INode rankNodeCandidate = rankNodeOfParent.Children[0];
                if (rankNodeCandidate.GetType().Equals(typeof(Span)))
                {
                    // The TITLE attribute may be absent; treat that as "no rank".
                    string rank = ((Span)rankNodeCandidate).GetAttribute("TITLE");
                    if (rank != null)
                    {
                        if (rank.Contains("五"))
                        {
                            poi.Rank = 5;
                        }
                        else if (rank.Contains("四"))
                        {
                            poi.Rank = 4;
                        }
                        else if (rank.Contains("三"))
                        {
                            poi.Rank = 3;
                        }
                        else if (rank.Contains("二"))
                        {
                            poi.Rank = 2;
                        }
                        else if (rank.Contains("一"))
                        {
                            poi.Rank = 1;
                        }
                    }
                }
            }
        }
        else
        {
            Console.WriteLine("第" + i + "条POI中,判断口味、环境和服务的标准出错!");
        }

        // ---- Average cost ----
        NodeList averageNodeList = poiNode.Children.ExtractAllNodesThatMatch(averageFilter, true);
        if (averageNodeList.Count == 1)
        {
            // The amount is read from the second sibling after the matched node.
            INode costNode = averageNodeList[0].NextSibling;
            if (costNode != null)
            {
                costNode = costNode.NextSibling;
            }

            if (costNode != null && costNode.GetType().Equals(typeof(TextNode)))
            {
                string cost = ((TextNode)costNode).ToPlainTextString();
                poi.AverageCost = Int32.Parse(cost);
            }
        }
        else
        {
            Console.WriteLine("第" + i + "条POI中,判断平均消费的标准出错!");
        }

        // ---- Comment count ----
        NodeList commentNodeList = poiNode.Children.ExtractAllNodesThatMatch(commentFilter, true);
        if (commentNodeList.Count == 1)
        {
            INode commentNode = commentNodeList[0];
            if (commentNode.GetType().Equals(typeof(ATag)))
            {
                string commentNum = ((ATag)commentNode).StringText;

                // EndsWith replaces Substring(Length - 3, 3), which threw
                // ArgumentOutOfRangeException for text shorter than the suffix.
                if (commentNum.EndsWith(commentSuffix, StringComparison.Ordinal))
                {
                    commentNum = commentNum.Substring(0, commentNum.Length - commentSuffix.Length);
                }

                poi.CommentCount = Int32.Parse(commentNum);
            }
        }
        else
        {
            Console.WriteLine("第" + i + "条POI中,判断点评数的标准出错!");
        }

        // ---- Shop name ----
        NodeList nameNodeList = poiNode.Children.ExtractAllNodesThatMatch(nameFilter, true);
        if (nameNodeList.Count == 1)
        {
            INode nameNode = nameNodeList[0];
            if (nameNode.GetType().Equals(typeof(ATag)))
            {
                poi.Name = ((ATag)nameNode).StringText;
            }
        }
        else
        {
            Console.WriteLine("第" + i + "条POI中,判断店名的标准出错!");
        }

        // ---- Address and phone ----
        NodeList addressNodeList = poiNode.Children.ExtractAllNodesThatMatch(addressFilter, true);
        if (addressNodeList.Count == 1)
        {
            NodeList districtNodeList = addressNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)));
            if (districtNodeList.Count == 1)
            {
                ATag districtTag = (ATag)districtNodeList[0];
                string address = districtTag.ToPlainTextString();

                INode detailNode = districtTag.NextSibling;
                if (detailNode != null && detailNode.GetType().Equals(typeof(TextNode)))
                {
                    string detailAddress = ((TextNode)detailNode).ToPlainTextString().Trim();
                    if (detailAddress.Length >= phoneLength)
                    {
                        // The trailing characters are the phone, the rest is the street address.
                        poi.Phone = detailAddress.Substring(detailAddress.Length - phoneLength, phoneLength);
                        address += detailAddress.Substring(0, detailAddress.Length - phoneLength);
                    }
                    else
                    {
                        // Too short to contain a phone number: keep it all as address
                        // text instead of throwing from Substring.
                        address += detailAddress;
                    }
                }

                // Strip every space, newline and tab (this also covers leading/trailing ones).
                char[] removeChrVector = { ' ', '\n', '\t' };
                foreach (char c in removeChrVector)
                {
                    address = address.Replace(c.ToString(), "");
                }

                poi.Address = address;
            }
            else
            {
                Console.WriteLine("第" + i + "条POI中,判断含地址的<a>标记的标准出错!");
            }
        }
        else
        {
            Console.WriteLine("第" + i + "条POI中,判断地址的标准出错!");
        }

        // ---- Tags: plain text of every direct <a> child ----
        NodeList tagsNodeList = poiNode.Children.ExtractAllNodesThatMatch(tagsFilter, true);
        if (tagsNodeList.Count == 1)
        {
            INode tagsNode = tagsNodeList[0];
            if (tagsNode.Children != null)
            {
                for (int j = 0; j < tagsNode.Children.Count; j++)
                {
                    INode node = tagsNode.Children[j];
                    if (node.GetType().Equals(typeof(ATag)))
                    {
                        poi.Tags.Add(node.ToPlainTextString());
                    }
                }
            }
        }
        else
        {
            Console.WriteLine("第" + i + "条POI中,判断标签的标准出错!");
        }

        poiList.Add(poi);
    }
}
/// <summary>
/// Crawls the paged bid-result list at <c>SiteUrl</c>, builds one <c>BidInfo</c> per
/// table row and registers the attachments linked from each detail page.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as the list holds <c>MaxCount</c> records.
/// </param>
/// <returns>
/// The collected <c>BidInfo</c> records; empty when the first page cannot be fetched.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://www.chjssz.gov.cn/";
    const string invType = "施工,设计,勘察,服务,劳务分包,专业分包,小型施工,监理,设备材料,其他";

    IList list = new ArrayList();

    // ---- Determine the page count from the "尾页" (last page) link ----
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new TagNameFilter("form")), new TagNameFilter("a")));
    if (sNode != null)
    {
        for (int i = 0; i < sNode.Count; i++)
        {
            ATag pageA = sNode[i] as ATag;
            if (pageA == null || !pageA.ToPlainTextString().Contains("尾页"))
            {
                continue;
            }

            // The page number is whatever follows the last '=' in the link.
            // An unparsable value leaves pageInt unchanged.
            string link = pageA.Link ?? string.Empty;
            int lastPage;
            if (int.TryParse(link.Substring(link.LastIndexOf('=') + 1), out lastPage))
            {
                pageInt = lastPage;
            }
        }
    }

    // ---- Walk every list page ----
    for (int i = 1; i <= pageInt; i++)
    {
        try
        {
            html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&Page=" + i.ToString()), Encoding.Default);
        }
        catch (Exception)
        {
            continue;
        }

        parser = new Parser(new Lexer(html));
        sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("bordercolor", "#CCCCCC")));

        // The data table is the SECOND match, so at least two tables must exist.
        // The previous guard only required one and then read index 1.
        if (sNode == null || sNode.Count < 2)
        {
            continue;
        }

        TableTag table = sNode[1] as TableTag;
        if (table == null)
        {
            continue;
        }

        string HtmlTxt = sNode.AsHtml();

        // Row 0 is skipped, as in the original loop.
        for (int j = 1; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];

            // Rows without the four expected cells or without a detail link
            // cannot produce a record; skip them instead of throwing.
            if (tr == null || tr.ColumnCount < 4)
            {
                continue;
            }

            NodeList cNode = new NodeList();
            tr.Columns[1].CollectInto(cNode, new TagNameFilter("a"));
            ATag detailLink = cNode.Count > 0 ? cNode[0] as ATag : null;
            if (detailLink == null)
            {
                continue;
            }

            // Column 0: bid type, column 1: project name, column 2: winning bidder,
            // column 3: publish date wrapped in square brackets.
            string bidType = tr.Columns[0].ToPlainTextString();
            string specType = invType.Contains(bidType) ? "建设工程" : "其他";
            string prjName = tr.Columns[1].ToPlainTextString().Replace(" ", "");
            string bidUnit = tr.Columns[2].ToPlainTextString().Replace(" ", "");
            string bidDate = tr.Columns[3].ToPlainTextString().TrimStart('[').TrimEnd(']');
            string code = string.Empty;
            string InfoUrl = siteRoot + detailLink.Link;

            prjName = ToolDb.GetPrjName(prjName);
            bidType = ToolHtml.GetInviteTypes(bidType);
            BidInfo info = ToolDb.GenBidInfo("广东省", "广州市区", "从化市", string.Empty, code, prjName, string.Empty, bidDate, bidUnit, bidDate, string.Empty, "见附件", string.Empty, "广州建设工程交易中心", bidType, specType, string.Empty, string.Empty, InfoUrl, string.Empty, HtmlTxt);
            list.Add(info);

            // ---- Detail page: collect attachments ----
            string dlHtml = null;
            try
            {
                dlHtml = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default);
            }
            catch (Exception)
            {
                // Detail page unavailable: the record is kept without attachments.
                // Deliberately no 'continue' here, so the MaxCount check below still
                // runs for the record that was just added.
            }

            if (dlHtml != null)
            {
                Parser dlParser = new Parser(new Lexer(dlHtml));
                NodeList dlNodes = dlParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("background", "pic/abouts_16.jpg")));
                if (dlNodes != null && dlNodes.Count > 0)
                {
                    NodeList ddNode = dlNodes.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("A"), new HasAttributeFilter("target", "_blank")), true);
                    if (ddNode != null)
                    {
                        for (int k = 0; k < ddNode.Count; k++)
                        {
                            ATag ddATag = ddNode[k] as ATag;
                            if (ddATag != null && ddATag.Link != null && ddATag.Link.Contains("UploadFiles"))
                            {
                                BaseAttach attach = ToolDb.GenBaseAttach(ddATag.StringText, info.Id, siteRoot + ddATag.Link);
                                base.AttachList.Add(attach);
                            }
                        }
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the bid-result notices listed at <c>SiteUrl</c> (further pages under
/// <c>http://new.sztc.com/bidNotice/</c>), builds one <c>BidInfo</c> per notice
/// and registers the attachments linked from the notice body.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as the list holds <c>MaxCount</c> records.
/// </param>
/// <returns>
/// The collected <c>BidInfo</c> records; empty when the first page cannot be fetched.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();

    // ---- Determine the page count from the pagination block ("x/N页") ----
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination")));
    if (tdNodes != null && tdNodes.Count > 0)
    {
        string pageTemp = tdNodes.AsString().Replace(" ", "");
        try
        {
            pageInt = int.Parse(pageTemp.GetRegexBegEnd("/", "页"));
        }
        catch
        {
            // Best effort: an unreadable page count leaves pageInt at 1.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        // Page 1 was already downloaded above.
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://new.sztc.com/bidNotice/index_" + i + ".jhtml");
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "lb-link")), true), new TagNameFilter("li")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < nodeList.Count; j++)
        {
            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty,
                   specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            // List items without a link cannot be followed; skip them.
            ATag aTag = nodeList[j].GetATag();
            if (aTag == null)
            {
                continue;
            }

            // The link text carries the title with the publish date embedded in it.
            prjName = aTag.LinkText.ToNodeString().Replace(" ", "");
            beginDate = prjName.GetDateRegex();

            // String.Replace throws when the search string is null or empty, so the
            // guard must test the extracted date (the old code tested prjName).
            if (!string.IsNullOrEmpty(prjName) && !string.IsNullOrEmpty(beginDate))
            {
                prjName = prjName.Replace(beginDate, "");
            }

            InfoUrl = aTag.Link;
            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "ninfo-con"), new TagNameFilter("div")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtnode.AsHtml();
            bidCtx = HtmlTxt.GetReplace("</p>,<br/>", "\r\n").ToCtxString().GetReplace("\t", "").GetReplace("\r\n\r\n,\r\n\r\n,\r\n\r\n,\r\n\r\n,\r\n\r\n,\r\n\r\n,\r\n\r\n,\r\n\r\n,\r\n\r\n,\r\n\r\n,\r\n\r\n", "\r\n");
            bidCtx = bidCtx.GetReplace("单位:\r\n,单位:\r\n", "单位:").GetReplace("中标人:\r\n,中标人:\r\n", "中标人:").GetReplace("编号:\r\n,编号:\r\n", "编号:");

            code = bidCtx.GetCodeRegex().GetCodeDel();
            buildUnit = bidCtx.GetBuildRegex();
            bidUnit = bidCtx.GetBidRegex();
            bidMoney = bidCtx.Replace("和中标金额", "").GetMoneyRegex(new string[] { "中标金额" });
            if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
            {
                bidMoney = bidCtx.GetMoneyRegex();
            }

            // Fallback: no bidder in the running text, so pair the first table's
            // header cells (row 0) with its value cells (row 1) as "header:value" lines.
            if (string.IsNullOrEmpty(bidUnit))
            {
                Parser tableParser = new Parser(new Lexer(HtmlTxt));
                NodeList dtlNode = tableParser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                TableTag table = (dtlNode != null && dtlNode.Count > 0) ? dtlNode[0] as TableTag : null;

                // An empty table has no Rows[0]; skip the fallback instead of throwing.
                if (table != null && table.RowCount > 0)
                {
                    string ctx = string.Empty;
                    for (int r = 0; r < table.Rows[0].ColumnCount; r++)
                    {
                        try
                        {
                            ctx += table.Rows[0].Columns[r].ToNodePlainString() + ":";
                            ctx += table.Rows[1].Columns[r].ToNodePlainString() + "\r\n";
                        }
                        catch
                        {
                            // A missing value cell leaves a dangling "header:" in ctx,
                            // exactly as before.
                        }
                    }

                    bidUnit = ctx.GetBidRegex();
                    bidMoney = ctx.GetMoneyRegex(new string[] { "中标金额" });
                    if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                    {
                        bidMoney = ctx.GetMoneyRegex();
                    }
                }
            }

            // Clean up the bidder name.
            if (bidUnit.Contains("名称"))
            {
                bidUnit = bidUnit.Replace("名称", "");
            }
            if (bidUnit.Contains("公司"))
            {
                // Cut everything after the first "公司".
                bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
            }
            if (bidUnit.Contains("包号"))
            {
                bidUnit = "";
            }

            specType = "政府采购";
            msgType = "深圳市国际招标有限公司";
            bidType = prjName.GetInviteBidType();
            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            // ---- Attachments linked from the notice body ----
            Parser fileParser = new Parser(new Lexer(HtmlTxt));
            NodeList FileTag = fileParser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (FileTag != null)
            {
                for (int f = 0; f < FileTag.Count; f++)
                {
                    ATag file = FileTag[f] as ATag;
                    if (file == null || !file.IsAtagAttach())
                    {
                        continue;
                    }

                    // Relative links are resolved against the site root.
                    string link = file.Link.ToLower().Contains("http")
                        ? file.Link
                        : "http://new.sztc.com/" + file.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(file.ToPlainTextString(), info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the bid-result notices listed at <c>SiteUrl</c> (further pages under
/// <c>http://www.szgm.gov.cn</c>) and builds one <c>BidInfo</c> per notice.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as the list holds <c>MaxCount</c> records.
/// </param>
/// <returns>
/// The records collected so far. Unlike a failed detail fetch (which skips one
/// record), a failed list-page fetch ends the crawl and returns immediately.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    // Used when the "末页" link does not yield a page count.
    const int fallbackPageCount = 82;

    // Strips script, XML-declaration, style and xml blocks from detail pages.
    // Built once here rather than once per record.
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");

    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;
    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        return list;
    }

    // ---- Determine the page count from the "末页" (last page) link ----
    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "box"))), new TagNameFilter("a")));
    if (nodeList != null && nodeList.Count > 0)
    {
        try
        {
            ATag aTag = nodeList[nodeList.Count - 1] as ATag;
            if (aTag.ToPlainTextString().Contains("末页"))
            {
                page = int.Parse(aTag.GetAttribute("tagname").ToLower().Replace("/szgm/132100/xwdt17/135204/151246/8d25503a-", "").Replace(".html", ""));
            }
        }
        catch
        {
            // Best effort: fall through to the fallback page count below.
        }
    }

    if (page == 1)
    {
        page = fallbackPageCount;
    }

    for (int i = 1; i <= page; i++)
    {
        // Page 1 was already downloaded above.
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode("http://www.szgm.gov.cn/szgm/132100/xwdt17/135204/151250/897d248a-" + i.ToString() + ".html"), Encoding.UTF8);
            }
            catch
            {
                return list;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tabList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "page_co")), true), new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%"))));
        if (tabList == null || tabList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < tabList.Count; j++)
        {
            // Each notice is a one-row table; ignore anything else instead of
            // dereferencing a failed cast or a missing first row.
            TableTag rowTable = tabList[j] as TableTag;
            if (rowTable == null || rowTable.RowCount < 1)
            {
                continue;
            }

            TableRow tr = rowTable.Rows[0];
            ATag aTag = tr.GetATag();
            if (aTag == null || tr.ColumnCount != 3)
            {
                continue;
            }

            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty,
                   specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
            prjName = aTag.GetAttribute("title");
            InfoUrl = "http://www.szgm.gov.cn" + aTag.Link;

            string htlDtl = string.Empty;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
                htlDtl = regexHtml.Replace(htlDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "article_body")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtl.AsHtml();
            bidCtx = HtmlTxt.ToCtxString();

            // Prefer the project name stated in the notice body over the link title.
            string tempName = bidCtx.GetRegex("工程名称");
            if (!string.IsNullOrWhiteSpace(tempName))
            {
                prjName = tempName;
            }

            code = bidCtx.GetCodeRegex();
            buildUnit = bidCtx.GetBuildRegex();

            // Bidder: try progressively looser patterns.
            bidUnit = bidCtx.GetBidRegex();
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                bidUnit = bidCtx.GetRegex("委托单位");
            }
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                bidUnit = bidCtx.GetRegexBegEnd("确认", "为中标单位");
            }

            // Amount: try progressively looser patterns.
            bidMoney = bidCtx.GetMoneyRegex();
            if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
            {
                bidMoney = bidCtx.GetRegex("合同价").GetMoney();
            }
            if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
            {
                bidMoney = bidCtx.GetRegexBegEnd("人民币", "元").GetMoney();
            }

            // Fallback: read a header/value table embedded in the notice.
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                Parser tableParser = new Parser(new Lexer(HtmlTxt));
                NodeList tableNode = tableParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "holder")), true), new TagNameFilter("table")));
                if (tableNode == null || tableNode.Count < 1)
                {
                    tableParser.Reset();
                    tableNode = tableParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "MsoNormalTable")));
                }

                TableTag table = (tableNode != null && tableNode.Count > 0) ? tableNode[0] as TableTag : null;
                if (table != null)
                {
                    string ctx = string.Empty;
                    if (table.RowCount >= 2)
                    {
                        // Pair header cells (row 0) with value cells (row 1). Only the
                        // columns present in BOTH rows are read, so a shorter value row
                        // no longer throws.
                        int columns = Math.Min(table.Rows[0].ColumnCount, table.Rows[1].ColumnCount);
                        for (int r = 0; r < columns; r++)
                        {
                            string temp = table.Rows[0].Columns[r].ToNodePlainString();
                            if (temp.Contains("控制金额"))
                            {
                                // Skip the control-price column.
                                continue;
                            }

                            ctx += temp + ":";
                            ctx += table.Rows[1].Columns[r].ToNodePlainString() + "\r\n";
                        }
                    }

                    bidUnit = ctx.GetBidRegex();
                    if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
                    {
                        bidMoney = ctx.GetMoneyRegex();
                    }
                    if (string.IsNullOrWhiteSpace(code))
                    {
                        code = ctx.GetCodeRegex();
                    }
                }
            }

            // Values above 50000 are divided by 10000 (presumably yuan to 万元 — confirm).
            // Unparsable text is left untouched, as the old try/catch did.
            decimal amount;
            if (decimal.TryParse(bidMoney, out amount) && amount > 50000)
            {
                bidMoney = (amount / 10000).ToString();
            }

            code = code.GetCodeDel();
            msgType = "深圳市光明新区";
            specType = "政府采购";
            bidType = prjName.GetInviteBidType();
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市光明新区公明街道办事处";
            }

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "光明新区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the tender announcements listed at <c>SiteUrl</c> (further pages under
/// <c>http://new.sztc.com/bidBulletin/</c>), builds one <c>InviteInfo</c> per
/// announcement and registers the attachments linked from the announcement body.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as the list holds <c>MaxCount</c> records.
/// </param>
/// <returns>
/// The collected <c>InviteInfo</c> records; empty when the first page cannot be fetched.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();

    // ---- Determine the page count from the pagination block ("x/N页") ----
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination")));
    if (tdNodes != null && tdNodes.Count > 0)
    {
        string pageTemp = tdNodes.AsString().Replace(" ", "");
        try
        {
            pageInt = int.Parse(pageTemp.GetRegexBegEnd("/", "页"));
        }
        catch
        {
            // Best effort: an unreadable page count leaves pageInt at 1.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        // Page 1 was already downloaded above.
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://new.sztc.com/bidBulletin/index_" + i + ".jhtml");
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "lb-link")), true), new TagNameFilter("li")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < nodeList.Count; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            // List items without a link cannot be followed; skip them.
            ATag aTag = nodeList[j].GetATag();
            if (aTag == null)
            {
                continue;
            }

            // The link text carries the title with the publish date embedded in it.
            prjName = aTag.LinkText.ToNodeString().Replace(" ", "");
            beginDate = prjName.GetDateRegex();

            // String.Replace throws when the search string is null or empty, so the
            // guard must test the extracted date (the old code tested prjName).
            if (!string.IsNullOrEmpty(prjName) && !string.IsNullOrEmpty(beginDate))
            {
                prjName = prjName.Replace(beginDate, "");
            }

            InfoUrl = aTag.Link;
            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "ninfo-con"), new TagNameFilter("div")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtnode.AsHtml();
            inviteCtx = HtmlTxt.ToCtxString();
            code = inviteCtx.GetCodeRegex().GetCodeDel();
            buildUnit = inviteCtx.GetBuildRegex();
            prjAddress = inviteCtx.GetAddressRegex();
            specType = "政府采购";
            msgType = "深圳市国际招标有限公司";
            inviteType = prjName.GetInviteBidType();

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // ---- Attachments linked from the announcement body ----
            Parser fileParser = new Parser(new Lexer(HtmlTxt));
            NodeList FileTag = fileParser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (FileTag != null)
            {
                for (int f = 0; f < FileTag.Count; f++)
                {
                    ATag file = FileTag[f] as ATag;
                    if (file == null || !file.IsAtagAttach())
                    {
                        continue;
                    }

                    // Relative links are resolved against the site root.
                    string link = file.Link.ToLower().Contains("http")
                        ? file.Link
                        : "http://new.sztc.com/" + file.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(file.ToPlainTextString(), info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the bid-result list under <c>http://218.20.201.20/www/zbmsg/2008/</c>,
/// builds one <c>BidInfo</c> per notice and registers its "uploadfile" attachments.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as the list holds <c>MaxCount</c> records.
/// </param>
/// <returns>
/// The collected <c>BidInfo</c> records; empty when the first page cannot be fetched.
/// </returns>
/// <remarks>
/// Bidder and amount are extracted in three passes, each one running only while a
/// value is still missing: (1) the direct children of the <c>news_show</c> span,
/// (2) the <c>div</c> elements below that span, (3) the <c>p</c> elements below it.
/// Amounts not stated in 万元 are divided by 10000.
/// </remarks>
protected override IList ExecuteCrawl(bool crawlAll)
{
    // Loop-invariant patterns, built once.
    Regex emptyDivRegex = new Regex(@"<div[^>]*>[\s]*</div>");
    Regex anchorTagRegex = new Regex(@"<a[^>]*>|</a>");
    Regex anchorElementRegex = new Regex(@"<a[^>]*>[^<]*</a>");
    Regex regdecimal = new Regex(@"\d{1,}[\.]?\d{0,}");
    Regex chineseRegex = new Regex(@"[\u4e00-\u9fa5]");
    Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
    Regex regBidUnit = new Regex(@"(中标单位|中标候选单位|中标候选人为|中标人为):[^\r\n]+\r\n");
    Regex regexsHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");

    IList list = new ArrayList();

    // ---- Determine the page count from the "尾页" (last page) link ----
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellspacing", "5")));
    if (tdNodes != null && tdNodes.Count > 0)
    {
        NodeList aNodes = new NodeList();
        tdNodes[0].CollectInto(aNodes, new TagNameFilter("a"));
        for (int i = 0; i < aNodes.Count; i++)
        {
            ATag pageTag = aNodes[i] as ATag;
            if (pageTag != null && pageTag.ToPlainTextString().Contains("尾页"))
            {
                // Keep only the digits of the link. A link without digits leaves
                // pageInt at 1 instead of throwing out of the crawl.
                int lastPage;
                if (int.TryParse(Regex.Replace(pageTag.Link ?? string.Empty, @"[^0-9]+", ""), out lastPage))
                {
                    pageInt = lastPage;
                }
                break;
            }
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        try
        {
            html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode("http://218.20.201.20/www/zbmsg/2008/xzb_list.asp?page=" + i.ToString() + "&id=13828"), Encoding.Default);
        }
        catch (Exception)
        {
            continue;
        }

        parser = new Parser(new Lexer(html));
        tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "news_list")), true)));
        if (tdNodes == null || tdNodes.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < tdNodes.Count; j++)
        {
            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, InfoUrl = string.Empty,
                   beginDate = string.Empty, bidType = string.Empty, HtmlTxt = string.Empty;
            decimal decMoney = 0;

            // Only links to the detail page are notices.
            ATag aTag = tdNodes[j] as ATag;
            if (aTag == null || aTag.Link == null || !aTag.Link.Contains("xzb_show.asp"))
            {
                continue;
            }

            // Drop everything from the first '&'. A link without '&' is used whole
            // (Remove(-1) used to throw).
            int ampersand = aTag.Link.IndexOf('&');
            InfoUrl = "http://218.20.201.20/www/zbmsg/2008/" + (ampersand >= 0 ? aTag.Link.Remove(ampersand) : aTag.Link);

            string dlHtml = string.Empty;
            try
            {
                dlHtml = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default).ToLower().Replace(" ", "");
            }
            catch (Exception)
            {
                continue;
            }

            // Second copy of the page: line breaks removed and <u> rewritten to <a>,
            // so underlined values are found by the anchor-based extraction below.
            string filterHtml = dlHtml.Replace("\n", "").Replace("\r", "").Replace("<u>", "<a>").Replace("</u>", "</a>");
            prjName = aTag.ToPlainTextString();

            // ---- Notice content (taken from the unmodified page) ----
            Parser ctxParser = new Parser(new Lexer(dlHtml));
            NodeList ctxNodes = ctxParser.ExtractAllNodesThatMatch(new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "news_show")), false));
            string ctxRaw = ctxNodes.AsString().Replace(" ", "");
            HtmlTxt = ctxNodes.AsHtml();

            Parser dlParser = new Parser(new Lexer(emptyDivRegex.Replace(filterHtml, "")));
            NodeList dlNodes = dlParser.ExtractAllNodesThatMatch(new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "news_show")), false));

            // ---- Attachment links ----
            NodeList findFiles = dlNodes.ExtractAllNodesThatMatch(new TagNameFilter("a"), true);
            NodeList fileNode = new NodeList();
            if (findFiles != null)
            {
                for (int f = 0; f < findFiles.Count; f++)
                {
                    ATag fileA = findFiles[f] as ATag;
                    if (fileA != null && fileA.Link != null && fileA.Link.Contains("uploadfile"))
                    {
                        fileNode.Add(fileA);
                    }
                }
            }

            // ---- Publish date: "[date]" in a row of the table four levels up ----
            // Walk up defensively; an empty node list or a shallower tree skips the date.
            INode ancestor = dlNodes.Count > 0 ? dlNodes[0] : null;
            for (int up = 0; up < 4 && ancestor != null; up++)
            {
                ancestor = ancestor.Parent;
            }

            TableTag tb = ancestor as TableTag;
            if (tb != null)
            {
                for (int t = 0; t < tb.RowCount; t++)
                {
                    string rowText = tb.Rows[t].ToPlainTextString();
                    if (rowText.Contains("发布日期"))
                    {
                        int open = rowText.IndexOf("[");
                        int close = rowText.IndexOf("]");

                        // close > open is exactly the condition under which the
                        // original Substring call had a non-negative length.
                        if (close > open)
                        {
                            beginDate = rowText.Substring(open + 1, close - open - 1);
                        }
                        break;
                    }
                }
            }

            // ---- Pass 1: direct children of the content span ----
            for (int k = 0; k < dlNodes.Count; k++)
            {
                if (!(dlNodes[k] is ITag))
                {
                    continue;
                }

                string plain = dlNodes[k].ToPlainTextString();

                // Bidder
                bool hasCandidate = plain.Contains("中标候选人为:");
                if (hasCandidate || plain.Contains("中标人为:"))
                {
                    bool hasNext = k + 1 < dlNodes.Count;
                    NodeList bidUnitNode = new NodeList();
                    dlNodes[k].CollectInto(bidUnitNode, new TagNameFilter("a"));
                    if (bidUnitNode.Count > 0)
                    {
                        // The value is the first anchor after the label.
                        string marker = hasCandidate ? "中标候选人为:" : "中标人为:";
                        string nodeHtml = dlNodes[k].ToHtml();
                        MatchCollection matchbidUnit = anchorElementRegex.Matches(nodeHtml.Substring(nodeHtml.IndexOf(marker)));
                        if (matchbidUnit.Count > 0)
                        {
                            bidUnit = anchorTagRegex.Replace(matchbidUnit[0].ToString(), "");
                        }

                        // Empty anchor: the value sits in the following node.
                        if (string.IsNullOrEmpty(bidUnit) && hasNext)
                        {
                            bidUnit = dlNodes[k + 1].ToPlainTextString().Trim();
                        }
                    }
                    else if (hasNext)
                    {
                        bidUnit = dlNodes[k + 1].ToPlainTextString();
                    }
                }

                // Amount
                if (plain.Contains("中标价:") || plain.Contains("投标报价:") || plain.Contains("中标价为"))
                {
                    NodeList moneyNode = new NodeList();
                    dlNodes[k].CollectInto(moneyNode, new TagNameFilter("a"));

                    // With anchors present, the amount is only read when an anchor
                    // follows one of the two labels; without anchors it is always read.
                    bool readAmount = true;
                    if (moneyNode.Count > 0)
                    {
                        string nodeHtml = dlNodes[k].ToHtml();
                        MatchCollection matchmoney = null;
                        if (plain.Contains("中标价:"))
                        {
                            matchmoney = anchorElementRegex.Matches(nodeHtml.Substring(nodeHtml.IndexOf("中标价:")));
                        }
                        if (plain.Contains("投标报价:"))
                        {
                            matchmoney = anchorElementRegex.Matches(nodeHtml.Substring(nodeHtml.IndexOf("投标报价:")));
                        }
                        readAmount = matchmoney != null && matchmoney.Count > 0;
                    }

                    if (readAmount)
                    {
                        // First number in the node text. A node without a parsable
                        // number leaves decMoney unchanged in BOTH branches (the
                        // anchor-less branch used to throw).
                        Match firstNumber = regdecimal.Match(plain);
                        decimal parsed;
                        if (firstNumber.Success && decimal.TryParse(firstNumber.Value, out parsed))
                        {
                            decMoney = plain.Contains("万元") ? parsed : parsed / 10000;
                        }
                    }
                }
            }

            string ctxStr = Regex.Replace(ctxRaw, @"<[^>]*>", string.Empty, RegexOptions.IgnoreCase);

            // A bidder name must contain Chinese characters and no multi-digit number;
            // anything else is treated as "not found".
            bidUnit = bidUnit.Replace(" ", "").Trim();
            if (!chineseRegex.IsMatch(bidUnit) || !string.IsNullOrEmpty(regBidMoney.Match(bidUnit).Value))
            {
                bidUnit = "";
            }

            // ---- Pass 2: <div> elements below the content span ----
            if (string.IsNullOrEmpty(bidUnit) || decMoney <= 0)
            {
                string txt = string.Empty;
                Parser divParser = new Parser(new Lexer(dlHtml));
                NodeList dtList = divParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "news_show")), true)));
                if (dtList != null && dtList.Count > 1)
                {
                    // Flatten the divs into "\r\n"-separated text. The statement order
                    // inside the try is significant: txt keeps whatever was appended
                    // before an exception, exactly as in the original.
                    for (int k = 0; k < dtList.Count; k++)
                    {
                        string current = dtList[k].ToPlainTextString().Trim();
                        if (current.Contains("中标候选人") || current.Contains("中标人"))
                        {
                            try
                            {
                                if (string.IsNullOrEmpty(dtList[k + 1].ToPlainTextString().Trim()))
                                {
                                    txt += current;
                                    string text = txt.Remove(txt.Length - txt.IndexOf("为:") - 2);
                                    if (string.IsNullOrEmpty(text))
                                    {
                                        txt += current;
                                        txt += dtList[k + 2].ToPlainTextString().Trim() + "\r\n";
                                    }
                                    else
                                    {
                                        txt += current + "\r\n";
                                    }
                                }
                                else
                                {
                                    txt += current;
                                    string text = txt.Remove(txt.Length - txt.IndexOf("为:") - 2);
                                    if (string.IsNullOrEmpty(text))
                                    {
                                        txt += current;
                                        txt += dtList[k + 1].ToPlainTextString().Trim() + "\r\n";
                                    }
                                    else
                                    {
                                        txt += current + "\r\n";
                                    }
                                }
                            }
                            catch
                            {
                                // Best effort: a missing neighbour node skips this entry.
                            }
                        }
                        else
                        {
                            txt += current + "\r\n";
                        }
                    }

                    if (string.IsNullOrEmpty(bidUnit))
                    {
                        bidUnit = regBidUnit.Match(txt.Replace("\r\n\r\n", "")).Value.Replace("中标候选人为", "").Replace("中标人为", "").Replace("中标单位:", "").Replace("中标候选单位:", "").Replace(":", "").Trim();
                    }

                    if (decMoney <= 0)
                    {
                        Regex regBidMoneystr = new Regex(@"(金额|价格|报价|中标价)(:|:)[^\r\n]+\r\n");
                        string monerystr = regBidMoneystr.Match(txt).Value.Replace("中标价", "").Replace("金额", "").Replace("价格", "").Replace("报价", "").Replace(":", "").Replace(":", "").Replace(",", "").Replace(",", "").Trim();
                        string number = regBidMoney.Match(monerystr).Value;
                        if (!string.IsNullOrEmpty(number))
                        {
                            if (monerystr.Contains("万元") || monerystr.Contains("万美元"))
                            {
                                decMoney = decimal.Parse(number);
                            }
                            else
                            {
                                try
                                {
                                    decMoney = decimal.Parse(number) / 10000;
                                    if (decMoney < 0.1m)
                                    {
                                        decMoney = 0;
                                    }
                                }
                                catch (Exception)
                                {
                                    decMoney = 0;
                                }
                            }
                        }
                    }
                }
            }

            // ---- Pass 3: <p> elements below the content span ----
            if (string.IsNullOrEmpty(bidUnit) || decMoney <= 0)
            {
                string txt = string.Empty;
                Parser paragraphParser = new Parser(new Lexer(dlHtml));
                NodeList dtList = paragraphParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("p"), new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "news_show")), true)));
                if (dtList != null && dtList.Count > 1)
                {
                    // NOTE(review): dtList[k + 1] is not bounds-checked here, unlike the
                    // <div> pass which wraps it in try/catch; left as-is because the
                    // intended fallback is unclear.
                    for (int k = 0; k < dtList.Count; k++)
                    {
                        if (dtList[k].ToPlainTextString().Trim().Contains("中标候选人") || dtList[k].ToPlainTextString().Trim().Contains("中标人"))
                        {
                            if (string.IsNullOrEmpty(dtList[k + 1].ToPlainTextString().Trim()))
                            {
                                // Skip the label paragraph and append the (empty) next one.
                                k++;
                                txt += dtList[k].ToPlainTextString().Trim();
                            }
                            else
                            {
                                txt += dtList[k].ToPlainTextString().Trim();
                                string text = txt.Remove(txt.Length - txt.IndexOf("为:") - 2);
                                if (string.IsNullOrEmpty(text))
                                {
                                    txt = "";
                                    txt += dtList[k].ToPlainTextString().Trim();
                                }
                                else
                                {
                                    txt = "";
                                    txt += dtList[k].ToPlainTextString().Trim() + "\r\n";
                                }
                            }
                        }
                        else
                        {
                            txt += dtList[k].ToPlainTextString().Trim() + "\r\n";
                        }

                        txt = regexsHtml.Replace(txt, "");
                    }

                    if (string.IsNullOrEmpty(bidUnit))
                    {
                        bidUnit = regBidUnit.Match(txt.Replace("\r\n\r\n", "")).Value.Replace("中标候选人为", "").Replace("中标人为", "").Replace("中标单位:", "").Replace("中标候选单位:", "").Replace(":", "").Trim();
                    }

                    // Only look for an amount when none was found yet, matching the
                    // <div> pass. The old guard tested a string that was never
                    // assigned, so an amount found earlier could be overwritten.
                    if (decMoney <= 0)
                    {
                        Regex regBidMoneystr = new Regex(@"(金额|价格|报价|中标价|中标价为)(:|:)[^\r\n]+\r\n");
                        string monerystr = regBidMoneystr.Match(txt).Value.Replace("中标价为", "").Replace("中标价", "").Replace("金额", "").Replace("价格", "").Replace("报价", "").Replace(":", "").Replace(":", "").Replace(",", "").Replace(",", "").Trim();
                        string number = regBidMoney.Match(monerystr).Value;
                        if (!string.IsNullOrEmpty(number))
                        {
                            if (monerystr.Contains("万元") || monerystr.Contains("万美元"))
                            {
                                decMoney = decimal.Parse(number);
                            }
                            else
                            {
                                try
                                {
                                    decMoney = decimal.Parse(number) / 10000;
                                    if (decMoney < 0.1m)
                                    {
                                        decMoney = 0;
                                    }
                                }
                                catch (Exception)
                                {
                                    decMoney = 0;
                                }
                            }
                        }
                    }
                }
            }

            prjName = ToolDb.GetPrjName(prjName.Replace(" ", ""));
            bidType = ToolHtml.GetInviteTypes(prjName);
            BidInfo info = ToolDb.GenBidInfo("广东省", "广州市区", "番禺区", string.Empty, string.Empty, prjName, buildUnit, beginDate, bidUnit, beginDate, string.Empty, ctxStr, string.Empty, "广州市番禺区建设局", bidType, "建设工程", string.Empty, decMoney.ToString(), InfoUrl, string.Empty, HtmlTxt);
            list.Add(info);

            // ---- Attachments ----
            if (fileNode.Count > 0)
            {
                try
                {
                    for (int f = 0; f < fileNode.Count; f++)
                    {
                        // Use the f-th link; the old loop always read index 0 and so
                        // registered the first attachment once per file.
                        ATag attachment = fileNode[f] as ATag;
                        BaseAttach attach = ToolDb.GenBaseAttach(attachment.StringText, info.Id, "http://218.20.201.20" + attachment.Link);
                        base.AttachList.Add(attach);
                    }
                }
                catch
                {
                    // Best effort: a failing attachment ends attachment collection
                    // for this record but keeps the record itself.
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paginated announcement list at <c>SiteUrl</c>, downloads the detail page of every
/// listed announcement and converts it into an invite record built by <c>ToolDb.GenInviteInfo</c>.
/// Links inside the detail body that qualify as attachments are added to <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as the result list holds <c>MaxCount</c> items;
/// when <c>true</c>, every page is processed.
/// </param>
/// <returns>
/// The list of generated invite records. The list is empty when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(this.SiteUrl + "&page=0"), Encoding.Default).Replace(" ", "");
    }
    catch (Exception)
    {
        // Without the first list page there is nothing to crawl.
        return list;
    }

    // Determine the page count: one pager <select> option per page.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("id", "page_PageList")));
    if (sNode != null && sNode.Count > 0)
    {
        SelectTag select = sNode[0] as SelectTag;
        if (select != null && select.OptionTags != null)
        {
            pageInt = select.OptionTags.Length;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        // The first page is already loaded; the site numbers pages from zero.
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + (i - 1).ToString(), Encoding.Default);
            }
            catch (Exception)
            {
                // Best effort: skip a page that cannot be downloaded.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("onmouseover", "this.style.backgroundColor=\"#EFFCD0\";")));
        if (sNode == null || sNode.Count == 0)
        {
            continue;
        }

        for (int n = 0; n < sNode.Count; n++)
        {
            string buildUnit = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty,
                   endDate = string.Empty, remark = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            // Skip rows that are not table rows or lack the three expected cells
            // (name, code, date) instead of letting one bad row abort the crawl.
            TableRow tr = sNode[n] as TableRow;
            if (tr == null || tr.ColumnCount < 3)
            {
                continue;
            }

            string prjName = tr.Columns[0].ToPlainTextString().Trim();
            string code = tr.Columns[1].ToPlainTextString().Trim();
            string beginDate = tr.Columns[2].ToPlainTextString().Trim();

            ATag aTag = tr.GetATag();
            if (aTag == null)
            {
                continue;
            }

            // Only the "id=..." part of the list link is carried over to the detail URL.
            Regex regexLink = new Regex(@"id=[^-]+");
            string InfoUrl = "http://www.sdcin.com.cn/viewzbggnew.php?" + regexLink.Match(aTag.Link).Value;

            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace(" ", "").Trim();

                // Capture the announcement body markup before line breaks are rewritten.
                Parser dtlparserHTML = new Parser(new Lexer(htmldetail));
                NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "zbtgHTML"), new TagNameFilter("td")));
                HtmlTxt = dtnodeHTML.AsHtml();

                htmldetail = htmldetail.Replace(" ", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n").Replace("<br/>", "\r\n");
            }
            catch (Exception)
            {
                // Best effort: skip an announcement whose detail page cannot be fetched or parsed.
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "zbtgHTML"), new TagNameFilter("td")));
            if (dtnode != null && dtnode.Count > 0)
            {
                inviteCtx = HtmlTxt.ToCtxString().Replace("startprint", "");

                // NOTE(review): dtnode holds nodes matched by a "td" tag filter, so this cast may
                // always yield null and the label scan below may never run - confirm against a live page.
                TableTag table = dtnode[0] as TableTag;
                if (table != null && table.RowCount > 0)
                {
                    for (int t = 0; t < table.RowCount; t++)
                    {
                        TableRow row = table.Rows[t];
                        for (int c = 0; c < row.ColumnCount; c++)
                        {
                            // The value is expected in the cell right after its label; a label in the
                            // last cell has no value cell, so skip it rather than index out of range.
                            if (c + 1 >= row.ColumnCount)
                            {
                                continue;
                            }

                            string label = row.Columns[c].ToPlainTextString().Replace(" ", "");
                            if (label.Contains("招标人"))
                            {
                                if (string.IsNullOrEmpty(buildUnit))
                                {
                                    buildUnit = row.Columns[c + 1].ToPlainTextString().Trim();
                                }
                            }
                            else if (label.Contains("公告时间"))
                            {
                                if (string.IsNullOrEmpty(beginDate))
                                {
                                    beginDate = row.Columns[c + 1].ToPlainTextString().Trim().Replace("年", "-").Replace("月", "-").Replace("日", "");
                                }
                            }
                            else if (label.Contains("工程地点"))
                            {
                                if (string.IsNullOrEmpty(prjAddress))
                                {
                                    prjAddress = row.Columns[c + 1].ToPlainTextString().Trim();
                                }
                            }
                        }
                    }
                }
            }

            // Fallbacks for a missing date: first a date following the "请于" marker anywhere in the
            // text, then any date within the last 250 characters, and finally the current time.
            if (string.IsNullOrEmpty(beginDate))
            {
                Regex regDate = new Regex(@"请于\d{4}年\d{1,2}月\d{1,2}日");
                beginDate = regDate.Match(inviteCtx.Replace("\r\n", "").Replace(" ", "").Replace("\t", "")).Value.Replace("请于", "");
            }
            if (string.IsNullOrEmpty(beginDate))
            {
                Regex regDate = new Regex(@"请于\d{4}-\d{1,2}-\d{1,2}");
                beginDate = regDate.Match(inviteCtx.Replace("\r\n", "").Replace(" ", "").Replace("\t", "")).Value.Replace("请于", "");
            }
            if (string.IsNullOrEmpty(beginDate) && inviteCtx.Length > 250)
            {
                Regex regDate = new Regex(@"\d{4}年\d{1,2}月\d{1,2}日");
                beginDate = regDate.Match(inviteCtx.Substring(inviteCtx.Length - 250, 250).Replace("\r\n", "").Replace(" ", "").Replace("\t", "")).Value;
            }
            if (string.IsNullOrEmpty(beginDate) && inviteCtx.Length > 250)
            {
                Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
                beginDate = regDate.Match(inviteCtx.Substring(inviteCtx.Length - 250, 250).Replace("\r\n", "").Replace(" ", "").Replace("\t", "")).Value;
            }
            if (string.IsNullOrEmpty(beginDate))
            {
                beginDate = DateTime.Now.ToString();
            }

            string msgType = "佛山市顺德区建设工程交易中心";
            string specType = "建设工程";
            string inviteType = ToolHtml.GetInviteTypes(prjName);
            InviteInfo info = ToolDb.GenInviteInfo("广东省", "佛山市区", "顺德区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Register attachment links found inside the announcement body.
            if (dtnode != null)
            {
                NodeList filenode = dtnode.SearchFor(typeof(ATag), true);
                if (filenode != null && filenode.Count > 0)
                {
                    for (int f = 0; f < filenode.Count; f++)
                    {
                        ATag fileTag = filenode[f] as ATag;
                        if (fileTag != null && fileTag.IsAtagAttach())
                        {
                            BaseAttach attach = ToolDb.GenBaseAttach(fileTag.ToPlainTextString().Trim(), info.Id, "http://www.sdcin.com.cn" + fileTag.Link);
                            base.AttachList.Add(attach);
                        }
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paginated announcement list at <c>SiteUrl</c>, downloads the detail page of every
/// listed announcement, extracts project address, project code and tendering unit from the
/// announcement text with regular expressions, and converts the result into an invite record
/// built by <c>ToolDb.GenInviteInfo</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as the result list holds <c>MaxCount</c> items;
/// when <c>true</c>, every page is processed.
/// </param>
/// <returns>
/// The list of generated invite records. The list is empty when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default).Replace(" ", "");
    }
    catch (Exception)
    {
        // Without the first list page there is nothing to crawl.
        return list;
    }

    // Determine the page count: first argument of the pager's createPageHTML(...) call.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "ny_21 tc")));
    if (sNode != null && sNode.Count > 0)
    {
        string pageString = sNode.AsString().Trim();
        Regex regexPage = new Regex(@"createPageHTML\([^\)]+\)");
        Match pageMatch = regexPage.Match(pageString);
        string firstArgument = pageMatch.Value.Replace("createPageHTML(", "").Replace(")", "").Split(',')[0].Trim();

        // Keep the single-page default when the pager call is missing or not numeric.
        int parsedPages;
        if (int.TryParse(firstArgument, out parsedPages))
        {
            pageInt = parsedPages;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        // The first page is already loaded; later pages are served as index_N.html with N starting at 1.
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "index_" + (i - 1).ToString() + ".html", Encoding.Default);
            }
            catch (Exception)
            {
                // Best effort: skip a page that cannot be downloaded.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "ny_22"))), new TagNameFilter("li")));
        if (sNode == null || sNode.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < sNode.Count; j++)
        {
            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            // Each list item must provide a link (title) and a second div (date);
            // skip malformed items instead of letting one of them abort the crawl.
            INode node = sNode[j];
            if (node.Children == null)
            {
                continue;
            }
            NodeList linkNodes = node.Children.SearchFor(typeof(ATag), true);
            NodeList divNodes = node.Children.SearchFor(typeof(Div), true);
            if (linkNodes == null || linkNodes.Count < 1 || divNodes == null || divNodes.Count < 2)
            {
                continue;
            }
            ATag aTag = linkNodes[0] as ATag;
            Div divTag = divNodes[1] as Div;
            if (aTag == null || divTag == null)
            {
                continue;
            }

            string prjName = aTag.ToPlainTextString().Trim();
            string beginDate = divTag.ToPlainTextString().Trim(new char[] { '[', ']', ' ' });

            // Links containing "../" are resolved against the site root, all others against the list folder.
            string relativeLink = aTag.Link.Replace("../", "").Replace("./", "");
            string InfoUrl = aTag.Link.Contains("../")
                ? "http://ztb.gaoming.gov.cn/" + relativeLink
                : "http://ztb.gaoming.gov.cn/jsgc/zbxx/" + relativeLink;

            string htmldetail = string.Empty;
            try
            {
                // Download the detail page once and derive both variants from it:
                // the raw markup for HtmlTxt and a copy with line breaks rewritten for text extraction.
                string rawDetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).Replace(" ", "");

                Parser dtlparserHTML = new Parser(new Lexer(rawDetail.Trim()));
                NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "con_10 tl"), new TagNameFilter("div")));
                HtmlTxt = dtnodeHTML.AsHtml();

                htmldetail = rawDetail.Replace("</br>", "\r\n").Replace("<br>", "\r\n").Replace("<br/>", "\r\n");
            }
            catch (Exception)
            {
                // Best effort: skip an announcement whose detail page cannot be fetched or parsed.
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "con_10 tl"), new TagNameFilter("div")));
            string inviteCtx = dtnode.AsString().Replace(" ", "");

            // Collapse every run of line feeds into a single CRLF so the line-based patterns below work.
            Regex regCtx = new Regex(@"[\n]+");
            inviteCtx = regCtx.Replace(inviteCtx, "\r\n");

            Regex regPrjAdd = new Regex(@"(工程地点|工程地址|项目地址):[^\r\n]+[\r\n]{1}");
            string prjAddress = regPrjAdd.Match(inviteCtx).Value.Replace("工程地点:", "").Replace("工程地址:", "").Replace("项目地址:", "").Replace(")", "").Replace("。", "").Trim();

            Regex regCode = new Regex(@"GMJ[0-9]+");
            string code = regCode.Match(inviteCtx).Value;

            Regex regbuildUnit = new Regex(@"(招标单位|招标人):[^\r\n]+[\r\n]{1}");
            string buildUnit = regbuildUnit.Match(inviteCtx).Value.Replace("招标单位:", "").Replace("招标人:", "").Replace("。", "").Trim();

            // The limit is measured in bytes, so cut by characters until the encoded size fits.
            // A plain Substring(0, 150) throws when a multi-byte string exceeds 150 bytes
            // while holding fewer than 150 characters.
            if (Encoding.Default.GetByteCount(buildUnit) > 150)
            {
                if (buildUnit.Length > 150)
                {
                    buildUnit = buildUnit.Substring(0, 150);
                }
                while (Encoding.Default.GetByteCount(buildUnit) > 150)
                {
                    buildUnit = buildUnit.Substring(0, buildUnit.Length - 1);
                }
            }
            if (Encoding.Default.GetByteCount(prjAddress) > 200)
            {
                // Over-long addresses are replaced by a fixed placeholder rather than truncated.
                prjAddress = "见招标信息";
            }

            string msgType = "佛山市高明区建设工程交易中心";
            string specType = "建设工程";
            // Classify by project name; classifying the still-empty bidType gave the classifier no input.
            string bidType = ToolHtml.GetInviteTypes(prjName);
            InviteInfo info = ToolDb.GenInviteInfo("广东省", "佛山市区", "高明区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, bidType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}