/// <summary>
/// Builds an <see cref="ItemPlan"/> from the values carried by an <see cref="ItemPlanRequest"/>.
/// </summary>
/// <param name="request">
/// Supplies the planned-action description, the performed-action description,
/// the evidence path and the plan id.
/// </param>
/// <returns>The item plan after <c>Deliver</c> has been called on it and on its parts.</returns>
public static ItemPlan ConvertToItemPlan(ItemPlanRequest request)
{
    var result = new ItemPlan();

    // Planned action: only a description is delivered.
    var planned = new AccionPlaneada();
    planned.Deliver(request.AccionPlaneada_Descripcion);

    // Performed action: the evidence is delivered first, then handed to the action.
    var performed = new AccionRealizada();
    var proof = new Evidencia();
    proof.Deliver(request.AccionRealizada_evidencia_Ruta);
    performed.Deliver(request.AccionRealizada_Descripcion, proof);

    result.Deliver(planned, performed, request.PlanId);
    return result;
}
/// <summary>
/// Runs the <c>CanDeliver</c> checks of every part of an item plan against the request
/// and gathers all strings they return into one list.
/// </summary>
/// <param name="request">Update request whose descriptions and evidence path are checked.</param>
/// <returns>
/// The concatenated <c>CanDeliver</c> results, in the order: planned action, evidence,
/// performed action, item plan. Presumably these are validation error messages.
/// </returns>
public static List<string> CanConvertToItemPlan(ItemPlanUpdateRequest request)
{
    var problems = new List<string>();
    var candidate = new ItemPlan();

    var planned = new AccionPlaneada();
    var plannedProblems = planned.CanDeliver(request.AccionPlaneada_Descripcion);
    problems.AddRange(plannedProblems);

    var performed = new AccionRealizada();
    var proof = new Evidencia();
    var proofProblems = proof.CanDeliver(request.AccionRealizada_evidencia_Ruta);
    problems.AddRange(proofProblems);

    // The evidence is only checked above, never delivered, before being passed on here.
    var performedProblems = performed.CanDeliver(request.AccionRealizada_Descripcion, proof);
    problems.AddRange(performedProblems);

    var itemProblems = candidate.CanDeliver(planned, performed);
    problems.AddRange(itemProblems);

    return problems;
}
/// <summary>
/// Creates a <see cref="PlanAccion"/> holding a single <see cref="ItemPlan"/> filled with
/// fixed sample texts, attached to the activity produced by <c>ActividadMother.CreateActividad()</c>.
/// </summary>
/// <returns>The newly built plan.</returns>
public static PlanAccion CreatePlanAccion()
{
    var activity = ActividadMother.CreateActividad();

    var planned = new AccionPlaneada();
    planned.Deliver("Se describe lo planeado");

    var performed = new AccionRealizada();
    var proof = new Evidencia();
    proof.Deliver("loquesea/dir");
    performed.Deliver("Se describe lo realizado", proof);

    var onlyItem = new ItemPlan();
    onlyItem.Deliver(planned, performed, 0);

    var planItems = new List<ItemPlan> { onlyItem };
    return new PlanAccion(planItems, activity);
}
/// <summary>
/// Crawls the paged project list at <c>SiteUrl</c> (table "GridView1"), downloads the detail
/// page of every listed project and turns each one into an <see cref="ItemPlan"/>.
/// </summary>
/// <param name="crawlAll">
/// When false, the crawl stops as soon as <c>MaxCount</c> items have been collected.
/// </param>
/// <returns>
/// The collected items, or <c>null</c> when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageInt = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        // Kept as-is for callers: a failed first request is reported as null.
        return null;
    }

    // The page count is read from the text between "/" and "每" in the pager label.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "GridView1_ctl21_labCountInfo")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("/", "每");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: if the pager text cannot be parsed only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Following pages are reached by posting back the "next" link together with
            // the view state and event validation taken from the page currently held in html.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__LASTFOCUS", "__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION" },
                new string[] { "GridView1$ctl21$lbNext", "", "", viewState, "44ED84FE", eventValidation });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GridView1")));
        if (listNode == null || listNode.Count <= 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 and the last row are skipped by the loop bounds.
        for (int j = 1; j < table.RowCount - 1; j++)
        {
            string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty,
                   BuildUnit = string.Empty, BuildNature = string.Empty, TotalInvest = string.Empty,
                   PlanInvest = string.Empty, IssuedPlan = string.Empty, InvestSource = string.Empty,
                   ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
                   MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty,
                   PlanBeginDate = string.Empty, PlanEndDate = string.Empty, CtxHtml = string.Empty,
                   ItemCtx = string.Empty, ItemContent = string.Empty, InfoUrl = string.Empty,
                   MsgType = string.Empty, Area = string.Empty;

            TableRow tr = table.Rows[j];
            ATag aTag = tr.Columns[0].GetATag();
            if (aTag == null)
            {
                // Without a link there is no detail URL; skip the row instead of
                // letting a NullReferenceException abort the whole crawl.
                continue;
            }

            // Prefer the "title" attribute of a <div> in the first cell, else the link text.
            Parser divPerser = new Parser(new Lexer(tr.Columns[0].ToHtml()));
            NodeList divNode = divPerser.ExtractAllNodesThatMatch(new TagNameFilter("div"));
            Div titleDiv = (divNode != null && divNode.Count > 0) ? divNode[0] as Div : null;
            ItemName = titleDiv != null ? titleDiv.GetAttribute("title") : aTag.LinkText;

            Area = tr.Columns[1].ToNodePlainString();
            PlanDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
            InfoUrl = "http://222.168.7.143:8888/er/AttachManage/ProjectPublic/" + aTag.Link.Replace("../", "");

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "682")));
            if (dtlNode == null || dtlNode.Count <= 0)
            {
                continue;
            }

            CtxHtml = dtlNode[0].ToHtml();
            ItemCtx = CtxHtml.ToCtxString();

            // Flatten every matched table after the first one into "label:value" lines.
            StringBuilder ctxBuilder = new StringBuilder();
            for (int q = 1; q < dtlNode.Count; q++)
            {
                TableTag tag = dtlNode[q] as TableTag;
                if (tag == null)
                {
                    continue;
                }

                for (int r = 0; r < tag.RowCount; r++)
                {
                    for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                    {
                        string temp = tag.Rows[r].Columns[c].ToNodePlainString();
                        if (r == 0 && c == 0)
                        {
                            continue;
                        }

                        // The first cell of row 0 is dropped, so the label/value parity in
                        // row 0 is the opposite of every other row.
                        bool evenColumn = (c + 1) % 2 == 0;
                        string separator = (r == 0) == evenColumn ? ":" : "\r\n";
                        ctxBuilder.Append(temp.GetReplace(":,:")).Append(separator);
                    }
                }
            }

            string ctx = ctxBuilder.ToString();
            ItemCode = ctx.GetCodeRegex();
            ItemContent = ctx.GetRegex("建设内容", true, 500);
            ApprovalCode = ctx.GetRegex("文号");
            ApprovalDate = ctx.GetRegex("批复时间");
            ItemAddress = ctx.GetAddressRegex();
            PlanType = "项目公开";
            MsgType = "吉林省发展和改革委员会";

            ItemPlan info = ToolDb.GenItemPlan("吉林省", "吉林省及地市", Area, ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature, TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate, PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the project list pages of www.hbfgw.gov.cn (table "mytable"), downloads every
/// project's detail page and turns each one into an <see cref="ItemPlan"/>. Images found in
/// the detail content are registered in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, the crawl stops as soon as <c>MaxCount</c> items have been collected.
/// </param>
/// <returns>
/// The collected items, or <c>null</c> when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;

    // The number of list pages is fixed; it is not read from the site.
    int pageInt = 27;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        // Kept as-is for callers: a failed first request is reported as null.
        return null;
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.hbfgw.gov.cn/hqfw/xmgg/xmkzgg/index_" + (i - 1).ToString() + ".shtml");
            }
            catch
            {
                continue;
            }
        }

        Parser parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "mytable")));
        if (listNode == null || listNode.Count <= 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped by the loop start.
        for (int j = 1; j < table.RowCount; j++)
        {
            string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty,
                   BuildUnit = string.Empty, BuildNature = string.Empty, TotalInvest = string.Empty,
                   PlanInvest = string.Empty, IssuedPlan = string.Empty, InvestSource = string.Empty,
                   ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
                   MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty,
                   PlanBeginDate = string.Empty, PlanEndDate = string.Empty, CtxHtml = string.Empty,
                   ItemCtx = string.Empty, ItemContent = string.Empty, InfoUrl = string.Empty,
                   MsgType = string.Empty;

            TableRow tr = table.Rows[j];

            // The code is the text between "('" and "')" in the first cell; "('无')" is removed first.
            ItemCode = tr.Columns[0].ToNodePlainString().GetReplace("('无')").GetReplace("('", "kdxx").GetReplace("')", "xxdk").GetRegexBegEnd("kdxx", "xxdk");

            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                // Without a link there is no detail URL; skip the row instead of
                // letting a NullReferenceException abort the whole crawl.
                continue;
            }

            ItemName = aTag.LinkText;
            ApprovalUnit = tr.Columns[2].ToNodePlainString();
            PlanDate = tr.Columns[3].ToPlainTextString().GetDateRegex();
            InfoUrl = "http://www.hbfgw.gov.cn/hqfw/xmgg/xmkzgg/" + aTag.Link.GetReplace("../,./");

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "appendixDiv")));
            if (dtlNode == null || dtlNode.Count <= 0)
            {
                continue;
            }

            // A non-empty <h1> on the detail page overrides the name taken from the list.
            parser = new Parser(new Lexer(htmldtl));
            NodeList hNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("h1"));
            if (hNode != null && hNode.Count > 0)
            {
                string temp = hNode[0].ToNodePlainString();
                ItemName = string.IsNullOrEmpty(temp) ? ItemName : temp;
            }
            ItemName = ItemName.GetReplace("省发改委批复,省发改委核准");

            CtxHtml = dtlNode.AsHtml().Replace("none", "block");
            // The plain text is taken before the image URLs below are rewritten.
            ItemCtx = CtxHtml.ToCtxString();

            // Image sources are resolved against the directory of the detail URL.
            string imgUrl = InfoUrl.Substring(0, InfoUrl.LastIndexOf("/"));
            List<string> attach = new List<string>();
            parser = new Parser(new Lexer(CtxHtml));
            NodeList imgNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
            if (imgNode != null && imgNode.Count > 0)
            {
                for (int p = 0; p < imgNode.Count; p++)
                {
                    ImageTag img = imgNode[p] as ImageTag;
                    if (img == null)
                    {
                        continue;
                    }

                    string link = imgUrl + "/" + img.ImageURL.GetReplace("../,./");
                    CtxHtml = CtxHtml.GetReplace(img.ImageURL, link);
                    attach.Add(link);
                }
            }

            PlanType = "项目核准信息";
            MsgType = "湖北省发展和改革委员会";

            ItemPlan info = ToolDb.GenItemPlan("湖北省", "湖北省及地市", "", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature, TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate, PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
            list.Add(info);

            for (int a = 0; a < attach.Count; a++)
            {
                BaseAttach entity = ToolDb.GenBaseAttach(ItemName, info.Id, attach[a]);
                base.AttachList.Add(entity);
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged project list at <c>SiteUrl</c> (table "list"), downloads every project's
/// detail page from www.ahpc.gov.cn and turns each one into an <see cref="ItemPlan"/>.
/// </summary>
/// <param name="crawlAll">
/// When false, the crawl stops as soon as <c>MaxCount</c> items have been collected.
/// </param>
/// <returns>
/// The collected items, or <c>null</c> when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageInt = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        // Kept as-is for callers: a failed first request is reported as null.
        return null;
    }

    // The page count is read from the text between "/" and "页" in the pager.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "page")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("/", "页");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: if the pager text cannot be parsed only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "?page=" + i);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "list")));
        if (listNode == null || listNode.Count <= 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 and the last two rows are skipped by the loop bounds.
        for (int j = 1; j < table.RowCount - 2; j++)
        {
            string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty,
                   BuildUnit = string.Empty, BuildNature = string.Empty, TotalInvest = string.Empty,
                   PlanInvest = string.Empty, IssuedPlan = string.Empty, InvestSource = string.Empty,
                   ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
                   MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty,
                   PlanBeginDate = string.Empty, PlanEndDate = string.Empty, CtxHtml = string.Empty,
                   ItemCtx = string.Empty, ItemContent = string.Empty, InfoUrl = string.Empty,
                   MsgType = string.Empty;

            TableRow tr = table.Rows[j];
            ATag aTag = tr.Columns[0].GetATag();
            if (aTag == null)
            {
                // Without a link there is no detail URL; skip the row instead of
                // letting a NullReferenceException abort the whole crawl.
                continue;
            }

            ItemName = tr.Columns[0].GetAttribute("title");
            ItemCode = tr.Columns[1].ToNodePlainString();
            PlanDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
            InfoUrl = "http://www.ahpc.gov.cn/zwgk/" + aTag.Link.GetReplace("../,./");

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "content")));
            if (dtlNode == null || dtlNode.Count <= 0)
            {
                continue;
            }

            CtxHtml = dtlNode.AsHtml();
            // Paragraph ends and line breaks become CRLF before the HTML is reduced to text.
            ItemCtx = CtxHtml.ToLower().GetReplace("</p>,<br/>,<br>", "\r\n").ToCtxString();

            BuildUnit = ItemCtx.GetBuildRegex();
            ItemAddress = ItemCtx.GetAddressRegex();
            ItemContent = ItemCtx.GetRegex("内容", true, 1000);
            InvestSource = ItemCtx.GetRegex("资金来源", true, 40);
            TotalInvest = ItemCtx.GetRegexBegEnd("投资", "万元").GetChina();
            MsgUnit = "社会发展处";
            ApprovalUnit = ItemCtx.GetRegex("主办处室");
            PlanType = "项目公示";
            MsgType = "安徽省发展和改革委员会";

            ItemPlan info = ToolDb.GenItemPlan("安徽省", "安徽省及地市", "", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature, TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate, PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged project list of www.scdrc.gov.cn (table "m_TAB"), downloads every
/// project's detail page and turns each one into an <see cref="ItemPlan"/>. Attachment links
/// found in the detail content are registered in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, the crawl stops as soon as <c>MaxCount</c> items have been collected.
/// </param>
/// <returns>
/// The collected items, or <c>null</c> when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageInt = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "-1");
    }
    catch
    {
        // Kept as-is for callers: a failed first request is reported as null.
        return null;
    }

    // The page count is read from the text between "/" and ")" in the counter cell.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "m_COUNT")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode[0].ToNodePlainString().GetRegexBegEnd("/", ")");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: if the counter cannot be parsed only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Later pages are addressed by a record offset in steps of 24.
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + ((i - 1) * 24));
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "m_TAB")));
        if (listNode == null || listNode.Count <= 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty,
                   BuildUnit = string.Empty, BuildNature = string.Empty, TotalInvest = string.Empty,
                   PlanInvest = string.Empty, IssuedPlan = string.Empty, InvestSource = string.Empty,
                   ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
                   MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty,
                   PlanBeginDate = string.Empty, PlanEndDate = string.Empty, CtxHtml = string.Empty,
                   ItemCtx = string.Empty, ItemContent = string.Empty, InfoUrl = string.Empty,
                   MsgType = string.Empty;

            TableRow tr = table.Rows[j];
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            ItemName = tr.Columns[1].ToNodePlainString();
            if (ItemName.Contains("..."))
            {
                // The visible text is truncated; use the link's "title" attribute instead.
                // (The original read the attribute but discarded the result.)
                string fullTitle = aTag.GetAttribute("title");
                if (!string.IsNullOrEmpty(fullTitle))
                {
                    ItemName = fullTitle;
                }
            }

            // The list shows a two-digit year, so "20" is prefixed.
            PlanDate = "20" + tr.Columns[2].ToPlainTextString().GetDateRegex("yy-MM-dd");
            InfoUrl = "http://www.scdrc.gov.cn" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            // When the page only hosts an "m_FRAME" iframe, the "_1.htm" variant of the
            // link is downloaded instead.
            parser = new Parser(new Lexer(htmldtl));
            NodeList IsNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("iframe"), new HasAttributeFilter("id", "m_FRAME")));
            if (IsNode != null && IsNode.Count > 0)
            {
                try
                {
                    InfoUrl = "http://www.scdrc.gov.cn" + aTag.Link.GetReplace(".htm", "_1.htm");
                    htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
                }
                catch
                {
                    continue;
                }
            }

            // Content is the "m_TEXT" cell, falling back to the whole <body>.
            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "m_TEXT")));
            if (dtlNode == null || dtlNode.Count < 1)
            {
                parser.Reset();
                dtlNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
            }
            if (dtlNode == null || dtlNode.Count <= 0)
            {
                continue;
            }

            CtxHtml = dtlNode.AsHtml();

            // If the content holds a table, its cells are flattened into "label:value"
            // lines; otherwise the content is reduced to plain text.
            parser = new Parser(new Lexer(CtxHtml));
            NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
            TableTag tag = (tableNode != null && tableNode.Count > 0) ? tableNode[0] as TableTag : null;
            if (tag != null)
            {
                for (int r = 0; r < tag.RowCount; r++)
                {
                    for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                    {
                        string temp = tag.Rows[r].Columns[c].ToNodePlainString();
                        if ((c + 1) % 2 == 0)
                        {
                            ItemCtx += temp.GetReplace(":,:") + "\r\n";
                        }
                        else
                        {
                            ItemCtx += temp.GetReplace(":,:") + ":";
                        }
                    }
                }
            }
            else
            {
                ItemCtx = CtxHtml.ToCtxString();
            }

            ItemContent = ItemCtx.GetRegex("内容", true, 1000);
            ApprovalUnit = ItemCtx.GetRegex("批复单位");
            ApprovalDate = ItemCtx.GetRegex("批复日期,批复时间");
            ApprovalCode = ItemCtx.GetRegex("批复文号(备案号)");
            TotalInvest = ItemCtx.GetRegex("总投资").GetMoney();
            PlanBeginDate = ItemCtx.GetRegex("开工时间");
            ItemAddress = ItemCtx.GetRegex("所属地区");
            PlanType = ItemCtx.GetRegex("项目类型");
            MsgType = "四川省发展和改革委员会";
            ItemName = ItemName.GetReplace("四川省发展和改革委员会");

            ItemPlan info = ToolDb.GenItemPlan("四川省", "四川省及地市", "", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature, TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate, PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);

            // Register every attachment link found in the content.
            parser = new Parser(new Lexer(CtxHtml));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a != null && a.IsAtagAttach())
                    {
                        string link = string.Empty;
                        if (a.Link.ToLower().Contains("http"))
                        {
                            link = a.Link;
                        }
                        else
                        {
                            link = "http://www.scdrc.gov.cn/dir1111/" + a.Link;
                        }
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Walks the paged list at <c>SiteUrl</c> (the &lt;li&gt; entries under &lt;ul class="list"&gt;),
/// downloads each entry's detail page from www.gsdrc.gov.cn and converts it into an
/// <see cref="ItemPlan"/>. Attachment links in the detail content go to <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, the crawl stops as soon as <c>MaxCount</c> items have been collected.
/// </param>
/// <returns>
/// The collected items; an empty list when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList results = new List<ItemPlan>();
    int pageCount = 1;
    string listHtml = string.Empty;

    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return results;
    }

    // Page count: text between "/" and "页" in the pager, with "(" stripped.
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "page")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        try
        {
            string pagerText = pagerNodes.AsString().GetRegexBegEnd("/", "页").GetReplace("(");
            pageCount = int.Parse(pagerText);
        }
        catch
        {
        }
    }

    for (int page = 1; page <= pageCount; page++)
    {
        if (page > 1)
        {
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "/p/" + page + ".html");
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        NodeList entries = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "list")), true), new TagNameFilter("li")));
        if (entries == null || entries.Count <= 0)
        {
            continue;
        }

        for (int index = 0; index < entries.Count; index++)
        {
            string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty,
                   BuildUnit = string.Empty, BuildNature = string.Empty, TotalInvest = string.Empty,
                   PlanInvest = string.Empty, IssuedPlan = string.Empty, InvestSource = string.Empty,
                   ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
                   MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty,
                   PlanBeginDate = string.Empty, PlanEndDate = string.Empty, CtxHtml = string.Empty,
                   ItemCtx = string.Empty, ItemContent = string.Empty, InfoUrl = string.Empty,
                   MsgType = string.Empty;

            INode entry = entries[index];
            ATag entryLink = entry.GetATag();
            if (entryLink == null)
            {
                continue;
            }

            ItemName = entryLink.GetAttribute("title").GetReplace("甘肃省发展和改革委员会");
            PlanDate = entry.ToPlainTextString().GetDateRegex();
            InfoUrl = "http://www.gsdrc.gov.cn" + entryLink.Link;

            string detailHtml = string.Empty;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "content")));
            if (contentNodes == null || contentNodes.Count <= 0)
            {
                continue;
            }

            CtxHtml = contentNodes.AsHtml();
            ItemCtx = CtxHtml.ToCtxString();
            TotalInvest = ItemCtx.GetRegexBegEnd("总投资", "万元");
            ItemCode = ItemCtx.GetRegex("项目编码");
            PlanType = "项目审批与核准";
            MsgType = "甘肃省发展和改革委员会";

            ItemPlan plan = ToolDb.GenItemPlan("甘肃省", "甘肃省及地市", "", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature, TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate, PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
            results.Add(plan);

            // Register every attachment link found in the content; relative links are
            // rooted at the site after stripping "../" and "./".
            parser = new Parser(new Lexer(CtxHtml));
            NodeList anchors = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors != null && anchors.Count > 0)
            {
                for (int n = 0; n < anchors.Count; n++)
                {
                    ATag anchor = anchors[n] as ATag;
                    if (!anchor.IsAtagAttach())
                    {
                        continue;
                    }

                    string target = anchor.Link.ToLower().Contains("http")
                        ? anchor.Link
                        : "http://www.gsdrc.gov.cn/" + anchor.Link.GetReplace("../,./");
                    BaseAttach attachment = ToolDb.GenBaseAttach(anchor.LinkText, plan.Id, target);
                    base.AttachList.Add(attachment);
                }
            }

            if (!crawlAll && results.Count >= this.MaxCount)
            {
                return results;
            }
        }
    }

    return results;
}
/// <summary>
/// Requests the project list of www.lg.gov.cn through its "dataproxy.jsp" endpoint in a single
/// call, downloads every listed detail page and turns each one into an <see cref="ItemPlan"/>.
/// </summary>
/// <param name="crawlAll">
/// When false, the crawl stops as soon as <c>MaxCount</c> items have been collected.
/// </param>
/// <returns>
/// The collected items; an empty list when the list request or its unpacking fails.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;

    try
    {
        // Above 50 wanted items a fixed window of 181 records is requested,
        // otherwise exactly MaxCount records.
        string postUrl;
        if (this.MaxCount > 50)
        {
            postUrl = "http://www.lg.gov.cn/module/jslib/jquery/jpage/dataproxy.jsp?startrecord=1&endrecord=181&perpage=181";
        }
        else
        {
            postUrl = "http://www.lg.gov.cn/module/jslib/jquery/jpage/dataproxy.jsp?startrecord=1&endrecord=" + this.MaxCount + "&perpage=" + this.MaxCount;
        }

        NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
            new string[] { "appid", "webid", "path", "col", "columnid", "sourceContentType", "unitid", "webname", "permissiontype" },
            new string[] { "1", "1", "/", "1", "6802", "1", "9393", "龙岗政府在线", "0" });
        html = this.ToolWebSite.GetHtmlByUrl(postUrl, nvc);

        // The response carries the rows as a quoted list ['...','...']. The brackets are
        // replaced by markers, the text between them is extracted, the "','" separators
        // are dropped and the result is wrapped in <table> so it can be parsed as one.
        Regex reg = new Regex("(?<=(kdxx))[.\\s\\S]*?(?=(xxdk))", RegexOptions.Multiline | RegexOptions.Singleline);
        string c = reg.Match(html.Replace("['", "kdxx").Replace("']", "xxdk")).Value.Replace("kdxx", "").Replace("xxdk", "").Replace("','", "");
        html = "<table>" + c + "</table>";
    }
    catch
    {
        // Made explicit: without a usable list there is nothing to crawl.
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList listNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
    if (listNode == null || listNode.Count <= 0)
    {
        return list;
    }

    TableTag table = listNode[0] as TableTag;
    if (table == null)
    {
        return list;
    }

    for (int j = 0; j < table.RowCount; j++)
    {
        TableRow tr = table.Rows[j];
        string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty,
               BuildUnit = string.Empty, BuildNature = string.Empty, TotalInvest = string.Empty,
               PlanInvest = string.Empty, IssuedPlan = string.Empty, InvestSource = string.Empty,
               ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
               MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty,
               PlanBeginDate = string.Empty, PlanEndDate = string.Empty, CtxHtml = string.Empty,
               ItemCtx = string.Empty, ItemContent = string.Empty, InfoUrl = string.Empty,
               MsgType = string.Empty;

        ATag aTag = tr.Columns[0].GetATag();
        if (aTag == null)
        {
            // Without a link there is no detail URL; skip the row instead of
            // letting a NullReferenceException abort the whole crawl.
            continue;
        }

        ItemName = aTag.GetAttribute("title");
        PlanDate = tr.Columns[1].ToPlainTextString().GetDateRegex();
        InfoUrl = aTag.Link;

        string htmldtl = string.Empty;
        try
        {
            htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
        }
        catch
        {
            continue;
        }

        parser = new Parser(new Lexer(htmldtl));
        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "zoom")));
        if (dtlNode == null || dtlNode.Count <= 0)
        {
            continue;
        }

        CtxHtml = dtlNode.AsHtml();
        ItemCtx = CtxHtml.ToCtxString();
        PlanType = "项目核准信息";
        MsgType = "深圳市龙岗区发改局";

        ItemPlan info = ToolDb.GenItemPlan("广东省", "深圳市区", "龙岗区", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature, TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate, PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
        list.Add(info);

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return list;
        }
    }

    return list;
}
/// <summary>
/// Crawls the project list of www.zjdpc.gov.cn (column 808), downloads every listed detail
/// page and turns each one into an <see cref="ItemPlan"/>. Attachment links found in the
/// detail content are registered in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, the crawl stops as soon as <c>MaxCount</c> items have been collected.
/// </param>
/// <returns>The collected items; possibly empty.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    int pageInt = 1;
    string html = string.Empty;

    try
    {
        NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
            new string[] { "appid", "webid", "path", "columnid", "sourceContentType", "unitid", "webname", "permissiontype" },
            new string[] { "1", "1", "/", "808", "1", "620", "浙江省发展和改革委员会", "0" });
        string post = "appid=1&webid=1&path=%2F&columnid=808&sourceContentType=1&unitid=620&webname=浙江省发展和改革委员会&permissiontype=0";

        // NOTE(review): the page is requested twice and the second response replaces the
        // first. Both calls are kept because the first may be needed for its side effects
        // - confirm before removing either.
        html = ToolHtml.GetHtmlGJByUrlPost(this.SiteUrl, post, Encoding.UTF8, "");
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc);
    }
    catch
    {
        // Best effort: whatever html holds at this point is parsed below.
    }

    // The page count is read from the text between "totalPage" and ";", with "=" stripped.
    try
    {
        string temp = html.GetRegexBegEnd("totalPage", ";").GetReplace("=");
        pageInt = int.Parse(temp);
    }
    catch
    {
        // Best effort: if no page count is found only the first response is processed.
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "col", "appid", "webid", "path", "columnid", "sourceContentType", "unitid", "webname", "permissiontype" },
                new string[] { "1", "1", "1", "/", "808", "1", "620", "浙江省发展和改革委员会", "0" });
            try
            {
                // Records are requested in windows of 45 per iteration.
                int endrecord = i * 45;
                int startrecord = 45 * i - 44;
                html = this.ToolWebSite.GetHtmlByUrl("http://www.zjdpc.gov.cn/module/jslib/jquery/jpage/dataproxy.jsp?perpage=15&endrecord=" + endrecord + "&startrecord=" + startrecord, nvc);
            }
            catch
            {
                continue;
            }
        }

        // Every <table> in the response is treated as one list entry (its first row).
        Parser parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
        if (listNode == null || listNode.Count <= 0)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty,
                   BuildUnit = string.Empty, BuildNature = string.Empty, TotalInvest = string.Empty,
                   PlanInvest = string.Empty, IssuedPlan = string.Empty, InvestSource = string.Empty,
                   ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
                   MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty,
                   PlanBeginDate = string.Empty, PlanEndDate = string.Empty, CtxHtml = string.Empty,
                   ItemCtx = string.Empty, ItemContent = string.Empty, InfoUrl = string.Empty,
                   MsgType = string.Empty;

            // Because all tables are matched, tables that do not look like a list entry
            // are skipped instead of raising null-reference or index errors.
            TableTag entryTable = listNode[j] as TableTag;
            if (entryTable == null || entryTable.RowCount < 1)
            {
                continue;
            }

            TableRow tr = entryTable.Rows[0];
            if (tr.ColumnCount < 3)
            {
                continue;
            }

            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            ItemName = aTag.GetAttribute("title").GetReplace("省发改委,\\,'");
            PlanDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
            InfoUrl = "http://www.zjdpc.gov.cn" + aTag.Link.GetReplace("\\,'");

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "zoom")));
            if (dtlNode == null || dtlNode.Count <= 0)
            {
                continue;
            }

            CtxHtml = dtlNode.AsHtml();
            ItemCtx = CtxHtml.ToCtxString().GetReplace("begin-->,“,”,end-->");
            TotalInvest = ItemCtx.GetRegexBegEnd("总投资", "万元");
            MsgType = "浙江省公共资源交易中心";
            PlanType = "项目审批信息";

            ItemPlan info = ToolDb.GenItemPlan("浙江省", "浙江省及地市", "", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature, TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate, PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
            list.Add(info);

            // Register every attachment link found in the content.
            parser = new Parser(new Lexer(CtxHtml));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a != null && a.IsAtagAttach())
                    {
                        string link = string.Empty;
                        if (a.Link.ToLower().Contains("http"))
                        {
                            link = a.Link;
                        }
                        else
                        {
                            link = "http://www.zjdpc.gov.cn/" + a.Link;
                        }
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the single project list page at <c>SiteUrl</c> (last table with id "table25"),
/// downloads every listed detail page from www.ztzl.qhfgw.gov.cn and turns each one into an
/// <see cref="ItemPlan"/>. Attachment links found in the detail content are registered in
/// <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, the crawl stops as soon as <c>MaxCount</c> items have been collected.
/// </param>
/// <returns>
/// The collected items; an empty list when the list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "table25")));
    if (listNode == null || listNode.Count <= 0)
    {
        return list;
    }

    // Several tables may carry the same id; the last match is the one that is read.
    TableTag table = listNode[listNode.Count - 1] as TableTag;
    if (table == null)
    {
        return list;
    }

    // Row 0 is skipped by the loop start.
    for (int j = 1; j < table.RowCount; j++)
    {
        TableRow tr = table.Rows[j];
        string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty,
               BuildUnit = string.Empty, BuildNature = string.Empty, TotalInvest = string.Empty,
               PlanInvest = string.Empty, IssuedPlan = string.Empty, InvestSource = string.Empty,
               ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
               MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty,
               PlanBeginDate = string.Empty, PlanEndDate = string.Empty, CtxHtml = string.Empty,
               ItemCtx = string.Empty, ItemContent = string.Empty, InfoUrl = string.Empty,
               MsgType = string.Empty;

        ATag aTag = tr.GetATag();
        if (aTag == null)
        {
            // Without a link there is no detail URL; skip the row instead of
            // letting a NullReferenceException abort the whole crawl.
            continue;
        }

        ItemName = aTag.LinkText.ToNodeString().GetReplace(" , ");
        // The code is the text between "【" and "】" in the first cell, minus its label.
        ItemCode = tr.Columns[0].ToNodePlainString().GetRegexBegEnd("【", "】").GetReplace("项目编号:");
        InfoUrl = "http://www.ztzl.qhfgw.gov.cn/xmjcb/xmxxgk/" + aTag.Link.GetReplace("../,./");

        string htmldtl = string.Empty;
        try
        {
            htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
        }
        catch
        {
            continue;
        }

        parser = new Parser(new Lexer(htmldtl));
        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "table143")));
        if (dtlNode == null || dtlNode.Count <= 0)
        {
            continue;
        }

        CtxHtml = dtlNode.AsHtml();
        // Paragraph and row ends become CRLF before the HTML is reduced to text.
        ItemCtx = CtxHtml.Replace("</p>", "\r\n").Replace("</tr>", "\r\n").ToCtxString();
        TotalInvest = ItemCtx.GetRegexBegEnd("总投资", "万元");
        PlanDate = ItemCtx.GetDateRegex();
        PlanType = "项目信息";
        MsgType = "青海省发展和改革委员会";

        ItemPlan info = ToolDb.GenItemPlan("青海省", "青海省及地市", "", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature, TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate, PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
        list.Add(info);

        // Register every attachment link found in the content.
        parser = new Parser(new Lexer(CtxHtml));
        NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
        if (aNode != null && aNode.Count > 0)
        {
            for (int k = 0; k < aNode.Count; k++)
            {
                ATag a = aNode[k] as ATag;
                if (a != null && a.IsAtagAttach())
                {
                    string link = string.Empty;
                    if (a.Link.ToLower().Contains("http"))
                    {
                        link = a.Link;
                    }
                    else
                    {
                        link = "http://www.ztzl.qhfgw.gov.cn/" + a.Link.GetReplace("../,./");
                    }

                    // Links longer than 500 bytes in the default encoding are dropped.
                    if (Encoding.Default.GetByteCount(link) > 500)
                    {
                        continue;
                    }

                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }
        }

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return list;
        }
    }

    return list;
}
/// <summary>
/// Iterates over every entry returned by <c>GetCityList()</c>, pages through the project
/// list (table <c>id="hytab"</c>) and converts each detail page (<c>div.xmgknr</c>) into an
/// <see cref="ItemPlan"/>.
/// </summary>
/// <param name="crawlAll">When false, a city is abandoned once its own item count reaches <c>MaxCount</c>.</param>
/// <returns>All collected items; an empty list when no city list is available.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    Dictionary<string, string> cities = GetCityList();
    if (cities == null || cities.Count < 1)
    {
        return list;
    }

    foreach (string key in cities.Keys)
    {
        string html = string.Empty;
        string cookiestr = string.Empty;
        int pageCount = 1;
        int cityItems = 0;
        bool cityFinished = false;

        try
        {
            // The first response is discarded; both calls share cookiestr by ref.
            // NOTE(review): presumably the first request selects the city for the session - confirm.
            this.ToolWebSite.GetHtmlByUrl(cities[key], Encoding.UTF8, ref cookiestr);
            html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr);
        }
        catch
        {
        }

        Parser parser = new Parser(new Lexer(html));
        NodeList pagerLinks = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "badoo")), true),
                new TagNameFilter("a")));
        if (pagerLinks != null && pagerLinks.Count > 0)
        {
            try
            {
                // The last pager link carries the highest page number inside jumpPage(...).
                string lastPage = pagerLinks[pagerLinks.Count - 1].GetATag().Link
                    .Replace("javascript", "")
                    .Replace("jumpPage(", "")
                    .Replace(")", "");
                pageCount = int.Parse(lastPage);
            }
            catch
            {
            }
        }

        for (int page = 1; page <= pageCount && !cityFinished; page++)
        {
            if (page > 1)
            {
                NameValueCollection form = this.ToolWebSite.GetNameValueCollection(
                    new string[] { "param.name", "param.proofCode", "page.pageNo", "page.orderBy", "page.order" },
                    new string[] { "", "", page.ToString(), "", "" });
                try
                {
                    html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, form, Encoding.UTF8, ref cookiestr);
                }
                catch
                {
                    continue;
                }
            }

            parser = new Parser(new Lexer(html));
            NodeList listTables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "hytab")));
            if (listTables == null || listTables.Count < 1)
            {
                continue;
            }

            TableTag listTable = listTables[0] as TableTag;
            for (int rowIndex = 1; rowIndex < listTable.RowCount; rowIndex++)
            {
                // Fields this source never fills stay empty.
                string itemCode = string.Empty, buildUnit = string.Empty, buildNature = string.Empty,
                       planInvest = string.Empty, issuedPlan = string.Empty, investSource = string.Empty,
                       msgUnit = string.Empty;
                string planBeginDate = string.Empty;
                string planEndDate = string.Empty;
                string itemCtx = string.Empty;

                TableRow listRow = listTable.Rows[rowIndex];
                string itemName = listRow.Columns[0].ToNodePlainString();
                string planDate = listRow.Columns[2].ToPlainTextString().GetDateRegex();
                string infoUrl = "http://www.gdtz.gov.cn" + listRow.Columns[0].GetATagHref();

                string detailHtml;
                try
                {
                    detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
                }
                catch
                {
                    continue;
                }

                parser = new Parser(new Lexer(detailHtml));
                NodeList detailNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "xmgknr")));
                if (detailNodes == null || detailNodes.Count < 1)
                {
                    continue;
                }

                string ctxHtml = detailNodes.AsHtml();
                parser = new Parser(new Lexer(ctxHtml));
                NodeList detailTables = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (detailTables == null || detailTables.Count < 1)
                {
                    // No table in the detail block: fall back to its plain text.
                    itemCtx = ctxHtml.ToCtxString();
                }
                else
                {
                    TableTag detailTable = detailTables[0] as TableTag;
                    for (int k = 1; k < detailTable.RowCount; k++)
                    {
                        TableRow detailRow = detailTable.Rows[k];
                        if (detailRow.ColumnCount < 2)
                        {
                            break;
                        }
                        try
                        {
                            // Two separate appends: the label stays even if the value cell throws.
                            itemCtx += detailRow.Columns[0].ToNodePlainString() + ":";
                            itemCtx += detailRow.Columns[1].ToNodePlainString() + "\r\n";
                        }
                        catch (Exception ex)
                        {
                            Logger.Error(infoUrl + itemName + key + page);
                            Logger.Error(ex);
                        }
                    }
                }

                string approvalCode = itemCtx.GetRegex("备案项目编号");
                string itemAddress = itemCtx.GetRegex("项目所在地");
                string totalInvest = itemCtx.GetRegex("项目总投资").Replace("万元", "").Replace("万", "");
                string itemContent = itemCtx.GetRegex("项目规模及内容");
                string approvalUnit = itemCtx.GetRegex("备案机关");
                string approvalDate = itemCtx.GetRegex("复核通过日期");

                // "begin-end" period is split only when it has exactly two parts.
                string[] period = itemCtx.GetRegex("项目起止年限").Split('-');
                if (period.Length == 2)
                {
                    planBeginDate = period[0];
                    planEndDate = period[1];
                }

                string city = key.Contains("顺德") ? "佛山市区" : key;

                ItemPlan info = ToolDb.GenItemPlan(
                    "广东省", city, "",
                    itemCode, itemName, itemAddress, buildUnit, buildNature,
                    totalInvest, planInvest, issuedPlan, investSource,
                    approvalUnit, approvalDate, approvalCode, msgUnit,
                    planDate, "项目公开", planBeginDate, planEndDate,
                    ctxHtml, itemCtx, itemContent, "广东省发展和改革委员会", infoUrl);
                list.Add(info);
                cityItems++;

                if (!crawlAll && cityItems >= this.MaxCount)
                {
                    // Quota for this city reached: leave both loops and move to the next city.
                    cityFinished = true;
                    break;
                }
            }
        }
    }

    return list;
}
/// <summary>
/// Pages through the listing table <c>id="arContent"</c> under <c>SiteUrl</c> and creates an
/// <see cref="ItemPlan"/> for every row whose detail page contains a <c>div.detail</c> block.
/// The first and the last row of each listing table are not read.
/// </summary>
/// <param name="crawlAll">When false, returns as soon as <c>list.Count</c> reaches <c>MaxCount</c>.</param>
/// <returns>The collected items; an empty list when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageCount = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "arContent")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        // The last row holds the createPageHTML(...) script call; strip everything
        // around the page count before parsing it.
        TableTag pagerTable = pagerNodes[0] as TableTag;
        string pagerText = pagerTable.Rows[pagerTable.RowCount - 1].ToNodePlainString()
            .Replace("createPageHTML", "")
            .Replace("0,", "")
            .Replace("(", "")
            .Replace(")", "")
            .Replace("index", "")
            .Replace("htm", "")
            .Replace(",", "")
            .Replace("\"", "")
            .Replace(";", "")
            .Trim();
        int parsedPages;
        if (int.TryParse(pagerText, out parsedPages))
        {
            pageCount = parsedPages;
        }
    }

    for (int page = 1; page <= pageCount; page++)
    {
        if (page > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "/index_" + (page - 1).ToString() + ".htm", Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listTables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "arContent")));
        if (listTables == null || listTables.Count < 1)
        {
            continue;
        }

        TableTag listTable = listTables[0] as TableTag;
        for (int rowIndex = 1; rowIndex < listTable.RowCount - 1; rowIndex++)
        {
            TableRow listRow = listTable.Rows[rowIndex];

            // Fields this source never fills stay empty.
            string itemAddress = string.Empty, buildUnit = string.Empty, buildNature = string.Empty,
                   totalInvest = string.Empty, planInvest = string.Empty, issuedPlan = string.Empty,
                   investSource = string.Empty, approvalUnit = string.Empty, approvalDate = string.Empty,
                   approvalCode = string.Empty, planBeginDate = string.Empty, planEndDate = string.Empty,
                   itemContent = string.Empty;

            ATag anchor = listRow.Columns[1].GetATag();
            string itemName = anchor.GetAttribute("title");
            string itemCode = listRow.Columns[2].ToNodePlainString();
            string planDate = listRow.Columns[3].ToPlainTextString().GetDateRegex();
            string infoUrl = this.SiteUrl + anchor.Link.Replace("../", "").Replace("./", "");

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList detailNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "detail")));
            if (detailNodes == null || detailNodes.Count < 1)
            {
                continue;
            }

            string ctxHtml = detailNodes.AsHtml();
            string itemCtx = ctxHtml.ToCtxString();

            // Flatten the detail table into "label:value" lines: even cells are
            // labels, odd cells are values.
            string labelled = string.Empty;
            parser = new Parser(new Lexer(ctxHtml));
            NodeList detailTables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%")));
            if (detailTables != null && detailTables.Count > 0)
            {
                TableTag detailTable = detailTables[0] as TableTag;
                for (int r = 0; r < detailTable.RowCount; r++)
                {
                    for (int c = 0; c < detailTable.Rows[r].ColumnCount; c++)
                    {
                        string cellText = detailTable.Rows[r].Columns[c].ToNodePlainString();
                        bool isLabel = (c % 2) == 0;
                        if (isLabel)
                        {
                            labelled += cellText.Replace(":", "").Replace(":", "") + ":";
                        }
                        else
                        {
                            labelled += cellText + "\r\n";
                        }
                    }
                }
            }

            string msgUnit = labelled.GetRegex("发布单位");
            if (string.IsNullOrEmpty(msgUnit))
            {
                msgUnit = "发改委";
            }

            ItemPlan info = ToolDb.GenItemPlan(
                "广东省", "深圳市区", "",
                itemCode, itemName, itemAddress, buildUnit, buildNature,
                totalInvest, planInvest, issuedPlan, investSource,
                approvalUnit, approvalDate, approvalCode, msgUnit,
                planDate, "项目审批信息", planBeginDate, planEndDate,
                ctxHtml, itemCtx, itemContent, "深圳市发展和改革委员会", infoUrl);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls three fixed category listings of www.baoan.gov.cn. For each category it pages
/// through the listing table (<c>width="97%"</c>) and builds an <see cref="ItemPlan"/> from
/// every detail page that contains a <c>width="900"</c> table.
/// </summary>
/// <remarks>
/// Categories are independent: a category whose first page cannot be fetched is skipped,
/// and the <c>MaxCount</c> quota is applied per category (the counter is reset for each
/// category), so reaching it moves on to the next category instead of ending the crawl.
/// </remarks>
/// <param name="crawlAll">When false, at most <c>MaxCount</c> items are taken from each category.</param>
/// <returns>The items collected from all categories (possibly empty).</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    List<string> urlList = new List<string>();
    urlList.Add("http://www.baoan.gov.cn/ztlm/gcjszt/gcjs/xmsp/xusp/kxxyj/");
    urlList.Add("http://www.baoan.gov.cn/ztlm/gcjszt/gcjs/xmsp/xusp/cbsj/");
    urlList.Add("http://www.baoan.gov.cn/ztlm/gcjszt/gcjs/xmsp/xusp/hjyxpj/");

    IList list = new List<ItemPlan>();
    foreach (string url in urlList)
    {
        int count = 0;
        bool quotaReached = false;
        string html = string.Empty;
        int pageInt = 1;

        try
        {
            html = this.ToolWebSite.GetHtmlByUrl(url, Encoding.UTF8);
        }
        catch
        {
            // One unreachable category must not discard the remaining ones.
            continue;
        }

        Parser parser = new Parser(new Lexer(html));
        NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "fenye")));
        if (pageNode != null && pageNode.Count > 0)
        {
            // Strip the createPageHTML(...) call down to the page count.
            string temp = pageNode.AsString().Replace("createPageHTML", "").Replace("0,", "").Replace("(", "").Replace(")", "").Replace("index", "").Replace("html", "").Replace(",", "").Replace("\"", "").Replace(";", "").Trim();
            int parsedPages;
            if (int.TryParse(temp, out parsedPages))
            {
                pageInt = parsedPages;
            }
        }

        for (int i = 1; i <= pageInt && !quotaReached; i++)
        {
            if (i > 1)
            {
                try
                {
                    html = this.ToolWebSite.GetHtmlByUrl(url + "/index_" + (i - 1).ToString() + ".html", Encoding.UTF8);
                }
                catch
                {
                    continue;
                }
            }

            parser = new Parser(new Lexer(html));
            NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "97%")));
            if (listNode == null || listNode.Count < 1)
            {
                continue;
            }

            TableTag table = listNode[0] as TableTag;
            for (int j = 1; j < table.RowCount; j++)
            {
                TableRow tr = table.Rows[j];
                string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty, BuildUnit = string.Empty,
                       BuildNature = string.Empty, TotalInvest = string.Empty, PlanInvest = string.Empty, IssuedPlan = string.Empty,
                       InvestSource = string.Empty, ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
                       MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty, PlanBeginDate = string.Empty,
                       PlanEndDate = string.Empty, CtxHtml = string.Empty, ItemCtx = string.Empty, ItemContent = string.Empty,
                       InfoUrl = string.Empty, MsgType = string.Empty;

                ItemName = tr.Columns[1].ToNodePlainString();
                ItemCode = tr.Columns[2].ToNodePlainString();
                PlanDate = tr.Columns[3].ToPlainTextString().GetDateRegex();
                InfoUrl = url + tr.Columns[1].GetATagHref().Replace("../", "").Replace("./", "");

                string htmldtl = string.Empty;
                try
                {
                    htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                }
                catch
                {
                    continue;
                }

                parser = new Parser(new Lexer(htmldtl));
                NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "900")));
                if (dtlNode == null || dtlNode.Count < 1)
                {
                    continue;
                }

                CtxHtml = dtlNode.AsHtml();
                TableTag dtlTable = dtlNode[0] as TableTag;
                StringBuilder ctx = new StringBuilder();
                for (int k = 1; k < dtlTable.RowCount; k++)
                {
                    TableRow dr = dtlTable.Rows[k];
                    if (dr.ColumnCount < 2)
                    {
                        // Rows without a label/value pair would throw on Columns[1].
                        continue;
                    }
                    ctx.Append(dr.Columns[0].ToNodePlainString()).Append(":");
                    ctx.Append(dr.Columns[1].ToNodePlainString()).Append("\r\n");
                }
                ItemCtx = ctx.ToString();

                BuildUnit = ItemCtx.GetRegex("建设单位");
                ApprovalCode = ItemCtx.GetRegex("审批文号");
                ApprovalUnit = ItemCtx.GetRegex("审批单位");
                ApprovalDate = ItemCtx.GetRegex("审批时间").Replace(".", "-");
                PlanType = "项目审批信息";
                MsgType = "深圳市宝安区发改局";

                ItemPlan info = ToolDb.GenItemPlan("广东省", "深圳市区", "宝安区", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature, TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate, PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
                count++;
                list.Add(info);

                if (!crawlAll && count >= this.MaxCount)
                {
                    // Per-category quota reached: stop this category, keep crawling the others.
                    quotaReached = true;
                    break;
                }
            }
        }
    }

    return list;
}
/// <summary>
/// Reads up to five JSON result pages from hebpi.net, and for every listed project fetches
/// the detail page and builds an <see cref="ItemPlan"/> from its <c>div.neirongleft</c> block.
/// </summary>
/// <remarks>
/// A page that is not a JSON object, a property that is not an array, or an entry that lacks
/// one of the keys <c>CUTNAME</c>, <c>DD</c>, <c>ID</c> is skipped rather than aborting the crawl.
/// </remarks>
/// <param name="crawlAll">When false, returns as soon as <c>list.Count</c> reaches <c>MaxCount</c>.</param>
/// <returns>The collected items; an empty list when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageInt = 5; // fixed number of result pages to read

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return list;
    }

    // One serializer is enough for all pages.
    JavaScriptSerializer serializer = new JavaScriptSerializer();

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://hebpi.net/portal/ShowMoreProjectAction.do?method=YsProListPage&page=" + i + "&rp=20");
            }
            catch
            {
                continue;
            }
        }

        Dictionary<string, object> smsTypeJson;
        try
        {
            smsTypeJson = serializer.DeserializeObject(html) as Dictionary<string, object>;
        }
        catch (ArgumentException)
        {
            // Not valid JSON (e.g. an error page): skip this page like a failed fetch.
            continue;
        }
        if (smsTypeJson == null)
        {
            continue;
        }

        foreach (KeyValuePair<string, object> obj in smsTypeJson)
        {
            if (obj.Key == "total" || obj.Key == "ROWNUM_")
            {
                continue;
            }

            object[] array = obj.Value as object[];
            if (array == null)
            {
                // Scalar property, not a row list.
                continue;
            }

            foreach (object arrValue in array)
            {
                Dictionary<string, object> dic = arrValue as Dictionary<string, object>;
                if (dic == null)
                {
                    continue;
                }

                object nameValue, dateValue, idValue;
                if (!dic.TryGetValue("CUTNAME", out nameValue) ||
                    !dic.TryGetValue("DD", out dateValue) ||
                    !dic.TryGetValue("ID", out idValue))
                {
                    // Incomplete entry: without these keys no item can be built.
                    continue;
                }

                string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty, BuildUnit = string.Empty,
                       BuildNature = string.Empty, TotalInvest = string.Empty, PlanInvest = string.Empty, IssuedPlan = string.Empty,
                       InvestSource = string.Empty, ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
                       MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty, PlanBeginDate = string.Empty,
                       PlanEndDate = string.Empty, CtxHtml = string.Empty, ItemCtx = string.Empty, ItemContent = string.Empty,
                       InfoUrl = string.Empty, MsgType = string.Empty;

                ItemName = Convert.ToString(nameValue);
                PlanDate = Convert.ToString(dateValue);
                InfoUrl = "http://hebpi.net:80/portal/ShowMoreProjectAction.do?method=detail&id=" + Convert.ToString(idValue);

                string htmldtl = string.Empty;
                try
                {
                    htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
                }
                catch
                {
                    continue;
                }

                Parser parser = new Parser(new Lexer(htmldtl));
                NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "neirongleft")));
                if (dtlNode == null || dtlNode.Count < 1)
                {
                    continue;
                }

                CtxHtml = dtlNode.AsHtml();
                ItemCtx = CtxHtml.ToCtxString();
                TotalInvest = ItemCtx.GetRegexBegEnd("总投资", "万元");
                PlanType = "项目审批信息";
                MsgType = "河北省发展和改革委员会";

                ItemPlan info = ToolDb.GenItemPlan("河北省", "河北省及地市", "", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature, TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate, PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
                list.Add(info);

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
        }
    }

    return list;
}
/// <summary>
/// Pages through the <c>li</c> entries under <c>dt.ny_news</c> and builds an
/// <see cref="ItemPlan"/> from each detail page that has at least two <c>width="1000"</c>
/// tables (the first two are used as content).
/// </summary>
/// <param name="crawlAll">When false, returns as soon as <c>list.Count</c> reaches <c>MaxCount</c>.</param>
/// <returns>The collected items, or <c>null</c> when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageCount = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return null;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("dt"), new HasAttributeFilter("class", "ny_my")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        try
        {
            // The page count is the text between the first "(" and the following ",".
            string pagerText = pagerNodes.AsString().Replace("(", "(").GetRegexBegEnd("(", ",");
            pageCount = int.Parse(pagerText);
        }
        catch
        {
        }
    }

    for (int page = 1; page <= pageCount; page++)
    {
        if (page > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "index_" + (page - 1) + ".html");
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList entries = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(new AndFilter(new TagNameFilter("dt"), new HasAttributeFilter("class", "ny_news")), true),
                new TagNameFilter("li")));
        if (entries == null || entries.Count < 1)
        {
            continue;
        }

        for (int entryIndex = 0; entryIndex < entries.Count; entryIndex++)
        {
            // Fields this source never fills stay empty.
            string itemAddress = string.Empty, buildUnit = string.Empty, buildNature = string.Empty,
                   totalInvest = string.Empty, planInvest = string.Empty, issuedPlan = string.Empty,
                   investSource = string.Empty, msgUnit = string.Empty, planBeginDate = string.Empty,
                   planEndDate = string.Empty, itemContent = string.Empty;

            INode entry = entries[entryIndex];
            ATag anchor = entry.GetATag();
            string itemName = anchor.LinkText;
            string planDate = entry.ToPlainTextString().GetDateRegex();

            // Absolute links are used as they are; relative ones are rooted at the section URL.
            string infoUrl = anchor.Link.ToLower().Contains("http")
                ? anchor.Link
                : "http://plan.hainan.gov.cn/fzggzl/xmsp/" + anchor.Link.GetReplace("../,./");

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList detailTables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "1000")));
            if (detailTables == null || detailTables.Count <= 1)
            {
                continue;
            }

            string ctxHtml = detailTables[0].ToHtml() + detailTables[1].ToHtml();
            string itemCtx = ctxHtml.ToCtxString();
            string approvalUnit = itemCtx.GetRegex("发文机构");
            string itemCode = itemCtx.GetRegex("索引号");
            string approvalCode = itemCtx.GetRegex("文号");
            string approvalDate = itemCtx.GetDateRegex("yyyy年MM月dd日");

            ItemPlan info = ToolDb.GenItemPlan(
                "海南省", "海南省及地市", "",
                itemCode, itemName, itemAddress, buildUnit, buildNature,
                totalInvest, planInvest, issuedPlan, investSource,
                approvalUnit, approvalDate, approvalCode, msgUnit,
                planDate, "项目审批信息", planBeginDate, planEndDate,
                ctxHtml, itemCtx, itemContent, "海南省发展和改革委员会", infoUrl);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Pages through the <c>li</c> entries under <c>ul.position2</c> of www.gdzbtb.gov.cn and
/// builds an <see cref="ItemPlan"/> from the fourth <c>width="100%"</c> table of each detail page.
/// The city is taken from the bracketed prefix of the entry title.
/// </summary>
/// <param name="crawlAll">When false, returns as soon as <c>list.Count</c> reaches <c>MaxCount</c>.</param>
/// <returns>The collected items, or <c>null</c> when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageCount = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        return null;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "cn6")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        try
        {
            // Reduce the createPageHTML(...) call to the bare page count.
            string pagerText = pagerNodes.AsString()
                .Replace("createPageHTML", "")
                .Replace(" 0,", "")
                .Replace("(", "")
                .Replace(")", "")
                .Replace("index", "")
                .Replace("htm", "")
                .Replace(",", "")
                .Replace("\"", "")
                .Replace(";", "")
                .Trim();
            pageCount = int.Parse(pagerText);
        }
        catch
        {
        }
    }

    for (int page = 1; page <= pageCount; page++)
    {
        if (page > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.gdzbtb.gov.cn/zbsxhz/index_" + (page - 1).ToString() + ".htm", Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList entries = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "position2")), true),
                new TagNameFilter("li")));
        if (entries == null || entries.Count < 1)
        {
            continue;
        }

        for (int entryIndex = 0; entryIndex < entries.Count; entryIndex++)
        {
            // Fields this source never fills stay empty.
            string itemAddress = string.Empty, buildNature = string.Empty, totalInvest = string.Empty,
                   planInvest = string.Empty, issuedPlan = string.Empty, investSource = string.Empty,
                   msgUnit = string.Empty, planBeginDate = string.Empty, planEndDate = string.Empty;

            ATag anchor = entries[entryIndex].GetATag();
            string itemName = anchor.GetAttribute("title");
            string infoUrl = "http://www.gdzbtb.gov.cn/zbsxhz/" + anchor.Link.Replace("../", "").Replace("./", "");

            // Titles look like "[city]-name": swap the brackets for markers so the
            // city can be cut out, then strip the prefix from the name.
            string titleCity = itemName.Replace("[", "kdxx").Replace("]", "xxdk").GetRegexBegEnd("kdxx", "xxdk");
            itemName = itemName.Replace("[" + titleCity + "]-", "");

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList detailTables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%")));
            if (detailTables == null || detailTables.Count <= 3)
            {
                continue;
            }

            TableTag fieldTable = detailTables[3] as TableTag;
            string ctxHtml = detailTables.AsHtml();

            // Flatten the field table into "label:value" lines.
            string itemCtx = string.Empty;
            for (int r = 0; r < fieldTable.RowCount; r++)
            {
                itemCtx += fieldTable.Rows[r].Columns[0].ToNodePlainString() + ":";
                itemCtx += fieldTable.Rows[r].Columns[1].ToNodePlainString() + "\r\n";
            }

            // Date fallbacks: labelled approval date, then any date in the text, then today.
            string planDate = itemCtx.GetRegex("批复日期").GetDateRegex();
            if (string.IsNullOrEmpty(planDate))
            {
                planDate = itemCtx.GetDateRegex();
            }
            if (string.IsNullOrEmpty(planDate))
            {
                planDate = DateTime.Now.ToString("yyyy-MM-dd");
            }

            string itemCode = itemCtx.GetRegex("项目编码").Replace(" ", "");
            string buildUnit = itemCtx.GetRegex("项目单位");
            string approvalUnit = itemCtx.GetRegex("核准部门");
            string approvalDate = planDate;
            string approvalCode = itemCtx.GetRegex("批复文号");
            string itemContent = itemCtx.GetRegex("规模及内容", true, 1000);

            string city = titleCity == "广东" ? "广州市区" : titleCity + "市区";

            ItemPlan info = ToolDb.GenItemPlan(
                "广东省", city, "",
                itemCode, itemName, itemAddress, buildUnit, buildNature,
                totalInvest, planInvest, issuedPlan, investSource,
                approvalUnit, approvalDate, approvalCode, msgUnit,
                planDate, "项目核准信息", planBeginDate, planEndDate,
                ctxHtml, itemCtx, itemContent, "广东省招标投标监管网", infoUrl);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Pages through the <c>width="730"</c> entry tables of the listing and builds an
/// <see cref="ItemPlan"/> from the first <c>cellpadding="2"</c> table of each detail page.
/// Entries whose second cell has no link are skipped.
/// </summary>
/// <param name="crawlAll">When false, returns as soon as <c>list.Count</c> reaches <c>MaxCount</c>.</param>
/// <returns>The collected items, or <c>null</c> when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageCount = 24; // used when the pager cannot be read

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        return null;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pagerLinks = parser.ExtractAllNodesThatMatch(
        new AndFilter(
            new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "z_12")), true),
            new TagNameFilter("a")));
    if (pagerLinks != null && pagerLinks.Count > 0)
    {
        try
        {
            // The last pager link points at "index_N.htm"; N is taken as the page count.
            string lastHref = pagerLinks[pagerLinks.Count - 1].GetATagHref();
            pageCount = int.Parse(lastHref.GetReplace("index_,.htm"));
        }
        catch
        {
        }
    }

    for (int page = 1; page <= pageCount; page++)
    {
        if (page > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "index_" + (page - 1) + ".htm", Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList entryTables = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("valign", "top")), true),
                new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "730"))));
        if (entryTables == null || entryTables.Count < 1)
        {
            continue;
        }

        for (int entryIndex = 0; entryIndex < entryTables.Count; entryIndex++)
        {
            TableRow firstRow = (entryTables[entryIndex] as TableTag).Rows[0];
            ATag anchor = firstRow.Columns[1].GetATag();
            if (anchor == null)
            {
                continue;
            }

            // Fields this source never fills stay empty.
            string itemAddress = string.Empty, buildUnit = string.Empty, buildNature = string.Empty,
                   planInvest = string.Empty, issuedPlan = string.Empty, investSource = string.Empty,
                   msgUnit = string.Empty, planBeginDate = string.Empty, planEndDate = string.Empty;

            string itemName = anchor.LinkText;
            string planDate = entryTables[entryIndex].ToPlainTextString().GetDateRegex();
            string infoUrl = "http://www.bjpc.gov.cn/gcjs/" + anchor.Link.GetReplace("../,./");

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList detailTables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellpadding", "2")));
            if (detailTables == null || detailTables.Count < 1)
            {
                continue;
            }

            string ctxHtml = detailTables.AsHtml();
            TableTag fieldTable = detailTables[0] as TableTag;

            // Flatten into "label:value" lines: cells at odd index are values,
            // cells at even index are labels.
            string itemCtx = string.Empty;
            for (int r = 0; r < fieldTable.RowCount; r++)
            {
                for (int c = 0; c < fieldTable.Rows[r].ColumnCount; c++)
                {
                    bool isValueCell = (c % 2) == 1;
                    if (isValueCell)
                    {
                        itemCtx += fieldTable.Rows[r].Columns[c].ToNodePlainString() + "\r\n";
                    }
                    else
                    {
                        itemCtx += fieldTable.Rows[r].Columns[c].ToNodePlainString().GetReplace(":,:") + ":";
                    }
                }
            }

            string itemContent = itemCtx.GetRegex("内容摘要", true, 500);
            // The approval number doubles as the item code.
            string approvalCode = itemCtx.GetRegex("审批文号");
            string itemCode = approvalCode;
            string approvalUnit = itemCtx.GetRegex("批复单位");
            string approvalDate = itemCtx.GetRegex("批复时间").GetDateRegex();
            string totalInvest = itemCtx.GetRegexBegEnd("总投资", "万元").GetChina();

            // A title containing ".." is replaced by the name from the detail page when one is found.
            if (itemName.Contains(".."))
            {
                string fullName = itemCtx.GetRegex("项目名称");
                if (!string.IsNullOrEmpty(fullName))
                {
                    itemName = fullName;
                }
            }

            ItemPlan info = ToolDb.GenItemPlan(
                "北京市", "北京市区", "",
                itemCode, itemName, itemAddress, buildUnit, buildNature,
                totalInvest, planInvest, issuedPlan, investSource,
                approvalUnit, approvalDate, approvalCode, msgUnit,
                planDate, "项目信息", planBeginDate, planEndDate,
                ctxHtml, itemCtx, itemContent, "北京市发展和改革委员会", infoUrl);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Pages through the table inside <c>div.list_v01</c> of www.nxdrc.gov.cn and builds an
/// <see cref="ItemPlan"/> from the <c>div#main3</c> block of each detail page.
/// </summary>
/// <param name="crawlAll">When false, returns as soon as <c>list.Count</c> reaches <c>MaxCount</c>.</param>
/// <returns>The collected items, or <c>null</c> when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageCount = 100; // used when the pager text cannot be parsed

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        return null;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list_sort")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        try
        {
            string pagerText = pagerNodes.AsString().GetRegexBegEnd("分", "页");
            pageCount = int.Parse(pagerText);
        }
        catch
        {
        }
    }

    for (int page = 1; page <= pageCount; page++)
    {
        if (page > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.nxdrc.gov.cn/zfxxgk/zfxxgkml/index" + (page - 1).ToString() + ".htm", Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listTables = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list_v01")), true),
                new TagNameFilter("table")));
        if (listTables == null || listTables.Count < 1)
        {
            continue;
        }

        TableTag listTable = listTables[0] as TableTag;
        for (int rowIndex = 1; rowIndex < listTable.RowCount; rowIndex++)
        {
            // Fields this source never fills stay empty.
            string itemAddress = string.Empty, buildUnit = string.Empty, buildNature = string.Empty,
                   planInvest = string.Empty, issuedPlan = string.Empty, investSource = string.Empty,
                   approvalUnit = string.Empty, approvalDate = string.Empty, approvalCode = string.Empty,
                   msgUnit = string.Empty, planBeginDate = string.Empty, planEndDate = string.Empty;
            string itemContent = string.Empty;

            TableRow listRow = listTable.Rows[rowIndex];
            string itemCode = listRow.Columns[0].ToNodePlainString();
            ATag anchor = listRow.Columns[1].GetATag();
            string itemName = anchor.LinkText.GetReplace("自治区发展改革委批准,自治区发展改革委批复,自治区发改委");
            string planDate = listRow.Columns[2].ToPlainTextString().GetDateRegex();
            string infoUrl = "http://www.nxdrc.gov.cn/zfxxgk/zfxxgkml/" + anchor.Link.GetReplace("../,./");

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList detailNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "main3")));
            if (detailNodes == null || detailNodes.Count < 1)
            {
                continue;
            }

            string ctxHtml = detailNodes.AsHtml();
            string itemCtx = ctxHtml.ToCtxString();
            string totalInvest = itemCtx.GetRegexBegEnd("总投资", "万元");

            // NOTE(review): the filter is built with attribute name "" and value "id";
            // the arguments may be swapped - confirm the intended selector.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList contentNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("", "id")));
            if (contentNodes != null && contentNodes.Count > 0)
            {
                itemContent = contentNodes[0].ToNodePlainString();
                // Content longer than 2000 bytes is dropped entirely.
                if (Encoding.Default.GetByteCount(itemContent) > 2000)
                {
                    itemContent = "";
                }
            }

            ItemPlan info = ToolDb.GenItemPlan(
                "宁夏回族自治区", "宁夏回族自治区及地市", "",
                itemCode, itemName, itemAddress, buildUnit, buildNature,
                totalInvest, planInvest, issuedPlan, investSource,
                approvalUnit, approvalDate, approvalCode, msgUnit,
                planDate, "项目审批信息", planBeginDate, planEndDate,
                ctxHtml, itemCtx, itemContent, "宁夏回族自治区发展和改革委员会", infoUrl);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Pages through the <c>width="700"</c> listing table of www.shdrc.gov.cn (page count read
/// from the last option of <c>select.form</c>) and builds an <see cref="ItemPlan"/> from the
/// <c>td.maintitle2</c> block of each detail page. The first row and the last two rows of the
/// listing table are not read; each data row wraps a nested table whose first row holds the cells.
/// </summary>
/// <remarks>
/// Rows without the expected nested table, four cells or a link are skipped. The two-digit
/// listing date is prefixed with "20" only when a date was actually found.
/// </remarks>
/// <param name="crawlAll">When false, returns as soon as <c>list.Count</c> reaches <c>MaxCount</c>.</param>
/// <returns>The collected items, or <c>null</c> when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageInt = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        return null;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("class", "form")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            // The value of the last <option> is taken as the page count.
            SelectTag tag = pageNode[0] as SelectTag;
            string temp = tag.OptionTags[tag.OptionTags.Length - 1].Value;
            pageInt = int.Parse(temp);
        }
        catch
        {
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&dqy=" + i, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "700")));
        if (listNode == null || listNode.Count < 1)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 1; j < table.RowCount - 2; j++)
        {
            string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty, BuildUnit = string.Empty,
                   BuildNature = string.Empty, TotalInvest = string.Empty, PlanInvest = string.Empty, IssuedPlan = string.Empty,
                   InvestSource = string.Empty, ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
                   MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty, PlanBeginDate = string.Empty,
                   PlanEndDate = string.Empty, CtxHtml = string.Empty, ItemCtx = string.Empty, ItemContent = string.Empty,
                   InfoUrl = string.Empty, MsgType = string.Empty;

            TableTag rowTable = table.Rows[j].GetTableTag();
            if (rowTable == null || rowTable.RowCount < 1)
            {
                // Not a data row (no nested table): nothing to read.
                continue;
            }

            TableRow tr = rowTable.Rows[0];
            if (tr.ColumnCount < 4)
            {
                continue;
            }

            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                // No link means no detail page to fetch.
                continue;
            }

            ItemName = aTag.GetAttribute("title");
            ItemCode = tr.Columns[0].ToNodePlainString();
            ApprovalCode = tr.Columns[2].ToNodePlainString();

            // The listing shows yy-MM-dd; only add the century when a date was found,
            // otherwise the result would be the bogus value "20".
            string shortDate = tr.Columns[3].ToPlainTextString().GetDateRegex("yy-MM-dd");
            PlanDate = string.IsNullOrEmpty(shortDate) ? string.Empty : "20" + shortDate;

            InfoUrl = "http://www.shdrc.gov.cn/" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "maintitle2")));
            if (dtlNode == null || dtlNode.Count < 1)
            {
                continue;
            }

            CtxHtml = dtlNode.AsHtml();
            ItemCtx = CtxHtml.GetReplace("<!--", "<span>").GetReplace("-->", "<span>").ToCtxString().GetReplace("begin,end,-->,<--");

            parser = new Parser(new Lexer(CtxHtml));
            NodeList tableNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "text3")));
            if (tableNode != null && tableNode.Count > 0)
            {
                TableTag tag = tableNode[0] as TableTag;
                if (tag != null)
                {
                    // Flatten into "label:value" lines: cells at odd index are values.
                    StringBuilder ctxBuilder = new StringBuilder();
                    for (int r = 0; r < tag.RowCount; r++)
                    {
                        for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                        {
                            string cell = tag.Rows[r].Columns[c].ToNodePlainString().GetReplace("begin,end").ToCtxString();
                            ctxBuilder.Append(cell);
                            ctxBuilder.Append((c + 1) % 2 == 0 ? "\r\n" : ":");
                        }
                    }
                    string ctx = ctxBuilder.ToString();

                    // Prefer the code from the detail page; keep the listing code when none is found.
                    string code = ctx.GetRegex("项目编码");
                    ItemCode = string.IsNullOrEmpty(code) ? ItemCode : code;
                    ItemContent = ctx.GetRegex("内容", true, 500);
                    ApprovalUnit = ctx.GetRegex("批复机关");
                    ApprovalDate = ctx.GetRegex("批复时间").GetDateRegex();
                }
            }

            MsgUnit = "上海市发展和改革委员会";
            PlanType = "项目审批信息";
            MsgType = "上海市发展和改革委员会";

            ItemPlan info = ToolDb.GenItemPlan("上海市", "上海市区", "", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature, TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate, PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged listing at <c>SiteUrl</c>, follows each row's link under
/// http://xxgk.sd.gov.cn/GovInfoOpen/InfoOpenDir/InfoOpenDirTwo/ and turns every detail page
/// into an <c>ItemPlan</c> through <c>ToolDb.GenItemPlan</c>.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> items are collected.</param>
/// <returns>The collected items; an empty list when the first listing page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageInt = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // The pager text between "共" and "页" is used as the page count.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "Webpager1")));
    if (pageNode != null && pageNode.Count > 0)
    {
        string pageText = pageNode.AsString().GetRegexBegEnd("共", "页");
        int parsedPages;
        // Assign only on success: TryParse resets its out argument to 0 on failure.
        if (int.TryParse(pageText, out parsedPages))
        {
            pageInt = parsedPages;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Later pages are requested by posting the pager form back with the view state
            // taken from the previously loaded page.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT", "deptKey", "key", "Webpager1_input" },
                new string[] { viewState, "Webpager1", i.ToString(), "", "", (i - 1).ToString() });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellSpacing", "1")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];

            string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty, BuildUnit = string.Empty,
                   BuildNature = string.Empty, TotalInvest = string.Empty, PlanInvest = string.Empty, IssuedPlan = string.Empty,
                   InvestSource = string.Empty, ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
                   MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty, PlanBeginDate = string.Empty,
                   PlanEndDate = string.Empty, CtxHtml = string.Empty, ItemCtx = string.Empty, ItemContent = string.Empty,
                   InfoUrl = string.Empty, MsgType = string.Empty;

            // Rows are read from index 0, so rows without the three expected cells or without
            // a link are skipped here instead of throwing and ending the whole crawl.
            if (tr.ColumnCount < 3)
            {
                continue;
            }

            ATag aTag = tr.Columns[0].GetATag();
            if (aTag == null)
            {
                continue;
            }

            MsgUnit = tr.Columns[2].ToNodePlainString();
            PlanDate = tr.Columns[1].ToPlainTextString().GetDateRegex("yyyy/MM/dd");
            ItemName = aTag.LinkText.ToNodeString().GetReplace(" , ");
            InfoUrl = "http://xxgk.sd.gov.cn/GovInfoOpen/InfoOpenDir/InfoOpenDirTwo/" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contents")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            CtxHtml = dtlNode.AsHtml();
            ItemCtx = CtxHtml.Replace("</p>", "\r\n").Replace("</tr>", "\r\n").ToCtxString();

            // First table (width 90%): flattened to "label:value" lines to read the index number.
            parser = new Parser(new Lexer(CtxHtml));
            NodeList tableNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "90%")));
            TableTag indexTable = (tableNode != null && tableNode.Count > 0) ? tableNode[0] as TableTag : null;
            if (indexTable != null)
            {
                string indexText = string.Empty;
                for (int r = 0; r < indexTable.RowCount; r++)
                {
                    for (int c = 0; c < indexTable.Rows[r].ColumnCount; c++)
                    {
                        string cell = indexTable.Rows[r].Columns[c].ToNodePlainString();
                        indexText += cell.GetReplace(":,:") + ((c + 1) % 2 == 0 ? "\r\n" : ":");
                    }
                }

                ItemCode = indexText.GetRegex("索引号");
            }

            // Second table (inside div#zoom): row 0 holds the labels, row 1 the matching values.
            parser.Reset();
            tableNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "zoom")), true), new TagNameFilter("table")));
            TableTag approvalTable = (tableNode != null && tableNode.Count > 0) ? tableNode[0] as TableTag : null;
            if (approvalTable != null && approvalTable.RowCount > 0)
            {
                string approvalText = string.Empty;
                TableRow headerRow = approvalTable.Rows[0];
                TableRow valueRow = approvalTable.RowCount > 1 ? approvalTable.Rows[1] : null;
                for (int c = 0; c < headerRow.ColumnCount; c++)
                {
                    // A missing value cell yields an empty value rather than a swallowed exception
                    // that would leave the label without its line break.
                    string value = (valueRow != null && c < valueRow.ColumnCount) ? valueRow.Columns[c].ToNodePlainString() : string.Empty;
                    approvalText += headerRow.Columns[c].ToNodePlainString() + ":" + value + "\r\n";
                }

                ApprovalCode = approvalText.GetRegex("批准文号");
                ApprovalUnit = approvalText.GetRegex("项目申请人");
                ApprovalDate = approvalText.GetRegex("批准时间");
                ItemContent = approvalText.GetRegex("主要建设内容", true, 500);
            }

            PlanType = "项目信息";
            MsgType = "山东省发展和改革委员会";

            ItemPlan info = ToolDb.GenItemPlan("山东省", "山东省及地市", "", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature,
                TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate,
                PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged list at <c>SiteUrl</c>, follows each entry's link under http://www.lndp.gov.cn/
/// and turns every detail page into an <c>ItemPlan</c> through <c>ToolDb.GenItemPlan</c>.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> items are collected.</param>
/// <returns>The collected items, or <c>null</c> when the first listing page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageInt = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        // Kept as null (not an empty list) so callers keep seeing the same failure signal.
        return null;
    }

    try
    {
        // The page count is what remains of the first <strong> element once the
        // highlighted "1/" prefix has been removed.
        string temp = html.GetRegexBegEnd("<strong>", "</strong>").GetReplace("<fontcolor=red>1</font>/");
        pageInt = int.Parse(temp);
    }
    catch
    {
        // Best effort: when the page count cannot be read only the first page is crawled.
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + i, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        Parser parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellspacing", "5")), true), new TagNameFilter("li")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            ATag aTag = listNode[j].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty, BuildUnit = string.Empty,
                   BuildNature = string.Empty, TotalInvest = string.Empty, PlanInvest = string.Empty, IssuedPlan = string.Empty,
                   InvestSource = string.Empty, ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
                   MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty, PlanBeginDate = string.Empty,
                   PlanEndDate = string.Empty, CtxHtml = string.Empty, ItemCtx = string.Empty, ItemContent = string.Empty,
                   InfoUrl = string.Empty, MsgType = string.Empty;

            ItemName = aTag.LinkText.GetReplace("省发展改革委、,省发展改革委, ");
            PlanDate = listNode[j].ToPlainTextString().GetDateRegex();
            InfoUrl = "http://www.lndp.gov.cn/" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "200")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            CtxHtml = dtlNode.AsHtml();

            // ItemCtx is built only from the first table of the detail cell, flattened to
            // "label:value" lines (odd cells as labels, even cells as values).
            parser = new Parser(new Lexer(CtxHtml));
            NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
            TableTag table = (tableNode != null && tableNode.Count > 0) ? tableNode[0] as TableTag : null;
            if (table != null)
            {
                for (int r = 0; r < table.RowCount; r++)
                {
                    for (int c = 0; c < table.Rows[r].ColumnCount; c++)
                    {
                        string cell = table.Rows[r].Columns[c].ToNodePlainString();
                        ItemCtx += cell.GetReplace(":,:") + ((c + 1) % 2 == 0 ? "\r\n" : ":");
                    }
                }
            }

            ItemContent = ItemCtx.GetRegex("内容概述", true, 500);
            TotalInvest = ItemCtx.GetRegexBegEnd("总投资", "万元").GetChina();

            // The same value serves as item code and approval code.
            ItemCode = ApprovalCode = ItemCtx.GetCodeRegex();
            if (string.IsNullOrEmpty(ItemCode))
            {
                ItemCode = ApprovalCode = ItemCtx.GetRegex("编 号");
            }

            PlanType = "项目信息";
            MsgType = "辽宁省发展和改革委员会";

            ItemPlan info = ToolDb.GenItemPlan("辽宁省", "辽宁省及地市", "", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature,
                TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate,
                PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged listing at <c>SiteUrl</c>, follows each row's link under
/// http://www.yantian.gov.cn and turns every detail page into an <c>ItemPlan</c> through
/// <c>ToolDb.GenItemPlan</c>.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> items are collected.</param>
/// <returns>The collected items; an empty list when the first listing page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageInt = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // The pager text between "共" and "页" is used as the page count.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "right")), true), new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("align", "right"))));
    if (pageNode != null && pageNode.Count > 0)
    {
        string pageText = pageNode.AsString().GetRegexBegEnd("共", "页");
        int parsedPages;
        // Assign only on success: TryParse resets its out argument to 0 on failure.
        if (int.TryParse(pageText, out parsedPages))
        {
            pageInt = parsedPages;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Page N of the listing is requested as index_(N-1).shtml.
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "/index_" + (i - 1).ToNodeString() + ".shtml", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];

            string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty, BuildUnit = string.Empty,
                   BuildNature = string.Empty, TotalInvest = string.Empty, PlanInvest = string.Empty, IssuedPlan = string.Empty,
                   InvestSource = string.Empty, ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
                   MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty, PlanBeginDate = string.Empty,
                   PlanEndDate = string.Empty, CtxHtml = string.Empty, ItemCtx = string.Empty, ItemContent = string.Empty,
                   InfoUrl = string.Empty, MsgType = string.Empty;

            // Skip rows without a second cell or without a link instead of letting a
            // single malformed row abort the whole crawl.
            if (tr.ColumnCount < 2)
            {
                continue;
            }

            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string tempName = aTag.GetAttribute("title") ?? string.Empty;
            PlanDate = tr.Columns[1].ToPlainTextString().GetDateRegex();

            // Project name: text inside the curly quotes, else the part of the title between
            // "关于下达" and "政府投资项目", else the title with those markers stripped.
            ItemName = tempName.GetRegexBegEnd("“", "”");
            if (string.IsNullOrEmpty(ItemName))
            {
                ItemName = tempName.GetRegexBegEnd("关于下达", "政府投资项目");
            }
            if (string.IsNullOrEmpty(ItemName))
            {
                ItemName = tempName.Replace("关于下达", "").Replace("”", "");
            }

            InfoUrl = "http://www.yantian.gov.cn" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "content")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            CtxHtml = dtlNode.AsHtml();
            ItemCtx = CtxHtml.ToCtxString();

            // The first paragraph of the document is used as the building unit, minus its colon.
            parser = new Parser(new Lexer(CtxHtml));
            NodeList pNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("p"));
            if (pNode != null && pNode.Count > 0)
            {
                // NOTE(review): the original called Replace(":", "") twice in a row; the second
                // call presumably targeted the full-width colon before an encoding change - confirm.
                BuildUnit = pNode[0].ToNodePlainString().Replace(":", "").Replace("：", "");
            }

            // Try the longer marker first. "项目总投资" is a prefix of "项目总投资共", so with the
            // short marker first the longer one could never be reached and the captured
            // amount would keep the stray "共".
            TotalInvest = ItemCtx.GetRegexBegEnd("项目总投资共", "万元");
            if (string.IsNullOrEmpty(TotalInvest))
            {
                TotalInvest = ItemCtx.GetRegexBegEnd("项目总投资", "万元");
            }

            IssuedPlan = ItemCtx.GetRegexBegEnd("本次下达资金", "万元");
            if (string.IsNullOrEmpty(IssuedPlan))
            {
                IssuedPlan = ItemCtx.GetRegexBegEnd("下达资金", "万元");
            }
            if (string.IsNullOrEmpty(IssuedPlan))
            {
                IssuedPlan = ItemCtx.GetRegexBegEnd("本次下达前期费用", "万元");
            }

            PlanType = "项目审批信息";
            MsgType = "深圳市盐田区";

            ItemPlan info = ToolDb.GenItemPlan("广东省", "深圳市区", "盐田区", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature,
                TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate,
                PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged list at <c>SiteUrl</c>, follows each entry's link under http://www.szft.gov.cn/
/// and turns every detail page into an <c>ItemPlan</c> through <c>ToolDb.GenItemPlan</c>.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> items are collected.</param>
/// <returns>The collected items, or <c>null</c> when the first listing page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageInt = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        // Kept as null (not an empty list) so callers keep seeing the same failure signal.
        return null;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pager")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            // The pager holds a createPageHTML(...) call; everything except the page count
            // is stripped from its text before parsing.
            string temp = pageNode.AsString().Replace("createPageHTML", "").Replace(" 0,", "").Replace("(", "").Replace(")", "").Replace("index", "").Replace("html", "").Replace(",", "").Replace("\"", "").Replace(";", "").Trim();
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: when the page count cannot be read only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Page N of the listing is requested as index_(N-1).html.
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "index_" + (i - 1).ToString() + ".html");
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "listul")), true), new TagNameFilter("li")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty, BuildUnit = string.Empty,
                   BuildNature = string.Empty, TotalInvest = string.Empty, PlanInvest = string.Empty, IssuedPlan = string.Empty,
                   InvestSource = string.Empty, ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
                   MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty, PlanBeginDate = string.Empty,
                   PlanEndDate = string.Empty, CtxHtml = string.Empty, ItemCtx = string.Empty, ItemContent = string.Empty,
                   InfoUrl = string.Empty, MsgType = string.Empty;

            // A list entry without a link cannot be followed; skip it rather than crash.
            ATag aTag = listNode[j].GetATag();
            if (aTag == null)
            {
                continue;
            }

            ItemName = aTag.GetAttribute("title");
            InfoUrl = "http://www.szft.gov.cn/" + aTag.Link.Replace("../", "").Replace("./", "");

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contenter")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            CtxHtml = dtlNode.AsHtml();
            ItemCtx = CtxHtml.ToCtxString();

            // Publication date: labelled value first, then any date in the text, then today.
            PlanDate = ItemCtx.GetRegex("信息发布日期").GetDateRegex();
            if (string.IsNullOrEmpty(PlanDate))
            {
                PlanDate = ItemCtx.GetDateRegex();
            }
            if (string.IsNullOrEmpty(PlanDate))
            {
                PlanDate = DateTime.Now.ToString("yyyy-MM-dd");
            }

            ItemCode = ItemCtx.GetRegex("项目编码").Replace(" ", "");

            parser = new Parser(new Lexer(CtxHtml));
            NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
            TableTag table = (tableNode != null && tableNode.Count > 0) ? tableNode[0] as TableTag : null;
            if (table != null && table.RowCount >= 2)
            {
                // Row 0 holds the labels and row 1 the values; pair them up as "label:value" lines.
                TableRow headerRow = table.Rows[0];
                TableRow valueRow = table.Rows[1];
                string ctx = string.Empty;
                for (int k = 0; k < headerRow.ColumnCount; k++)
                {
                    // The value row may have fewer cells than the header row (merged cells);
                    // use an empty value instead of indexing past its end.
                    string value = k < valueRow.ColumnCount ? valueRow.Columns[k].ToNodePlainString() : string.Empty;
                    ctx += headerRow.Columns[k].ToNodePlainString() + ":";
                    ctx += value + "\r\n";
                }

                if (string.IsNullOrEmpty(ItemCode))
                {
                    ItemCode = ctx.GetRegex("序号");
                }

                BuildUnit = ctx.GetRegex("建设单位");
                BuildNature = ctx.GetRegex("建设性质");
                TotalInvest = ctx.GetRegex("总投资(万元),总投资");
                PlanInvest = ctx.GetRegex("本期计划(万元),本期计划");
                IssuedPlan = ctx.GetRegex("累计已下达计划(万元),累计已下达计划");
                InvestSource = ctx.GetRegex("资金来源");
                ItemContent = ctx.GetRegex("主要建设内容,建设内容");

                // Fall back to the last cell of the value row when no labelled content was found.
                if (string.IsNullOrEmpty(ItemContent) && valueRow.ColumnCount > 0)
                {
                    ItemContent = valueRow.Columns[valueRow.ColumnCount - 1].ToNodePlainString();
                }
            }

            PlanType = "项目审批信息";
            MsgType = "深圳市福田区发展和改革局";

            ItemPlan info = ToolDb.GenItemPlan("广东省", "深圳市区", "福田区", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature,
                TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate,
                PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Walks the paged "gvlist" grid at <c>SiteUrl</c>, downloads the detail page behind every grid row
/// (under http://other.sznews.com/pinshanproject/) and records each one as an <c>ItemPlan</c>.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, collection ends once <c>MaxCount</c> items have been gathered.</param>
/// <returns>The gathered items; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    string viewState = string.Empty;
    string eventValidation = string.Empty;
    int pageInt = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // The pager is the table nested inside the grid; the last cell of its first row
    // is read as the number of pages.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(
            new HasParentFilter(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "gvlist")), true),
            new TagNameFilter("table")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            TableTag pager = pageNode[0] as TableTag;
            TableRow pagerRow = pager.Rows[0];
            string lastCell = pager.Rows[0].Columns[pagerRow.ColumnCount - 1].ToNodePlainString();
            pageInt = int.Parse(lastCell);
        }
        catch
        {
        }
    }

    for (int page = 1; page <= pageInt; page++)
    {
        if (page > 1)
        {
            // Following pages are fetched by posting the ASP.NET form state back to the grid.
            viewState = this.ToolWebSite.GetAspNetViewState(html);
            eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);

            string[] fieldNames = new string[]
            {
                "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION",
                "ddlDepartment", "txtstartDate", "txtendDate", "keyword",
            };
            string[] fieldValues = new string[]
            {
                "gvlist", "Page$" + page, viewState, "D5D8AE3D", eventValidation,
                "发展财政局", string.Empty, string.Empty, string.Empty
            };
            NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, form, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList gridNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "gvlist")));
        if (gridNodes == null || gridNodes.Count <= 0)
        {
            continue;
        }

        TableTag grid = gridNodes[0] as TableTag;

        // From the second page on, the final grid row is left out as well as the first.
        int rowLimit = page > 1 ? grid.RowCount - 1 : grid.RowCount;

        for (int row = 1; row < rowLimit; row++)
        {
            // Fields that this site does not provide stay empty.
            string itemAddress = string.Empty;
            string buildUnit = string.Empty;
            string buildNature = string.Empty;
            string planInvest = string.Empty;
            string investSource = string.Empty;
            string approvalUnit = string.Empty;
            string approvalDate = string.Empty;
            string approvalCode = string.Empty;
            string msgUnit = string.Empty;
            string planBeginDate = string.Empty;
            string planEndDate = string.Empty;
            string itemContent = string.Empty;

            TableRow cells = grid.Rows[row];
            string itemCode = cells.Columns[0].ToNodePlainString();
            string itemName = cells.Columns[1].ToNodePlainString();
            string planDate = cells.Columns[2].ToNodePlainString();
            string infoUrl = "http://other.sznews.com/pinshanproject/" + cells.Columns[1].GetATagHref();

            string detailHtml = string.Empty;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList detailNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "stylecontent")));
            if (detailNodes == null || detailNodes.Count <= 0)
            {
                continue;
            }

            string ctxHtml = detailNodes.AsHtml();
            string itemCtx = ctxHtml.ToCtxString();
            string totalInvest = itemCtx.GetRegexBegEnd("总投资", "万元").GetChina();
            string issuedPlan = itemCtx.GetRegexBegEnd("本次下达", "万元").GetChina();
            string planType = "项目核准信息";
            string msgType = "深圳市坪山区发改局";

            ItemPlan plan = ToolDb.GenItemPlan("广东省", "深圳市区", "坪山区", itemCode, itemName, itemAddress, buildUnit, buildNature,
                totalInvest, planInvest, issuedPlan, investSource, approvalUnit, approvalDate, approvalCode, msgUnit, planDate,
                planType, planBeginDate, planEndDate, ctxHtml, itemCtx, itemContent, msgType, infoUrl);
            list.Add(plan);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged listing at <c>SiteUrl</c>, follows each row's link under http://www.jxdpc.gov.cn/
/// and turns every detail page into an <c>ItemPlan</c> through <c>ToolDb.GenItemPlan</c>.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> items are collected.</param>
/// <returns>The collected items, or <c>null</c> when the first listing page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageInt = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        // Kept as null (not an empty list) so callers keep seeing the same failure signal.
        return null;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "tdfont")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            // Drop the function definition, then read the first argument of the
            // createPageHTML(...) call as the page count.
            string temp = pageNode.AsString().Replace("function createPageHTML", "").GetRegexBegEnd("createPageHTML", ",").Replace("(", "");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: when the page count cannot be read only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Page N of the listing is requested as index_(N-1).htm.
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "index_" + (i - 1) + ".htm", Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellspacing", "3")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];

            // Columns[1] (link) and Columns[2] (date) are both read below, so a row needs
            // at least three cells; the previous "< 2" check let two-cell rows through.
            if (tr.ColumnCount < 3)
            {
                continue;
            }

            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty, BuildUnit = string.Empty,
                   BuildNature = string.Empty, TotalInvest = string.Empty, PlanInvest = string.Empty, IssuedPlan = string.Empty,
                   InvestSource = string.Empty, ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
                   MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty, PlanBeginDate = string.Empty,
                   PlanEndDate = string.Empty, CtxHtml = string.Empty, ItemCtx = string.Empty, ItemContent = string.Empty,
                   InfoUrl = string.Empty, MsgType = string.Empty;

            ItemName = System.Web.HttpUtility.UrlDecode(aTag.GetAttribute("title"));
            PlanDate = tr.Columns[2].ToPlainTextString().GetDateRegex();

            // Links into other site sections are relative to the site root ("../"); all
            // others are relative to the zdxm/zdxmxx/ section ("./"). The match is ordinal
            // and case-insensitive so it does not depend on the current culture.
            string link = aTag.Link ?? string.Empty;
            bool isRootRelative =
                link.IndexOf("departmentsite", System.StringComparison.OrdinalIgnoreCase) >= 0 ||
                link.IndexOf("fagaiing", System.StringComparison.OrdinalIgnoreCase) >= 0;
            if (isRootRelative)
            {
                InfoUrl = "http://www.jxdpc.gov.cn/" + link.Replace("../", "");
            }
            else
            {
                InfoUrl = "http://www.jxdpc.gov.cn/" + "zdxm/zdxmxx/" + link.Replace("./", "");
            }

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "artibody")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            CtxHtml = dtlNode.AsHtml();
            ItemCtx = CtxHtml.ToCtxString();
            PlanType = "建设工程";
            MsgType = "江西省发展和改革委员会";

            ItemPlan info = ToolDb.GenItemPlan("江西省", "江西省及地市", "", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature,
                TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate,
                PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged listing at <c>SiteUrl</c>, follows each row's link under http://www.szlh.gov.cn
/// and turns every detail page into an <c>ItemPlan</c> through <c>ToolDb.GenItemPlan</c>.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> items are collected.</param>
/// <returns>The collected items; an empty list when the first listing page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageInt = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // The pager text between "/共" and "页" is used as the page count.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pageNavigate")));
    if (pageNode != null && pageNode.Count > 0)
    {
        string pageText = pageNode.AsString().GetRegexBegEnd("/共", "页");
        int parsedPages;
        // Assign only on success: TryParse resets its out argument to 0 on failure.
        if (int.TryParse(pageText, out parsedPages))
        {
            pageInt = parsedPages;
        }
    }

    // Title prefixes tried in order; the project name is the text between the prefix and "项目".
    string[] titlePrefixes = new string[] { "关于下达", "关于调整下达", "关于预安排", "关于追加", "关于" };

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&pageNo=" + i, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "listTable")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped.
        for (int j = 1; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];

            string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty, BuildUnit = string.Empty,
                   BuildNature = string.Empty, TotalInvest = string.Empty, PlanInvest = string.Empty, IssuedPlan = string.Empty,
                   InvestSource = string.Empty, ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty,
                   MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty, PlanBeginDate = string.Empty,
                   PlanEndDate = string.Empty, CtxHtml = string.Empty, ItemCtx = string.Empty, ItemContent = string.Empty,
                   InfoUrl = string.Empty, MsgType = string.Empty;

            // Cells 1 to 4 are read below; skip short rows and rows without a link instead
            // of letting a single malformed row abort the whole crawl.
            if (tr.ColumnCount < 5)
            {
                continue;
            }

            ATag aTag = tr.Columns[2].GetATag();
            if (aTag == null)
            {
                continue;
            }

            MsgUnit = tr.Columns[3].ToNodePlainString();
            ApprovalCode = tr.Columns[1].ToNodePlainString();
            PlanDate = tr.Columns[4].ToPlainTextString().GetDateRegex("yyyy/MM/dd");

            string tempName = aTag.LinkText.Replace("\n", "").Replace("\t", "").Replace("\r", "").Trim();
            foreach (string prefix in titlePrefixes)
            {
                if (!string.IsNullOrEmpty(ItemName))
                {
                    break;
                }
                ItemName = tempName.GetRegexBegEnd(prefix, "项目");
            }
            if (string.IsNullOrEmpty(ItemName))
            {
                ItemName = tempName;
            }

            InfoUrl = "http://www.szlh.gov.cn" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "main2")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            CtxHtml = dtlNode.AsHtml();
            ItemCtx = CtxHtml.Replace("</p>", "\r\n").Replace("</tr>", "\r\n").ToCtxString();

            TotalInvest = ExtractTotalInvest(ItemCtx);
            PlanInvest = ItemCtx.GetRegexBegEnd("计划安排建设资金", "万元");

            if (string.IsNullOrEmpty(TotalInvest) || string.IsNullOrEmpty(PlanInvest))
            {
                // Second chance: search the raw text of the justified block. Only the values
                // that are still missing are filled in, so a figure already found above is
                // never overwritten, and every pattern is tried against this block's text.
                parser = new Parser(new Lexer(CtxHtml));
                NodeList inNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("align", "justify")));
                if (inNode != null && inNode.Count > 0)
                {
                    string ctx = inNode.AsString();
                    if (string.IsNullOrEmpty(TotalInvest))
                    {
                        TotalInvest = ExtractTotalInvest(ctx);
                    }
                    if (string.IsNullOrEmpty(PlanInvest))
                    {
                        PlanInvest = ctx.GetRegexBegEnd("计划安排建设资金", "万元");
                    }
                }
            }

            // The last cell of the last row of the index table is used as the item content.
            parser = new Parser(new Lexer(CtxHtml));
            NodeList contentNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table_suoyin")));
            TableTag dtlTable = (contentNode != null && contentNode.Count > 0) ? contentNode[0] as TableTag : null;
            if (dtlTable != null && dtlTable.RowCount > 0)
            {
                TableRow lastRow = dtlTable.Rows[dtlTable.RowCount - 1];
                if (lastRow.ColumnCount > 0)
                {
                    ItemContent = lastRow.Columns[lastRow.ColumnCount - 1].ToNodePlainString();
                }
            }

            PlanType = "项目审批信息";
            MsgType = "深圳市罗湖区发改局";

            ItemPlan info = ToolDb.GenItemPlan("广东省", "深圳市区", "罗湖区", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature,
                TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate,
                PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}

/// <summary>
/// Looks for the total-investment figure in <paramref name="text"/>, trying each known phrasing
/// in turn and returning the first non-empty capture before "万元".
/// </summary>
private static string ExtractTotalInvest(string text)
{
    string total = text.GetRegexBegEnd("总投资控制在", "万元");
    if (string.IsNullOrEmpty(total))
    {
        total = text.GetRegexBegEnd("计划项目总投资", "万元").Replace("为", "");
    }
    if (string.IsNullOrEmpty(total))
    {
        total = text.GetRegexBegEnd("计划共安排建设资金", "万元");
    }
    if (string.IsNullOrEmpty(total))
    {
        total = text.GetRegexBegEnd("计划共安排投资", "万元");
    }
    return total;
}
/// <summary>
/// Walks the paged list at <c>SiteUrl</c> (later pages come from
/// http://www.hljdpc.gov.cn/xzgs/index_N.jhtml), downloads the detail page behind every entry
/// and records each one as an <c>ItemPlan</c>.
/// </summary>
/// <param name="crawlAll">When <c>false</c>, collection ends once <c>MaxCount</c> items have been gathered.</param>
/// <returns>The gathered items. A failed first download is not fatal: parsing simply continues on empty HTML.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageInt = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
    }

    // The pager text between "/" and "页" is read as the number of pages.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagebar")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string pageText = pageNode.AsString().GetRegexBegEnd("/", "页");
            pageInt = int.Parse(pageText);
        }
        catch
        {
        }
    }

    for (int page = 1; page <= pageInt; page++)
    {
        if (page > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.hljdpc.gov.cn/xzgs/index_" + page + ".jhtml");
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList entries = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "right-list")), true),
                new TagNameFilter("dl")));
        if (entries == null || entries.Count <= 0)
        {
            continue;
        }

        for (int n = 0; n < entries.Count; n++)
        {
            ATag aTag = entries[n].GetATag();
            if (aTag == null)
            {
                continue;
            }

            // Fields that this site does not provide stay empty.
            string itemAddress = string.Empty;
            string buildUnit = string.Empty;
            string buildNature = string.Empty;
            string planInvest = string.Empty;
            string issuedPlan = string.Empty;
            string investSource = string.Empty;
            string approvalUnit = string.Empty;
            string planBeginDate = string.Empty;
            string planEndDate = string.Empty;
            string itemCtx = string.Empty;

            string itemName = aTag.GetAttribute("title");
            string planDate = entries[n].ToPlainTextString().GetDateRegex();
            string infoUrl = "http://www.hljdpc.gov.cn" + aTag.Link;

            string detailHtml = string.Empty;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            // The detail table is located by trying four selectors in turn; the parser is
            // rewound before every retry.
            parser = new Parser(new Lexer(detailHtml));
            NodeList detailNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "MsoNormalTable")));
            if (detailNodes == null || detailNodes.Count < 1)
            {
                parser.Reset();
                detailNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "700")));
            }
            if (detailNodes == null || detailNodes.Count < 1)
            {
                parser.Reset();
                detailNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "590")));
            }
            if (detailNodes == null || detailNodes.Count < 1)
            {
                parser.Reset();
                detailNodes = parser.ExtractAllNodesThatMatch(
                    new AndFilter(
                        new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "content")), true),
                        new TagNameFilter("table")));
            }
            if (detailNodes == null || detailNodes.Count <= 0)
            {
                continue;
            }

            string ctxHtml = detailNodes.AsHtml();

            // Flatten the table into "label:value" lines: a cell at an odd index closes the line.
            TableTag detailTable = detailNodes[0] as TableTag;
            for (int r = 0; r < detailTable.RowCount; r++)
            {
                for (int c = 0; c < detailTable.Rows[r].ColumnCount; c++)
                {
                    string cellText = detailTable.Rows[r].Columns[c].ToNodePlainString();
                    string separator = (c % 2 == 1) ? "\r\n" : ":";
                    itemCtx += cellText.GetReplace(":,:") + separator;
                }
            }

            // The file number serves as both approval code and item code.
            string approvalCode = itemCtx.GetRegex("文件号");
            string itemCode = approvalCode;
            string itemContent = itemCtx.GetRegex("主要内容", true, 500);
            string approvalDate = itemCtx.GetRegex("生成日期").GetDateRegex("yyyy年MM月dd日");
            string msgUnit = itemCtx.GetRegex("发布处室");
            string totalInvest = itemCtx.GetRegexBegEnd("总投资", "万元").GetChina();
            string planType = "行政公示 ";
            string msgType = "黑龙江省发展和改革委员会";

            ItemPlan plan = ToolDb.GenItemPlan("黑龙江省", "黑龙江省及地市", "", itemCode, itemName, itemAddress, buildUnit, buildNature,
                totalInvest, planInvest, issuedPlan, investSource, approvalUnit, approvalDate, approvalCode, msgUnit, planDate,
                planType, planBeginDate, planEndDate, ctxHtml, itemCtx, itemContent, msgType, infoUrl);
            list.Add(plan);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paginated approval list under http://www.hnfgw.gov.cn/xxgk/spgbsp/ and
/// builds one <c>ItemPlan</c> for every list entry whose detail page contains the
/// <c>div#PrintTxt</c> content container.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as the number of collected items reaches
/// <c>this.MaxCount</c>; when <c>true</c>, every page is visited.
/// </param>
/// <returns>
/// The collected items, or <c>null</c> when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        // Null (not an empty list) is the established result for "site unreachable";
        // kept unchanged so existing callers behave the same.
        return null;
    }

    // Determine the number of pages from the last link inside div#pagenav.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "pagenav")), true), new TagNameFilter("a")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string lastHref = pageNode[pageNode.Count - 1].GetATagHref();
            string pageText = lastHref.GetReplace(new string[] { "/xxgk/spgbsp/Index_", ".html" });
            int parsedPages;
            if (int.TryParse(pageText, out parsedPages))
            {
                pageInt = parsedPages;
            }
        }
        catch
        {
            // Best effort: if the pager link cannot be read, only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.hnfgw.gov.cn/xxgk/spgbsp/Index_" + i + ".html");
            }
            catch
            {
                // A page that fails to download is skipped; the remaining pages are still tried.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "list14bline")), true), new TagNameFilter("li")));
        if (listNode == null || listNode.Count < 1)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            ATag aTag = listNode[j].GetATag();
            if (aTag == null)
            {
                continue;
            }

            // Every positional argument of ToolDb.GenItemPlan gets a named local; the ones
            // this site does not provide simply stay empty.
            string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty, BuildUnit = string.Empty, BuildNature = string.Empty, TotalInvest = string.Empty, PlanInvest = string.Empty, IssuedPlan = string.Empty, InvestSource = string.Empty, ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty, MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty, PlanBeginDate = string.Empty, PlanEndDate = string.Empty, CtxHtml = string.Empty, ItemCtx = string.Empty, ItemContent = string.Empty, InfoUrl = string.Empty, MsgType = string.Empty;

            // NOTE(review): GetAttribute presumably returns null when the anchor carries no
            // "title" attribute; calling Replace on that would throw and abort the whole
            // crawl, so fall back to the link text (and finally to an empty string).
            string title = aTag.GetAttribute("title") ?? aTag.LinkText ?? string.Empty;
            ItemName = title.Replace("我委批复", "");
            PlanDate = listNode[j].ToPlainTextString().GetDateRegex();
            InfoUrl = "http://www.hnfgw.gov.cn" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                // Detail page unavailable: skip this entry only.
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "PrintTxt")));
            if (dtlNode == null || dtlNode.Count < 1)
            {
                continue;
            }

            CtxHtml = dtlNode.AsHtml();
            ItemCtx = CtxHtml.ToCtxString();
            TotalInvest = ItemCtx.GetRegexBegEnd("总投资", "万元").GetChina();
            PlanType = "项目审批信息";
            MsgType = "湖南省发展和改革委员会";

            ItemPlan info = ToolDb.GenItemPlan("湖南省", "湖南省及地市", "", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature, TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate, PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paginated project list of www.ynbidding.net and builds one
/// <c>ItemPlan</c> for every table row whose detail page contains the
/// <c>td#Content</c> cell.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as the number of collected items reaches
/// <c>this.MaxCount</c>; when <c>true</c>, every page is visited.
/// </param>
/// <returns>
/// The collected items; an empty list when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return list;
    }

    // Page count is read from the text between "共" and "页" inside div.pagebox.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagebox")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string pageText = pageNode.AsString().GetRegexBegEnd("共", "页");
            int parsedPages;
            if (int.TryParse(pageText, out parsedPages))
            {
                pageInt = parsedPages;
            }
        }
        catch
        {
            // Best effort: if the pager text cannot be read, only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.ynbidding.net/classlist.aspx?no-cache=0.04312339340010729&id=685790278180&id=://www.ynbidding.net/list&page=" + i + "&_=");
            }
            catch
            {
                // A page that fails to download is skipped; the remaining pages are still tried.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%")));
        if (listNode == null || listNode.Count < 1)
        {
            continue;
        }

        // The "as" cast yields null if the first match is not a TableTag; skip the page
        // instead of failing on table.RowCount.
        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped, as in the original loop (presumably the header row — confirm).
        for (int j = 1; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            ATag aTag = tr.GetATag();
            if (aTag == null)
            {
                continue;
            }

            // Every positional argument of ToolDb.GenItemPlan gets a named local; the ones
            // this site does not provide simply stay empty.
            string ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty, BuildUnit = string.Empty, BuildNature = string.Empty, TotalInvest = string.Empty, PlanInvest = string.Empty, IssuedPlan = string.Empty, InvestSource = string.Empty, ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty, MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty, PlanBeginDate = string.Empty, PlanEndDate = string.Empty, CtxHtml = string.Empty, ItemCtx = string.Empty, ItemContent = string.Empty, InfoUrl = string.Empty, MsgType = string.Empty;

            ItemName = aTag.LinkText;

            // Guard the index: a row without any column would otherwise throw and
            // abort the whole crawl. In that case the date simply stays empty.
            if (tr.ColumnCount > 0)
            {
                PlanDate = tr.Columns[0].ToNodePlainString().GetDateRegex("yyyy/MM/dd");
            }

            InfoUrl = "http://www.ynbidding.net" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                // Detail page unavailable: skip this entry only.
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "Content")));
            if (dtlNode == null || dtlNode.Count < 1)
            {
                continue;
            }

            CtxHtml = dtlNode.AsHtml();
            ItemCtx = CtxHtml.ToCtxString();
            PlanType = "项目信息";
            MsgType = "云南省发展和改革委员会";

            ItemPlan info = ToolDb.GenItemPlan("云南省", "云南省及地市", "", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature, TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate, PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}