/// <summary>
/// Crawls the completion-record list at <c>SiteUrl</c>, follows every row to its
/// detail page and converts each detail page into a <c>ProjectFinish</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records were collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    string viewState = string.Empty;
    string eventValidation = string.Empty;
    int page = 1;

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8, ref cookiestr);
    }
    catch (Exception)
    {
        // Without the first list page there is nothing to crawl.
        return list;
    }

    Parser parser = new Parser(new Lexer(htl));
    NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_Main_paging_LblPageCount")));
    if (pageList != null && pageList.Count > 0)
    {
        // Keep the default of one page when the label does not hold a number.
        int parsedPage;
        if (int.TryParse(pageList.AsString(), out parsedPage))
        {
            page = parsedPage;
        }
    }

    // The patterns do not depend on the row, so build them once instead of per record.
    // NOTE(review): the two colon alternatives (and the paired colon Replace calls below)
    // are identical as written; one of them was presumably a full-width colon - confirm.
    Regex regPrjAddr = new Regex(@"(建设地点|工程地址)(:|:)[^\r\n]+\r\n");
    Regex regpDesignUnit = new Regex(@"设计单位(:|:)[^\r\n]+\r\n");
    Regex regpSuperUnit = new Regex(@"监理单位(:|:)[^\r\n]+\r\n");
    Regex regpConstUnit = new Regex(@"施工单位(:|:)[^\r\n]+\r\n");
    Regex regpBuiUnit = new Regex(@"建设单位(:|:)[^\r\n]+\r\n");
    Regex regpRecordUnit = new Regex(@"备案机关(:|:)[^\r\n]+\r\n");
    Regex regpLicUnit = new Regex(@"发证机关(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            if (i < 3)
            {
                // The hidden form fields are read once, from the HTML of the first page
                // (i == 2 is the first postback); later pages reuse the same values.
                viewState = this.ToolWebSite.GetAspNetViewState(htl);
                eventValidation = this.ToolWebSite.GetAspNetEventValidation(htl);
            }

            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[]
                {
                    "ctl00$ScriptManager1", "__EVENTTARGET", "__EVENTARGUMENT", "ctl00$Main$ddl_type",
                    "ctl00$Main$txt_Title", "ctl00$Main$paging$txtPageIndex", "__VIEWSTATE", "__EVENTVALIDATION",
                    "__VIEWSTATEENCRYPTED", "__ASYNCPOST", "ctl00$Main$paging$btnForward.x", "ctl00$Main$paging$btnForward.y"
                },
                new string[]
                {
                    "ctl00$UpdatePanel1|ctl00$Main$paging$btnForward", string.Empty, string.Empty, "1",
                    string.Empty, i.ToString(), viewState, eventValidation,
                    string.Empty, "true", "10", "11"
                });
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
            }
            catch (Exception)
            {
                // A page that cannot be fetched is skipped; the remaining pages are still tried.
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_Main_Gv_FinishCheck")));
        if (dtList == null || dtList.Count == 0)
        {
            continue;
        }

        TableTag table = dtList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped (the loop starts at 1).
        for (int j = 1; j < table.RowCount; j++)
        {
            string pUrl = string.Empty, pInfoSource = string.Empty, pEndDate = string.Empty, pConstUnit = string.Empty,
                   pSuperUnit = string.Empty, pDesignUnit = string.Empty, prjEndDesc = string.Empty, pPrjAddress = string.Empty,
                   pBuildUnit = string.Empty, pPrjCode = string.Empty, PrjName = string.Empty, pRecordUnit = string.Empty,
                   pCreatetime = string.Empty, pLicUnit = string.Empty;

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 5)
            {
                // Cells 1, 3 and 4 are read below; a shorter row cannot be a data row.
                continue;
            }

            PrjName = tr.Columns[3].ToPlainTextString().Trim();
            pPrjCode = tr.Columns[1].ToPlainTextString().Trim();
            pEndDate = tr.Columns[4].ToPlainTextString().Trim();

            // The detail link is taken from the row's ondblclick handler.
            string dblClick = tr.GetAttribute("ondblclick");
            if (string.IsNullOrEmpty(dblClick))
            {
                continue;
            }

            // NOTE(review): Replace("&", "&") is a no-op as written; it presumably decoded an
            // HTML-escaped ampersand originally - confirm against the page markup.
            pUrl = "http://www.szbajs.gov.cn/SiteManage/" + dblClick.Replace("&", "&").Replace(")", "kdxx").GetRegexBegEnd("'", "kdxx").Replace("'", "");

            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(pUrl), Encoding.UTF8);
            }
            catch (Exception)
            {
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "data_con")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            string ctx = dtnode.AsString().Replace(" ", "");
            pInfoSource = ctx;
            pPrjAddress = regPrjAddr.Match(ctx).Value.Replace("工程地址", "").Replace("建设地点", "").Replace(":", "").Replace(":", "").Trim();
            pDesignUnit = regpDesignUnit.Match(ctx).Value.Replace("设计单位", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();
            pSuperUnit = regpSuperUnit.Match(ctx).Value.Replace("监理单位", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();
            pConstUnit = regpConstUnit.Match(ctx).Value.Replace("施工单位", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();
            pBuildUnit = regpBuiUnit.Match(ctx).Value.Replace("建设单位", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();
            pRecordUnit = regpRecordUnit.Match(ctx).Value.Replace("备案机关", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();
            pLicUnit = regpLicUnit.Match(ctx).Value.Replace("发证机关", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();
            if (string.IsNullOrEmpty(pLicUnit))
            {
                // Fall back to a fixed issuing authority when the page does not name one.
                pLicUnit = "深圳市宝安区建设局";
            }

            // prjEndDesc and pCreatetime are never populated by this crawler and stay empty.
            ProjectFinish info = ToolDb.GenProjectFinish("广东省", pUrl, "深圳市宝安区", pInfoSource, pEndDate, pConstUnit, pSuperUnit, pDesignUnit, prjEndDesc, pPrjAddress, pBuildUnit, pPrjCode, PrjName, pRecordUnit, pCreatetime, "深圳市宝安区建设局", pLicUnit);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Walks every page of the list at <c>SiteUrl</c> and turns each row of the
/// <c>tblPrjConstBid</c> table (except the first and the last row) into a
/// <c>ProjectFinish</c> record. No detail pages are requested.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the method returns as soon as <c>MaxCount</c> records were collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    // The counter starts at 1 and is reset to 1, so the pause happens after every 199 records.
    const int ThrottleThreshold = 200;
    const int ThrottlePauseMilliseconds = 600 * 1000;

    IList results = new List<ProjectFinish>();
    string html = string.Empty;
    int totalPages = 1;
    int throttleCounter = 1;

    try
    {
        html = ToolHtml.GetHtmlByUrlEncode(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return results;
    }

    Parser parser = new Parser(new Lexer(html));
    AndFilter pagerFilter = new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "pageLinkTd"));
    NodeList pagerCells = parser.ExtractAllNodesThatMatch(pagerFilter);
    if (pagerCells != null && pagerCells.Count > 0)
    {
        try
        {
            string pagerText = pagerCells.AsString().ToNodeString();
            string pageCountText = pagerText.GetRegexBegEnd("总页数", "页").Replace(":", "");
            totalPages = int.Parse(pageCountText);
        }
        catch (Exception)
        {
            // Any failure here leaves the page count at its default of 1.
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        // Page 1 was already downloaded above; later pages are requested with a form post.
        if (pageNo >= 2)
        {
            string[] fieldNames = new string[] { "page", "qymc", "ann_serial", "pro_name" };
            string[] fieldValues = new string[] { pageNo.ToString(), "", "", "" };
            NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, form, Encoding.UTF8);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        AndFilter tableFilter = new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "tblPrjConstBid"));
        NodeList tables = parser.ExtractAllNodesThatMatch(tableFilter);
        if (tables == null || tables.Count == 0)
        {
            continue;
        }

        TableTag grid = (TableTag)tables[0];
        int lastRowIndex = grid.RowCount - 1;

        // The first and the last row of the table are not read.
        for (int rowIndex = 1; rowIndex < lastRowIndex; rowIndex++)
        {
            TableRow row = grid.Rows[rowIndex];
            string projectCode = row.Columns[0].ToNodePlainString();
            string projectName = row.Columns[1].ToNodePlainString();
            string buildUnit = row.Columns[2].ToNodePlainString();
            string endDate = row.Columns[3].ToNodePlainString().GetDateRegex();

            // Nothing else is available on the list page, so the remaining fields stay empty
            // and the record unit is always the fixed authority name.
            string none = string.Empty;
            string recordUnit = "深圳市住房和建设局";

            ProjectFinish record = ToolDb.GenProjectFinish(
                "广东省", none, "深圳市区", none, endDate,
                none, none, none, none, none,
                buildUnit, projectCode, projectName, recordUnit, none,
                "深圳市住房和建设局", none);
            results.Add(record);

            if (!crawlAll && results.Count >= this.MaxCount)
            {
                return results;
            }

            throttleCounter++;
            if (throttleCounter >= ThrottleThreshold)
            {
                throttleCounter = 1;
                Thread.Sleep(ThrottlePauseMilliseconds);
            }
        }
    }

    return results;
}
/// <summary>
/// Crawls the paged list at <c>SiteUrl</c>, follows the link in the third cell of every
/// row to its detail page and converts each detail page into a <c>ProjectFinish</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records were collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ProjectFinish>();
    string htl = string.Empty;
    int page = 1;

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        // Without the first list page there is nothing to crawl.
        return list;
    }

    Parser parser = new Parser(new Lexer(htl));
    NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagefooter")));
    if (pageList != null && pageList.Count > 0)
    {
        try
        {
            string temp = pageList.AsString().GetRegexBegEnd(",共有", "页");
            page = int.Parse(temp);
        }
        catch
        {
            // Best effort: an unreadable pager leaves the page count at 1.
        }
    }

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&web_cur_page=" + i, Encoding.UTF8);
            }
            catch
            {
                // A page that cannot be fetched is skipped; the remaining pages are still tried.
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table1")));
        if (dtList == null || dtList.Count == 0)
        {
            continue;
        }

        // The last matching table is the one that is read.
        TableTag table = dtList[dtList.Count - 1] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped (the loop starts at 1).
        for (int j = 1; j < table.RowCount; j++)
        {
            string pUrl = string.Empty, pInfoSource = string.Empty, pEndDate = string.Empty, pConstUnit = string.Empty,
                   pSuperUnit = string.Empty, pDesignUnit = string.Empty, prjEndDesc = string.Empty, pPrjAddress = string.Empty,
                   pBuildUnit = string.Empty, pPrjCode = string.Empty, PrjName = string.Empty, pRecordUnit = string.Empty,
                   pCreatetime = string.Empty, pLicUnit = string.Empty;

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 5)
            {
                // Cells 1 to 4 are read below; a shorter row cannot be a data row.
                continue;
            }

            PrjName = tr.Columns[3].ToPlainTextString().Trim();
            pPrjCode = tr.Columns[2].ToPlainTextString().Trim();
            pEndDate = tr.Columns[1].ToPlainTextString().Trim();
            pBuildUnit = tr.Columns[4].ToPlainTextString().Trim();

            // Skip rows without an anchor instead of failing the whole crawl on them.
            NodeList anchors = tr.Columns[2].SearchFor(typeof(ATag), true);
            ATag aTag = (anchors != null && anchors.Count > 0) ? anchors[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }

            // The link is a GoDetail('...') call; strip the wrapper to get the relative path.
            pUrl = "http://www.cb.gov.cn" + aTag.Link.Replace("GoDetail('", "").Replace("');", "");

            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(pUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table1")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            pInfoSource = dtnode.AsHtml().ToCtxString();
            pPrjAddress = pInfoSource.GetRegex("建设地点,工程地址");
            pDesignUnit = pInfoSource.GetRegex("设计单位");
            pSuperUnit = pInfoSource.GetRegex("监理单位");
            pConstUnit = pInfoSource.GetRegex("施工单位");
            pRecordUnit = pInfoSource.GetRegex("备案机关");
            pLicUnit = pInfoSource.GetRegex("发证机关");
            if (string.IsNullOrEmpty(pLicUnit))
            {
                // Fall back to a fixed issuing authority when the page does not name one.
                pLicUnit = "深圳市龙岗区住房和建设局";
            }

            // prjEndDesc and pCreatetime are never populated by this crawler and stay empty.
            ProjectFinish info = ToolDb.GenProjectFinish("广东省", pUrl, "深圳市龙岗区", pInfoSource, pEndDate, pConstUnit, pSuperUnit, pDesignUnit, prjEndDesc, pPrjAddress, pBuildUnit, pPrjCode, PrjName, pRecordUnit, pCreatetime, "深圳市龙岗区住房和建设局", pLicUnit);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the JSON list served at <c>SiteUrl</c> (and its follow-up pages), requests the
/// detail page of every entry in the <c>DataList</c> array and converts each detail page
/// into a <c>ProjectFinish</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records were collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    // Matches the pageSize=20 value hard-coded in the paging URL below.
    const int PageSize = 20;

    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        // Without the first page there is nothing to crawl.
        return list;
    }

    if (!string.IsNullOrEmpty(htl))
    {
        int index = htl.IndexOf("RowCount", StringComparison.Ordinal);
        if (index >= 0)
        {
            try
            {
                // Everything after "RowCount" is treated as the number once the JSON
                // punctuation is stripped, i.e. the property is expected to come last.
                string pageStr = htl.Substring(index, htl.Length - index).Replace("RowCount", "").Replace("}", "").Replace(":", "").Replace("\"", "");
                decimal rowCount = decimal.Parse(pageStr);

                // Round up. The previous Convert.ToInt32(b) + 1 rounded to nearest first and
                // therefore requested one page too many whenever the fraction exceeded .5.
                page = (int)Math.Ceiling(rowCount / PageSize);
            }
            catch (Exception)
            {
                // Best effort: an unreadable row count leaves the page count at 1.
            }
        }
    }

    JavaScriptSerializer serializer = new JavaScriptSerializer();

    // The patterns do not depend on the record, so build them once instead of per record.
    // NOTE(review): the two colon alternatives (and the paired colon Replace calls below)
    // are identical as written; one of them was presumably a full-width colon - confirm.
    Regex regPrjAddr = new Regex(@"(建设地点|工程地址)(:|:)[^\r\n]+\r\n");
    Regex regpDesignUnit = new Regex(@"设计单位(:|:)[^\r\n]+\r\n");
    Regex regpSuperUnit = new Regex(@"监理单位(:|:)[^\r\n]+\r\n");
    Regex regpConstUnit = new Regex(@"施工单位(:|:)[^\r\n]+\r\n");
    Regex regpRecordUnit = new Regex(@"备案机关(:|:)[^\r\n]+\r\n");
    Regex regpLicUnit = new Regex(@"发证机关(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl("http://www.szjs.gov.cn/build/build.ashx?_=1352585430077&menu=%E9%A1%B9%E7%9B%AE%E4%BF%A1%E6%81%AF&type=%E7%AB%A3%E5%B7%A5%E9%AA%8C%E6%94%B6%E5%A4%87%E6%A1%88&pageSize=20&pageIndex=" + i.ToString(), Encoding.UTF8);
            }
            catch (Exception)
            {
                // A page that cannot be fetched is skipped; the remaining pages are still tried.
                continue;
            }
        }

        // A page that is not a JSON object is skipped like a failed download, so the
        // records collected so far are not lost to an exception.
        Dictionary<string, object> smsTypeJson;
        try
        {
            smsTypeJson = serializer.DeserializeObject(htl) as Dictionary<string, object>;
        }
        catch (Exception)
        {
            continue;
        }

        object dataListValue;
        if (smsTypeJson == null || !smsTypeJson.TryGetValue("DataList", out dataListValue))
        {
            continue;
        }

        object[] array = dataListValue as object[];
        if (array == null)
        {
            continue;
        }

        foreach (object obj2 in array)
        {
            Dictionary<string, object> dicSmsType = obj2 as Dictionary<string, object>;
            if (dicSmsType == null)
            {
                continue;
            }

            string pUrl = string.Empty, pInfoSource = string.Empty, pEndDate = string.Empty, pConstUnit = string.Empty,
                   pSuperUnit = string.Empty, pDesignUnit = string.Empty, prjEndDesc = string.Empty, pPrjAddress = string.Empty,
                   pBuildUnit = string.Empty, pPrjCode = string.Empty, PrjName = string.Empty, pRecordUnit = string.Empty,
                   pCreatetime = string.Empty, pLicUnit = string.Empty;

            try
            {
                pPrjCode = Convert.ToString(dicSmsType["LogSerial"]);
                PrjName = Convert.ToString(dicSmsType["PrjLogName"]);
                pBuildUnit = Convert.ToString(dicSmsType["ConstName"]);
                pEndDate = Convert.ToString(dicSmsType["LogDate"]);
                pUrl = "http://www.szjs.gov.cn/build/jgys_detail.aspx?id=" + pPrjCode;

                string htmldetail = string.Empty;
                try
                {
                    htmldetail = this.ToolWebSite.GetHtmlByUrl(pUrl, Encoding.UTF8, ref cookiestr).Trim();
                }
                catch (Exception)
                {
                    continue;
                }

                Parser parser = new Parser(new Lexer(htmldetail));
                NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "js-table mar-l-4")));
                if (dtList == null || dtList.Count == 0)
                {
                    continue;
                }

                TableTag table = dtList[0] as TableTag;
                if (table == null)
                {
                    continue;
                }

                // Flatten the table to one text line per row, so the regexes below can
                // match "label: value" up to the line break.
                StringBuilder infoBuilder = new StringBuilder();
                for (int j = 0; j < table.RowCount; j++)
                {
                    TableRow dr = table.Rows[j];
                    for (int k = 0; k < dr.ColumnCount; k++)
                    {
                        infoBuilder.Append(dr.Columns[k].ToPlainTextString().Trim().Replace("\r", "").Replace("\n", ""));
                    }
                    infoBuilder.Append("\r\n");
                }
                pInfoSource = infoBuilder.ToString();

                pPrjAddress = regPrjAddr.Match(pInfoSource).Value.Replace("工程地址", "").Replace("建设地点", "").Replace(":", "").Replace(":", "").Trim();
                pDesignUnit = regpDesignUnit.Match(pInfoSource).Value.Replace("设计单位", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();
                pSuperUnit = regpSuperUnit.Match(pInfoSource).Value.Replace("监理单位", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();
                pConstUnit = regpConstUnit.Match(pInfoSource).Value.Replace("施工单位", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();
                pRecordUnit = regpRecordUnit.Match(pInfoSource).Value.Replace("备案机关", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();
                pLicUnit = regpLicUnit.Match(pInfoSource).Value.Replace("发证机关", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();
                if (string.IsNullOrEmpty(pLicUnit))
                {
                    // Fall back to a fixed issuing authority when the page does not name one.
                    pLicUnit = "深圳市住房和建设局";
                }

                // prjEndDesc and pCreatetime are never populated by this crawler and stay empty.
                ProjectFinish info = ToolDb.GenProjectFinish("广东省", pUrl, "深圳市区", pInfoSource, pEndDate, pConstUnit, pSuperUnit, pDesignUnit, prjEndDesc, pPrjAddress, pBuildUnit, pPrjCode, PrjName, pRecordUnit, pCreatetime, "深圳市住房和建设局", pLicUnit);
                list.Add(info);
                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
            catch (Exception)
            {
                // Best effort per record: a missing key or malformed detail page only
                // drops this entry, not the whole crawl.
                continue;
            }
        }
    }

    return list;
}