/// <summary>
/// Selects elements from <paramref name="html"/> according to <paramref name="model"/> and
/// returns their HTML. (Original author's note: later consider expressing everything as filters;
/// OrFilter and AndFilter each accept only two operands.)
/// </summary>
/// <param name="html">The HTML document to search.</param>
/// <param name="model">
/// Selection criteria: <c>EType</c> (tag name, or "title" for the page title), optional
/// <c>ID</c> and <c>CSS</c> attribute values, <c>AllowScript</c>, and optional
/// <c>Start</c>/<c>End</c> markers used to cut the result.
/// </param>
/// <returns>
/// The page title when <c>EType</c> is "title"; otherwise the HTML of the matching nodes,
/// reduced by <c>regHelper.GetValueBySE</c> when both <c>Start</c> and <c>End</c> are set.
/// </returns>
public string GetByFilter(string html, FilterModel model)
{
    // The title has its own extraction path. An ordinal, case-insensitive comparison avoids the
    // culture-dependent ToLower() round trip.
    if (string.Equals(model.EType, "title", System.StringComparison.OrdinalIgnoreCase))
    {
        return GetTitle(html);
    }

    NodeList nodes = GetTagList(html, model.EType);

    if (!string.IsNullOrEmpty(model.ID))
    {
        nodes = nodes.ExtractAllNodesThatMatch(new HasAttributeFilter("id", model.ID));
    }

    if (!string.IsNullOrEmpty(model.CSS))
    {
        nodes = nodes.ExtractAllNodesThatMatch(new HasAttributeFilter("class", model.CSS));
    }

    if (!model.AllowScript)
    {
        // ExtractAllNodesThatMatch only returns the matches; the original discarded that result,
        // so scripts were never removed. Detach each script from the selection itself and from
        // its parent so it no longer appears in AsHtml().
        NodeList scripts = nodes.ExtractAllNodesThatMatch(new TagNameFilter("script"), true);
        for (int i = 0; i < scripts.Count; i++)
        {
            var script = scripts[i];
            nodes.Remove(script);
            if (script.Parent != null && script.Parent.Children != null)
            {
                script.Parent.Children.Remove(script);
            }
        }
    }

    // Localize images: download each one and point the tag at the value DownFile returns.
    NodeList imgs = nodes.ExtractAllNodesThatMatch(new TagNameFilter("img"), true);
    for (int i = 0; i < imgs.Count; i++)
    {
        ImageTag img = imgs[i] as ImageTag;
        if (img == null)
        {
            // Not parsed as an ImageTag; nothing to rewrite.
            continue;
        }

        string savepath = function.VToP(vdir + Path.GetFileName(img.ImageURL));
        if (File.Exists(savepath))
        {
            // Already downloaded; skip to avoid fetching it again.
            // NOTE(review): the tag keeps its original URL in this case - confirm that is intended.
            continue;
        }

        img.ImageURL = httpHelp.DownFile(baseurl, img.ImageURL, savepath);
    }

    string result = nodes.AsHtml();
    if (!string.IsNullOrWhiteSpace(model.Start) && !string.IsNullOrWhiteSpace(model.End))
    {
        result = regHelper.GetValueBySE(result, model.Start, model.End);
    }

    return result;
}
/// <summary>
/// Parses the staff table (<c>id="dgConstBid"</c>) in <paramref name="html"/>, fetches the detail
/// page for every 6-column row, and saves one <c>CorpStaff</c> entity per row through
/// <c>ToolDb.SaveEntity</c>.
/// </summary>
/// <param name="url">Not used by this method.</param>
/// <param name="list">Not used by this method; entities are saved directly instead of collected.</param>
/// <param name="html">HTML of one listing page.</param>
/// <param name="crawlAll">Not used by this method.</param>
private void GetCorpStaffSzjsjMethod(string url, IList list, string html, bool crawlAll)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList aNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "dgConstBid")));
    if (aNodes == null || aNodes.Count != 1 || !(aNodes[0] is TableTag))
    {
        return;
    }

    TableTag table = (TableTag)aNodes[0];
    Type anchorType = typeof(ATag);
    string corpCodePattern = "任职企业编号:.*?\r\n";

    // Fields this page does not provide; they are passed on as empty strings.
    string Sex = string.Empty, CredType = string.Empty, CertGrade = string.Empty,
           RegLevel = string.Empty, RegCode = string.Empty, AuthorUnit = string.Empty;

    // Start at 1: row 0 is skipped.
    for (int i = 1; i < table.Rows.Length; i++)
    {
        var cells = table.Rows[i].Columns;
        if (cells.Length != 6)
        {
            continue;
        }

        string Name = cells[1].ToPlainTextString().Trim().Replace(" ", "");

        // The detail id is embedded in the anchor's link as GoDetail('...');
        NodeList anchors = cells[1].Children == null ? null : cells[1].Children.SearchFor(anchorType, true);
        ATag anchor = (anchors != null && anchors.Count > 0) ? anchors[0] as ATag : null;
        if (anchor == null || anchor.Link == null)
        {
            // Previously this threw and aborted the whole page; skip just this row instead.
            Logger.Error("人员姓名:" + Name + ",未找到详情链接");
            continue;
        }

        string idnum = anchor.Link.Replace("GoDetail('", "").Replace("');", "");
        string IdNum = idnum.Replace("&am", "").Replace("&a", "").Replace("p;c", "").Replace("cate", "").Replace("cat", "").Replace("ate", "");
        string CorpName = cells[2].ToPlainTextString().Trim().Replace(" ", "");
        string CorpCode = CorpName; // fallback when the detail page carries no company code
        string CertCode = cells[4].ToPlainTextString().Trim().Replace(" ", "");
        string Profession = cells[5].ToPlainTextString().Trim().Replace(" ", "");
        string PersonType = cells[3].ToPlainTextString().Trim().Replace(" ", "");
        string Url = "http://61.144.226.2/ryxx/Detail_LWDZ.aspx?ID_NUMBER=" + idnum;

        string ctxhtml;
        try
        {
            ctxhtml = ToolWeb.GetHtmlByUrl(Url, Encoding.Default);
        }
        catch (Exception ex)
        {
            // Fixed: the person-name label used to print CorpName instead of Name.
            Logger.Error("人员姓名:" + Name + ",证件号:" + IdNum + "所在单位:" + CorpName + "," + Url + ";" + ex);
            continue;
        }

        Parser parserCtx = new Parser(new Lexer(ctxhtml));
        NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("borderColor", "#cccccc")));
        string text = ctxNode.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("TD"), new HasAttributeFilter("width", "76%")), true).AsString().Replace(" ", "");

        // When several matches exist the last one wins, as in the original loop.
        foreach (Match m in Regex.Matches(text, corpCodePattern))
        {
            CorpCode = m.ToString().Replace("任职企业编号:", "").Replace("\r\n", "");
        }

        // The ID-number argument (4th) is deliberately left empty, as in the original call.
        CorpStaff corpStaff = ToolDb.GenCorpStaff(Name, Sex, CredType, string.Empty, CorpName, CorpCode, CertCode, RegLevel, RegCode, AuthorUnit, PersonType, CertGrade, "广东省", "深圳市区", "深圳市住房和建设局", Url, Profession, "", "", "", "");
        ToolDb.SaveEntity(corpStaff, this.ExistCompareFields);
    }

    parser.Reset();
}
/// <summary>
/// Returns the HTML of the page body with all script tags removed.
/// </summary>
/// <param name="html">The HTML document to process.</param>
/// <returns>The body HTML produced by <c>NodeList.ToHtml()</c> after script removal.</returns>
public string GetBodyHtml(string html)
{
    HtmlPage page = GetPage(html);
    NodeList nodelist = page.Body;

    // The search is recursive, so it also finds scripts nested inside other tags.
    NodeList scripts = nodelist.ExtractAllNodesThatMatch(new TagNameFilter("script"), true);
    for (int i = 0; i < scripts.Size(); i++)
    {
        var script = scripts[i];

        // Removes a script that sits directly in the body list.
        nodelist.Remove(script);

        // Fixed: nested scripts are not members of the top-level list, so they used to survive
        // in the output. Detach them from their parent as well.
        if (script.Parent != null && script.Parent.Children != null)
        {
            script.Parent.Children.Remove(script);
        }
    }

    return nodelist.ToHtml();
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, parses it, and returns every node
/// (searched recursively) that satisfies <paramref name="filter"/>.
/// </summary>
/// <param name="url">Address passed to <c>WebCapture.CapturePage</c>.</param>
/// <param name="filter">Filter the returned nodes must match.</param>
/// <returns>The matching nodes.</returns>
public static NodeList GetNodeList(string url, NodeFilter filter)
{
    // Fetch the page source and feed it to the parser.
    Parser parser = new Parser(new Lexer(WebCapture.CapturePage(url)));

    // Parse the whole document (no filter), then apply the caller's filter recursively.
    NodeList allNodes = parser.Parse(null);
    return allNodes.ExtractAllNodesThatMatch(filter, true);
}
/// <summary>
/// Downloads the listing page at <c>/default.aspx</c> + <c>zoudiUrl</c> and builds one
/// <c>OddsLiveMatch</c> for every div whose CLASS attribute equals "menuRow".
/// </summary>
/// <returns>
/// The matches collected. The list is empty when the page is empty, when the first div inside
/// the "PageBody" div is not "走地盤", or when the third div reads "全場賽果". Any exception is
/// swallowed and whatever was collected up to that point is returned.
/// </returns>
public List<OddsLiveMatch> GetScrollMatchList()
{
    List<OddsLiveMatch> matches = new List<OddsLiveMatch>();
    try
    {
        HttpHelper http = new HttpHelper();
        Cookie languageCookie = new Cookie("lng", "2") { Domain = domain };
        http.CookieContainer.Add(languageCookie);

        string pageHtml = http.GetHtml("https://" + domain + "/default.aspx" + zoudiUrl);
        if (string.IsNullOrEmpty(pageHtml))
        {
            return matches;
        }

        // Walk HTML > BODY > FORM > first DIV.
        Parser parser = new Parser(new Lexer(pageHtml));
        NodeList bodyNodes = parser.Parse(new TagNameFilter("HTML"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("BODY"))[0].Children;
        ITag pageDiv = bodyNodes.ExtractAllNodesThatMatch(new TagNameFilter("FORM"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("DIV"))[0] as ITag;
        if (!pageDiv.Attributes["ID"].Equals("PageBody"))
        {
            return matches;
        }

        NodeList divs = pageDiv.Children.SearchFor(typeof(Winista.Text.HtmlParser.Tags.Div));
        if (divs[0].ToPlainTextString() != "走地盤")
        {
            return matches;
        }

        if (divs[2].ToPlainTextString() == "全場賽果")
        {
            // This page shape is not a match listing; report no matches.
            return matches;
        }

        for (int index = 0; index < divs.Count; index++)
        {
            ITag row = divs[index] as ITag;
            var cssClass = row.Attributes["CLASS"];
            if (cssClass == null || !cssClass.Equals("menuRow"))
            {
                continue;
            }

            // The row's first child carries the link; the id follows a 4-character prefix in
            // the first '&'-separated segment.
            string linkParams = (row.FirstChild as ITag).Attributes["HREF"].ToString();
            OddsLiveMatch match = new OddsLiveMatch();
            match.urlparams = linkParams;
            match.id = linkParams.Split('&')[0].Substring(4);
            match.time = DateTime.Now;
            match.name = row.ToPlainTextString();
            matches.Add(match);
        }
    }
    catch (Exception)
    {
        // Best effort: any failure yields the matches collected so far.
    }

    return matches;
}
/// <summary>
/// Selects nodes from <paramref name="html"/> with a small jQuery-like selector language.
/// Example: <c>"#test_div span .need [disabled='disabled']"</c>.
/// Each whitespace-separated token narrows the previous result (recursive descendant search):
/// <c>#id</c>, <c>.class</c>, <c>[name='value']</c>, or a bare tag name.
/// Compound tokens such as <c>div[name='test']</c> and the <c>&gt;</c> combinator are not
/// supported, and because tokens are split on spaces an attribute value cannot contain a space.
/// (Original author's note: id/class matching tested as case-insensitive.)
/// </summary>
/// <param name="html">HTML string; must be wrapped in an html tag.</param>
/// <param name="queryStr">Selector string, in the spirit of a jQuery selector.</param>
/// <returns>The nodes matching the whole selector chain.</returns>
/// <exception cref="ArgumentNullException"><paramref name="queryStr"/> is null.</exception>
/// <exception cref="ArgumentException">An attribute token is not of the form <c>[name=value]</c>.</exception>
public static NodeList GetDomsFromHtml(string html, string queryStr)
{
    if (queryStr == null)
    {
        throw new ArgumentNullException("queryStr");
    }

    HtmlPage page = GetPage(html);
    NodeList doms = page.Body;
    string[] queryArr = queryStr.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    foreach (string query in queryArr)
    {
        // Tokens are never empty because of RemoveEmptyEntries, so query[0] is safe.
        char prefix = query[0];
        if (prefix == '#')
        {
            // Strip only the leading marker (the original removed every '#' in the token).
            doms = doms.ExtractAllNodesThatMatch(new HasAttributeFilter("id", query.Substring(1)), true);
        }
        else if (prefix == '.')
        {
            doms = doms.ExtractAllNodesThatMatch(new HasAttributeFilter("class", query.Substring(1)), true);
        }
        else if (prefix == '[')
        {
            // "[name='test']"
            if (query.Length < 3 || query[query.Length - 1] != ']')
            {
                throw new ArgumentException("Malformed attribute selector: " + query, "queryStr");
            }

            // Split on the first '=' only, so values that contain '=' stay intact.
            string[] attr = query.Substring(1, query.Length - 2).Split(new char[] { '=' }, 2);
            if (attr.Length != 2 || attr[0].Length == 0)
            {
                throw new ArgumentException("Malformed attribute selector: " + query, "queryStr");
            }

            string name = attr[0];
            string value = attr[1];

            // Drop matching surrounding quotes when present; the original always cut the first
            // and last character, which mangled unquoted values.
            if (value.Length >= 2
                && (value[0] == '\'' || value[0] == '"')
                && value[value.Length - 1] == value[0])
            {
                value = value.Substring(1, value.Length - 2);
            }

            doms = doms.ExtractAllNodesThatMatch(new HasAttributeFilter(name, value), true);
        }
        else
        {
            // Anything else is treated as a tag name.
            doms = doms.ExtractAllNodesThatMatch(new TagNameFilter(query), true);
        }
    }

    return doms;
}
/// <summary>
/// Parses the ranking table (<c>id="GridView1"</c>) in <paramref name="html"/> and saves one
/// <c>CorpCreditss</c> entity per data row through <c>ToolDb.SaveEntity</c>.
/// </summary>
/// <param name="list">Not used by this method; entities are saved directly.</param>
/// <param name="html">HTML of the ranking page.</param>
/// <param name="crawlAll">Not used by this method.</param>
/// <param name="ddlIndex">
/// Value of the option in the <c>DropDownList2</c> select whose text becomes the
/// classification of every saved row.
/// </param>
/// <param name="dateTime">Date passed through to <c>ToolDb.GenCorpCreditSS</c>.</param>
public void DealHtml(IList list, string html, bool crawlAll, string ddlIndex, DateTime dateTime)
{
    // Classification label: text of the option with value == ddlIndex.
    Parser parserCtx = new Parser(new Lexer(html));
    NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("id", "DropDownList2")));
    string classification = ctxNode.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("option"), new HasAttributeFilter("value", ddlIndex)), true).AsString().Replace(" ", "");
    parserCtx.Reset();

    Parser parserDtl = new Parser(new Lexer(html));
    NodeList aNodes = parserDtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GridView1")));
    if (aNodes == null || aNodes.Count != 1 || !(aNodes[0] is TableTag))
    {
        return;
    }

    TableTag table = (TableTag)aNodes[0];

    // Start at 1: row 0 is skipped.
    for (int i = 1; i < table.Rows.Length; i++)
    {
        var cells = table.Rows[i].Columns;
        if (cells.Length < 5)
        {
            // Cells 1..4 are read below; a shorter row used to throw and abort the whole page.
            continue;
        }

        string corpName = cells[1].ToPlainTextString().Replace(" ", "").Replace("\r", "").Replace("\n", "").Trim();
        string corpType = cells[2].ToPlainTextString().Replace(" ", "").Replace("\r", "").Replace("\n", "").Trim();

        int corpAllRanking;
        int classificationRanking;
        decimal realScore;
        try
        {
            string rankingText = cells[3].ToPlainTextString().Replace(" ", "").Replace("\r", "").Replace("\n", "").Trim();
            corpAllRanking = int.Parse(rankingText);

            // NOTE(review): the original read cell 3 for both rankings; kept as is - confirm
            // whether the table has a separate classification-ranking column.
            classificationRanking = corpAllRanking;
            realScore = decimal.Parse(cells[4].ToPlainTextString().Replace(" ", "").Replace("\r", "").Replace("\n", "").Trim());
        }
        catch (Exception ex)
        {
            Logger.Error(ex + "企业名称:" + corpName);
            continue;
        }

        CorpCreditss info = ToolDb.GenCorpCreditSS(corpName, corpType, classification, corpAllRanking, classificationRanking, 0, 0, 0, 0, 0, 0,
                                                   dateTime, realScore, "广东省", "深圳市", DateTime.Now, "深圳市住房和建设局", SiteUrl);
        if (info != null && !string.IsNullOrEmpty(info.CorpName))
        {
            ToolDb.SaveEntity(info, this.ExistCompareFields);
        }

        // Throttle: after every 200 processed rows pause for 500 seconds.
        count++;
        if (count >= 200)
        {
            Thread.Sleep(1000 * 500);
            count = 1;
        }
    }
}
/// <summary>
/// Downloads the page at <paramref name="urlparams"/> and, when it has the expected layout
/// (first div "走地盤", third div "全場賽果"), stores the home/draw/away values found in divs
/// 3, 5 and 7 as an <c>OddsLiveHistory</c> record through <c>dal.AddHistory</c>.
/// </summary>
/// <param name="matchid">Match id written to the history record.</param>
/// <param name="urlparams">Address to download; passed to <c>HttpHelper.GetHtml</c> unchanged.</param>
/// <returns>Always the empty list created at the start; nothing is ever added to it.</returns>
public List<OddsLiveMatch> GetMatchScrollOdds(string matchid, string urlparams)
{
    List<OddsLiveMatch> matches = new List<OddsLiveMatch>();
    try
    {
        HttpHelper http = new HttpHelper();
        Cookie languageCookie = new Cookie("lng", "2") { Domain = domain };
        http.CookieContainer.Add(languageCookie);

        string pageHtml = http.GetHtml(urlparams);
        if (string.IsNullOrEmpty(pageHtml))
        {
            return matches;
        }

        // Walk HTML > BODY > FORM > first DIV.
        Parser parser = new Parser(new Lexer(pageHtml));
        NodeList bodyNodes = parser.Parse(new TagNameFilter("HTML"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("BODY"))[0].Children;
        ITag pageDiv = bodyNodes.ExtractAllNodesThatMatch(new TagNameFilter("FORM"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("DIV"))[0] as ITag;
        if (!pageDiv.Attributes["ID"].Equals("PageBody"))
        {
            return matches;
        }

        NodeList divs = pageDiv.Children.SearchFor(typeof(Winista.Text.HtmlParser.Tags.Div));
        if (divs[0].ToPlainTextString() != "走地盤" || divs[2].ToPlainTextString() != "全場賽果")
        {
            return matches;
        }

        // Each value is the text before the first space in its div.
        OddsLiveHistory history = new OddsLiveHistory();
        history.matchid = matchid;
        history.home = float.Parse(divs[3].ToPlainTextString().Split(' ')[0]);
        history.draw = float.Parse(divs[5].ToPlainTextString().Split(' ')[0]);
        history.away = float.Parse(divs[7].ToPlainTextString().Split(' ')[0]);
        history.time = DateTime.Now;
        dal.AddHistory(history);
    }
    catch (Exception)
    {
        // Best effort: any failure simply records nothing.
    }

    return matches;
}
/// <summary>
/// Descends into the current node: when it has at least one descendant accepted by
/// <c>AndFilter.TrueFilter</c> (searched recursively), saves the current node, enumerator and
/// node list on <c>m_nodestack</c> and switches iteration to those descendants.
/// </summary>
/// <returns>
/// <c>true</c> when the iteration state was pushed and replaced; <c>false</c> when there is no
/// current node, it has no children, or no descendant matched (state is left unchanged).
/// </returns>
public virtual bool NodePushRangeChildren()
{
    // Fixed: Children was dereferenced unconditionally and threw when the node had none.
    NodeList children = m_node != null ? m_node.Children : null;
    if (children == null)
    {
        return false;
    }

    NodeList descendants = children.ExtractAllNodesThatMatch(AndFilter.TrueFilter, true);
    if (descendants.Count <= 0)
    {
        return false;
    }

    // Save the current position; the push order is node, enumerator, list.
    m_nodestack.Push(m_node);
    m_nodestack.Push(m_nodeenum);
    m_nodestack.Push(m_nodelist);

    m_nodelist = descendants;
    m_nodeenum = m_nodelist.Elements();
    m_node = null;
    return true;
}
/// <summary>
/// Parses the ranking table (<c>id="GridView1"</c>) in <paramref name="html"/> and appends one
/// <c>CorpCreditjd</c> entity per 6-column data row to <paramref name="list"/>.
/// </summary>
/// <param name="list">Receives the generated entities.</param>
/// <param name="html">HTML of the ranking page.</param>
/// <param name="crawlAll">When false, processing stops once <c>list.Count</c> reaches <c>MaxCount</c>.</param>
/// <param name="ddlIndex">
/// Value of the option in the <c>DropDownList2</c> select whose text becomes the
/// classification of every row.
/// </param>
public void DealHtml(IList list, string html, bool crawlAll, string ddlIndex)
{
    // Period text: the option with value "419425" in the drpRpt select. The code below expects
    // it to start with a 10-character date and to contain "----" followed by another one.
    Parser parserCtxTime = new Parser(new Lexer(html));
    NodeList ctxNodeTime = parserCtxTime.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("id", "drpRpt")));
    string dateTime = ctxNodeTime.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("option"), new HasAttributeFilter("value", "419425")), true).AsString().Replace(" ", "").Replace("\r", "").Replace("\n", "").Trim();
    parserCtxTime.Reset();

    // Classification label: text of the option with value == ddlIndex.
    Parser parserCtx = new Parser(new Lexer(html));
    NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("id", "DropDownList2")));
    string classification = ctxNode.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("option"), new HasAttributeFilter("value", ddlIndex)), true).AsString().Replace(" ", "").Replace("\r", "").Replace("\n", "").Trim();
    parserCtx.Reset();

    Parser parserDtl = new Parser(new Lexer(html));
    NodeList aNodes = parserDtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GridView1")));
    if (aNodes != null && aNodes.Count == 1 && aNodes[0] is TableTag)
    {
        TableTag table = (TableTag)aNodes[0];

        // The period is the same for every row, so parse it once instead of once per row.
        // When it cannot be parsed no row can be produced; log a single error instead of one per row.
        bool periodParsed = false;
        string startText = string.Empty, endText = string.Empty;
        try
        {
            DateTime startTime = DateTime.Parse(dateTime.Substring(0, 10));
            DateTime endTime = DateTime.Parse(dateTime.Substring(dateTime.IndexOf("----") + 4, 10));
            startText = startTime.ToString();
            endText = endTime.ToString();
            periodParsed = true;
        }
        catch (Exception ex)
        {
            Logger.Error(ex + "日期解析失败:" + dateTime);
        }

        // Start at 1: row 0 is skipped.
        for (int i = 1; periodParsed && i < table.Rows.Length; i++)
        {
            var cells = table.Rows[i].Columns;
            if (cells.Length != 6)
            {
                continue;
            }

            string corpName = cells[1].ToPlainTextString().Replace(" ", "").Replace("\r", "").Replace("\n", "").Trim();
            string corpRank = cells[2].ToPlainTextString().Replace(" ", "").Replace("\r", "").Replace("\n", "").Trim();

            int corpAllRanking;
            int classificationRanking;
            decimal realScore;
            try
            {
                corpAllRanking = int.Parse(cells[3].ToPlainTextString().Replace(" ", "").Replace("\r", "").Replace("\n", "").Trim());
                classificationRanking = int.Parse(cells[4].ToPlainTextString().Replace(" ", "").Replace("\r", "").Replace("\n", "").Trim());
                realScore = decimal.Parse(cells[5].ToPlainTextString().Replace(" ", "").Replace("\r", "").Replace("\n", "").Trim());
            }
            catch (Exception ex)
            {
                Logger.Error(ex + "企业名称:" + corpName);
                continue;
            }

            // The company-type argument (2nd) is not available on this page and stays empty.
            CorpCreditjd info = ToolDb.GenCorpCreditJD(corpName, string.Empty, corpRank, classification, corpAllRanking.ToString(), classificationRanking.ToString(), startText, endText, realScore.ToString(), "广东省", "深圳市", "深圳市住房和建设局", SiteUrl, "", "", "", "");
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return;
            }
        }
    }

    parserDtl.Reset();
}
/// <summary>
/// Returns every node in <paramref name="nodelist"/> (searched recursively) whose tag name
/// matches <paramref name="tag"/>.
/// </summary>
/// <param name="nodelist">Nodes to search.</param>
/// <param name="tag">Tag name to look for.</param>
/// <returns>The matching nodes.</returns>
private NodeList GetTagList(NodeList nodelist, string tag)
{
    TagNameFilter byTagName = new TagNameFilter(tag);
    return nodelist.ExtractAllNodesThatMatch(byTagName, true);
}
/// <summary>
/// Parses the notice table (<c>id="ctl00_cph_context_BDGSList2_GridView1"</c>) in
/// <paramref name="html"/>. For every 7-column row it downloads the detail page, appends a
/// <c>NoticeInfo</c> to <paramref name="list"/>, and adds one <c>BaseAttach</c> per attachment
/// row to <c>base.AttachList</c>.
/// </summary>
/// <param name="list">Receives the generated notices.</param>
/// <param name="html">HTML of one listing page.</param>
/// <param name="crawlAll">When false, processing stops once <c>list.Count</c> reaches <c>MaxCount</c>.</param>
public void DealHtml(IList list, string html, bool crawlAll)
{
    Parser parserDtl = new Parser(new Lexer(html));
    NodeList aNodes = parserDtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_BDGSList2_GridView1")));
    if (aNodes != null && aNodes.Count == 1 && aNodes[0] is TableTag)
    {
        TableTag table = (TableTag)aNodes[0];
        Type anchorType = typeof(ATag);

        // Start at 1: row 0 is skipped.
        for (int i = 1; i < table.Rows.Length; i++)
        {
            var cells = table.Rows[i].Columns;
            if (cells.Length != 7)
            {
                continue;
            }

            string prjCode = cells[1].ToPlainTextString().Trim();
            string InfoTitle = cells[2].ToPlainTextString().Trim();
            string buildUnit = cells[3].ToPlainTextString().Trim();
            string prjType = cells[4].ToPlainTextString().Trim();
            string InfoType = "标底公示";
            string PublistTime = cells[5].ToPlainTextString().Trim();

            // The relative detail URL is the quoted part of the title anchor's link.
            NodeList titleAnchors = cells[2].Children == null ? null : cells[2].Children.SearchFor(anchorType, true);
            ATag titleAnchor = (titleAnchors != null && titleAnchors.Count > 0) ? titleAnchors[0] as ATag : null;
            if (titleAnchor == null || titleAnchor.Link == null)
            {
                // Previously this threw and aborted the whole page; skip just this row.
                continue;
            }

            string urlSpilt = titleAnchor.Link;
            int quoteIndex = urlSpilt.IndexOf("\"");
            if (quoteIndex < 0)
            {
                // No quoted URL in the link; Substring(-1) used to throw here.
                continue;
            }

            string url = urlSpilt.Substring(quoteIndex).Replace("\"", "").Replace(")", "");
            string InfoUrl = "http://jyzx.cb.gov.cn/LGjyzxWeb/SiteManage/" + url;

            string ctxhtml;
            try
            {
                ctxhtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                // Best effort: a detail page that cannot be fetched is skipped.
                continue;
            }

            Parser parserCtx = new Parser(new Lexer(ctxhtml));
            NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "text")));

            // Extract the content span once and render it both as text and as HTML.
            NodeList contentSpan = ctxNode.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_cph_context_lblContent")), true);
            string InfoCtx = contentSpan.AsString().Replace(" ", "");
            string htmlTxt = contentSpan.AsHtml();

            NoticeInfo info = ToolDb.GenNoticeInfo("广东省", "深圳龙岗区工程", string.Empty, string.Empty, InfoTitle, InfoType, InfoCtx, PublistTime, string.Empty, "深圳市建设工程交易中心龙岗分中心", InfoUrl, prjCode, buildUnit, string.Empty, string.Empty, prjType, string.Empty, htmlTxt);
            list.Add(info);

            // Attachments: one per row (after row 0) of the accessories table.
            NodeList fileNode = ctxNode.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_AccessoriesControl1_GridView1")), true);
            if (fileNode != null && fileNode.Count > 0 && fileNode[0] is TableTag)
            {
                TableTag fileTable = (TableTag)fileNode[0];
                for (int j = 1; j < fileTable.Rows.Length; j++)
                {
                    var fileCells = fileTable.Rows[j].Columns;
                    if (fileCells.Length == 0)
                    {
                        continue;
                    }

                    NodeList fileAnchors = fileCells[0].SearchFor(anchorType, true);
                    ATag fileAnchor = (fileAnchors != null && fileAnchors.Count > 0) ? fileAnchors[0] as ATag : null;
                    if (fileAnchor == null || fileAnchor.Link == null)
                    {
                        // A row without a download link has no attachment to record.
                        continue;
                    }

                    BaseAttach attach = ToolDb.GenBaseAttach(fileCells[0].ToPlainTextString().Trim(), info.Id, "http://jyzx.cb.gov.cn/LGjyzxWeb/" + fileAnchor.Link.Replace("../", ""));
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return;
            }
        }
    }

    parserDtl.Reset();
}
/// <summary>
/// Crawls every listing page of <c>SiteUrl</c>, builds one <c>BidInfo</c> per table row, and
/// records the "UploadFiles" links of each detail page as attachments in <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>list.Count</c> reaches <c>MaxCount</c>.</param>
/// <returns>The collected <c>BidInfo</c> items; empty when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    // Page count: the number after the last '=' in the link of the anchor labelled "尾页".
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new TagNameFilter("form")), new TagNameFilter("a")));
    if (sNode != null)
    {
        for (int i = 0; i < sNode.Count; i++)
        {
            ATag pageA = sNode[i] as ATag;
            if (pageA == null || pageA.Link == null || !pageA.ToPlainTextString().Contains("尾页"))
            {
                continue;
            }

            // Best effort: an unparsable link leaves the previous page count in place.
            int parsedPage;
            if (int.TryParse(pageA.Link.Substring(pageA.Link.LastIndexOf('=') + 1), out parsedPage))
            {
                pageInt = parsedPage;
            }
        }
    }
    parser.Reset();

    // Bid types that count as construction work; any other type is classed as "其他".
    const string invType = "施工,设计,勘察,服务,劳务分包,专业分包,小型施工,监理,设备材料,其他";

    for (int i = 1; i <= pageInt; i++)
    {
        try
        {
            html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&Page=" + i.ToString()), Encoding.Default);
        }
        catch (Exception)
        {
            // Best effort: skip a page that cannot be downloaded.
            continue;
        }

        parser = new Parser(new Lexer(html));
        sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("bordercolor", "#CCCCCC")));

        // The data table is the second match. Fixed: the old guard (Count > 0) did not protect
        // the index-1 access, and the cast result was used without a null check.
        if (sNode == null || sNode.Count < 2)
        {
            continue;
        }

        TableTag table = sNode[1] as TableTag;
        if (table == null)
        {
            continue;
        }

        string HtmlTxt = sNode.AsHtml();

        // Start at 1: row 0 is skipped.
        for (int j = 1; j < table.RowCount; j++)
        {
            var cells = table.Rows[j].Columns;
            if (cells.Length < 4)
            {
                // Cells 0..3 are read below.
                continue;
            }

            // Bid type
            string bidType = cells[0].ToPlainTextString();
            string specType = invType.Contains(bidType) ? "建设工程" : "其他";
            // Project name
            string prjName = cells[1].ToPlainTextString().Replace(" ", "");
            // Winning bidder
            string bidUnit = cells[2].ToPlainTextString().Replace(" ", "");
            // Publication date, without the surrounding brackets
            string bidDate = cells[3].ToPlainTextString().TrimStart('[').TrimEnd(']');

            // The detail link is the anchor inside the project-name cell.
            NodeList cNode = new NodeList();
            cells[1].CollectInto(cNode, new TagNameFilter("a"));
            ATag detailAnchor = cNode.Count > 0 ? cNode[0] as ATag : null;
            if (detailAnchor == null)
            {
                // Previously this threw and aborted the crawl; skip just this row.
                continue;
            }

            string InfoUrl = "http://www.chjssz.gov.cn/" + detailAnchor.Link;
            prjName = ToolDb.GetPrjName(prjName);
            bidType = ToolHtml.GetInviteTypes(bidType);

            BidInfo info = ToolDb.GenBidInfo("广东省", "广州市区", "从化市", string.Empty, string.Empty, prjName, string.Empty, bidDate, bidUnit, bidDate, string.Empty, "见附件", string.Empty, "广州建设工程交易中心", bidType, specType, string.Empty, string.Empty, InfoUrl, string.Empty, HtmlTxt);
            list.Add(info);

            // Detail page: collect attachment links.
            string dlHtml = string.Empty;
            bool detailFetched = false;
            try
            {
                dlHtml = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default);
                detailFetched = true;
            }
            catch (Exception)
            {
                // Best effort: the bid is kept even when its detail page cannot be fetched.
            }

            if (detailFetched)
            {
                Parser dlParser = new Parser(new Lexer(dlHtml));
                NodeList dlNodes = dlParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("background", "pic/abouts_16.jpg")));
                if (dlNodes != null && dlNodes.Count > 0)
                {
                    NodeList ddNode = dlNodes.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("A"), new HasAttributeFilter("target", "_blank")), true);
                    if (ddNode != null && ddNode.Count > 0)
                    {
                        for (int k = 0; k < ddNode.Count; k++)
                        {
                            ATag ddATag = ddNode[k] as ATag;
                            if (ddATag != null && ddATag.Link != null && ddATag.Link.Contains("UploadFiles"))
                            {
                                BaseAttach attach = ToolDb.GenBaseAttach(ddATag.StringText, info.Id, "http://www.chjssz.gov.cn/" + ddATag.Link);
                                base.AttachList.Add(attach);
                            }
                        }
                        dlParser.Reset();
                    }
                }
            }

            // Fixed: this check used to be skipped when the detail download failed, so the
            // list could grow past MaxCount.
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls all result pages for <paramref name="path"/>: downloads the first page, reads the
/// total page count from the <c>ucPageNumControl_lbltotal</c> span, then posts the form's
/// "下一页" button once per remaining page, handing each page to
/// <c>GetCorpStaffSzjsjMethod</c>.
/// </summary>
/// <param name="path">Passed to <c>GetStartUrl</c> and on to <c>GetCorpStaffSzjsjMethod</c>.</param>
/// <param name="crawlAll">When false, crawling stops once <c>list.Count</c> reaches <c>MaxCount</c>.</param>
/// <param name="list">Collection handed to the page handler and returned to the caller.</param>
/// <returns>
/// <paramref name="list"/>, on every path. The original returned <c>null</c> after a full
/// multi-page crawl while returning the list everywhere else.
/// </returns>
private IList GetList(int path, bool crawlAll, IList list)
{
    string html = string.Empty;
    try
    {
        html = ToolWeb.GetHtmlByUrl(GetStartUrl(path), Encoding.Default);
    }
    catch (Exception ex)
    {
        Logger.Error(ex);
        return list;
    }

    // First page
    GetCorpStaffSzjsjMethod(path, list, html, crawlAll);
    if (!crawlAll && list.Count >= this.MaxCount)
    {
        return list;
    }

    // Total page count; stays 1 when the span is missing or not numeric.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "Table1")));
    if (tdNodes != null)
    {
        try
        {
            string pageTemp = tdNodes.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ucPageNumControl_lbltotal")), true).AsString().Replace(" ", "").Trim();
            pageInt = int.Parse(pageTemp);
        }
        catch (Exception ex)
        {
            Logger.Error(ex);
        }
    }
    parser.Reset();

    // Following pages: each request posts the view state of the most recently downloaded page.
    for (int i = 2; i <= pageInt; i++)
    {
        string cookiestr = string.Empty;
        string viewState = ToolWeb.GetAspNetViewState(html);
        NameValueCollection nvc = ToolWeb.GetNameValueCollection(
            new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "CORP_NAME", "NAME", "ucPageNumControl:gotopage", "ucPageNumControl:NEXTpage" },
            new string[] { string.Empty, string.Empty, viewState, string.Empty, string.Empty, string.Empty, "下一页" });
        try
        {
            html = ToolWeb.GetHtmlByUrl(GetStartUrl(path), nvc, Encoding.Default, ref cookiestr);
            GetCorpStaffSzjsjMethod(path, list, html, crawlAll);
        }
        catch (Exception ex)
        {
            Logger.Error(ex);
            continue;
        }

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return list;
        }
    }

    return list;
}
/// <summary>
/// Crawls the score page once for each rank level ("A" through "C-") combined with the category
/// selected by <paramref name="l"/>, walks every result page, and saves one
/// <c>CorpCreditjd</c> entity per row of the <c>GridView1</c> table through
/// <c>ToolDb.SaveEntity</c>.
/// </summary>
/// <param name="l">Category index posted as <c>DropDownList2</c>; values above 13 are posted as "999999999".</param>
/// <param name="bidhtml">Passed through unchanged to <c>ToolDb.GenCorpCreditJD</c>.</param>
/// <param name="list">Not used by this method; entities are saved directly.</param>
/// <param name="crawlAll">Not used by this method.</param>
private void Save(int l, string bidhtml, IList list, bool crawlAll)
{
    string Url = "http://61.144.226.2:8008/JDScore.aspx?clearPaging=true&guid=450845";
    string htl = string.Empty;
    string cookiedtstr = string.Empty;
    try
    {
        htl = ToolWeb.GetHtmlByUrl(Url, Encoding.UTF8, ref cookiedtstr);
    }
    catch (Exception ex)
    {
        Logger.Error(ex.ToString());
    }

    string[] classLen = new string[] { "A", "A-", "B", "B-", "C", "C-" };
    string ddlindex = l > 13 ? "999999999" : l.ToString();

    // Form fields posted for every query; only the values change between requests.
    string[] formKeys = new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__LASTFOCUS", "__VIEWSTATE", "txtCorpName", "DropDownList1", "DropDownList2", "hiddenIsFirst", "GridViewPaging1$txtGridViewPagingForwardTo", "GridViewPaging1$btnForwardToPage", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION" };
    Regex numberPattern = new Regex(@"\d{1,}[\.]?\d{0,}");
    Regex pageCountPattern = new Regex(@"共\d+页");

    for (int n = 0; n < classLen.Length; n++)
    {
        int pageInt = 1;
        string strcookie = string.Empty;

        // Page 1 for this rank level, using the view state of the last downloaded page.
        NameValueCollection nvc3 = ToolWeb.GetNameValueCollection(formKeys,
            new string[] { "", "", "", ToolWeb.GetAspNetViewState(htl), "", classLen[n], ddlindex, "false", "1", "Go", "", ToolWeb.GetAspNetEventValidation(htl) });
        try
        {
            htl = ToolWeb.GetHtmlByUrl(Url, nvc3, Encoding.UTF8, ref strcookie);
        }
        catch (Exception ex)
        {
            // Fixed: the failure used to be swallowed and the previous rank level's HTML was
            // parsed again and saved under this level's label. Log it and skip the level.
            Logger.Error(ex.ToString());
            continue;
        }

        // Total pages: the number inside "共N页" in the paging description span.
        Parser parser = new Parser(new Lexer(htl));
        NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "GridViewPaging1_lblGridViewPagingDesc")));
        if (dtList != null && dtList.Count > 0)
        {
            int parsedPages;
            if (int.TryParse(pageCountPattern.Match(dtList.AsString()).Value.Trim(new char[] { '共', '页' }), out parsedPages))
            {
                pageInt = parsedPages;
            }
        }

        for (int k = 1; k <= pageInt; k++)
        {
            if (k > 1)
            {
                NameValueCollection nvc4 = ToolWeb.GetNameValueCollection(formKeys,
                    new string[] { "", "", "", ToolWeb.GetAspNetViewState(htl), "", classLen[n], ddlindex, "false", k.ToString(), "Go", "", ToolWeb.GetAspNetEventValidation(htl) });
                try
                {
                    htl = ToolWeb.GetHtmlByUrl(Url, nvc4, Encoding.UTF8, ref strcookie);
                }
                catch (Exception ex)
                {
                    // Fixed: do not re-process the previous page when this one failed to load.
                    Logger.Error(ex.ToString());
                    continue;
                }
            }

            string beg = string.Empty, end = string.Empty, avg = string.Empty, type = string.Empty;

            // Each query needs its own parser over the same HTML.
            // Rank label: text of the DropDownList1 option for this rank level.
            Parser parserCtx = new Parser(new Lexer(htl));
            NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("id", "DropDownList1")));
            string classlv = ctxNode.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("option"), new HasAttributeFilter("value", classLen[n])), true).AsString().Replace(" ", "");

            // Category label: text of the DropDownList2 option for the posted category.
            Parser parserCtx2 = new Parser(new Lexer(htl));
            NodeList ctxNode2 = parserCtx2.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("id", "DropDownList2")));
            string thtype = ctxNode2.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("option"), new HasAttributeFilter("value", ddlindex)), true).AsString().Replace(" ", "");

            // Summary table (GridView2): type, begin date, end date and average. When it has
            // several data rows the last one wins, as in the original.
            Parser dtparser = new Parser(new Lexer(htl));
            NodeList delList = dtparser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GridView2")));
            TableTag tab = (delList != null && delList.Count > 0) ? delList[0] as TableTag : null;
            if (tab != null)
            {
                for (int e = 1; e < tab.RowCount; e++)
                {
                    var summaryCells = tab.Rows[e].Columns;
                    if (summaryCells.Length < 4)
                    {
                        continue;
                    }

                    type = summaryCells[0].ToPlainTextString().Trim();
                    beg = summaryCells[1].ToPlainTextString().Trim();
                    end = summaryCells[2].ToPlainTextString().Trim();
                    avg = numberPattern.Match(summaryCells[3].ToPlainTextString()).Value;
                }
            }

            // Detail table (GridView1): one entity per data row.
            Parser par = new Parser(new Lexer(htl));
            NodeList conList = par.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GridView1")));
            TableTag tabContent = (conList != null && conList.Count > 0) ? conList[0] as TableTag : null;
            if (tabContent == null)
            {
                continue;
            }

            for (int f = 1; f < tabContent.RowCount; f++)
            {
                Winista.Text.HtmlParser.Tags.TableRow dr = tabContent.Rows[f];
                int columnCount = dr.ColumnCount;
                if (columnCount < 7)
                {
                    // Fixed: rows this short used to throw on the cell index below.
                    continue;
                }

                // Rows with more than 7 cells carry one extra cell after the name, which shifts
                // every value except the name one position to the right.
                int shift = columnCount > 7 ? 1 : 0;
                var cells = dr.Columns;

                string corpName = cells[1].ToPlainTextString().Trim();
                string good = numberPattern.Match(cells[2 + shift].ToPlainTextString().Trim()).Value;
                string bad = numberPattern.Match(cells[3 + shift].ToPlainTextString().Trim()).Value;
                string ranking = cells[4 + shift].ToPlainTextString().Trim();
                string categoryRank = cells[5 + shift].ToPlainTextString().Trim();
                string realScore = numberPattern.Match(cells[6 + shift].ToPlainTextString().Trim()).Value;

                CorpCreditjd info = ToolDb.GenCorpCreditJD(corpName, type, classlv, thtype, ranking, categoryRank, beg, end, realScore, "广东省", "深圳市", "深圳市住房和建设局", Url, bidhtml, avg, good, bad);
                ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate);

                // Throttle: after more than 200 saved rows pause for two minutes.
                count++;
                sqlcount++;
                if (count > 200)
                {
                    count = 0;
                    Thread.Sleep(120000);
                }
            }
        }
    }
}
/// <summary>
/// Crawls the paged announcement list at 218.20.201.20 (xzb_list.asp, id=13828) and builds one
/// <c>BidInfo</c> for every list entry whose link contains "xzb_show.asp".
/// </summary>
/// <remarks>
/// For each announcement the detail page is downloaded and scanned for the publication date, the
/// winning bidder (<c>bidUnit</c>) and the bid amount (<c>decMoney</c>). Amounts are kept as-is when
/// the surrounding text contains "万元" and divided by 10000 otherwise. Extraction is attempted in
/// three passes: the direct children of <c>span.news_show</c>, then its <c>div</c> descendants, then
/// its <c>p</c> descendants; a later pass only fills values an earlier pass did not find.
/// Links containing "uploadfile" are registered as attachments in <c>base.AttachList</c>.
/// A page or announcement that cannot be downloaded is skipped; markup that does not have the
/// expected shape leaves the corresponding field empty instead of aborting the crawl.
/// </remarks>
/// <param name="crawlAll">When false, crawling stops as soon as the list holds <c>MaxCount</c> entries.</param>
/// <returns>The generated <c>BidInfo</c> objects; empty when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    // Patterns are created once here rather than once per node.
    Regex emptyDivRegex = new Regex(@"<div[^>]*>[\s]*</div>");
    Regex anchorTagRegex = new Regex(@"<a[^>]*>|</a>");
    Regex anchorElementRegex = new Regex(@"<a[^>]*>[^<]*</a>");
    Regex looseNumberRegex = new Regex(@"\d{1,}[\.]?\d{0,}");
    Regex chineseCharRegex = new Regex(@"[\u4e00-\u9fa5]");
    Regex strictNumberRegex = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
    Regex scriptOrXmlRegex = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
    // Both the ASCII colon and the full-width colon (U+FF1A) are accepted after the keyword.
    Regex divAmountLineRegex = new Regex(@"(金额|价格|报价|中标价)(:|\uFF1A)[^\r\n]+\r\n");
    Regex paragraphAmountLineRegex = new Regex(@"(金额|价格|报价|中标价|中标价为)(:|\uFF1A)[^\r\n]+\r\n");

    // Determine the page count from the "尾页" (last page) link of the first page.
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellspacing", "5")));
    if (tdNodes != null && tdNodes.Count > 0)
    {
        NodeList pagerLinks = new NodeList();
        tdNodes[0].CollectInto(pagerLinks, new TagNameFilter("a"));
        for (int p = 0; p < pagerLinks.Count; p++)
        {
            ATag pagerLink = pagerLinks[p] as ATag;
            if (pagerLink != null && pagerLink.ToPlainTextString().Contains("尾页"))
            {
                // The page number is whatever remains of the link after removing every non-digit.
                // NOTE(review): if the link carries other digits (e.g. an id parameter) they are
                // concatenated into the page number as well - confirm against the live markup.
                int lastPage;
                if (int.TryParse(Regex.Replace(pagerLink.Link, @"[^0-9]+", ""), out lastPage))
                {
                    pageInt = lastPage;
                }
                break;
            }
        }
    }
    parser.Reset();

    for (int i = 1; i <= pageInt; i++)
    {
        try
        {
            html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode("http://218.20.201.20/www/zbmsg/2008/xzb_list.asp?page=" + i.ToString() + "&id=13828"), Encoding.Default);
        }
        catch (Exception)
        {
            continue;
        }

        parser = new Parser(new Lexer(html));
        tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "news_list")), true)));
        if (tdNodes == null || tdNodes.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < tdNodes.Count; j++)
        {
            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, InfoUrl = string.Empty, beginDate = string.Empty, bidType = string.Empty, HtmlTxt = string.Empty;
            decimal decMoney = 0;

            ATag aTag = tdNodes[j] as ATag;
            if (aTag == null || !aTag.Link.Contains("xzb_show.asp"))
            {
                continue;
            }

            // Keep only the part of the link before the first '&'; a link without '&' is used whole.
            string relativeLink = aTag.Link;
            int ampIndex = relativeLink.IndexOf("&");
            if (ampIndex >= 0)
            {
                relativeLink = relativeLink.Remove(ampIndex);
            }
            InfoUrl = "http://218.20.201.20/www/zbmsg/2008/" + relativeLink;

            string dlHtml = string.Empty;
            try
            {
                // NOTE(review): the literal removed here reads as a plain space in this file, which
                // would also strip the spaces inside tags - confirm whether a non-breaking space
                // (or "&nbsp;") was intended.
                dlHtml = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default).ToLower().Replace(" ", "");
            }
            catch (Exception)
            {
                continue;
            }

            // <u> is rewritten to <a> so that underlined values are found by the same "a" tag filter.
            string filterHtml = dlHtml.Replace("\n", "").Replace("\r", "").Replace("<u>", "<a>").Replace("</u>", "</a>");
            prjName = aTag.ToPlainTextString();

            // Announcement content: direct children of span.news_show.
            Parser ctxParser = new Parser(new Lexer(dlHtml));
            NodeList ctxNodes = ctxParser.ExtractAllNodesThatMatch(new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "news_show")), false));
            string ctxStr = Regex.Replace(ctxNodes.AsString().Replace(" ", ""), @"<[^>]*>", string.Empty, RegexOptions.IgnoreCase);
            HtmlTxt = ctxNodes.AsHtml();

            // Same selection on the cleaned markup (line breaks and empty divs removed).
            Parser dlParser = new Parser(new Lexer(emptyDivRegex.Replace(filterHtml, "")));
            NodeList dlNodes = dlParser.ExtractAllNodesThatMatch(new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "news_show")), false));

            // Collect attachment links.
            NodeList findFiles = dlNodes.ExtractAllNodesThatMatch(new TagNameFilter("a"), true);
            NodeList fileNode = new NodeList();
            if (findFiles != null)
            {
                for (int n = 0; n < findFiles.Count; n++)
                {
                    ATag fileA = findFiles[n] as ATag;
                    if (fileA != null && fileA.Link != null && fileA.Link.Contains("uploadfile"))
                    {
                        fileNode.Add(fileA);
                    }
                }
            }

            // Publication date: the fourth ancestor of the first content node is expected to be the
            // table holding a "发布日期[...]" row. Anything else leaves beginDate empty.
            INode dateHost = dlNodes.Count > 0 ? dlNodes[0] : null;
            for (int up = 0; up < 4 && dateHost != null; up++)
            {
                dateHost = dateHost.Parent;
            }
            TableTag tb = dateHost as TableTag;
            if (tb != null)
            {
                for (int r = 0; r < tb.RowCount; r++)
                {
                    string rowText = tb.Rows[r].ToPlainTextString();
                    if (rowText.Contains("发布日期"))
                    {
                        int open = rowText.IndexOf("[");
                        int close = rowText.IndexOf("]");
                        if (open >= 0 && close > open)
                        {
                            beginDate = rowText.Substring(open + 1, close - open - 1);
                        }
                        break;
                    }
                }
            }

            // Pass 1: direct children of span.news_show.
            for (int k = 0; k < dlNodes.Count; k++)
            {
                if (!(dlNodes[k] is ITag))
                {
                    continue;
                }

                string nodeText = dlNodes[k].ToPlainTextString();

                bool hasCandidateMarker = nodeText.Contains("中标候选人为:");
                if (hasCandidateMarker || nodeText.Contains("中标人为:"))
                {
                    NodeList bidUnitNode = new NodeList();
                    dlNodes[k].CollectInto(bidUnitNode, new TagNameFilter("a"));
                    if (bidUnitNode.Count > 0)
                    {
                        // Take the first <a>...</a> that follows the marker and strip its tags.
                        string marker = hasCandidateMarker ? "中标候选人为:" : "中标人为:";
                        MatchCollection matchbidUnit = anchorElementRegex.Matches(HtmlFromMarker(dlNodes[k].ToHtml(), marker));
                        if (matchbidUnit.Count > 0)
                        {
                            bidUnit = anchorTagRegex.Replace(matchbidUnit[0].ToString(), "");
                        }
                        if (string.IsNullOrEmpty(bidUnit))
                        {
                            bidUnit = NextPlainText(dlNodes, k).Trim();
                        }
                    }
                    else
                    {
                        bidUnit = NextPlainText(dlNodes, k);
                    }
                }

                // The price keyword is accepted with an ASCII or a full-width (U+FF1A) colon.
                if (nodeText.Contains("中标价:") || nodeText.Contains("投标报价:") || nodeText.Contains("中标价\uFF1A") || nodeText.Contains("中标价为"))
                {
                    NodeList moneyNode = new NodeList();
                    dlNodes[k].CollectInto(moneyNode, new TagNameFilter("a"));
                    if (moneyNode.Count > 0)
                    {
                        // With anchors present, the amount is only taken when an <a>...</a> follows
                        // one of the two markers below; the number itself is still the first number
                        // in the node's plain text.
                        MatchCollection matchmoney = null;
                        string nodeHtml = dlNodes[k].ToHtml();
                        if (nodeText.Contains("中标价:"))
                        {
                            matchmoney = anchorElementRegex.Matches(HtmlFromMarker(nodeHtml, "中标价:"));
                        }
                        if (nodeText.Contains("投标报价:"))
                        {
                            matchmoney = anchorElementRegex.Matches(HtmlFromMarker(nodeHtml, "投标报价:"));
                        }
                        if (matchmoney != null && matchmoney.Count > 0)
                        {
                            decMoney = ParseInlineAmount(looseNumberRegex, nodeText, decMoney);
                        }
                    }
                    else
                    {
                        decMoney = ParseInlineAmount(looseNumberRegex, nodeText, decMoney);
                    }
                }
            }

            // A bidder name must contain a Chinese character and must not contain a number.
            bidUnit = bidUnit.Replace(" ", "").Trim();
            if (!chineseCharRegex.IsMatch(bidUnit) || strictNumberRegex.IsMatch(bidUnit))
            {
                bidUnit = "";
            }

            // Pass 2: <div> descendants of span.news_show, flattened to one line of text per div.
            if (string.IsNullOrEmpty(bidUnit) || decMoney <= 0)
            {
                string txt = string.Empty;
                parser = new Parser(new Lexer(dlHtml));
                NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "news_show")), true)));
                if (dtList != null && dtList.Count > 1)
                {
                    for (int k = 0; k < dtList.Count; k++)
                    {
                        string lineText = dtList[k].ToPlainTextString().Trim();
                        if (lineText.Contains("中标候选人") || lineText.Contains("中标人"))
                        {
                            try
                            {
                                // Fails (and is deliberately swallowed below) when the marker is the last div.
                                bool nextIsBlank = string.IsNullOrEmpty(dtList[k + 1].ToPlainTextString().Trim());
                                // The marker line is appended exactly once so the name is not duplicated.
                                txt += lineText;
                                string head = txt.Remove(txt.Length - txt.IndexOf("为:") - 2);
                                if (string.IsNullOrEmpty(head))
                                {
                                    // The text ends with its first "为:": the name is in the following
                                    // div (one blank div is skipped), so join it onto the same line.
                                    txt += dtList[nextIsBlank ? k + 2 : k + 1].ToPlainTextString().Trim();
                                }
                                txt += "\r\n";
                            }
                            catch
                            {
                                // Best effort: a marker without usable following divs is ignored.
                            }
                        }
                        else
                        {
                            txt += lineText + "\r\n";
                        }
                    }

                    if (string.IsNullOrEmpty(bidUnit))
                    {
                        bidUnit = ExtractBidUnitLine(txt);
                    }
                    if (decMoney <= 0)
                    {
                        decMoney = ParseFallbackAmount(divAmountLineRegex, strictNumberRegex, txt, decMoney);
                    }
                }
            }

            // Pass 3: <p> descendants of span.news_show.
            if (string.IsNullOrEmpty(bidUnit) || decMoney <= 0)
            {
                string txt = string.Empty;
                parser = new Parser(new Lexer(dlHtml));
                NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("p"), new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "news_show")), true)));
                if (dtList != null && dtList.Count > 1)
                {
                    for (int k = 0; k < dtList.Count; k++)
                    {
                        string lineText = dtList[k].ToPlainTextString().Trim();
                        if (lineText.Contains("中标候选人") || lineText.Contains("中标人"))
                        {
                            // A marker in the last paragraph has no successor; it is handled like a
                            // marker followed by a non-blank paragraph.
                            bool nextIsBlank = k + 1 < dtList.Count && string.IsNullOrEmpty(dtList[k + 1].ToPlainTextString().Trim());
                            if (nextIsBlank)
                            {
                                // Skip to the blank paragraph and append its (empty) text.
                                k++;
                                txt += dtList[k].ToPlainTextString().Trim();
                            }
                            else
                            {
                                txt += lineText;
                                string head = txt.Remove(txt.Length - txt.IndexOf("为:") - 2);
                                // From here on only the marker line is kept. It stays open (no line
                                // break) when the text ended with its first "为:", so the next
                                // paragraph continues the same line.
                                txt = string.IsNullOrEmpty(head) ? lineText : lineText + "\r\n";
                            }
                        }
                        else
                        {
                            txt += lineText + "\r\n";
                        }
                        txt = scriptOrXmlRegex.Replace(txt, "");
                    }

                    if (string.IsNullOrEmpty(bidUnit))
                    {
                        bidUnit = ExtractBidUnitLine(txt);
                    }
                    // Only fill the amount when no earlier pass found one (same rule as pass 2).
                    if (decMoney <= 0)
                    {
                        decMoney = ParseFallbackAmount(paragraphAmountLineRegex, strictNumberRegex, txt, decMoney);
                    }
                }
            }

            prjName = ToolDb.GetPrjName(prjName.Replace(" ", ""));
            bidType = ToolHtml.GetInviteTypes(prjName);
            BidInfo info = ToolDb.GenBidInfo("广东省", "广州市区", "番禺区", string.Empty, string.Empty, prjName, buildUnit, beginDate, bidUnit, beginDate, string.Empty, ctxStr, string.Empty, "广州市番禺区建设局", bidType, "建设工程", string.Empty, decMoney.ToString(), InfoUrl, string.Empty, HtmlTxt);
            list.Add(info);

            try
            {
                for (int n = 0; n < fileNode.Count; n++)
                {
                    // Each attachment uses its own link and text.
                    ATag fileTag = fileNode[n] as ATag;
                    BaseAttach attach = ToolDb.GenBaseAttach(fileTag.StringText, info.Id, "http://218.20.201.20" + fileTag.Link);
                    base.AttachList.Add(attach);
                }
            }
            catch
            {
                // Best effort: a failure while registering attachments must not lose the BidInfo.
            }

            dlParser.Reset();
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}

/// <summary>Returns <paramref name="html"/> from the first occurrence of <paramref name="marker"/> on, or all of it when the marker does not occur.</summary>
private static string HtmlFromMarker(string html, string marker)
{
    int start = html.IndexOf(marker);
    return start >= 0 ? html.Substring(start) : html;
}

/// <summary>Returns the plain text of the node after <paramref name="index"/>, or an empty string when there is none.</summary>
private static string NextPlainText(NodeList nodes, int index)
{
    return index + 1 < nodes.Count ? nodes[index + 1].ToPlainTextString() : string.Empty;
}

/// <summary>
/// Parses the first number in <paramref name="text"/>; it is returned unchanged when the text
/// contains "万元" and divided by 10000 otherwise. Returns <paramref name="current"/> when no
/// parsable number is present.
/// </summary>
private static decimal ParseInlineAmount(Regex numberRegex, string text, decimal current)
{
    Match number = numberRegex.Match(text);
    decimal value;
    if (!number.Success || !decimal.TryParse(number.Value, out value))
    {
        return current;
    }
    return text.Contains("万元") ? value : value / 10000;
}

/// <summary>
/// Finds the first "中标单位/中标候选单位/中标候选人为/中标人为:" line in <paramref name="txt"/> and
/// returns what is left after removing the keyword and colons; empty when no such line exists.
/// </summary>
private static string ExtractBidUnitLine(string txt)
{
    Regex regBidUnit = new Regex(@"(中标单位|中标候选单位|中标候选人为|中标人为):[^\r\n]+\r\n");
    return regBidUnit.Match(txt.Replace("\r\n\r\n", "")).Value.Replace("中标候选人为", "").Replace("中标人为", "").Replace("中标单位:", "").Replace("中标候选单位:", "").Replace(":", "").Trim();
}

/// <summary>
/// Finds the first amount line matched by <paramref name="amountLineRegex"/> in
/// <paramref name="txt"/> and parses its number. The number is used as-is when the line mentions
/// "万元" or "万美元"; otherwise it is divided by 10000 and results below 0.1 are treated as 0.
/// Returns <paramref name="current"/> when the line holds no number and 0 when the number cannot
/// be parsed.
/// </summary>
private static decimal ParseFallbackAmount(Regex amountLineRegex, Regex numberRegex, string txt, decimal current)
{
    // Strip the keyword, colons and thousands separators (ASCII and full-width forms).
    string amountText = amountLineRegex.Match(txt).Value.Replace("中标价为", "").Replace("中标价", "").Replace("金额", "").Replace("价格", "").Replace("报价", "").Replace(":", "").Replace("\uFF1A", "").Replace(",", "").Replace("\uFF0C", "").Trim();
    string number = numberRegex.Match(amountText).Value;
    if (string.IsNullOrEmpty(number))
    {
        return current;
    }

    decimal value;
    if (!decimal.TryParse(number, out value))
    {
        return 0;
    }
    if (amountText.Contains("万元") || amountText.Contains("万美元"))
    {
        return value;
    }
    value = value / 10000;
    return value < 0.1m ? 0 : value;
}