/// <summary>
/// Cleans a fetched text fragment: turns <c>&lt;p&gt;</c> into line breaks, removes
/// <c>document.write</c> wrappers, passes the text through <c>Often.OutTxt</c>, removes every
/// space and collapses runs of consecutive line breaks into one.
/// </summary>
/// <param name="str">Text to clean. Must not be null (the first <c>Replace</c> call would throw).</param>
/// <returns>The cleaned text.</returns>
private string Filtrate(string str)
{
    // Paragraph tags become line breaks before any other processing.
    str = str.Replace("<p>", "\r\n");

    // Removal order matters: the longer wrappers have to go before the shorter
    // fragments they contain (e.g. "document.write('" before "document.write" and "('").
    string[] scriptFragments =
    {
        "document.write(\"",
        "\");",
        "document.write('",
        "');",
        "document.write",
        "')",
        "('"
    };
    foreach (string fragment in scriptFragments)
    {
        str = str.Replace(fragment, "");
    }

    // Project helper defined outside this block.
    // NOTE(review): presumably strips the remaining markup - confirm against Often.OutTxt.
    str = Often.OutTxt(str);

    // Every space is removed, so no later pass for collapsing repeated spaces is needed.
    // NOTE(review): if this literal was meant to be another whitespace character
    // (for example a non-breaking or full-width space), restore it here and add an
    // explicit "two spaces -> one space" collapse after the line-break loop below.
    str = str.Replace(" ", "");

    // string.Replace is ordinal, so the search is ordinal too; the loop condition can
    // then never report a match that Replace is unable to rewrite.
    while (str.IndexOf("\r\n\r\n", StringComparison.Ordinal) > -1)
    {
        str = str.Replace("\r\n\r\n", "\r\n");
    }

    return str;
}
/// <summary>
/// Event handler (named for a click on <c>bt09</c>): takes the value returned by
/// <c>GetInput</c>, runs it through <c>Often.OutTxt</c> and then <c>Often.OutText</c>,
/// and passes the result to <c>GetOut</c>.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Event data; not used.</param>
private void bt09_Click(object sender, EventArgs e)
{
    var rawInput = GetInput();
    var firstPass = Often.OutTxt(rawInput);
    var secondPass = Often.OutText(firstPass);

    GetOut(secondPass);
}
/// <summary>
/// Gathers data starting from the current list URL (<c>NowUrl</c>) and its already downloaded
/// text (<c>ListLinkContent</c>), and keeps moving to further list pages for as long as both
/// values are non-blank.
/// </summary>
/// <remarks>
/// Each list page that has not been handled yet is scanned for content-page links; every
/// unseen content page is downloaded and turned into one row of <c>gi.gdt</c>. The method
/// returns immediately once <c>GatherCount</c> reaches <c>gi.ws.GatherMaxNum</c>. When
/// <c>gi.ws.IsNext</c> is not 1, only the starting list page is processed.
/// </remarks>
public void GatherOper()
{
    // Position inside CentUrlList; only advanced when gi.ws.NextMode == 1.
    int centIndex = 0;

    while (NowUrl.Trim() != "" && ListLinkContent.Trim() != "")
    {
        // A list page that was handled before contributes no rows, but paging below still runs.
        if (!IsListLinkEcho(NowUrl) && GatherListPageRows())
        {
            // The configured maximum number of gathered rows has been reached.
            return;
        }

        string finishedListUrl = NowUrl;
        NowUrlList.Add(NowUrl);
        NowUrl = "";

        if (gi.ws.IsNext == 1)
        {
            NowUrl = FindNextListUrl(finishedListUrl, ref centIndex);

            if (NowUrl != "" && !IsListLinkEcho(NowUrl))
            {
                string fetchError = "";
                ListLinkContent = wapp.GatherApp.GetHttp(det, NowUrl, gi.ws.CodingMode, ref PageEncoding, ref PageCoding, ref fetchError);
                if (fetchError != "")
                {
                    // Blank content ends the outer loop on its next condition check.
                    ListLinkContent = "";
                    lout.OT(1, "列表页采集失败,错误信息:" + fetchError + ",[Url:" + NowUrl + "]");
                }
            }
            else
            {
                // No next page, or the next page was already handled: stop after this pass.
                ListLinkContent = "";
            }
        }

        NowUrlCount++;
    }
}

/// <summary>
/// Extracts the content-page links from the current list page and gathers one row per
/// content page that has not been seen before.
/// </summary>
/// <returns>
/// <c>true</c> when <c>GatherCount</c> has reached <c>gi.ws.GatherMaxNum</c> and gathering
/// must stop; otherwise <c>false</c>.
/// </returns>
private bool GatherListPageRows()
{
    // Optionally narrow the list page to the region that holds the links.
    string linkArea;
    if (gi.ws.UrlRangeRule != "")
    {
        linkArea = wapp.GatherApp.GetRegTxtString(ListLinkContent, gi.ws.UrlRangeRule, gi.ws.UrlRangeRuleIndex);
        lout.OT(1, "获取内容页链接列表范围匹配内容。[" + linkArea.Length.ToString() + "]");
    }
    else
    {
        linkArea = ListLinkContent;
    }

    if (linkArea.Trim() == "")
    {
        return false;
    }

    List<string> contentLinks = wapp.GatherApp.GetRegTxtList(linkArea, gi.ws.UrlRule, gi.ws.UrlRuleIndex);
    lout.OT(1, "获取内容页链接匹配数组[" + contentLinks.Count.ToString() + "]");

    for (int linkIndex = 0; linkIndex < contentLinks.Count; linkIndex++)
    {
        // Both timestamps are reset for every link, including links that are skipped below.
        StartDate = DateTime.Now;
        EndDate = StartDate;

        string pageUrl = wapp.GatherApp.FormatUrl(NowUrl, contentLinks[linkIndex].Trim());
        if (IsContLinkEcho(pageUrl))
        {
            continue;
        }

        NowContLinkList.Add(pageUrl);

        string fetchError = "";
        string pageText = wapp.GatherApp.GetHttp(det, pageUrl, gi.ws.CodingMode, ref PageEncoding, ref PageCoding, ref fetchError);
        if (fetchError != "")
        {
            // A failed download is only logged; a row is still built from whatever was returned.
            lout.OT(1, "获取内容页起始url内容失败,错误信息:" + fetchError + "。[Url:" + pageUrl + "]");
        }
        else
        {
            lout.OT(1, "获取内容页起始url内容。[" + pageUrl + "][" + pageText.Length.ToString() + "]");
        }

        DataRow row = gi.gdt.NewRow();
        for (int fieldIndex = 0; fieldIndex < gi.wsilist.Count; fieldIndex++)
        {
            string fieldValue = ReadFieldValue(pageText, fieldIndex);

            // The extracted text is stored unchanged for every content type.
            // NOTE(review): ContentType 1 and 2 appear to mean number and date; no
            // Often.IsNum / Often.IsDate validation is applied before storing - confirm
            // whether invalid values should be rejected or blanked instead.
            row[fieldIndex] = fieldValue;

            if (fieldValue != "")
            {
                var field = gi.wsilist[fieldIndex];
                lout.OT(1, "[" + field.ListTitle + "(" + field.ListCode + ")]内容:[" + fieldValue + "]采集成功!");
            }
        }

        gi.gdt.Rows.Add(row);
        GatherCount++;
        if (GatherCount >= gi.ws.GatherMaxNum)
        {
            // Checked before the flush below, so the rows currently held in gi.gdt stay in memory.
            return true;
        }

        FlushGatheredRowsIfFull();
    }

    return false;
}

/// <summary>
/// Extracts the value of the field at <paramref name="fieldIndex"/> in <c>gi.wsilist</c>
/// from a content page.
/// </summary>
/// <param name="pageText">Downloaded text of the content page.</param>
/// <param name="fieldIndex">Index of the field definition in <c>gi.wsilist</c>.</param>
/// <returns>
/// The decoded match of the field's content rule; for <c>ContentType == 2</c> it is
/// additionally passed through <c>Often.OutTxt</c> and <c>wapp.GatherApp.FormatDate</c>.
/// </returns>
private string ReadFieldValue(string pageText, int fieldIndex)
{
    var field = gi.wsilist[fieldIndex];

    // Optionally narrow the page to the region configured for this field.
    string searchArea = field.ContentRangeRule != ""
        ? wapp.GatherApp.GetRegTxtString(pageText, field.ContentRangeRule, field.ContentRangeRuleIndex)
        : pageText;

    string decoded = wapp.GatherApp.GetUnCode(
        wapp.GatherApp.GetRegsTxt(searchArea, field.ContentRule, field.ContentRuleIndex),
        field.ContentCodeMode);

    if (field.ContentType == 2)
    {
        return wapp.GatherApp.FormatDate(Often.OutTxt(decoded));
    }

    return decoded;
}

/// <summary>
/// Writes the rows held in <c>gi.gdt</c> to an Excel file and clears the table once it
/// holds more than 999 rows.
/// </summary>
private void FlushGatheredRowsIfFull()
{
    if (gi.gdt.Rows.Count <= 999)
    {
        return;
    }

    string saveFile = GetExcelSaveFileName();
    lout.OT(1, "保存数据到Excel文件操作开始[文件名:" + saveFile + "]...");

    wapp.Excel excel = new wapp.Excel();
    excel.DataTableToExcel(gi.gdt);
    excel.WriteFile(saveFile);
    gi.gdt.Clear();

    lout.OT(1, "保存数据到Excel文件操作结束[文件名:" + saveFile + "]");
}

/// <summary>
/// Determines the URL of the next list page.
/// </summary>
/// <param name="currentListUrl">URL of the list page that has just been processed; used to resolve the extracted link.</param>
/// <param name="centIndex">Position of the next entry to take from <c>CentUrlList</c>; advanced when an entry is used.</param>
/// <returns>The next list URL, or an empty string when none was found.</returns>
private string FindNextListUrl(string currentListUrl, ref int centIndex)
{
    if (gi.ws.NextMode == 1)
    {
        // List paging, single-page gather mode: take the next entry of CentUrlList.
        if (CentUrlList.Count > 0 && centIndex < CentUrlList.Count)
        {
            string nextFromList = CentUrlList[centIndex].Trim();
            centIndex++;
            return nextFromList;
        }

        return "";
    }

    // List paging, multi-page gather mode: extract the next-page link from the current list page.
    string nextLink;
    if (gi.ws.NextRangeRule != "")
    {
        string pagerArea = wapp.GatherApp.GetRegTxtString(ListLinkContent, gi.ws.NextRangeRule, gi.ws.NextRangeRuleIndex);
        if (pagerArea == "")
        {
            return "";
        }

        nextLink = wapp.GatherApp.GetRegsTxt(pagerArea, gi.ws.NextRule, gi.ws.NextRuleIndex);
    }
    else
    {
        nextLink = wapp.GatherApp.GetRegsTxt(ListLinkContent, gi.ws.NextRule, gi.ws.NextRuleIndex);
    }

    if (nextLink == "")
    {
        return "";
    }

    return wapp.GatherApp.GetUrls(wapp.GatherApp.FormatUrl(currentListUrl, nextLink));
}