/// <summary>
/// Empties the pending link queue, announces the start of the task and then builds the list of
/// addresses to collect on a background task.
/// </summary>
/// <remarks>
/// When <c>modelTask.IsSpiderUrl</c> is 1 a <c>SpiderListHelper</c> analyses the list pages and
/// every reported address that is neither queued already nor found by
/// <c>DALContentHelper.ChkExistSpiderResult</c> is added to the queue. In every case
/// <c>OutTaskStatusHandler</c> is raised with <c>EnumTaskType.View</c> once the work is done.
/// </remarks>
private void StartList() {
    _listLinkUrl.Clear();
    MessageOut($"[{modelTask.TaskName}]开始采集数据!请稍候...");

    new TaskFactory().StartNew(() => {
        if (modelTask.IsSpiderUrl != 1) {
            // List collection is switched off for this task.
            MessageOut("采集列表关闭,不需要采集!");
        }
        else {
            // Load the list of addresses that still have to be collected.
            var analyzer = new SpiderListHelper();
            analyzer.Model = modelTask;

            analyzer.OutTreeNodeHandler += (string url, string title, string cover, int nodeIndex) => {
                var candidate = new ModelLinkUrl() { Url = url, Title = title, Cover = cover };

                // Ignore addresses that are already waiting in the queue.
                if (Array.Exists(_listLinkUrl.ToArray(), queued => queued.Url == url)) {
                    return;
                }

                string report = url + "==" + HtmlHelper.Instance.ParseTags(title);
                bool alreadyCollected = DALContentHelper.ChkExistSpiderResult(modelTask.TaskName, url);
                if (alreadyCollected) {
                    report += "采集地址存在!不需要采集!";
                }
                else {
                    _listLinkUrl.Enqueue(candidate);
                }
                MessageOut(report);
            };

            // Forward the analyser's own messages unchanged.
            analyzer.OutMessageHandler += (string text) => MessageOut(text);

            analyzer.AnalyzeAllList();
            MessageOut("分析获取网页个数为" + _listLinkUrl.Count + "个!");
            MessageOut("采集网站列表完成!");
        }

        OutTaskStatusHandler?.Invoke(EnumTaskType.View);
    });
}
/// <summary>
/// Collects the address list: downloads one list page, extracts the links it contains and queues
/// those that pass the task's URL filters, are not queued yet and are not reported as existing by
/// <c>DALContentHelper.ChkExistSpiderResult</c>.
/// </summary>
/// <param name="urlList">Address of the list page to download and scan.</param>
private void GetAllLinkUrl(string urlList) {
    string listHtml = CollectionHelper.Instance.GetHttpPage(urlList, 100000, Encoding.GetEncoding(modelTask.PageEncode));

    // These two marker strings are treated as a failed download.
    bool downloadFailed = listHtml == "$StartFalse$" || listHtml == "$EndFalse$";
    if (downloadFailed) {
        MessageOut(urlList + "采集地址失败!结果:" + listHtml);
        return;
    }

    // Optionally narrow the page down to the configured start/end area.
    bool hasCutArea = modelTask.LinkUrlCutAreaStart != null && modelTask.LinkUrlCutAreaEnd != null;
    if (hasCutArea) {
        string encodedPage = HtmlHelper.Instance.ParseCollectionStrings(listHtml);
        string areaStart = HtmlHelper.Instance.ParseCollectionStrings(modelTask.LinkUrlCutAreaStart);
        string areaEnd = HtmlHelper.Instance.ParseCollectionStrings(modelTask.LinkUrlCutAreaEnd);
        string area = CollectionHelper.Instance.GetBody(encodedPage, areaStart, areaEnd, false, false);
        listHtml = HtmlHelper.Instance.UnParseCollectionStrings(area);
    }

    // Default link pattern, replaced by the hand-written one when the task asks for it.
    string linkPattern = cRegexHelper.RegexATag;
    if (modelTask.IsHandGetUrl == 1) {
        linkPattern = HtmlHelper.Instance.ParseCollectionStrings(modelTask.HandCollectionUrlRegex)
            .Replace("\\(\\*)", ".+?")
            .Replace("\\[参数]", "([\\S\\s].*?)");
    }

    Regex linkRegex = new Regex(linkPattern, RegexOptions.IgnoreCase | RegexOptions.Compiled);
    for (Match hit = linkRegex.Match(listHtml); hit.Success; hit = hit.NextMatch()) {
        Thread.Sleep(1);

        // Group 2 carries the title, group 1 the address; links without a title are skipped.
        string linkTitle = hit.Groups[2].Value;
        if (string.IsNullOrEmpty(linkTitle)) {
            continue;
        }

        string linkUrl = CollectionHelper.Instance.FormatUrl(urlList, hit.Groups[1].Value).Replace("\\", "");

        // Must contain the configured text.
        if (modelTask.LinkUrlMustIncludeStr != null
            && linkUrl.IndexOf(Convert.ToString(modelTask.LinkUrlMustIncludeStr)) < 0) {
            continue;
        }

        // Must not contain any of the "||"-separated texts.
        if (modelTask.LinkUrlNoMustIncludeStr != null) {
            string[] banned = modelTask.LinkUrlNoMustIncludeStr.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries);
            if (Array.Exists(banned, fragment => linkUrl.IndexOf(fragment) >= 0)) {
                continue;
            }
        }

        ModelLinkUrl link = new ModelLinkUrl();
        link.Url = linkUrl;
        link.Title = linkTitle;

        // Skip addresses that are already waiting in the queue.
        if (Array.Exists(_listLinkUrl.ToArray(), queued => queued.Url == linkUrl)) {
            continue;
        }

        // Filter out addresses that already exist in the stored results.
        string report = linkUrl + "==" + HtmlHelper.Instance.ParseTags(linkTitle);
        bool alreadyCollected = DALContentHelper.ChkExistSpiderResult(modelTask.TaskName, linkUrl);
        if (alreadyCollected) {
            report += "采集地址存在!不需要采集!";
        }
        else {
            _listLinkUrl.Enqueue(link);
        }
        MessageOut(report);
    }
}
/// <summary>
/// Collects one queued detail page: downloads it, extracts every configured label, post-processes
/// the extracted text and stores the result as one row of the <c>Content</c> table in the task's
/// local SQLite file.
/// </summary>
/// <remarks>
/// Nothing happens unless <c>modelTask.IsSpiderContent</c> is 1. When the link queue is empty only a
/// notice is raised through <c>GatherWorkDelegate</c>. In both cases the method ends by sleeping for
/// 2, 4 or 6 seconds, chosen at random.
/// </remarks>
/// <param name="index">Not read by this method.</param>
/// <param name="threadindex">Not read by this method.</param>
private void Run_ViewUrl(int index, int threadindex) {
    if (modelTask.IsSpiderContent != 1) {
        return;
    }

    if (_listLinkUrl.Count > 0) {
        ProressNum++;
        if (OutPutTaskProgressBarDelegate != null) {
            MainEvents.OutPutTaskProgressBarEventArgs ea = new MainEvents.OutPutTaskProgressBarEventArgs();
            ea.ProgressNum = ProressNum;
            ea.RecordNum = TaskCount;
            ea.TaskIndex = TaskIndex;
            OutPutTaskProgressBarDelegate(this, ea);
        }

        ModelLinkUrl mlink = _listLinkUrl.Dequeue();
        string url = mlink.Url;
        // The address goes into SQL text below, so its quotes are doubled exactly like label values.
        string escapedUrl = (url ?? string.Empty).Replace("'", "''");

        string pageContent = CollectionHelper.Instance.GetHttpPage(url, 1000, Encoding.GetEncoding(modelTask.PageEncode));
        string title = CollectionHelper.Instance.CutStr(pageContent, "<title>([\\S\\s]*?)</title>")[0];

        // HrefSource is always the first column; every label adds one column and one quoted value.
        List<string> columnNames = new List<string>();
        List<string> columnValues = new List<string>();
        columnNames.Add("HrefSource");
        columnValues.Add("'" + escapedUrl + "'");

        const string pagerSeparator = "=====分页内容=======================================================\r\n";

        foreach (ModelTaskLabel m in modelTask.ListTaskLabel) {
            string labelRegex = HtmlHelper.Instance.ParseCollectionStrings(m.LabelNameCutRegex);
            labelRegex = CommonHelper.ReplaceSystemRegexTag(labelRegex);
            string cutContent = CollectionHelper.Instance.CutStr(pageContent, labelRegex)[0];

            // Rewrite the image references found in the content as remote addresses.
            string[] tagImgList = CollectionHelper.Instance.GetImgTag(cutContent);
            foreach (string tagimg in tagImgList) {
                if (string.IsNullOrEmpty(tagimg)) {
                    break;
                }
                // Remote address of the image.
                string newTagImg = CollectionHelper.Instance.FormatUrl(modelTask.TestViewUrl, tagimg);
                // Replace the reference inside the content.
                cutContent = cutContent.Replace(tagimg, newTagImg);

                // Queue the remote image for download under a time-stamped local name.
                if (m.IsDownResource == 1) {
                    // NOTE(review): FileInfo is used only to read the extension of the image address —
                    // confirm it accepts absolute URLs on the target runtime.
                    FileInfo fImg = new FileInfo(newTagImg);
                    string ext = fImg.Extension;
                    ext = string.IsNullOrEmpty(ext) ? ".jpg" : ext;
                    // NOTE(review): the stamp has one-second resolution, so images queued within the
                    // same second get the same local name — confirm the consumer copes with that.
                    string newTimeImg = "images/" + DateTime.Now.ToString("yyyyMMddHHmmss") + ext;
                    lock (QueueHelper.lockObj) {
                        var d = new Dictionary<string, string>();
                        d.Add(newTagImg, newTimeImg);
                        QueueHelper.Q_DownImgResource.Enqueue(d);
                    }
                }
            }

            // Looping label: append every match, separated by "$$$$", then drop the last separator.
            if (m.IsLoop == 1) {
                string[] loopMatches = CollectionHelper.Instance.CutStr(pageContent, labelRegex);
                foreach (string s in loopMatches) {
                    cutContent += s + "$$$$";
                }
                int n = cutContent.LastIndexOf("$$$$");
                if (n >= 0) {
                    cutContent = cutContent.Remove(n, 4);
                }
            }

            // Link label: the content holds addresses; follow each and cut the value from that page.
            if (m.IsLinkUrl == 1) {
                string[] linkedUrls = cutContent.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries);
                foreach (string sUrl in linkedUrls) {
                    cutContent = CollectionHelper.Instance.DefiniteUrl(sUrl, modelTask.TestViewUrl); // address
                    cutContent = CollectionHelper.Instance.GetHttpPage(cutContent, 1000, Encoding.GetEncoding(modelTask.PageEncode));
                    string linkRegex = HtmlHelper.Instance.ParseCollectionStrings(m.LabelValueLinkUrlRegex);
                    linkRegex = linkRegex.Replace("\\(\\*)", ".+?");
                    linkRegex = linkRegex.Replace("\\[参数]", "([\\S\\s].*?)");
                    cutContent = CollectionHelper.Instance.CutStr(cutContent, linkRegex)[0];
                }
            }

            // Pager label: fetch every further page and append the same label cut from it.
            if (m.IsPager == 1) {
                string pagerRegex = HtmlHelper.Instance.ParseCollectionStrings(m.LabelValuePagerRegex);
                pagerRegex = pagerRegex.Replace("\\(\\*)", ".+?");
                pagerRegex = pagerRegex.Replace("\\[参数]", "([\\S\\s].*?)");
                string[] pagerUrls = CollectionHelper.Instance.CutStr(pageContent, pagerRegex);
                foreach (string pageUrl in pagerUrls) {
                    string url1 = CollectionHelper.Instance.DefiniteUrl(pageUrl, url);
                    string pageContentPager = CollectionHelper.Instance.GetHttpPage(url1, 100000);
                    // The failure markers must be looked for in the page that was just downloaded,
                    // not in the main page.
                    if (pageContentPager == "$UrlIsFalse$" || pageContentPager == "$GetFalse$") {
                        cutContent += pagerSeparator;
                        cutContent += "远程链接内容失败!";
                    }
                    else {
                        // Cut the label again, this time from the pager page.
                        string regContent1 = HtmlHelper.Instance.ParseCollectionStrings(m.LabelNameCutRegex);
                        regContent1 = CommonHelper.ReplaceSystemRegexTag(regContent1);
                        string cutContent1 = CollectionHelper.Instance.CutStr(pageContentPager, regContent1)[0];
                        cutContent += pagerSeparator;
                        cutContent += cutContent1;
                    }
                }
            }

            // Strip the HTML kinds listed in LabelHtmlRemove ("||||"-separated).
            if (!string.IsNullOrEmpty(m.LabelHtmlRemove)) {
                string[] arr = m.LabelHtmlRemove.Split(new string[] { "||||" }, StringSplitOptions.RemoveEmptyEntries);
                foreach (string str in arr) {
                    if (str == "all") {
                        cutContent = CollectionHelper.Instance.NoHtml(cutContent);
                        break;
                    }
                    else if (str == "table") {
                        cutContent = CollectionHelper.Instance.ScriptHtml(cutContent, "table", 2);
                    }
                    else if (str == "font<span>") {
                        cutContent = CollectionHelper.Instance.ScriptHtml(cutContent, "font", 3);
                        cutContent = CollectionHelper.Instance.ScriptHtml(cutContent, "span", 3);
                    }
                    else if (str == "a") {
                        cutContent = CollectionHelper.Instance.ScriptHtml(cutContent, "a", 3);
                    }
                }
            }

            // Remove the excluded texts ("$$$$"-separated).
            if (!string.IsNullOrEmpty(m.LabelRemove)) {
                foreach (string str in m.LabelRemove.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries)) {
                    cutContent = cutContent.Replace(str, "");
                }
            }

            // Apply the replacement rules ("$$$$"-separated, each rule written as "old||new").
            if (!string.IsNullOrEmpty(m.LabelReplace)) {
                foreach (string str in m.LabelReplace.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries)) {
                    string[] rule = str.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries);
                    if (rule.Length < 2) {
                        // A rule without both sides cannot be applied; skip it instead of throwing.
                        continue;
                    }
                    cutContent = cutContent.Replace(rule[0], rule[1]);
                }
            }

            columnNames.Add(m.LabelName.Replace("'", "''"));
            columnValues.Add("'" + cutContent.Replace("'", "''") + "'");

            // File download, only when the switch is on.
            if (m.IsDownResource == 1) {
                string downImgPath = AppDomain.CurrentDomain.BaseDirectory + "Data\\Collection\\" + modelTask.TaskName + "\\Images\\";
                // NOTE(review): the value for this label was already added above, so the text
                // returned here is not stored; the call is kept as in the original flow.
                cutContent = ImageDownHelper.SaveUrlPics(cutContent, downImgPath);
            }
        }

        string LocalSQLiteName = "Data\\Collection\\" + modelTask.TaskName + "\\SpiderResult.db";
        string sql = " Select Count(1) From Content Where HrefSource='" + escapedUrl + "' ";
        object o = SQLiteHelper.ExecuteScalar(LocalSQLiteName, sql);
        // Insert only when no row exists yet for this address.
        if (Convert.ToInt32("0" + o) == 0) {
            StringBuilder strSql = new StringBuilder();
            strSql.Append("insert into Content(");
            strSql.Append(string.Join(",", columnNames.ToArray()));
            strSql.Append(")");
            strSql.Append(" values (");
            strSql.Append(string.Join(",", columnValues.ToArray()));
            strSql.Append(")");
            SQLiteHelper.Execute(LocalSQLiteName, strSql.ToString());
        }

        // Report only the part of the page title before the first '_' or '-'.
        title = title.Replace('\\', ' ').Replace('/', ' ').Split(new char[] { '_' })[0].Split(new char[] { '-' })[0];
        gatherEv.Message = mlink.Url + "=" + title;
        GatherWorkDelegate(this, gatherEv);
    }
    else {
        gatherEv.Message = "没有采集到任何地址!不需要采集!";
        GatherWorkDelegate(this, gatherEv);
    }

    // Pause before the next page.
    var r = new Random();
    var stepNext = r.Next(1, 4);
    Thread.Sleep(stepNext * 2000);
}
/// <summary>
/// Collects the address list: downloads one list page, extracts the links it contains and queues
/// those that pass the task's URL filters, are not queued yet and are not reported as existing by
/// <c>DALContentHelper.ChkExistSpiderResult</c>.
/// </summary>
/// <remarks>
/// Two extraction modes exist. Without a splice template (<c>modelTask.LinkSpliceUrlStr</c> null or
/// blank) the address is regex group 1 and the title group 2, and links without a title are skipped.
/// With a splice template every <c>[参数]</c> placeholder of the hand-written pattern becomes a named
/// group (<c>参数1</c>, <c>参数2</c>, …) and the address is built by substituting those groups for the
/// template's <c>[参数N]</c> tokens; links found this way carry an empty title.
/// </remarks>
/// <param name="urlList">Address of the list page to download and scan.</param>
private void GetAllLinkUrl(string urlList) {
    string pageContent = CollectionHelper.Instance.GetHttpPage(urlList, 100000, Encoding.GetEncoding(modelTask.PageEncode));
    // These two marker strings are treated as a failed download.
    if (pageContent == "$StartFalse$" || pageContent == "$EndFalse$") {
        MessageOut(urlList + "采集地址失败!结果:" + pageContent);
        return;
    }

    // Optionally narrow the page down to the configured start/end area.
    if (modelTask.LinkUrlCutAreaStart != null && modelTask.LinkUrlCutAreaEnd != null) {
        pageContent = HtmlHelper.Instance.ParseCollectionStrings(pageContent);
        pageContent = CollectionHelper.Instance.GetBody(pageContent, HtmlHelper.Instance.ParseCollectionStrings(modelTask.LinkUrlCutAreaStart), HtmlHelper.Instance.ParseCollectionStrings(modelTask.LinkUrlCutAreaEnd), false, false);
        pageContent = HtmlHelper.Instance.UnParseCollectionStrings(pageContent);
    }

    // Evaluated once and null-safe: a missing template counts as "no splicing".
    bool useSplice = !string.IsNullOrWhiteSpace(modelTask.LinkSpliceUrlStr);

    string regexHref = cRegexHelper.RegexATag;
    int paramCount = 0;
    if (modelTask.IsHandGetUrl == 1) {
        regexHref = modelTask.HandCollectionUrlRegex;
        if (!useSplice) {
            regexHref = HtmlHelper.Instance.ParseCollectionStrings(regexHref);
            regexHref = regexHref.Replace("\\(\\*)", ".+?");
            regexHref = regexHref.Replace("\\[参数]", "([\\S\\s].*?)");
        }
        else {
            // Escape every '[' and then restore the placeholder itself.
            regexHref = regexHref.Replace("[", "\\[");
            regexHref = regexHref.Replace("\\[参数]", "[参数]");
            regexHref = regexHref.Replace("(*)", ".+?");

            // Turn each placeholder, left to right, into a numbered named group.
            int at;
            while ((at = regexHref.IndexOf("[参数]", StringComparison.Ordinal)) >= 0) {
                paramCount++;
                regexHref = regexHref.Remove(at, "[参数]".Length);
                regexHref = regexHref.Insert(at, "(?<参数" + paramCount + ">.+?)");
            }
        }
    }

    Regex reg = new Regex(regexHref, RegexOptions.IgnoreCase | RegexOptions.Compiled);
    if (!useSplice) {
        for (Match mch = reg.Match(pageContent); mch.Success; mch = mch.NextMatch()) {
            Thread.Sleep(1);
            string title = mch.Groups[2].Value;
            if (string.IsNullOrEmpty(title)) {
                continue;
            }
            string url = CollectionHelper.Instance.FormatUrl(urlList, mch.Groups[1].Value);
            AddLinkUrlIfAccepted(url.Replace("\\", ""), title);
        }
    }
    else {
        MatchCollection matches = reg.Matches(pageContent);
        for (int j = 0; j < matches.Count; j++) {
            Thread.Sleep(1);
            Match match = matches[j];

            // Fill the template with the values captured by the numbered groups.
            string aurl = modelTask.LinkSpliceUrlStr;
            for (int x = 1; x <= paramCount; x++) {
                aurl = aurl.Replace("[参数" + x.ToString() + "]", match.Groups["参数" + x.ToString()].Value);
            }

            string url = CollectionHelper.Instance.FormatUrl(urlList, aurl);
            // No title is extracted in this mode, so the link is queued with an empty one.
            AddLinkUrlIfAccepted(url.Replace("\\", ""), string.Empty);
        }
    }
}

/// <summary>
/// Applies the task's include/exclude URL filters to one extracted link and, when it is neither
/// queued already nor present in the stored results, adds it to the link queue.
/// </summary>
/// <param name="url">Formatted address of the link.</param>
/// <param name="title">Title extracted for the link; may be empty.</param>
private void AddLinkUrlIfAccepted(string url, string title) {
    // Must contain the configured text.
    if (modelTask.LinkUrlMustIncludeStr != null
        && url.IndexOf(Convert.ToString(modelTask.LinkUrlMustIncludeStr)) == -1) {
        return;
    }

    // Must not contain any of the "||"-separated texts.
    if (modelTask.LinkUrlNoMustIncludeStr != null) {
        foreach (string str in modelTask.LinkUrlNoMustIncludeStr.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries)) {
            if (url.IndexOf(str) > -1) {
                return;
            }
        }
    }

    // Skip addresses that are already waiting in the queue; these produce no message.
    foreach (var item in _listLinkUrl.ToArray()) {
        if (item.Url == url) {
            return;
        }
    }

    ModelLinkUrl m = new ModelLinkUrl();
    m.Url = url;
    m.Title = title;

    // Filter out addresses that already exist in the stored results.
    string msg = url + "==" + HtmlHelper.Instance.ParseTags(title);
    if (!DALContentHelper.ChkExistSpiderResult(modelTask.TaskName, url)) {
        _listLinkUrl.Enqueue(m);
    }
    else {
        msg += "采集地址存在!不需要采集!";
    }
    MessageOut(msg);
}