/// <summary>
/// Empties the pending link queue, announces the start of the run and launches a
/// background task. When the task has list spidering enabled (IsSpiderUrl == 1) the
/// background task runs a <c>SpiderListHelper</c> over the task and queues every link it
/// reports that is neither queued already nor flagged by
/// <c>DALContentHelper.ChkExistSpiderResult</c>. OutTaskStatusHandler is raised with
/// <c>EnumTaskType.View</c> at the end of the background work in either case.
/// </summary>
private void StartList() {
    _listLinkUrl.Clear();
    MessageOut($"[{modelTask.TaskName}]开始采集数据!请稍候...");

    new TaskFactory().StartNew(() => {
        if (modelTask.IsSpiderUrl != 1) {
            MessageOut("采集列表关闭,不需要采集!");
        }
        else {
            // Load the list of links that still have to be collected.
            var listSpider = new SpiderListHelper { Model = modelTask };

            listSpider.OutTreeNodeHandler += (url, title, cover, nodeIndex) => {
                var link = new ModelLinkUrl { Url = url, Title = title, Cover = cover };

                // Ignore links that are already waiting in the queue.
                foreach (var queued in _listLinkUrl.ToArray()) {
                    if (queued.Url == url) {
                        return;
                    }
                }

                string line = url + "==" + HtmlHelper.Instance.ParseTags(title);
                if (DALContentHelper.ChkExistSpiderResult(modelTask.TaskName, url)) {
                    line += "采集地址存在!不需要采集!";
                }
                else {
                    _listLinkUrl.Enqueue(link);
                }
                MessageOut(line);
            };
            listSpider.OutMessageHandler += text => { MessageOut(text); };

            listSpider.AnalyzeAllList();
            MessageOut("分析获取网页个数为" + _listLinkUrl.Count + "个!");
            MessageOut("采集网站列表完成!");
        }

        OutTaskStatusHandler?.Invoke(EnumTaskType.View);
    });
}
/// <summary>
/// Submit-button handler: takes the editor content (HTML when <c>isHtml</c> is set,
/// plain text otherwise), stores it through <c>DALContentHelper.UpdateContent</c>,
/// passes the cell and the new content to any <c>OutEdit</c> subscriber, then hides
/// and closes the form.
/// </summary>
private void btnSubmit_Click(object sender, EventArgs e) {
    string editedContent;
    if (isHtml) {
        editedContent = this.htmlEditor.InnerHtml;
    }
    else {
        editedContent = this.htmlEditor.InnerText;
    }

    DALContentHelper.UpdateContent(this.TaskName, this.Id, this.HeaderText, editedContent);
    OutEdit?.Invoke(this.Cell, editedContent);

    this.Hide();
    this.Close();
}
/// <summary>
/// Form Load handler: when <c>Id</c> is set, reads the stored value for the current
/// task / record / column through <c>DALContentHelper.GetContent</c> and shows it in
/// the HTML editor. <c>isHtml</c> is cleared when the value contains no "/&gt;" sequence.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void frmTaskDataEdit_Load(object sender, EventArgs e) {
    if (string.IsNullOrEmpty(Id)) {
        return;
    }

    object stored = DALContentHelper.GetContent(this.TaskName, this.Id, this.HeaderText);

    // Convert once, and treat a missing value as empty text instead of throwing
    // a NullReferenceException on stored.ToString().
    string content = Convert.ToString(stored) ?? string.Empty;

    if (content.IndexOf("/>") == -1) {
        isHtml = false;
    }
    this.htmlEditor.InnerHtml = content;
}
/// <summary>
/// Loads the pager's current page of content rows for <c>TaskName</c> and binds the
/// result to the pager's binding source, its navigator and the data grid.
/// </summary>
/// <returns>
/// The count written by <c>DALContentHelper.GetContentList</c> to its ref argument,
/// or 0 when <c>TaskName</c> is null or empty (nothing is bound in that case).
/// </returns>
private int Bind_DataList() {
    if (string.IsNullOrEmpty(this.TaskName)) {
        return 0;
    }

    int rowsPerPage = this.Pager.PageSize;
    // Offset of the first row of the current page (PageCurrent is used as 1-based here).
    int firstRow = (this.Pager.PageCurrent - 1) * rowsPerPage;
    int totalCount = 0;

    DataTable page = DALContentHelper.GetContentList(this.TaskName, firstRow, rowsPerPage, ref totalCount);

    this.Pager.bindingSource.DataSource = page;
    this.Pager.bindingNavigator.BindingSource = Pager.bindingSource;
    this.dataGridView_DataList.DataSource = this.Pager.bindingSource;

    return totalCount;
}
/// <summary>
/// Collects the list of link URLs from one list page: downloads the page, optionally
/// narrows it to the configured cut area, extracts links with either the default
/// anchor-tag regex or the task's hand-written pattern, and queues every link that
/// passes the include / exclude filters, is not queued yet and is not flagged by
/// <c>DALContentHelper.ChkExistSpiderResult</c>.
/// </summary>
/// <param name="urlList">
/// Address of the list page; also handed to <c>FormatUrl</c> together with each
/// extracted link.
/// </param>
private void GetAllLinkUrl(string urlList) {
    string html = CollectionHelper.Instance.GetHttpPage(urlList, 100000, Encoding.GetEncoding(modelTask.PageEncode));

    // GetHttpPage signals a failed download through these two marker strings.
    if (html == "$StartFalse$" || html == "$EndFalse$") {
        MessageOut(urlList + "采集地址失败!结果:" + html);
        return;
    }

    // Restrict the page to the configured start/end markers when both are set.
    if (modelTask.LinkUrlCutAreaStart != null && modelTask.LinkUrlCutAreaEnd != null) {
        string areaStart = HtmlHelper.Instance.ParseCollectionStrings(modelTask.LinkUrlCutAreaStart);
        string areaEnd = HtmlHelper.Instance.ParseCollectionStrings(modelTask.LinkUrlCutAreaEnd);
        html = CollectionHelper.Instance.GetBody(HtmlHelper.Instance.ParseCollectionStrings(html), areaStart, areaEnd, false, false);
        html = HtmlHelper.Instance.UnParseCollectionStrings(html);
    }

    string pattern = cRegexHelper.RegexATag;
    if (modelTask.IsHandGetUrl == 1) {
        // Turn the user's template into a regex: the "(*)" and "[参数]" placeholders
        // (in the escaped form matched below) become a wildcard and a capture group.
        pattern = HtmlHelper.Instance.ParseCollectionStrings(modelTask.HandCollectionUrlRegex)
            .Replace("\\(\\*)", ".+?")
            .Replace("\\[参数]", "([\\S\\s].*?)");
    }

    Regex linkRegex = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Compiled);

    for (Match hit = linkRegex.Match(html); hit.Success; hit = hit.NextMatch()) {
        Thread.Sleep(1);

        // Group 2 is used as the link title; matches without one are skipped.
        string title = hit.Groups[2].Value;
        if (string.IsNullOrEmpty(title)) {
            continue;
        }

        // Group 1 is used as the link address.
        string url = CollectionHelper.Instance.FormatUrl(urlList, hit.Groups[1].Value).Replace("\\", "");

        // Must-include filter.
        if (modelTask.LinkUrlMustIncludeStr != null
            && url.IndexOf(Convert.ToString(modelTask.LinkUrlMustIncludeStr)) < 0) {
            continue;
        }

        // Must-not-include filter: "||"-separated list of forbidden fragments.
        bool excluded = false;
        if (modelTask.LinkUrlNoMustIncludeStr != null) {
            string[] forbidden = modelTask.LinkUrlNoMustIncludeStr.Split(new[] { "||" }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string fragment in forbidden) {
                if (url.IndexOf(fragment) >= 0) {
                    excluded = true;
                    break;
                }
            }
        }
        if (excluded) {
            continue;
        }

        ModelLinkUrl link = new ModelLinkUrl { Url = url, Title = title };

        // Skip links that are already waiting in the queue.
        bool alreadyQueued = false;
        foreach (var queued in _listLinkUrl.ToArray()) {
            if (queued.Url == url) {
                alreadyQueued = true;
                break;
            }
        }
        if (alreadyQueued) {
            continue;
        }

        // Filter out links the database already knows about.
        string line = url + "==" + HtmlHelper.Instance.ParseTags(title);
        if (DALContentHelper.ChkExistSpiderResult(modelTask.TaskName, url)) {
            line += "采集地址存在!不需要采集!";
        }
        else {
            _listLinkUrl.Enqueue(link);
        }
        MessageOut(line);
    }
}
/// <summary>
/// Collects the list of link URLs from one list page.
/// </summary>
/// <remarks>
/// The page is downloaded, optionally narrowed to the configured cut area, and scanned
/// with either the default anchor-tag regex or the task's hand-written pattern.
/// Two extraction modes exist:
/// <list type="bullet">
/// <item>No splice template (<c>LinkSpliceUrlStr</c> null or blank): group 1 of each
/// match is the link address and group 2 its title; matches with an empty title are
/// skipped.</item>
/// <item>Splice template set: every "[参数]" placeholder of the hand-written pattern
/// becomes a named group "参数N", and each "[参数N]" token of the template is replaced
/// by that group's value to build the address. The title stays empty in this mode.
/// NOTE(review): confirm that an empty title is intended for spliced links.</item>
/// </list>
/// Each resulting address then goes through the include / exclude filters and is
/// queued unless it is already queued or flagged by
/// <c>DALContentHelper.ChkExistSpiderResult</c>.
/// </remarks>
/// <param name="urlList">
/// Address of the list page; also handed to <c>FormatUrl</c> together with each
/// extracted or spliced link.
/// </param>
private void GetAllLinkUrl(string urlList) {
    string pageContent = CollectionHelper.Instance.GetHttpPage(urlList, 100000, Encoding.GetEncoding(modelTask.PageEncode));

    // GetHttpPage signals a failed download through these two marker strings.
    if (pageContent == "$StartFalse$" || pageContent == "$EndFalse$") {
        MessageOut(urlList + "采集地址失败!结果:" + pageContent);
        return;
    }

    // Restrict the page to the configured start/end markers when both are set.
    if (modelTask.LinkUrlCutAreaStart != null && modelTask.LinkUrlCutAreaEnd != null) {
        pageContent = HtmlHelper.Instance.ParseCollectionStrings(pageContent);
        pageContent = CollectionHelper.Instance.GetBody(
            pageContent,
            HtmlHelper.Instance.ParseCollectionStrings(modelTask.LinkUrlCutAreaStart),
            HtmlHelper.Instance.ParseCollectionStrings(modelTask.LinkUrlCutAreaEnd),
            false,
            false);
        pageContent = HtmlHelper.Instance.UnParseCollectionStrings(pageContent);
    }

    // Evaluated once and null-safe: a missing splice template means "no splicing"
    // instead of a NullReferenceException on Trim().
    bool spliceUrl = !string.IsNullOrWhiteSpace(modelTask.LinkSpliceUrlStr);

    string regexHref = cRegexHelper.RegexATag;
    int paramCount = 0; // number of named "参数N" groups generated for the splice template
    if (modelTask.IsHandGetUrl == 1) {
        regexHref = modelTask.HandCollectionUrlRegex;
        if (!spliceUrl) {
            regexHref = HtmlHelper.Instance.ParseCollectionStrings(regexHref);
            regexHref = regexHref.Replace("\\(\\*)", ".+?");
            regexHref = regexHref.Replace("\\[参数]", "([\\S\\s].*?)");
        }
        else {
            // Escape every '[' so it is literal in the regex, then restore the placeholder.
            regexHref = regexHref.Replace("[", "\\[");
            regexHref = regexHref.Replace("\\[参数]", "[参数]");
            regexHref = regexHref.Replace("(*)", ".+?");

            // Replace each placeholder, left to right, with a numbered named group.
            const string placeholder = "[参数]";
            int at;
            while ((at = regexHref.IndexOf(placeholder)) >= 0) {
                paramCount++;
                regexHref = regexHref.Remove(at, placeholder.Length)
                                     .Insert(at, "(?<参数" + paramCount + ">.+?)");
            }
        }
    }

    Regex reg = new Regex(regexHref, RegexOptions.IgnoreCase | RegexOptions.Compiled);
    for (Match mch = reg.Match(pageContent); mch.Success; mch = mch.NextMatch()) {
        Thread.Sleep(1);

        string title = string.Empty;
        string rawUrl;
        if (spliceUrl) {
            // Build the address from the template and the captured parameters.
            rawUrl = modelTask.LinkSpliceUrlStr;
            for (int x = 1; x <= paramCount; x++) {
                string groupName = "参数" + x.ToString();
                rawUrl = rawUrl.Replace("[" + groupName + "]", mch.Groups[groupName].Value);
            }
        }
        else {
            title = mch.Groups[2].Value;
            if (string.IsNullOrEmpty(title)) {
                continue;
            }
            rawUrl = mch.Groups[1].Value;
        }

        string url = CollectionHelper.Instance.FormatUrl(urlList, rawUrl);
        url = url.Replace("\\", "");

        if (PassesLinkUrlFilters(url)) {
            EnqueueLinkUrlIfNew(url, title);
        }
    }
}

/// <summary>
/// Returns true when <paramref name="url"/> contains the task's must-include string
/// (if one is set) and none of the "||"-separated must-not-include fragments.
/// </summary>
private bool PassesLinkUrlFilters(string url) {
    if (modelTask.LinkUrlMustIncludeStr != null
        && url.IndexOf(Convert.ToString(modelTask.LinkUrlMustIncludeStr)) == -1) {
        return false;
    }

    if (modelTask.LinkUrlNoMustIncludeStr != null) {
        foreach (string str in modelTask.LinkUrlNoMustIncludeStr.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries)) {
            if (url.IndexOf(str) > -1) {
                return false;
            }
        }
    }
    return true;
}

/// <summary>
/// Queues the link unless it is already in the queue; links flagged by
/// <c>DALContentHelper.ChkExistSpiderResult</c> are reported but not queued.
/// </summary>
private void EnqueueLinkUrlIfNew(string url, string title) {
    foreach (var item in _listLinkUrl.ToArray()) {
        if (item.Url == url) {
            return; // already queued: nothing to add, nothing to report
        }
    }

    // Filter out links the database already knows about.
    string msg = url + "==" + HtmlHelper.Instance.ParseTags(title);
    if (!DALContentHelper.ChkExistSpiderResult(modelTask.TaskName, url)) {
        _listLinkUrl.Enqueue(new ModelLinkUrl { Url = url, Title = title });
    }
    else {
        msg += "采集地址存在!不需要采集!";
    }
    MessageOut(msg);
}