/// <summary>
/// Collects one queued link: downloads the page, extracts every configured label
/// from it, inserts the result as one row into the task's SpiderResult.db and
/// reports progress through the task delegates.
/// </summary>
/// <param name="index">Not used by this method.</param>
/// <param name="threadindex">Not used by this method.</param>
private void Run(int index, int threadindex)
{
    if (ModelTask.IsSpiderContent != 1)
    {
        return;
    }

    if (ListLinkUrl.Count <= 0)
    {
        MessageShow("没有采集到任何地址!不需要采集!");
        return;
    }

    // Report progress before the (slow) download starts.
    ProressNum++;
    if (OutPutTaskProgressBarDelegate != null)
    {
        MainEvents.OutPutTaskProgressBarEventArgs ea = new MainEvents.OutPutTaskProgressBarEventArgs();
        ea.ProgressNum = ProressNum;
        ea.RecordNum = TaskCount;
        ea.TaskIndex = TaskIndex;
        OutPutTaskProgressBarDelegate(this, ea);
    }

    ModelLinkUrl mlink = ListLinkUrl.Dequeue();
    string url = mlink.Url;
    string pageContent = CollectionHelper.Instance.GetHttpPage(url, 1000, Encoding.GetEncoding(ModelTask.PageEncode));
    string title = CollectionHelper.Instance.CutStr(pageContent, "<title>([\\S\\s]*?)</title>")[0];

    // Column names and quoted values of the INSERT. Every entry is written with a
    // leading comma so that an empty label list still yields valid SQL.
    StringBuilder columnSql = new StringBuilder();
    StringBuilder valueSql = new StringBuilder();

    foreach (ModelTaskLabel m in ModelTask.ListTaskLabel)
    {
        string regContent = HtmlHelper.Instance.ParseCollectionStrings(m.LabelNameCutRegex);
        regContent = CommonHelper.ReplaceSystemRegexTag(regContent);
        string CutContent = CollectionHelper.Instance.CutStr(pageContent, regContent)[0];

        // Rewrite the image references found in the content against the task's view URL.
        string[] TagImgList = CollectionHelper.Instance.GetImgTag(CutContent);
        foreach (string tagimg in TagImgList)
        {
            if (string.IsNullOrEmpty(tagimg))
            {
                break;
            }
            string newTagImg = CollectionHelper.Instance.FormatUrl(ModelTask.TestViewUrl, tagimg);
            CutContent = CutContent.Replace(tagimg, newTagImg);
        }

        // Looping label: append every match, separated by "$$$$".
        // NOTE(review): CutContent already starts with the first match at this point,
        // so that match is followed directly by the appended list — confirm this is intended.
        if (m.IsLoop == 1)
        {
            string[] LabelString = CollectionHelper.Instance.CutStr(pageContent, regContent);
            foreach (string s in LabelString)
            {
                CutContent += s + "$$$$";
            }
            // Strip the trailing separator; guard against "no separator present".
            int n = CutContent.LastIndexOf("$$$$");
            if (n >= 0)
            {
                CutContent = CutContent.Remove(n, 4);
            }
        }

        // Link label: follow each extracted URL and cut the value out of the linked page.
        // Each iteration overwrites CutContent, so only the last URL's result is kept.
        if (m.IsLinkUrl == 1)
        {
            string[] CutContentArr = CutContent.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string sUrl in CutContentArr)
            {
                CutContent = CollectionHelper.Instance.DefiniteUrl(sUrl, ModelTask.TestViewUrl); // address
                CutContent = CollectionHelper.Instance.GetHttpPage(CutContent, 1000, Encoding.GetEncoding(ModelTask.PageEncode));
                regContent = HtmlHelper.Instance.ParseCollectionStrings(m.LabelValueLinkUrlRegex);
                regContent = regContent.Replace("\\(\\*)", ".+?");
                regContent = regContent.Replace("\\[参数]", "([\\S\\s].*?)");
                CutContent = CollectionHelper.Instance.CutStr(CutContent, regContent)[0];
            }
        }

        // HTML filtering: "all" strips everything and stops, the other entries strip single tags.
        if (!string.IsNullOrEmpty(m.LabelHtmlRemove))
        {
            string[] arr = m.LabelHtmlRemove.Split(new string[] { "||||" }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string str in arr)
            {
                if (str == "all")
                {
                    CutContent = CollectionHelper.Instance.NoHtml(CutContent);
                    break;
                }
                else if (str == "table")
                {
                    CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "table", 2);
                }
                else if (str == "font<span>")
                {
                    CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "font", 3);
                    CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "span", 3);
                }
                else if (str == "a")
                {
                    CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "a", 3);
                }
            }
        }

        // Excluded strings: every "$$$$"-separated entry is removed from the content.
        if (!string.IsNullOrEmpty(m.LabelRemove))
        {
            foreach (string str in m.LabelRemove.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries))
            {
                CutContent = CutContent.Replace(str, "");
            }
        }

        // Replacement rules in the form "old||new", separated by "$$$$".
        if (!string.IsNullOrEmpty(m.LabelReplace))
        {
            foreach (string str in m.LabelReplace.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries))
            {
                string[] ListStr = str.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries);
                if (ListStr.Length == 0)
                {
                    continue;
                }
                // RemoveEmptyEntries drops an empty replacement ("old||"), so a missing
                // second part is treated as "replace with nothing" instead of throwing.
                string replacement = ListStr.Length > 1 ? ListStr[1] : string.Empty;
                CutContent = CutContent.Replace(ListStr[0], replacement);
            }
        }

        columnSql.Append("," + m.LabelName.Replace("'", "''"));
        valueSql.Append(",'" + CutContent.Replace("'", "''") + "'");

        // Resource download switch is on for this label.
        // NOTE(review): this runs after the value was appended to the INSERT, so the
        // string returned by SaveUrlPics is not what gets stored — confirm intended.
        if (m.IsDownResource == 1)
        {
            string downImgPath = AppDomain.CurrentDomain.BaseDirectory + "Data\\Collection\\" + ModelTask.TaskName + "\\Images\\";
            CutContent = ImageDownHelper.SaveUrlPics(CutContent, downImgPath);
        }
    }

    // The URL comes from a crawled page, so it gets the same quote escaping as the other values.
    StringBuilder strSql = new StringBuilder();
    strSql.Append("insert into Content(HrefSource");
    strSql.Append(columnSql.ToString());
    strSql.Append(")");
    strSql.Append(" values ('" + url.Replace("'", "''") + "'");
    strSql.Append(valueSql.ToString());
    strSql.Append(")");

    string LocalSQLiteName = "Data\\Collection\\" + ModelTask.TaskName + "\\SpiderResult.db";
    SQLiteHelper.Execute(LocalSQLiteName, strSql.ToString());

    // Shorten the title for the log message: slashes become spaces and
    // everything after the first '_' or '-' is dropped.
    title = title.Replace('\\', ' ').Replace('/', ' ').Split(new char[] { '_' })[0].Split(new char[] { '-' })[0];
    gatherEv.Message = mlink.Url + "=" + title;
    GatherWorkDelegate(this, gatherEv);
}
/// <summary>
/// Extracts the content of one label from a downloaded page and post-processes it:
/// image references are rewritten (and optionally queued for download), loop matches
/// are appended, HTML is filtered, exclude/replace rules are applied and finally an
/// optional label plugin is run over the result.
/// </summary>
/// <param name="taskName">Task name; used to build the local image folder "Data\Collection\{taskName}\Images\".</param>
/// <param name="collectionContentStepTime">Passed through unchanged to <c>QueueImgHelper.AddImg</c>.</param>
/// <param name="spiderViewUrl">Base URL used when the label has no <c>TestViewUrl</c> of its own.</param>
/// <param name="itemTaskLebel">The label configuration (regexes, filters, download settings, plugin).</param>
/// <param name="pageContent">The page text the label is cut from.</param>
/// <param name="isTest">When true, images are not added to the download queue.</param>
/// <returns>The processed label content.</returns>
private string GetLabelContent(string taskName, int collectionContentStepTime, string spiderViewUrl, ModelTaskLabel itemTaskLebel, string pageContent, bool isTest = false)
{
    // The label's own view URL wins; fall back to the one supplied by the caller.
    var remoteViewUrl = itemTaskLebel.TestViewUrl;
    if (string.IsNullOrEmpty(itemTaskLebel.TestViewUrl))
    {
        remoteViewUrl = spiderViewUrl;
    }

    string regContent = HtmlHelper.Instance.ParseCollectionStrings(itemTaskLebel.LabelNameCutRegex);
    regContent = CommonHelper.ReplaceSystemRegexTag(regContent);
    string CutContent = CollectionHelper.Instance.CutStr(pageContent, regContent)[0];

    // ---- Resources (images) ----
    var imgTag = ImageDownHelper.GetImgTag(CutContent);
    if (itemTaskLebel.IsDownResource == 1)
    {
        // An empty/missing extension list means "download every image".
        bool filterByExtension = !string.IsNullOrEmpty(itemTaskLebel.DownResourceExts);
        string[] imgExtArr = (itemTaskLebel.DownResourceExts ?? string.Empty).Split(new string[] { ";" }, StringSplitOptions.RemoveEmptyEntries);
        var downImgPath = AppDomain.CurrentDomain.BaseDirectory + "Data\\Collection\\" + taskName + "\\Images\\";
        int ii = 1;
        foreach (var img in imgTag)
        {
            var remoteImg = CollectionHelper.Instance.FormatUrl(remoteViewUrl, img);

            if (filterByExtension)
            {
                // A URL without a dot has no extension and therefore cannot match the list.
                int dotIndex = remoteImg.LastIndexOf('.');
                var imgExt = dotIndex >= 0 ? remoteImg.Substring(dotIndex) : string.Empty;

                // Case-insensitive membership test; duplicate entries in the list are harmless.
                bool allowed = imgExtArr.Any(x => string.Equals(x, imgExt, StringComparison.OrdinalIgnoreCase));
                if (!allowed)
                {
                    continue;
                }
            }

            // The local name is timestamp + running counter; the ".jpg" suffix is fixed.
            var newImg = DateTime.Now.ToString("yyyyMMddHHmmssffffff") + "_" + ii + ".jpg";
            CutContent = CutContent.Replace(img, downImgPath + newImg);
            if (!isTest)
            {
                QueueImgHelper.AddImg(Model.ID, downImgPath + newImg, remoteImg, collectionContentStepTime);
            }
            ii++;
        }
    }
    else
    {
        // No download: just point every image at its remote URL.
        foreach (var img in imgTag)
        {
            var remoteImg = CollectionHelper.Instance.FormatUrl(remoteViewUrl, img);
            CutContent = CutContent.Replace(img, remoteImg);
        }
    }

    // ---- Looping label: append every match, separated by "$$$$" ----
    // NOTE(review): CutContent already starts with the first match at this point,
    // so that match is followed directly by the appended list — confirm this is intended.
    if (itemTaskLebel.IsLoop == 1)
    {
        string[] LabelString = CollectionHelper.Instance.CutStr(pageContent, regContent);
        foreach (string s in LabelString)
        {
            CutContent += s + "$$$$";
        }
        // Strip the trailing separator; guard against "no separator present".
        int n = CutContent.LastIndexOf("$$$$");
        if (n >= 0)
        {
            CutContent = CutContent.Remove(n, 4);
        }
    }

    // ---- HTML filtering: "all" strips everything and stops, other entries strip single tags ----
    if (!string.IsNullOrEmpty(itemTaskLebel.LabelHtmlRemove))
    {
        string[] arr = itemTaskLebel.LabelHtmlRemove.Split(new string[] { "||||" }, StringSplitOptions.RemoveEmptyEntries);
        foreach (string str in arr)
        {
            if (str == "all")
            {
                CutContent = CollectionHelper.Instance.NoHtml(CutContent);
                break;
            }
            else if (str == "table")
            {
                CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "table", 2);
            }
            else if (str == "font<span>")
            {
                CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "font", 3);
                CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "span", 3);
            }
            else if (str == "a")
            {
                CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "a", 3);
            }
        }
    }

    // ---- Excluded strings: rules in the form "text||flag", separated by "$$$$" ----
    if (!string.IsNullOrEmpty(itemTaskLebel.LabelRemove))
    {
        foreach (string str in itemTaskLebel.LabelRemove.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries))
        {
            string[] ListStr = str.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries);
            if (ListStr.Length == 0)
            {
                continue;
            }
            // Flag "1" routes the entry through RemoveHtml; a missing or different
            // flag means plain text removal (previously a missing flag threw).
            if (ListStr.Length > 1 && ListStr[1] == "1")
            {
                CutContent = CollectionHelper.RemoveHtml(CutContent, ListStr[0]);
            }
            else
            {
                CutContent = CutContent.Replace(ListStr[0], "");
            }
        }
    }

    // ---- Replacement rules in the form "old||new", separated by "$$$$" ----
    if (!string.IsNullOrEmpty(itemTaskLebel.LabelReplace))
    {
        foreach (string str in itemTaskLebel.LabelReplace.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries))
        {
            string[] ListStr = str.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries);
            if (ListStr.Length == 0)
            {
                continue;
            }
            // RemoveEmptyEntries drops an empty replacement ("old||"), so a missing
            // second part is treated as "replace with nothing" instead of throwing.
            string replacement = ListStr.Length > 1 ? ListStr[1] : string.Empty;
            CutContent = CutContent.Replace(ListStr[0], replacement);
        }
    }

    // ---- Optional label plugin; the literal "不使用插件" means no plugin is selected ----
    string SpiderLabelPlugin = itemTaskLebel.SpiderLabelPlugin;
    if (SpiderLabelPlugin != "不使用插件" && !string.IsNullOrEmpty(SpiderLabelPlugin))
    {
        CutContent = PythonExtHelper.RunPython(PluginUtility.SpiderContentPluginPath + SpiderLabelPlugin, new object[] { remoteViewUrl, CutContent });
    }

    return (CutContent);
}
/// <summary>
/// Collects one queued link: downloads the page, extracts every configured label
/// (including linked and paged content), queues image downloads, inserts the result
/// into the task's SpiderResult.db unless the URL is already stored, reports progress
/// and finally sleeps for a random 2-6 seconds.
/// </summary>
/// <param name="index">Not used by this method.</param>
/// <param name="threadindex">Not used by this method.</param>
private void Run_ViewUrl(int index, int threadindex)
{
    if (modelTask.IsSpiderContent != 1)
    {
        return;
    }

    if (_listLinkUrl.Count > 0)
    {
        // Report progress before the (slow) download starts.
        ProressNum++;
        if (OutPutTaskProgressBarDelegate != null)
        {
            MainEvents.OutPutTaskProgressBarEventArgs ea = new MainEvents.OutPutTaskProgressBarEventArgs();
            ea.ProgressNum = ProressNum;
            ea.RecordNum = TaskCount;
            ea.TaskIndex = TaskIndex;
            OutPutTaskProgressBarDelegate(this, ea);
        }

        ModelLinkUrl mlink = _listLinkUrl.Dequeue();
        string url = mlink.Url;
        string pageContent = CollectionHelper.Instance.GetHttpPage(url, 1000, Encoding.GetEncoding(modelTask.PageEncode));
        string title = CollectionHelper.Instance.CutStr(pageContent, "<title>([\\S\\s]*?)</title>")[0];

        // Column names and quoted values of the INSERT. Every entry is written with a
        // leading comma so that an empty label list still yields valid SQL.
        StringBuilder columnSql = new StringBuilder();
        StringBuilder valueSql = new StringBuilder();

        // Header inserted in front of every additional page of a paged label.
        const string pagerSeparator = "=====分页内容=======================================================\r\n";

        foreach (ModelTaskLabel m in modelTask.ListTaskLabel)
        {
            string regContent = HtmlHelper.Instance.ParseCollectionStrings(m.LabelNameCutRegex);
            regContent = CommonHelper.ReplaceSystemRegexTag(regContent);
            string CutContent = CollectionHelper.Instance.CutStr(pageContent, regContent)[0];

            // Rewrite the image references found in the content against the task's view URL.
            string[] TagImgList = CollectionHelper.Instance.GetImgTag(CutContent);
            foreach (string tagimg in TagImgList)
            {
                if (string.IsNullOrEmpty(tagimg))
                {
                    break;
                }

                // Remote link
                string newTagImg = CollectionHelper.Instance.FormatUrl(modelTask.TestViewUrl, tagimg);
                // Replace the link in the content
                CutContent = CutContent.Replace(tagimg, newTagImg);

                // Queue the remote image for download under a timestamp-based local name.
                // NOTE(review): the name has one-second resolution, so several images
                // handled within the same second map to the same local file — confirm.
                if (m.IsDownResource == 1)
                {
                    FileInfo fImg = new FileInfo(newTagImg);
                    string ext = fImg.Extension;
                    ext = string.IsNullOrEmpty(ext) ? ".jpg" : ext;
                    string newTimeImg = "images/" + DateTime.Now.ToString("yyyyMMddHHmmss") + ext;
                    lock (QueueHelper.lockObj)
                    {
                        var d = new Dictionary<string, string>();
                        d.Add(newTagImg, newTimeImg);
                        QueueHelper.Q_DownImgResource.Enqueue(d);
                    }
                }
            }

            // Looping label: append every match, separated by "$$$$".
            // NOTE(review): CutContent already starts with the first match at this point,
            // so that match is followed directly by the appended list — confirm this is intended.
            if (m.IsLoop == 1)
            {
                string[] LabelString = CollectionHelper.Instance.CutStr(pageContent, regContent);
                foreach (string s in LabelString)
                {
                    CutContent += s + "$$$$";
                }
                // Strip the trailing separator; guard against "no separator present".
                int n = CutContent.LastIndexOf("$$$$");
                if (n >= 0)
                {
                    CutContent = CutContent.Remove(n, 4);
                }
            }

            // Link label: follow each extracted URL and cut the value out of the linked page.
            // Each iteration overwrites CutContent, so only the last URL's result is kept.
            if (m.IsLinkUrl == 1)
            {
                string[] CutContentArr = CutContent.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries);
                foreach (string sUrl in CutContentArr)
                {
                    CutContent = CollectionHelper.Instance.DefiniteUrl(sUrl, modelTask.TestViewUrl); // address
                    CutContent = CollectionHelper.Instance.GetHttpPage(CutContent, 1000, Encoding.GetEncoding(modelTask.PageEncode));
                    regContent = HtmlHelper.Instance.ParseCollectionStrings(m.LabelValueLinkUrlRegex);
                    regContent = regContent.Replace("\\(\\*)", ".+?");
                    regContent = regContent.Replace("\\[参数]", "([\\S\\s].*?)");
                    CutContent = CollectionHelper.Instance.CutStr(CutContent, regContent)[0];
                }
            }

            // Paged label: fetch every pager link and append that page's label content.
            if (m.IsPager == 1)
            {
                regContent = HtmlHelper.Instance.ParseCollectionStrings(m.LabelValuePagerRegex);
                regContent = regContent.Replace("\\(\\*)", ".+?");
                regContent = regContent.Replace("\\[参数]", "([\\S\\s].*?)");
                string[] pagerUrls = CollectionHelper.Instance.CutStr(pageContent, regContent);
                foreach (string pageUrl in pagerUrls)
                {
                    string url1 = CollectionHelper.Instance.DefiniteUrl(pageUrl, url);
                    string pageContentPager = CollectionHelper.Instance.GetHttpPage(url1, 100000);

                    // The failure markers must be looked for in the page that was just
                    // fetched, not in the main page.
                    if (pageContentPager.Equals("$UrlIsFalse$") || pageContentPager.Equals("$GetFalse$"))
                    {
                        CutContent += pagerSeparator;
                        CutContent += "远程链接内容失败!";
                    }
                    else
                    {
                        // Cut the label again, this time from the pager page.
                        string regContent1 = HtmlHelper.Instance.ParseCollectionStrings(m.LabelNameCutRegex);
                        regContent1 = CommonHelper.ReplaceSystemRegexTag(regContent1);
                        string CutContent1 = CollectionHelper.Instance.CutStr(pageContentPager, regContent1)[0];
                        CutContent += pagerSeparator;
                        CutContent += CutContent1;
                    }
                }
            }

            // HTML filtering: "all" strips everything and stops, the other entries strip single tags.
            if (!string.IsNullOrEmpty(m.LabelHtmlRemove))
            {
                string[] arr = m.LabelHtmlRemove.Split(new string[] { "||||" }, StringSplitOptions.RemoveEmptyEntries);
                foreach (string str in arr)
                {
                    if (str == "all")
                    {
                        CutContent = CollectionHelper.Instance.NoHtml(CutContent);
                        break;
                    }
                    else if (str == "table")
                    {
                        CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "table", 2);
                    }
                    else if (str == "font<span>")
                    {
                        CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "font", 3);
                        CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "span", 3);
                    }
                    else if (str == "a")
                    {
                        CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "a", 3);
                    }
                }
            }

            // Excluded strings: every "$$$$"-separated entry is removed from the content.
            if (!string.IsNullOrEmpty(m.LabelRemove))
            {
                foreach (string str in m.LabelRemove.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries))
                {
                    CutContent = CutContent.Replace(str, "");
                }
            }

            // Replacement rules in the form "old||new", separated by "$$$$".
            if (!string.IsNullOrEmpty(m.LabelReplace))
            {
                foreach (string str in m.LabelReplace.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries))
                {
                    string[] ListStr = str.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries);
                    if (ListStr.Length == 0)
                    {
                        continue;
                    }
                    // RemoveEmptyEntries drops an empty replacement ("old||"), so a missing
                    // second part is treated as "replace with nothing" instead of throwing.
                    string replacement = ListStr.Length > 1 ? ListStr[1] : string.Empty;
                    CutContent = CutContent.Replace(ListStr[0], replacement);
                }
            }

            columnSql.Append("," + m.LabelName.Replace("'", "''"));
            valueSql.Append(",'" + CutContent.Replace("'", "''") + "'");

            // Resource download switch is on for this label.
            // NOTE(review): this runs after the value was appended to the INSERT, so the
            // string returned by SaveUrlPics is not what gets stored — confirm intended.
            if (m.IsDownResource == 1)
            {
                string downImgPath = AppDomain.CurrentDomain.BaseDirectory + "Data\\Collection\\" + modelTask.TaskName + "\\Images\\";
                CutContent = ImageDownHelper.SaveUrlPics(CutContent, downImgPath);
            }
        }

        // The URL comes from a crawled page, so it gets the same quote escaping as the other values.
        string escapedUrl = url.Replace("'", "''");
        string LocalSQLiteName = "Data\\Collection\\" + modelTask.TaskName + "\\SpiderResult.db";

        // Only insert when this source URL has not been stored yet.
        string sql = " Select Count(1) From Content Where HrefSource='" + escapedUrl + "' ";
        object o = SQLiteHelper.ExecuteScalar(LocalSQLiteName, sql);
        if (Convert.ToInt32("0" + o) == 0)
        {
            StringBuilder strSql = new StringBuilder();
            strSql.Append("insert into Content(HrefSource");
            strSql.Append(columnSql.ToString());
            strSql.Append(")");
            strSql.Append(" values ('" + escapedUrl + "'");
            strSql.Append(valueSql.ToString());
            strSql.Append(")");
            SQLiteHelper.Execute(LocalSQLiteName, strSql.ToString());
        }

        // Shorten the title for the log message: slashes become spaces and
        // everything after the first '_' or '-' is dropped.
        title = title.Replace('\\', ' ').Replace('/', ' ').Split(new char[] { '_' })[0].Split(new char[] { '-' })[0];
        gatherEv.Message = mlink.Url + "=" + title;
        GatherWorkDelegate(this, gatherEv);
    }
    else
    {
        gatherEv.Message = "没有采集到任何地址!不需要采集!";
        GatherWorkDelegate(this, gatherEv);
    }

    // Pause for a random 2, 4 or 6 seconds before returning.
    var r = new Random();
    var stepNext = r.Next(1, 4);
    Thread.Sleep(stepNext * 2000);
}