/// <summary> /// 测试所有标签 /// </summary> /// <param name="Test_ViewUrl"></param> /// <param name="Test_LabelList"></param> public void TestAllLabel(string Test_ViewUrl, List <ModelTaskLabel> Test_LabelList) { try { string pageContent = CommonHelper.getPageContent(Test_ViewUrl, Model.PageEncode); if (pageContent.Equals("本次请求并未返回任何数据")) { OutViewUrlContentHandler?.Invoke("采集地址不正确!或者采集内容失败!"); return; } StringBuilder sbTest = new StringBuilder(); string tempContent = pageContent; foreach (var itemLabel in Test_LabelList) { var CutContent = string.Empty; CutContent += "【" + itemLabel.LabelName + "】: "; CutContent += GetLabelContent("测试", 10, Model.TestViewUrl, itemLabel, pageContent, true); sbTest.AppendLine(CutContent); } OutViewUrlContentHandler?.Invoke(sbTest.ToString()); } catch (Exception ex) { OutViewUrlContentHandler?.Invoke("测试网页采集失败!" + ex.Message); } }
/// <summary> /// 采集内容 /// </summary> /// <param name="viewUrl"></param> /// <param name="Test_LabelList"></param> public void SpiderContent(string viewUrl, List <ModelTaskLabel> Test_LabelList) { string SQL = string.Empty, cutContent = string.Empty; string pageContent = CommonHelper.getPageContent(viewUrl, Model.PageEncode); string title = CollectionHelper.Instance.CutStr(pageContent, "<title>([\\S\\s]*?)</title>")[0]; StringBuilder sb1 = new StringBuilder(); StringBuilder sb2 = new StringBuilder(); StringBuilder strSql = new StringBuilder(); StringBuilder sb3 = new StringBuilder(); string tempContent = pageContent; foreach (ModelTaskLabel itemTaskLabel in Model.ListTaskLabel) { var CutContent = GetLabelContent(Model.TaskName, Model.CollectionContentStepTime.Value, Model.TestViewUrl, itemTaskLabel, pageContent, false); #region 替换特殊 sb1.Append("" + itemTaskLabel.LabelName.Replace("'", "''") + ","); sb2.Append("'" + CutContent.Replace("'", "''") + "',"); if (CutContent.Replace("'", "''").Length < 100) { sb3.Append(" " + itemTaskLabel.LabelName.Replace("'", "''") + "='" + CutContent.Replace("'", "''") + "' and"); } #endregion } try { //是否有采集列表 然后在更新或者插入 #region 保存数据库 string LocalSQLiteName = "Data\\Collection\\" + Model.TaskName + "\\SpiderResult.db"; string sql = " Select Count(1) From Content Where HrefSource='" + viewUrl + "' "; object o = DbHelper.ExecuteScalar(LocalSQLiteName, sql); if (Convert.ToInt32("0" + o) == 0) { strSql.Append("insert into Content(HrefSource,"); strSql.Append(sb1.ToString().Remove(sb1.Length - 1)); strSql.Append(")"); strSql.Append(" values ('" + viewUrl + "',"); strSql.Append(sb2.ToString().Remove(sb2.Length - 1)); strSql.Append(")"); DbHelper.Execute(LocalSQLiteName, strSql.ToString()); } title = title.Replace('\\', ' ').Replace('/', ' ').Split(new char[] { '_' })[0].Split(new char[] { '-' })[0]; #endregion #region 更新采集列表为 已采集 #endregion OutViewUrlContentHandler?.Invoke(viewUrl + "=" + title); } catch (Exception ex) { OutViewUrlContentHandler?.Invoke(viewUrl + "=" + title + "=" + ex.Message); } }
/// <summary> /// 获取标签内容 /// </summary> /// <param name="taskName"></param> /// <param name="collectionContentStepTime"></param> /// <param name="spiderViewUrl"></param> /// <param name="itemTaskLebel"></param> /// <param name="pageContent"></param> /// <param name="isTest"></param> /// <returns></returns> private string GetLabelContent(string taskName, int collectionContentStepTime, string spiderViewUrl, ModelTaskLabel itemTaskLebel, string pageContent, bool isTest = false) { var remoteViewUrl = itemTaskLebel.TestViewUrl; if (string.IsNullOrEmpty(itemTaskLebel.TestViewUrl)) { remoteViewUrl = spiderViewUrl; } string regContent = HtmlHelper.Instance.ParseCollectionStrings(itemTaskLebel.LabelNameCutRegex); regContent = CommonHelper.ReplaceSystemRegexTag(regContent); string CutContent = CollectionHelper.Instance.CutStr(pageContent, regContent)[0]; #region 载资源 var imgTag = ImageDownHelper.GetImgTag(CutContent); if (itemTaskLebel.IsDownResource != null && itemTaskLebel.IsDownResource.Value == 1) { string[] imgExtArr = itemTaskLebel.DownResourceExts.Split(new string[] { ";" }, StringSplitOptions.RemoveEmptyEntries); var downImgPath = Model.ResourceSavePath;// itemTaskLebel.ResourceSavePath; //AppDomain.CurrentDomain.BaseDirectory + "Data\\Collection\\" + taskName + "\\Images\\"; if (string.IsNullOrEmpty(downImgPath)) { downImgPath = AppDomain.CurrentDomain.BaseDirectory + "Data\\Collection\\" + taskName + "\\Images\\"; } int ii = 1; foreach (var img in imgTag) { var remoteImg = CollectionHelper.Instance.FormatUrl(remoteViewUrl, img); var imgExt = remoteImg.Substring(remoteImg.LastIndexOf(".")); var newImg = DateTime.Now.ToString("yyyyMMddHHmmssffffff") + "_" + ii + imgExt; if (string.IsNullOrEmpty(itemTaskLebel.DownResourceExts) || imgExtArr.Any(x => x.ToLower() == imgExt.ToLower())) { // if (imgExtArr.SingleOrDefault(x => x.ToLower() == imgExt.ToLower()) != imgExt.ToLower()) { // continue; //} CutContent = CutContent.Replace(img, downImgPath + newImg); if (!isTest) { QueueImgHelper.AddImg(Model.ID, downImgPath + newImg, remoteImg, collectionContentStepTime); } else { OutViewUrlContentHandler?.Invoke($"允许下载资源后辍:{itemTaskLebel.DownResourceExts},本资源后辍:{imgExt}\r\n"); } } ii++; } } else { foreach (var img in imgTag) { var remoteImg = CollectionHelper.Instance.FormatUrl(remoteViewUrl, img); CutContent = CutContent.Replace(img, remoteImg); } } #endregion #region 结果为循环 if (itemTaskLebel.IsLoop == 1) { string[] LabelString = CollectionHelper.Instance.CutStr(pageContent, regContent); foreach (string s in LabelString) { CutContent += s + "$$$$"; } int n = CutContent.LastIndexOf("$$$$"); CutContent = CutContent.Remove(n, 4); } #endregion #region 过滤Html if (!string.IsNullOrEmpty(itemTaskLebel.LabelHtmlRemove)) { string[] arr = itemTaskLebel.LabelHtmlRemove.Split(new string[] { "||||" }, StringSplitOptions.RemoveEmptyEntries); foreach (string str in arr) { if (str == "all") { CutContent = CollectionHelper.Instance.NoHtml(CutContent); break; } else if (str == "table") { CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "table", 2); } else if (str == "font<span>") { CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "font", 3); CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "span", 3); } else if (str == "a") { CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "a", 3); } else { CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, str, 2); } } } #endregion #region 排除字符 if (!string.IsNullOrEmpty(itemTaskLebel.LabelRemove)) { foreach (string str in itemTaskLebel.LabelRemove.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries)) { string[] ListStr = str.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries); if (ListStr[1] == "2") { // CutContent = CollectionHelper.RemoveHtml(CutContent, ListStr[0]); CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, ListStr[0], 2); } else { // CutContent = CutContent.Replace(ListStr[0], ""); CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, ListStr[0], 3); } } } #endregion #region 替换字符 if (!string.IsNullOrEmpty(itemTaskLebel.LabelReplace)) { foreach (string str in itemTaskLebel.LabelReplace.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries)) { string[] ListStr = str.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries); Regex reg = new Regex(ListStr[0], RegexOptions.IgnoreCase); if (ListStr.Length == 1) { CutContent = reg.Replace(CutContent, "");// CutContent.Replace(ListStr[0], ""); } else { CutContent = reg.Replace(CutContent, ListStr[1]); // CutContent = CutContent.Replace(ListStr[0], ListStr[1]); } } } #endregion #region 加载插件 string SpiderLabelPlugin = itemTaskLebel.SpiderLabelPlugin; if (SpiderLabelPlugin != "不使用插件" && !string.IsNullOrEmpty(SpiderLabelPlugin)) { CutContent = PythonExtHelper.RunPython(PluginUtility.SpiderContentPluginPath + SpiderLabelPlugin, new object[] { remoteViewUrl, CutContent }); } #endregion return(CutContent); }