/// <summary>
/// Analyzes every list page configured on <c>Model</c>.
/// </summary>
/// <remarks>
/// When <c>Model.IsSource</c> is 1 only <c>Model.DemoListUrl</c> is analyzed and any
/// exception propagates to the caller. Otherwise <c>Model.CollectionContent</c> is split
/// on the literal separator <c>$$$$</c> and each non-empty entry is analyzed in turn;
/// a failure on one entry is logged and the remaining entries are still processed.
/// </remarks>
public void AnalyzeAllList()
{
    OutMessageHandler?.Invoke("正在分析采集列表个数!");

    if (Model.IsSource == 1)
    {
        AnalyzeSingleList(Model.DemoListUrl);
        return;
    }

    string[] separators = { "$$$$" };
    string[] listUrls = Model.CollectionContent.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    for (int index = 0; index < listUrls.Length; index++)
    {
        try
        {
            AnalyzeSingleList(listUrls[index]);
        }
        catch (Exception failure)
        {
            // Log and move on to the next list URL; one bad entry must not stop the run.
            Log4Helper.Write(LogLevel.Error, failure.StackTrace, failure);
        }
    }
}
/// <summary>
/// Extracts links from one list page and reports every accepted link through
/// <c>OutTreeNodeHandler</c>.
/// </summary>
/// <remarks>
/// <para>
/// The page text comes from <c>Model.SourceText</c> when <c>Model.IsSource</c> is 1;
/// otherwise it is downloaded with <c>CommonHelper.getPageContent</c>. If the download
/// yields null or an empty string, a failure message is sent to <c>OutMessageHandler</c>
/// and the method returns.
/// </para>
/// <para>
/// Matching uses the named groups <c>链接</c> (link), <c>标题</c> (title) and
/// <c>封面</c> (cover). A group that is absent from the pattern or did not participate
/// in the match contributes an empty string.
/// </para>
/// </remarks>
/// <param name="testUrl">
/// URL of the list page. It is also passed to <c>CollectionHelper.FormatUrl</c> together
/// with each extracted link and cover value.
/// </param>
/// <param name="num">Value forwarded unchanged to <c>OutTreeNodeHandler</c>.</param>
public void ResolveList(string testUrl, int num)
{
    string pageContent;
    if (Model.IsSource == 1)
    {
        pageContent = Model.SourceText;
    }
    else
    {
        pageContent = CommonHelper.getPageContent(testUrl, Model.PageEncode);
        if (string.IsNullOrEmpty(pageContent))
        {
            OutMessageHandler?.Invoke("采集列表失败!");
            return;
        }
    }

    // Restrict the page to the configured cut area only when BOTH markers are set.
    // (The previous `?.Trim() != ""` test was true for a null marker, so a null value
    // slipped into the helpers below.)
    if (!string.IsNullOrWhiteSpace(Model.LinkUrlCutAreaStart)
        && !string.IsNullOrWhiteSpace(Model.LinkUrlCutAreaEnd))
    {
        // Page and markers are run through the same ParseCollectionStrings encoding
        // before GetBody, and the result is decoded again afterwards.
        pageContent = HtmlHelper.Instance.ParseCollectionStrings(pageContent);
        pageContent = CollectionHelper.Instance.GetBody(
            pageContent,
            HtmlHelper.Instance.ParseCollectionStrings(Model.LinkUrlCutAreaStart),
            HtmlHelper.Instance.ParseCollectionStrings(Model.LinkUrlCutAreaEnd),
            false,
            false);
        pageContent = HtmlHelper.Instance.UnParseCollectionStrings(pageContent);
    }

    Regex linkRegex = new Regex(BuildListLinkPattern(), RegexOptions.IgnoreCase);

    // One generator for the whole loop instead of a freshly seeded one per match.
    Random delayRandom = new Random();

    for (Match match = linkRegex.Match(pageContent); match.Success; match = match.NextMatch())
    {
        // GroupCollection's string indexer never returns null: an unknown or unmatched
        // group yields a Group whose Value is "", so no null checks are needed here.
        string url = CollectionHelper.Instance.FormatUrl(testUrl, match.Groups["链接"].Value);
        string title = match.Groups["标题"].Value;
        string cover = CollectionHelper.Instance.FormatUrl(testUrl, match.Groups["封面"].Value);

        if (!IsListUrlAllowed(url))
        {
            continue;
        }

        // NOTE: a plugin hook (Model.PluginSpiderUrl, run through PythonExtHelper) used
        // to sit here; it is currently disabled.
        OutTreeNodeHandler?.Invoke(url, title, cover, num);

        if (Model.IsSource == 0)
        {
            // Short random pause (10-99 ms) between reported links when IsSource is 0.
            Thread.Sleep(delayRandom.Next(10, 100));
        }
    }
}

/// <summary>
/// Returns the regular-expression pattern used to find links on a list page.
/// </summary>
/// <remarks>
/// Unless <c>Model.IsHandGetUrl</c> is 1 this is <c>cRegexHelper.RegexATag</c>. In manual
/// mode <c>Model.HandCollectionUrlRegex</c> is converted from its template form:
/// <c>[name]</c> becomes the lazy named group <c>(?&lt;name&gt;.*?)</c> and <c>(*)</c>
/// becomes <c>.*?</c>.
/// </remarks>
private string BuildListLinkPattern()
{
    if (Model.IsHandGetUrl != 1)
    {
        return cRegexHelper.RegexATag;
    }

    string pattern = Model.HandCollectionUrlRegex;
    pattern = HtmlHelper.Instance.ParseCollectionStrings(pattern);
    pattern = HtmlHelper.Instance.UnParseCollectionStrings(pattern);

    // Replacement order matters: "]" must be expanded before "(*)" is rewritten.
    pattern = pattern.Replace("[", "(?<");
    pattern = pattern.Replace("]", ">.*?)");
    pattern = pattern.Replace("(*)", ".*?");

    // NOTE: converting spaces to \s+ and a final ParseCollectionStrings pass were
    // disabled on 2019-05-23 because they caused problems.
    return pattern;
}

/// <summary>
/// Applies the include/exclude filters configured on <c>Model</c> to a link URL.
/// </summary>
/// <param name="url">The formatted link URL to test.</param>
/// <returns>
/// <c>false</c> when <c>Model.LinkUrlMustIncludeStr</c> is set and not contained in
/// <paramref name="url"/>, or when any <c>||</c>-separated entry of
/// <c>Model.LinkUrlNoMustIncludeStr</c> is contained in it; otherwise <c>true</c>.
/// </returns>
private bool IsListUrlAllowed(string url)
{
    // A null or blank filter means "not configured" (previously a null filter threw).
    string mustInclude = Model.LinkUrlMustIncludeStr;
    if (!string.IsNullOrWhiteSpace(mustInclude)
        && url.IndexOf(mustInclude, StringComparison.Ordinal) == -1)
    {
        return false;
    }

    string excludeList = Model.LinkUrlNoMustIncludeStr;
    if (string.IsNullOrWhiteSpace(excludeList))
    {
        return true;
    }

    foreach (string fragment in excludeList.Split(new[] { "||" }, StringSplitOptions.RemoveEmptyEntries))
    {
        // URLs are compared ordinally; culture rules have no meaning for them.
        if (url.IndexOf(fragment, StringComparison.Ordinal) > -1)
        {
            return false;
        }
    }

    return true;
}