/// <summary>
/// Downloads the page at <c>item.Url</c> and extracts the title, channel, body and every
/// attribute listed in <c>rule.ContentAttributes</c> into a name/value collection.
/// </summary>
/// <param name="rule">Gather rule supplying charset, cookies, start/end markers and clean-up settings.</param>
/// <param name="item">List item whose URL is fetched; its <c>Content</c> supplies by-list values.</param>
/// <returns>
/// A collection holding one entry per configured attribute, followed by the entries
/// "标题" (title), "栏目" (channel) and "正文" (body).
/// </returns>
/// <exception cref="Exception">Thrown with the download error message when the fetch is not successful.</exception>
public static async Task<NameValueCollection> GetContentNameValueCollectionAsync(Rule rule, Item item)
{
    var download = await WebClientUtils.GetRemoteHtmlAsync(item.Url, rule.Charset, rule.CookieString);
    if (!download.IsSuccess)
    {
        throw new Exception(download.ErrorMessage);
    }

    var pageHtml = download.Content;

    // Build every extraction pattern up front, in the same order the rule fields are consulted.
    var excludePattern = GatherUtils.GetRegexString(rule.ContentExclude);
    var channelPattern = GatherUtils.GetRegexChannel(rule.ContentChannelStart, rule.ContentChannelEnd);
    var bodyPattern = GatherUtils.GetRegexContent(rule.ContentContentStart, rule.ContentContentEnd);

    // The 2nd and 3rd body patterns exist only when both of their markers are configured.
    var bodyPattern2 = string.IsNullOrEmpty(rule.ContentContentStart2) || string.IsNullOrEmpty(rule.ContentContentEnd2)
        ? string.Empty
        : GatherUtils.GetRegexContent(rule.ContentContentStart2, rule.ContentContentEnd2);
    var bodyPattern3 = string.IsNullOrEmpty(rule.ContentContentStart3) || string.IsNullOrEmpty(rule.ContentContentEnd3)
        ? string.Empty
        : GatherUtils.GetRegexContent(rule.ContentContentStart3, rule.ContentContentEnd3);

    var nextPagePattern = GatherUtils.GetRegexUrl(rule.ContentNextPageStart, rule.ContentNextPageEnd);
    var titlePattern = GatherUtils.GetRegexTitle(rule.ContentTitleStart, rule.ContentTitleEnd);
    var attributeNames = ListUtils.GetStringList(rule.ContentAttributes);

    // Title: either the one already captured from the list page or one matched in this page.
    string pageTitle;
    if (rule.ContentTitleByList)
    {
        pageTitle = item.Content.Title;
    }
    else
    {
        pageTitle = GetValue("title", titlePattern, pageHtml);
    }

    // Body: primary pattern first; the optional patterns are tried only while the body is still empty.
    var pageBody = GetValue("content", bodyPattern, pageHtml);
    foreach (var fallbackPattern in new[] { bodyPattern2, bodyPattern3 })
    {
        if (!string.IsNullOrEmpty(pageBody))
        {
            break;
        }

        if (string.IsNullOrEmpty(fallbackPattern))
        {
            continue;
        }

        pageBody = GetValue("content", fallbackPattern, pageHtml);
    }

    if (!string.IsNullOrEmpty(excludePattern))
    {
        pageBody = Replace(excludePattern, pageBody, string.Empty);
    }

    // Remove whole elements (opening tag, inner content, closing tag) for each configured name.
    if (!string.IsNullOrEmpty(rule.ContentHtmlClearCollection))
    {
        foreach (var element in StringCollectionToList(rule.ContentHtmlClearCollection))
        {
            pageBody = Replace($@"<{element}[^>]*>.*?<\/{element}>", pageBody, string.Empty);
        }
    }

    // Remove only the opening and closing tags for each configured name; inner content stays.
    if (!string.IsNullOrEmpty(rule.ContentHtmlClearTagCollection))
    {
        foreach (var tag in StringCollectionToList(rule.ContentHtmlClearTagCollection))
        {
            pageBody = Replace($@"<{tag}[^>]*>", pageBody, string.Empty);
            pageBody = Replace($@"<\/{tag}>", pageBody, string.Empty);
        }
    }

    // When a next-page link is found, the body is handed to GetPageContentAsync with that URL.
    var nextPageUrl = GetUrl(nextPagePattern, pageHtml, item.Url);
    if (!string.IsNullOrEmpty(nextPageUrl))
    {
        pageBody = await GetPageContentAsync(
            pageBody,
            rule.Charset,
            nextPageUrl,
            rule.CookieString,
            excludePattern,
            rule.ContentHtmlClearCollection,
            rule.ContentHtmlClearTagCollection,
            bodyPattern,
            bodyPattern2,
            bodyPattern3,
            nextPagePattern);
    }

    var pageChannel = GetValue("channel", channelPattern, pageHtml);

    var attributes = new NameValueCollection();
    foreach (var attributeName in attributeNames)
    {
        var takeFromList = GetByListValue(rule, attributeName);
        var startMarker = GetStartValue(rule, attributeName);
        var endMarker = GetEndValue(rule, attributeName);
        var fallbackValue = GetDefaultValue(rule, attributeName);
        var attributePattern = GetRegexAttributeName(attributeName, startMarker, endMarker);

        string extracted;
        if (takeFromList)
        {
            extracted = item.Content.Get<string>(attributeName);
        }
        else
        {
            extracted = GetValue(attributeName, attributePattern, pageHtml);
        }

        // An empty extraction falls back to the rule's default for this attribute.
        attributes.Set(attributeName, string.IsNullOrEmpty(extracted) ? fallbackValue : extracted);
    }

    attributes.Add("标题", pageTitle);
    attributes.Add("栏目", pageChannel);
    attributes.Add("正文", pageBody);

    return attributes;
}
/// <summary>
/// Runs a gather task for one rule: loads the rule, site and target channel, collects the
/// candidate items, gathers them one by one through <c>GatherOneAsync</c>, and records
/// progress in the cache created by <c>InitCache</c> for <paramref name="guid"/>.
/// </summary>
/// <param name="adminId">Administrator id passed through to <c>GatherOneAsync</c>.</param>
/// <param name="siteId">Site id; also used as the channel id when the rule's channel is not found.</param>
/// <param name="ruleId">Id of the gather rule to execute.</param>
/// <param name="guid">Identifier used to initialise the progress cache.</param>
private async Task GatherChannelsAsync(int adminId, int siteId, int ruleId, string guid)
{
    var progress = InitCache(guid, "开始获取链接...");
    //if (isCli) await CliUtils.PrintLine(cache.Message);

    var rule = await _ruleRepository.GetAsync(ruleId);
    var site = await _siteRepository.GetAsync(siteId);

    // If the rule's channel cannot be loaded, use the channel whose id equals the site id
    // and point the rule at it.
    var channel = await _channelRepository.GetAsync(rule.ChannelId);
    if (channel == null)
    {
        channel = await _channelRepository.GetAsync(siteId);
        rule.ChannelId = siteId;
    }

    var titleIncludePattern = GatherUtils.GetRegexString(rule.TitleInclude);
    var excludePattern = GatherUtils.GetRegexString(rule.ContentExclude);
    var channelPattern = GatherUtils.GetRegexChannel(rule.ContentChannelStart, rule.ContentChannelEnd);
    var bodyPattern = GatherUtils.GetRegexContent(rule.ContentContentStart, rule.ContentContentEnd);

    // The 2nd and 3rd body patterns exist only when both of their markers are configured.
    var bodyPattern2 = string.IsNullOrEmpty(rule.ContentContentStart2) || string.IsNullOrEmpty(rule.ContentContentEnd2)
        ? string.Empty
        : GatherUtils.GetRegexContent(rule.ContentContentStart2, rule.ContentContentEnd2);
    var bodyPattern3 = string.IsNullOrEmpty(rule.ContentContentStart3) || string.IsNullOrEmpty(rule.ContentContentEnd3)
        ? string.Empty
        : GatherUtils.GetRegexContent(rule.ContentContentStart3, rule.ContentContentEnd3);

    var nextPagePattern = GatherUtils.GetRegexUrl(rule.ContentNextPageStart, rule.ContentNextPageEnd);
    var titlePattern = GatherUtils.GetRegexTitle(rule.ContentTitleStart, rule.ContentTitleEnd);
    var attributeNames = ListUtils.GetStringList(rule.ContentAttributes);

    var items = await GatherUtils.GetAllItemsAsync(rule, progress);

    // A positive GatherNum is used as the total; otherwise the number of items found is used.
    if (rule.GatherNum > 0)
    {
        progress.TotalCount = rule.GatherNum;
    }
    else
    {
        progress.TotalCount = items.Count;
    }

    progress.IsSuccess = true;
    progress.Message = "开始采集内容...";
    //if (isCli) await CliUtils.PrintLine(cache.Message);

    var gatheredIds = new List<KeyValuePair<int, int>>();

    foreach (var item in items)
    {
        var outcome = await GatherOneAsync(
            site,
            channel,
            titleIncludePattern,
            excludePattern,
            titlePattern,
            bodyPattern,
            bodyPattern2,
            bodyPattern3,
            nextPagePattern,
            channelPattern,
            attributeNames,
            rule,
            item,
            gatheredIds,
            adminId);

        if (!outcome.Success)
        {
            progress.FailureCount++;
            progress.IsSuccess = false;
            progress.Message = outcome.ErrorMessage;
            //if (isCli) await CliUtils.PrintErrorAsync($"采集失败:{errorMessage}");
            progress.FailureMessages.Add(outcome.ErrorMessage);
        }
        else
        {
            progress.SuccessCount++;
            progress.IsSuccess = true;
            progress.Message = $"采集成功:{outcome.Title}";
            //if (isCli) await CliUtils.PrintLine(cache.Message);
        }

        // Stop as soon as the number of successes equals the total.
        if (progress.SuccessCount == progress.TotalCount)
        {
            break;
        }
    }

    // Disabled code, kept for reference:
    //if (rule.IsChecked)
    //{
    //    foreach (var channelIdAndContentId in channelIdAndContentIdList)
    //    {
    //        var channelId = channelIdAndContentId.Key;
    //        var contentId = channelIdAndContentId.Value;
    //        CreateManager.CreateContent(siteId, channelId, contentId);
    //    }
    //}

    await _ruleRepository.UpdateLastGatherDateAsync(ruleId);

    progress.Status = StatusSuccess;
    progress.IsSuccess = true;
    progress.Message = $"任务完成,共采集内容 {progress.SuccessCount} 篇。";
    //if (isCli) await CliUtils.PrintLine(cache.Message);
}