Beispiel #1
0
        /// <summary>
        /// Downloads the page at <c>item.Url</c> and extracts its title, channel, body and the
        /// custom attributes listed in <c>rule.ContentAttributes</c>.
        /// </summary>
        /// <param name="rule">Gather rule supplying charset, cookie string, start/end markers and clean-up lists.</param>
        /// <param name="item">List item whose URL is fetched; also the source of any "by list" values.</param>
        /// <returns>
        /// A collection holding one entry per configured attribute, followed by the entries
        /// "标题" (title), "栏目" (channel) and "正文" (body).
        /// </returns>
        /// <exception cref="Exception">Thrown with the download's error message when the fetch is not successful.</exception>
        public static async Task<NameValueCollection> GetContentNameValueCollectionAsync(Rule rule, Item item)
        {
            var fetched = await WebClientUtils.GetRemoteHtmlAsync(item.Url, rule.Charset, rule.CookieString);
            if (!fetched.IsSuccess)
            {
                throw new Exception(fetched.ErrorMessage);
            }

            var html = fetched.Content;

            // Build every extraction pattern up front from the rule's start/end markers.
            var excludePattern = GatherUtils.GetRegexString(rule.ContentExclude);
            var channelPattern = GatherUtils.GetRegexChannel(rule.ContentChannelStart, rule.ContentChannelEnd);
            var firstBodyPattern = GatherUtils.GetRegexContent(rule.ContentContentStart, rule.ContentContentEnd);

            // The second and third body patterns are optional: both markers must be set.
            var secondBodyPattern = string.IsNullOrEmpty(rule.ContentContentStart2) || string.IsNullOrEmpty(rule.ContentContentEnd2)
                ? string.Empty
                : GatherUtils.GetRegexContent(rule.ContentContentStart2, rule.ContentContentEnd2);
            var thirdBodyPattern = string.IsNullOrEmpty(rule.ContentContentStart3) || string.IsNullOrEmpty(rule.ContentContentEnd3)
                ? string.Empty
                : GatherUtils.GetRegexContent(rule.ContentContentStart3, rule.ContentContentEnd3);

            var nextPagePattern = GatherUtils.GetRegexUrl(rule.ContentNextPageStart, rule.ContentNextPageEnd);
            var titlePattern = GatherUtils.GetRegexTitle(rule.ContentTitleStart, rule.ContentTitleEnd);
            var attributeNames = ListUtils.GetStringList(rule.ContentAttributes);

            var title = rule.ContentTitleByList ? item.Content.Title : GetValue("title", titlePattern, html);

            // Try the primary body pattern first, then each configured fallback until one yields text.
            var body = GetValue("content", firstBodyPattern, html);
            foreach (var fallbackPattern in new[] { secondBodyPattern, thirdBodyPattern })
            {
                if (!string.IsNullOrEmpty(body))
                {
                    break;
                }
                if (string.IsNullOrEmpty(fallbackPattern))
                {
                    continue;
                }
                body = GetValue("content", fallbackPattern, html);
            }

            if (!string.IsNullOrEmpty(excludePattern))
            {
                body = Replace(excludePattern, body, string.Empty);
            }

            // Remove listed elements together with everything between their open and close tags.
            if (!string.IsNullOrEmpty(rule.ContentHtmlClearCollection))
            {
                foreach (var element in StringCollectionToList(rule.ContentHtmlClearCollection))
                {
                    body = Replace($@"<{element}[^>]*>.*?<\/{element}>", body, string.Empty);
                }
            }

            // Remove only the open/close tags of the listed elements, keeping their inner content.
            if (!string.IsNullOrEmpty(rule.ContentHtmlClearTagCollection))
            {
                foreach (var tag in StringCollectionToList(rule.ContentHtmlClearTagCollection))
                {
                    body = Replace($@"<{tag}[^>]*>", body, string.Empty);
                    body = Replace($@"<\/{tag}>", body, string.Empty);
                }
            }

            // When a next-page link is found, hand the body over to the paging helper.
            var nextPageUrl = GetUrl(nextPagePattern, html, item.Url);
            if (!string.IsNullOrEmpty(nextPageUrl))
            {
                body = await GetPageContentAsync(
                    body,
                    rule.Charset,
                    nextPageUrl,
                    rule.CookieString,
                    excludePattern,
                    rule.ContentHtmlClearCollection,
                    rule.ContentHtmlClearTagCollection,
                    firstBodyPattern,
                    secondBodyPattern,
                    thirdBodyPattern,
                    nextPagePattern);
            }

            var channel = GetValue("channel", channelPattern, html);

            var attributes = new NameValueCollection();
            foreach (var attributeName in attributeNames)
            {
                var takeFromList = GetByListValue(rule, attributeName);
                var startMarker = GetStartValue(rule, attributeName);
                var endMarker = GetEndValue(rule, attributeName);
                var fallbackValue = GetDefaultValue(rule, attributeName);
                var attributePattern = GetRegexAttributeName(attributeName, startMarker, endMarker);

                string value;
                if (takeFromList)
                {
                    value = item.Content.Get<string>(attributeName);
                }
                else
                {
                    value = GetValue(attributeName, attributePattern, html);
                }

                // An empty extraction falls back to the rule's default for this attribute.
                if (string.IsNullOrEmpty(value))
                {
                    value = fallbackValue;
                }

                attributes.Set(attributeName, value);
            }

            attributes.Add("标题", title);
            attributes.Add("栏目", channel);
            attributes.Add("正文", body);

            return attributes;
        }
        /// <summary>
        /// Runs one gather task: loads the rule, collects the item list, gathers each item through
        /// <c>GatherOneAsync</c>, and reports progress through the cache created for <paramref name="guid"/>.
        /// </summary>
        /// <param name="adminId">Administrator id forwarded to <c>GatherOneAsync</c>.</param>
        /// <param name="siteId">Site id; also used as the fallback channel id when the rule's channel is not found.</param>
        /// <param name="ruleId">Id of the gather rule to execute.</param>
        /// <param name="guid">Key passed to <c>InitCache</c> to create the progress cache.</param>
        private async Task GatherChannelsAsync(int adminId, int siteId, int ruleId, string guid)
        {
            var progress = InitCache(guid, "开始获取链接...");

            var rule = await _ruleRepository.GetAsync(ruleId);
            var site = await _siteRepository.GetAsync(siteId);
            var channel = await _channelRepository.GetAsync(rule.ChannelId);

            // Fall back to the channel whose id equals the site id when the rule's channel is missing.
            if (channel == null)
            {
                channel = await _channelRepository.GetAsync(siteId);
                rule.ChannelId = siteId;
            }

            // Build every extraction pattern once; they are reused for all items.
            var titleIncludePattern = GatherUtils.GetRegexString(rule.TitleInclude);
            var excludePattern = GatherUtils.GetRegexString(rule.ContentExclude);
            var channelPattern = GatherUtils.GetRegexChannel(rule.ContentChannelStart, rule.ContentChannelEnd);
            var firstBodyPattern = GatherUtils.GetRegexContent(rule.ContentContentStart, rule.ContentContentEnd);

            // The second and third body patterns are optional: both markers must be set.
            var secondBodyPattern = string.IsNullOrEmpty(rule.ContentContentStart2) || string.IsNullOrEmpty(rule.ContentContentEnd2)
                ? string.Empty
                : GatherUtils.GetRegexContent(rule.ContentContentStart2, rule.ContentContentEnd2);
            var thirdBodyPattern = string.IsNullOrEmpty(rule.ContentContentStart3) || string.IsNullOrEmpty(rule.ContentContentEnd3)
                ? string.Empty
                : GatherUtils.GetRegexContent(rule.ContentContentStart3, rule.ContentContentEnd3);

            var nextPagePattern = GatherUtils.GetRegexUrl(rule.ContentNextPageStart, rule.ContentNextPageEnd);
            var titlePattern = GatherUtils.GetRegexTitle(rule.ContentTitleStart, rule.ContentTitleEnd);
            var attributeNames = ListUtils.GetStringList(rule.ContentAttributes);

            var items = await GatherUtils.GetAllItemsAsync(rule, progress);

            // A positive GatherNum is the target count; otherwise every collected item counts.
            if (rule.GatherNum > 0)
            {
                progress.TotalCount = rule.GatherNum;
            }
            else
            {
                progress.TotalCount = items.Count;
            }
            progress.IsSuccess = true;
            progress.Message = "开始采集内容...";

            // Passed to GatherOneAsync for every item; it is not read again in this method.
            var gatheredChannelContentIds = new List<KeyValuePair<int, int>>();

            foreach (var item in items)
            {
                var outcome = await GatherOneAsync(
                    site,
                    channel,
                    titleIncludePattern,
                    excludePattern,
                    titlePattern,
                    firstBodyPattern,
                    secondBodyPattern,
                    thirdBodyPattern,
                    nextPagePattern,
                    channelPattern,
                    attributeNames,
                    rule,
                    item,
                    gatheredChannelContentIds,
                    adminId);

                if (!outcome.Success)
                {
                    progress.FailureCount++;
                    progress.IsSuccess = false;
                    progress.Message = outcome.ErrorMessage;
                    progress.FailureMessages.Add(outcome.ErrorMessage);
                }
                else
                {
                    progress.SuccessCount++;
                    progress.IsSuccess = true;
                    progress.Message = $"采集成功:{outcome.Title}";
                }

                // Stop as soon as the number of successes reaches the target.
                if (progress.SuccessCount == progress.TotalCount)
                {
                    break;
                }
            }

            // NOTE(review): page generation for the gathered content used to be triggered here
            // when the rule was marked as checked; that step is currently disabled.

            await _ruleRepository.UpdateLastGatherDateAsync(ruleId);

            progress.Status = StatusSuccess;
            progress.IsSuccess = true;
            progress.Message = $"任务完成,共采集内容 {progress.SuccessCount} 篇。";
        }