Пример #1
0
        public static void Gather(IAdministratorInfo adminInfo, int siteId, int ruleId, string guid)
        {
            var cache = ProgressCache.Init(guid, "开始获取链接...");

            var gatherRuleInfo = Main.GatherRuleRepository.GetGatherRuleInfo(ruleId);
            var siteInfo       = Context.SiteApi.GetSiteInfo(siteId);
            var channelInfo    = Context.ChannelApi.GetChannelInfo(siteId, gatherRuleInfo.ChannelId);

            if (channelInfo == null)
            {
                channelInfo = Context.ChannelApi.GetChannelInfo(siteId, siteId);
                gatherRuleInfo.ChannelId = siteId;
            }

            var regexUrlInclude     = GetRegexString(gatherRuleInfo.UrlInclude);
            var regexTitleInclude   = GetRegexString(gatherRuleInfo.TitleInclude);
            var regexContentExclude = GetRegexString(gatherRuleInfo.ContentExclude);
            var regexListArea       = GetRegexArea(gatherRuleInfo.ListAreaStart, gatherRuleInfo.ListAreaEnd);
            var regexChannel        = GetRegexChannel(gatherRuleInfo.ContentChannelStart, gatherRuleInfo.ContentChannelEnd);
            var regexContent        = GetRegexContent(gatherRuleInfo.ContentContentStart, gatherRuleInfo.ContentContentEnd);
            var regexContent2       = string.Empty;

            if (!string.IsNullOrEmpty(gatherRuleInfo.ContentContentStart2) && !string.IsNullOrEmpty(gatherRuleInfo.ContentContentEnd2))
            {
                regexContent2 = GetRegexContent(gatherRuleInfo.ContentContentStart2, gatherRuleInfo.ContentContentEnd2);
            }
            var regexContent3 = string.Empty;

            if (!string.IsNullOrEmpty(gatherRuleInfo.ContentContentStart3) && !string.IsNullOrEmpty(gatherRuleInfo.ContentContentEnd3))
            {
                regexContent3 = GetRegexContent(gatherRuleInfo.ContentContentStart3, gatherRuleInfo.ContentContentEnd3);
            }
            var regexNextPage        = GetRegexUrl(gatherRuleInfo.ContentNextPageStart, gatherRuleInfo.ContentNextPageEnd);
            var regexTitle           = GetRegexTitle(gatherRuleInfo.ContentTitleStart, gatherRuleInfo.ContentTitleEnd);
            var contentAttributes    = TranslateUtils.StringCollectionToStringList(gatherRuleInfo.ContentAttributes);
            var contentAttributesXml = TranslateUtils.ToNameValueCollection(gatherRuleInfo.ContentAttributesXml);

            var contentUrls = GetContentUrlList(gatherRuleInfo, regexListArea, regexUrlInclude, cache);

            cache.TotalCount = gatherRuleInfo.GatherNum > 0 ? gatherRuleInfo.GatherNum : contentUrls.Count;
            cache.IsSuccess  = true;
            cache.Message    = "开始采集内容...";

            var contentTitleDict          = new Dictionary <int, IList <string> >();
            var channelIdAndContentIdList = new List <KeyValuePair <int, int> >();

            foreach (var contentUrl in contentUrls)
            {
                if (GatherOneByUrl(siteInfo, channelInfo, gatherRuleInfo.IsSaveImage, gatherRuleInfo.IsSetFirstImageAsImageUrl, gatherRuleInfo.IsEmptyContentAllowed, gatherRuleInfo.IsSameTitleAllowed, gatherRuleInfo.IsChecked, gatherRuleInfo.Charset, contentUrl, gatherRuleInfo.CookieString, regexTitleInclude, regexContentExclude, gatherRuleInfo.ContentHtmlClearCollection, gatherRuleInfo.ContentHtmlClearTagCollection, gatherRuleInfo.ContentReplaceFrom, gatherRuleInfo.ContentReplaceTo, regexTitle, regexContent, regexContent2, regexContent3, regexNextPage, regexChannel, contentAttributes, contentAttributesXml, contentTitleDict, channelIdAndContentIdList, adminInfo, out var title, out var errorMessage))
                {
                    cache.SuccessCount++;
                    cache.IsSuccess = true;
                    cache.Message   = $"成功采集内容:{title}";
                }
Пример #2
0
        /// <summary>
        /// Creates a new progress record and stores it in the cache under <paramref name="guid"/>.
        /// </summary>
        /// <param name="guid">Cache key for the record; when null or empty nothing is created.</param>
        /// <param name="message">Initial value of the record's <c>Message</c>.</param>
        /// <returns>
        /// The newly created <see cref="ProgressCache"/>, or <c>null</c> when
        /// <paramref name="guid"/> is null or empty.
        /// </returns>
        public static ProgressCache Init(string guid, string message)
        {
            if (!string.IsNullOrEmpty(guid))
            {
                // A fresh record starts in the "progress" state, flagged successful,
                // with an empty failure list.
                var progress = new ProgressCache();
                progress.Status          = StatusProgress;
                progress.IsSuccess       = true;
                progress.Message         = message;
                progress.FailureMessages = new List <string>();

                // Stored via CacheUtils.InsertHours with an argument of 1
                // (presumably a one-hour lifetime -- confirm against CacheUtils).
                CacheUtils.InsertHours(guid, progress, 1);
                return progress;
            }

            // Without a key there is nowhere to store the record.
            return null;
        }
Пример #3
0
        /// <summary>
        /// Collects the content URLs found on every list page of a gather rule,
        /// reporting per-page progress through <paramref name="cache"/>.
        /// </summary>
        /// <param name="gatherRuleInfo">
        /// Rule that supplies the list-page URLs (via <c>GetGatherUrlList</c>) as well as the
        /// <c>Charset</c>, <c>CookieString</c> and <c>IsOrderByDesc</c> settings used here.
        /// </param>
        /// <param name="regexListArea">Passed through unchanged to <c>GetContentUrls</c>.</param>
        /// <param name="regexUrlInclude">Passed through unchanged to <c>GetContentUrls</c>.</param>
        /// <param name="cache">
        /// Progress record updated for each list page. May be <c>null</c>
        /// (<c>ProgressCache.Init</c> returns <c>null</c> for an empty guid); in that case
        /// no progress is reported.
        /// </param>
        /// <returns>
        /// The collected URLs in the order they were found, reversed as a whole when
        /// <c>gatherRuleInfo.IsOrderByDesc</c> is set.
        /// </returns>
        /// <remarks>
        /// When a cache is supplied, a failure on one list page is recorded in the cache and
        /// the remaining pages are still processed. When no cache is supplied there is nowhere
        /// to record the failure, so the exception propagates to the caller instead of being
        /// silently dropped.
        /// </remarks>
        public static List <string> GetContentUrlList(GatherRuleInfo gatherRuleInfo, string regexListArea, string regexUrlInclude, ProgressCache cache)
        {
            var gatherUrls  = GetGatherUrlList(gatherRuleInfo);
            var contentUrls = new List <string>();

            foreach (var gatherUrl in gatherUrls)
            {
                // The cache is optional: guard every write so a missing progress
                // record does not abort the whole run with a NullReferenceException.
                if (cache != null)
                {
                    cache.IsSuccess = true;
                    cache.Message   = "获取链接:" + gatherUrl;
                }

                try
                {
                    var urls = GetContentUrls(gatherUrl, gatherRuleInfo.Charset, gatherRuleInfo.CookieString, regexListArea, regexUrlInclude);
                    contentUrls.AddRange(urls);
                }
                catch (Exception ex) when (cache != null)
                {
                    // Best effort: note the failure and carry on with the next list page.
                    cache.IsSuccess = false;
                    cache.Message   = ex.Message;
                    cache.FailureMessages.Add(ex.Message);
                }
            }

            if (gatherRuleInfo.IsOrderByDesc)
            {
                contentUrls.Reverse();
            }

            return contentUrls;
        }