Пример #1
0
        private async Task GatherChannelsAsync(int adminId, int siteId, int ruleId, string guid)
        {
            var cache = InitCache(guid, "开始获取链接...");
            //if (isCli) await CliUtils.PrintLine(cache.Message);

            var rule = await _ruleRepository.GetAsync(ruleId);

            var siteInfo = await _siteRepository.GetAsync(siteId);

            var channelInfo = await _channelRepository.GetAsync(rule.ChannelId);

            if (channelInfo == null)
            {
                channelInfo = await _channelRepository.GetAsync(siteId);

                rule.ChannelId = siteId;
            }

            var regexTitleInclude   = GatherUtils.GetRegexString(rule.TitleInclude);
            var regexContentExclude = GatherUtils.GetRegexString(rule.ContentExclude);
            var regexChannel        = GatherUtils.GetRegexChannel(rule.ContentChannelStart, rule.ContentChannelEnd);
            var regexContent        = GatherUtils.GetRegexContent(rule.ContentContentStart, rule.ContentContentEnd);
            var regexContent2       = string.Empty;

            if (!string.IsNullOrEmpty(rule.ContentContentStart2) && !string.IsNullOrEmpty(rule.ContentContentEnd2))
            {
                regexContent2 = GatherUtils.GetRegexContent(rule.ContentContentStart2, rule.ContentContentEnd2);
            }
            var regexContent3 = string.Empty;

            if (!string.IsNullOrEmpty(rule.ContentContentStart3) && !string.IsNullOrEmpty(rule.ContentContentEnd3))
            {
                regexContent3 = GatherUtils.GetRegexContent(rule.ContentContentStart3, rule.ContentContentEnd3);
            }
            var regexNextPage     = GatherUtils.GetRegexUrl(rule.ContentNextPageStart, rule.ContentNextPageEnd);
            var regexTitle        = GatherUtils.GetRegexTitle(rule.ContentTitleStart, rule.ContentTitleEnd);
            var contentAttributes = ListUtils.GetStringList(rule.ContentAttributes);

            var items = await GatherUtils.GetAllItemsAsync(rule, cache);

            cache.TotalCount = rule.GatherNum > 0 ? rule.GatherNum : items.Count;
            cache.IsSuccess  = true;
            cache.Message    = "开始采集内容...";
            //if (isCli) await CliUtils.PrintLine(cache.Message);

            var channelIdAndContentIdList = new List <KeyValuePair <int, int> >();

            foreach (var item in items)
            {
                var result = await GatherOneAsync(siteInfo, channelInfo,
                                                  regexTitleInclude, regexContentExclude, regexTitle, regexContent, regexContent2, regexContent3, regexNextPage, regexChannel,
                                                  contentAttributes,
                                                  rule, item, channelIdAndContentIdList, adminId);

                if (result.Success)
                {
                    cache.SuccessCount++;
                    cache.IsSuccess = true;
                    cache.Message   = $"采集成功:{result.Title}";
                    //if (isCli) await CliUtils.PrintLine(cache.Message);
                }
                else
                {
                    cache.FailureCount++;
                    cache.IsSuccess = false;
                    cache.Message   = result.ErrorMessage;
                    //if (isCli) await CliUtils.PrintErrorAsync($"采集失败:{errorMessage}");
                    cache.FailureMessages.Add(result.ErrorMessage);
                }
                if (cache.SuccessCount == cache.TotalCount)
                {
                    break;
                }
            }

            //if (rule.IsChecked)
            //{
            //    foreach (var channelIdAndContentId in channelIdAndContentIdList)
            //    {
            //        var channelId = channelIdAndContentId.Key;
            //        var contentId = channelIdAndContentId.Value;

            //        CreateManager.CreateContent(siteId, channelId, contentId);
            //    }
            //}

            await _ruleRepository.UpdateLastGatherDateAsync(ruleId);

            cache.Status    = StatusSuccess;
            cache.IsSuccess = true;
            cache.Message   = $"任务完成,共采集内容 {cache.SuccessCount} 篇。";
            //if (isCli) await CliUtils.PrintLine(cache.Message);
        }
Пример #2
0
        public static async Task <NameValueCollection> GetContentNameValueCollectionAsync(Rule rule, Item item)
        {
            var attributes = new NameValueCollection();

            var result = await WebClientUtils.GetRemoteHtmlAsync(item.Url, rule.Charset, rule.CookieString);

            if (!result.IsSuccess)
            {
                throw new Exception(result.ErrorMessage);
            }

            var contentHtml         = result.Content;
            var regexContentExclude = GatherUtils.GetRegexString(rule.ContentExclude);
            var regexChannel        = GatherUtils.GetRegexChannel(rule.ContentChannelStart, rule.ContentChannelEnd);
            var regexContent        = GatherUtils.GetRegexContent(rule.ContentContentStart, rule.ContentContentEnd);
            var regexContent2       = string.Empty;

            if (!string.IsNullOrEmpty(rule.ContentContentStart2) && !string.IsNullOrEmpty(rule.ContentContentEnd2))
            {
                regexContent2 = GatherUtils.GetRegexContent(rule.ContentContentStart2, rule.ContentContentEnd2);
            }
            var regexContent3 = string.Empty;

            if (!string.IsNullOrEmpty(rule.ContentContentStart3) && !string.IsNullOrEmpty(rule.ContentContentEnd3))
            {
                regexContent3 = GatherUtils.GetRegexContent(rule.ContentContentStart3, rule.ContentContentEnd3);
            }
            var regexNextPage     = GatherUtils.GetRegexUrl(rule.ContentNextPageStart, rule.ContentNextPageEnd);
            var regexTitle        = GatherUtils.GetRegexTitle(rule.ContentTitleStart, rule.ContentTitleEnd);
            var contentAttributes = ListUtils.GetStringList(rule.ContentAttributes);

            var title = rule.ContentTitleByList ? item.Content.Title : GetValue("title", regexTitle, contentHtml);
            var body  = GetValue("content", regexContent, contentHtml);

            if (string.IsNullOrEmpty(body) && !string.IsNullOrEmpty(regexContent2))
            {
                body = GetValue("content", regexContent2, contentHtml);
            }
            if (string.IsNullOrEmpty(body) && !string.IsNullOrEmpty(regexContent3))
            {
                body = GetValue("content", regexContent3, contentHtml);
            }

            if (!string.IsNullOrEmpty(regexContentExclude))
            {
                body = Replace(regexContentExclude, body, string.Empty);
            }
            if (!string.IsNullOrEmpty(rule.ContentHtmlClearCollection))
            {
                var htmlClearList = StringCollectionToList(rule.ContentHtmlClearCollection);
                foreach (var htmlClear in htmlClearList)
                {
                    var clearRegex = $@"<{htmlClear}[^>]*>.*?<\/{htmlClear}>";
                    body = Replace(clearRegex, body, string.Empty);
                }
            }
            if (!string.IsNullOrEmpty(rule.ContentHtmlClearTagCollection))
            {
                var htmlClearTagList = StringCollectionToList(rule.ContentHtmlClearTagCollection);
                foreach (var htmlClearTag in htmlClearTagList)
                {
                    var clearRegex = $@"<{htmlClearTag}[^>]*>";
                    body       = Replace(clearRegex, body, string.Empty);
                    clearRegex = $@"<\/{htmlClearTag}>";
                    body       = Replace(clearRegex, body, string.Empty);
                }
            }

            var contentNextPageUrl = GetUrl(regexNextPage, contentHtml, item.Url);

            if (!string.IsNullOrEmpty(contentNextPageUrl))
            {
                body = await GetPageContentAsync(body, rule.Charset, contentNextPageUrl, rule.CookieString, regexContentExclude, rule.ContentHtmlClearCollection, rule.ContentHtmlClearTagCollection, regexContent, regexContent2, regexContent3, regexNextPage);
            }
            var channel = GetValue("channel", regexChannel, contentHtml);

            foreach (var attributeName in contentAttributes)
            {
                var normalByList  = GetByListValue(rule, attributeName);
                var normalStart   = GetStartValue(rule, attributeName);
                var normalEnd     = GetEndValue(rule, attributeName);
                var normalDefault = GetDefaultValue(rule, attributeName);
                var regex         = GetRegexAttributeName(attributeName, normalStart, normalEnd);
                var value         = normalByList ? item.Content.Get <string>(attributeName) : GetValue(attributeName, regex, contentHtml);
                if (string.IsNullOrEmpty(value))
                {
                    value = normalDefault;
                }
                attributes.Set(attributeName, value);
            }

            attributes.Add("标题", title);
            attributes.Add("栏目", channel);
            attributes.Add("正文", body);

            return(attributes);
        }
Пример #3
0
        private async Task <(bool Success, string Title, string ErrorMessage)> GatherOneAsync(Site siteInfo, Channel channelInfo, string regexTitleInclude, string regexContentExclude, string regexTitle, string regexContent, string regexContent2, string regexContent3, string regexNextPage, string regexChannel, IEnumerable <string> contentAttributes, Rule rule, Item item, ICollection <KeyValuePair <int, int> > channelIdAndContentIdList, int adminId)
        {
            try
            {
                var result = await WebClientUtils.GetRemoteHtmlAsync(item.Url, rule.Charset, rule.CookieString);

                if (!result.IsSuccess)
                {
                    return(false, string.Empty, result.ErrorMessage);
                }
                var contentHtml  = result.Content;
                var errorMessage = string.Empty;

                var title   = rule.ContentTitleByList ? item.Content.Title : GatherUtils.GetValue("title", regexTitle, contentHtml);
                var content = GatherUtils.GetValue("content", regexContent, contentHtml);
                if (string.IsNullOrEmpty(content) && !string.IsNullOrEmpty(regexContent2))
                {
                    content = GatherUtils.GetValue("content", regexContent2, contentHtml);
                }
                if (string.IsNullOrEmpty(content) && !string.IsNullOrEmpty(regexContent3))
                {
                    content = GatherUtils.GetValue("content", regexContent3, contentHtml);
                }

                //如果标题或内容为空,返回false并退出
                if (string.IsNullOrEmpty(title))
                {
                    errorMessage = $"无法获取标题:{item.Url}";
                    return(false, title, errorMessage);
                }
                if (rule.IsEmptyContentAllowed == false && string.IsNullOrEmpty(content))
                {
                    errorMessage = $"无法获取内容正文:{item.Url}";
                    return(false, title, errorMessage);
                }

                title = StringUtils.StripTags(title);

                if (!string.IsNullOrEmpty(regexTitleInclude))
                {
                    if (GatherUtils.IsMatch(regexTitleInclude, title) == false)
                    {
                        errorMessage = $"标题不符合要求:{item.Url}";
                        return(false, title, errorMessage);
                    }
                }
                if (!string.IsNullOrEmpty(regexContentExclude))
                {
                    content = GatherUtils.Replace(regexContentExclude, content, string.Empty);
                }
                if (!string.IsNullOrEmpty(rule.ContentHtmlClearCollection))
                {
                    var htmlClearList = GatherUtils.StringCollectionToList(rule.ContentHtmlClearCollection);
                    foreach (var htmlClear in htmlClearList)
                    {
                        var clearRegex = $@"<{htmlClear}[^>]*>.*?<\/{htmlClear}>";
                        content = GatherUtils.Replace(clearRegex, content, string.Empty);
                    }
                }
                if (!string.IsNullOrEmpty(rule.ContentHtmlClearTagCollection))
                {
                    var htmlClearTagList = GatherUtils.StringCollectionToList(rule.ContentHtmlClearTagCollection);
                    foreach (var htmlClearTag in htmlClearTagList)
                    {
                        var clearRegex = $@"<{htmlClearTag}[^>]*>";
                        content    = GatherUtils.Replace(clearRegex, content, string.Empty);
                        clearRegex = $@"<\/{htmlClearTag}>";
                        content    = GatherUtils.Replace(clearRegex, content, string.Empty);
                    }
                }

                var contentNextPageUrl = GatherUtils.GetUrl(regexNextPage, contentHtml, item.Url);
                if (!string.IsNullOrEmpty(contentNextPageUrl))
                {
                    try
                    {
                        content = await GatherUtils.GetPageContentAsync(content, rule.Charset, contentNextPageUrl, rule.CookieString, regexContentExclude, rule.ContentHtmlClearCollection, rule.ContentHtmlClearTagCollection, regexContent, regexContent2, regexContent3, regexNextPage);
                    }
                    catch (Exception ex)
                    {
                        errorMessage = ex.Message;
                        return(false, title, errorMessage);
                    }
                }

                var channel   = GatherUtils.GetValue("channel", regexChannel, contentHtml);
                var channelId = channelInfo.Id;
                if (!string.IsNullOrEmpty(channel))
                {
                    var channelIdByNodeName = 0;

                    var childChannelIdList = await _channelRepository.GetChannelIdsAsync(siteInfo.Id, channelInfo.Id, ScopeType.All);

                    foreach (var childChannelId in childChannelIdList)
                    {
                        if (channel == await _channelRepository.GetChannelNameAsync(siteInfo.Id, childChannelId))
                        {
                            channelIdByNodeName = childChannelId;
                        }
                    }

                    //var channelIdByNodeName = ChannelManager.GetChannelIdByParentIdAndChannelName(siteInfo.Id, channelInfo.Id, channel, recursive: false);
                    if (channelIdByNodeName == 0)
                    {
                        var newChannelInfo = new Channel
                        {
                            SiteId               = siteInfo.Id,
                            ParentId             = channelInfo.Id,
                            ChannelName          = channel,
                            ContentModelPluginId = channelInfo.ContentModelPluginId
                        };

                        channelId = await _channelRepository.InsertAsync(newChannelInfo);
                    }
                    else
                    {
                        channelId = channelIdByNodeName;
                    }
                }

                if (!rule.IsSameTitleAllowed)
                {
                    var theChannel = await _channelRepository.GetAsync(channelId);

                    var contentIds = await _contentRepository.GetContentIdsBySameTitleAsync(siteInfo, theChannel, title);

                    if (contentIds.Count > 0)
                    {
                        errorMessage = $"已包含相同标题:{title}";
                        return(false, title, errorMessage);
                    }
                }

                var contentInfo = new Content
                {
                    AddDate = DateTime.Now
                };

                foreach (var attributeName in contentAttributes)
                {
                    if (!StringUtils.EqualsIgnoreCase(attributeName, nameof(Content.Title)) && !StringUtils.EqualsIgnoreCase(attributeName, nameof(Content.Body)))
                    {
                        var normalByList = GatherUtils.GetByListValue(rule, attributeName);
                        var normalStart  = GatherUtils.GetStartValue(rule, attributeName);
                        var normalEnd    = GatherUtils.GetEndValue(rule, attributeName);

                        //采集为空时的默认值
                        var normalDefault = GatherUtils.GetDefaultValue(rule, attributeName);

                        var regex = GatherUtils.GetRegexAttributeName(attributeName, normalStart, normalEnd);
                        var value = normalByList ? item.Content.Get <string>(attributeName) : GatherUtils.GetValue(attributeName, regex, contentHtml);

                        //采集为空时的默认值
                        if (string.IsNullOrEmpty(value))
                        {
                            value = normalDefault;
                        }

                        if (StringUtils.EqualsIgnoreCase(nameof(Content.AddDate), attributeName))
                        {
                            value = GatherUtils.ReplaceFirst(value, ":", ":");
                            contentInfo.AddDate = TranslateUtils.ToDateTime(value, DateTime.Now);
                        }
                        else if (StringUtils.EqualsIgnoreCase(nameof(Content.Color), attributeName))
                        {
                            contentInfo.Color = TranslateUtils.ToBool(value, defaultValue: false);
                        }
                        else if (StringUtils.EqualsIgnoreCase(nameof(Content.Hot), attributeName))
                        {
                            contentInfo.Hot = TranslateUtils.ToBool(value, defaultValue: false);
                        }
                        else if (StringUtils.EqualsIgnoreCase(nameof(Content.Recommend), attributeName))
                        {
                            contentInfo.Recommend = TranslateUtils.ToBool(value, defaultValue: false);
                        }
                        else if (StringUtils.EqualsIgnoreCase(nameof(Content.Top), attributeName))
                        {
                            contentInfo.Top = TranslateUtils.ToBool(value, defaultValue: false);
                        }
                        else if (StringUtils.EqualsIgnoreCase(nameof(Content.ImageUrl), attributeName))
                        {
                            if (!string.IsNullOrEmpty(value))
                            {
                                var attachmentUrl = GatherUtils.GetUrlByBaseUrl(value, item.Url);

                                var fileExtension = PageUtils.GetExtensionFromUrl(attachmentUrl);
                                var fileName      =
                                    $"{StringUtils.GetShortGuid(false)}{fileExtension}";

                                var directoryPath = await _pathManager.GetUploadDirectoryPathAsync(siteInfo, UploadType.Image);

                                var filePath = PathUtils.Combine(directoryPath, fileName);
                                DirectoryUtils.CreateDirectoryIfNotExists(filePath);
                                try
                                {
                                    await WebClientUtils.DownloadAsync(attachmentUrl, filePath);

                                    contentInfo.ImageUrl =
                                        await _pathManager.GetVirtualUrlByPhysicalPathAsync(siteInfo, filePath);
                                }
                                catch
                                {
                                    // ignored
                                }
                            }
                        }
                        else if (StringUtils.EqualsIgnoreCase(nameof(Content.VideoUrl), attributeName))
                        {
                            if (!string.IsNullOrEmpty(value))
                            {
                                var attachmentUrl = GatherUtils.GetUrlByBaseUrl(value, item.Url);
                                var fileExtension = PageUtils.GetExtensionFromUrl(attachmentUrl);
                                var fileName      = $"{StringUtils.GetShortGuid(false)}{fileExtension}";

                                var directoryPath = await _pathManager.GetUploadDirectoryPathAsync(siteInfo, UploadType.Video);

                                var filePath = PathUtils.Combine(directoryPath, fileName);
                                DirectoryUtils.CreateDirectoryIfNotExists(filePath);
                                try
                                {
                                    await WebClientUtils.DownloadAsync(attachmentUrl, filePath);

                                    contentInfo.VideoUrl = await _pathManager.GetVirtualUrlByPhysicalPathAsync(siteInfo, filePath);
                                }
                                catch
                                {
                                    // ignored
                                }
                            }
                        }
                        else if (StringUtils.EqualsIgnoreCase(nameof(Content.FileUrl), attributeName))
                        {
                            if (!string.IsNullOrEmpty(value))
                            {
                                var attachmentUrl = GatherUtils.GetUrlByBaseUrl(value, item.Url);
                                var fileExtension = PageUtils.GetExtensionFromUrl(attachmentUrl);
                                var fileName      = $"{StringUtils.GetShortGuid(false)}{fileExtension}";

                                var directoryPath = await _pathManager.GetUploadDirectoryPathAsync(siteInfo, UploadType.File);

                                var filePath = PathUtils.Combine(directoryPath, fileName);
                                DirectoryUtils.CreateDirectoryIfNotExists(filePath);
                                try
                                {
                                    await WebClientUtils.DownloadAsync(attachmentUrl, filePath);

                                    contentInfo.FileUrl = await _pathManager.GetVirtualUrlByPhysicalPathAsync(siteInfo, filePath);
                                }
                                catch
                                {
                                    // ignored
                                }
                            }
                        }
                        else if (StringUtils.EqualsIgnoreCase(nameof(Content.Hits), attributeName))
                        {
                            contentInfo.Hits = TranslateUtils.ToInt(value);
                        }
                        else if (StringUtils.EqualsIgnoreCase("FileName", attributeName) && !string.IsNullOrEmpty(rule.FileNameAttributeName))
                        {
                            var fileName = PathUtils.GetFileNameWithoutExtension(item.Url);
                            contentInfo.Set(rule.FileNameAttributeName, fileName);
                        }
                        else
                        {
                            contentInfo.Set(attributeName, value);
                        }
                    }
                }

                var firstImageUrl = string.Empty;
                if (rule.IsSaveImage)
                {
                    var originalImageSrcList = GatherUtils.GetOriginalImageSrcList(content);
                    var imageSrcList         = GatherUtils.GetImageSrcList(item.Url, content);
                    if (originalImageSrcList.Count == imageSrcList.Count)
                    {
                        for (var i = 0; i < originalImageSrcList.Count; i++)
                        {
                            var originalImageSrc = originalImageSrcList[i];
                            var imageSrc         = imageSrcList[i];

                            var fileExtension = PathUtils.GetExtension(originalImageSrc);
                            var fileName      = $"{StringUtils.GetShortGuid(false)}{fileExtension}";

                            var directoryPath = await _pathManager.GetUploadDirectoryPathAsync(siteInfo, UploadType.Image);

                            var filePath = PathUtils.Combine(directoryPath, fileName);
                            DirectoryUtils.CreateDirectoryIfNotExists(filePath);
                            try
                            {
                                await WebClientUtils.DownloadAsync(imageSrc, filePath);

                                var fileUrl = await _pathManager.GetVirtualUrlByPhysicalPathAsync(siteInfo, filePath);

                                content = content.Replace(originalImageSrc, fileUrl);
                                if (firstImageUrl == string.Empty)
                                {
                                    firstImageUrl = fileUrl;
                                }
                            }
                            catch
                            {
                                // ignored
                            }
                        }
                    }
                }

                if (rule.ImageSource == ImageSource.Content)
                {
                    if (string.IsNullOrEmpty(firstImageUrl))
                    {
                        var imageSrcList = GatherUtils.GetImageSrcList(item.Url, content);
                        if (imageSrcList.Count > 0)
                        {
                            firstImageUrl = imageSrcList[index : 0];
                        }
                    }

                    if (!string.IsNullOrEmpty(firstImageUrl))
                    {
                        contentInfo.ImageUrl = firstImageUrl;
                    }
                }
                else if (rule.ImageSource == ImageSource.List)
                {
                    contentInfo.ImageUrl = item.Content.ImageUrl;
                }

                if (rule.IsSaveFiles)
                {
                    var originalLinkHrefList = GatherUtils.GetOriginalLinkHrefList(content);
                    var linkHrefList         = GatherUtils.GetLinkHrefList(item.Url, content);
                    if (originalLinkHrefList.Count == linkHrefList.Count)
                    {
                        for (var i = 0; i < originalLinkHrefList.Count; i++)
                        {
                            var originalLinkHref = originalLinkHrefList[i];
                            var linkHref         = linkHrefList[i];

                            var fileExtension = PathUtils.GetExtension(originalLinkHref);
                            var fileName      = $"{StringUtils.GetShortGuid(false)}{fileExtension}";

                            var directoryPath = await _pathManager.GetUploadDirectoryPathAsync(siteInfo, UploadType.File);

                            var filePath = PathUtils.Combine(directoryPath, fileName);
                            DirectoryUtils.CreateDirectoryIfNotExists(filePath);
                            try
                            {
                                await WebClientUtils.DownloadAsync(linkHref, filePath);

                                var fileUrl = await _pathManager.GetVirtualUrlByPhysicalPathAsync(siteInfo, filePath);

                                content = content.Replace(originalLinkHref, fileUrl);
                            }
                            catch
                            {
                                // ignored
                            }
                        }
                    }
                }

                //contentInfo.Content = StringUtility.TextEditorContentEncode(content, siteInfo, false);
                contentInfo.SiteId          = siteInfo.Id;
                contentInfo.ChannelId       = channelId;
                contentInfo.AdminId         = adminId;
                contentInfo.LastEditAdminId = adminId;
                contentInfo.Checked         = rule.IsChecked;
                contentInfo.CheckedLevel    = 0;
                contentInfo.Title           = title;
                contentInfo.Body            = content;

                //contentInfo.SourceId = SourceManager.CaiJi;

                var theContentId = await _contentRepository.InsertAsync(siteInfo, channelInfo, contentInfo);

                channelIdAndContentIdList.Add(new KeyValuePair <int, int>(contentInfo.ChannelId, theContentId));

                return(true, title, string.Empty);
            }
            catch (Exception ex)
            {
                return(false, string.Empty, ex.Message);
            }
        }