예제 #1
0
        /// <summary>
        /// Downloads a list page and returns the distinct candidate content URLs found in it,
        /// in the order in which they were first encountered.
        /// </summary>
        /// <param name="gatherUrl">URL of the list page to download; also handed to <c>GetUrls</c> together with the HTML.</param>
        /// <param name="charset">Charset name, converted with <c>ECharsetUtils.GetEnumType</c> before downloading.</param>
        /// <param name="cookieString">Cookie string forwarded to the download helper.</param>
        /// <param name="regexListArea">Optional regex used to narrow the page to an "area" fragment before links are extracted.</param>
        /// <param name="regexUrlInclude">Optional regex; when non-empty, only URLs matching it are kept.</param>
        /// <returns>The filtered URLs without duplicates (possibly empty).</returns>
        public static List<string> GetContentUrls(string gatherUrl, string charset, string cookieString, string regexListArea, string regexUrlInclude)
        {
            var listHtml = WebClientUtils.GetRemoteFileSource(gatherUrl, ECharsetUtils.GetEnumType(charset), cookieString);

            var areaHtml = string.Empty;
            if (!string.IsNullOrEmpty(regexListArea))
            {
                areaHtml = GetContent("area", regexListArea, listHtml);
            }

            // When no area regex was supplied, or it produced nothing, links are taken from the whole page.
            var urlsList  = GetUrls(!string.IsNullOrEmpty(areaHtml) ? areaHtml : listHtml, gatherUrl);
            var isInclude = !string.IsNullOrEmpty(regexUrlInclude);

            var contentUrls = new List<string>();
            // Set used only for O(1) duplicate detection; the list keeps first-seen order.
            var seenUrls = new HashSet<string>();

            foreach (var url in urlsList)
            {
                if (string.IsNullOrEmpty(url))
                {
                    continue;
                }

                // Links come from HTML, so un-escape the ampersand entity.
                var contentUrl = url.Replace("&amp;", "&");

                if (isInclude && !IsMatch(regexUrlInclude, contentUrl))
                {
                    continue;
                }

                if (seenUrls.Add(contentUrl))
                {
                    contentUrls.Add(contentUrl);
                }
            }

            return contentUrls;
        }
예제 #2
0
        /// <summary>
        /// Follows a chain of "next page" links starting at <paramref name="url"/>, appending the
        /// content extracted from each page to <paramref name="previousPageContent"/>.
        /// </summary>
        /// <remarks>
        /// Pages are joined with <c>Utils.PagePlaceHolder</c>. The exclude regex and the tag-clearing
        /// rules are applied to the whole accumulated content after every page, as before.
        /// The walk stops when a page has no next link, when the next link equals the current page,
        /// or when the next link was already visited during this call (which previously caused
        /// unbounded recursion on cyclic pagination).
        /// </remarks>
        /// <param name="previousPageContent">Content gathered so far; may be null or empty.</param>
        /// <param name="charset">Charset name, converted with <c>ECharsetUtils.GetEnumType</c>.</param>
        /// <param name="url">URL of the first page to fetch in this call.</param>
        /// <param name="cookieString">Cookie string forwarded to the download helper.</param>
        /// <param name="regexContentExclude">Optional regex whose matches are removed from the content.</param>
        /// <param name="contentHtmlClearCollection">Optional collection of tag names whose elements (tag and inner text) are removed.</param>
        /// <param name="contentHtmlClearTagCollection">Optional collection of tag names whose opening/closing tags are removed (inner text kept).</param>
        /// <param name="regexContent">Primary content regex.</param>
        /// <param name="regexContent2">Fallback content regex used when the primary one yields nothing.</param>
        /// <param name="regexContent3">Second fallback content regex.</param>
        /// <param name="regexNextPage">Regex handed to <c>GetUrl</c> to locate the next-page link.</param>
        /// <returns>The accumulated content.</returns>
        /// <exception cref="Exception">Thrown with the downloader's error message when a page cannot be fetched.</exception>
        public static string GetPageContent(string previousPageContent, string charset, string url, string cookieString, string regexContentExclude, string contentHtmlClearCollection, string contentHtmlClearTagCollection, string regexContent, string regexContent2, string regexContent3, string regexNextPage)
        {
            var content = previousPageContent;

            // Fully qualified so the block does not depend on a using directive outside this view.
            var visitedUrls = new System.Collections.Generic.HashSet<string>(StringComparer.OrdinalIgnoreCase);
            var currentUrl  = url;

            while (true)
            {
                if (currentUrl != null)
                {
                    visitedUrls.Add(currentUrl);
                }

                if (!WebClientUtils.GetRemoteHtml(currentUrl, ECharsetUtils.GetEnumType(charset), cookieString, out var contentHtml, out var errorMessage))
                {
                    throw new Exception(errorMessage);
                }

                // Try the content regexes in order until one yields something.
                var nextPageContent = GetContent("content", regexContent, contentHtml);
                if (string.IsNullOrEmpty(nextPageContent) && !string.IsNullOrEmpty(regexContent2))
                {
                    nextPageContent = GetContent("content", regexContent2, contentHtml);
                }
                if (string.IsNullOrEmpty(nextPageContent) && !string.IsNullOrEmpty(regexContent3))
                {
                    nextPageContent = GetContent("content", regexContent3, contentHtml);
                }

                if (!string.IsNullOrEmpty(nextPageContent))
                {
                    // The page separator is only inserted between two non-empty pieces.
                    if (string.IsNullOrEmpty(content))
                    {
                        content += nextPageContent;
                    }
                    else
                    {
                        content += Utils.PagePlaceHolder + nextPageContent;
                    }
                }

                if (!string.IsNullOrEmpty(regexContentExclude))
                {
                    content = Replace(regexContentExclude, content, string.Empty);
                }

                if (!string.IsNullOrEmpty(contentHtmlClearCollection))
                {
                    foreach (var htmlClear in StringCollectionToList(contentHtmlClearCollection))
                    {
                        // Removes the whole element: opening tag, inner text and closing tag.
                        var clearRegex = $@"<{htmlClear}[^>]*>.*?<\/{htmlClear}>";
                        content = Replace(clearRegex, content, string.Empty);
                    }
                }

                if (!string.IsNullOrEmpty(contentHtmlClearTagCollection))
                {
                    foreach (var htmlClearTag in StringCollectionToList(contentHtmlClearTagCollection))
                    {
                        // Removes only the tags themselves; the inner text is kept.
                        content = Replace($@"<{htmlClearTag}[^>]*>", content, string.Empty);
                        content = Replace($@"<\/{htmlClearTag}>", content, string.Empty);
                    }
                }

                var contentNextPageUrl = GetUrl(regexNextPage, contentHtml, currentUrl);

                var isLastPage = string.IsNullOrEmpty(contentNextPageUrl)
                                 || StringUtils.EqualsIgnoreCase(currentUrl, contentNextPageUrl)
                                 || visitedUrls.Contains(contentNextPageUrl);
                if (isLastPage)
                {
                    return content;
                }

                currentUrl = contentNextPageUrl;
            }
        }
예제 #3
0
        /// <summary>
        /// Downloads a content page and extracts its title, channel, content and any additional
        /// attributes into a <see cref="NameValueCollection"/>.
        /// </summary>
        /// <remarks>
        /// The result always contains the keys "Title", "Channel" and "Content"; every name in
        /// <paramref name="contentAttributes"/> is then set (replacing an existing value of the same key).
        /// If the page links to a next page, the following pages are appended via <c>GetPageContent</c>.
        /// A next-page link that points back to <paramref name="url"/> itself is ignored, so the first
        /// page is not fetched and appended a second time.
        /// </remarks>
        /// <param name="charset">Charset name, converted with <c>ECharsetUtils.GetEnumType</c>.</param>
        /// <param name="url">URL of the content page.</param>
        /// <param name="cookieString">Cookie string forwarded to the download helper.</param>
        /// <param name="regexContentExclude">Optional regex whose matches are removed from the content.</param>
        /// <param name="contentHtmlClearCollection">Optional collection of tag names whose elements (tag and inner text) are removed.</param>
        /// <param name="contentHtmlClearTagCollection">Optional collection of tag names whose tags are removed (inner text kept).</param>
        /// <param name="regexTitle">Regex used to extract the title.</param>
        /// <param name="regexContent">Primary content regex.</param>
        /// <param name="regexContent2">Fallback content regex used when the primary one yields nothing.</param>
        /// <param name="regexContent3">Second fallback content regex.</param>
        /// <param name="regexNextPage">Regex handed to <c>GetUrl</c> to locate the next-page link.</param>
        /// <param name="regexChannel">Regex used to extract the channel name.</param>
        /// <param name="contentAttributes">Names of the extra attributes to extract.</param>
        /// <param name="attributesDict">Dictionary from which the start/end markers of each attribute are read.</param>
        /// <returns>The extracted values keyed by attribute name.</returns>
        /// <exception cref="Exception">Thrown with the downloader's error message when the page cannot be fetched.</exception>
        public static NameValueCollection GetContentNameValueCollection(string charset, string url, string cookieString, string regexContentExclude, string contentHtmlClearCollection, string contentHtmlClearTagCollection, string regexTitle, string regexContent, string regexContent2, string regexContent3, string regexNextPage, string regexChannel, List <string> contentAttributes, Dictionary <string, string> attributesDict)
        {
            if (!WebClientUtils.GetRemoteHtml(url, ECharsetUtils.GetEnumType(charset), cookieString, out var contentHtml, out var errorMessage))
            {
                throw new Exception(errorMessage);
            }

            var title = GetContent("title", regexTitle, contentHtml);

            // Try the content regexes in order until one yields something.
            var content = GetContent("content", regexContent, contentHtml);
            if (string.IsNullOrEmpty(content) && !string.IsNullOrEmpty(regexContent2))
            {
                content = GetContent("content", regexContent2, contentHtml);
            }
            if (string.IsNullOrEmpty(content) && !string.IsNullOrEmpty(regexContent3))
            {
                content = GetContent("content", regexContent3, contentHtml);
            }

            if (!string.IsNullOrEmpty(regexContentExclude))
            {
                content = Replace(regexContentExclude, content, string.Empty);
            }

            if (!string.IsNullOrEmpty(contentHtmlClearCollection))
            {
                foreach (var htmlClear in StringCollectionToList(contentHtmlClearCollection))
                {
                    // Removes the whole element: opening tag, inner text and closing tag.
                    var clearRegex = $@"<{htmlClear}[^>]*>.*?<\/{htmlClear}>";
                    content = Replace(clearRegex, content, string.Empty);
                }
            }

            if (!string.IsNullOrEmpty(contentHtmlClearTagCollection))
            {
                foreach (var htmlClearTag in StringCollectionToList(contentHtmlClearTagCollection))
                {
                    // Removes only the tags themselves; the inner text is kept.
                    content = Replace($@"<{htmlClearTag}[^>]*>", content, string.Empty);
                    content = Replace($@"<\/{htmlClearTag}>", content, string.Empty);
                }
            }

            var contentNextPageUrl = GetUrl(regexNextPage, contentHtml, url);

            // Skip paging when the "next" link is the page we just processed; following it would
            // download the same page again and append its content a second time.
            var hasNextPage = !string.IsNullOrEmpty(contentNextPageUrl)
                              && !string.Equals(url, contentNextPageUrl, StringComparison.OrdinalIgnoreCase);
            if (hasNextPage)
            {
                content = GetPageContent(content, charset, contentNextPageUrl, cookieString, regexContentExclude, contentHtmlClearCollection, contentHtmlClearTagCollection, regexContent, regexContent2, regexContent3, regexNextPage);
            }

            var channel = GetContent("channel", regexChannel, contentHtml);

            var attributes = new NameValueCollection();
            attributes.Add("Title", title);
            attributes.Add("Channel", channel);
            attributes.Add("Content", content);

            foreach (var attributeName in contentAttributes)
            {
                var normalStart = GetStartValue(attributesDict, attributeName);
                var normalEnd   = GetEndValue(attributesDict, attributeName);
                var regex       = GetRegexAttributeName(attributeName, normalStart, normalEnd);

                // Set (not Add) so an attribute named like a built-in key replaces that value.
                attributes.Set(attributeName, GetContent(attributeName, regex, contentHtml));
            }

            return attributes;
        }
예제 #4
0
        /// <summary>
        /// Gathers a single remote page and inserts it as a content item.
        /// </summary>
        /// <remarks>
        /// Steps, in order: download the page; extract title and content (with two fallback content
        /// regexes); validate them; strip/replace unwanted markup; append following pages; resolve
        /// (or create) the target child channel from the gathered channel name; optionally reject
        /// duplicate titles; fill the extra attributes (downloading image/video/file attachments);
        /// optionally localize images in the content; insert the content.
        /// Any exception is caught and reported through <paramref name="errorMessage"/>.
        /// </remarks>
        /// <param name="siteInfo">Site that receives the content.</param>
        /// <param name="channelInfo">Base channel; parent of any channel created from the gathered channel name.</param>
        /// <param name="isSaveImage">When true, images referenced by the content are downloaded and their sources rewritten.</param>
        /// <param name="isSetFirstImageAsImageUrl">When true and no image attribute was gathered, the first content image becomes the content's ImageUrl.</param>
        /// <param name="isEmptyContentAllowed">When false, a page with empty content is rejected.</param>
        /// <param name="isSameTitleAllowed">When false, a title already present in the target channel is rejected.</param>
        /// <param name="isChecked">Value stored in the content's Checked flag.</param>
        /// <param name="charset">Charset name, converted with <c>ECharsetUtils.GetEnumType</c>.</param>
        /// <param name="url">URL of the page to gather.</param>
        /// <param name="cookieString">Cookie string forwarded to the download helper.</param>
        /// <param name="regexTitleInclude">Optional regex the (tag-stripped) title must match.</param>
        /// <param name="regexContentExclude">Optional regex whose matches are removed from the content.</param>
        /// <param name="contentHtmlClearCollection">Optional collection of tag names whose elements (tag and inner text) are removed.</param>
        /// <param name="contentHtmlClearTagCollection">Optional collection of tag names whose tags are removed (inner text kept).</param>
        /// <param name="contentReplaceFrom">Optional collection of texts to replace in title and content.</param>
        /// <param name="contentReplaceTo">Replacement text, or a comma-separated list matching <paramref name="contentReplaceFrom"/> one-to-one.</param>
        /// <param name="regexTitle">Regex used to extract the title.</param>
        /// <param name="regexContent">Primary content regex.</param>
        /// <param name="regexContent2">Fallback content regex.</param>
        /// <param name="regexContent3">Second fallback content regex.</param>
        /// <param name="regexNextPage">Regex handed to <c>GetUrl</c> to locate the next-page link.</param>
        /// <param name="regexChannel">Regex used to extract the channel name.</param>
        /// <param name="contentAttributes">Names of the extra attributes to gather.</param>
        /// <param name="contentAttributesXml">Per-attribute settings, read with the keys "<c>name_ContentStart</c>", "<c>name_ContentEnd</c>" and "<c>name_ContentDefault</c>".</param>
        /// <param name="contentTitleDict">Cache of known titles per channel id; updated by this method.</param>
        /// <param name="channelIdAndContentIdList">Receives the (channel id, content id) pair of the inserted content.</param>
        /// <param name="adminInfo">Administrator recorded as the author.</param>
        /// <param name="title">Receives the gathered title (empty when none was found).</param>
        /// <param name="errorMessage">Receives the failure reason, or an empty string on success.</param>
        /// <returns><c>true</c> when the content was inserted; otherwise <c>false</c>.</returns>
        private static bool GatherOneByUrl(ISiteInfo siteInfo, IChannelInfo channelInfo, bool isSaveImage, bool isSetFirstImageAsImageUrl, bool isEmptyContentAllowed, bool isSameTitleAllowed, bool isChecked, string charset, string url, string cookieString, string regexTitleInclude, string regexContentExclude, string contentHtmlClearCollection, string contentHtmlClearTagCollection, string contentReplaceFrom, string contentReplaceTo, string regexTitle, string regexContent, string regexContent2, string regexContent3, string regexNextPage, string regexChannel, IEnumerable <string> contentAttributes, NameValueCollection contentAttributesXml, IDictionary <int, IList <string> > contentTitleDict, ICollection <KeyValuePair <int, int> > channelIdAndContentIdList, IAdministratorInfo adminInfo, out string title, out string errorMessage)
        {
            title        = string.Empty;
            errorMessage = string.Empty;

            try
            {
                // TODO: gather files; use the link title / link tooltip as the content title.
                var tableName   = Context.ContentApi.GetTableName(siteInfo.Id, channelInfo.Id);
                var contentHtml = WebClientUtils.GetRemoteFileSource(url, ECharsetUtils.GetEnumType(charset), cookieString);

                title = GetContent("title", regexTitle, contentHtml);

                // Try the content regexes in order until one yields something.
                var content = GetContent("content", regexContent, contentHtml);
                if (string.IsNullOrEmpty(content) && !string.IsNullOrEmpty(regexContent2))
                {
                    content = GetContent("content", regexContent2, contentHtml);
                }
                if (string.IsNullOrEmpty(content) && !string.IsNullOrEmpty(regexContent3))
                {
                    content = GetContent("content", regexContent3, contentHtml);
                }

                // If the title or the content is empty, report the reason and bail out.
                if (string.IsNullOrEmpty(title))
                {
                    errorMessage = $"无法获取标题:{url}";
                    return false;
                }
                if (!isEmptyContentAllowed && string.IsNullOrEmpty(content))
                {
                    errorMessage = $"无法获取内容正文:{url}";
                    return false;
                }

                title = StringUtils.StripTags(title);

                if (!string.IsNullOrEmpty(regexTitleInclude) && !IsMatch(regexTitleInclude, title))
                {
                    errorMessage = $"标题不符合要求:{url}";
                    return false;
                }

                if (!string.IsNullOrEmpty(regexContentExclude))
                {
                    content = Replace(regexContentExclude, content, string.Empty);
                }

                if (!string.IsNullOrEmpty(contentHtmlClearCollection))
                {
                    foreach (var htmlClear in StringCollectionToList(contentHtmlClearCollection))
                    {
                        // Removes the whole element: opening tag, inner text and closing tag.
                        var clearRegex = $@"<{htmlClear}[^>]*>.*?<\/{htmlClear}>";
                        content = Replace(clearRegex, content, string.Empty);
                    }
                }

                if (!string.IsNullOrEmpty(contentHtmlClearTagCollection))
                {
                    foreach (var htmlClearTag in StringCollectionToList(contentHtmlClearTagCollection))
                    {
                        // Removes only the tags themselves; the inner text is kept.
                        content = Replace($@"<{htmlClearTag}[^>]*>", content, string.Empty);
                        content = Replace($@"<\/{htmlClearTag}>", content, string.Empty);
                    }
                }

                // NOTE(review): the replacements below run before following pages are appended, so
                // they only affect the first page's content — confirm whether that is intended.
                if (!string.IsNullOrEmpty(contentReplaceFrom))
                {
                    var fromList = TranslateUtils.StringCollectionToStringCollection(contentReplaceFrom);

                    // Pairwise replacement only when "to" is a comma list with exactly one entry per "from".
                    var isMulti = !string.IsNullOrEmpty(contentReplaceTo)
                                  && contentReplaceTo.IndexOf(',') != -1
                                  && StringUtils.GetCount(",", contentReplaceTo) + 1 == fromList.Count;

                    if (isMulti)
                    {
                        var tos = TranslateUtils.StringCollectionToStringCollection(contentReplaceTo);
                        for (var i = 0; i < fromList.Count; i++)
                        {
                            // The look-aheads skip text directly followed by </a> and text inside a tag.
                            var pattern = $"({fromList[i].Replace(" ", "\\s")})(?!</a>)(?![^><]*>)";
                            title   = Replace(pattern, title, tos[i]);
                            content = Replace(pattern, content, tos[i]);
                        }
                    }
                    else
                    {
                        foreach (var fromText in fromList)
                        {
                            var pattern = $"({fromText.Replace(" ", "\\s")})(?!</a>)(?![^><]*>)";
                            title   = Replace(pattern, title, contentReplaceTo);
                            content = Replace(pattern, content, contentReplaceTo);
                        }
                    }
                }

                var contentNextPageUrl = GetUrl(regexNextPage, contentHtml, url);

                // Skip paging when the "next" link is the page we just processed; following it would
                // download the same page again and append its content a second time.
                if (!string.IsNullOrEmpty(contentNextPageUrl) && !StringUtils.EqualsIgnoreCase(url, contentNextPageUrl))
                {
                    try
                    {
                        content = GetPageContent(content, charset, contentNextPageUrl, cookieString, regexContentExclude, contentHtmlClearCollection, contentHtmlClearTagCollection, regexContent, regexContent2, regexContent3, regexNextPage);
                    }
                    catch (Exception ex)
                    {
                        errorMessage = ex.Message;
                        return false;
                    }
                }

                // Resolve the target channel: a child of channelInfo named like the gathered channel,
                // created on the fly when it does not exist yet.
                var channel   = GetContent("channel", regexChannel, contentHtml);
                var channelId = channelInfo.Id;
                if (!string.IsNullOrEmpty(channel))
                {
                    var channelIdByNodeName = 0;

                    var childChannelIdList = Context.ChannelApi.GetChannelIdList(siteInfo.Id, channelInfo.Id);
                    foreach (var childChannelId in childChannelIdList)
                    {
                        // No early exit: when several children share the name, the last one wins.
                        if (channel == Context.ChannelApi.GetChannelName(siteInfo.Id, childChannelId))
                        {
                            channelIdByNodeName = childChannelId;
                        }
                    }

                    if (channelIdByNodeName == 0)
                    {
                        var newChannelInfo = Context.ChannelApi.NewInstance(siteInfo.Id);

                        newChannelInfo.ParentId             = channelInfo.Id;
                        newChannelInfo.ChannelName          = channel;
                        newChannelInfo.ContentModelPluginId = channelInfo.ContentModelPluginId;

                        channelId = Context.ChannelApi.Insert(siteInfo.Id, newChannelInfo);
                    }
                    else
                    {
                        channelId = channelIdByNodeName;
                    }
                }

                if (!isSameTitleAllowed)
                {
                    // Titles of a channel are loaded from the database once and then cached in the dictionary.
                    if (!contentTitleDict.TryGetValue(channelId, out var contentTitleList))
                    {
                        var repository = new Repository(Context.Environment.DatabaseType,
                                                        Context.Environment.ConnectionString, tableName);
                        contentTitleList = repository.GetAll <string>(Q.Select(ContentAttribute.Title)
                                                                      .Where(ContentAttribute.ChannelId, channelId));
                    }

                    if (contentTitleList.Contains(title))
                    {
                        errorMessage = $"已包含相同标题:{title}";
                        return false;
                    }

                    contentTitleList.Add(title);
                    contentTitleDict[channelId] = contentTitleList;
                }

                var contentInfo = Context.ContentApi.NewInstance(siteInfo.Id, channelId);

                contentInfo.AdminId          = adminInfo.Id;
                contentInfo.AddUserName      = adminInfo.UserName;
                contentInfo.AddDate          = DateTime.Now;
                contentInfo.LastEditUserName = contentInfo.AddUserName;
                contentInfo.LastEditDate     = contentInfo.AddDate;
                contentInfo.Checked          = isChecked;
                contentInfo.CheckedLevel     = 0;

                contentInfo.Title = title;

                foreach (var attributeName in contentAttributes)
                {
                    // Title and content were handled above.
                    if (StringUtils.EqualsIgnoreCase(attributeName, ContentAttribute.Title) || StringUtils.EqualsIgnoreCase(attributeName, ContentAttribute.Content))
                    {
                        continue;
                    }

                    var normalStart = StringUtils.ValueFromUrl(contentAttributesXml[attributeName + "_ContentStart"]);
                    var normalEnd   = StringUtils.ValueFromUrl(contentAttributesXml[attributeName + "_ContentEnd"]);

                    // Default value used when nothing could be gathered for this attribute.
                    var normalDefault = StringUtils.ValueFromUrl(contentAttributesXml[attributeName + "_ContentDefault"]);

                    var regex = GetRegexAttributeName(attributeName, normalStart, normalEnd);
                    var value = GetContent(attributeName, regex, contentHtml);

                    if (string.IsNullOrEmpty(value))
                    {
                        value = normalDefault;
                    }

                    if (!ContentAttribute.AllAttributes.Value.Contains(attributeName))
                    {
                        // Custom attribute: stored verbatim (no attachment download).
                        contentInfo.Set(attributeName, value);
                        continue;
                    }

                    if (StringUtils.EqualsIgnoreCase(ContentAttribute.AddDate, attributeName))
                    {
                        contentInfo.AddDate = TranslateUtils.ToDateTime(value, DateTime.Now);
                    }
                    else if (StringUtils.EqualsIgnoreCase(ContentAttribute.IsColor, attributeName))
                    {
                        contentInfo.Color = TranslateUtils.ToBool(value, defaultValue: false);
                    }
                    else if (StringUtils.EqualsIgnoreCase(ContentAttribute.IsHot, attributeName))
                    {
                        contentInfo.Hot = TranslateUtils.ToBool(value, defaultValue: false);
                    }
                    else if (StringUtils.EqualsIgnoreCase(ContentAttribute.IsRecommend, attributeName))
                    {
                        contentInfo.Recommend = TranslateUtils.ToBool(value, defaultValue: false);
                    }
                    else if (StringUtils.EqualsIgnoreCase(ContentAttribute.IsTop, attributeName))
                    {
                        contentInfo.Top = TranslateUtils.ToBool(value, defaultValue: false);
                    }
                    else if (StringUtils.EqualsIgnoreCase(ContentAttribute.ImageUrl, attributeName))
                    {
                        if (!string.IsNullOrEmpty(value) && TryDownloadAttachment(siteInfo, value, url, out var localImageUrl))
                        {
                            contentInfo.ImageUrl = localImageUrl;
                        }
                    }
                    else if (StringUtils.EqualsIgnoreCase(ContentAttribute.VideoUrl, attributeName))
                    {
                        if (!string.IsNullOrEmpty(value) && TryDownloadAttachment(siteInfo, value, url, out var localVideoUrl))
                        {
                            contentInfo.VideoUrl = localVideoUrl;
                        }
                    }
                    else if (StringUtils.EqualsIgnoreCase(ContentAttribute.FileUrl, attributeName))
                    {
                        if (!string.IsNullOrEmpty(value) && TryDownloadAttachment(siteInfo, value, url, out var localFileUrl))
                        {
                            contentInfo.FileUrl = localFileUrl;
                        }
                    }
                    else if (StringUtils.EqualsIgnoreCase(ContentAttribute.Hits, attributeName))
                    {
                        contentInfo.Hits = TranslateUtils.ToInt(value);
                    }
                    else
                    {
                        contentInfo.Set(attributeName, value);
                    }
                }

                // Only look at images inside the content when no image attribute was gathered.
                if (string.IsNullOrEmpty(contentInfo.ImageUrl))
                {
                    var firstImageUrl = string.Empty;
                    if (isSaveImage)
                    {
                        var originalImageSrcList = GetOriginalImageSrcList(content);
                        var imageSrcList         = GetImageSrcList(url, content);

                        // The two lists are paired by index, so they are only used when their sizes agree.
                        if (originalImageSrcList.Count == imageSrcList.Count)
                        {
                            for (var i = 0; i < originalImageSrcList.Count; i++)
                            {
                                var originalImageSrc = originalImageSrcList[i];
                                var imageSrc         = imageSrcList[i];

                                var fileExtension = PathUtils.GetExtension(originalImageSrc);
                                var fileName      = $"{StringUtils.GetShortGuid(false)}{fileExtension}";
                                var filePath      = Context.SiteApi.GetUploadFilePath(siteInfo.Id, fileName);
                                Utils.CreateDirectoryIfNotExists(filePath);
                                try
                                {
                                    WebClientUtils.SaveRemoteFileToLocal(imageSrc, filePath);
                                    var fileUrl = Context.SiteApi.GetSiteUrlByFilePath(filePath);
                                    content = content.Replace(originalImageSrc, fileUrl);
                                    if (firstImageUrl == string.Empty)
                                    {
                                        firstImageUrl = fileUrl;
                                    }
                                }
                                catch
                                {
                                    // Best effort: an image that cannot be downloaded keeps its remote source.
                                }
                            }
                        }
                    }
                    else if (isSetFirstImageAsImageUrl)
                    {
                        var imageSrcList = GetImageSrcList(url, content);
                        if (imageSrcList.Count > 0)
                        {
                            firstImageUrl = imageSrcList[0];
                        }
                    }

                    if (isSetFirstImageAsImageUrl)
                    {
                        contentInfo.ImageUrl = firstImageUrl;
                    }
                }

                contentInfo.Content = content;

                // NOTE(review): the insert is issued with channelInfo.Id although contentInfo was
                // created for channelId (possibly a child channel) — confirm this is intended.
                var theContentId = Context.ContentApi.Insert(siteInfo.Id, channelInfo.Id, contentInfo);
                channelIdAndContentIdList.Add(new KeyValuePair <int, int>(contentInfo.ChannelId, theContentId));

                errorMessage = string.Empty;
                return true;
            }
            catch (Exception ex)
            {
                errorMessage = ex.Message;
                return false;
            }
        }

        /// <summary>
        /// Downloads the attachment referenced by <paramref name="value"/> (resolved against
        /// <paramref name="pageUrl"/>) into the site's upload folder under a generated file name.
        /// </summary>
        /// <param name="siteInfo">Site whose upload folder receives the file.</param>
        /// <param name="value">Attachment URL as gathered from the page.</param>
        /// <param name="pageUrl">URL of the page the value was gathered from.</param>
        /// <param name="localUrl">Receives the site URL of the saved file when the download succeeds.</param>
        /// <returns><c>true</c> when the file was saved; <c>false</c> when the download failed (the failure is deliberately ignored).</returns>
        private static bool TryDownloadAttachment(ISiteInfo siteInfo, string value, string pageUrl, out string localUrl)
        {
            localUrl = null;

            var attachmentUrl = PageUtils.GetUrlByBaseUrl(value, pageUrl);
            var fileExtension = PageUtils.GetExtensionFromUrl(attachmentUrl);
            var fileName      = $"{StringUtils.GetShortGuid(false)}{fileExtension}";
            var filePath      = Context.SiteApi.GetUploadFilePath(siteInfo.Id, fileName);
            Utils.CreateDirectoryIfNotExists(filePath);

            try
            {
                WebClientUtils.SaveRemoteFileToLocal(attachmentUrl, filePath);
                localUrl = Context.SiteApi.GetSiteUrlByFilePath(filePath);
                return true;
            }
            catch
            {
                // Best effort: a failed download leaves the attribute unset instead of failing the gather.
                localUrl = null;
                return false;
            }
        }
예제 #5
0
        /// <summary>
        /// Downloads a content page and extracts its title, channel, HTML-encoded content and any
        /// additional attributes into a <see cref="NameValueCollection"/>.
        /// </summary>
        /// <remarks>
        /// The result always contains the keys "title", "channel" and "content" (the content is passed
        /// through <c>StringUtils.HtmlEncode</c>); every name in <paramref name="contentAttributes"/>
        /// is then set (replacing an existing value of the same key).
        /// If the page links to a next page, the following pages are appended via <c>GetPageContent</c>.
        /// A next-page link that points back to <paramref name="url"/> itself is ignored, so the first
        /// page is not fetched and appended a second time.
        /// </remarks>
        /// <param name="charset">Charset name, converted with <c>ECharsetUtils.GetEnumType</c>.</param>
        /// <param name="url">URL of the content page.</param>
        /// <param name="cookieString">Cookie string forwarded to the download helper.</param>
        /// <param name="regexContentExclude">Optional regex whose matches are removed from the content.</param>
        /// <param name="contentHtmlClearCollection">Optional collection of tag names whose elements (tag and inner text) are removed.</param>
        /// <param name="contentHtmlClearTagCollection">Optional collection of tag names whose tags are removed (inner text kept).</param>
        /// <param name="regexTitle">Regex used to extract the title.</param>
        /// <param name="regexContent">Primary content regex.</param>
        /// <param name="regexContent2">Fallback content regex used when the primary one yields nothing.</param>
        /// <param name="regexContent3">Second fallback content regex.</param>
        /// <param name="regexNextPage">Regex handed to <c>GetUrl</c> to locate the next-page link.</param>
        /// <param name="regexChannel">Regex used to extract the channel name.</param>
        /// <param name="contentAttributes">Names of the extra attributes to extract.</param>
        /// <param name="contentAttributesXml">Per-attribute settings, read with the keys "<c>name_ContentStart</c>" and "<c>name_ContentEnd</c>".</param>
        /// <returns>The extracted values keyed by attribute name.</returns>
        public static NameValueCollection GetContentNameValueCollection(string charset, string url, string cookieString, string regexContentExclude, string contentHtmlClearCollection, string contentHtmlClearTagCollection, string regexTitle, string regexContent, string regexContent2, string regexContent3, string regexNextPage, string regexChannel, List <string> contentAttributes, NameValueCollection contentAttributesXml)
        {
            var contentHtml = WebClientUtils.GetRemoteFileSource(url, ECharsetUtils.GetEnumType(charset), cookieString);
            var title       = GetContent("title", regexTitle, contentHtml);

            // Try the content regexes in order until one yields something.
            var content = GetContent("content", regexContent, contentHtml);
            if (string.IsNullOrEmpty(content) && !string.IsNullOrEmpty(regexContent2))
            {
                content = GetContent("content", regexContent2, contentHtml);
            }
            if (string.IsNullOrEmpty(content) && !string.IsNullOrEmpty(regexContent3))
            {
                content = GetContent("content", regexContent3, contentHtml);
            }

            if (!string.IsNullOrEmpty(regexContentExclude))
            {
                content = Replace(regexContentExclude, content, string.Empty);
            }

            if (!string.IsNullOrEmpty(contentHtmlClearCollection))
            {
                foreach (var htmlClear in StringCollectionToList(contentHtmlClearCollection))
                {
                    // Removes the whole element: opening tag, inner text and closing tag.
                    var clearRegex = $@"<{htmlClear}[^>]*>.*?<\/{htmlClear}>";
                    content = Replace(clearRegex, content, string.Empty);
                }
            }

            if (!string.IsNullOrEmpty(contentHtmlClearTagCollection))
            {
                foreach (var htmlClearTag in StringCollectionToList(contentHtmlClearTagCollection))
                {
                    // Removes only the tags themselves; the inner text is kept.
                    content = Replace($@"<{htmlClearTag}[^>]*>", content, string.Empty);
                    content = Replace($@"<\/{htmlClearTag}>", content, string.Empty);
                }
            }

            var contentNextPageUrl = GetUrl(regexNextPage, contentHtml, url);

            // Skip paging when the "next" link is the page we just processed; following it would
            // download the same page again and append its content a second time.
            if (!string.IsNullOrEmpty(contentNextPageUrl) && !StringUtils.EqualsIgnoreCase(url, contentNextPageUrl))
            {
                content = GetPageContent(content, charset, contentNextPageUrl, cookieString, regexContentExclude, contentHtmlClearCollection, contentHtmlClearTagCollection, regexContent, regexContent2, regexContent3, regexNextPage);
            }

            var channel = GetContent("channel", regexChannel, contentHtml);

            var attributes = new NameValueCollection();
            attributes.Add("title", title);
            attributes.Add("channel", channel);
            attributes.Add("content", StringUtils.HtmlEncode(content));

            foreach (var attributeName in contentAttributes)
            {
                var normalStart = StringUtils.ValueFromUrl(contentAttributesXml[attributeName + "_ContentStart"]);
                var normalEnd   = StringUtils.ValueFromUrl(contentAttributesXml[attributeName + "_ContentEnd"]);
                var regex       = GetRegexAttributeName(attributeName, normalStart, normalEnd);

                // Set (not Add) so an attribute named like a built-in key replaces that value.
                attributes.Set(attributeName, GetContent(attributeName, regex, contentHtml));
            }

            return attributes;
        }