/// <summary>
/// Downloads the page at <paramref name="url"/>, appends its extracted content to
/// <paramref name="previousPageContent"/>, and keeps following "next page" links until
/// no further (unvisited) page is found.
/// </summary>
/// <param name="previousPageContent">Content accumulated so far; may be null or empty.</param>
/// <param name="charset">Charset passed to the HTML download helper.</param>
/// <param name="url">Address of the first page to download.</param>
/// <param name="cookieString">Cookie string passed to the HTML download helper.</param>
/// <param name="regexContentExclude">Pattern whose matches are removed from the accumulated content; skipped when empty.</param>
/// <param name="contentHtmlClearCollection">Collection of tag names whose whole elements (tag and inner text) are removed; skipped when empty.</param>
/// <param name="contentHtmlClearTagCollection">Collection of tag names whose opening and closing tags are removed while the inner text is kept; skipped when empty.</param>
/// <param name="regexContent">Primary pattern used to extract the "content" group.</param>
/// <param name="regexContent2">Fallback pattern, tried only when the primary pattern yields nothing.</param>
/// <param name="regexContent3">Second fallback pattern, tried only when both previous patterns yield nothing.</param>
/// <param name="regexNextPage">Pattern used to locate the next page URL.</param>
/// <returns>The accumulated content of all visited pages, joined with <c>PagePlaceHolder</c>.</returns>
/// <exception cref="Exception">Thrown with the download error message when any page cannot be fetched.</exception>
public static async Task<string> GetPageContentAsync(string previousPageContent, Charset charset, string url, string cookieString, string regexContentExclude, string contentHtmlClearCollection, string contentHtmlClearTagCollection, string regexContent, string regexContent2, string regexContent3, string regexNextPage)
{
    var content = previousPageContent;
    var currentUrl = url;

    // Remember every page already fetched so that cyclic pagination (A -> B -> A)
    // terminates instead of looping until the stack or the remote site gives up.
    var visitedUrls = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    while (true)
    {
        visitedUrls.Add(currentUrl);

        var result = await WebClientUtils.GetRemoteHtmlAsync(currentUrl, charset, cookieString);
        if (!result.IsSuccess)
        {
            throw new Exception(result.ErrorMessage);
        }

        var contentHtml = result.Content;

        // Try the primary content pattern first, then the optional fallbacks in order.
        var pageContent = GetValue("content", regexContent, contentHtml);
        if (string.IsNullOrEmpty(pageContent) && !string.IsNullOrEmpty(regexContent2))
        {
            pageContent = GetValue("content", regexContent2, contentHtml);
        }
        if (string.IsNullOrEmpty(pageContent) && !string.IsNullOrEmpty(regexContent3))
        {
            pageContent = GetValue("content", regexContent3, contentHtml);
        }

        if (!string.IsNullOrEmpty(pageContent))
        {
            // Only insert the page separator when there is something before it.
            if (string.IsNullOrEmpty(content))
            {
                content += pageContent;
            }
            else
            {
                content += PagePlaceHolder + pageContent;
            }
        }

        // The clean-up rules are applied to the whole accumulated content after every page.
        if (!string.IsNullOrEmpty(regexContentExclude))
        {
            content = Replace(regexContentExclude, content, string.Empty);
        }

        if (!string.IsNullOrEmpty(contentHtmlClearCollection))
        {
            foreach (var htmlClear in StringCollectionToList(contentHtmlClearCollection))
            {
                // Removes the element together with everything between its tags.
                var clearRegex = $@"<{htmlClear}[^>]*>.*?<\/{htmlClear}>";
                content = Replace(clearRegex, content, string.Empty);
            }
        }

        if (!string.IsNullOrEmpty(contentHtmlClearTagCollection))
        {
            foreach (var htmlClearTag in StringCollectionToList(contentHtmlClearTagCollection))
            {
                // Removes only the opening and closing tags; the inner text stays.
                var clearRegex = $@"<{htmlClearTag}[^>]*>";
                content = Replace(clearRegex, content, string.Empty);
                clearRegex = $@"<\/{htmlClearTag}>";
                content = Replace(clearRegex, content, string.Empty);
            }
        }

        var nextPageUrl = GetUrl(regexNextPage, contentHtml, currentUrl);

        var isLastPage = string.IsNullOrEmpty(nextPageUrl)
                         || StringUtils.EqualsIgnoreCase(currentUrl, nextPageUrl)
                         || visitedUrls.Contains(nextPageUrl);
        if (isLastPage)
        {
            return content;
        }

        currentUrl = nextPageUrl;
    }
}
/// <summary>
/// Downloads the list page at <paramref name="gatherUrl"/> and builds one <c>Item</c> per
/// distinct content URL found in it, pre-filling the title, image URL and any
/// "by list" attributes configured on <paramref name="rule"/>.
/// </summary>
/// <param name="gatherUrl">Absolute URL of the list page; also used to resolve root-relative links.</param>
/// <param name="rule">Gather rule supplying charset, cookies and the start/end markers for each pattern.</param>
/// <returns>The items found on the list page, in the order their URLs first appear.</returns>
/// <exception cref="Exception">Thrown with the download error message when the list page cannot be fetched.</exception>
public static async Task<List<Item>> GetItemsAsync(string gatherUrl, Rule rule)
{
    var result = await WebClientUtils.GetRemoteHtmlAsync(gatherUrl, rule.Charset, rule.CookieString);
    if (!result.IsSuccess)
    {
        throw new Exception(result.ErrorMessage);
    }

    var pageHtml = result.Content;

    // Narrow the search to the configured list area when one is defined and matched;
    // otherwise scan the whole page.
    var areaHtml = string.Empty;
    var regexListArea = GetRegexArea(rule.ListAreaStart, rule.ListAreaEnd);
    if (!string.IsNullOrEmpty(regexListArea))
    {
        areaHtml = GetValue("area", regexListArea, pageHtml);
    }
    var listHtml = !string.IsNullOrEmpty(areaHtml) ? areaHtml : pageHtml;

    var regexContentUrl = GetRegexUrl(rule.ContentUrlStart, rule.ContentUrlEnd);
    var regexImageUrl = rule.ImageSource == ImageSource.List
        ? GetRegexUrl(rule.ImageUrlStart, rule.ImageUrlEnd)
        : string.Empty;
    var regexTitle = rule.ContentTitleByList
        ? GetRegexTitle(rule.ContentTitleStart, rule.ContentTitleEnd)
        : string.Empty;

    var contentAttributes = ListUtils.GetStringList(rule.ContentAttributes);

    var contentUrls = GetValues("url", regexContentUrl, listHtml);
    var imageUrls = GetValues("url", regexImageUrl, listHtml);
    var titles = GetValues("title", regexTitle, listHtml);

    // Values of every attribute that is configured to be read from the list page.
    var attributesDict = new Dictionary<string, List<string>>();
    foreach (var attributeName in contentAttributes)
    {
        if (!GetByListValue(rule, attributeName))
        {
            continue;
        }

        var normalStart = GetStartValue(rule, attributeName);
        var normalEnd = GetEndValue(rule, attributeName);
        var regex = GetRegexAttributeName(attributeName, normalStart, normalEnd);
        attributesDict[attributeName] = GetValues(attributeName, regex, listHtml);
    }

    // scheme://host[:port] of the list page, used as the base for root-relative links.
    var listUri = new Uri(gatherUrl);
    var host = listUri.Scheme + "://" + listUri.Host;
    if (!listUri.IsDefaultPort)
    {
        host += ":" + listUri.Port;
    }

    // Distinct absolute content URLs in first-seen order. URLs that are neither
    // protocol-qualified nor root-relative are skipped.
    var contentUrlList = new List<string>();
    var seenContentUrls = new HashSet<string>();
    foreach (var contentUrl in contentUrls)
    {
        if (string.IsNullOrEmpty(contentUrl))
        {
            continue;
        }

        var url = string.Empty;
        if (PageUtils.IsProtocolUrl(contentUrl))
        {
            url = contentUrl;
        }
        else if (contentUrl.StartsWith('/'))
        {
            url = PageUtils.Combine(host, contentUrl);
        }

        if (!string.IsNullOrEmpty(url) && seenContentUrls.Add(url))
        {
            contentUrlList.Add(url);
        }
    }

    // NOTE(review): titles, image URLs and attribute values are looked up by the index
    // into the de-duplicated URL list; this presumes those match lists stay aligned
    // with it — confirm against real list pages.
    var items = new List<Item>();
    for (var i = 0; i < contentUrlList.Count; i++)
    {
        var imageUrl = imageUrls.Count > i ? imageUrls[i] : string.Empty;
        if (!string.IsNullOrEmpty(imageUrl) && imageUrl.StartsWith('/'))
        {
            imageUrl = PageUtils.Combine(host, imageUrl);
        }

        var content = new Content
        {
            ImageUrl = imageUrl,
            Title = titles.Count > i ? titles[i] : string.Empty
        };

        foreach (var attributeName in contentAttributes)
        {
            // Only "by list" attributes were collected above.
            if (!attributesDict.TryGetValue(attributeName, out var values))
            {
                continue;
            }

            var normalDefault = GetDefaultValue(rule, attributeName);
            content.Set(attributeName, values.Count > i ? values[i] : normalDefault);
        }

        items.Add(new Item
        {
            Url = contentUrlList[i],
            Content = content
        });
    }

    return items;
}
/// <summary>
/// Downloads the content page of <paramref name="item"/> and returns the gathered values
/// as a name/value collection: one entry per configured content attribute plus the
/// entries "标题" (title), "栏目" (channel) and "正文" (body).
/// </summary>
/// <param name="rule">Gather rule supplying charset, cookies, patterns and clean-up settings.</param>
/// <param name="item">List item whose <c>Url</c> is downloaded and whose list-level values are reused.</param>
/// <returns>The collection of gathered values.</returns>
/// <exception cref="Exception">Thrown with the download error message when a page cannot be fetched.</exception>
public static async Task<NameValueCollection> GetContentNameValueCollectionAsync(Rule rule, Item item)
{
    var attributes = new NameValueCollection();

    var result = await WebClientUtils.GetRemoteHtmlAsync(item.Url, rule.Charset, rule.CookieString);
    if (!result.IsSuccess)
    {
        throw new Exception(result.ErrorMessage);
    }

    var contentHtml = result.Content;

    var regexContentExclude = GatherUtils.GetRegexString(rule.ContentExclude);
    var regexChannel = GatherUtils.GetRegexChannel(rule.ContentChannelStart, rule.ContentChannelEnd);
    var regexContent = GatherUtils.GetRegexContent(rule.ContentContentStart, rule.ContentContentEnd);

    // The fallback content patterns are only built when both of their markers are set.
    var regexContent2 = string.Empty;
    if (!string.IsNullOrEmpty(rule.ContentContentStart2) && !string.IsNullOrEmpty(rule.ContentContentEnd2))
    {
        regexContent2 = GatherUtils.GetRegexContent(rule.ContentContentStart2, rule.ContentContentEnd2);
    }
    var regexContent3 = string.Empty;
    if (!string.IsNullOrEmpty(rule.ContentContentStart3) && !string.IsNullOrEmpty(rule.ContentContentEnd3))
    {
        regexContent3 = GatherUtils.GetRegexContent(rule.ContentContentStart3, rule.ContentContentEnd3);
    }

    var regexNextPage = GatherUtils.GetRegexUrl(rule.ContentNextPageStart, rule.ContentNextPageEnd);
    var regexTitle = GatherUtils.GetRegexTitle(rule.ContentTitleStart, rule.ContentTitleEnd);
    var contentAttributes = ListUtils.GetStringList(rule.ContentAttributes);

    var title = rule.ContentTitleByList
        ? item.Content.Title
        : GetValue("title", regexTitle, contentHtml);

    // Primary content pattern first, then the optional fallbacks in order.
    var body = GetValue("content", regexContent, contentHtml);
    if (string.IsNullOrEmpty(body) && !string.IsNullOrEmpty(regexContent2))
    {
        body = GetValue("content", regexContent2, contentHtml);
    }
    if (string.IsNullOrEmpty(body) && !string.IsNullOrEmpty(regexContent3))
    {
        body = GetValue("content", regexContent3, contentHtml);
    }

    if (!string.IsNullOrEmpty(regexContentExclude))
    {
        body = Replace(regexContentExclude, body, string.Empty);
    }

    if (!string.IsNullOrEmpty(rule.ContentHtmlClearCollection))
    {
        foreach (var htmlClear in StringCollectionToList(rule.ContentHtmlClearCollection))
        {
            // Removes the element together with everything between its tags.
            var clearRegex = $@"<{htmlClear}[^>]*>.*?<\/{htmlClear}>";
            body = Replace(clearRegex, body, string.Empty);
        }
    }

    if (!string.IsNullOrEmpty(rule.ContentHtmlClearTagCollection))
    {
        foreach (var htmlClearTag in StringCollectionToList(rule.ContentHtmlClearTagCollection))
        {
            // Removes only the opening and closing tags; the inner text stays.
            var clearRegex = $@"<{htmlClearTag}[^>]*>";
            body = Replace(clearRegex, body, string.Empty);
            clearRegex = $@"<\/{htmlClearTag}>";
            body = Replace(clearRegex, body, string.Empty);
        }
    }

    // Follow pagination, but not when the "next page" link resolves to the page that was
    // just processed: fetching it again would append the same body a second time.
    var contentNextPageUrl = GetUrl(regexNextPage, contentHtml, item.Url);
    if (!string.IsNullOrEmpty(contentNextPageUrl) && !StringUtils.EqualsIgnoreCase(item.Url, contentNextPageUrl))
    {
        body = await GetPageContentAsync(body, rule.Charset, contentNextPageUrl, rule.CookieString, regexContentExclude, rule.ContentHtmlClearCollection, rule.ContentHtmlClearTagCollection, regexContent, regexContent2, regexContent3, regexNextPage);
    }

    var channel = GetValue("channel", regexChannel, contentHtml);

    foreach (var attributeName in contentAttributes)
    {
        var normalByList = GetByListValue(rule, attributeName);
        var normalStart = GetStartValue(rule, attributeName);
        var normalEnd = GetEndValue(rule, attributeName);
        var normalDefault = GetDefaultValue(rule, attributeName);
        var regex = GetRegexAttributeName(attributeName, normalStart, normalEnd);

        // "By list" attributes were already captured on the list page; the others are
        // extracted from the content page.
        var value = normalByList
            ? item.Content.Get<string>(attributeName)
            : GetValue(attributeName, regex, contentHtml);

        if (string.IsNullOrEmpty(value))
        {
            value = normalDefault;
        }

        attributes.Set(attributeName, value);
    }

    attributes.Add("标题", title);
    attributes.Add("栏目", channel);
    attributes.Add("正文", body);

    return attributes;
}
/// <summary>
/// Gathers a single content page: downloads it, extracts title, body and attributes,
/// optionally downloads referenced images and files, resolves (or creates) the target
/// channel and inserts the resulting content.
/// </summary>
/// <param name="siteInfo">Site the content is inserted into.</param>
/// <param name="channelInfo">Channel under which the content (or a child channel for it) is placed.</param>
/// <param name="regexTitleInclude">Pattern the tag-stripped title must match; skipped when empty.</param>
/// <param name="regexContentExclude">Pattern whose matches are removed from the body; skipped when empty.</param>
/// <param name="regexTitle">Pattern used to extract the title when it is not taken from the list page.</param>
/// <param name="regexContent">Primary body pattern.</param>
/// <param name="regexContent2">Fallback body pattern, tried when the primary yields nothing.</param>
/// <param name="regexContent3">Second fallback body pattern.</param>
/// <param name="regexNextPage">Pattern used to locate the next page URL.</param>
/// <param name="regexChannel">Pattern used to extract a channel name from the page.</param>
/// <param name="contentAttributes">Names of the attributes to gather.</param>
/// <param name="rule">Gather rule with charset, cookies and behavior switches.</param>
/// <param name="item">List item identifying the page and carrying list-level values.</param>
/// <param name="channelIdAndContentIdList">Receives the (channel id, content id) pair of the inserted content.</param>
/// <param name="adminId">Administrator id recorded as creator and last editor.</param>
/// <returns>
/// <c>Success</c> is true when the content was inserted; otherwise <c>ErrorMessage</c>
/// describes why it was skipped or failed. Exceptions are converted into a failed result.
/// </returns>
private async Task<(bool Success, string Title, string ErrorMessage)> GatherOneAsync(Site siteInfo, Channel channelInfo, string regexTitleInclude, string regexContentExclude, string regexTitle, string regexContent, string regexContent2, string regexContent3, string regexNextPage, string regexChannel, IEnumerable<string> contentAttributes, Rule rule, Item item, ICollection<KeyValuePair<int, int>> channelIdAndContentIdList, int adminId)
{
    try
    {
        var result = await WebClientUtils.GetRemoteHtmlAsync(item.Url, rule.Charset, rule.CookieString);
        if (!result.IsSuccess)
        {
            return (false, string.Empty, result.ErrorMessage);
        }

        var contentHtml = result.Content;

        var title = rule.ContentTitleByList
            ? item.Content.Title
            : GatherUtils.GetValue("title", regexTitle, contentHtml);

        // Primary body pattern first, then the optional fallbacks in order.
        var content = GatherUtils.GetValue("content", regexContent, contentHtml);
        if (string.IsNullOrEmpty(content) && !string.IsNullOrEmpty(regexContent2))
        {
            content = GatherUtils.GetValue("content", regexContent2, contentHtml);
        }
        if (string.IsNullOrEmpty(content) && !string.IsNullOrEmpty(regexContent3))
        {
            content = GatherUtils.GetValue("content", regexContent3, contentHtml);
        }

        // A missing title, or a missing body when empty bodies are not allowed, aborts this item.
        if (string.IsNullOrEmpty(title))
        {
            return (false, title, $"无法获取标题:{item.Url}");
        }
        if (rule.IsEmptyContentAllowed == false && string.IsNullOrEmpty(content))
        {
            return (false, title, $"无法获取内容正文:{item.Url}");
        }

        title = StringUtils.StripTags(title);

        if (!string.IsNullOrEmpty(regexTitleInclude) && GatherUtils.IsMatch(regexTitleInclude, title) == false)
        {
            return (false, title, $"标题不符合要求:{item.Url}");
        }

        if (!string.IsNullOrEmpty(regexContentExclude))
        {
            content = GatherUtils.Replace(regexContentExclude, content, string.Empty);
        }

        if (!string.IsNullOrEmpty(rule.ContentHtmlClearCollection))
        {
            foreach (var htmlClear in GatherUtils.StringCollectionToList(rule.ContentHtmlClearCollection))
            {
                // Removes the element together with everything between its tags.
                var clearRegex = $@"<{htmlClear}[^>]*>.*?<\/{htmlClear}>";
                content = GatherUtils.Replace(clearRegex, content, string.Empty);
            }
        }

        if (!string.IsNullOrEmpty(rule.ContentHtmlClearTagCollection))
        {
            foreach (var htmlClearTag in GatherUtils.StringCollectionToList(rule.ContentHtmlClearTagCollection))
            {
                // Removes only the opening and closing tags; the inner text stays.
                var clearRegex = $@"<{htmlClearTag}[^>]*>";
                content = GatherUtils.Replace(clearRegex, content, string.Empty);
                clearRegex = $@"<\/{htmlClearTag}>";
                content = GatherUtils.Replace(clearRegex, content, string.Empty);
            }
        }

        // Follow pagination, but not when the "next page" link resolves to the page that was
        // just processed: fetching it again would append the same body a second time.
        var contentNextPageUrl = GatherUtils.GetUrl(regexNextPage, contentHtml, item.Url);
        if (!string.IsNullOrEmpty(contentNextPageUrl) && !StringUtils.EqualsIgnoreCase(item.Url, contentNextPageUrl))
        {
            try
            {
                content = await GatherUtils.GetPageContentAsync(content, rule.Charset, contentNextPageUrl, rule.CookieString, regexContentExclude, rule.ContentHtmlClearCollection, rule.ContentHtmlClearTagCollection, regexContent, regexContent2, regexContent3, regexNextPage);
            }
            catch (Exception ex)
            {
                // Unlike the outer handler, keep the title that was already extracted.
                return (false, title, ex.Message);
            }
        }

        // Resolve the target channel: when the page names a channel, reuse the descendant
        // with that name or create a new child channel for it.
        var channel = GatherUtils.GetValue("channel", regexChannel, contentHtml);
        var channelId = channelInfo.Id;
        if (!string.IsNullOrEmpty(channel))
        {
            var channelIdByNodeName = 0;
            var childChannelIdList = await _channelRepository.GetChannelIdsAsync(siteInfo.Id, channelInfo.Id, ScopeType.All);
            foreach (var childChannelId in childChannelIdList)
            {
                // No early exit: when several channels share the name, the last one wins.
                if (channel == await _channelRepository.GetChannelNameAsync(siteInfo.Id, childChannelId))
                {
                    channelIdByNodeName = childChannelId;
                }
            }

            if (channelIdByNodeName == 0)
            {
                var newChannelInfo = new Channel
                {
                    SiteId = siteInfo.Id,
                    ParentId = channelInfo.Id,
                    ChannelName = channel,
                    ContentModelPluginId = channelInfo.ContentModelPluginId
                };
                channelId = await _channelRepository.InsertAsync(newChannelInfo);
            }
            else
            {
                channelId = channelIdByNodeName;
            }
        }

        if (!rule.IsSameTitleAllowed)
        {
            var theChannel = await _channelRepository.GetAsync(channelId);
            var contentIds = await _contentRepository.GetContentIdsBySameTitleAsync(siteInfo, theChannel, title);
            if (contentIds.Count > 0)
            {
                return (false, title, $"已包含相同标题:{title}");
            }
        }

        var contentInfo = new Content
        {
            AddDate = DateTime.Now
        };

        foreach (var attributeName in contentAttributes)
        {
            // Title and Body are handled separately above.
            if (StringUtils.EqualsIgnoreCase(attributeName, nameof(Content.Title)) ||
                StringUtils.EqualsIgnoreCase(attributeName, nameof(Content.Body)))
            {
                continue;
            }

            var normalByList = GatherUtils.GetByListValue(rule, attributeName);
            var normalStart = GatherUtils.GetStartValue(rule, attributeName);
            var normalEnd = GatherUtils.GetEndValue(rule, attributeName);
            // Default used when nothing could be gathered for this attribute.
            var normalDefault = GatherUtils.GetDefaultValue(rule, attributeName);
            var regex = GatherUtils.GetRegexAttributeName(attributeName, normalStart, normalEnd);

            var value = normalByList
                ? item.Content.Get<string>(attributeName)
                : GatherUtils.GetValue(attributeName, regex, contentHtml);
            if (string.IsNullOrEmpty(value))
            {
                value = normalDefault;
            }

            if (StringUtils.EqualsIgnoreCase(nameof(Content.AddDate), attributeName))
            {
                // Normalize the first full-width colon (U+FF1A) to an ASCII colon before parsing.
                // The previous call replaced ":" with ":" and therefore did nothing.
                // NOTE(review): assumes ReplaceFirst's second argument is the text to find — confirm.
                value = GatherUtils.ReplaceFirst(value, "\uFF1A", ":");
                contentInfo.AddDate = TranslateUtils.ToDateTime(value, DateTime.Now);
            }
            else if (StringUtils.EqualsIgnoreCase(nameof(Content.Color), attributeName))
            {
                contentInfo.Color = TranslateUtils.ToBool(value, defaultValue: false);
            }
            else if (StringUtils.EqualsIgnoreCase(nameof(Content.Hot), attributeName))
            {
                contentInfo.Hot = TranslateUtils.ToBool(value, defaultValue: false);
            }
            else if (StringUtils.EqualsIgnoreCase(nameof(Content.Recommend), attributeName))
            {
                contentInfo.Recommend = TranslateUtils.ToBool(value, defaultValue: false);
            }
            else if (StringUtils.EqualsIgnoreCase(nameof(Content.Top), attributeName))
            {
                contentInfo.Top = TranslateUtils.ToBool(value, defaultValue: false);
            }
            else if (StringUtils.EqualsIgnoreCase(nameof(Content.ImageUrl), attributeName))
            {
                var (saved, virtualUrl) = await TryDownloadAttachmentAsync(value, UploadType.Image);
                if (saved)
                {
                    contentInfo.ImageUrl = virtualUrl;
                }
            }
            else if (StringUtils.EqualsIgnoreCase(nameof(Content.VideoUrl), attributeName))
            {
                var (saved, virtualUrl) = await TryDownloadAttachmentAsync(value, UploadType.Video);
                if (saved)
                {
                    contentInfo.VideoUrl = virtualUrl;
                }
            }
            else if (StringUtils.EqualsIgnoreCase(nameof(Content.FileUrl), attributeName))
            {
                var (saved, virtualUrl) = await TryDownloadAttachmentAsync(value, UploadType.File);
                if (saved)
                {
                    contentInfo.FileUrl = virtualUrl;
                }
            }
            else if (StringUtils.EqualsIgnoreCase(nameof(Content.Hits), attributeName))
            {
                contentInfo.Hits = TranslateUtils.ToInt(value);
            }
            else if (StringUtils.EqualsIgnoreCase("FileName", attributeName) && !string.IsNullOrEmpty(rule.FileNameAttributeName))
            {
                // The gathered value is ignored here; the file name comes from the page URL.
                var fileName = PathUtils.GetFileNameWithoutExtension(item.Url);
                contentInfo.Set(rule.FileNameAttributeName, fileName);
            }
            else
            {
                contentInfo.Set(attributeName, value);
            }
        }

        // Optionally download the images referenced by the body and point the body at the local copies.
        var firstImageUrl = string.Empty;
        if (rule.IsSaveImage)
        {
            var originalImageSrcList = GatherUtils.GetOriginalImageSrcList(content);
            var imageSrcList = GatherUtils.GetImageSrcList(item.Url, content);
            // The two lists are paired by index, so they are only used when their sizes agree.
            if (originalImageSrcList.Count == imageSrcList.Count)
            {
                for (var i = 0; i < originalImageSrcList.Count; i++)
                {
                    var originalImageSrc = originalImageSrcList[i];
                    var imageSrc = imageSrcList[i];
                    var filePath = await PrepareUploadFilePathAsync(PathUtils.GetExtension(originalImageSrc), UploadType.Image);
                    try
                    {
                        await WebClientUtils.DownloadAsync(imageSrc, filePath);
                        var fileUrl = await _pathManager.GetVirtualUrlByPhysicalPathAsync(siteInfo, filePath);
                        content = content.Replace(originalImageSrc, fileUrl);
                        if (firstImageUrl == string.Empty)
                        {
                            firstImageUrl = fileUrl;
                        }
                    }
                    catch
                    {
                        // Best effort: an image that cannot be downloaded keeps its original src.
                    }
                }
            }
        }

        if (rule.ImageSource == ImageSource.Content)
        {
            // Fall back to the first image referenced by the body when none was downloaded.
            if (string.IsNullOrEmpty(firstImageUrl))
            {
                var imageSrcList = GatherUtils.GetImageSrcList(item.Url, content);
                if (imageSrcList.Count > 0)
                {
                    firstImageUrl = imageSrcList[0];
                }
            }
            if (!string.IsNullOrEmpty(firstImageUrl))
            {
                contentInfo.ImageUrl = firstImageUrl;
            }
        }
        else if (rule.ImageSource == ImageSource.List)
        {
            contentInfo.ImageUrl = item.Content.ImageUrl;
        }

        // Optionally download the files linked from the body and point the links at the local copies.
        if (rule.IsSaveFiles)
        {
            var originalLinkHrefList = GatherUtils.GetOriginalLinkHrefList(content);
            var linkHrefList = GatherUtils.GetLinkHrefList(item.Url, content);
            if (originalLinkHrefList.Count == linkHrefList.Count)
            {
                for (var i = 0; i < originalLinkHrefList.Count; i++)
                {
                    var originalLinkHref = originalLinkHrefList[i];
                    var linkHref = linkHrefList[i];
                    var filePath = await PrepareUploadFilePathAsync(PathUtils.GetExtension(originalLinkHref), UploadType.File);
                    try
                    {
                        await WebClientUtils.DownloadAsync(linkHref, filePath);
                        var fileUrl = await _pathManager.GetVirtualUrlByPhysicalPathAsync(siteInfo, filePath);
                        content = content.Replace(originalLinkHref, fileUrl);
                    }
                    catch
                    {
                        // Best effort: a file that cannot be downloaded keeps its original href.
                    }
                }
            }
        }

        contentInfo.SiteId = siteInfo.Id;
        contentInfo.ChannelId = channelId;
        contentInfo.AdminId = adminId;
        contentInfo.LastEditAdminId = adminId;
        contentInfo.Checked = rule.IsChecked;
        contentInfo.CheckedLevel = 0;
        contentInfo.Title = title;
        contentInfo.Body = content;

        // NOTE(review): the insert is given channelInfo even when channelId refers to a
        // child channel resolved above — confirm this is intended.
        var theContentId = await _contentRepository.InsertAsync(siteInfo, channelInfo, contentInfo);
        channelIdAndContentIdList.Add(new KeyValuePair<int, int>(contentInfo.ChannelId, theContentId));

        return (true, title, string.Empty);
    }
    catch (Exception ex)
    {
        return (false, string.Empty, ex.Message);
    }

    // Builds a fresh, uniquely named file path in the site's upload directory for the
    // given upload type and makes sure the directory for it exists.
    async Task<string> PrepareUploadFilePathAsync(string fileExtension, UploadType uploadType)
    {
        var fileName = $"{StringUtils.GetShortGuid(false)}{fileExtension}";
        var directoryPath = await _pathManager.GetUploadDirectoryPathAsync(siteInfo, uploadType);
        var filePath = PathUtils.Combine(directoryPath, fileName);
        DirectoryUtils.CreateDirectoryIfNotExists(filePath);
        return filePath;
    }

    // Downloads the attachment referenced by a gathered attribute value (resolved against
    // the content page URL) and reports the virtual URL of the stored copy.
    // Saved is false when the value is empty or the download fails.
    async Task<(bool Saved, string VirtualUrl)> TryDownloadAttachmentAsync(string value, UploadType uploadType)
    {
        if (string.IsNullOrEmpty(value))
        {
            return (false, string.Empty);
        }

        var attachmentUrl = GatherUtils.GetUrlByBaseUrl(value, item.Url);
        var fileExtension = PageUtils.GetExtensionFromUrl(attachmentUrl);
        var filePath = await PrepareUploadFilePathAsync(fileExtension, uploadType);
        try
        {
            await WebClientUtils.DownloadAsync(attachmentUrl, filePath);
            var virtualUrl = await _pathManager.GetVirtualUrlByPhysicalPathAsync(siteInfo, filePath);
            return (true, virtualUrl);
        }
        catch
        {
            // Best effort: a failed download leaves the attribute unset.
            return (false, string.Empty);
        }
    }
}