/// <summary>
/// Crawls the RSS feed described by the job data map and creates a news item
/// for every feed entry that has not been processed yet and passes all of the
/// feed's regular-expression filters.
/// </summary>
/// <param name="context">
/// Job execution context; its <c>JobDataMap</c> must contain the feed description
/// and the services used below (log, album, image, news item, settings, heart,
/// processed-item gateway).
/// </param>
/// <remarks>
/// Any exception is logged through the log service and never propagates to the caller.
/// A failure while handling a single feed entry is logged and the remaining entries
/// are still processed; the failed entry is not recorded as processed.
/// NOTE(review): the method is <c>async void</c>, so the caller cannot observe when the
/// crawl actually finishes. The signature is kept as is because it is presumably dictated
/// by the scheduler's job interface - confirm before changing it.
/// </remarks>
public async void Execute(IJobExecutionContext context)
{
    var dataMap = context.JobDetail.JobDataMap;
    var logService = (ILogService)dataMap[LOG_SERVICE];

    try
    {
        var feed = (RssCrawler)dataMap[FEED];
        var albumService = (IAlbumService)dataMap[ALBUM_SERVICE];
        var imageService = (IImageService)dataMap[IMAGE_SERVICE];
        var newsItemService = (INewsItemService)dataMap[NEWS_ITEM_SERVICE];
        var settingsService = (ISettingsService)dataMap[SETTINGS_SERVICE];
        var heartService = (IHeartService)dataMap[HEART_SERVICE];
        var rssProcessedItemGateway = (RssProcessedItemGateway)dataMap[RSS_PROCESSED_ITEM_GATEWAY];

        logService.TraceMessage($"Starting crawling {feed.RssFeedUrl}");

        // A protocol-relative feed address (//site/feed) is requested over https.
        var feedUrl = feed.RssFeedUrl.StartsWith("//", StringComparison.Ordinal)
            ? $"https:{feed.RssFeedUrl}"
            : feed.RssFeedUrl;

        // The reader owns the underlying stream, so it has to be disposed.
        using (XmlReader reader = XmlReader.Create(feedUrl))
        {
            SyndicationFeed syndicationFeed = SyndicationFeed.Load(reader);
            var feedBaseUri = reader.BaseURI;

            logService.TraceMessage($"Got {syndicationFeed.Items.Count()} items");

            foreach (SyndicationItem item in syndicationFeed.Items)
            {
                try
                {
                    if (!CheckIfItemIsNew(item, rssProcessedItemGateway))
                    {
                        continue;
                    }

                    // Title and summary are optional in a feed entry, so both may be null.
                    string title = item.Title?.Text?.Trim() ?? string.Empty;
                    string summaryHtml = item.Summary?.Text ?? string.Empty;
                    string description = ParsingHelper.RemoveHtml(summaryHtml).Trim();
                    string fulltext = ParsingHelper.RemoveHtml(GetRssItemExtensionValue(item, "fulltext"));

                    // Check the text against the filters.
                    // Filters are combined with logical AND: only entries that satisfy
                    // every filter pass.
                    bool filterOk = true;
                    foreach (var filter in feed.Filters)
                    {
                        var titleMatches = Regex.IsMatch(title, filter.Filter);
                        var descriptionMatches = Regex.IsMatch(description, filter.Filter);
                        var fulltextMatches = fulltext != null && Regex.IsMatch(fulltext, filter.Filter);

                        // A filter is satisfied when the pattern is found in the title,
                        // the description or the full text.
                        if (!(titleMatches || descriptionMatches || fulltextMatches))
                        {
                            filterOk = false;
                            break;
                        }
                    }

                    logService.TraceMessage(
                        $@"Item with title ""{item.Title?.Text}"" {(filterOk ? "matches" : "does not match")}");

                    Data.Models.RssProcessedItem rssProcessedItem = new Data.Models.RssProcessedItem
                    {
                        NewsItemId = null,
                        RssSource = item.Id,
                    };

                    if (!filterOk)
                    {
                        // Remember the rejected entry so it is not evaluated again.
                        rssProcessedItemGateway.Insert(rssProcessedItem);
                        continue;
                    }

                    var newsItemTextStringBuilder = new StringBuilder();

                    // HTML parsing: the article page is downloaded only when a selector
                    // actually needs it and the entry has a link to download from.
                    var firstLink = item.Links.FirstOrDefault();
                    var address = firstLink?.Uri;
                    var hasImageSelector = !string.IsNullOrEmpty(feed.ImageSelector);
                    var hasContentSelector = !string.IsNullOrEmpty(feed.ContentContainerSelector);
                    var needsDocument = address != null && (hasImageSelector || hasContentSelector);

                    var document = needsDocument
                        ? await BrowsingContext.New(Configuration.Default.WithDefaultLoader()).OpenAsync(address.ToString())
                        : null;

                    string imageId = null;
                    if (hasImageSelector && document != null)
                    {
                        // An image selector is configured: take the first matching element.
                        var cell = document.QuerySelector(feed.ImageSelector);
                        var src = cell?.Attributes["src"]?.Value;
                        if (!string.IsNullOrEmpty(src))
                        {
                            var url = ToAbsoluteImageUrl(src, feedBaseUri);

                            logService.TraceMessage($"Starting downloading image {url}");
                            imageId = await albumService.DownloadImage(url);
                            logService.TraceMessage($"Downloading image {url} OK");
                        }
                    }

                    var container = hasContentSelector && document != null
                        ? document.QuerySelector(feed.ContentContainerSelector)
                        : null;

                    if (container == null)
                    {
                        // No content selector, no page, or the selector matched nothing:
                        // fall back to the summary supplied by the feed.
                        newsItemTextStringBuilder.Append(summaryHtml.Trim());
                    }
                    else
                    {
                        var content = container.InnerHtml;

                        // Remove all images from the content.
                        var images = document.QuerySelectorAll($"{feed.ContentContainerSelector} img");
                        foreach (var img in images)
                        {
                            content = content.Replace(img.OuterHtml, "");
                        }

                        content = content.Replace("<p></p>", "").Trim();
                        newsItemTextStringBuilder.Append(content);
                    }

                    lock (feed)
                    {
                        if (!CheckIfItemIsNew(item, rssProcessedItemGateway))
                        {
                            // Another thread managed to add the same news item while the
                            // image was being downloaded; drop the image we fetched, if any.
                            if (!string.IsNullOrEmpty(imageId))
                            {
                                imageService.RemoveImage(imageId);
                            }

                            continue;
                        }

                        const int NEWS_DESCRIPTION_LENGTH = 200;
                        var cuttedDescription = TextCutHelper.Cut(description, NEWS_DESCRIPTION_LENGTH).Trim();

                        bool translitUrls = settingsService.GetSettings<bool>(nameof(Setting.TranslitEnabled));
                        var relativeUrl = translitUrls
                            ? FormattingHelper.ToTranslitedUrl(title)
                            : FormattingHelper.ToRussianURL(title);

                        if (firstLink != null)
                        {
                            // Append a link back to the source article.
                            newsItemTextStringBuilder.Append("<br>");
                            newsItemTextStringBuilder.Append("<br>");
                            var linkText = !string.IsNullOrEmpty(feed.LinkText) ? feed.LinkText : "Читать в источнике";
                            newsItemTextStringBuilder.Append($@"<a href=""{firstLink.Uri.AbsoluteUri}"" target=""_blank"">{linkText}</a>");
                        }

                        var newsItemText = newsItemTextStringBuilder.ToString();

                        if (feed.ExcludeItems.Any())
                        {
                            newsItemText = RemoveDomElements(newsItemText, feed.ExcludeItems.Select(x => x.Selector), logService);
                        }

                        var blogUrl = settingsService.GetSettings<string>(nameof(NewsSettings.BlogUrl));
                        var parentHeart = heartService.GetHeart(blogUrl);

                        var newNewsItem = new NewsItem()
                        {
                            RssSource = item.Id,
                            AuthorId = null,
                            BreadcrumbsTitle = title,
                            Text = newsItemText,
                            ImageId = imageId,
                            PostingDate = item.PublishDate.DateTime,
                            Description = cuttedDescription,
                            MetaDescription = cuttedDescription,
                            Title = title,
                            RecordType = RecordType.Default,
                            Layout = "clientLayout",
                            RelativeUrl = relativeUrl,
                            ParentHeartId = parentHeart?.HeartId,
                            Tags = feed.Tags
                        };

                        if (feed.TargetCategoryId != null)
                        {
                            newNewsItem.Categories = new List<IdNamePair<int>>()
                            {
                                new IdNamePair<int>(feed.TargetCategoryId.Value, string.Empty)
                            };
                        }

                        logService.TraceMessage($"Creating NewsItem for {item.Id}");
                        int newsItemId = newsItemService.CreateNewsItem(newNewsItem);

                        rssProcessedItem.NewsItemId = newsItemId;
                        rssProcessedItemGateway.Insert(rssProcessedItem);
                    }
                }
                catch (Exception itemError)
                {
                    // One broken entry must not prevent the rest of the feed from being
                    // processed; it is not marked as processed, so it is retried next run.
                    logService.LogError(itemError);
                }
            }
        }
    }
    catch (Exception e)
    {
        logService.LogError(e);
    }
}

/// <summary>
/// Converts an image address taken from an article page into an absolute URL.
/// </summary>
/// <param name="url">Value of the image's <c>src</c> attribute; must not be null.</param>
/// <param name="feedBaseUri">Absolute URI the feed was loaded from.</param>
/// <returns>
/// <c>http:</c> + <paramref name="url"/> for protocol-relative addresses (<c>//site.ru/img.jpg</c>);
/// scheme and host of <paramref name="feedBaseUri"/> + <paramref name="url"/> for root-relative
/// addresses (<c>/img.jpg</c>); otherwise <paramref name="url"/> unchanged.
/// </returns>
private static string ToAbsoluteImageUrl(string url, string feedBaseUri)
{
    if (url.StartsWith("//", StringComparison.Ordinal))
    {
        // A link like //site.ru/img.jpg becomes http://site.ru/img.jpg.
        return $"http:{url}";
    }

    if (url.StartsWith("/", StringComparison.Ordinal))
    {
        // GetLeftPart(Authority) yields "scheme://host[:port]" without a trailing slash,
        // so appending the root-relative path never produces a double slash.
        var feedRootUrl = new Uri(feedBaseUri).GetLeftPart(UriPartial.Authority);
        return $"{feedRootUrl}{url}";
    }

    return url;
}