コード例 #1
0
ファイル: RssCrawlingService.cs プロジェクト: synweb/rocms
            /// <summary>
            /// Job entry point: downloads the RSS feed configured in the job data map,
            /// checks every not-yet-processed item against the feed's regular-expression
            /// filters and creates a <see cref="NewsItem"/> for each item that passes.
            /// </summary>
            /// <remarks>
            /// <para>
            /// Items rejected by the filters are still recorded as processed (without a news
            /// item id) so that they are not evaluated again on the next run.
            /// </para>
            /// <para>
            /// No exception leaves this method. A failure while loading the feed is logged and
            /// ends the run; a failure while handling a single item is logged and the remaining
            /// items are still processed. The failed item is not marked as processed, so it is
            /// retried on the next run.
            /// </para>
            /// <para>
            /// NOTE(review): the method is <c>async void</c>, so the caller cannot await it and
            /// presumably treats the job as finished at the first <c>await</c>. That appears to
            /// be why the "is this item new" check is repeated under a lock - confirm.
            /// </para>
            /// </remarks>
            /// <param name="context">
            /// Execution context whose job data map supplies the feed definition and the
            /// services used by the crawl.
            /// </param>
            public async void Execute(IJobExecutionContext context)
            {
                var dataMap    = context.JobDetail.JobDataMap;
                var logService = (ILogService)dataMap[LOG_SERVICE];

                try
                {
                    var feed                    = (RssCrawler)dataMap[FEED];
                    var albumService            = (IAlbumService)dataMap[ALBUM_SERVICE];
                    var imageService            = (IImageService)dataMap[IMAGE_SERVICE];
                    var newsItemService         = (INewsItemService)dataMap[NEWS_ITEM_SERVICE];
                    var settingsService         = (ISettingsService)dataMap[SETTINGS_SERVICE];
                    var heartService            = (IHeartService)dataMap[HEART_SERVICE];
                    var rssProcessedItemGateway = (RssProcessedItemGateway)dataMap[RSS_PROCESSED_ITEM_GATEWAY];

                    logService.TraceMessage($"Starting crawling {feed.RssFeedUrl}");
                    // A protocol-relative feed address ("//host/rss") is fetched over HTTPS.
                    var feedUrl = feed.RssFeedUrl.StartsWith("//", StringComparison.Ordinal)
                        ? $"https:{feed.RssFeedUrl}"
                        : feed.RssFeedUrl;

                    SyndicationFeed syndicationFeed;
                    string          feedBaseUri;
                    // The reader is only needed while the feed is being loaded; dispose it
                    // right away so the underlying stream is released.
                    using (XmlReader reader = XmlReader.Create(feedUrl))
                    {
                        syndicationFeed = SyndicationFeed.Load(reader);
                        feedBaseUri     = reader.BaseURI;
                    }
                    if (string.IsNullOrEmpty(feedBaseUri))
                    {
                        feedBaseUri = feedUrl;
                    }

                    logService.TraceMessage($"Got {syndicationFeed.Items.Count()} items");
                    foreach (SyndicationItem item in syndicationFeed.Items)
                    {
                        try
                        {
                            if (!CheckIfItemIsNew(item, rssProcessedItemGateway))
                            {
                                continue;
                            }

                            // Check the item text against the filters.
                            string title = item.Title.Text.Trim();
                            // The summary is optional in a feed item; treat a missing one as empty.
                            string summaryHtml = item.Summary?.Text ?? string.Empty;
                            string description = ParsingHelper.RemoveHtml(summaryHtml).Trim();
                            string fulltext    = ParsingHelper.RemoveHtml(GetRssItemExtensionValue(item, "fulltext"));
                            bool   filterOk    = true;
                            foreach (var filter in feed.Filters)
                            {
                                // Filters are combined with logical AND: only items that
                                // satisfy every filter pass.
                                var titleMatches       = Regex.IsMatch(title, filter.Filter);
                                var descriptionMatches = Regex.IsMatch(description, filter.Filter);
                                var fulltextMatches    = fulltext != null && Regex.IsMatch(fulltext, filter.Filter);
                                // A single filter passes when its pattern is found in the
                                // title, the description or the full text.
                                if (!(titleMatches || descriptionMatches || fulltextMatches))
                                {
                                    filterOk = false;
                                    break;
                                }
                            }
                            logService.TraceMessage(
                                $@"Item with title ""{item.Title.Text}"" {(filterOk ? "matches" : "does not match")}");

                            Data.Models.RssProcessedItem rssProcessedItem = new Data.Models.RssProcessedItem
                            {
                                NewsItemId = null,
                                RssSource  = item.Id,
                            };
                            if (!filterOk)
                            {
                                // Remember the rejected item so it is skipped next time.
                                rssProcessedItemGateway.Insert(rssProcessedItem);
                                continue;
                            }

                            var newsItemTextStringBuilder = new StringBuilder();

                            // HTML parsing: the source page is only fetched when a selector
                            // actually needs it and the item carries a link to fetch.
                            var hasImageSelector   = !string.IsNullOrEmpty(feed.ImageSelector);
                            var hasContentSelector = !string.IsNullOrEmpty(feed.ContentContainerSelector);
                            var sourceUri          = item.Links.FirstOrDefault()?.Uri;
                            var needsDocument      = (hasImageSelector || hasContentSelector) && sourceUri != null;
                            var document = needsDocument
                                ? await BrowsingContext.New(Configuration.Default.WithDefaultLoader()).OpenAsync(sourceUri.ToString())
                                : null;

                            string imageId = null;
                            if (hasImageSelector && document != null)
                            {
                                // An image selector is configured: take the first matching
                                // element and download the image its "src" points to.
                                var cell = document.QuerySelector(feed.ImageSelector);
                                var url  = ResolveImageUrl(cell?.Attributes["src"]?.Value, feedBaseUri);
                                if (url != null)
                                {
                                    logService.TraceMessage($"Starting downloading image {url}");
                                    imageId = await albumService.DownloadImage(url);

                                    logService.TraceMessage($"Downloading image {url} OK");
                                }
                            }

                            var contentElement = hasContentSelector && document != null
                                ? document.QuerySelector(feed.ContentContainerSelector)
                                : null;
                            if (contentElement == null)
                            {
                                // No content selector, no page, or the selector matched
                                // nothing: fall back to the summary from the feed.
                                newsItemTextStringBuilder.Append(summaryHtml.Trim());
                            }
                            else
                            {
                                var content = contentElement.InnerHtml;
                                // Strip every image from the extracted content.
                                foreach (var img in contentElement.QuerySelectorAll("img"))
                                {
                                    content = content.Replace(img.OuterHtml, "");
                                }
                                content = content.Replace("<p></p>", "").Trim();
                                newsItemTextStringBuilder.Append(content);
                            }

                            lock (feed)
                            {
                                if (!CheckIfItemIsNew(item, rssProcessedItemGateway))
                                {
                                    // Another thread added this very item while the page and
                                    // image were being downloaded: discard our image copy.
                                    if (!string.IsNullOrEmpty(imageId))
                                    {
                                        imageService.RemoveImage(imageId);
                                    }
                                    continue;
                                }

                                const int NEWS_DESCRIPTION_LENGTH = 200;
                                var       cuttedDescription       = TextCutHelper.Cut(description, NEWS_DESCRIPTION_LENGTH).Trim();
                                bool      translitUrls            = settingsService.GetSettings <bool>(nameof(Setting.TranslitEnabled));
                                var       relativeUrl             = translitUrls
                                    ? FormattingHelper.ToTranslitedUrl(title)
                                    : FormattingHelper.ToRussianURL(title);
                                if (sourceUri != null)
                                {
                                    // Append a "read at the source" link below the text.
                                    newsItemTextStringBuilder.Append("<br>");
                                    newsItemTextStringBuilder.Append("<br>");
                                    var linkText = !string.IsNullOrEmpty(feed.LinkText) ? feed.LinkText : "Читать в источнике";
                                    newsItemTextStringBuilder.Append($@"<a href=""{sourceUri.AbsoluteUri}"" target=""_blank"">{linkText}</a>");
                                }
                                var newsItemText = newsItemTextStringBuilder.ToString();
                                if (feed.ExcludeItems.Any())
                                {
                                    newsItemText = RemoveDomElements(newsItemText, feed.ExcludeItems.Select(x => x.Selector), logService);
                                }

                                var blogUrl     = settingsService.GetSettings <string>(nameof(NewsSettings.BlogUrl));
                                var parentHeart = heartService.GetHeart(blogUrl);
                                var newNewsItem = new NewsItem()
                                {
                                    RssSource        = item.Id,
                                    AuthorId         = null,
                                    BreadcrumbsTitle = title,
                                    Text             = newsItemText,
                                    ImageId          = imageId,
                                    PostingDate      = item.PublishDate.DateTime,
                                    Description      = cuttedDescription,
                                    MetaDescription  = cuttedDescription,
                                    Title            = title,
                                    RecordType       = RecordType.Default,
                                    Layout           = "clientLayout",
                                    RelativeUrl      = relativeUrl,
                                    ParentHeartId    = parentHeart?.HeartId,
                                    Tags             = feed.Tags
                                };
                                if (feed.TargetCategoryId != null)
                                {
                                    newNewsItem.Categories = new List <IdNamePair <int> >()
                                    {
                                        new IdNamePair <int>(feed.TargetCategoryId.Value, string.Empty)
                                    };
                                }
                                logService.TraceMessage($"Creating NewsItem for {item.Id}");
                                int newsItemId = newsItemService.CreateNewsItem(newNewsItem);
                                rssProcessedItem.NewsItemId = newsItemId;
                                rssProcessedItemGateway.Insert(rssProcessedItem);
                            }
                        }
                        catch (Exception itemError)
                        {
                            // One broken item must not block the items that follow it.
                            logService.LogError(itemError);
                        }
                    }
                }
                catch (Exception e)
                {
                    logService.LogError(e);
                }
            }

            /// <summary>
            /// Turns the <c>src</c> value of an image element into a URL that can be downloaded.
            /// </summary>
            /// <param name="url">Raw <c>src</c> attribute value; may be null or empty.</param>
            /// <param name="feedBaseUri">Address of the feed, used as the base for root-relative paths.</param>
            /// <returns>
            /// Null when <paramref name="url"/> is null or blank; <c>http:</c> + url for a
            /// protocol-relative address (<c>//site/img.jpg</c>); the path resolved against the
            /// feed's scheme and host for a root-relative address (<c>/img.jpg</c>); otherwise
            /// the value unchanged.
            /// </returns>
            private static string ResolveImageUrl(string url, string feedBaseUri)
            {
                if (string.IsNullOrWhiteSpace(url))
                {
                    return null;
                }

                if (url.StartsWith("//", StringComparison.Ordinal))
                {
                    // A link like //site.ru/img.jpg becomes http://site.ru/img.jpg
                    return $"http:{url}";
                }

                if (url.StartsWith("/", StringComparison.Ordinal))
                {
                    Uri baseUri;
                    if (Uri.TryCreate(feedBaseUri, UriKind.Absolute, out baseUri))
                    {
                        // Uri resolution keeps scheme and host and drops the feed's own
                        // path and query, without producing a doubled slash.
                        return new Uri(baseUri, url).AbsoluteUri;
                    }
                }

                return url;
            }