Пример #1
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns the inner HTML of the first
        /// <c>div</c> whose class is exactly <c>news-container</c>, after removing auxiliary
        /// elements and converting the header background image.
        /// </summary>
        /// <param name="url">Address of the page to download and parse.</param>
        /// <returns>
        /// The cleaned inner HTML, or <c>null</c> when the page contains no matching container.
        /// </returns>
        public async Task<string> Parse(string url)
        {
            var web = new HtmlWeb();

            // Await the download so the method is actually asynchronous instead of
            // blocking the caller's thread on network I/O.
            var htmlDoc = await web.LoadFromWebAsync(url);

            // SelectSingleNode returns null when nothing matches; indexing the result of
            // SelectNodes(...)[0] would throw in that case.
            var node = htmlDoc.DocumentNode
                       .SelectSingleNode("(//div[@class='news-container'])[1]");

            if (node == null)
            {
                return null;
            }

            var xpathsToRemove = new[]
            {
                "//meta",
                "//script",
                "//div[contains(@class,'js-banner-container')]",
                "//div[contains(@class, 'news-reference')]",
                "//div[contains(@class, 'news-popular')]",
                "//div[contains(@class, 'news-discussion')]",
                "//div[contains(@class, 'news-widget')]",
                "//div[@class='news-header__flex']",
                "//div[contains(@class, 'news-incut')]",
                // NOTE(review): listed twice on purpose in the original - presumably to drop the
                // two trailing paragraphs if removals are applied in order; confirm in HtmlCleanup.
                "(//p)[last()]",
                "(//p)[last()]"
            };

            node = HtmlCleanup.RemoveElementsByXpath(node, xpathsToRemove);
            node = HtmlCleanup.ReplaceBackgroundImageWithImg(node, "//div[@class='news-header__image']");

            return node?.InnerHtml;
        }
        /// <summary>
        /// Looks up the news item whose id equals <c>request.Id</c> and returns its body after
        /// passing it through <c>HtmlCleanup.RemoveHtmlTags</c>.
        /// </summary>
        /// <param name="request">Query carrying the id of the news item to read.</param>
        /// <param name="cancellationToken">Token forwarded to the database lookup.</param>
        /// <returns>
        /// The result of <c>HtmlCleanup.RemoveHtmlTags</c> for the stored body, or <c>null</c>
        /// when no news item with the requested id exists.
        /// </returns>
        public async Task<string> Handle(GetNewsBodyTextQuery request, CancellationToken cancellationToken)
        {
            var news = await _dbContext.News
                       .FirstOrDefaultAsync(n => n.Id.Equals(request.Id), cancellationToken);

            // FirstOrDefaultAsync yields null for an unknown id; reading .Body on it
            // directly would fail with a NullReferenceException.
            if (news == null)
            {
                return null;
            }

            return HtmlCleanup.RemoveHtmlTags(news.Body);
        }
Пример #3
0
        /// <summary>
        /// Returns the text of a feed item with <c>HtmlCleanup.RemoveInlineStyles</c> applied:
        /// the summary text when it is non-empty, otherwise the text of the item's content.
        /// </summary>
        /// <param name="syndicationItem">Feed item to read the summary or content from.</param>
        /// <returns>The result of <c>HtmlCleanup.RemoveInlineStyles</c> for the selected text.</returns>
        public static string GetSyndicationItemSummary(SyndicationItem syndicationItem)
        {
            var summaryText = syndicationItem.Summary?.Text;

            if (string.IsNullOrEmpty(summaryText))
            {
                // Content may be absent or of a non-text kind; a hard cast would throw in both
                // cases, so fall back to null the same way a missing summary already does.
                summaryText = (syndicationItem.Content as TextSyndicationContent)?.Text;
            }

            return HtmlCleanup.RemoveInlineStyles(summaryText);
        }
Пример #4
0
        /// <summary>
        /// Requests up to 30 not-yet-rated news items, computes a rating for each one that has a
        /// body, and stores it with a <c>SetNewsRateCommand</c>.
        /// </summary>
        /// <remarks>
        /// The rating is the average of the AFINN values found for the lemmas of the body text;
        /// when no lemma has a value the rating is stored as 0. Items with an empty body are
        /// logged as errors and skipped.
        /// </remarks>
        public async Task RateNews()
        {
            var topNNotratedNewsQuery = new GetTopNNotRatedNewsQuery { Count = 30 };
            var topNotRatedNews       = await _mediator.Send(topNNotratedNewsQuery);

            var affinList  = Text.ReadAfinn(_configuration["AfinnPath"]);
            var ratedCount = 0;

            foreach (var news in topNotRatedNews)
            {
                if (string.IsNullOrEmpty(news.Body))
                {
                    Log.Error($"News with id {news.Id} has empty body");
                    continue;
                }

                var clearBodyText = HtmlCleanup.RemoveHtmlTags(news.Body);
                var lemmaList     = (await Ispras.GetTexterra(Text.PrepareForIspras(clearBodyText)))
                                    .Where(l => !string.IsNullOrEmpty(l));

                // Look up each lemma's AFINN value (first match wins), ignore lemmas
                // without a value, and average what is left.
                var rate = lemmaList
                           .Select(l => affinList.FirstOrDefault(aff => aff.Key == l).Value)
                           .Where(r => r != null)
                           .Average(r => r);

                if (rate == null)
                {
                    // Average over an empty nullable sequence is null: no lemma was rated.
                    rate = 0;
                    Log.Debug($"News with id {news.Id} rate is null !!!");
                }

                var setNewsRateCommand = new SetNewsRateCommand
                {
                    Id     = news.Id,
                    Rating = (float)rate.Value
                };

                await _mediator.Send(setNewsRateCommand);
                ratedCount++;
            }

            // Logged once, with the number of items actually rated. Previously this ran on
            // every iteration and reported the size of the whole batch each time.
            Log.Information($"Set rate for {ratedCount} news");
        }
Пример #5
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns the inner HTML of the
        /// <c>div</c> whose class is exactly <c>container</c>, after removing auxiliary elements.
        /// </summary>
        /// <param name="url">Address of the page to download and parse.</param>
        /// <returns>
        /// The cleaned inner HTML, or <c>null</c> when the cleaned node is <c>null</c>.
        /// </returns>
        public async Task<string> Parse(string url)
        {
            var web = new HtmlWeb();

            // Await the download so the method is actually asynchronous instead of
            // blocking the caller's thread on network I/O.
            var htmlDoc = await web.LoadFromWebAsync(url);

            var node = htmlDoc.DocumentNode
                       .SelectSingleNode("//div[@class = 'container']");

            var xpathsToRemove = new[]
            {
                "//meta",
                "//div[@class='more-box']",
                "//p[@class='mb_source']"
            };

            node = HtmlCleanup.RemoveElementsByXpath(node, xpathsToRemove);

            return node?.InnerHtml;
        }
Пример #6
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns the inner HTML of the node
        /// selected by <c>//article[1]</c>, after removing auxiliary elements and converting the
        /// article image section's background image.
        /// </summary>
        /// <param name="url">Address of the page to download and parse.</param>
        /// <returns>
        /// The cleaned inner HTML, or <c>null</c> when the cleaned node is <c>null</c>.
        /// </returns>
        public async Task<string> Parse(string url)
        {
            var web = new HtmlWeb();

            // Await the download so the method is actually asynchronous instead of
            // blocking the caller's thread on network I/O.
            var htmlDoc = await web.LoadFromWebAsync(url);

            var node = htmlDoc.DocumentNode
                       .SelectSingleNode("//article[1]");

            var xpathsToRemove = new[]
            {
                "//meta",
                "//script",
                "//div[@class='headline__stamps']",
                "//div[@class='embeded-post-info']",
                "//div[@class='source sa-source-wrapper']"
            };

            node = HtmlCleanup.RemoveElementsByXpath(node, xpathsToRemove);
            node = HtmlCleanup.ReplaceBackgroundImageWithImg(node, "//section[@class='article__img']");

            return node?.InnerHtml;
        }
Пример #7
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns the inner HTML of the
        /// <c>div</c> whose class contains both <c>page_news</c> and <c>noselect</c>, after
        /// removing auxiliary elements.
        /// </summary>
        /// <param name="url">Address of the page to download and parse.</param>
        /// <returns>
        /// The cleaned inner HTML, or <c>null</c> when the cleaned node is <c>null</c>.
        /// </returns>
        public async Task<string> Parse(string url)
        {
            var web = new HtmlWeb();

            // Await the download so the method is actually asynchronous instead of
            // blocking the caller's thread on network I/O.
            var htmlDoc = await web.LoadFromWebAsync(url);

            var node = htmlDoc.DocumentNode
                       .SelectSingleNode("//div[contains(@class, 'page_news') and contains(@class, 'noselect')]");

            var xpathsToRemove = new[]
            {
                "//meta",
                "//div[contains(@class, 'share_block')]",
                "//div[contains(@class, 'favorite_block')]",
                "//div[contains(@class, 'news_info')]",
                "//div[contains(@class, 'vn-player')]",
                "//div[contains(@class, 'uninote console')]",
                "//div[contains(@class, 'nepncont')]",
            };

            node = HtmlCleanup.RemoveElementsByXpath(node, xpathsToRemove);

            return node?.InnerHtml;
        }
Пример #8
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns the cleaned inner HTML of its
        /// header block followed by the cleaned inner HTML of its content wrapper.
        /// </summary>
        /// <param name="url">Address of the page to download and parse.</param>
        /// <returns>
        /// Header HTML concatenated with content HTML; a part whose cleaned node is <c>null</c>
        /// contributes an empty string.
        /// </returns>
        public async Task<string> Parse(string url)
        {
            var web = new HtmlWeb();

            // Await the download so the method is actually asynchronous instead of
            // blocking the caller's thread on network I/O.
            var htmlDoc = await web.LoadFromWebAsync(url);

            var headerNode = htmlDoc.DocumentNode
                             .SelectSingleNode("//div[contains(@class, 'entryContextHeader clearfix')]");

            var headerXpathsToRemove = new[]
            {
                "//meta",
                "//div[contains(@class, 'byline')]",
                "//div[contains(@class, 'favorite_block')]",
                "//div[contains(@class, 'news_info')]",
                "//div[contains(@class, 'vn-player')]",
                "//div[contains(@class, 'uninote console')]",
                "//div[contains(@class, 'nepncont')]",
            };

            headerNode = HtmlCleanup.RemoveElementsByXpath(headerNode, headerXpathsToRemove);

            var contentNode = htmlDoc.DocumentNode
                              .SelectSingleNode("//div[@id = 'contentWrapper']");

            var contentXpathsToRemove = new[]
            {
                "//meta",
                "//script",
                "//section[@class = 'related']",
                "//section[@class ='sources entryMeta']",
                "//section[@class = 'tags clearfix entryMeta']",
                "//div[contains(@class, 'mtl mbl')]",
                "//div[@id = 'commentsContainer']",
                "//aside[@id = 'aside']"
            };

            contentNode = HtmlCleanup.RemoveElementsByXpath(contentNode, contentXpathsToRemove);

            return $"{headerNode?.InnerHtml}{contentNode?.InnerHtml}";
        }