/// <summary>
/// Loads the page at <paramref name="url"/> and returns the inner HTML of the first
/// <c>div</c> whose class attribute is exactly <c>news-container</c>, after passing it
/// through the <c>HtmlCleanup</c> helpers.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <returns>
/// The inner HTML of the processed container, or <c>null</c> when the page contains no
/// matching container.
/// </returns>
/// <remarks>
/// The method is declared <c>async</c>, but the page is loaded with the synchronous
/// <c>HtmlWeb.Load</c> call.
/// </remarks>
public async Task<string> Parse(string url)
{
    HtmlWeb web = new HtmlWeb();
    var htmlDoc = web.Load(url);

    // SelectSingleNode yields null when nothing matches. The previous
    // SelectNodes(...)[0] threw instead, because SelectNodes returns null on no match.
    var node = htmlDoc.DocumentNode
        .SelectSingleNode("(//div[@class='news-container'])[1]");
    if (node == null)
    {
        return null;
    }

    node = HtmlCleanup.RemoveElementsByXpath(node,
        new[]
        {
            "//meta",
            "//script",
            "//div[contains(@class,'js-banner-container')]",
            "//div[contains(@class, 'news-reference')]",
            "//div[contains(@class, 'news-popular')]",
            "//div[contains(@class, 'news-discussion')]",
            "//div[contains(@class, 'news-widget')]",
            "//div[@class='news-header__flex']",
            "//div[contains(@class, 'news-incut')]",
            // NOTE(review): listed twice on purpose in the original; presumably this drops
            // the last two paragraphs - confirm against RemoveElementsByXpath.
            "(//p)[last()]",
            "(//p)[last()]"
        });

    node = HtmlCleanup.ReplaceBackgroundImageWithImg(node, "//div[@class='news-header__image']");

    return node?.InnerHtml;
}
/// <summary>
/// Fetches the news item whose id equals <c>request.Id</c> and returns its body after
/// processing it with <c>HtmlCleanup.RemoveHtmlTags</c>.
/// </summary>
/// <param name="request">Query carrying the id of the news item to read.</param>
/// <param name="cancellationToken">Token forwarded to the database query.</param>
/// <returns>
/// The processed body text, or <c>null</c> when no news item with the requested id exists.
/// </returns>
public async Task<string> Handle(GetNewsBodyTextQuery request, CancellationToken cancellationToken)
{
    var news = await _dbContext.News
        .FirstOrDefaultAsync(n => n.Id.Equals(request.Id), cancellationToken);

    // FirstOrDefaultAsync yields null for an unknown id; dereferencing .Body directly
    // used to surface that as a NullReferenceException.
    if (news == null)
    {
        return null;
    }

    return HtmlCleanup.RemoveHtmlTags(news.Body);
}
/// <summary>
/// Returns the text of a feed item with <c>HtmlCleanup.RemoveInlineStyles</c> applied.
/// The item's summary is preferred; when it is missing or empty, the item's textual
/// content is used instead.
/// </summary>
/// <param name="syndicationItem">Feed item to read the text from.</param>
/// <returns>The result of <c>HtmlCleanup.RemoveInlineStyles</c> for the chosen text.</returns>
public static string GetSyndicationItemSummary(SyndicationItem syndicationItem)
{
    var retVal = syndicationItem.Summary?.Text;

    if (string.IsNullOrEmpty(retVal))
    {
        // Content may be absent or of a non-text kind (e.g. URL or XML content); a hard
        // cast would throw in those cases, so fall back to null text instead.
        retVal = (syndicationItem.Content as TextSyndicationContent)?.Text;
    }

    return HtmlCleanup.RemoveInlineStyles(retVal);
}
/// <summary>
/// Rates a batch of up to 30 not-yet-rated news items. For each item with a non-empty
/// body, the body is stripped via <c>HtmlCleanup.RemoveHtmlTags</c>, sent to
/// <c>Ispras.GetTexterra</c>, and the returned non-empty lemmas are looked up in the list
/// read from the <c>AfinnPath</c> configuration entry. The item's rating is the average of
/// the values found, or 0 when none is found, and is stored with a
/// <c>SetNewsRateCommand</c>.
/// </summary>
/// <remarks>
/// Items with an empty body are logged as errors and skipped. The number of items that
/// were actually rated is logged once after the batch completes.
/// </remarks>
public async Task RateNews()
{
    var topNNotratedNewsQuery = new GetTopNNotRatedNewsQuery() { Count = 30 };
    var topNotRatedNews = await _mediator.Send(topNNotratedNewsQuery);
    var affinList = Text.ReadAfinn(_configuration["AfinnPath"]);

    // Index the list once instead of scanning it for every lemma. Taking the first value
    // of a group keeps the "first matching entry wins" behaviour of the linear scan.
    var rateByLemma = affinList.ToLookup(aff => aff.Key, aff => aff.Value);

    var ratedCount = 0;

    foreach (var news in topNotRatedNews)
    {
        if (string.IsNullOrEmpty(news.Body))
        {
            Log.Error($"News with id {news.Id} has empty body");
            continue;
        }

        var clearBodyText = HtmlCleanup.RemoveHtmlTags(news.Body);
        var lemmaList = (await Ispras.GetTexterra(Text.PrepareForIspras(clearBodyText)))
            .Where(l => !string.IsNullOrEmpty(l));

        // A lemma without an entry yields an empty group, hence a null rate.
        var lemmaWithRate = lemmaList.Select(l => new
        {
            Lemma = l,
            Rate = rateByLemma[l].FirstOrDefault()
        });

        // Average over nullable values returns null for an empty sequence.
        var rate = lemmaWithRate.Where(r => r.Rate != null)
            .Average(r => r.Rate);
        if (rate == null)
        {
            rate = 0;
            Log.Debug($"News with id {news.Id} rate is null !!!");
        }

        var setNewsRateCommand = new SetNewsRateCommand()
        {
            Id = news.Id,
            Rating = (float)rate.Value
        };
        await _mediator.Send(setNewsRateCommand);
        ratedCount++;
    }

    // Logged once with the real number of updates; previously the size of the whole
    // batch was logged after every single update.
    Log.Information($"Set rate for {ratedCount} news");
}
/// <summary>
/// Loads the page at <paramref name="url"/>, selects the <c>div</c> whose class is
/// <c>container</c>, hands it to <c>HtmlCleanup.RemoveElementsByXpath</c> together with
/// the XPath expressions below, and returns the inner HTML of the result.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <returns>Inner HTML of the processed node, or <c>null</c> when that node is null.</returns>
public async Task<string> Parse(string url)
{
    var unwantedXpaths = new[]
    {
        "//meta",
        "//div[@class='more-box']",
        "//p[@class='mb_source']"
    };

    var page = new HtmlWeb().Load(url);
    HtmlNode container = page.DocumentNode.SelectSingleNode("//div[@class = 'container']");
    HtmlNode cleaned = HtmlCleanup.RemoveElementsByXpath(container, unwantedXpaths);

    return cleaned?.InnerHtml;
}
/// <summary>
/// Loads the page at <paramref name="url"/>, selects the node matching
/// <c>//article[1]</c>, passes it through <c>HtmlCleanup.RemoveElementsByXpath</c> and
/// <c>HtmlCleanup.ReplaceBackgroundImageWithImg</c>, and returns the inner HTML of the
/// result.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <returns>Inner HTML of the processed node, or <c>null</c> when that node is null.</returns>
public async Task<string> Parse(string url)
{
    var unwantedXpaths = new[]
    {
        "//meta",
        "//script",
        "//div[@class='headline__stamps']",
        "//div[@class='embeded-post-info']",
        "//div[@class='source sa-source-wrapper']"
    };

    var page = new HtmlWeb().Load(url);
    HtmlNode article = page.DocumentNode.SelectSingleNode("//article[1]");

    HtmlNode cleaned = HtmlCleanup.RemoveElementsByXpath(article, unwantedXpaths);
    HtmlNode withImages = HtmlCleanup.ReplaceBackgroundImageWithImg(
        cleaned,
        "//section[@class='article__img']");

    return withImages?.InnerHtml;
}
/// <summary>
/// Loads the page at <paramref name="url"/>, selects the <c>div</c> whose class contains
/// both <c>page_news</c> and <c>noselect</c>, hands it to
/// <c>HtmlCleanup.RemoveElementsByXpath</c> together with the XPath expressions below,
/// and returns the inner HTML of the result.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <returns>Inner HTML of the processed node, or <c>null</c> when that node is null.</returns>
public async Task<string> Parse(string url)
{
    var unwantedXpaths = new[]
    {
        "//meta",
        "//div[contains(@class, 'share_block')]",
        "//div[contains(@class, 'favorite_block')]",
        "//div[contains(@class, 'news_info')]",
        "//div[contains(@class, 'vn-player')]",
        "//div[contains(@class, 'uninote console')]",
        "//div[contains(@class, 'nepncont')]"
    };

    var page = new HtmlWeb().Load(url);
    HtmlNode newsBlock = page.DocumentNode.SelectSingleNode(
        "//div[contains(@class, 'page_news') and contains(@class, 'noselect')]");
    HtmlNode cleaned = HtmlCleanup.RemoveElementsByXpath(newsBlock, unwantedXpaths);

    return cleaned?.InnerHtml;
}
/// <summary>
/// Loads the page at <paramref name="url"/> and builds the result from two parts of it:
/// the <c>div</c> whose class contains <c>entryContextHeader clearfix</c> and the
/// <c>div</c> with id <c>contentWrapper</c>. Each part is passed through
/// <c>HtmlCleanup.RemoveElementsByXpath</c> with its own list of XPath expressions.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <returns>
/// The header's inner HTML followed by the content's inner HTML; a part whose processed
/// node is null contributes nothing to the string.
/// </returns>
public async Task<string> Parse(string url)
{
    var headerXpaths = new[]
    {
        "//meta",
        "//div[contains(@class, 'byline')]",
        "//div[contains(@class, 'favorite_block')]",
        "//div[contains(@class, 'news_info')]",
        "//div[contains(@class, 'vn-player')]",
        "//div[contains(@class, 'uninote console')]",
        "//div[contains(@class, 'nepncont')]"
    };
    var contentXpaths = new[]
    {
        "//meta",
        "//script",
        "//section[@class = 'related']",
        "//section[@class ='sources entryMeta']",
        "//section[@class = 'tags clearfix entryMeta']",
        "//div[contains(@class, 'mtl mbl')]",
        "//div[@id = 'commentsContainer']",
        "//aside[@id = 'aside']"
    };

    var page = new HtmlWeb().Load(url);

    // The header is selected and processed before the content node is looked up,
    // matching the order in which the two parts appear in the result.
    HtmlNode header = page.DocumentNode.SelectSingleNode(
        "//div[contains(@class, 'entryContextHeader clearfix')]");
    header = HtmlCleanup.RemoveElementsByXpath(header, headerXpaths);

    HtmlNode content = page.DocumentNode.SelectSingleNode("//div[@id = 'contentWrapper']");
    content = HtmlCleanup.RemoveElementsByXpath(content, contentXpaths);

    return $"{header?.InnerHtml}{content?.InnerHtml}";
}