/// <summary>
/// Loads the page at <paramref name="newsUrl"/> and returns the inner HTML of the first
/// <c>div</c> whose id is <c>article_body</c>, passed through <c>HtmlCleaner.CleanHtml</c>.
/// </summary>
/// <param name="newsUrl">Address of the article page to load.</param>
/// <returns>
/// The value produced by <c>HtmlCleaner.CleanHtml</c>. When the page has no matching
/// <c>div</c>, the cleaner is called with <c>null</c>; the result in that case is whatever
/// the cleaner returns for a null input.
/// </returns>
public string TutByParseNews(string newsUrl)
{
    // Deliberately no try/catch here: the former "catch (Exception ex) { throw ex; }"
    // handled nothing and only discarded the original stack trace.
    var web = new HtmlWeb();
    var doc = web.Load(newsUrl);

    var articleHtml = doc.DocumentNode
        .Descendants("div")
        .FirstOrDefault(d => d.Id == "article_body")
        ?.InnerHtml;

    return HtmlCleaner.CleanHtml(articleHtml);
}
/// <summary>
/// Loads the page at <paramref name="newsUrl"/> and returns the inner HTML of the first
/// <c>div</c> whose <c>class</c> attribute equals <c>news-text</c>, passed through
/// <c>HtmlCleaner.CleanHtml</c>.
/// </summary>
/// <param name="newsUrl">Address of the article page to load.</param>
/// <returns>
/// The value produced by <c>HtmlCleaner.CleanHtml</c>. When the page has no matching
/// <c>div</c>, the cleaner is called with <c>null</c>; the result in that case is whatever
/// the cleaner returns for a null input.
/// </returns>
public string OnlinerParseNews(string newsUrl)
{
    // Deliberately no try/catch here: the former "catch (Exception ex) { throw ex; }"
    // handled nothing and only discarded the original stack trace.
    var web = new HtmlWeb();
    var doc = web.Load(newsUrl);

    // Look the class attribute up by name. The previous code dereferenced
    // Attributes.FirstOrDefault() directly, which threw NullReferenceException on any
    // attribute-less <div> encountered before the match, and it only recognised the
    // article container when "class" happened to be the first attribute.
    var articleHtml = doc.DocumentNode
        .Descendants("div")
        .FirstOrDefault(d => d.GetAttributeValue("class", string.Empty) == "news-text")
        ?.InnerHtml;

    return HtmlCleaner.CleanHtml(articleHtml);
}
/// <summary>
/// Walks every news item returned by the news repository and, for items that have no body
/// yet, fetches the body from the origin site, cleans it, derives the plain text and saves.
/// </summary>
/// <remarks>
/// The origin site is chosen by substring match on <c>news.Source</c> ("tut.by" or
/// "onliner.by"). A parser failure for one item is logged and that item is skipped; it does
/// not abort the loop. Any other exception propagates to the caller unchanged.
/// </remarks>
/// <returns>A task that completes when all items have been processed.</returns>
public async Task LoadAllNewsBody()
{
    // Deliberately no outer try/catch: the former "catch (Exception ex) { throw ex; }"
    // handled nothing and only discarded the original stack trace.
    foreach (News news in await _unitOfWork.NewsRepository.GetAllAsync())
    {
        // Skip missing entries and entries whose body is already loaded.
        // The original condition used "&&", which dereferenced a null item and
        // never skipped an item that already had a body.
        if (news == null || news.Body != null)
        {
            continue;
        }

        if (news.Source.Contains("tut.by")) // Check origin site
        {
            try
            {
                news.Body = _newsParser.TutByParseNews(news.Source);
            }
            catch (Exception ex)
            {
                // Best effort: record the failure and move on to the next item.
                Log.Information($"{DateTime.Now}|Info|Couldnt parse body of {news.Source}: {ex.Message}");
                continue;
            }
        }

        if (news.Source.Contains("onliner.by"))
        {
            try
            {
                news.Body = _newsParser.OnlinerParseNews(news.Source);
            }
            catch (Exception ex)
            {
                // Best effort: record the failure and move on to the next item.
                Log.Information($"{DateTime.Now}|Info|Couldnt parse body of {news.Source}: {ex.Message}");
                continue;
            }
        }

        if (news.Body != null)
        {
            news.Body = HtmlCleaner.CleanHtml(news.Body);
            news.PlainText = HtmlCleaner.GetPlainText(news.Body);
            // Saved per item, so progress made before a later failure is kept.
            await _unitOfWork.SaveDBAsync();
        }
        else
        {
            Log.Information($"{DateTime.Now}|Info|Couldnt get body of {news.Source}");
        }
    }
}