/// <summary>
/// Downloads the page at the given address and parses it into an HTML document
/// </summary>
/// <param name="url">Address of the page to download</param>
/// <param name="encoding">Name of the text encoding of the page; null or empty means "utf-8"</param>
/// <returns>Parsed document, or null when downloading or parsing threw (the error is stored in Error)</returns>
protected async Task<HtmlDocument> GetArticle(string url, string encoding = "")
{
    HtmlDocument document = null;
    try
    {
        // Callers block on the returned task, so the continuation must not need
        // the caller's synchronization context to resume.
        var response = await _httpClient.GetByteArrayAsync(url).ConfigureAwait(false);

        if (string.IsNullOrEmpty(encoding))
        {
            encoding = "utf-8";
        }

        // Decode every byte of the payload: a count of Length - 1 loses the last
        // byte and is out of range for an empty body.
        var source = Encoding.GetEncoding(encoding).GetString(response);

        // Entities are decoded in the raw markup, before the HTML is parsed.
        // NOTE(review): this also turns escaped markup (e.g. &lt;p&gt;) into real tags - confirm intended.
        source = WebUtility.HtmlDecode(source);

        document = new HtmlDocument();
        document.LoadHtml(source);
    }
    catch (Exception ex)
    {
        Error = new ErrorDescription { Level = ApplicationLevel.NewsParser, Description = ex.Message };
    }

    return document;
}
/// <summary>
/// Parses article from given Url.
/// On success the text of the article paragraphs is stored in article.Body and,
/// when a picture could be downloaded, its content in article.Header.Enclosure.
/// </summary>
/// <param name="article">Article container. Url is stored inside </param>
/// <returns>Success indication flag</returns>
public override bool ParseArticle(ArticleContainer article)
{
    // GetResult() blocks just like Wait() but rethrows the original exception
    // instead of wrapping it into an AggregateException.
    var doc = GetArticle(article.Header.Link).GetAwaiter().GetResult();
    if (doc == null)
    {
        return false;
    }

    // Only the first matching container is needed, so stop at the first hit.
    var newsBody = doc.DocumentNode.Descendants().FirstOrDefault(x =>
        x.Name == "div"
        && x.Attributes["class"] != null
        && x.Attributes["class"].Value.Contains("article__text js-module js-mediator-article"));
    if (newsBody == null)
    {
        return false;
    }

    var indexNode = GetDescendantByAttributes(newsBody, "div", "class", "article__item_html");
    if (indexNode == null)
    {
        return false;
    }

    var sb = new StringBuilder();
    foreach (var node in indexNode.ChildNodes.Where(n => n.Name == "p"))
    {
        // Non-breaking spaces become ordinary ones; the escape keeps the
        // otherwise invisible character from being lost by editors.
        sb.Append(node.InnerText.Replace("\u00A0", " ")).Append(" ");
    }

    var hasPicture = false;
    var img = GetDescendantByAttributes(newsBody, "img", "class", "photo__pic");
    if (img != null)
    {
        try
        {
            // An <img> without a usable src simply means "no picture".
            var src = img.Attributes["src"];
            if (src != null && !string.IsNullOrEmpty(src.Value))
            {
                var pict = GetBinaryContent(src.Value).GetAwaiter().GetResult();
                if (pict != null)
                {
                    article.Header.Enclosure = pict;
                    hasPicture = true;
                }
            }
        }
        catch (Exception ex)
        {
            // A failed picture download must not discard the parsed text.
            Error = new ErrorDescription { Level = ApplicationLevel.NewsParser, Description = ex.Message };
        }
    }

    article.Body = new BodyContainer { Body = sb.ToString(), HasPicture = hasPicture };
    return true;
}
/// <summary>
/// Obtains article headers from RSS source.
/// When refreshing, the feed is loaded from source.Url, the source logo is downloaded
/// if it is not set yet, and every item not yet known to the content storage is saved there.
/// A failure while refreshing is stored in _errorDescription and the headers collected
/// so far are returned.
/// </summary>
/// <param name="source">Rss source</param>
/// <param name="refresh">Flag indicating news should be refreshed</param>
/// <returns>News headers</returns>
public List<NewsHeader> GetArticlesHeaders(RssSource source, bool refresh)
{
    // Without a refresh only the articles already kept in the storage are reported.
    if (!refresh)
    {
        return _contentStorage.GetArticlesBySource(source).Select(art => art.Header).ToList();
    }

    var articleHeaders = new List<NewsHeader>();
    try
    {
        var rssFeed = new XmlDocument();
        rssFeed.Load(source.Url); // todo refactor this with HttpClient

        // Load logo if present
        if (source.Logo == null)
        {
            var logoNodes = rssFeed.SelectNodes("rss/channel/image");
            if (logoNodes != null)
            {
                // Only the first <image> that carries a <url> is used.
                var logoUrl = logoNodes.Cast<XmlNode>()
                    .Select(imageNode => imageNode.SelectSingleNode("url"))
                    .Where(urlNode => urlNode != null)
                    .Select(urlNode => urlNode.InnerText)
                    .FirstOrDefault();
                if (logoUrl != null)
                {
                    // Addresses without a scheme get "http:" prepended.
                    if (!logoUrl.Contains("http"))
                    {
                        logoUrl = "http:" + logoUrl;
                    }

                    var logo = DownloadBytes(logoUrl);
                    if (logo != null)
                    {
                        source.Logo = logo;
                    }
                }
            }
        }

        var rssNodes = rssFeed.SelectNodes("rss/channel/item");
        if (rssNodes != null)
        {
            foreach (XmlNode rssNode in rssNodes)
            {
                var rssSubNode = rssNode.SelectSingleNode("link");
                var link = rssSubNode != null ? rssSubNode.InnerText : "";

                // Articles the storage already knows are reused as they are.
                var stored = _contentStorage.GetArticleByUrl(link).GetAwaiter().GetResult();
                if (stored != null)
                {
                    articleHeaders.Add(stored.Header);
                    continue;
                }

                rssSubNode = rssNode.SelectSingleNode("title");
                var title = rssSubNode != null ? rssSubNode.InnerText.Replace("—", "-") : "";

                rssSubNode = rssNode.SelectSingleNode("description");
                var description = rssSubNode != null ? rssSubNode.InnerText.StripTagsRegex() : "";
                if (description.Length > 150)
                {
                    description = description.Substring(0, 147) + "..."; // trim too long description for pop-over
                }

                rssSubNode = rssNode.SelectSingleNode("pubDate");
                var date = rssSubNode != null ? rssSubNode.InnerText : "";

                // An <enclosure> without a url attribute is treated as "no enclosure".
                rssSubNode = rssNode.SelectSingleNode("enclosure");
                var enclosure = "";
                if (rssSubNode != null && rssSubNode.Attributes != null)
                {
                    var urlAttribute = rssSubNode.Attributes["url"];
                    if (urlAttribute != null)
                    {
                        enclosure = urlAttribute.Value;
                    }
                }

                // The download is finished before the header is built, so
                // Enclosure and HasEnclosure reflect the actual result.
                byte[] enclosured = enclosure != "" ? DownloadBytes(enclosure) : null;

                var newsHeader = new NewsHeader
                {
                    Description = description,
                    Link = link,
                    Title = title,
                    Guid = Guid.NewGuid(),
                    // Throws for a missing or unparsable pubDate; the catch below records it.
                    PublishDate = DateTime.Parse(date),
                    Enclosure = enclosured,
                    Source = source.SiteName,
                    HasLogo = source.Logo != null,
                    HasEnclosure = enclosured != null
                };
                articleHeaders.Add(newsHeader);

                _contentStorage.SaveArticle(new ArticleContainer
                {
                    Guid = newsHeader.Guid,
                    RssSource = source,
                    Header = newsHeader
                });
            }
        }
    }
    catch (Exception ex)
    {
        _errorDescription = new ErrorDescription { Level = ApplicationLevel.RssParser, Description = ex.Message };
    }

    return articleHeaders;
}

/// <summary>
/// Downloads the resource at the given address and returns its body, blocking until the body is read
/// </summary>
/// <param name="url">Address of the resource</param>
/// <returns>Downloaded bytes, or null when the request or reading the body failed</returns>
private byte[] DownloadBytes(string url)
{
    try
    {
        using (var response = GetContent(url).GetAwaiter().GetResult())
        {
            // GetContent yields null when the request itself threw.
            if (response == null || response.Content == null)
            {
                return null;
            }

            // ReadAsByteArrayAsync is declared on HttpContent, so no downcast is needed.
            return response.Content.ReadAsByteArrayAsync().GetAwaiter().GetResult();
        }
    }
    catch (Exception ex)
    {
        // Best effort: a broken image must not abort processing of the feed.
        _errorDescription = new ErrorDescription { Level = ApplicationLevel.RssParser, Description = ex.Message };
        return null;
    }
}
/// <summary>
/// Parses article from given Url.
/// On success the text of the article paragraphs is stored in article.Body and,
/// when a picture could be downloaded, its content in article.Header.Enclosure.
/// </summary>
/// <param name="article">Article container. Url is stored inside </param>
/// <returns>Success indication flag</returns>
public override bool ParseArticle(ArticleContainer article)
{
    // GetResult() blocks just like Wait() but rethrows the original exception
    // instead of wrapping it into an AggregateException.
    var doc = GetArticle(article.Header.Link).GetAwaiter().GetResult();
    if (doc == null)
    {
        return false;
    }

    // Only the first matching container is needed, so stop at the first hit.
    var newsBody = doc.DocumentNode.Descendants().FirstOrDefault(x =>
        x.Name == "div"
        && x.Attributes["class"] != null
        && x.Attributes["class"].Value.Contains("news_body"));
    if (newsBody == null)
    {
        return false;
    }

    var sb = new StringBuilder();
    foreach (var node in newsBody.ChildNodes.Where(n => n.Name == "p"))
    {
        sb.Append(node.InnerText).Append(" ");
    }

    // The picture is looked up in the whole document, not only inside the body container.
    var img = doc.DocumentNode.Descendants().FirstOrDefault(x =>
        x.Name == "img"
        && x.Attributes["class"] != null
        && x.Attributes["class"].Value.Contains("main_image"));

    var hasPicture = false;
    if (img != null)
    {
        try
        {
            // An <img> without a usable src simply means "no picture".
            var src = img.Attributes["src"];
            if (src != null && !string.IsNullOrEmpty(src.Value))
            {
                var pict = GetBinaryContent(src.Value).GetAwaiter().GetResult();
                if (pict != null)
                {
                    article.Header.Enclosure = pict;
                    hasPicture = true;
                }
            }
        }
        catch (Exception ex)
        {
            // A failed picture download must not discard the parsed text.
            Error = new ErrorDescription { Level = ApplicationLevel.NewsParser, Description = ex.Message };
        }
    }

    article.Body = new BodyContainer { Body = sb.ToString(), HasPicture = hasPicture };
    return true;
}
/// <summary>
/// Requests the given address with an HTTP GET
/// </summary>
/// <param name="url">Address to request</param>
/// <returns>Response of the server, or null when the request threw (the error is stored in _errorDescription)</returns>
private async Task<HttpResponseMessage> GetContent(string url)
{
    try
    {
        // Callers wait for this task synchronously; not resuming on the captured
        // context keeps that wait from deadlocking under a single-threaded context.
        return await _httpClient.GetAsync(url).ConfigureAwait(false);
    }
    catch (Exception ex)
    {
        _errorDescription = new ErrorDescription { Level = ApplicationLevel.RssParser, Description = ex.Message };
        return null;
    }
}