/// <summary>
/// Downloads the page at the given url and parses it into an HTML document
/// </summary>
/// <param name="url">Address of the page to download</param>
/// <param name="encoding">Name of the encoding used to decode the response bytes; "utf-8" is used when null or empty</param>
/// <returns>Parsed document, or null when downloading, decoding or parsing threw (the failure is stored in Error)</returns>
protected async Task<HtmlDocument> GetArticle(string url, string encoding = "")
 {
     HtmlDocument document = null;
     try
     {
         var response = await _httpClient.GetByteArrayAsync(url);
         if (string.IsNullOrEmpty(encoding))
             encoding = "utf-8";
         // Decode the whole buffer: the former "Length - 1" count dropped the last byte
         // (corrupting a trailing multi-byte character) and threw on an empty body.
         var source = Encoding.GetEncoding(encoding).GetString(response);
         // Entities are decoded before parsing, so node text read later is already plain text
         source = WebUtility.HtmlDecode(source);
         document = new HtmlDocument();
         document.LoadHtml(source);
     }
     catch (Exception ex)
     {
         // Best effort: remember the failure and signal it to the caller with a null document
         Error = new ErrorDescription
         {
             Level = ApplicationLevel.NewsParser,
             Description = ex.Message
         };
     }
     return document;
 }
示例#2
0
        /// <summary>
        /// Downloads the page referenced by the article header and fills in the article body
        /// </summary>
        /// <param name="article">Article container; the page address is read from its header link</param>
        /// <returns>True when the article text block was found and the body was stored, false otherwise</returns>
        public override bool ParseArticle(ArticleContainer article)
        {
            var pageTask = GetArticle(article.Header.Link);
            pageTask.Wait();
            var page = pageTask.Result;
            if (page == null)
                return false;

            // First div whose class attribute contains the article text marker
            var container = page.DocumentNode.Descendants()
                .Where(candidate => candidate.Name == "div")
                .Where(candidate => candidate.Attributes["class"] != null)
                .FirstOrDefault(candidate => candidate.Attributes["class"].Value
                    .Contains("article__text js-module js-mediator-article"));
            if (container == null)
                return false;

            var textHolder = GetDescendantByAttributes(container, "div", "class", "article__item_html");
            if (textHolder == null)
                return false;

            // Body text is the direct paragraph children, each followed by a single space
            var text = new StringBuilder();
            foreach (var paragraph in textHolder.ChildNodes)
            {
                if (paragraph.Name != "p")
                    continue;
                text.Append(paragraph.InnerText.Replace("&nbsp;", " "));
                text.Append(" ");
            }

            var pictureLoaded = false;
            var picture = GetDescendantByAttributes(container, "img", "class", "photo__pic");
            if (picture != null)
            {
                try
                {
                    var download = GetBinaryContent(picture.Attributes["src"].Value);
                    download.Wait();
                    var pictureBytes = download.Result;
                    if (pictureBytes != null)
                    {
                        article.Header.Enclosure = pictureBytes;
                        pictureLoaded = true;
                    }
                }
                catch (Exception failure)
                {
                    // A picture failure is recorded but does not fail the whole article
                    Error = new ErrorDescription
                    {
                        Level = ApplicationLevel.NewsParser,
                        Description = failure.Message
                    };
                }
            }

            article.Body = new BodyContainer
            {
                Body = text.ToString(),
                HasPicture = pictureLoaded
            };
            return true;
        }
示例#3
0
        /// <summary>
        /// Obtains article headers from RSS source
        /// </summary>
        /// <param name="source">Rss source</param>
        /// <param name="refresh">
        /// Flag indicating news should be refreshed: true loads the feed from source.Url,
        /// false returns the headers of the articles already held by the content storage
        /// </param>
        /// <returns>
        /// News headers. When loading fails part-way the headers collected so far are returned
        /// and the failure is stored in _errorDescription
        /// </returns>
        public List<NewsHeader> GetArticlesHeaders(RssSource source, bool refresh)
        {
            if (!refresh)
                return _contentStorage.GetArticlesBySource(source).Select(art => art.Header).ToList();

            var articleHeaders = new List<NewsHeader>();
            var rssFeed = new XmlDocument();

            try
            {
                rssFeed.Load(source.Url); // todo refactor this with HttpClient

                // Load logo if present and not loaded yet
                if (source.Logo == null)
                {
                    var logoNodes = rssFeed.SelectNodes("rss/channel/image");
                    if (logoNodes != null)
                    {
                        // Only the first image url found in the channel is used
                        var logoUrl = logoNodes.Cast<XmlNode>()
                            .Select(imageNode => imageNode.SelectSingleNode("url"))
                            .Where(urlNode => urlNode != null)
                            .Select(urlNode => urlNode.InnerText)
                            .FirstOrDefault();
                        if (logoUrl != null)
                        {
                            // Urls without "http" get the scheme prepended
                            var url = logoUrl.Contains("http") ? logoUrl : "http:" + logoUrl;
                            var logo = DownloadContentBytes(url);
                            if (logo != null)
                                source.Logo = logo;
                        }
                    }
                }

                var rssNodes = rssFeed.SelectNodes("rss/channel/item");
                if (rssNodes != null)
                {
                    foreach (XmlNode rssNode in rssNodes)
                    {
                        var rssSubNode = rssNode.SelectSingleNode("link");
                        var link = rssSubNode != null ? rssSubNode.InnerText : "";

                        // Articles already known to the storage are reused as they are
                        var task = _contentStorage.GetArticleByUrl(link);
                        task.Wait();

                        if (task.Result != null)
                        {
                            articleHeaders.Add(task.Result.Header);
                            continue;
                        }

                        rssSubNode = rssNode.SelectSingleNode("title");
                        var title = rssSubNode != null ? rssSubNode.InnerText.Replace("&mdash;", "-") : "";

                        rssSubNode = rssNode.SelectSingleNode("description");
                        var description = rssSubNode != null ? rssSubNode.InnerText.StripTagsRegex() : "";
                        if (description.Length > 150)
                            description = description.Substring(0, 147) + "..."; // trim too long description for pop-over

                        rssSubNode = rssNode.SelectSingleNode("pubDate");
                        var date = rssSubNode != null ? rssSubNode.InnerText : "";

                        // An enclosure element without a url attribute is treated as "no enclosure"
                        rssSubNode = rssNode.SelectSingleNode("enclosure");
                        var enclosureUrl = rssSubNode != null && rssSubNode.Attributes != null
                            ? rssSubNode.Attributes["url"]
                            : null;
                        var enclosure = enclosureUrl != null ? enclosureUrl.Value : "";

                        // Downloaded synchronously so the bytes are available before the header is built;
                        // the former nested ContinueWith was not awaited and usually left this null
                        byte[] enclosured = null;
                        if (enclosure != "")
                            enclosured = DownloadContentBytes(enclosure);

                        var newsHeader = new NewsHeader
                        {
                            Description = description,
                            Link = link,
                            Title = title,
                            Guid = Guid.NewGuid(),
                            PublishDate = DateTime.Parse(date),
                            Enclosure = enclosured,
                            Source = source.SiteName,
                            HasLogo = source.Logo != null,
                            HasEnclosure = enclosured != null
                        };
                        articleHeaders.Add(newsHeader);
                        _contentStorage.SaveArticle(new ArticleContainer
                        {
                            Guid = newsHeader.Guid,
                            RssSource = source,
                            Header = newsHeader
                        });
                    }
                }
            }
            catch (Exception ex)
            {
                _errorDescription = new ErrorDescription
                {
                    Level = ApplicationLevel.RssParser,
                    Description = ex.Message
                };
            }

            return articleHeaders;
        }

        /// <summary>
        /// Requests the given url and blocks until the whole response body has been read
        /// </summary>
        /// <param name="url">Address to download</param>
        /// <returns>
        /// Downloaded bytes; null when the request failed, the response content is not a
        /// StreamContent, or reading threw (the failure is stored in _errorDescription)
        /// </returns>
        private byte[] DownloadContentBytes(string url)
        {
            try
            {
                // GetContent returns null when the request itself failed
                using (var response = GetContent(url).Result)
                {
                    if (response == null)
                        return null;

                    // NOTE(review): only StreamContent is read, as before - confirm whether
                    // other HttpContent types should be accepted as well
                    var content = response.Content as StreamContent;
                    if (content == null)
                        return null;

                    return content.ReadAsByteArrayAsync().Result;
                }
            }
            catch (Exception ex)
            {
                // A failed download must not abort processing of the remaining feed items
                _errorDescription = new ErrorDescription
                {
                    Level = ApplicationLevel.RssParser,
                    Description = ex.Message
                };
                return null;
            }
        }
示例#4
0
        /// <summary>
        /// Downloads the page referenced by the article header and fills in the article body
        /// </summary>
        /// <param name="article">Article container; the page address is read from its header link</param>
        /// <returns>True when the news body block was found and the body was stored, false otherwise</returns>
        public override bool ParseArticle(ArticleContainer article)
        {
            var pageTask = GetArticle(article.Header.Link);
            pageTask.Wait();
            var page = pageTask.Result;
            if (page == null)
                return false;

            // First div whose class attribute contains "news_body"
            var container = page.DocumentNode.Descendants()
                .Where(candidate => candidate.Name == "div")
                .Where(candidate => candidate.Attributes["class"] != null)
                .FirstOrDefault(candidate => candidate.Attributes["class"].Value.Contains("news_body"));
            if (container == null)
                return false;

            // Body text is the direct paragraph children, each followed by a single space
            var text = new StringBuilder();
            foreach (var paragraph in container.ChildNodes)
            {
                if (paragraph.Name != "p")
                    continue;
                text.Append(paragraph.InnerText);
                text.Append(" ");
            }

            // The main picture is looked up in the whole document, not only inside the body block
            var picture = page.DocumentNode.Descendants()
                .Where(candidate => candidate.Name == "img")
                .Where(candidate => candidate.Attributes["class"] != null)
                .FirstOrDefault(candidate => candidate.Attributes["class"].Value.Contains("main_image"));

            var pictureLoaded = false;
            if (picture != null)
            {
                try
                {
                    var download = GetBinaryContent(picture.Attributes["src"].Value);
                    download.Wait();
                    var pictureBytes = download.Result;
                    if (pictureBytes != null)
                    {
                        article.Header.Enclosure = pictureBytes;
                        pictureLoaded = true;
                    }
                }
                catch (Exception failure)
                {
                    // A picture failure is recorded but does not fail the whole article
                    Error = new ErrorDescription
                    {
                        Level = ApplicationLevel.NewsParser,
                        Description = failure.Message
                    };
                }
            }

            article.Body = new BodyContainer
            {
                Body = text.ToString(),
                HasPicture = pictureLoaded
            };
            return true;
        }
示例#5
0
 /// <summary>
 /// Issues a GET request for the given address
 /// </summary>
 /// <param name="url">Address to request</param>
 /// <returns>Response message, or null when the request threw (the failure is stored in _errorDescription)</returns>
 private async Task<HttpResponseMessage> GetContent(string url)
 {
     HttpResponseMessage reply = null;
     try
     {
         reply = await _httpClient.GetAsync(url);
     }
     catch (Exception failure)
     {
         // Record the failure; the caller sees it as a null response
         _errorDescription = new ErrorDescription
         {
             Level = ApplicationLevel.RssParser,
             Description = failure.Message
         };
     }
     return reply;
 }