예제 #1
0
        /// <summary>
        /// Fetches the HTML for <c>address.address</c> (via <c>GetWebResponse</c> and
        /// <c>GetHtmlString</c>), extracts the article with <c>Html2Article.GetArticle</c>
        /// and maps the result onto a new <see cref="Entities.Article"/>.
        /// </summary>
        /// <param name="address">
        /// Source descriptor; its <c>address</c>, <c>site</c> and <c>type</c> members are read.
        /// </param>
        /// <returns>
        /// A new article with a freshly generated ID, the extracted publish date, a
        /// cleaned-up title, the site/type copied from <paramref name="address"/>, and the
        /// tagged content passed through <c>StanSoft.UrlUtility.FixUrl</c>.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="address"/> is <c>null</c>.
        /// </exception>
        public Entities.Article GetArticle(Model.Address address)
        {
            if (address == null)
            {
                throw new ArgumentNullException("address");
            }

            string html = this.GetHtmlString(GetWebResponse(address.address));

            // Assign the extraction result directly; no placeholder instance is needed.
            StanSoft.Article article = Html2Article.GetArticle(html);

            Entities.Article myArticle = new Entities.Article();
            myArticle.ID = Guid.NewGuid();
            myArticle.Time = article.PublishDate;

            // Strip tabs and line breaks from the extracted title; otherwise the title
            // may not be visible once it is bound to a ListView.
            // A missing title is treated as empty so Regex.Replace does not throw on null.
            string clearFilter = @"[\t\n\r]";
            myArticle.Title = Regex.Replace(article.Title ?? string.Empty, clearFilter, "").Trim();

            myArticle.Site = address.site;
            myArticle.Type = address.type;
            myArticle.Content = StanSoft.UrlUtility.FixUrl(address.address, article.ContentWithTags);

            return myArticle;
        }
예제 #2
0
        /// <summary>
        /// Extracts the main article information from raw HTML text.
        /// </summary>
        /// <param name="html">Raw HTML text of the page.</param>
        /// <returns>
        /// An <see cref="Article"/> whose title, publish date, content and tagged content
        /// are filled in from the helpers called below.
        /// </returns>
        public static Article GetArticle(string html)
        {
            // Fewer than 10 line feeds: treat the input as minified HTML. The processing
            // is line based, so put a line feed after every '>' to make it workable.
            int lineFeedCount = html.Count(ch => ch == '\n');
            string source = lineFeedCount < 10 ? html.Replace(">", ">\n") : html;

            // Take the <body>...</body> section; stays empty when there is no match.
            Match bodyMatch = Regex.Match(source, @"(?is)<body.*?</body>");
            string body = bodyMatch.Success ? bodyMatch.Value : "";

            // Apply every (pattern, replacement) pair from Filters. According to the
            // original note these remove styles, scripts and other irrelevant tags.
            foreach (var filter in Filters)
            {
                body = Regex.Replace(body, filter[0], filter[1]);
            }

            // Normalise tags whose attributes are spread over several lines so that each
            // tag ends up on a single line, e.g.
            //   <a
            //    href='http://www.baidu.com'
            //    class='test'>
            // becomes
            //   <a href='http://www.baidu.com' class='test'>
            body = Regex.Replace(body, @"(<[^<>]+)\s*\n\s*", FormatTag);

            string plainText;
            string taggedText;
            GetContent(body, out plainText, out taggedText);

            // The title is looked up in the (possibly line-broken) full document,
            // the publish date in the filtered body.
            Article result = new Article();
            result.Title = GetTitle(source);
            result.PublishDate = GetPublishDate(body);
            result.Content = plainText;
            result.ContentWithTags = taggedText;
            return result;
        }