/// <summary>
/// Fetches the HTML for <c>address.address</c> (via <c>GetWebResponse</c> and
/// <c>GetHtmlString</c>), runs it through <c>Html2Article.GetArticle</c>, and maps
/// the extracted data onto a new <c>Entities.Article</c>.
/// </summary>
/// <param name="address">
/// Source descriptor; its <c>address</c>, <c>site</c> and <c>type</c> members are read.
/// </param>
/// <returns>
/// A new article with a freshly generated ID, a whitespace-cleaned title, the
/// extracted publish date, and the tagged content passed through
/// <c>StanSoft.UrlUtility.FixUrl</c>.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="address"/> is null.</exception>
public Entities.Article GetArticle(Model.Address address)
{
    if (address == null)
    {
        throw new ArgumentNullException("address");
    }

    string html = this.GetHtmlString(GetWebResponse(address.address));
    StanSoft.Article article = Html2Article.GetArticle(html);

    // Strip tabs, line feeds and carriage returns from the scraped title;
    // otherwise the title may not be visible once bound to a ListView.
    // A missing title is treated as empty so Regex.Replace does not throw.
    string clearFilter = @"[\t\n\r]";
    string title = Regex.Replace(article.Title ?? string.Empty, clearFilter, "").Trim();

    Entities.Article myArticle = new Entities.Article();
    myArticle.ID = Guid.NewGuid();
    myArticle.Time = article.PublishDate;
    myArticle.Title = title;
    myArticle.Site = address.site;
    myArticle.Type = address.type;
    // NOTE(review): FixUrl presumably rewrites URLs in the content relative to the
    // page address — confirm against StanSoft.UrlUtility.
    myArticle.Content = StanSoft.UrlUtility.FixUrl(address.address, article.ContentWithTags);
    return myArticle;
}
/// <summary>
/// Extracts the main article information from the given raw HTML text.
/// </summary>
/// <param name="html">Raw HTML source of the page.</param>
/// <returns>
/// An <c>Article</c> populated with the title, publish date, plain content and
/// content with tags.
/// </returns>
public static Article GetArticle(string html)
{
    // Fewer than 10 line breaks means the HTML is treated as minified.
    // The extraction works line by line, so a line break is inserted after
    // every tag terminator to give it lines to work with.
    int lineBreaks = html.Count(ch => ch == '\n');
    if (lineBreaks < 10)
    {
        html = html.Replace(">", ">\n");
    }

    // Keep only the <body>...</body> section; empty when the page has none.
    Match bodyMatch = Regex.Match(html, @"(?is)<body.*?</body>");
    string body = bodyMatch.Success ? bodyMatch.Value : "";

    // Apply each configured pattern/replacement pair to drop styles, scripts
    // and other irrelevant markup.
    foreach (var rule in Filters)
    {
        string pattern = rule[0];
        string replacement = rule[1];
        body = Regex.Replace(body, pattern, replacement);
    }

    // Normalise tags whose attributes are spread over several lines, e.g.
    //   <a
    //   href='http://www.baidu.com'
    //   class='test'
    // becomes
    //   <a href='http://www.baidu.com' class='test'>
    body = Regex.Replace(body, @"(<[^<>]+)\s*\n\s*", FormatTag);

    string plainContent;
    string taggedContent;
    GetContent(body, out plainContent, out taggedContent);

    // The title is read from the full document, the publish date from the
    // filtered body.
    Article result = new Article();
    result.Title = GetTitle(html);
    result.PublishDate = GetPublishDate(body);
    result.Content = plainContent;
    result.ContentWithTags = taggedContent;
    return result;
}