/// <summary>
/// Stores a received HTML page as an article and, when the save succeeds,
/// submits it to the index manager.
/// </summary>
/// <remarks>
/// The text of the page's <c>&lt;title&gt;</c> element (empty when none is found)
/// is used as both the title and the summary. Exceptions raised while saving or
/// indexing are logged and not rethrown.
/// </remarks>
/// <param name="args">Event data; its <c>Url</c> and <c>Html</c> members are read.</param>
private static void SaveHtmlEvent(DataReceivedEventArgs args)
{
    // Pull the text between <title ...> and </title>, case-insensitively.
    Regex titleRegex = new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
    Match titleMatch = titleRegex.Match(args.Html);
    string title = titleMatch.Success
        ? titleMatch.Groups["title"].Value.Trim()
        : String.Empty;

    // Replace the scheme and host of matching absolute http URLs
    // (one to three labels followed by com, com.cn, cn or net) with "/File".
    Regex urlRegex = new Regex(@"(?i)http://(\w+\.){1,3}(com(\.cn)?|cn|net)\b");
    string localizedHtml = urlRegex.Replace(args.Html, "/File");

    try
    {
        lock (syncRoot)
        {
            // Update the database.
            Articles article = new Articles();
            article.Id = PrimaryKeyGen.GuidToLongId();
            article.IsDelete = false;
            article.Url = args.Url;
            article.Title = title;
            article.Summary = title;
            article.Content = localizedHtml;
            // "HH" is the 24-hour clock; the former "hh" (12-hour, no AM/PM
            // designator) made morning and afternoon timestamps indistinguishable.
            article.AddTime = DateTime.Now.ToString("yyyyMMdd HH:mm:ss");

            bool saved = _IArticlesService.Add(article);
            if (saved)
            {
                // Update the index.
                IndexTask task = new IndexTask();
                task.TaskId = article.Id;
                task.Title = title;
                // Strip line feeds, spaces, tabs and carriage returns from the converted text.
                task.Content = HtmlConverts.ConvertHtml(localizedHtml)
                    .Replace("\n", "")
                    .Replace(" ", "")
                    .Replace("\t", "")
                    .Replace("\r", "");
                task.Summary = title;
                IndexManager.Instance.AddArticle(task);
            }
        }
    }
    catch (Exception ex)
    {
        // InnerException is null for most exceptions; fall back to the exception
        // itself so that logging cannot throw a NullReferenceException.
        Exception cause = ex.InnerException ?? ex;
        log.ErrorFormat("Url:{0};\r\n错误信息{1}", args.Url, cause.Message);
    }
}
/// <summary>
/// Passes the response returned by <c>HttpGet</c> for a fixed blog URL through
/// <c>HtmlConverts.ConvertHtml</c> and strips line feeds, spaces, tabs and
/// carriage returns from the converted text. The result is not asserted on.
/// </summary>
public void TestHtmlConverts()
{
    const string url = "http://blog.csdn.net/wangyi1e/article/details/29204987";

    string responseStr = HttpGet(url, "");
    string result = HtmlConverts.ConvertHtml(responseStr);

    // Remove each whitespace token in the same order as before: LF, space, tab, CR.
    foreach (string token in new[] { "\n", " ", "\t", "\r" })
    {
        result = result.Replace(token, "");
    }
}