/// <summary>
/// Extracts the author name from a raw "media" field, using rules summarised from the TOP20 sites.
/// </summary>
/// <remarks>
/// The text is first passed through <c>TextCleaner.FullClean</c>. Everything up to the "作者" label
/// (plus the single separator character that follows it) is dropped, and anything from a
/// "来源" or "发布时间" label onwards is cut off.
/// </remarks>
/// <param name="Author">Raw field text that contains the author.</param>
/// <returns>The extracted author, or <c>null</c> when nothing but white space is left.</returns>
public static string CleanAuthor(string Author)
{
    Author = TextCleaner.FullClean(Author);
    if (string.IsNullOrEmpty(Author))
    {
        return null;
    }

    int labelIndex = Author.IndexOf("作者", StringComparison.Ordinal);
    if (labelIndex >= 0)
    {
        // Skip the two-character label and one separator character. Clamp to the string
        // length so a label at the very end of the text yields an empty result instead of
        // an ArgumentOutOfRangeException from Substring.
        int start = labelIndex + 3;
        if (start > Author.Length)
        {
            start = Author.Length;
        }
        Author = Author.Substring(start);
    }

    // Cut off the "source" part that may follow the author.
    int sourceIndex = Author.IndexOf("来源", StringComparison.Ordinal);
    if (sourceIndex >= 0)
    {
        Author = Author.Substring(0, sourceIndex);
    }

    // Cut off the "publish time" part that may follow the author.
    int publishTimeIndex = Author.IndexOf("发布时间", StringComparison.Ordinal);
    if (publishTimeIndex >= 0)
    {
        Author = Author.Substring(0, publishTimeIndex);
    }

    return string.IsNullOrWhiteSpace(Author) ? null : Author;
}
/// <summary>
/// Extracts the reprinting media name from a raw "media" field, using rules summarised from the TOP20 sites.
/// </summary>
/// <remarks>
/// The text is first passed through <c>TextCleaner.FullClean</c>. Everything up to the "来源" label
/// (plus the single separator character that follows it) is dropped, leading spaces are removed,
/// the value is cut at the first remaining space, and closing parentheses are stripped.
/// </remarks>
/// <param name="MediaName">Raw field text that contains the media name.</param>
/// <returns>The extracted media name, or <c>null</c> when nothing but white space is left.</returns>
public static string CleanMediaName(string MediaName)
{
    MediaName = TextCleaner.FullClean(MediaName);
    if (string.IsNullOrEmpty(MediaName))
    {
        return null;
    }

    int labelIndex = MediaName.IndexOf("来源", StringComparison.Ordinal);
    if (labelIndex >= 0)
    {
        // Skip the two-character label and one separator character. Clamp to the string
        // length so a label at the very end of the text yields an empty result instead of
        // an ArgumentOutOfRangeException from Substring.
        int start = labelIndex + 3;
        if (start > MediaName.Length)
        {
            start = MediaName.Length;
        }
        MediaName = MediaName.Substring(start);
    }

    // Drop leading spaces, then keep only the first space-delimited token.
    MediaName = MediaName.TrimStart(' ');
    int spaceIndex = MediaName.IndexOf(' ');
    if (spaceIndex >= 0)
    {
        MediaName = MediaName.Substring(0, spaceIndex);
    }

    // Strip closing parentheses, both ASCII and full-width (U+FF09). The original code
    // removed the ASCII one twice, which left the full-width variant in place; the escape
    // keeps the literal safe from character-width normalisation.
    MediaName = MediaName.Replace(")", "").Replace("\uFF09", "");

    return string.IsNullOrWhiteSpace(MediaName) ? null : MediaName;
}
/// <summary>
/// Computes a weighted text length in which characters matched by the half-width pattern
/// (ASCII letters, digits, space, <c>+ - * \ /</c>) count as half a character.
/// </summary>
/// <remarks>
/// The input is HTML-decoded and all white space is removed before counting. The half-width
/// total is halved with integer division, so an odd count rounds down.
/// </remarks>
/// <param name="str">Text to measure.</param>
/// <param name="StopWords">Optional stop words; when supplied they are removed via <c>TextCleaner.RemoveStopWords</c> before counting.</param>
/// <returns>The weighted length, or 0 for null, empty or white-space-only input.</returns>
public static int GetWeightedLength(string str, IEnumerable<string> StopWords = null)
{
    if (string.IsNullOrWhiteSpace(str))
    {
        return 0;
    }

    string text = System.Web.HttpUtility.HtmlDecode(str);
    text = Regex.Replace(text, @"\s*", String.Empty);

    // Stop-word removal.
    if (StopWords != null)
    {
        text = TextCleaner.RemoveStopWords(text, StopWords);
    }

    // Sum the lengths of every run of half-width characters.
    int halfWidthCount = 0;
    for (Match run = Regex.Match(text, @"[a-zA-Z0-9\ \+\-\*\\/]+"); run.Success; run = run.NextMatch())
    {
        halfWidthCount += run.Value.Length;
    }

    int fullWidthCount = text.Length - halfWidthCount;
    return fullWidthCount + halfWidthCount / 2;
}
/// <summary>
/// Reads a named group from a regex match, falling back to a default when the group did not participate.
/// </summary>
/// <param name="m">The regex match to read from.</param>
/// <param name="MatchGroupName">Name of the capture group.</param>
/// <param name="DefaultValue">Value returned when the group was not matched.</param>
/// <param name="Clean">When <c>true</c>, the captured text is passed through <c>TextCleaner.FullClean</c> before being returned.</param>
/// <returns>The (optionally cleaned) captured text, or <paramref name="DefaultValue"/>.</returns>
public static string TryGetString(Match m, string MatchGroupName, string DefaultValue, bool Clean = true)
{
    Group captured = m.Groups[MatchGroupName];
    if (!captured.Success)
    {
        return DefaultValue;
    }

    if (!Clean)
    {
        return captured.Value;
    }

    return TextCleaner.FullClean(captured.Value);
}
/// <summary>
/// Cleans article content using broadly applicable rules summarised from the TOP20 sites.
/// </summary>
/// <remarks>
/// Each node's inner HTML is run through <c>HtmlFormatter.FormatHtml</c>,
/// <c>CleanContent_CleanEditor</c> and <c>CleanContent_CleanA</c>; the results are concatenated
/// in node order.
/// </remarks>
/// <param name="nodes">Nodes selected by the item-content XPath. May be <c>null</c> or empty, in which case an empty string is returned.</param>
/// <param name="Url">URL of the article, forwarded to <c>FormatHtml</c>.</param>
/// <param name="Format">When <c>false</c>, each formatted fragment is additionally passed through <c>TextCleaner.FullClean</c>.</param>
/// <returns>The concatenated cleaned content; never <c>null</c>.</returns>
public static string CleanContent(HtmlNodeCollection nodes, string Url, bool Format = true)
{
    // An XPath query that selects nothing yields a null collection; treat that as "no content"
    // instead of failing with a NullReferenceException in the loop below.
    if (nodes == null || nodes.Count == 0)
    {
        return string.Empty;
    }

    System.Text.StringBuilder content = new System.Text.StringBuilder();
    foreach (HtmlNode cnode in nodes)
    {
        string fragment = HtmlFormatter.FormatHtml(cnode.InnerHtml, Url);
        fragment = CleanContent_CleanEditor(fragment);
        fragment = CleanContent_CleanA(fragment);
        if (!Format)
        {
            fragment = TextCleaner.FullClean(fragment);
        }

        content.Append(fragment);
    }

    return content.ToString();
}
/// <summary>
/// Full cleaning pipeline that tries to leave only meaningful characters.
/// </summary>
/// <remarks>
/// Steps, in order: optional HTML-comment removal, optional HTML cleaning, optional UBB cleaning,
/// nonsense and repetition cleaning, optional URL and e-mail removal, <c>CnChar2EnChar</c>
/// conversion and, when the text is detected as Traditional Chinese, conversion to Simplified Chinese.
/// </remarks>
/// <param name="DirtyText">Text or HTML to clean.</param>
/// <param name="CleanImg">Forwarded to <c>HTMLCleaner.CleanHTML</c>; per the original documentation, whether img tags are removed.</param>
/// <param name="CleanHtml">Whether <c>HTMLCleaner.CleanHTML</c> is applied. URL removal also runs only when this is <c>true</c>.</param>
/// <param name="RemainConnectionMark">Forwarded to <c>CleanNonsense</c>; per the original documentation, keeps connection characters such as "-" and "_".</param>
/// <param name="RemainURLEmail">When <c>true</c>, URLs and e-mail addresses are kept.</param>
/// <param name="cleanComment">Whether HTML comments (<c>&lt;!-- ... --&gt;</c>) are removed first.</param>
/// <param name="CleanUBB">Whether <c>TextCleaner.CleanUBB</c> is applied.</param>
/// <returns>The cleaned text; an empty string when the input is null or empty, or when the text becomes null or empty after the HTML or repetition cleaning steps.</returns>
public static string FullClean(string DirtyText, bool CleanImg = true, bool CleanHtml = true, bool RemainConnectionMark = false, bool RemainURLEmail = false, bool cleanComment = true, bool CleanUBB = true)
{
    if (string.IsNullOrEmpty(DirtyText))
    {
        return "";
    }

    string Cleaned = DirtyText;
    if (cleanComment)
    {
        Cleaned = Regex.Replace(Cleaned, @"<!--[\s\S]*?-->", String.Empty);
    }

    if (CleanHtml)
    {
        Cleaned = HTMLCleaner.CleanHTML(Cleaned, CleanImg);
    }

    if (string.IsNullOrEmpty(Cleaned))
    {
        return "";
    }

    if (CleanUBB)
    {
        Cleaned = TextCleaner.CleanUBB(Cleaned);
    }

    Cleaned = CleanNonsense(Cleaned, RemainConnectionMark);
    Cleaned = CleanRepetition(Cleaned);
    if (string.IsNullOrEmpty(Cleaned))
    {
        return "";
    }

    // URL removal used to ignore RemainURLEmail entirely, so a caller asking to keep URLs
    // still lost them. It stays tied to CleanHtml as before (callers that pass
    // CleanHtml = false keep their URLs) but now also honours RemainURLEmail.
    if (CleanHtml && !RemainURLEmail)
    {
        Cleaned = CleanURL(Cleaned);
    }

    if (!RemainURLEmail)
    {
        Cleaned = CleanEmail(Cleaned);
    }

    Cleaned = CnChar2EnChar(Cleaned);
    if (IsTraditionalChinese(Cleaned))
    {
        Cleaned = ToSimplifyString(Cleaned);
    }

    return Cleaned;
}
/// <summary>
/// Re-formats an HTML fragment so that it can be published.
/// </summary>
/// <remarks>
/// Work performed:
/// 0. Remove HTML tags that are not allowed.
/// 1. Paragraph clean-up: empty p elements are cleared and leading no-break-space entities are stripped.
/// 2. img clean-up: src is made absolute and the alt attribute is set.
/// 3. a clean-up: href is made absolute.
/// 4. Remove class and style attributes (via the configured property list).
/// 5. Tidy image styles.
/// 6. Tidy video (embed / iframe) attributes.
/// DOM manipulation is done first on a single parsed tree; plain-text replacements follow on
/// the serialised HTML, so the tree is built only once.
/// </remarks>
/// <param name="OriHtml">Original HTML fragment.</param>
/// <param name="Url">URL of the article; used as the base for resolving relative src and href values.</param>
/// <returns>The re-formatted HTML. The original string is returned when it is null or empty, cannot be parsed, or has no text content.</returns>
public static string FormatHtml(string OriHtml, string Url)
{
    if (string.IsNullOrEmpty(OriHtml))
    {
        return OriHtml;
    }

    #region Load the document
    // Load the HtmlDocument (the content is an HTML fragment, so loading may fail).
    HtmlNode oriHtmlNode = HtmlUtility.getSafeHtmlRootNode(OriHtml, true, true);
    if (oriHtmlNode == null || string.IsNullOrWhiteSpace(oriHtmlNode.InnerText))
    {
        return OriHtml;
    }

    oriHtmlNode = oriHtmlNode.SelectSingleNode("//body") ?? oriHtmlNode;
    #endregion Load the document

    #region Paragraph clean-up
    // The HTML no-break-space entity (ampersand followed by "nbsp;"). The ampersand is written
    // as an escape so that tooling which decodes HTML entities cannot corrupt this literal.
    const string NbspEntity = "\u0026nbsp;";

    HtmlNodeCollection PNodes = oriHtmlNode.SelectNodes("//p");
    if (PNodes != null && PNodes.Count > 0)
    {
        foreach (HtmlNode node in PNodes)
        {
            try
            {
                if (string.IsNullOrEmpty(TextCleaner.FullClean(node.InnerHtml, false, false)))
                {
                    // A paragraph without content; it may have been meant as a blank line.
                    node.RemoveAll();
                }
                else
                {
                    // Strip leading no-break-space entities (and the white space around them).
                    // The previous check compared against a single white-space character after
                    // TrimStart() while removing six characters, so it could never match and
                    // the entities were left in place. The follow-up loop that trimmed one
                    // more leading space character was unreachable for the same reason
                    // (TrimStart() already removes every white-space character) and is
                    // subsumed by the trimming done here.
                    string inner = node.InnerHtml.TrimStart();
                    bool stripped = false;
                    while (inner.StartsWith(NbspEntity, StringComparison.OrdinalIgnoreCase))
                    {
                        inner = inner.Substring(NbspEntity.Length).TrimStart();
                        stripped = true;
                    }

                    // Only touch the DOM when something was actually removed.
                    if (stripped)
                    {
                        node.InnerHtml = inner;
                    }
                }
            }
            catch (Exception ex)
            {
                Logger.Warn(string.Format("调整内容中p标签时出错:{0},Url={1},P={2}", ex.Message, Url, node.OuterHtml));
            }
        }
    }
    #endregion Paragraph clean-up

    #region Remove tags without content
    foreach (string RemoveNullTag in HtmlRemoveNullTags.Split())
    {
        HtmlNodeCollection NullTags = oriHtmlNode.SelectNodes("//" + RemoveNullTag);
        if (NullTags != null && NullTags.Count > 0)
        {
            foreach (HtmlNode node in NullTags)
            {
                try
                {
                    // Remove the element when nothing meaningful is left after cleaning
                    // (the second argument is the CleanImg flag, passed as false).
                    if (string.IsNullOrWhiteSpace(TextCleaner.FullClean(node.InnerHtml, false)))
                    {
                        node.ParentNode.RemoveChild(node);
                    }
                }
                catch (Exception ex)
                {
                    Logger.Warn(string.Format("调整内容中标签时出错:{0},Url={1},P={2}", ex.Message, Url, node.OuterHtml));
                }
            }
        }
    }
    #endregion Remove tags without content

    #region Img clean-up
    HtmlNodeCollection ImgNodes = oriHtmlNode.SelectNodes("//img");
    if (ImgNodes != null && ImgNodes.Count > 0)
    {
        foreach (HtmlNode node in ImgNodes)
        {
            // Replace the alt attribute.
            try
            {
                if (node.Attributes["alt"] == null)
                {
                    node.Attributes.Append("alt", HtmlImgAltText);
                }
                else
                {
                    node.SetAttributeValue("alt", HtmlImgAltText);
                }
            }
            catch (Exception ex)
            {
                Logger.Error(string.Format("替换内容中Img标签的alt属性时出错, Img={0}", node.OuterHtml), ex);
            }

            // If real_src is present it replaces src (special handling for Sina Finance);
            // otherwise data-src is used when present (wechat).
            if (node.Attributes["real_src"] != null && !string.IsNullOrEmpty(node.Attributes["real_src"].Value))
            {
                if (node.Attributes["src"] == null)
                {
                    node.Attributes.Append("src", node.Attributes["real_src"].Value);
                }
                else
                {
                    node.SetAttributeValue("src", node.Attributes["real_src"].Value);
                }
            }
            else if (node.Attributes["data-src"] != null && !string.IsNullOrEmpty(node.Attributes["data-src"].Value))
            {
                if (node.Attributes["src"] == null)
                {
                    node.Attributes.Append("src", node.Attributes["data-src"].Value);
                }
                else
                {
                    node.SetAttributeValue("src", node.Attributes["data-src"].Value);
                }
            }

            // Make src absolute.
            if (node.Attributes["src"] == null || string.IsNullOrEmpty(node.Attributes["src"].Value))
            {
                // No usable source: strip the node down to a bare img tag, which is removed
                // from the serialised HTML at the end of this method.
                node.RemoveAll();
            }
            else
            {
                try
                {
                    if (!node.Attributes["src"].Value.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
                        !node.Attributes["src"].Value.StartsWith("file://", StringComparison.OrdinalIgnoreCase) &&
                        !node.Attributes["src"].Value.StartsWith("data:", StringComparison.OrdinalIgnoreCase) &&
                        !node.Attributes["src"].Value.StartsWith("https://", StringComparison.OrdinalIgnoreCase) &&
                        !node.Attributes["src"].Value.StartsWith("//", StringComparison.OrdinalIgnoreCase))
                    {
                        node.Attributes["src"].Value = new Uri(new Uri(Url), node.Attributes["src"].Value).AbsoluteUri;
                    }

                    // Crude heuristic: any image whose path or query contains "icon" is
                    // treated as an emoticon and removed.
                    if (new Uri(node.Attributes["src"].Value).PathAndQuery.Contains("icon"))
                    {
                        node.RemoveAll();
                    }
                    else
                    {
                        // Remove images named after the major social sharing platforms
                        // (wechat, weibo, qq, qzone, sina, renren, kaixin, baidu, tieba, fetion,
                        // fbook, facebook, twitter, linkedin, sohu).
                        string chatImage = @"(wechat\.|weibo\.|qq\.|qzone\.|sina\.|renren\.|kaixin\.|baidu\.|tieba\.|fetion\.|fbook\.|facebook\.|twitter\.|linkedin\.|sohu\.)";
                        if (Regex.IsMatch(node.Attributes["src"].Value, chatImage, RegexOptions.IgnoreCase))
                        {
                            node.RemoveAll();
                        }
                    }
                }
                catch (Exception ex)
                {
                    Logger.Error(string.Format("FormatHtml替换内容中Img标签的src属性时出错, Url={0} Img={1}\n", Url, node.OuterHtml), ex);
                }
            }
        }
    }
    #endregion Img clean-up

    #region Video tag handling
    // Common video sites (list to be completed): Youku, iQiyi, Tudou, Sohu Video, Xunlei Kankan,
    // Ifeng Video, Tencent Video, Sina Video, 56.com, CNTV, Ku6, Baofeng, LeTV, PPS, Funshion, PPTV,
    // Baidu Video, Tangdou, Mango TV, Joy.cn, V1, Baomihua, Wasu TV, Aipai, Baidu Player,
    // Panda Channel, YY Live, Boosj, AcFun, Bilibili.
    // For the known video formats only src, type, allowscriptaccess, allowfullscreen and wmode are kept.
    HtmlNodeCollection embedNodes = oriHtmlNode.SelectNodes("//embed");
    if (embedNodes != null && embedNodes.Count > 0)
    {
        foreach (HtmlNode node in embedNodes)
        {
            try
            {
                // Remember the values of src and type.
                string tempsrc = "", temptype = "";
                if (node.Attributes["src"] != null)
                {
                    tempsrc = node.Attributes["src"].Value;
                }

                // Leave embeds that do not point at a known video format untouched.
                if (!Regex.IsMatch(tempsrc, @"\.(avi|rmvb|rm|mkv|mp4|3gp|flv|swf)", RegexOptions.IgnoreCase))
                {
                    continue;
                }

                if (node.Attributes["type"] != null)
                {
                    temptype = node.Attributes["type"].Value;
                }

                // Drop every other attribute.
                node.Attributes.RemoveAll();
                if (!string.IsNullOrEmpty(tempsrc))
                {
                    node.Attributes.Append("src", tempsrc);
                }

                if (!string.IsNullOrEmpty(temptype))
                {
                    node.Attributes.Append("type", temptype);
                }

                // "always" is an allowscriptaccess value; it used to be emitted under the
                // "width" key, which produced an invalid width and no allowscriptaccess.
                // NOTE(review): this restores the attribute named in the comment above;
                // confirm that script access is wanted for embedded players.
                node.Attributes.Append("allowscriptaccess", "always");
                node.Attributes.Append("allowfullscreen", "true");
                node.Attributes.Append("wmode", "opaque");
            }
            catch (Exception ex)
            {
                Logger.Error(string.Format("替换视频标签属性时出错, film={0}", node.OuterHtml), ex);
            }
        }
    }

    // For iframe videos the original height and width are dropped.
    HtmlNodeCollection IframeNodes = oriHtmlNode.SelectNodes("//iframe");
    if (IframeNodes != null && IframeNodes.Count > 0)
    {
        foreach (HtmlNode node in IframeNodes)
        {
            try
            {
                // Remember the value of src.
                string tempsrc = "";
                if (node.Attributes["src"] != null)
                {
                    tempsrc = node.Attributes["src"].Value;
                }

                // Leave iframes that do not point at a known video format untouched.
                if (!Regex.IsMatch(tempsrc, @"\.(avi|rmvb|rm|mkv|mp4|3gp|flv|swf)", RegexOptions.IgnoreCase))
                {
                    continue;
                }

                // Drop every other attribute.
                node.Attributes.RemoveAll();
                if (!string.IsNullOrEmpty(tempsrc))
                {
                    node.Attributes.Append("src", tempsrc);
                }

                // "100%" is a width value; it used to be emitted under the "allowscriptaccess"
                // key, which left the iframe without any width.
                node.Attributes.Append("width", "100%");
                node.Attributes.Append("allowfullscreen", "true");
                node.Attributes.Append("frameborder", "0");
            }
            catch (Exception ex)
            {
                Logger.Error(string.Format("替换视频标签属性时出错, film={0}", node.OuterHtml), ex);
            }
        }
    }
    #endregion Video tag handling

    #region Anchor clean-up
    HtmlNodeCollection ANodes = oriHtmlNode.SelectNodes("//a[@href]");
    if (ANodes != null && ANodes.Count > 0)
    {
        foreach (HtmlNode node in ANodes)
        {
            try
            {
                string href = HTMLCleaner.CleanUrl(node.Attributes["href"].Value);
                if (!string.IsNullOrEmpty(href) && !node.Attributes["href"].Value.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                {
                    // Relative link: resolve it against the article URL.
                    node.Attributes["href"].Value = new Uri(new Uri(Url), node.Attributes["href"].Value).AbsoluteUri;
                }
                else
                {
                    node.Attributes["href"].Value = href;
                }
            }
            catch (Exception ex)
            {
                Logger.Error(string.Format("FormatHtml替换内容中a标签的href属性时出错, href={0}", node.Attributes["href"].Value), ex);
            }
        }
    }
    #endregion Anchor clean-up

    string outHtml = oriHtmlNode.InnerHtml;

    // From here on only plain-text replacements are performed.
    #region Remove disallowed HTML tags
    foreach (string RemoveTag in HtmlRemoveTags_RemoveContent.Split())
    {
        outHtml = HTMLCleaner.StripHtmlTag(outHtml, RemoveTag, true);
    }

    foreach (string RemoveTag in HtmlRemoveTags_RemainContent.Split())
    {
        outHtml = HTMLCleaner.StripHtmlTag(outHtml, RemoveTag, false);
    }

    foreach (string RemoveTag in HtmlRemoveProperty.Split())
    {
        outHtml = HTMLCleaner.StripHtmlProperty(outHtml, RemoveTag);
    }
    #endregion Remove disallowed HTML tags

    #region Miscellaneous cleaning
    outHtml = Regex.Replace(outHtml, @"\n|\r", string.Empty, RegexOptions.None);
    outHtml = Regex.Replace(outHtml, @"\t", " ", RegexOptions.None);
    outHtml = Regex.Replace(outHtml, @"\s*onload=(""|')?\S*(""|')?\s*", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Multiline);
    outHtml = Regex.Replace(outHtml, @"\s*onclick=(""|')?\S*(""|')?\s*", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Multiline);
    outHtml = Regex.Replace(outHtml, @"\s*onmouse\S*=(""|')?\S*(""|')?\s*", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Multiline);
    outHtml = HTMLCleaner.CleanSpaces(outHtml);
    #endregion Miscellaneous cleaning

    // Remove empty bracket pairs: <>, CJK title marks, full-width and ASCII parentheses, {}, [] and "".
    outHtml = Regex.Replace(outHtml, @"<\s*?>", "", RegexOptions.None);
    outHtml = Regex.Replace(outHtml, @"《\s*?》", "", RegexOptions.None);
    // Full-width parentheses (U+FF08 / U+FF09). The previous pattern used unescaped ASCII
    // parentheses, i.e. a capture group that only ever matched the empty string.
    outHtml = Regex.Replace(outHtml, @"\uFF08\s*?\uFF09", "", RegexOptions.None);
    outHtml = Regex.Replace(outHtml, @"\(\s*?\)", "", RegexOptions.None);
    outHtml = Regex.Replace(outHtml, @"{\s*?}", "", RegexOptions.None);
    outHtml = Regex.Replace(outHtml, @"\[\s*?\]", "", RegexOptions.None);
    outHtml = Regex.Replace(outHtml, "\"\\s*?\"", "", RegexOptions.None);

    // Remove img tags that were emptied above.
    outHtml = outHtml.Replace("<img>", "").Replace("<img/>", "");
    return outHtml;
}