// Checks RegPattern.ReplaceHtml against fixed HTML snippets, first with the
// last argument set to false (expected output keeps the text between the tags)
// and then with it set to true (expected output drops the whole element).
// NOTE(review): whitespace inside the literals is significant to the
// comparisons -- confirm it matches the intended fixtures.
public void testReplaceHtml() {

    // ---- flag = false: <font> markup is expected to go, its text stays ----
    String fontSource = @"<p> Hello. <font size=""2"">I am size 2</font> <font color=""red"">and I am red</font> </p>";
    String fontExpected = @"<p> Hello. I am size 2 and I am red </p>";
    String fontActual = RegPattern.ReplaceHtml(fontSource, "font", false);
    Assert.AreEqual(fontExpected, fontActual);

    // <span> with a style attribute
    String spanSource = @"<p><span style=""color:red;"">kkk</span> aaaaaaaa</p>";
    String spanExpected = @"<p>kkk aaaaaaaa</p>";
    String spanActual = RegPattern.ReplaceHtml(spanSource, "span", false);
    Assert.AreEqual(spanExpected, spanActual);

    // <a> with an href attribute
    String anchorSource = @"<p>zzz <a href=""ccc.html"">aaa name</a> pppp</p>";
    String anchorExpected = @"<p>zzz aaa name pppp</p>";
    String anchorActual = RegPattern.ReplaceHtml(anchorSource, "a", false);
    Assert.AreEqual(anchorExpected, anchorActual);

    // ---- flag = true: the element and its content are expected to go ----

    // <script> and <style> are removed in two consecutive passes; the
    // <font> elements are expected to survive untouched.
    String scriptStyleSource = @"<p> <script> alert('sss'); </script> <style> p {color:red;} </style> Hello. <font size=""2"">I am size 2</font> <font color=""red"">and I am red</font> </p>";
    String scriptStyleExpected = @"<p> Hello. <font size=""2"">I am size 2</font> <font color=""red"">and I am red</font> </p>";
    String withoutScript = RegPattern.ReplaceHtml(scriptStyleSource, "script", true);
    String withoutScriptAndStyle = RegPattern.ReplaceHtml(withoutScript, "style", true);
    Assert.AreEqual(scriptStyleExpected, withoutScriptAndStyle);

    // Self-closing and unclosed <img>, lower- and upper-case spellings
    String imgSource = @" <p> pic1 <img src=""eeeeeeee.jpg"" /> </p> pic2<IMG src=""xxxxxx.gif"" > ";
    String imgExpected = @" <p> pic1 </p> pic2 ";
    String imgActual = RegPattern.ReplaceHtml(imgSource, "img", true);
    Assert.AreEqual(imgExpected, imgActual);

    // <br/>, <BR> and <br /> variants
    String brSource = @" <p> pic1 <br/> </p> pic2<BR> <div><br /></div> <br /> ";
    String brExpected = @" <p> pic1 </p> pic2 <div></div> ";
    String brActual = RegPattern.ReplaceHtml(brSource, "br", true);
    Assert.AreEqual(brExpected, brActual);
}
// Removes from `input` the HTML tags listed in spiderTemplate.DetailClearTag
// (a comma-separated list of tag names).
//
//   input          - the page HTML to clean.
//   spiderTemplate - supplies DetailClearTag; when that is null/empty the
//                    input is returned unchanged.
//
// Returns the cleaned HTML. font/span/a are handled with
// RegPattern.ReplaceHtml(..., false) (strip only the tag, keep its content);
// every other tag is handled with RegPattern.ReplaceHtml(..., true) (strip the
// tag together with its content). The content-removing pass runs first.
private string filterPage(string input, SpiderTemplate spiderTemplate) {

    if (strUtil.IsNullOrEmpty(spiderTemplate.DetailClearTag)) {
        return input;
    }

    // Tag names are identifiers, not linguistic text, so lower-case them with
    // the invariant culture. RemoveEmptyEntries drops the blanks produced by
    // input such as "a,,b" or a trailing comma.
    String[] arrTag = spiderTemplate.DetailClearTag
        .ToLowerInvariant()
        .Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);

    if (arrTag.Length == 0) {
        return input;
    }

    List<String> rTag = new List<String>();

    logger.Info("filterTag, input=" + input);

    // Pass 1: strip tags along with their inner content.
    foreach (String rawTag in arrTag) {

        // Tolerate "font, span" style lists; skip entries that were only whitespace.
        String tag = rawTag.Trim();
        if (tag.Length == 0) {
            continue;
        }

        // font/span/a: strip only the tag, not its content -- defer to pass 2.
        // Every other tag is stripped together with its content.
        if (tag == "font" || tag == "span" || tag == "a") {
            rTag.Add(tag);
            continue;
        }

        logger.Info("tag=" + tag);
        input = RegPattern.ReplaceHtml(input, tag, true);
    }

    logger.Info("filterTag, clear tag1=" + input);

    // Pass 2: strip only the tags, keep their content.
    foreach (String tag in rTag) {
        logger.Info("tag=" + tag);
        input = RegPattern.ReplaceHtml(input, tag, false);
    }

    logger.Info("filterTag, clear tag2=" + input);

    return input;
}