/// <summary>
/// Creates a tag builder that keeps the supplied writer, tag name and string builder,
/// and starts with an empty attribute dictionary.
/// </summary>
/// <param name="w">Writer stored on the builder.</param>
/// <param name="tagName">Tag name stored on the builder.</param>
/// <param name="sb">String builder stored on the builder.</param>
public TagBuilder(HtmlCleaner.HtmlWriter w, string tagName, StringBuilder sb)
{
    this.tagName = tagName;
    this.attributes = new Dictionary<string, string>();
    this.sb = sb;
    this.w = w;
}
/// <summary>
/// Builds a summary by cutting <paramref name="articleText"/> at the start of a paragraph match
/// and passing the leading portion through <c>HtmlCleaner.CloseTags</c>.
/// </summary>
/// <param name="articleText">Article markup to summarize.</param>
/// <param name="paragraphCount">
/// Number of <c>Regexes.P</c> matches to step past completely; the cut is placed at the start
/// of the match that follows them.
/// </param>
/// <param name="skipRule">When true, scanning starts right after the first <c>Regexes.Hr</c> match, if there is one.</param>
/// <returns>
/// The result of <c>HtmlCleaner.CloseTags</c> on the leading portion of the article; the whole
/// article is used when the computed cut position is 0.
/// </returns>
private static string ExtractSummary(string articleText, int paragraphCount, bool skipRule)
{
    // NOTE(review): assumes Regexes.Hr and Regexes.P are System.Text.RegularExpressions.Regex
    // instances, whose Match() never returns null and reports failure through Match.Success.
    int index = 0;

    // Skip past the first HR.
    if (skipRule)
    {
        var hrMatch = Regexes.Hr.Match(articleText);
        if (hrMatch.Success)
        {
            index += hrMatch.Index + hrMatch.Length;
        }
    }

    // Step over the leading paragraphs; the last iteration only moves to the start of its match.
    for (int i = 0; i <= paragraphCount; i++)
    {
        var pMatch = Regexes.P.Match(articleText.Substring(index));
        if (!pMatch.Success)
        {
            // No further paragraph: keep the cut position reached so far.
            break;
        }

        index += pMatch.Index;
        if (i < paragraphCount)
        {
            index += pMatch.Length;
        }
    }

    // A cut position of 0 means nothing was skipped, so the whole article is the summary.
    // A null article (only reachable when no matching is attempted) becomes an empty summary.
    string summary = ((index == 0) ? articleText : articleText.Substring(0, index)) ?? string.Empty;
    return HtmlCleaner.CloseTags(summary);
}
/// <summary>
/// Fetches the content of several chapters (nodes) in a single request and stores it
/// through <c>db.refreshNode</c>.
/// </summary>
/// <param name="nodeIdList">Ids of the nodes to fetch; sent as one comma-separated <c>nodeIds</c> value.</param>
/// <param name="downloadImage">
/// When true, every image referenced by a node's content is saved locally and the node's
/// offline content is generated.
/// </param>
/// <returns>
/// <c>true</c> when the nodes were fetched and stored; <c>false</c> when any exception was
/// thrown along the way (a message box is shown in that case).
/// </returns>
public static bool getNodeDetail(List<string> nodeIdList, bool downloadImage = false)
{
    try
    {
        string rawDetails = HttpWorker.HttpGet(Global.NodeDetailApi, "nodeIds=" + string.Join(",", nodeIdList));
        NodeDetailResponse response = TranslationWorker.ConvertStringToEntity<NodeDetailResponse>(rawDetails);
        List<Node> fetchedNodes = response.ConvertToNodes();

        if (downloadImage)
        {
            // Download the images locally and generate the offline content.
            foreach (Node current in fetchedNodes)
            {
                List<string> imageUrls = HtmlCleaner.GetImageUrl(current.content);
                foreach (string imageUrl in imageUrls)
                {
                    HttpWorker.SaveImg(imageUrl, current.lawId);
                }

                current.offlineContent = HtmlCleaner.ChangeImageUrlToLocalPath(current.content, current.lawId);
            }
        }

        db.refreshNode(fetchedNodes, detailOnly: true);
        return true;
    }
    catch (Exception)
    {
        // Every failure is reported to the user with the same message.
        MessageBox.Show("操作超时");
        return false;
    }
}
// Checks that Clean turns runs of consecutive line breaks into a single line break.
public void Clean_WhenCalledWithHtmlWithMultiLineBreaks_ReplacedBySingleLineBreaks()
{
    // Arrange
    const string expected = "<html>\r\n<head>\r\n</head>\r\n</html>";
    const string html = "<html>\r\n<head>\r\n\r\n</head>\r\n\r\n\r\n</html>";
    var sut = new HtmlCleaner();

    // Act
    string actual = sut.Clean(html);

    // Assert
    Assert.AreEqual(expected, actual);
}
// Checks that Clean leaves an "[if lt IE 9]" conditional comment block untouched.
public void Clean_WhenCalledWithHtmlIfBlocks_CommentIsNotRemoved()
{
    // Arrange
    const string html = "<html><head><!--[if lt IE 9]><link rel=\"stylesheet\" href=\"explorer.css\" type=\"text/css\" /><![endif]--></head></html>";
    var sut = new HtmlCleaner();

    // Act
    string actual = sut.Clean(html);

    // Assert: the input itself is the expected output.
    Assert.AreEqual(html, actual);
}
// Checks that StripWhitespace returns this single-spaced sentence unchanged.
public void StripWhitespace_TextWithoutWhitespace_TextIntact()
{
    // Arrange
    const string text = "some text without whitespace";

    // Act
    var stripped = HtmlCleaner.StripWhitespace(text);

    // Assert
    Assert.Equal(text, stripped);
}
// Checks that Clean removes a nested <p> tag and puts the text around and inside it
// on separate lines.
public void Clean_InnerTag_TagDeleted()
{
    // Arrange
    const string markup = "<div>Text before<p>Text in</p>Text after</div>";
    string expected = string.Join(Environment.NewLine, "Text before", "Text in", "Text after");

    // Act
    var actual = HtmlCleaner.Clean(markup);

    // Assert
    Assert.Equal(expected, actual);
}
// Checks that Clean removes the single spaces that separate adjacent tags.
public void Clean_WhenThereAreWhitespaceBetweenTags_SpacesAreRemoved()
{
    // Arrange
    const string expected = "<html><head></head></html>";
    const string html = "<html> <head> </head> </html>";
    var sut = new HtmlCleaner();

    // Act
    string actual = sut.Clean(html);

    // Assert
    Assert.AreEqual(expected, actual);
}
// Checks that StripTags removes paired and self-closing tags and keeps only the text.
public void StripTags_TextWithTags_TagsRemoved()
{
    // Arrange
    const string tagged = "<p>some</p><b>text</b><strong>with</strong><a href=\"#\">tags</a><br /><br/>";
    const string expected = "sometextwithtags";

    // Act
    var stripped = HtmlCleaner.StripTags(tagged);

    // Assert
    Assert.Equal(expected, stripped);
}
// Checks the output of RemoveDownloadLinks for a list item that contains an
// "fa fa-download" icon element.
public void RemoveDownloadLinks_PostWithDownloadLinks_LinksRemoved()
{
    // Arrange
    const string post = "<li><i class=\"fa fa-download\">test</i>test2</li>test3";
    const string expected = "<li>test3";

    // Act
    var cleaned = HtmlCleaner.RemoveDownloadLinks(post);

    // Assert
    Assert.Equal(expected, cleaned);
}
// Checks that RemoveHTMLContent returns an empty string for a post that consists
// only of an unordered list.
public void RemoveHtmlContent_PostWithList_ListRemoved()
{
    // Arrange
    const string post = "<ul>\n<li>\n<p>libGDX</p>\n</li>\n<li>\n<p>jMonkeyEngine</p>\n</li>\n<li>\n<p>lwjgl</p>\n</li>\n</ul>";

    // Act
    var remaining = HtmlCleaner.RemoveHTMLContent(post);

    // Assert
    Assert.Equal(string.Empty, remaining);
}
// Checks that RemoveHTMLContent drops both blockquote elements and keeps the text between them.
public void RemoveHtmlContent_PostWithQuotes_QuotesRemoved()
{
    // Arrange
    const string post = "<blockquote>cytat</blockquote>nie cytat<blockquote>cytat</blockquote>";
    const string expected = "nie cytat";

    // Act
    var remaining = HtmlCleaner.RemoveHTMLContent(post);

    // Assert
    Assert.Equal(expected, remaining);
}
// Checks that Clean removes the div tags together with the spaces just inside them.
public void Clean_TextWithDuplivatedSpaces_SpacesTrimmed()
{
    // Arrange
    const string markup = "<div> Text with many spaces </div>";
    const string expected = "Text with many spaces";

    // Act
    var actual = HtmlCleaner.Clean(markup);

    // Assert
    Assert.Equal(expected, actual);
}
// Checks that Clean removes a single wrapping <p> tag and returns its text.
public void Clean_SimpleTag_TagDeleted()
{
    // Arrange
    const string markup = "<p>Text in html</p>";
    const string expected = "Text in html";

    // Act
    var actual = HtmlCleaner.Clean(markup);

    // Assert
    Assert.Equal(expected, actual);
}
// Checks that Clean removes an ordinary HTML comment from the head element.
public void Clean_WhenCalledWithHtmlWithOneComment_CommentIsRemoved()
{
    // Arrange
    const string expected = "<html><head></head></html>";
    const string html = "<html><head><!-- kommentar --></head></html>";
    var sut = new HtmlCleaner();

    // Act
    string actual = sut.Clean(html);

    // Assert
    Assert.AreEqual(expected, actual);
}
// Checks the output of Clean for markup whose lines carry trailing spaces and
// space-only rows; the space that precedes the closing </html> tag is expected to remain.
public void Clean_WhenRowEndWithMultipleSpaces_SpacesAreRemoved()
{
    // Arrange
    const string expected = "<html>\r\n<head>\r\n</head>\r\n </html>";
    const string html = "<html> \r\n \r\n<head> \r\n</head>\r\n </html>";
    var sut = new HtmlCleaner();

    // Act
    string actual = sut.Clean(html);

    // Assert
    Assert.AreEqual(expected, actual);
}
// Checks that StripTags returns an empty string when the input holds only tags,
// including unbalanced opening and closing ones.
public void StripTags_TextWithTagsNotClosed_TagsRemoved()
{
    // Arrange
    const string tagsOnly = "<p></p></a></code></i><li><ul><ol></li><br /><br/>";

    // Act
    var stripped = HtmlCleaner.StripTags(tagsOnly);

    // Assert
    Assert.Equal(string.Empty, stripped);
}
// Checks that StripTags returns tag-free text unchanged.
public void StripTags_TextWithoutTags_TagsNotRemoved()
{
    // Arrange
    const string plainText = "some text without tags";

    // Act
    var stripped = HtmlCleaner.StripTags(plainText);

    // Assert
    Assert.Equal(plainText, stripped);
}
// Checks that StripWhitespace drops the embedded line feed and the trailing space.
public void StripWhitespace_TextWithWhitespace_WhitespaceRemoved()
{
    // Arrange
    const string text = "some text with \nwhitespace ";
    const string expected = "some text with whitespace";

    // Act
    var stripped = HtmlCleaner.StripWhitespace(text);

    // Assert
    Assert.Equal(expected, stripped);
}
// Checks that Clean turns the items of an unordered list into one line of text each.
public void Clean_UnorderedList_TextFormatted()
{
    // Arrange
    const string markup = "<ul><li>Coffee</li><li>Tea</li><li>Milk</li></ul>";
    string expected = string.Join(Environment.NewLine, "Coffee", "Tea", "Milk");

    // Act
    var actual = HtmlCleaner.Clean(markup);

    // Assert
    Assert.Equal(expected, actual);
}
// Checks that RemoveProperCode returns an empty string for a post made up solely of
// <code> and <pre><code> sections.
public void RemoveProperCode_PostWithProperTags_TagsRemoved()
{
    // Arrange
    const string post = "<code>test</code><pre><code>test2</pre></code><pre><code class=\"csharp\">test3</code></pre>";

    // Act
    var remaining = HtmlCleaner.RemoveProperCode(post);

    // Assert
    Assert.Equal(string.Empty, remaining);
}
// Checks that a cleaner built without any site-specific cleaners still returns the
// body text of the page.
public void CleanTestDefaultCleaner()
{
    // Arrange
    var noSiteCleaners = Enumerable.Empty<ISiteHtmlCleaner>();
    var sut = new HtmlCleaner(noSiteCleaners);

    // Act
    IArticle article = sut.Clean("url", "<body>Some text</body>");

    // Assert
    Assert.AreEqual("Some text", article.Text);
}
/// <summary>
/// Renders the base output into an in-memory writer, then hands that writer and the
/// real <paramref name="writer"/> to <c>HtmlCleaner.Render</c>.
/// </summary>
/// <param name="writer">Writer that receives the final output.</param>
protected override void Render(HtmlTextWriter writer)
{
    var buffer = new StringWriter();
    using (var bufferedWriter = new HtmlTextWriter(buffer))
    {
        base.Render(bufferedWriter);
        HtmlCleaner.Render(bufferedWriter, writer);
    }
}
/// <summary>
/// Console demo: wires up the article comparison pipeline and prints the similarity of
/// five article pairs, grouped under the headings "Similar articles:", "Same article:"
/// and "Different articles:", then waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
private static void Main(string[] args)
{
    // Article pairs, in the order they are compared.
    const string spyChiefsDaily = @"http://www.dailymail.co.uk/news/article-2489957/Britains-spy-chiefs-grilled-MPs-television-time.html";
    const string spyChiefsMirror = @"http://www.mirror.co.uk/news/uk-news/mi6-mi5-gchq-bosses-questioned-2685310";
    const string dancesDaily = @"http://www.dailymail.co.uk/news/article-2489640/80-parents-caught-children-copying-p**n-style-dances-offensive-lyrics.html";
    const string dancesMirror = @"http://www.mirror.co.uk/news/uk-news/miley-cyrus-twerking-kids-copying-2685363";
    const string recessionDaily = @"http://www.dailymail.co.uk/news/article-2490296/You-STILL-likely-lose-job-recession-25s-shop-workers-risk.html";
    const string needyDaily = @"http://www.dailymail.co.uk/femail/article-2489984/Needy-people-likely-cheat.html";
    const string paperboyDaily = @"http://www.dailymail.co.uk/news/article-2490531/Worlds-oldest-paperboy-deliver-round-71-years-route.html";
    const string wikileaksDaily = @"http://www.dailymail.co.uk/news/article-2490412/Wikileaks-journalist-spent-4-months-Edward-Snowden-leaves-Russia.html";
    const string twitterDaily = @"http://www.dailymail.co.uk/news/article-2489994/Twitter-share-prices-soar-firms-day-trading.html";

    // Build the pipeline.
    var cleaner = new HtmlCleaner(new ISiteHtmlCleaner[] { new DailyHtmlCleaner(), new MirrorHtmlCleaner() });
    var loader = new HtmlLoader();
    var provider = new ArticleProvider(cleaner, loader);
    var cosine = new CosineSimilarityCalculator();
    IDocumentFrequencyProvider frequencies = LoadFrequencies();
    var tfIdf = new TfIdfCalculator(frequencies);
    var tokenizer = new Tokenizer();
    var processor = new ArticleProcessor(tfIdf, tokenizer);
    var comparer = new TextProcessing.ArticleComparer(provider, cosine, processor);

    double score;

    Console.WriteLine("Similar articles:");
    score = comparer.Compare(spyChiefsDaily, spyChiefsMirror);
    Console.WriteLine(score);
    score = comparer.Compare(dancesDaily, dancesMirror);
    Console.WriteLine(score);

    Console.WriteLine("Same article:");
    score = comparer.Compare(recessionDaily, recessionDaily);
    Console.WriteLine(score);

    Console.WriteLine("Different articles:");
    score = comparer.Compare(needyDaily, paperboyDaily);
    Console.WriteLine(score);
    score = comparer.Compare(wikileaksDaily, twitterDaily);
    Console.WriteLine(score);

    Console.ReadKey();
}
/// <summary>
/// Reads one forum message from <paramref name="node"/> and appends it to <paramref name="messages"/>.
/// </summary>
/// <param name="messages">List that receives the parsed message.</param>
/// <param name="node">HTML node that contains the message markup.</param>
/// <param name="theme">Supplies the selector expressions for the link, posted time and body.</param>
private void ParseMessage(List<Message> messages, HtmlNode node, ISmfTheme theme)
{
    // The message link provides both the URL (which carries the id) and the subject.
    var linkNode = node.SelectSingleNode(theme.TopicMessageLink);
    var link = linkNode.Attributes["href"].Value;

    var postedTimeNode = node.SelectSingleNode(theme.TopicMessagePostedTime);
    var postedTimeText = postedTimeNode.InnerText.Replace("»", "").Trim();

    // The id is the run of digits that follows "#msg" in the link.
    var idMatch = Regex.Match(link, @"(?<=#msg)\d+");
    var id = Convert.ToInt32(idMatch.Value);

    var subject = linkNode.InnerText;

    var bodyCleaner = new HtmlCleaner();
    var bodyHtml = node.SelectSingleNode(theme.TopicMessageBody).InnerHtml;
    var body = bodyCleaner.Remove(bodyHtml);

    var postedTimeUtc = DateTime.Parse(postedTimeText).ToUniversalTime();

    // TODO : This is good, but what if this run multiple times on the same Topic?
    var message = new Message(id, subject, body, postedTimeUtc);
    messages.Add(message);
}
/// <summary>
/// Reads article URLs from <c>ArticlesUrlsFile</c>, tokenizes the text of each article,
/// feeds the tokens to a document frequency provider and saves the result.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
private static void Main(string[] args)
{
    var cleaner = new HtmlCleaner(new[] { new TelegraphHtmlCleaner() });
    var loader = new HtmlLoader();
    var provider = new ArticleProvider(cleaner, loader);
    var frequencies = new DocumentFrequencyProvider();
    var tokenizer = new Tokenizer();

    // The URL list is read as a UTF-16 (Encoding.Unicode) text file, one URL per line.
    string[] urls = File.ReadAllLines(ArticlesUrlsFile, Encoding.Unicode);
    for (int i = 0; i < urls.Length; i++)
    {
        IArticle article = provider.Get(urls[i]);
        IEnumerable<IToken> tokens = tokenizer.Tokenize(article.Text);
        frequencies.ProcessText(tokens);
    }

    SaveFrequencies(frequencies);
}
// Checks that Clean returns the article produced by the site cleaner stub registered
// for "url".
public void CleanTest()
{
    // Arrange: an article stub whose text is "!".
    var articleStub = MockRepository.GenerateStub<IArticle>();
    articleStub.Stub(article => article.Text).Return("!");

    // Arrange: a site cleaner stub for "url" that returns that article for any input.
    var siteCleanerStub = MockRepository.GenerateStub<ISiteHtmlCleaner>();
    siteCleanerStub.Stub(cleaner => cleaner.SiteUrl).Return("url");
    siteCleanerStub.Stub(cleaner => cleaner.Clean(Arg<string>.Is.Anything)).Return(articleStub);

    var sut = new HtmlCleaner(new[] { siteCleanerStub });

    // Act
    IArticle result = sut.Clean("url", string.Empty);

    // Assert
    Assert.AreEqual("!", result.Text);
}
/// <summary>
/// Cleans the template's HTML content, then updates the stored template that has the
/// same name, or inserts a new one when none exists.
/// </summary>
/// <param name="templateContent">
/// Template data to store; its <c>Content</c> is overwritten with the cleaned HTML.
/// </param>
public async Task CreateEditTemplate(CTemplate templateContent)
{
    templateContent.Content = HtmlCleaner.CleanHtml(templateContent.Content);

    var existing = _templatesRepository.FirstOrDefault(t => t.Name == templateContent.Name);
    if (existing != null)
    {
        // A template with this name is already stored: only its content is replaced.
        existing.Content = templateContent.Content;
        await _templatesRepository.InsertOrUpdateAndGetIdAsync(existing);
        return;
    }

    var created = new Template()
    {
        Content = templateContent.Content,
        Name = templateContent.Name,
        IsPartial = templateContent.IsPartial,
    };
    await _templatesRepository.InsertOrUpdateAndGetIdAsync(created);
}
// Loads the page at `url`, cleans it, and returns the requested URL, the total word
// count, the ranked word list and the image sources.
// Responds with BadRequest when `url` or `clientSecret` is empty, or when
// `clientSecret` does not equal the "passkey" application setting.
public IHttpActionResult LoadUrl(string url, string clientSecret)
{
    var expectedPasskey = ConfigurationManager.AppSettings["passkey"];
    bool inputMissing = string.IsNullOrEmpty(url) || string.IsNullOrEmpty(clientSecret);
    if (inputMissing || !clientSecret.Equals(expectedPasskey))
    {
        return BadRequest();
    }

    // Fetch the HTML and clean it; the analyzers below work on the cleaned document.
    var document = new HtmlWeb().Load(url);
    IHtmlCleaner cleaner = new HtmlCleaner { HtmlSourceDoc = document };
    document = cleaner.CleanHtml();

    // The total word count is read after the words have been ranked.
    IWordRankAnalyzer wordAnalyzer = new WordRankAnalyzer { HtmlSourceDoc = document };
    var rankedWords = wordAnalyzer.RankWords();
    var wordCount = wordAnalyzer.TotalWordsInHtml;

    IImageAnalyzer imageAnalyzer = new ImageAnalyzer { WebSiteUrl = url, HtmlSourceDoc = document };
    var imageSources = imageAnalyzer.GetImageSources();

    var payload = new
    {
        UrlRequested = url,
        TotalWordCount = wordCount,
        WordsRankedList = rankedWords,
        ImagesSources = imageSources
    };
    return Ok(payload);
}
/// <summary>
/// Fetches the content of a single chapter (node), saves its images locally, generates
/// its offline content and stores the result through <c>db.refreshNode</c>.
/// </summary>
/// <param name="nodeId">Id of the node to fetch; sent as the <c>nodeIds</c> value.</param>
public static void getNodeDetail(string nodeId)
{
    string rawDetails = HttpWorker.HttpGet(Global.NodeDetailApi, "nodeIds=" + nodeId);

    // Nothing is parsed or stored when the request returns the literal "error".
    if (rawDetails == "error")
    {
        return;
    }

    NodeDetailResponse response = TranslationWorker.ConvertStringToEntity<NodeDetailResponse>(rawDetails);
    List<Node> fetchedNodes = response.ConvertToNodes();

    // Download the images locally and generate the offline content.
    foreach (Node current in fetchedNodes)
    {
        List<string> imageUrls = HtmlCleaner.GetImageUrl(current.content);
        foreach (string imageUrl in imageUrls)
        {
            HttpWorker.SaveImg(imageUrl, current.lawId);
        }

        current.offlineContent = HtmlCleaner.ChangeImageUrlToLocalPath(current.content, current.lawId);
    }

    db.refreshNode(fetchedNodes, detailOnly: true);
}
/// <summary>
/// Builds an <see cref="ArticleModel"/> from an extended article table row.
/// </summary>
/// <remarks>
/// When the body contains an <c>&lt;img </c> tag and no <c> itemprop="image" </c> attribute yet,
/// the attribute is added to the first image of both <c>Body_Html</c> and <c>BodyAndAd_Html</c>
/// on the row itself before the model is created.
/// </remarks>
/// <param name="article">Source row; its <c>Body_Html</c> and <c>BodyAndAd_Html</c> may be modified.</param>
/// <returns>The populated model.</returns>
public static ArticleModel FromTable(Tables.Articles_Extended article)
{
    const string imageTagStart = "<img ";
    const string imageItemProp = "itemprop=\"image\" ";

    // add microdata to take advantage of Google's rich snippets for articles:
    // https://developers.google.com/structured-data/rich-snippets/articles
    if (article.Body_Html.Contains(imageTagStart) && !article.Body_Html.Contains(" itemprop=\"image\" "))
    {
        // only modify the first image
        article.Body_Html = InsertAfterFirstOccurrence(article.Body_Html, imageTagStart, imageItemProp);

        // The body+ad markup is expected to contain the entire body, but it is left
        // untouched when it has no image tag rather than being modified at a bogus offset.
        article.BodyAndAd_Html = InsertAfterFirstOccurrence(article.BodyAndAd_Html, imageTagStart, imageItemProp);
    }

    return new ArticleModel()
    {
        Id = article.Article_Id,
        Slug = article.Article_Slug,
        Author = AuthorModel.FromTable(article),
        BodyHtml = HtmlCleaner.UnmixContent(article.Body_Html),
        BodyAndAdHtml = HtmlCleaner.UnmixContent(article.BodyAndAd_Html),
        CachedCommentCount = (int)article.Cached_Comment_Count,
        DiscourseTopicId = article.Discourse_Topic_Id,
        DiscourseTopicOpened = article.Discourse_Topic_Opened == "Y",
        LastCommentDate = article.Last_Comment_Date,
        PublishedDate = article.Published_Date,
        Series = SeriesModel.FromTable(article),
        FooterAdHtml = HtmlCleaner.UnmixContent(article.Ad_Html),
        Status = article.PublishedStatus_Name,
        SummaryHtml = HtmlCleaner.UnmixContent(ArticleModel.ExtractSummary(article.Body_Html)),
        Title = article.Title_Text,
        PreviousArticleId = article.Previous_Article_Id,
        PreviousArticleSlug = article.Previous_Article_Slug,
        PreviousArticleTitle = article.Previous_Title_Text,
        NextArticleId = article.Next_Article_Id,
        NextArticleSlug = article.Next_Article_Slug,
        NextArticleTitle = article.Next_Title_Text
    };
}

/// <summary>
/// Returns <paramref name="html"/> with <paramref name="insertion"/> placed directly after the
/// first ordinal occurrence of <paramref name="marker"/>; returns the input unchanged when it is
/// null, empty, or does not contain the marker.
/// </summary>
private static string InsertAfterFirstOccurrence(string html, string marker, string insertion)
{
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    // Ordinal search keeps this consistent with string.Contains, which is also ordinal.
    int markerIndex = html.IndexOf(marker, System.StringComparison.Ordinal);
    if (markerIndex < 0)
    {
        return html;
    }

    int insertAt = markerIndex + marker.Length;
    return html.Substring(0, insertAt) + insertion + html.Substring(insertAt);
}
/// <summary>
/// Inserts the Live Clipboard HTML presentation between two markup pointers after
/// passing it through <c>HtmlCleaner.RemoveScripts</c>.
/// </summary>
/// <param name="action">Requested data action; not used by this method.</param>
/// <param name="begin">Start of the range that receives the HTML.</param>
/// <param name="end">End of the range that receives the HTML.</param>
/// <returns><c>true</c> when the HTML was inserted; <c>false</c> when an exception was thrown.</returns>
protected override bool DoInsertData(DataAction action, MarkupPointer begin, MarkupPointer end)
{
    using (new WaitCursor())
    {
        try
        {
            string presentation = DataMeister.LiveClipboardData.HtmlPresentation;
            string scriptFreeHtml = HtmlCleaner.RemoveScripts(presentation);

            EditorContext.InsertHtml(begin, end, scriptFreeHtml, null);
            return true;
        }
        catch (Exception ex)
        {
            // bugfix 1696: exceptions go to the trace log instead of escaping.
            Trace.Fail("Exception while inserting HTML: " + ex.Message, ex.StackTrace);
            return false;
        }
    }
}
/// <summary>
/// Sets up the lazily evaluated members: the comment's index, the parent comment's
/// index, and the list of link URLs found in the body HTML.
/// </summary>
public CommentModel()
{
    index = new Lazy<int>(() =>
    {
        return StoredProcs.Comments_GetCommentIndex(Id).Execute().Value;
    });

    parentCommentIndex = new Lazy<int?>(() =>
    {
        // Without a parent comment there is no index to look up.
        if (!ParentCommentId.HasValue)
        {
            return null;
        }

        return StoredProcs.Comments_GetCommentIndex(ParentCommentId).Execute();
    });

    getLinks = new Lazy<IList<string>>(() =>
    {
        return HtmlCleaner.GetLinkUrls(BodyHtml).ToList();
    });
}
/// <summary>
/// Formats comment text in three steps: <c>BbCodeFormatComment</c>, then
/// <c>CommonMarkConverter.Convert</c>, then <c>HtmlCleaner.Clean</c>.
/// </summary>
/// <param name="text">Raw comment text.</param>
/// <returns>The result of the final cleaning step.</returns>
private static string MarkdownFormatContent(string text)
{
    var bbCodeFormatted = BbCodeFormatComment(text);
    var converted = CommonMarkConverter.Convert(bbCodeFormatted);
    return HtmlCleaner.Clean(converted);
}
// Calls Clean with a null url on a cleaner that has no site-specific cleaners.
// The body makes no assertion; presumably the expected exception is declared by an
// attribute outside this view — confirm.
public void CleanTestNullUrlExc()
{
    // Arrange
    var noSiteCleaners = Enumerable.Empty<ISiteHtmlCleaner>();
    var sut = new HtmlCleaner(noSiteCleaners);
    string missingUrl = null;

    // Act
    sut.Clean(missingUrl, string.Empty);
}
/// <summary>
/// Cleans <paramref name="html"/> with <c>HtmlCleaner.CleanupHtml</c> and passes the result
/// through the editor host's <c>TransformHtml</c>.
/// </summary>
/// <param name="html">Markup to clean.</param>
/// <param name="baseUrl">Base URL forwarded to the cleaner.</param>
/// <param name="cleanupRule">
/// Cleanup rule; the cleaner's last flag is <c>true</c> for every rule other than
/// <c>HtmlCleanupRule.Normal</c>.
/// </param>
/// <returns>The transformed, cleaned markup.</returns>
public override string CleanupHtml(string html, string baseUrl, HtmlCleanupRule cleanupRule)
{
    bool isNonNormalRule = cleanupRule != HtmlCleanupRule.Normal;
    string cleaned = HtmlCleaner.CleanupHtml(html, baseUrl, true, isNonNormalRule);
    return _editorHost.TransformHtml(cleaned);
}