Beispiel #1
0
		/// <summary>
		/// Initializes the builder with the writer, tag name and output buffer it was given,
		/// and starts it off with an empty attribute dictionary.
		/// </summary>
		/// <param name="w">Writer kept on the builder.</param>
		/// <param name="tagName">Tag name kept on the builder.</param>
		/// <param name="sb">String builder kept on the builder.</param>
		public TagBuilder(HtmlCleaner.HtmlWriter w, string tagName, StringBuilder sb)
		{
			this.attributes = new Dictionary<string, string>();
			this.tagName = tagName;
			this.sb = sb;
			this.w = w;
		}
Beispiel #2
0
        /// <summary>
        /// Cuts a leading excerpt out of <paramref name="articleText"/> and passes it through
        /// <c>HtmlCleaner.CloseTags</c>.
        /// </summary>
        /// <param name="articleText">Article markup the excerpt is taken from.</param>
        /// <param name="paragraphCount">
        /// Number of <c>Regexes.P</c> matches that are stepped over completely; the excerpt ends
        /// where the following match begins.
        /// </param>
        /// <param name="skipRule">
        /// When true, the paragraph search starts after the first <c>Regexes.Hr</c> match.
        /// The excerpt itself still starts at the beginning of the text.
        /// </param>
        /// <returns>
        /// The excerpt after tag closing. When the computed cut position is 0 the whole article is used.
        /// </returns>
        private static string ExtractSummary(string articleText, int paragraphCount, bool skipRule)
        {
            int index = 0;

            // Skip past the first HR.
            if (skipRule)
            {
                // Regex.Match never returns null; a miss is reported through Match.Success.
                var hrMatch = Regexes.Hr.Match(articleText);
                if (hrMatch.Success)
                {
                    index += hrMatch.Index + hrMatch.Length;
                }
            }

            // Step over the requested paragraphs, then stop at the start of the next one.
            for (int i = 0; i <= paragraphCount; i++)
            {
                // Match positions are relative to the remaining text, hence the running sum.
                var pMatch = Regexes.P.Match(articleText.Substring(index));
                if (!pMatch.Success)
                {
                    break;
                }

                index += pMatch.Index;
                if (i < paragraphCount)
                {
                    index += pMatch.Length;
                }
            }

            // A null article yields an empty summary, as the string concatenation did before.
            string summary = (index == 0) ? (articleText ?? string.Empty) : articleText.Substring(0, index);

            return HtmlCleaner.CloseTags(summary);
        }
 /// <summary>
 /// Fetches the content of several nodes in one request and stores the result
 /// through <c>db.refreshNode</c>.
 /// </summary>
 /// <param name="nodeIdList">Ids of the nodes to fetch; sent comma-separated in the query string.</param>
 /// <param name="downloadImage">
 /// When true, the images referenced by each node are saved locally and the node's
 /// offline content is generated.
 /// </param>
 /// <returns>
 /// True when everything completed; false when any exception was thrown, in which case
 /// a message box is shown.
 /// </returns>
 public static bool getNodeDetail(List <string> nodeIdList, bool downloadImage = false)
 {
     try
     {
         string rawResponse = HttpWorker.HttpGet(Global.NodeDetailApi, "nodeIds=" + string.Join(",", nodeIdList));
         List<Node> fetchedNodes = TranslationWorker.ConvertStringToEntity<NodeDetailResponse>(rawResponse).ConvertToNodes();

         if (downloadImage)
         {
             // Download the images to local storage and generate the offline content.
             foreach (Node current in fetchedNodes)
             {
                 foreach (string imageUrl in HtmlCleaner.GetImageUrl(current.content))
                 {
                     HttpWorker.SaveImg(imageUrl, current.lawId);
                 }

                 current.offlineContent = HtmlCleaner.ChangeImageUrlToLocalPath(current.content, current.lawId);
             }
         }

         db.refreshNode(fetchedNodes, detailOnly: true);
         return true;
     }
     catch (Exception)
     {
         // Every failure is reported to the user with the same message.
         MessageBox.Show("操作超时");
         return false;
     }
 }
Beispiel #4
0
        public void Clean_WhenCalledWithHtmlWithMultiLineBreaks_ReplacedBySingleLineBreaks()
        {
            // Runs of consecutive CRLF breaks in the input are expected to shrink to one break each.
            const string source = "<html>\r\n<head>\r\n\r\n</head>\r\n\r\n\r\n</html>";
            const string expected = "<html>\r\n<head>\r\n</head>\r\n</html>";

            string actual = new HtmlCleaner().Clean(source);

            Assert.AreEqual(expected, actual);
        }
Beispiel #5
0
        public void Clean_WhenCalledWithHtmlIfBlocks_CommentIsNotRemoved()
        {
            // A conditional comment block must come back from Clean exactly as it went in.
            const string source = "<html><head><!--[if lt IE 9]><link rel=\"stylesheet\" href=\"explorer.css\" type=\"text/css\" /><![endif]--></head></html>";

            string actual = new HtmlCleaner().Clean(source);

            Assert.AreEqual(source, actual);
        }
Beispiel #6
0
        public void StripWhitespace_TextWithoutWhitespace_TextIntact()
        {
            // Text with only single spaces between words is expected to pass through unchanged.
            const string text = "some text without whitespace";

            var stripped = HtmlCleaner.StripWhitespace(text);

            Assert.Equal(text, stripped);
        }
Beispiel #7
0
        public void Clean_InnerTag_TagDeleted()
        {
            // A paragraph nested in a div: the expected text has the three parts on separate lines.
            const string markup = "<div>Text before<p>Text in</p>Text after</div>";
            string expected = string.Join(Environment.NewLine, "Text before", "Text in", "Text after");

            var actual = HtmlCleaner.Clean(markup);

            Assert.Equal(expected, actual);
        }
Beispiel #8
0
        public void Clean_WhenThereAreWhitespaceBetweenTags_SpacesAreRemoved()
        {
            // Spaces that sit only between tags are expected to disappear from the output.
            const string source = "<html>   <head>  </head> </html>";
            const string expected = "<html><head></head></html>";

            string actual = new HtmlCleaner().Clean(source);

            Assert.AreEqual(expected, actual);
        }
Beispiel #9
0
        public void StripTags_TextWithTags_TagsRemoved()
        {
            // Paired and self-closing tags are expected to be dropped, leaving only the inner text.
            const string markup = "<p>some</p><b>text</b><strong>with</strong><a href=\"#\">tags</a><br /><br/>";
            const string expected = "sometextwithtags";

            var stripped = HtmlCleaner.StripTags(markup);

            Assert.Equal(expected, stripped);
        }
Beispiel #10
0
        public void RemoveDownloadLinks_PostWithDownloadLinks_LinksRemoved()
        {
            // A list item holding a "fa fa-download" icon: only the opening li tag and the
            // trailing text are expected to remain.
            const string post = "<li><i class=\"fa fa-download\">test</i>test2</li>test3";
            const string expected = "<li>test3";

            var actual = HtmlCleaner.RemoveDownloadLinks(post);

            Assert.Equal(expected, actual);
        }
Beispiel #11
0
        public void RemoveHtmlContent_PostWithList_ListRemoved()
        {
            // A post that consists solely of an unordered list is expected to come back empty.
            const string post = "<ul>\n<li>\n<p>libGDX</p>\n</li>\n<li>\n<p>jMonkeyEngine</p>\n</li>\n<li>\n<p>lwjgl</p>\n</li>\n</ul>";

            var actual = HtmlCleaner.RemoveHTMLContent(post);

            Assert.Equal(string.Empty, actual);
        }
Beispiel #12
0
        public void RemoveHtmlContent_PostWithQuotes_QuotesRemoved()
        {
            // Text between two blockquotes: only the text outside the quotes is expected to survive.
            const string post = "<blockquote>cytat</blockquote>nie cytat<blockquote>cytat</blockquote>";
            const string expected = "nie cytat";

            var actual = HtmlCleaner.RemoveHTMLContent(post);

            Assert.Equal(expected, actual);
        }
Beispiel #13
0
        public void Clean_TextWithDuplivatedSpaces_SpacesTrimmed()
        {
            // Leading, trailing and repeated inner spaces are expected to be reduced to single spaces.
            const string markup = "<div> Text with   many spaces  </div>";
            const string expected = "Text with many spaces";

            var actual = HtmlCleaner.Clean(markup);

            Assert.Equal(expected, actual);
        }
Beispiel #14
0
        public void Clean_SimpleTag_TagDeleted()
        {
            // A single paragraph element: only its inner text is expected in the result.
            const string markup = "<p>Text in html</p>";
            const string expected = "Text in html";

            var actual = HtmlCleaner.Clean(markup);

            Assert.Equal(expected, actual);
        }
Beispiel #15
0
        public void Clean_WhenCalledWithHtmlWithOneComment_CommentIsRemoved()
        {
            // An ordinary HTML comment inside head is expected to be removed from the output.
            const string source = "<html><head><!-- kommentar --></head></html>";
            const string expected = "<html><head></head></html>";

            string actual = new HtmlCleaner().Clean(source);

            Assert.AreEqual(expected, actual);
        }
Beispiel #16
0
        public void Clean_WhenRowEndWithMultipleSpaces_SpacesAreRemoved()
        {
            // Spaces at the end of a line and whitespace-only lines are expected to go away;
            // the expected value keeps the single space before the closing html tag.
            const string source = "<html> \r\n  \r\n<head>  \r\n</head>\r\n </html>";
            const string expected = "<html>\r\n<head>\r\n</head>\r\n </html>";

            string actual = new HtmlCleaner().Clean(source);

            Assert.AreEqual(expected, actual);
        }
Beispiel #17
0
        public void StripTags_TextWithTagsNotClosed_TagsRemoved()
        {
            // Unbalanced opening and closing tags with no text between them: the result is expected to be empty.
            const string markup = "<p></p></a></code></i><li><ul><ol></li><br /><br/>";

            var stripped = HtmlCleaner.StripTags(markup);

            Assert.Equal(string.Empty, stripped);
        }
Beispiel #18
0
        public void StripTags_TextWithoutTags_TagsNotRemoved()
        {
            // Plain text without any markup is expected to be returned as is.
            const string text = "some text without tags";

            var stripped = HtmlCleaner.StripTags(text);

            Assert.Equal(text, stripped);
        }
Beispiel #19
0
        public void StripWhitespace_TextWithWhitespace_WhitespaceRemoved()
        {
            // Repeated spaces, a newline and trailing spaces are expected to become single separators.
            const string text = "some  text   with   \nwhitespace   ";
            const string expected = "some text with whitespace";

            var stripped = HtmlCleaner.StripWhitespace(text);

            Assert.Equal(expected, stripped);
        }
Beispiel #20
0
        public void Clean_UnorderedList_TextFormatted()
        {
            // Three list items: the expected text has one item per line.
            const string markup = "<ul><li>Coffee</li><li>Tea</li><li>Milk</li></ul>";
            string expected = string.Join(Environment.NewLine, "Coffee", "Tea", "Milk");

            var actual = HtmlCleaner.Clean(markup);

            Assert.Equal(expected, actual);
        }
Beispiel #21
0
        public void RemoveProperCode_PostWithProperTags_TagsRemoved()
        {
            // A post made only of code and pre/code sections is expected to come back empty.
            const string post = "<code>test</code><pre><code>test2</pre></code><pre><code class=\"csharp\">test3</code></pre>";

            var actual = HtmlCleaner.RemoveProperCode(post);

            Assert.Equal(string.Empty, actual);
        }
Beispiel #22
0
        public void CleanTestDefaultCleaner()
        {
            // The cleaner is built with no site-specific cleaners; the body text is expected as the article text.
            var noSiteCleaners = Enumerable.Empty<ISiteHtmlCleaner>();
            var sut = new HtmlCleaner(noSiteCleaners);

            IArticle article = sut.Clean("url", @"<body>Some text</body>");

            Assert.AreEqual("Some text", article.Text);
        }
Beispiel #23
0
        /// <summary>
        /// Lets the base class render into an in-memory writer, then hands that writer together
        /// with the real output writer to <c>HtmlCleaner.Render</c>.
        /// </summary>
        /// <param name="writer">Writer that receives the final output.</param>
        protected override void Render(HtmlTextWriter writer)
        {
            var buffer = new StringWriter();
            using (var bufferedWriter = new HtmlTextWriter(buffer))
            {
                base.Render(bufferedWriter);
                HtmlCleaner.Render(bufferedWriter, writer);
            }
        }
Beispiel #24
0
        /// <summary>
        /// Wires up the article comparison pipeline and prints the similarity of several
        /// hard-coded article pairs to the console, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        private static void Main(string[] args)
        {
            var htmlCleaner = new HtmlCleaner(new ISiteHtmlCleaner[] { new DailyHtmlCleaner(), new MirrorHtmlCleaner() });
            var htmlLoader = new HtmlLoader();
            var articleProvider = new ArticleProvider(htmlCleaner, htmlLoader);
            var cosineSimilarityCalculator = new CosineSimilarityCalculator();
            IDocumentFrequencyProvider dfProvider = LoadFrequencies();
            var tfIdfCalculator = new TfIdfCalculator(dfProvider);
            var tokenizer = new Tokenizer();
            var articleProcessor = new ArticleProcessor(tfIdfCalculator, tokenizer);
            var articleComparer = new TextProcessing.ArticleComparer(
                articleProvider,
                cosineSimilarityCalculator,
                articleProcessor);

            // Compares one pair of URLs and writes the resulting similarity on its own line.
            Action<string, string> printSimilarity = (firstUrl, secondUrl) =>
            {
                double similarity = articleComparer.Compare(firstUrl, secondUrl);
                Console.WriteLine(similarity);
            };

            Console.WriteLine("Similar articles:");
            printSimilarity(
                @"http://www.dailymail.co.uk/news/article-2489957/Britains-spy-chiefs-grilled-MPs-television-time.html",
                @"http://www.mirror.co.uk/news/uk-news/mi6-mi5-gchq-bosses-questioned-2685310");
            printSimilarity(
                @"http://www.dailymail.co.uk/news/article-2489640/80-parents-caught-children-copying-p**n-style-dances-offensive-lyrics.html",
                @"http://www.mirror.co.uk/news/uk-news/miley-cyrus-twerking-kids-copying-2685363");

            Console.WriteLine("Same article:");
            printSimilarity(
                @"http://www.dailymail.co.uk/news/article-2490296/You-STILL-likely-lose-job-recession-25s-shop-workers-risk.html",
                @"http://www.dailymail.co.uk/news/article-2490296/You-STILL-likely-lose-job-recession-25s-shop-workers-risk.html");

            Console.WriteLine("Different articles:");
            printSimilarity(
                @"http://www.dailymail.co.uk/femail/article-2489984/Needy-people-likely-cheat.html",
                @"http://www.dailymail.co.uk/news/article-2490531/Worlds-oldest-paperboy-deliver-round-71-years-route.html");
            printSimilarity(
                @"http://www.dailymail.co.uk/news/article-2490412/Wikileaks-journalist-spent-4-months-Edward-Snowden-leaves-Russia.html",
                @"http://www.dailymail.co.uk/news/article-2489994/Twitter-share-prices-soar-firms-day-trading.html");

            Console.ReadKey();
        }
Beispiel #25
0
        /// <summary>
        /// Reads one message out of <paramref name="node"/> using the selectors supplied by
        /// <paramref name="theme"/> and appends it to <paramref name="messages"/>.
        /// </summary>
        /// <param name="messages">List the parsed message is added to.</param>
        /// <param name="node">HTML node that holds the message.</param>
        /// <param name="theme">Supplies the selectors for the link, posted time and body.</param>
        private void ParseMessage(List <Message> messages, HtmlNode node, ISmfTheme theme)
        {
            var anchor = node.SelectSingleNode(theme.TopicMessageLink);
            var href = anchor.Attributes["href"].Value;

            var timeNode = node.SelectSingleNode(theme.TopicMessagePostedTime);
            var postedText = timeNode.InnerText.Replace("»", "").Trim();

            // The message id is the run of digits that follows "#msg" in the link.
            var idMatch = Regex.Match(href, @"(?<=#msg)\d+");
            var id = Convert.ToInt32(idMatch.Value);

            var subject = anchor.InnerText;

            var cleaner = new HtmlCleaner();
            var bodyNode = node.SelectSingleNode(theme.TopicMessageBody);
            var body = cleaner.Remove(bodyNode.InnerHtml);

            var postedUtc = DateTime.Parse(postedText).ToUniversalTime();

            // TODO : This is good, but what if this run multiple times on the same Topic?
            var parsed = new Message(id, subject, body, postedUtc);
            messages.Add(parsed);
        }
Beispiel #26
0
        /// <summary>
        /// Loads every article listed in the URL file, tokenizes its text, feeds the tokens to the
        /// document frequency provider and finally saves the collected frequencies.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        private static void Main(string[] args)
        {
            var cleaner = new HtmlCleaner(new[] { new TelegraphHtmlCleaner() });
            var loader = new HtmlLoader();
            var provider = new ArticleProvider(cleaner, loader);
            var frequencies = new DocumentFrequencyProvider();
            var tokenizer = new Tokenizer();

            // One URL per line; the file is read as UTF-16 (Encoding.Unicode).
            foreach (string url in File.ReadAllLines(ArticlesUrlsFile, Encoding.Unicode))
            {
                IArticle loaded = provider.Get(url);
                IEnumerable<IToken> tokens = tokenizer.Tokenize(loaded.Text);
                frequencies.ProcessText(tokens);
            }

            SaveFrequencies(frequencies);
        }
Beispiel #27
0
        public void CleanTest()
        {
            // A stubbed site cleaner registered for "url" returns a stubbed article; the text of
            // that article is expected to come back from HtmlCleaner.Clean.
            var articleStub = MockRepository.GenerateStub<IArticle>();
            articleStub.Stub(a => a.Text).Return("!");

            var siteCleanerStub = MockRepository.GenerateStub<ISiteHtmlCleaner>();
            siteCleanerStub.Stub(sc => sc.SiteUrl).Return("url");
            siteCleanerStub.Stub(sc => sc.Clean(Arg<string>.Is.Anything)).Return(articleStub);

            var sut = new HtmlCleaner(new[] { siteCleanerStub });

            IArticle result = sut.Clean("url", string.Empty);

            Assert.AreEqual("!", result.Text);
        }
        /// <summary>
        /// Cleans the template's HTML and then stores it: an existing template with the same name
        /// gets its content replaced, otherwise a new template is created.
        /// </summary>
        /// <param name="templateContent">
        /// Template data to store. Its <c>Content</c> is overwritten with the cleaned HTML.
        /// </param>
        public async Task CreateEditTemplate(CTemplate templateContent)
        {
            templateContent.Content = HtmlCleaner.CleanHtml(templateContent.Content);

            var existing = _templatesRepository.FirstOrDefault(t => t.Name == templateContent.Name);
            if (existing != null)
            {
                // Only the content of an existing template is updated.
                existing.Content = templateContent.Content;
                await _templatesRepository.InsertOrUpdateAndGetIdAsync(existing);
                return;
            }

            var created = new Template()
            {
                Content = templateContent.Content,
                Name = templateContent.Name,
                IsPartial = templateContent.IsPartial,
            };
            await _templatesRepository.InsertOrUpdateAndGetIdAsync(created);
        }
Beispiel #29
0
        /// <summary>
        /// Loads the page at <paramref name="url"/>, cleans it, and returns its ranked words,
        /// total word count and image sources.
        /// </summary>
        /// <param name="url">Address of the page to analyze.</param>
        /// <param name="clientSecret">Must equal the "passkey" application setting.</param>
        /// <returns>
        /// A bad-request result when either argument is null or empty or the secret does not match;
        /// otherwise an OK result carrying the analysis.
        /// </returns>
        public IHttpActionResult LoadUrl(string url, string clientSecret)
        {
            var expectedSecret = ConfigurationManager.AppSettings["passkey"];

            var requestIsValid = !string.IsNullOrEmpty(url)
                                 && !string.IsNullOrEmpty(clientSecret)
                                 && clientSecret.Equals(expectedSecret);
            if (!requestIsValid)
            {
                return BadRequest();
            }

            // Fetch the HTML, then replace the document with its cleaned version.
            var document = new HtmlWeb().Load(url);
            IHtmlCleaner cleaner = new HtmlCleaner { HtmlSourceDoc = document };
            document = cleaner.CleanHtml();

            // RankWords is called before the total is read, as in the original order.
            IWordRankAnalyzer rankAnalyzer = new WordRankAnalyzer { HtmlSourceDoc = document };
            var rankedWords = rankAnalyzer.RankWords();
            var wordTotal = rankAnalyzer.TotalWordsInHtml;

            IImageAnalyzer imageAnalyzer = new ImageAnalyzer { WebSiteUrl = url, HtmlSourceDoc = document };
            var imageSources = imageAnalyzer.GetImageSources();

            var payload = new
            {
                UrlRequested = url,
                TotalWordCount = wordTotal,
                WordsRankedList = rankedWords,
                ImagesSources = imageSources
            };
            return Ok(payload);
        }
Beispiel #30
0
        /// <summary>
        /// Fetches the content of a single node, saves its images locally, generates the offline
        /// content and stores the result through <c>db.refreshNode</c>.
        /// </summary>
        /// <param name="nodeId">Id of the node to fetch.</param>
        public static void getNodeDetail(string nodeId)
        {
            string rawResponse = HttpWorker.HttpGet(Global.NodeDetailApi, "nodeIds=" + nodeId);

            // The literal "error" is treated as a failed request; nothing is stored in that case.
            if (rawResponse == "error")
            {
                return;
            }

            List<Node> fetchedNodes = TranslationWorker.ConvertStringToEntity<NodeDetailResponse>(rawResponse).ConvertToNodes();

            // Download the images to local storage and generate the offline content.
            foreach (Node current in fetchedNodes)
            {
                foreach (string imageUrl in HtmlCleaner.GetImageUrl(current.content))
                {
                    HttpWorker.SaveImg(imageUrl, current.lawId);
                }

                current.offlineContent = HtmlCleaner.ChangeImageUrlToLocalPath(current.content, current.lawId);
            }

            db.refreshNode(fetchedNodes, detailOnly: true);
        }
Beispiel #31
0
        /// <summary>
        /// Builds an <see cref="ArticleModel"/> from an extended article row.
        /// </summary>
        /// <remarks>
        /// When the body contains an image and no <c>itemprop="image"</c> attribute yet, the attribute
        /// is added to the first image tag of both <c>Body_Html</c> and <c>BodyAndAd_Html</c>. This
        /// modifies the passed-in <paramref name="article"/>.
        /// </remarks>
        /// <param name="article">Row to convert.</param>
        /// <returns>The populated model.</returns>
        public static ArticleModel FromTable(Tables.Articles_Extended article)
        {
            const string imageTagStart = "<img ";
            const string imageItemprop = "itemprop=\"image\" ";

            // add microdata to take advantage of Google's rich snippets for articles:
            // https://developers.google.com/structured-data/rich-snippets/articles
            if (article.Body_Html.Contains(imageTagStart) && !article.Body_Html.Contains(" itemprop=\"image\" "))
            {
                // only modify the first image
                article.Body_Html = InsertAfterFirst(article.Body_Html, imageTagStart, imageItemprop);

                // The body+ad is expected to contain the entire body, but it is left untouched
                // when it has no image tag instead of being spliced at a bogus offset.
                article.BodyAndAd_Html = InsertAfterFirst(article.BodyAndAd_Html, imageTagStart, imageItemprop);
            }

            return new ArticleModel()
            {
                Id = article.Article_Id,
                Slug = article.Article_Slug,
                Author = AuthorModel.FromTable(article),
                BodyHtml = HtmlCleaner.UnmixContent(article.Body_Html),
                BodyAndAdHtml = HtmlCleaner.UnmixContent(article.BodyAndAd_Html),
                CachedCommentCount = (int)article.Cached_Comment_Count,
                DiscourseTopicId = article.Discourse_Topic_Id,
                DiscourseTopicOpened = article.Discourse_Topic_Opened == "Y",
                LastCommentDate = article.Last_Comment_Date,
                PublishedDate = article.Published_Date,
                Series = SeriesModel.FromTable(article),
                FooterAdHtml = HtmlCleaner.UnmixContent(article.Ad_Html),
                Status = article.PublishedStatus_Name,
                SummaryHtml = HtmlCleaner.UnmixContent(ArticleModel.ExtractSummary(article.Body_Html)),
                Title = article.Title_Text,
                PreviousArticleId = article.Previous_Article_Id,
                PreviousArticleSlug = article.Previous_Article_Slug,
                PreviousArticleTitle = article.Previous_Title_Text,
                NextArticleId = article.Next_Article_Id,
                NextArticleSlug = article.Next_Article_Slug,
                NextArticleTitle = article.Next_Title_Text
            };
        }

        /// <summary>
        /// Inserts <paramref name="insertion"/> directly after the first occurrence of
        /// <paramref name="marker"/>; returns <paramref name="html"/> unchanged when it is null
        /// or does not contain the marker.
        /// </summary>
        private static string InsertAfterFirst(string html, string marker, string insertion)
        {
            if (html == null)
            {
                return html;
            }

            // Ordinal search, matching the ordinal semantics of string.Contains used by the caller.
            int position = html.IndexOf(marker, System.StringComparison.Ordinal);
            if (position < 0)
            {
                return html;
            }

            position += marker.Length;
            return html.Substring(0, position) + insertion + html.Substring(position);
        }
Beispiel #32
0
        /// <summary>
        /// Inserts the HTML presentation of the live clipboard data between the two markup
        /// pointers after passing it through <c>HtmlCleaner.RemoveScripts</c>.
        /// </summary>
        /// <param name="action">Requested data action; not used by this override.</param>
        /// <param name="begin">Start of the range to insert into.</param>
        /// <param name="end">End of the range to insert into.</param>
        /// <returns>True when the insert completed; false when an exception was thrown.</returns>
        protected override bool DoInsertData(DataAction action, MarkupPointer begin, MarkupPointer end)
        {
            using (new WaitCursor())
            {
                try
                {
                    string rawHtml = DataMeister.LiveClipboardData.HtmlPresentation;

                    // remove document tags and scripts
                    string scriptFreeHtml = HtmlCleaner.RemoveScripts(rawHtml);

                    // insert the html
                    EditorContext.InsertHtml(begin, end, scriptFreeHtml, null);
                    return true;
                }
                catch (Exception ex)
                {
                    //bugfix 1696, put exceptions into the trace log.
                    Trace.Fail("Exception while inserting HTML: " + ex.Message, ex.StackTrace);
                    return false;
                }
            }
        }
Beispiel #33
0
 /// <summary>
 /// Sets up the lazily evaluated members: the comment index, the parent comment index
 /// and the list of link URLs found in the body HTML.
 /// </summary>
 public CommentModel()
 {
     this.index = new Lazy<int>(() =>
     {
         return StoredProcs.Comments_GetCommentIndex(this.Id).Execute().Value;
     });

     this.parentCommentIndex = new Lazy<int?>(() =>
     {
         // No stored procedure call is made for a comment without a parent.
         if (!this.ParentCommentId.HasValue)
         {
             return null;
         }

         return StoredProcs.Comments_GetCommentIndex(this.ParentCommentId).Execute();
     });

     this.getLinks = new Lazy<IList<string>>(() =>
     {
         return HtmlCleaner.GetLinkUrls(this.BodyHtml).ToList();
     });
 }
Beispiel #34
0
 /// <summary>
 /// Runs the text through <c>BbCodeFormatComment</c>, converts the result with
 /// <c>CommonMarkConverter.Convert</c> and cleans the output with <c>HtmlCleaner.Clean</c>.
 /// </summary>
 /// <param name="text">Raw comment text.</param>
 /// <returns>The cleaned result of the conversion.</returns>
 private static string MarkdownFormatContent(string text)
 {
     var bbCodeFormatted = BbCodeFormatComment(text);
     var converted = CommonMarkConverter.Convert(bbCodeFormatted);
     return HtmlCleaner.Clean(converted);
 }
Beispiel #35
0
        public void CleanTestNullUrlExc()
        {
            // Clean is called with a null URL and the body makes no assertion.
            // NOTE(review): presumably an expected-exception attribute outside this view does the check - confirm.
            var noSiteCleaners = Enumerable.Empty<ISiteHtmlCleaner>();
            var sut = new HtmlCleaner(noSiteCleaners);

            sut.Clean(null, string.Empty);
        }
 /// <summary>
 /// Cleans the HTML with <c>HtmlCleaner.CleanupHtml</c> and then lets the editor host transform it.
 /// </summary>
 /// <param name="html">HTML to clean.</param>
 /// <param name="baseUrl">Base URL passed through to the cleaner.</param>
 /// <param name="cleanupRule">
 /// Cleanup rule; the cleaner's last flag is true for every rule other than <c>Normal</c>.
 /// </param>
 /// <returns>The cleaned HTML after the editor host's transformation.</returns>
 public override string CleanupHtml(string html, string baseUrl, HtmlCleanupRule cleanupRule)
 {
     bool ruleIsNotNormal = cleanupRule != HtmlCleanupRule.Normal;
     string cleaned = HtmlCleaner.CleanupHtml(html, baseUrl, true, ruleIsNotNormal);
     return _editorHost.TransformHtml(cleaned);
 }