/// <summary>
/// Builds a DOM from a minimal page whose body holds a single "»" character and checks that
/// the character is still present in the document's string form.
/// </summary>
// NOTE(review): the fixture contains the literal character rather than an entity reference
// (e.g. "&raquo;"), despite the test name — confirm this is the intended input.
public void Test_BuildDom_on_content_with_html_entities()
{
    // arrange
    const string markup = "<html><head></head><body>»</body></html>";

    // act
    var dom = _sgmlDomBuilder.BuildDocument(markup);
    string rendered = dom.ToString();

    // assert
    Assert.IsTrue(rendered.Contains("»"));
}
/// <summary>
/// Feeds a large real-world page head (Polish text, inline styles and scripts, no closing
/// <c>&lt;/html&gt;</c> tag) through the builder and the serializer, then asserts via
/// <c>MyAssert.AssertSubstringCount</c> that <c>&lt;html</c> is counted once in the output.
/// </summary>
public void Builder_handles_UTF16()
{
    // arrange
    // The fixture is one compile-time constant; it is split after each "\n" escape purely
    // for readability.
    const string pageMarkup =
        "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" +
        "<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"pl-PL\">\n" +
        "<head profile=\"http://gmpg.org/xfn/11\">\n" +
        "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\n" +
        "<title>Jak zwiększyć swoją pewność siebie | Michalpasterski.pl | świadomie o rozwoju osobistym</title>\n" +
        "<meta name=\"generator\" content=\"WordPress abc\" /> <!-- leave this for stats -->\n" +
        "<link rel=\"stylesheet\" href=\"http://michalpasterski.pl/wp-content/themes/newtheme/style.css\" type=\"text/css\" media=\"screen\" />\n" +
        "<link rel=\"alternate\" type=\"application/rss+xml\" title=\"RSS 2.0\" href=\"http://michalpasterski.pl/feed/\" />\n" +
        "<link rel=\"alternate\" type=\"text/xml\" title=\"RSS .92\" href=\"http://michalpasterski.pl/feed/rss/\" />\n" +
        "<link rel=\"alternate\" type=\"application/atom+xml\" title=\"Atom 0.3\" href=\"http://michalpasterski.pl/feed/atom/\" />\n" +
        "<link rel=\"pingback\" href=\"http://michalpasterski.pl/xmlrpc.php\" />\n" +
        "<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Michalpasterski.pl | świadomie o rozwoju osobistym » Jak zwiększyć swoją pewność siebie Kanał z komentarzami\" href=\"http://michalpasterski.pl/2008/11/jak-zwiekszyc-swoja-pewnosc-siebie/feed/\" />\n" +
        "<link rel='stylesheet' id='wp-polls-css' href='http://michalpasterski.pl/wp-content/plugins/wp-polls/polls-css.css?ver=2.50' type='text/css' media='all' />\n" +
        "<script type='text/javascript' src='http://michalpasterski.pl/wp-includes/js/jquery/jquery.js?ver=1.4.2'></script>\n" +
        "<script type='text/javascript' src='http://michalpasterski.pl/wp-content/themes/newtheme/js/js.js?ver=abc'></script>\n" +
        "<link rel=\"EditURI\" type=\"application/rsd+xml\" title=\"RSD\" href=\"http://michalpasterski.pl/xmlrpc.php?rsd\" />\n" +
        "<link rel=\"wlwmanifest\" type=\"application/wlwmanifest+xml\" href=\"http://michalpasterski.pl/wp-includes/wlwmanifest.xml\" /> \n" +
        "<link rel='index' title='Michalpasterski.pl | świadomie o rozwoju osobistym' href='http://michalpasterski.pl/' />\n" +
        "<link rel='start' title='Jak tworzyć wizualizacje, które zmienią Twoje życie' href='http://michalpasterski.pl/2008/06/jak-tworzyc-wizualizacje/' />\n" +
        "<link rel='prev' title='Wyjątkowy wpis integracyjny :)' href='http://michalpasterski.pl/2008/11/wyjatkowy-wpis-integracyjny/' />\n" +
        "<link rel='next' title='O neuronach' href='http://michalpasterski.pl/2008/11/o-neuronach/' />\n" +
        "<link rel='shortlink' href='http://michalpasterski.pl/?p=217' />\n\n" +
        "<!-- All in One SEO Pack 1.6.11 by Michael Torbert of Semper Fi Web Design[306,438] -->\n" +
        "<meta name=\"description\" content=\"Jeśli czujesz, że brakuje Ci pewności siebie, koniecznie przeczytaj ten artykuł. Pokażę Ci dokładnie co należy zrobić aby zacząć wierzyć we własne możliwości!\" />\n" +
        "<meta name=\"keywords\" content=\"pewność siebie, pewny siebie, pewność, samoocena, nlp, osiąganie celów,nlp,osiąganie celów\" />\n" +
        "<link rel=\"canonical\" href=\"http://michalpasterski.pl/2008/11/jak-zwiekszyc-swoja-pewnosc-siebie/\" />\n" +
        "<!-- /all in one seo pack -->\n\n" +
        "<!-- Start Of Script Generated By cforms v11.6.1 [Oliver Seidel | www.deliciousdays.com] -->\n" +
        "<link rel=\"stylesheet\" type=\"text/css\" href=\"http://michalpasterski.pl/wp-content/plugins/cforms/styling/cforms.css\" />\n" +
        "<script type=\"text/javascript\" src=\"http://michalpasterski.pl/wp-content/plugins/cforms/js/cforms.js\"></script>\n" +
        "<!-- End Of Script Generated By cforms -->\n\n" +
        "<meta name=\"robots\" content=\"index,follow\" />\n\n" +
        " <!-- Google Ajax Search -->\n\n" +
        " \n" +
        " <link href=\"http://www.google.com/uds/css/gsearch.css\" type=\"text/css\" rel=\"stylesheet\"/>\n" +
        " <style> \n" +
        " \n" +
        " /* Width */\n" +
        " .gsc-control {\n" +
        " width: 280px;\n" +
        " overflow: hidden\n" +
        " }\n" +
        " .gs-result .gs-title,\n" +
        " .gs-result .gs-title * {\n" +
        " font-size: em;\n" +
        " color: #549AD8;\n" +
        " }\n" +
        " .gsc-results .gsc-trailing-more-results,\n" +
        " .gsc-results .gsc-trailing-more-results * {\n" +
        " color: #549AD8;\n" +
        " }\n" +
        " .gs-result a.gs-visibleUrl,\n" +
        " .gs-result .gs-visibleUrl {\n" +
        " color: #;\n" +
        " }\n" +
        " .gs-result a.gs-clusterUrl,\n" +
        " .gs-result .gs-clusterUrl {\n" +
        " color: #;\n" +
        " }\n" +
        " .gsc-resultsbox-visible {\n" +
        " display: table;\n" +
        " width: 100%;\n" +
        " overflow: hidden\n" +
        " }\n" +
        " </style>\n\n\n" +
        " <style>\n" +
        " img.gsc-branding-img {\n" +
        " display: none;\n" +
        " }\n" +
        " td.gsc-branding-text div.gsc-branding-text {\n" +
        " display: none;\n" +
        " } \n" +
        " </style>\n\n" +
        " \n" +
        " <script src='http://www.google.com/uds/api?file=uds.js&v=1.0&key=ABQIAAAANUD_QVL2ucyKWo8cO8gN3RQq1_6xPedlnuzKZU469BjL0S3e3RRAcBEuKaFsEQuOvP-1uqyW7gA6Bg' type='text/javascript'></script>\n" +
        " <!-- Google AjaxSearch Plugin for WordPress initialization -->\n" +
        " <script type='text/javascript'> \n\n\n\n\n" +
        " function OnLoad()\n" +
        " {\n" +
        " \n" +
        " var searchControl = new GSearchControl();\n" +
        " searchControl .setLinkTarget(GSearch.LINK_TARGET_SELF); \n" +
        " var webSearch = new GwebSearch(); \n" +
        " webSearch.setSiteRestriction(\"http://michalpasterski.pl\");\n" +
        " webSearch.setUserDefinedLabel(\"Results\");\n" +
        " webSearch.setUserDefinedClassSuffix(\"webSearch\");\n" +
        " var blogSearch = new GblogSearch(); \n" +
        " blogSearch.setSiteRestriction(\"http://michalpasterski.pl\");\n" +
        " blogSearch.setUserDefinedLabel(\"Blog Search\");\n" +
        " blogSearch.setUserDefinedClassSuffix(\"siteSearch\");\n" +
        " blogSearch.setResultOrder(GSearch.ORDER_BY_DATE);\n" +
        " var options = new GsearcherOptions();\n" +
        " options.setExpandMode(GSearchControl.EXPAND_MODE_OPEN);\n" +
        " searchControl.addSearcher(webSearch, options);\n" +
        " searchControl.addSearcher(blogSearch, options);\n" +
        " \n\n" +
        " var drawOptions = new GdrawOptions();\n" +
        " drawOptions.setDrawMode(GSearchControl.DRAW_MODE_TABBED);\n" +
        " searchControl.draw(document.getElementById(\"searchcontrol\"),drawOptions);\n\n" +
        " }\n" +
        " GSearch.setOnLoadCallback(OnLoad);\n\n" +
        " </script>\n" +
        " <!-- Google Maps Plugin for WordPress (end) -->\n\n" +
        "<link rel=\"stylesheet\" type=\"text/css\" href=\"http://michalpasterski.pl/wp-content/plugins/pdf24-post-to-pdf/styles/lp/default_dfl.css\" />\n" +
        "<link rel='shortcut icon' href='http://michalpasterski.pl/favicon.ico' />\n" +
        "<script type=\"text/javascript\" src=\"http://michalpasterski.pl/wp-content/plugins/wordpress-tweaks/tweaks.php?js=targetblank\"></script>\n" +
        "<style type=\"text/css\">\n" +
        ".wp-polls .pollbar {\n" +
        " margin: 1px;\n" +
        " font-size: 8px;\n" +
        " line-height: 10px;\n" +
        " height: 10px;\n" +
        " background-image: url('http://michalpasterski.pl/wp-content/plugins/wp-polls/images/default_gradient/pollbg.gif');\n" +
        " border: 1px solid #ffffff;\n" +
        "}\n" +
        "</style>\n" +
        "</head>\n" +
        "<body>Some body</body>";

    // act
    var dom = SgmlDomBuilder.BuildDocument(pageMarkup);

    // assert
    string output = _sgmlDomSerializer.Serialize(dom);
    MyAssert.AssertSubstringCount(1, output, "<html");
}
/// <summary>
/// Serializes a page whose head already carries a lower-case <c>generator</c> meta element
/// (written without a closing bracket) and asserts, with an expected count of 1, on the
/// substring <c>&lt;meta name="Generator"</c> in the output.
/// </summary>
public void Serializer_removes_existing_generator_meta_element()
{
    // arrange
    const string markup = "<html><head><meta name=\"generator\" value=\"WordPress\"</head><body></body></html>";
    var dom = SgmlDomBuilder.BuildDocument(markup);

    // act
    string output = _sgmlDomSerializer.Serialize(dom);

    // assert
    MyAssert.AssertSubstringCount(1, output, "<meta name=\"Generator\"");
}
/// <summary>
/// Serializes a page whose head carries a <c>Content-Type</c> http-equiv meta element
/// (written without a closing bracket) and asserts, with an expected count of 0, on the
/// substring <c>&lt;meta http-equiv="Content-Type"</c> in the output.
/// </summary>
public void Serializer_removes_existing_content_type_meta_element()
{
    // arrange
    const string markup = "<html><head><meta http-equiv=\"Content-Type\" value=\"UTF-8\"</head><body></body></html>";
    var dom = SgmlDomBuilder.BuildDocument(markup);

    // act
    string output = _sgmlDomSerializer.Serialize(dom);

    // assert
    MyAssert.AssertSubstringCount(0, output, "<meta http-equiv=\"Content-Type\"");
}
/// <summary>
/// Serializes an empty page with <c>DontIncludeMobileSpecificMetaElements</c> explicitly set
/// to <c>false</c> and passes the output to
/// <c>AssertHandheldFriendlyMetaElementPresence</c> with an expected presence of <c>true</c>.
/// </summary>
public void Serializer_adds_HandheldFrienly_meta_element_if_DontIncludeMobileSpecificElements_is_false()
{
    // arrange
    const string markup = "<html><head></head><body></body></html>";
    var dom = _sgmlDomBuilder.BuildDocument(markup);
    var serializationParams = new DomSerializationParams { DontIncludeMobileSpecificMetaElements = false };

    // act
    string output = _sgmlDomSerializer.SerializeDocument(dom, serializationParams);

    // assert
    AssertHandheldFriendlyMetaElementPresence(output, true);
}
/// <summary>
/// Serializes a page whose head carries a <c>viewport</c> meta element, with pretty-printing
/// enabled, and passes the output to <c>AssertViewportMetaElementPresence</c> with an
/// expected presence of <c>false</c>.
/// </summary>
/// <remarks>
/// <c>DontIncludeMobileSpecificMetaElements</c> is not set here, so the test runs with
/// whatever value <c>DomSerializationParams</c> gives it by default.
/// </remarks>
public void Serializer_removes_viewport_meta_element_if_DontIncludeMobileSpecificElements_is_false()
{
    // arrange
    const string markup = "<html><head><meta name=\"viewport\" content=\"width=1100\" /></head><body></body></html>";
    var dom = SgmlDomBuilder.BuildDocument(markup);
    var serializationParams = new DomSerializationParams { PrettyPrint = true };

    // act
    string output = _sgmlDomSerializer.Serialize(dom, serializationParams);

    // assert
    AssertViewportMetaElementPresence(output, false);
}
/// <summary>
/// Builds a document from a single <c>div</c> with class <c>sidebar</c>, runs
/// <c>StripUnlikelyCandidates</c> over it and passes the serialized result to
/// <c>AssertHtmlContentIsEmpty</c>.
/// </summary>
public void Unlikely_candidates_should_be_removed()
{
    // arrange
    const string sidebarMarkup = "<div class=\"sidebar\">Some content.</div>";
    var dom = _sgmlDomBuilder.BuildDocument(sidebarMarkup);

    // act
    _nReadabilityTranscoder.StripUnlikelyCandidates(dom);

    // assert
    AssertHtmlContentIsEmpty(_sgmlDomSerializer.SerializeDocument(dom));
}
/// <summary>
/// Builds a document from a paragraph containing the truncated character reference
/// <c>&amp;#</c> and asserts that serializing it does not throw.
/// </summary>
// NOTE(review): the [Test] attribute below is commented out, so this method is presumably
// not picked up by the test runner — confirm whether that is intentional.
// [Test]
public void Builder_handles_invalid_entity_references()
{
    // arrange
    const string markup = "<p>&#</p>";
    XDocument dom = SgmlDomBuilder.BuildDocument(markup);

    // act + assert
    Assert.DoesNotThrow(() => _sgmlDomSerializer.Serialize(dom));
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, decodes it as UTF-8 or GBK, extracts the main
/// content text with <c>NReadabilityTranscoder</c>, and stores the raw HTML and the extracted
/// text in the <c>[dbo].[BaiduNews]</c> row matching the URL. It then increments
/// <c>[NewsCount]</c> on the related <c>[dbo].[Monitor]</c> rows.
/// </summary>
/// <param name="url">Address of the page to fetch; also used as the key for both SQL updates.</param>
/// <param name="context">Job context that receives the extracted text and any error output.</param>
/// <returns>A task that completes once the page has been processed or skipped.</returns>
/// <remarks>
/// Nothing is written to the database when the response status is not 200 OK, when the decoded
/// body is empty, or when the extracted content has no node matching <c>//div/div/div/div</c>.
/// Exceptions thrown during extraction or by the SQL updates are written to
/// <paramref name="context"/> and not rethrown; exceptions from the HTTP request itself propagate.
/// </remarks>
public async Task Dowload(string url, PerformContext context)
{
    // NOTE(review): a new HttpClient per call can exhaust sockets under load; consider a shared
    // instance or IHttpClientFactory once the surrounding class can be changed.
    using (var client = new HttpClient())
    {
        client.DefaultRequestHeaders.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36");

        byte[] bytes;
        using (var response = await client.GetAsync(url))
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            // Read the whole body in one call: a single Stream.ReadAsync is not guaranteed to
            // fill the buffer, and Stream.Length requires a seekable stream.
            bytes = await response.Content.ReadAsByteArrayAsync();
        }

        var isUTF8 = IsTextUTF8(ref bytes);

        // GBK is not a built-in encoding here; it is resolved through the code-pages provider.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
        Encoding encoding = isUTF8 ? Encoding.UTF8 : Encoding.GetEncoding("GBK");

        var html = encoding.GetString(bytes);
        if (string.IsNullOrEmpty(html))
        {
            return;
        }

        var transcoder = new NReadabilityTranscoder();
        var input = new TranscodingInput(html);
        try
        {
            // NOTE(review): the built document is never used; the call is kept so that a parse
            // failure is still reported below — confirm whether it can be removed.
            new SgmlDomBuilder().BuildDocument(html);

            var result = transcoder.Transcode(input);

            var document = new HtmlDocument { OptionAutoCloseOnEnd = true };
            document.LoadHtml(result.ExtractedContent);

            var node = document.DocumentNode.SelectSingleNode("//div/div/div/div");
            if (node == null)
            {
                // Report the miss explicitly instead of failing with a NullReferenceException.
                context.WriteLine("No content node matched //div/div/div/div for " + url);
                return;
            }

            var text = node.InnerText.Trim('\r', '\n', ' ', '\t');
            context.WriteLine("抽取内容为:");
            context.WriteLine(text);

            const string cmdText = @"UPDATE [dbo].[BaiduNews] SET [Html]=@Html,[Text]=@Text WHERE [Url]=@Url";
            await _connection.ExecuteAsync(cmdText, new { Html = html, Text = text, Url = url });

            await _connection.ExecuteAsync(
                @"UPDATE a SET a.[NewsCount]=a.[NewsCount]+1 FROM [dbo].[Monitor] a JOIN [dbo].[BaiduNews] b ON a.[Tag]=b.[Keyword] WHERE b.[Url]=@Url",
                new { Url = url });
        }
        catch (Exception e)
        {
            // Best-effort job: log the failure to the job output and finish normally.
            context.WriteLine(e);
        }
    }
}