/// <summary>
/// Loads a multi-paragraph HTML fragment (no root element) and checks that
/// the document's NodeHtml is exactly the text that was loaded.
/// </summary>
public void LoadTest_Fragment()
{
    // The fragment is assembled piece by piece; every piece is part of the expected output.
    string fragment =
        "<p>\r\n"
        + " <strong><a href=\"http://www.nyautohome.com/\">南阳汽车网</a></strong>编辑从<a "
        + "href=\"http://www.nyautohome.com/store/xinkaidi\">凯迪拉克汽车南阳新凯迪4S店</a>获悉,凯迪拉克针对旗下全新豪华轿车XTS全系28T车型推出“5050气球金融贷款计划”。此项金融贷款服务拥有低首付、低月供、灵活还款三大优势。</p>\r\n"
        + "<p style=\"text-align: center;\">\r\n"
        + "<img alt=\"\" src=\"/upload/images/2013/09/09/20130909_091956.jpg\" style=\"width: 500px;\r\n"
        + "height: 283px;\" /></p>\r\n"
        + "<p>\r\n"
        + "以热销的<a href=\"http://www.nyautohome.com/auto/12274\" target=\"_blank\" class=\"contentTextLink\">凯迪拉克XTS</a>28T豪华型为例(官方指导价41.99万元),消费者仅需率先支付车款的50%,便可以不足21万元的价格即刻购得一款拥有真皮座椅、全景天窗、BOSE®音响、10安全气囊、智能三区空调、GPS导航及倒车影像、ESS强化安全策略、配备蓝光播放器的双屏影音娱乐系统等豪华配置的<a "
        + "href=\"http://www.nyautohome.com/auto/12274\" target=\"_blank\" class=\"contentTextLink\">凯迪拉克XTS</a>。购车当年内,每月支付698元贷款利息,日供仅需20元左右,即可于一年后还清剩余50%车款。如还款时资金紧张,凯迪拉克还提供一年展期服务,延长一年还款期限。对于升级置换车主,可用现有车型抵扣部分首付款项,轻松置换一台拥有更多科技配置的豪华轿车。</p>\r\n"
        + "<p>\r\n"
        + "<a href=\"http://www.nyautohome.com/auto/12274\" target=\"_blank\" class=\"contentTextLink\">\r\n"
        + "凯迪拉克XTS</a> 28T车型搭载荣获“沃德十佳发动机”的2.0T涡轮增压发动机,通过凯迪拉克CUE移动互联体验、Bose\r\n"
        + "ANC主动降噪静音科技、MRC主动电磁感应悬挂、ESS强化安全策略等业界领先的创新科技,为消费者带来了革命性驾乘体验。凭借极具竞争力的配置与价格优势,<a href=\"http://www.nyautohome.com/auto/12274\" "
        + "target=\"_blank\" class=\"contentTextLink\">凯迪拉克XTS</a>上市2个多月来,销量已突破4,000辆,获得了消费者的广泛认可,成为国内豪车市场新锐。</p>\r\n"
        + "<p>\r\n"
        + "若需了解更多信息,敬请莅临凯迪拉克南阳新凯迪展厅品鉴试驾。</p>\r\n"
        + "<p>\r\n"
        + "[stl.store:58]</p>\r\n";

    IHtmlDocument parsed = HtmlAnalyzer.Load(fragment);

    // Round trip: the serialized document must equal the loaded text.
    Assert.AreEqual<string>(fragment, parsed.NodeHtml);
}
/// <summary>
/// Loads markup containing comments and an anchor, and checks that
/// NodeText equals the expected plain text with neither tags nor comments.
/// </summary>
public void LoadTest_HtmlText()
{
    const string markup = "<!--123--><p>这是<a href=\"\" target=blank>段落</a>,<!--456-->它中间包含一个注释。</p>";
    const string expectedText = "这是段落,它中间包含一个注释。";

    IHtmlDocument parsed = HtmlAnalyzer.Load(markup);

    Assert.AreEqual<string>(expectedText, parsed.NodeText);
}
/// <summary>
/// Loads an empty string and checks that the resulting document has no child nodes.
/// </summary>
public void LoadTest_Empty()
{
    string emptyMarkup = string.Empty;

    IHtmlDocument parsed = HtmlAnalyzer.Load(emptyMarkup);

    Assert.AreEqual<int>(0, parsed.ChildNodes.Count);
}
/// <summary>
/// Loads an input element that mixes value-less, unquoted, single-quoted and
/// double-quoted attributes, and checks that NodeHtml matches the expected
/// form in which every attribute carries a double-quoted value.
/// </summary>
public void LoadTest_AloneAttributeName()
{
    const string markup = "<input type=\"check\" autofocus readonly class=a checked value='123' disabled />";
    const string expectedHtml = "<input type=\"check\" autofocus=\"autofocus\" readonly=\"readonly\" class=\"a\" checked=\"checked\" value=\"123\" disabled=\"disabled\" />";

    IHtmlDocument parsed = HtmlAnalyzer.Load(markup);

    Assert.AreEqual<string>(expectedHtml, parsed.NodeHtml);
}
/// <summary>
/// Loads a complete page that contains irregular markup (meta and link
/// elements without a closing slash, and bare angle-bracketed words inside
/// the text) and checks that NodeHtml is exactly the text that was loaded.
/// </summary>
public void LoadTest_Entire_Error()
{
    string page =
        "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\r\n"
        + "<html xmlns=\"http://www.w3.org/1999/xhtml\">\r\n"
        + "<head>\r\n"
        + "<title>{$metaTitle}</title>\r\n"
        + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\r\n"
        + "<meta name=\"keywords\" content=\"{$metaKeyword}\" />\r\n"
        + "<meta name=\"description\" content=\"{$metaDescription}\" />\r\n"
        + "<script type=\"text/javascript\" src=\"site.js\"></script>\r\n"
        + "<link href=\"site.css\" rel=\"stylesheet\" type=\"text/css\">\r\n"
        + "</head>\r\n"
        + "<body id=\"body\">\r\n"
        + "<!-- body start -->\r\n"
        + "<div id=\"head\">\r\n"
        + "<!-- page header -->\r\n"
        + "Page <Header> <input type=\"text\" value=\"123\" />\r\n"
        + "</div>\r\n"
        + "<div id=\"footer\">\r\n"
        + "<!-- page header -->\r\n"
        + "Page <Footer>\r\n"
        + "</div>\r\n"
        + "<!-- body end -->\r\n"
        + "</body>\r\n"
        + "</html>\r\n";

    IHtmlDocument parsed = HtmlAnalyzer.Load(page);

    // Round trip: the serialized document must equal the loaded text.
    Assert.AreEqual<string>(page, parsed.NodeHtml);
}
/// <summary>
/// Checks that IsLayoutContainHtmlTag yields false for an empty layout string.
/// </summary>
public void IsLayoutContainHtmlTag_PassEmptyString_ReturnFalse()
{
    // Arrange
    const bool expectedResult = false;
    string emptyLayout = "";

    // Act
    var actualResult = HtmlAnalyzer.IsLayoutContainHtmlTag(emptyLayout);

    // Assert
    Assert.AreEqual(expectedResult, actualResult);
}
/// <summary>
/// Loads text in which angle brackets surround a number and checks that the
/// document holds exactly one child node whose NodeText and NodeHtml both
/// equal the loaded text.
/// </summary>
public void LoadTest_HtmlEncode()
{
    string rawText = " \t <123> \t ";

    IHtmlDocument parsed = HtmlAnalyzer.Load(rawText);

    Assert.AreEqual<int>(1, parsed.ChildNodes.Count);
    Assert.AreEqual<string>(rawText, parsed.ChildNodes[0].NodeText);
    Assert.AreEqual<string>(rawText, parsed.ChildNodes[0].NodeHtml);
}
/// <summary>
/// Loads a string made only of spaces, tabs and line breaks and checks that
/// the document holds exactly one child node whose NodeText and NodeHtml both
/// equal the loaded text.
/// </summary>
public void LoadTest_WhiteSpace()
{
    string blankText = " \t \r\n \t\r\n ";

    IHtmlDocument parsed = HtmlAnalyzer.Load(blankText);

    Assert.AreEqual<int>(1, parsed.ChildNodes.Count);
    Assert.AreEqual<string>(blankText, parsed.ChildNodes[0].NodeText);
    Assert.AreEqual<string>(blankText, parsed.ChildNodes[0].NodeHtml);
}
/// <summary>
/// Loads a paragraph containing an anchor, overwrites the anchor's "target"
/// and "href" attribute values, and checks that NodeHtml reflects both changes.
/// </summary>
public void LoadTest_SetAttributeValue()
{
    const string originalHtml = "<p>123<a target=\"_blank\" href=\"../../1.html\">000</a>456</p>";
    const string expectedHtml = "<p>123<a target=\"_self\" href=\"../../2.html\">000</a>456</p>";

    IHtmlDocument parsed = HtmlAnalyzer.Load(originalHtml);

    // Each attribute is looked up from the document afresh, then updated.
    IHtmlAttribute targetAttribute = parsed.GetElementsByTagName("a").First().GetAttributesByName("target").First();
    targetAttribute.Value = "_self";

    IHtmlAttribute hrefAttribute = parsed.GetElementsByTagName("a").First().GetAttributesByName("href").First();
    hrefAttribute.Value = "../../2.html";

    Assert.AreEqual<string>(expectedHtml, parsed.NodeHtml);
}
/// <summary>
/// Checks that GetHtmlPageTitle yields null for a layout that has html, head
/// and body elements but no title element.
/// </summary>
public void GetHtmlPageTitle_PassHtmlLayout_WithoutTitle_ReturnNull()
{
    // Arrange
    const string expected = null;
    string layoutWithoutTitle = @"<!DOCTYPE html ><html> <head resURL = '/static/cde7a081' data-rooturl='' data-resurl='/static/cde7a081'> <link rel = 'stylesheet' href = '/static/cde7a081/css/layout-common.css' type = 'text/css' /> </head><body><a href = 'api/' > REST API</a></body></html> ";

    // Act
    string actual = HtmlAnalyzer.GetHtmlPageTitle(layoutWithoutTitle);

    // Assert
    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Checks that IsLayoutContainHtmlTag yields false for a layout that has
/// head and body elements but no html element.
/// </summary>
public void IsLayoutContainHtmlTag_PassHtmlLayout_HasNoHtmlTag_ReturnFalse()
{
    // Arrange
    const bool expectedResult = false;
    string layoutWithoutHtmlTag = @"<!DOCTYPE html > <head resURL = '/static/cde7a081' data-rooturl='' data-resurl='/static/cde7a081'> <title > Jenkins </ title >< link rel = 'stylesheet' href = '/static/cde7a081/css/layout-common.css' type = 'text/css' /> <link rel='stylesheet' href='/static/cde7a081/css/style.css' type='text/css' /> <link rel='shortcut icon' href='/static/cde7a081/favicon.ico' type='image/vnd.microsoft.icon' /> <link color='black' rel='mask-icon' href='/images/mask-icon.svg' /> <script> var isRunAsTest = false; var rootURL = ''; var resURL = '/static/cde7a081';</script > <script src = '/static/cde7a081/scripts/prototype.js' type='text/javascript'></script > <script src = '/static/cde7a081/scripts/behavior.js' type='text/javascript'></script > </head><body><a href = 'api/' > REST API</a></body>' ";

    // Act
    var actualResult = HtmlAnalyzer.IsLayoutContainHtmlTag(layoutWithoutHtmlTag);

    // Assert
    Assert.AreEqual(expectedResult, actualResult);
}
/// <summary>
/// Main function of the worker thread (marked "unfinished" by the original author).
/// Repeatedly takes a URL record from the task queue, downloads and analyzes
/// the page, posts every URL found back to the queue, and saves the page.
/// The loop ends once IsWork becomes true, which this method does when
/// iListener.OnCloseWork() returns true.
/// </summary>
/// <param name="state">Not read by this method.</param>
public void startWork(object state)
{
    while (IsWork != true)
    {
        // Let the Windows Forms message queue be processed on every pass.
        System.Windows.Forms.Application.DoEvents();

        if (IsWork == false)
        {
            // No task in hand: request one from the task queue.
            url_Info = getQueue();
            if (url_Info != null)
            {
                // A task is now being executed; record the crawl time.
                url_Info.lastTime = DateTime.Now.ToString();

                if (AddUrl(url_Info) == true)
                {
                    GetHtml fetcher = new GetHtml();
                    string pageHtml = fetcher.GetPage(url_Info.meUrl);
                    HtmlAnalyzer htmlAnalyzer = new HtmlAnalyzer(pageHtml, url_Info.meUrl);

                    // Only post follow-up tasks when the analyzer produced a URL list.
                    if (htmlAnalyzer.NewUrl != null)
                    {
                        for (int index = 0; index < htmlAnalyzer.NewUrl.Count; index++)
                        {
                            Url_info discovered = new Url_info
                            {
                                meUrl = htmlAnalyzer.NewUrl[index].ToString(), // URL reported by the analyzer
                                fromUrl = url_Info.meUrl,                      // source page
                                sid = url_Info.sid + 1                         // one level deeper than the source
                            };

                            // Deliver the URL record to the task queue.
                            postQueue(discovered);
                        }
                    }

                    fetcher.SavePage(pageHtml, url_Info.meUrl);
                }
            }
        }

        // Setting IsWork to true makes the loop condition fail on the next check.
        if (iListener.OnCloseWork() == true)
        {
            IsWork = true;
        }
    }
}
/// <summary>
/// Entry point: queries GoogleSerpClient for "kuchen" (second argument 100),
/// downloads and analyzes every returned link, then runs TextAnalyzer.GetWords
/// over the paragraphs of each analyzed page. The GetWords results are not used.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
static void Main(string[] args)
{
    var searchClient = new GoogleSerpClient();
    var searchResult = searchClient.Get("kuchen", 100);

    // Phase 1: download and analyze every link from the search result.
    var pages = new List<HtmlPage>();
    foreach (var hit in searchResult.Links)
    {
        var downloader = new AdvancedWebClient();
        var response = downloader.Get(hit.UrlAsString);

        var analyzer = new HtmlAnalyzer();
        pages.Add(analyzer.AnalyzeHtmlPage(response.HtmlContent));
    }

    // Phase 2: extract words from each page's paragraphs.
    var wordExtractor = new TextAnalyzer();
    foreach (var page in pages)
    {
        wordExtractor.GetWords(page.Paragraphs, true);
    }
}
/// <summary>
/// Checks that GetHtmlPageTitle throws ArgumentNullException when given null.
/// </summary>
public void GetHtmlPageTitle_PassNull()
{
    Assert.ThrowsException<ArgumentNullException>(
        () => HtmlAnalyzer.GetHtmlPageTitle(null));
}
/// <summary>
/// Calls HtmlAnalyzer.Load with a null string.
/// </summary>
public void LoadTest_Null()
{
    // NOTE(review): no assertion is visible in this method; presumably the expected
    // outcome is declared outside the method body (e.g. by an attribute) - confirm.
    string nullMarkup = null;

    IHtmlDocument parsed = HtmlAnalyzer.Load(nullMarkup);
}
/// <summary>
/// Checks that IsLayoutContainHtmlTag throws ArgumentNullException when given null.
/// </summary>
public void IsLayoutContainHtmlTag_PassNull()
{
    Assert.ThrowsException<ArgumentNullException>(
        () => HtmlAnalyzer.IsLayoutContainHtmlTag(null));
}
/// <summary>
/// Loads the HtmlAnalyzer_OnlyJavaScript resource through HtmlAnalyzer.Load.
/// Nothing is asserted about the resulting document.
/// </summary>
public void LoadTest_OnlyJavaScript()
{
    // Known, still-unrepaired bug noted by the original author: JavaScript contents cannot be parsed.
    var scriptOnlyMarkup = Properties.Resources.HtmlAnalyzer_OnlyJavaScript;

    IHtmlDocument parsed = HtmlAnalyzer.Load(scriptOnlyMarkup);
}