Example #1
0
        /// <summary>
        /// Loads a multi-paragraph HTML fragment (no html/body wrapper) and verifies that
        /// the document's <c>NodeHtml</c> reproduces the input text exactly.
        /// </summary>
        public void LoadTest_Fragment()
        {
            // Arrange: the fragment mixes links, an image with a multi-line style attribute,
            // character entities and a trailing "[stl.store:58]" placeholder.
            string fragment =
                "<p>\r\n" +
                "&nbsp; &nbsp;<strong><a href=\"http://www.nyautohome.com/\">南阳汽车网</a></strong>编辑从<a " +
                "href=\"http://www.nyautohome.com/store/xinkaidi\">凯迪拉克汽车南阳新凯迪4S店</a>获悉,凯迪拉克针对旗下全新豪华轿车XTS全系28T车型推出&ldquo;5050气球金融贷款计划&rdquo;。此项金融贷款服务拥有低首付、低月供、灵活还款三大优势。</p>\r\n" +
                "<p style=\"text-align: center;\">\r\n" +
                "<img alt=\"\" src=\"/upload/images/2013/09/09/20130909_091956.jpg\" style=\"width: 500px;\r\n" +
                "height: 283px;\" /></p>\r\n" +
                "<p>\r\n" +
                "以热销的<a href=\"http://www.nyautohome.com/auto/12274\" target=\"_blank\" class=\"contentTextLink\">凯迪拉克XTS</a>28T豪华型为例(官方指导价41.99万元),消费者仅需率先支付车款的50%,便可以不足21万元的价格即刻购得一款拥有真皮座椅、全景天窗、BOSE&reg;音响、10安全气囊、智能三区空调、GPS导航及倒车影像、ESS强化安全策略、配备蓝光播放器的双屏影音娱乐系统等豪华配置的<a " +
                "href=\"http://www.nyautohome.com/auto/12274\" target=\"_blank\" class=\"contentTextLink\">凯迪拉克XTS</a>。购车当年内,每月支付698元贷款利息,日供仅需20元左右,即可于一年后还清剩余50%车款。如还款时资金紧张,凯迪拉克还提供一年展期服务,延长一年还款期限。对于升级置换车主,可用现有车型抵扣部分首付款项,轻松置换一台拥有更多科技配置的豪华轿车。</p>\r\n" +
                "<p>\r\n" +
                "<a href=\"http://www.nyautohome.com/auto/12274\" target=\"_blank\" class=\"contentTextLink\">\r\n" +
                "凯迪拉克XTS</a> 28T车型搭载荣获&ldquo;沃德十佳发动机&rdquo;的2.0T涡轮增压发动机,通过凯迪拉克CUE移动互联体验、Bose\r\n" +
                "ANC主动降噪静音科技、MRC主动电磁感应悬挂、ESS强化安全策略等业界领先的创新科技,为消费者带来了革命性驾乘体验。凭借极具竞争力的配置与价格优势,<a href=\"http://www.nyautohome.com/auto/12274\" " +
                "target=\"_blank\" class=\"contentTextLink\">凯迪拉克XTS</a>上市2个多月来,销量已突破4,000辆,获得了消费者的广泛认可,成为国内豪车市场新锐。</p>\r\n" +
                "<p>\r\n" +
                "若需了解更多信息,敬请莅临凯迪拉克南阳新凯迪展厅品鉴试驾。</p>\r\n" +
                "<p>\r\n" +
                "[stl.store:58]</p>\r\n";

            // Act
            IHtmlDocument parsed = HtmlAnalyzer.Load(fragment);
            string roundTripped = parsed.NodeHtml;

            // Assert: serializing the parsed document must give back the original text.
            Assert.AreEqual<string>(fragment, roundTripped);
        }
Example #2
0
        /// <summary>
        /// Verifies that <c>NodeText</c> of a loaded document contains only the visible text:
        /// the two HTML comments and the anchor markup in the input must not appear in it.
        /// </summary>
        public void LoadTest_HtmlText()
        {
            // Arrange
            string markup = "<!--123--><p>这是<a href=\"\" target=blank>段落</a>,<!--456-->它中间包含一个注释。</p>";
            string expectedText = "这是段落,它中间包含一个注释。";

            // Act
            IHtmlDocument parsed = HtmlAnalyzer.Load(markup);

            // Assert
            Assert.AreEqual<string>(expectedText, parsed.NodeText);
        }
Example #3
0
        /// <summary>
        /// Verifies that loading an empty string yields a document with no child nodes.
        /// </summary>
        public void LoadTest_Empty()
        {
            // Act
            IHtmlDocument parsed = HtmlAnalyzer.Load(string.Empty);
            int childCount = parsed.ChildNodes.Count;

            // Assert
            Assert.AreEqual<int>(0, childCount);
        }
Example #4
0
        /// <summary>
        /// Verifies how value-less, unquoted and single-quoted attributes are serialized:
        /// the expected <c>NodeHtml</c> gives a bare attribute name its own name as value
        /// and wraps every attribute value in double quotes.
        /// </summary>
        public void LoadTest_AloneAttributeName()
        {
            // Arrange
            string markup = "<input type=\"check\" autofocus readonly class=a checked value='123' disabled />";
            string expectedHtml = "<input type=\"check\" autofocus=\"autofocus\" readonly=\"readonly\" class=\"a\" checked=\"checked\" value=\"123\" disabled=\"disabled\" />";

            // Act
            IHtmlDocument parsed = HtmlAnalyzer.Load(markup);

            // Assert
            Assert.AreEqual<string>(expectedHtml, parsed.NodeHtml);
        }
Example #5
0
        /// <summary>
        /// Loads a complete page (DOCTYPE, head, body) whose Content-Type meta tag and link
        /// tag are not self-closed, and verifies that <c>NodeHtml</c> still reproduces the
        /// input text exactly.
        /// </summary>
        public void LoadTest_Entire_Error()
        {
            // Arrange
            string page =
                "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\r\n" +
                "<html xmlns=\"http://www.w3.org/1999/xhtml\">\r\n" +
                "<head>\r\n" +
                "<title>{$metaTitle}</title>\r\n" +
                "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\r\n" +
                "<meta name=\"keywords\" content=\"{$metaKeyword}\" />\r\n" +
                "<meta name=\"description\" content=\"{$metaDescription}\" />\r\n" +
                "<script type=\"text/javascript\" src=\"site.js\"></script>\r\n" +
                "<link href=\"site.css\" rel=\"stylesheet\" type=\"text/css\">\r\n" +
                "</head>\r\n" +
                "<body id=\"body\">\r\n" +
                "<!-- body start -->\r\n" +
                "<div id=\"head\">\r\n" +
                "<!-- page header -->\r\n" +
                "Page &lt;Header&gt; <input type=\"text\" value=\"123\" />\r\n" +
                "</div>\r\n" +
                "<div id=\"footer\">\r\n" +
                "<!-- page header -->\r\n" +
                "Page &lt;Footer&gt;\r\n" +
                "</div>\r\n" +
                "<!-- body end -->\r\n" +
                "</body>\r\n" +
                "</html>\r\n";

            // Act
            IHtmlDocument parsed = HtmlAnalyzer.Load(page);
            string roundTripped = parsed.NodeHtml;

            // Assert
            Assert.AreEqual<string>(page, roundTripped);
        }
Example #6
0
        /// <summary>
        /// Verifies that <c>HtmlAnalyzer.IsLayoutContainHtmlTag</c> returns false for an
        /// empty layout string.
        /// </summary>
        public void IsLayoutContainHtmlTag_PassEmptyString_ReturnFalse()
        {
            // Arrange
            string emptyLayout = "";
            bool expectedResult = false;

            // Act
            var containsHtmlTag = HtmlAnalyzer.IsLayoutContainHtmlTag(emptyLayout);

            // Assert
            Assert.AreEqual(expectedResult, containsHtmlTag);
        }
Example #7
0
        /// <summary>
        /// Verifies that text made of whitespace and character entities is loaded as a
        /// single child node whose <c>NodeText</c> and <c>NodeHtml</c> both equal the raw
        /// input (entities are expected to stay encoded in both).
        /// </summary>
        public void LoadTest_HtmlEncode()
        {
            // Arrange
            string encodedText = "  \t&nbsp;&lt;123&gt;&nbsp;\t  ";

            // Act
            IHtmlDocument parsed = HtmlAnalyzer.Load(encodedText);

            // Assert: exactly one node, checked before it is dereferenced.
            Assert.AreEqual<int>(1, parsed.ChildNodes.Count);

            var onlyNode = parsed.ChildNodes[0];
            Assert.AreEqual<string>(encodedText, onlyNode.NodeText);
            Assert.AreEqual<string>(encodedText, onlyNode.NodeHtml);
        }
Example #8
0
        /// <summary>
        /// Verifies that whitespace-only input (spaces, tabs and CRLF line breaks) is loaded
        /// as a single child node whose <c>NodeText</c> and <c>NodeHtml</c> both equal the
        /// input unchanged.
        /// </summary>
        public void LoadTest_WhiteSpace()
        {
            // Arrange
            string whitespace = "  \t  \r\n   \t\r\n  ";

            // Act
            IHtmlDocument parsed = HtmlAnalyzer.Load(whitespace);

            // Assert: exactly one node, checked before it is dereferenced.
            Assert.AreEqual<int>(1, parsed.ChildNodes.Count);

            var onlyNode = parsed.ChildNodes[0];
            Assert.AreEqual<string>(whitespace, onlyNode.NodeText);
            Assert.AreEqual<string>(whitespace, onlyNode.NodeHtml);
        }
Example #9
0
        /// <summary>
        /// Verifies that assigning <c>Value</c> on attributes obtained from a loaded
        /// document is reflected in the document's <c>NodeHtml</c>.
        /// </summary>
        public void LoadTest_SetAttributeValue()
        {
            // Arrange
            string originalHtml = "<p>123<a target=\"_blank\" href=\"../../1.html\">000</a>456</p>";
            string expectedHtml = "<p>123<a target=\"_self\" href=\"../../2.html\">000</a>456</p>";
            IHtmlDocument parsed = HtmlAnalyzer.Load(originalHtml);

            // Act: rewrite the "target" attribute of the first anchor, then its "href".
            IHtmlAttribute targetAttribute = parsed.GetElementsByTagName("a").First().GetAttributesByName("target").First();
            targetAttribute.Value = "_self";

            IHtmlAttribute hrefAttribute = parsed.GetElementsByTagName("a").First().GetAttributesByName("href").First();
            hrefAttribute.Value = "../../2.html";

            // Assert
            Assert.AreEqual<string>(expectedHtml, parsed.NodeHtml);
        }
Example #10
0
        /// <summary>
        /// Verifies that <c>HtmlAnalyzer.GetHtmlPageTitle</c> returns null when the layout
        /// has a head section but no title element.
        /// </summary>
        public void GetHtmlPageTitle_PassHtmlLayout_WithoutTitle_ReturnNull()
        {
            // Arrange
            string layoutWithoutTitle = @"<!DOCTYPE html ><html>
                <head resURL = '/static/cde7a081' data-rooturl='' data-resurl='/static/cde7a081'>
                <link rel = 'stylesheet' href = '/static/cde7a081/css/layout-common.css' type = 'text/css' />
                </head><body><a href = 'api/' > REST API</a></body></html> ";
            string noTitle = null;

            // Act
            string foundTitle = HtmlAnalyzer.GetHtmlPageTitle(layoutWithoutTitle);

            // Assert
            Assert.AreEqual(noTitle, foundTitle);
        }
Example #11
0
        /// <summary>
        /// Verifies that <c>HtmlAnalyzer.IsLayoutContainHtmlTag</c> returns false for a
        /// layout that has a DOCTYPE, head and body but no html element.
        /// </summary>
        public void IsLayoutContainHtmlTag_PassHtmlLayout_HasNoHtmlTag_ReturnFalse()
        {
            // Arrange
            string layoutWithoutHtmlTag = @"<!DOCTYPE html >
                <head resURL = '/static/cde7a081' data-rooturl='' data-resurl='/static/cde7a081'>
                <title > Jenkins </ title >< link rel = 'stylesheet' href = '/static/cde7a081/css/layout-common.css' type = 'text/css' />
                <link rel='stylesheet' href='/static/cde7a081/css/style.css' type='text/css' />
                <link rel='shortcut icon' href='/static/cde7a081/favicon.ico' type='image/vnd.microsoft.icon' />
                <link color='black' rel='mask-icon' href='/images/mask-icon.svg' />
                <script> var isRunAsTest = false; var rootURL = ''; var resURL = '/static/cde7a081';</script >
                <script src = '/static/cde7a081/scripts/prototype.js' type='text/javascript'></script >
                <script src = '/static/cde7a081/scripts/behavior.js' type='text/javascript'></script >
                </head><body><a href = 'api/' > REST API</a></body>' ";
            bool expectedResult = false;

            // Act
            var containsHtmlTag = HtmlAnalyzer.IsLayoutContainHtmlTag(layoutWithoutHtmlTag);

            // Assert
            Assert.AreEqual(expectedResult, containsHtmlTag);
        }
 /// <summary>
 /// Main function of the worker thread (marked as unfinished by the original author).
 /// Loops until <c>IsWork</c> is true. On each pass it requests a URL record from the
 /// task queue; if one is returned and <c>AddUrl</c> accepts it, the page is downloaded,
 /// analyzed, every URL found is posted back to the queue, and the page is saved.
 /// The loop ends once <c>iListener.OnCloseWork()</c> returns true.
 /// </summary>
 /// <param name="state">Not read by this method.</param>
 public void startWork(object state)
 {
     while (IsWork != true)
     {
         // Process pending window messages on every pass.
         System.Windows.Forms.Application.DoEvents();

         // No task in hand: request one from the task queue.
         if (IsWork == false)
         {
             url_Info = getQueue();
             if (url_Info != null)
             {
                 // A task is now being executed; record the crawl time on it.
                 url_Info.lastTime = DateTime.Now.ToString();

                 if (AddUrl(url_Info) == true)
                 {
                     GetHtml      pageFetcher = new GetHtml();
                     string       pageHtml    = pageFetcher.GetPage(url_Info.meUrl);
                     HtmlAnalyzer analyzer    = new HtmlAnalyzer(pageHtml, url_Info.meUrl);

                     // The analyzer found URLs: queue each one as a new task.
                     if (analyzer.NewUrl != null)
                     {
                         for (int i = 0; i < analyzer.NewUrl.Count; i++)
                         {
                             Url_info discovered = new Url_info
                             {
                                 meUrl   = analyzer.NewUrl[i].ToString(),   // URL extracted by the analyzer
                                 fromUrl = url_Info.meUrl,                  // page it was found on
                                 sid     = url_Info.sid + 1                 // one level deeper than its source
                             };

                             // Post the URL record to the task queue.
                             postQueue(discovered);
                         }
                     }

                     pageFetcher.SavePage(pageHtml, url_Info.meUrl);
                 }
             }
         }

         // Ask the listener whether work should stop; setting IsWork ends the loop.
         if (iListener.OnCloseWork() == true)
         {
             IsWork = true;
         }
     }
 }
Example #13
0
        /// <summary>
        /// Program entry point. Queries <c>GoogleSerpClient</c> for "kuchen", downloads and
        /// analyzes every returned link into an <c>HtmlPage</c>, then runs
        /// <c>TextAnalyzer.GetWords</c> over the paragraphs of each page.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            // NOTE(review): the second argument (100) is presumably the number of results — confirm.
            var searchClient = new GoogleSerpClient();
            var searchResult = searchClient.Get("kuchen", 100);

            // Download and analyze each result; a new client and analyzer are created per link.
            var pages = new List<HtmlPage>();
            foreach (var resultLink in searchResult.Links)
            {
                var downloader = new AdvancedWebClient();
                var response   = downloader.Get(resultLink.UrlAsString);

                var pageAnalyzer = new HtmlAnalyzer();
                pages.Add(pageAnalyzer.AnalyzeHtmlPage(response.HtmlContent));
            }

            // Extract words from every analyzed page; the result is not used further here.
            var wordExtractor = new TextAnalyzer();
            foreach (var page in pages)
            {
                var pageWords = wordExtractor.GetWords(page.Paragraphs, true);
            }
        }
Example #14
0
 /// <summary>
 /// Verifies that <c>HtmlAnalyzer.GetHtmlPageTitle</c> throws
 /// <see cref="ArgumentNullException"/> when given a null layout.
 /// </summary>
 public void GetHtmlPageTitle_PassNull()
 {
     // Arrange: the call under test, deferred so the assertion can observe the exception.
     Func<object> getTitleOfNull = () => HtmlAnalyzer.GetHtmlPageTitle(null);

     // Act + Assert
     Assert.ThrowsException<ArgumentNullException>(getTitleOfNull);
 }
Example #15
0
 /// <summary>
 /// Calls <c>HtmlAnalyzer.Load</c> with a null string.
 /// </summary>
 /// <remarks>
 /// NOTE(review): the body contains no assertion. The expected outcome is presumably
 /// declared outside this method (for example by an expected-exception attribute) —
 /// confirm against the test class.
 /// </remarks>
 public void LoadTest_Null()
 {
     // The cast keeps the argument typed as string, as in a call with a string variable.
     IHtmlDocument parsed = HtmlAnalyzer.Load((string)null);
 }
Example #16
0
 /// <summary>
 /// Verifies that <c>HtmlAnalyzer.IsLayoutContainHtmlTag</c> throws
 /// <see cref="ArgumentNullException"/> when given a null layout.
 /// </summary>
 public void IsLayoutContainHtmlTag_PassNull()
 {
     // Arrange: the call under test, deferred so the assertion can observe the exception.
     Func<object> checkNullLayout = () => HtmlAnalyzer.IsLayoutContainHtmlTag(null);

     // Act + Assert
     Assert.ThrowsException<ArgumentNullException>(checkNullLayout);
 }
Example #17
0
 /// <summary>
 /// Loads the <c>HtmlAnalyzer_OnlyJavaScript</c> resource through
 /// <c>HtmlAnalyzer.Load</c>. The body makes no assertion on the result.
 /// </summary>
 public void LoadTest_OnlyJavaScript()
 {
     var scriptOnlyContent = Properties.Resources.HtmlAnalyzer_OnlyJavaScript;

     // Known bug, not yet fixed: JavaScript content cannot be parsed.
     IHtmlDocument parsed = HtmlAnalyzer.Load(scriptOnlyContent);
 }