// Loads StyleTest1.html from the current directory, data-binds it with a null
// argument, and verifies that no element matching ".invisible" is left in the document.
public void VisibleTest()
{
    var path = Path.Combine( Environment.CurrentDirectory, "StyleTest1.html" );
    var document = new JumonyParser().LoadDocument( path );

    document.DataBind( null );

    // Assert.AreEqual takes (expected, actual); the expected count goes first so
    // that a failure message reports the two values the right way round.
    Assert.AreEqual( 0, document.Find( ".invisible" ).Count() );
}
// Sample rows of the directory-listing page this method parses:
//
//   <tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="01CreateScreen/">01CreateScreen/</a></td><td align="right">2016-01-11 10:23 </td><td align="right"> - </td><td> </td></tr>
//   <tr><td valign="top"><img src="/icons/text.gif" alt="[TXT]"></td><td><a href="Test_money.py">Test_money.py</a></td><td align="right">2016-01-08 15:53 </td><td align="right">1.1K</td><td> </td></tr>
//
/// <summary>
/// Downloads the directory-listing page at <paramref name="url"/> and returns one
/// <c>Resource</c> per row whose icon is <c>/icons/folder.gif</c> (folders) or
/// <c>/icons/text.gif</c> (text files). Folders are returned first, then files.
/// </summary>
/// <param name="url">
/// Address of the listing page, e.g. "http://192.168.1.42/testpage/Script".
/// A trailing slash is optional.
/// </param>
/// <returns>
/// The entries found. <c>LastModified</c> is only filled in for file entries,
/// from the first right-aligned cell of the row.
/// </returns>
public static List<Resource> GetDirectoryContents(string url)
{
    string pageHtml;

    // WebClient is IDisposable; release it as soon as the page has been downloaded.
    using (WebClient client = new WebClient())
    {
        client.Credentials = CredentialCache.DefaultCredentials;
        byte[] pageData = client.DownloadData(url);

        // The page is decoded as UTF-8.
        pageHtml = Encoding.UTF8.GetString(pageData);
    }

    // Entry URLs are built by appending the entry name to the listing URL, so the
    // base must end with a slash; otherwise ".../Script" + "Test_money.py" would
    // produce ".../ScriptTest_money.py".
    string baseUrl = url.EndsWith("/", StringComparison.Ordinal) ? url : url + "/";

    var htmlSource = new JumonyParser().Parse(pageHtml);
    List<Resource> resources = new List<Resource>();

    // Folder rows: img -> td -> tr, then the first link in that row carries the name.
    foreach (var icon in htmlSource.Find("img[src=/icons/folder.gif]"))
    {
        var row = icon.Parent().Parent();
        string name = row.Find("a").ElementAt(0).InnerText();

        resources.Add(new Resource
        {
            Name = name,
            Url = baseUrl + name,
            IsFolder = true
        });
    }

    // File rows: same layout, plus the modification time in the first right-aligned cell.
    foreach (var icon in htmlSource.Find("img[src=/icons/text.gif]"))
    {
        var row = icon.Parent().Parent();
        string name = row.Find("a").ElementAt(0).InnerText();
        string modifiedText = row.Find("td[align=right]").ElementAt(0).InnerText();

        resources.Add(new Resource
        {
            Name = name,
            Url = baseUrl + name,
            IsFolder = false,
            // The timestamp ("2016-01-08 15:53") is machine-formatted, so parse it
            // independently of the current thread culture.
            LastModified = DateTime.Parse(modifiedText, System.Globalization.CultureInfo.InvariantCulture)
        });
    }

    return resources;
}
/// <summary>
/// Downloads the release listing at http://192.168.1.40/iwu_android/, takes the
/// last folder entry on the page as the release to offer, and builds both the APK
/// download address and a confirmation prompt for it.
/// </summary>
/// <param name="url">
/// Overwritten with the address of <c>apk/app-release.apk</c> inside the selected
/// release folder. The incoming value is not read.
/// </param>
/// <returns>
/// A prompt (in Chinese) showing the release folder name and the text of the first
/// right-aligned cell of its row, and asking whether to download.
/// </returns>
public static string CheckAPK(ref string url)
{
    string downloadurl = "http://192.168.1.40/iwu_android/";
    string pageHtml;

    // WebClient is IDisposable; release it as soon as the page has been downloaded.
    using (WebClient client = new WebClient())
    {
        client.Credentials = CredentialCache.DefaultCredentials;
        byte[] pageData = client.DownloadData(downloadurl);

        // The page is decoded as UTF-8.
        pageHtml = Encoding.UTF8.GetString(pageData);
    }

    var htmlSource = new JumonyParser().Parse(pageHtml);

    // The last folder row is used; this presumably relies on the listing being
    // sorted so that the newest release comes last — confirm against the server.
    var latestIcon = htmlSource.Find("img[src=/icons/folder.gif]").Last();
    var row = latestIcon.Parent().Parent();

    string releaseUrl = row.Find("a[href]").First().InnerText();
    string time = row.Find("td[align=right]").ElementAt(0).InnerText();

    url = downloadurl + releaseUrl + "apk/app-release.apk";
    return "最新版本号:" + releaseUrl + "\n 版本时间:" + time +"\n是否确定下载?";
}
// Benchmark entry point: loads http://www.sina.com.cn/ (decoded as GB2312) and
// times three ways of evaluating the same ten CSS selectors, 200 rounds each,
// printing the elapsed time after every pass and then waiting for a key press.
static void Main( string[] args )
{
    var document = new JumonyParser().LoadDocument( "http://www.sina.com.cn/", Encoding.GetEncoding( "GB2312" ) );

    // Selectors evaluated in every round, in this order.
    string[] selectors =
    {
        "body p a",
        "p > a",
        "p[class] a",
        "p a[href]",
        "p + a",
        "div a",
        "p div a",
        "a img[src]",
        "div img",
        "body img[src]"
    };
    const int rounds = 200;

    Stopwatch watch = new Stopwatch();

    // Pass 1: FilterBy applied directly to Descendants().
    watch.Restart();
    for ( int round = 0; round < rounds; round++ )
    {
        // Every round also materializes all descendants once; this is part of the timed work.
        document.Descendants().ToArray();

        foreach ( var selector in selectors )
        {
            document.Descendants().FilterBy( selector ).FirstOrDefault();
        }
    }
    watch.Stop();
    Console.WriteLine( watch.Elapsed );

    // Pass 2: FilterBy applied to Descendants() materialized into an array first.
    watch.Restart();
    for ( int round = 0; round < rounds; round++ )
    {
        document.Descendants().ToArray();

        foreach ( var selector in selectors )
        {
            document.Descendants().ToArray().FilterBy( selector ).FirstOrDefault();
        }
    }
    watch.Stop();
    Console.WriteLine( watch.Elapsed );

    // Pass 3: document.Find with the same selectors.
    watch.Restart();
    for ( int round = 0; round < rounds; round++ )
    {
        document.Descendants().ToArray();

        foreach ( var selector in selectors )
        {
            document.Find( selector ).FirstOrDefault();
        }
    }
    watch.Stop();
    Console.WriteLine( watch.Elapsed );

    Console.ReadKey();
}
// Parses SpecificationTest8.html from the current directory and checks that:
//   - the single <div> ends up with exactly one attribute, and
//   - "div a" yields exactly two links whose texts are "Test1" and " \"Test2".
// The assertion messages (in Chinese) read "an illegal attribute was wrongly parsed"
// and "quoted content that is not part of an attribute value was wrongly parsed".
public void SpecificationTest8()
{
    var path = Path.Combine( Environment.CurrentDirectory, "SpecificationTest8.html" );
    var document = new JumonyParser().LoadDocument( path );

    // Assert.AreEqual takes (expected, actual, message); expected values go first
    // so that failure output labels the two values correctly.
    Assert.AreEqual( 1, document.FindSingle( "div" ).Attributes().Count(), "错误的解析了非法的属性" );

    var links = document.Find( "div a" ).ToArray();

    Assert.AreEqual( 2, links.Length, "错误的解析了不属于属性值的引用内容" );
    Assert.AreEqual( "Test1", links[0].InnerText(), "错误的解析了不属于属性值的引用内容" );
    Assert.AreEqual( " \"Test2", links[1].InnerText(), "错误的解析了不属于属性值的引用内容" );
}
/// <summary>
/// Single-company lookup: searches for <paramref name="companyName"/> through a proxy,
/// follows the resulting detail link, extracts the cells of the "f-lbiao" tables and
/// stores the resulting record via <c>CrawlerDomain</c>.
/// The whole attempt is repeated (with a freshly selected proxy) whenever a response is
/// rejected or an exception is thrown; this method itself imposes no retry limit.
/// </summary>
/// <param name="companyName">Company name submitted as the search query.</param>
public void SingelSearch(string companyName)
{
    var httpClient = new HttpClient();
    // 1000 * 5 — presumably a 5-second timeout expressed in milliseconds; confirm the unit.
    httpClient.Setting.Timeout = 1000 * 5;
    // Initial POST to firsturl; the response is discarded.
    // NOTE(review): presumably this primes cookies/session state on httpClient — confirm.
    httpClient.Create<string>(HttpMethod.Post, firsturl).Send();
    // Retry loop: "continue" starts a new attempt, "break" ends the lookup.
    while (true)
    {
        // A fresh result entity is built for every attempt.
        var targetModel = new CrawlerEntity
        {
            搜索名称 = companyName,
            操作人姓名 = TaskEntity.OperatorName,
            入爬行库时间 = TaskEntity.CreateTime,
            TaskGuid = TaskEntity.Unique
        };
        try
        {
            // Proxy handling
            var proxyEntity = new ProxyDomain().GetByRandom();
            // No stored proxy available: fall back to fetching one online.
            // NOTE(review): if the fallback also returns null, the WriteLine below throws
            // and the catch block retries — confirm that is acceptable.
            if (proxyEntity == null)
            {
                Console.WriteLine("在线代理临时获取策略启动。");
                proxyEntity = Proxy.Proxy.GetInstance().GetHttProxyEntity();
                Console.WriteLine("线上获取到了代理:{0}:{1}", proxyEntity.IpAddress, proxyEntity.Port);
            }
            httpClient.Setting.Proxy = new WebProxy(proxyEntity.IpAddress, proxyEntity.Port);
            // Search request: POST the company name to targetUrl.
            var resultBody = httpClient.Create<string>(HttpMethod.Post, targetUrl, data: new { queryStr = targetModel.搜索名称, module = "", idFlag = "qyxy" }).Send();
            // Invalid response: drop this proxy and retry.
            if (!resultBody.IsValid())
            {
                RemoveOldIp(proxyEntity);
                continue;
            }
            // ValidText returning true is treated as a failed response (proxy dropped, retry).
            // NOTE(review): presumably it detects a block/verification page — confirm against ValidText.
            if (ValidText(resultBody.Result))
            {
                RemoveOldIp(proxyEntity);
                continue;
            }
            // Extract the second-level (detail page) links
            var parser = new JumonyParser();
            var urls = parser.Parse(resultBody.Result).Find("li a").ToList();
            var nextUrl = "";
            // No result links: hand the entity to AddNull and stop.
            if (urls.Count < 1)
            {
                AddNull(targetModel);
                break;
            }
            // Each iteration overwrites the previous values, so after the loop only the
            // last link's text and href remain.
            foreach (var htmlElement in urls)
            {
                targetModel.名称 = htmlElement.InnerText();
                nextUrl = url + htmlElement.Attribute("href").AttributeValue;
            }
            // Fetch the target detail page: only the query string of (firsturl + nextUrl)
            // is reused, appended to zhuUrl.
            var resultsecondBody = httpClient.Create<string>(HttpMethod.Get, zhuUrl + new Uri(firsturl + nextUrl).Query).Send();
            // The same query string, parsed into name/value pairs for the id lookup below.
            var nameValueCollection = new NameValueCollection(URL.GetQueryString(new Uri(firsturl + nextUrl).Query));
            if (!resultsecondBody.IsValid())
            {
                RemoveOldIp(proxyEntity);
                continue;
            }
            if (ValidText(resultsecondBody.Result))
            {
                RemoveOldIp(proxyEntity);
                continue;
            }
            // Detail page processing: "<th" is rewritten to "<td" so header cells are
            // picked up by the "tr td" selector as well.
            var sorceIhtml = new JumonyParser().Parse(resultsecondBody.Result.Replace("<th", "<td"));
            var tableLists = sorceIhtml.Find("table[class='f-lbiao']").ToList();
            var listall = new List<string>();
            // Collect every cell's text, with trailing colons and surrounding whitespace removed.
            foreach (var tableList in tableLists)
                tableList.Find("tr td")
                    .ForEach(t => listall.Add(t.InnerText().TrimEnd(':').TrimEnd(':').Trim()));
            var fillModel = FillModel(listall);
            // NOTE(review): if "reg_bus_ent_id" is absent the indexer yields null and ToUpper()
            // throws; the catch below then retries — verify this cannot loop forever.
            fillModel.全局唯一编号 = nameValueCollection["reg_bus_ent_id"].ToUpper();
            new CrawlerDomain().Add(StrategyNo1(fillModel, targetModel));
            // Follow-up bookkeeping: bump the proxy's usage counter, persist it, report progress.
            proxyEntity.Usage = proxyEntity.Usage + 1;
            new ProxyDomain().Update(proxyEntity);
            Console.WriteLine("{0} 抓取到:{1}", Task.CurrentId, targetModel.搜索名称);
        }
        catch (Exception e)
        {
            // Any exception is logged and the whole attempt is retried.
            new LogDomain().Add(new LogEntity
            {
                LogType = "error",
                TaskName = TaskEntity.TaskName,
                ErrorDetails = Task.CurrentId + "线程: " + e.Message,
                Details = e.ToString(),
                TriggerTime = DateTime.Now
            });
            continue;
        }
        // Reached only after a fully successful attempt.
        break;
    }
}