예제 #1
0
    /// <summary>
    /// Loads StyleTest1.html from the current directory, calls <c>DataBind( null )</c> on it and
    /// asserts that the document then contains no element matching the <c>.invisible</c> selector.
    /// </summary>
    public void VisibleTest()
    {
      var document = new JumonyParser().LoadDocument( Path.Combine( Environment.CurrentDirectory, "StyleTest1.html" ) );
      document.DataBind( null );

      // Assert.AreEqual takes ( expected, actual ); with the arguments in this order
      // a failure message reports the expected and actual counts correctly.
      Assert.AreEqual( 0, document.Find( ".invisible" ).Count() );
    }
예제 #2
0
        //
        //<tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="01CreateScreen/">01CreateScreen/</a></td><td align="right">2016-01-11 10:23  </td><td align="right">  - </td><td>&nbsp;</td></tr>
        //<tr><td valign="top"><img src="/icons/text.gif" alt="[TXT]"></td><td><a href="Test_money.py">Test_money.py</a></td><td align="right">2016-01-08 15:53  </td><td align="right">1.1K</td><td>&nbsp;</td></tr>
        //
        /// <summary>
        /// Downloads the directory-listing page at <paramref name="url"/> and returns one
        /// <see cref="Resource"/> for every row whose icon is <c>/icons/folder.gif</c> or
        /// <c>/icons/text.gif</c>.
        /// </summary>
        /// <param name="url">
        /// Address of the directory listing. Child URLs are built by appending the entry name,
        /// so a trailing '/' is added to the base when the caller did not supply one.
        /// </param>
        /// <returns>
        /// Folder entries first (<c>IsFolder = true</c>, <c>LastModified</c> not set), followed by
        /// text-file entries (<c>IsFolder = false</c>, <c>LastModified</c> parsed from the first
        /// right-aligned cell of the row).
        /// </returns>
        public static List<Resource> GetDirectoryContents(string url)
        {
            List<Resource> Rlist = new List<Resource>();

            Byte[] pageData;
            // WebClient is IDisposable; release it as soon as the page has been downloaded.
            using (WebClient MyWebClient = new WebClient())
            {
                MyWebClient.Credentials = CredentialCache.DefaultCredentials;
                pageData = MyWebClient.DownloadData(url);
            }

            // The page is decoded as UTF-8.
            string pageHtml = Encoding.UTF8.GetString(pageData);
            var htmlSource = new JumonyParser().Parse(pageHtml);

            // Entry URLs are produced by plain concatenation, so make sure the base ends with
            // a slash; otherwise "http://host/dir" + "file.py" would yield "http://host/dirfile.py".
            string baseUrl = url.EndsWith("/", StringComparison.Ordinal) ? url : url + "/";

            // Folder rows: the icon sits in <td><img></td>, so Parent().Parent() is the <tr>.
            foreach (var icon in htmlSource.Find("img[src=/icons/folder.gif]"))
            {
                var row = icon.Parent().Parent();

                Resource folder = new Resource();
                folder.Name = row.Find("a").ElementAt(0).InnerText();
                folder.Url = baseUrl + folder.Name;
                folder.IsFolder = true;
                Rlist.Add(folder);
            }

            // Text-file rows additionally carry the modification time in the first right-aligned cell.
            foreach (var icon in htmlSource.Find("img[src=/icons/text.gif]"))
            {
                var row = icon.Parent().Parent();

                Resource file = new Resource();
                file.Name = row.Find("a").ElementAt(0).InnerText();
                file.Url = baseUrl + file.Name;
                file.IsFolder = false;

                string modifiedText = row.Find("td[align=right]").ElementAt(0).InnerText();
                // The timestamp is machine-generated ("yyyy-MM-dd HH:mm"), so parse it with the
                // invariant culture instead of whatever culture the current thread happens to use.
                file.LastModified = DateTime.Parse(modifiedText, System.Globalization.CultureInfo.InvariantCulture);
                Rlist.Add(file);
            }

            return Rlist;
        }
예제 #3
0
        /// <summary>
        /// Downloads the release directory listing, picks the last folder entry on the page and
        /// builds both the APK download address and a confirmation prompt for it.
        /// </summary>
        /// <param name="url">
        /// Receives the download address, composed as listing URL + folder name + "apk/app-release.apk".
        /// The incoming value is not read.
        /// </param>
        /// <returns>
        /// Prompt text containing the folder name and the text of the first right-aligned cell of
        /// its row, followed by a download confirmation question.
        /// </returns>
        public static string CheckAPK(ref string url)
        {
            string downloadurl = "http://192.168.1.40/iwu_android/";

            Byte[] pageData;
            // WebClient is IDisposable; release it as soon as the page has been downloaded.
            using (WebClient MyWebClient = new WebClient())
            {
                MyWebClient.Credentials = CredentialCache.DefaultCredentials;
                pageData = MyWebClient.DownloadData(downloadurl);
            }

            // The page is decoded as UTF-8.
            string pageHtml = Encoding.UTF8.GetString(pageData);
            var htmlSource = new JumonyParser().Parse(pageHtml);

            // Last folder icon on the page; presumably the newest release — depends on the
            // listing's sort order, confirm against the server configuration.
            var one = htmlSource.Find("img[src=/icons/folder.gif]").Last();

            // The icon sits in <td><img></td>, so Parent().Parent() is the table row.
            var row = one.Parent().Parent();
            string releaseUrl = row.Find("a[href]").First().InnerText();
            string time = row.Find("td[align=right]").ElementAt(0).InnerText();

            url = downloadurl + releaseUrl + "apk/app-release.apk";
            return "最新版本号:" + releaseUrl + "\n 版本时间:" + time +"\n是否确定下载?";
        }
예제 #4
0
파일: Program.cs 프로젝트: ajayumi/Jumony
    /// <summary>
    /// Benchmark driver: loads the sina.com.cn front page once, then times three ways of running
    /// the same ten CSS selectors 200 times each and prints the elapsed time of every pass.
    /// </summary>
    static void Main( string[] args )
    {
      // Selectors exercised by every pass, in the order they are evaluated.
      string[] selectors =
      {
        "body p a",
        "p > a",
        "p[class] a",
        "p a[href]",
        "p + a",
        "div a",
        "p div a",
        "a img[src]",
        "div img",
        "body img[src]"
      };
      const int rounds = 200;

      var document = new JumonyParser().LoadDocument( "http://www.sina.com.cn/", Encoding.GetEncoding( "GB2312" ) );

      var timer = new Stopwatch();

      // Pass 1: filter the lazily enumerated descendants.
      timer.Restart();
      for ( int round = 0; round < rounds; round++ )
      {
        // Materialising the descendants is part of the measured workload.
        document.Descendants().ToArray();

        foreach ( var selector in selectors )
        {
          document.Descendants().FilterBy( selector ).FirstOrDefault();
        }
      }
      timer.Stop();
      Console.WriteLine( timer.Elapsed );

      // Pass 2: filter an array snapshot of the descendants.
      timer.Restart();
      for ( int round = 0; round < rounds; round++ )
      {
        document.Descendants().ToArray();

        foreach ( var selector in selectors )
        {
          document.Descendants().ToArray().FilterBy( selector ).FirstOrDefault();
        }
      }
      timer.Stop();
      Console.WriteLine( timer.Elapsed );

      // Pass 3: let the document resolve the selector itself via Find.
      timer.Restart();
      for ( int round = 0; round < rounds; round++ )
      {
        document.Descendants().ToArray();

        foreach ( var selector in selectors )
        {
          document.Find( selector ).FirstOrDefault();
        }
      }
      timer.Stop();
      Console.WriteLine( timer.Elapsed );

      // Keep the console window open until a key is pressed.
      Console.ReadKey();
    }
예제 #5
0
    /// <summary>
    /// Parses SpecificationTest8.html from the current directory and checks that the single
    /// <c>div</c> has exactly one attribute and that <c>div a</c> yields two links whose inner
    /// texts are <c>Test1</c> and <c> "Test2</c>.
    /// </summary>
    public void SpecificationTest8()
    {
      var document = new JumonyParser().LoadDocument( Path.Combine( Environment.CurrentDirectory, "SpecificationTest8.html" ) );

      // Assert.AreEqual takes ( expected, actual, message ); keeping that order makes a
      // failure report the expected and actual values correctly.

      // Failure message: an illegal attribute was wrongly parsed.
      Assert.AreEqual( 1, document.FindSingle( "div" ).Attributes().Count(), "错误的解析了非法的属性" );
      var links = document.Find( "div a" ).ToArray();

      // Failure message: quoted content that is not part of an attribute value was wrongly parsed.
      Assert.AreEqual( 2, links.Length, "错误的解析了不属于属性值的引用内容" );
      Assert.AreEqual( "Test1", links[0].InnerText(), "错误的解析了不属于属性值的引用内容" );
      Assert.AreEqual( " \"Test2", links[1].InnerText(), "错误的解析了不属于属性值的引用内容" );

    }
예제 #6
0
        /// <summary>
        /// Single query: searches for one company name, follows the resulting link to the detail
        /// page, parses its <c>f-lbiao</c> tables and stores the extracted record.
        /// Each attempt goes through a proxy; an attempt that fails validation or throws is retried.
        /// </summary>
        /// <param name="companyName">Company name used as the search term (posted as <c>queryStr</c>).</param>
        public void SingelSearch(string companyName)
        {
            var httpClient = new HttpClient();
            // Presumably milliseconds (5 s) — confirm against the HttpClient wrapper's Setting type.
            httpClient.Setting.Timeout = 1000 * 5;
            // Initial request to firsturl; its response is not used here.
            httpClient.Create<string>(HttpMethod.Post, firsturl).Send();
            // Retry loop: every `continue` starts a fresh attempt, every `break` ends the search.
            // NOTE(review): no retry cap is visible here — a persistently failing attempt would loop forever.
            while (true)
            {
                var targetModel = new CrawlerEntity { 搜索名称 = companyName, 操作人姓名 = TaskEntity.OperatorName, 入爬行库时间 = TaskEntity.CreateTime, TaskGuid = TaskEntity.Unique };
                try
                {
                    // Proxy IP handling
                    var proxyEntity = new ProxyDomain().GetByRandom(); // proxy IP
                    if (proxyEntity == null)
                    {
                        // No stored proxy available: fall back to fetching one from the online source.
                        Console.WriteLine("在线代理临时获取策略启动。");
                        proxyEntity = Proxy.Proxy.GetInstance().GetHttProxyEntity();
                        Console.WriteLine("线上获取到了代理:{0}:{1}", proxyEntity.IpAddress, proxyEntity.Port);
                    }

                    httpClient.Setting.Proxy = new WebProxy(proxyEntity.IpAddress, proxyEntity.Port);

                    // Post the search form for the company name.
                    var resultBody = httpClient.Create<string>(HttpMethod.Post, targetUrl, data: new
                    {
                        queryStr = targetModel.搜索名称,
                        module = "",
                        idFlag = "qyxy"
                    }).Send();
                    // Invalid response: discard this proxy and retry.
                    if (!resultBody.IsValid())
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }
                    // ValidText returning true is treated as a failed attempt (its exact check is
                    // defined elsewhere): discard this proxy and retry.
                    if (ValidText(resultBody.Result))
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }
                    // Extract the second-level links
                    var parser = new JumonyParser();
                    var urls = parser.Parse(resultBody.Result).Find("li a").ToList();
                    var nextUrl = "";
                    // No links found: record an empty result and stop.
                    if (urls.Count < 1)
                    {
                        AddNull(targetModel);
                        break;
                    }
                    // Each iteration overwrites the previous values, so after the loop the name
                    // and nextUrl come from the last link in the list.
                    foreach (var htmlElement in urls)
                    {
                        targetModel.名称 = htmlElement.InnerText();
                        nextUrl = url + htmlElement.Attribute("href").AttributeValue;
                    }
                    // Fetch the target detail page, reusing the query string of the link found above.
                    var resultsecondBody =
                        httpClient.Create<string>(HttpMethod.Get, zhuUrl + new Uri(firsturl + nextUrl).Query).Send();
                    var nameValueCollection =
                        new NameValueCollection(URL.GetQueryString(new Uri(firsturl + nextUrl).Query));
                    // Same validation as for the search response: on failure drop the proxy and retry.
                    if (!resultsecondBody.IsValid())
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }
                    if (ValidText(resultsecondBody.Result))
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }
                    // Process the detail page body; header cells are rewritten to <td> so that a
                    // single "tr td" selector collects every cell.
                    var sorceIhtml = new JumonyParser().Parse(resultsecondBody.Result.Replace("<th", "<td"));
                    var tableLists = sorceIhtml.Find("table[class='f-lbiao']").ToList();
                    var listall = new List<string>();
                    // NOTE(review): the two TrimEnd(':') calls are identical as written; one may have
                    // been intended for the full-width colon — confirm.
                    foreach (var tableList in tableLists)
                        tableList.Find("tr td")
                            .ForEach(t => listall.Add(t.InnerText().TrimEnd(':').TrimEnd(':').Trim()));
                    var fillModel = FillModel(listall);
                    // The unique id is the upper-cased reg_bus_ent_id query parameter of the detail link.
                    fillModel.全局唯一编号 = nameValueCollection["reg_bus_ent_id"].ToUpper();
                    new CrawlerDomain().Add(StrategyNo1(fillModel, targetModel));
                    // Follow-up bookkeeping, including the proxy usage count and the query list status
                    proxyEntity.Usage = proxyEntity.Usage + 1;
                    new ProxyDomain().Update(proxyEntity);
                    Console.WriteLine("{0} 抓取到:{1}", Task.CurrentId, targetModel.搜索名称);
                }
                catch (Exception e)
                {
                    // Log the failure, then retry the whole attempt.
                    new LogDomain().Add(new LogEntity { LogType = "error", TaskName = TaskEntity.TaskName, ErrorDetails = Task.CurrentId + "线程: " + e.Message, Details = e.ToString(), TriggerTime = DateTime.Now });
                    continue;
                }
                // Reached only when the attempt completed without a retry: stop.
                break;
            }
        }