Exemplo n.º 1
0
        /// <summary>
        /// Requests http://www.ruijihg.com/ through a fixed HTTP proxy with
        /// <c>RunJS</c> set to true and asserts that the string form of the
        /// response data is not empty.
        /// </summary>
        public void TestMethod3()
        {
            var request = new Request("http://www.ruijihg.com/")
            {
                Proxy = new RequestProxy
                {
                    Ip     = "223.93.172.248",
                    Port   = 3128,
                    Scheme = "http"
                },
                RunJS = true
            };

            var response = new RuiJiCrawler().Request(request);
            var body     = response.Data.ToString();

            Assert.True(body.Length > 0);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Sends a POST with a JSON body and a <c>Content-Type: application/json</c>
        /// header, then asserts that the response carries at least one header.
        /// </summary>
        public void TestJsonPost()
        {
            const string endpoint = "http://s.miaojian.net/api/client/stats/industry?type=0&top=5";
            const string payload  = "{\"filter\":{\"dateRange\":{\"type\":\"month\",\"value\":[]},\"toneIds\":[25]},\"classifyId\":\"100\"}";

            // Assignments stay in the original order: method, header, cookie, body.
            var request = new Request(endpoint) { Method = "POST" };
            request.Headers.Add(new WebHeader("Content-Type", "application/json"));
            request.Cookie = "";
            request.Data   = payload;

            var response    = new RuiJiCrawler().Request(request);
            var headerCount = response.Headers.Count;

            Assert.True(headerCount > 0);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Downloads a page, builds an <c>ExtractBlock</c> whose tile selectors come
        /// from the expression <c>css a[href]</c>, and runs the extraction.
        /// The extraction result is not inspected; the final assertion always passes.
        /// </summary>
        public void TestExtract()
        {
            var page = new RuiJiCrawler().Request(new Request("http://www.ruijihg.com/%e5%bc%80%e5%8f%91/"));
            var html = page.Data.ToString();

            var extractBlock  = new ExtractBlock();
            var linkSelectors = RuiJiExpression.ParserBase("css a[href]").Selectors;
            extractBlock.TileSelector.Selectors.AddRange(linkSelectors);

            // The return value is captured but never checked.
            var extracted = RuiJiExtracter.Extract(html, extractBlock);

            Assert.IsTrue(true);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Requests the RuiJi.Net Gitee page through a fixed proxy whose scheme is
        /// "https", with <c>RunJS</c> set to true and a timeout value of 15000, and
        /// asserts that the string form of the response data is not empty.
        /// </summary>
        public void TestMethod6()
        {
            var request = new Request("https://gitee.com/zhupingqi/RuiJi.Net")
            {
                Proxy = new RequestProxy
                {
                    Ip     = "163.125.223.118",
                    Port   = 8118,
                    Scheme = "https"
                },
                RunJS   = true,
                Timeout = 15000
            };

            var response = new RuiJiCrawler().Request(request);
            var body     = response.Data.ToString();

            Assert.True(body.Length > 0);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Requests the RuiJi.Net Gitee page through a fixed HTTP proxy with
        /// <c>RunJS</c> set to true and a pre-recorded cookie string, and asserts
        /// that the string form of the response data is not empty.
        /// </summary>
        public void TestMethod5()
        {
            const string cookie = "oschina_new_user=false;expires=Wed, 16 Jun 2038 06:57:20 GMT; domain=gitee.com; path=/,aliyungf_tc=AQAAAMt2pVc2cQkACw8UZUJNd5CbXTu0;expires=Wed, 16 Jun 2038 06:57:20 GMT; domain=gitee.com; path=/,oschina_new_user=false;expires=Wed, 16 Jun 2038 06:57:20 GMT; domain=gitee.com; path=/,user_locale=zh-CN;expires=Wed, 16 Jun 2038 06:57:20 GMT; domain=gitee.com; path=/,gitee-session-n=BAh7CEkiD3Nlc3Npb25faWQGOgZFVEkiJTVmYzc3OTQ4ZTRhNGM1MWM5MzI2YjQyOTI1MjRhOGMzBjsAVEkiF21vYnlsZXR0ZV9vdmVycmlkZQY7AEY6CG5pbEkiEF9jc3JmX3Rva2VuBjsARkkiMThCakFMNzlvVXhnNExxcmIwZWxWVFJzS2JMbFRWTHlzcGlJdVpqZWJiaHc9BjsARg%3D%3D--aff6f894a55d2ce1a7be4b3fa036bb95b2b0c68a;expires=Wed, 16 Jun 2038 06:57:20 GMT; domain=.gitee.com; path=/";

            var request = new Request("https://gitee.com/zhupingqi/RuiJi.Net")
            {
                Proxy = new RequestProxy
                {
                    Ip     = "223.93.172.248",
                    Port   = 3128,
                    Scheme = "http"
                },
                RunJS  = true,
                Cookie = cookie
            };

            var response = new RuiJiCrawler().Request(request);
            var body     = response.Data.ToString();

            Assert.True(body.Length > 0);
        }
Exemplo n.º 6
0
        /// <summary>
        /// Fetches a JSONP endpoint, then applies a two-step rule set (a regex that
        /// captures the JSONP payload, followed by the JSONPath <c>$..url</c>) and
        /// asserts that the extracted content is not empty.
        /// </summary>
        public void TestJsonPExtract()
        {
            var template = "http://app.cannews.com.cn/roll.php?do=query&callback=jsonp1475197217819&_={# ticks() #}&date={# now(\"yyyy-MM-dd\") #}&size=20&page=1";

            // A UrlCompile instance is created but never applied, so the
            // {# ... #} placeholders are sent to the server exactly as written.
            var compiler = new UrlCompile();

            var response = new RuiJiCrawler().Request(new Request(template));

            var rules = @"
reg /jsonp[\d]+?\((.*)\)/ 1
jpath $..url
";
            var block     = RuiJiBlockParser.ParserBlock(rules);
            var extracted = RuiJiExtractor.Extract(response.Data.ToString(), block);
            var text      = extracted.Content.ToString();

            Assert.IsTrue(text.Length > 0);
        }
Exemplo n.º 7
0
        /// <summary>
        /// Posts a recorded ASP.NET WebForms form body (URL-encoded values substituted
        /// into the <c>__VIEWSTATE</c> and <c>__EVENTVALIDATION</c> slots) with a fixed
        /// session cookie, and asserts that the response carries at least one header.
        /// </summary>
        public void TestPost2()
        {
            // {0} receives the encoded view state, {1} the encoded event validation token.
            const string formTemplate = "__EVENTTARGET=ctl15&__EVENTARGUMENT=&__VIEWSTATE={0}&__VIEWSTATEENCRYPTED=&__EVENTVALIDATION={1}&zp__PRONAME=&gvList%24ctl02%24hid_id=41447&gvList%24ctl02%24hidinid=0&gvList%24ctl03%24hid_id=41421&gvList%24ctl03%24hidinid=0&gvList%24ctl04%24hid_id=41409&gvList%24ctl04%24hidinid=0&gvList%24ctl05%24hid_id=41395&gvList%24ctl05%24hidinid=0&gvList%24ctl06%24hid_id=41382&gvList%24ctl06%24hidinid=0&gvList%24ctl07%24hid_id=41343&gvList%24ctl07%24hidinid=0&gvList%24ctl08%24hid_id=41346&gvList%24ctl08%24hidinid=0&gvList%24ctl09%24hid_id=41332&gvList%24ctl09%24hidinid=0&gvList%24ctl10%24hid_id=41345&gvList%24ctl10%24hidinid=0&gvList%24ctl11%24hid_id=41350&gvList%24ctl11%24hidinid=0&gvList%24ctl12%24hid_id=41313&gvList%24ctl12%24hidinid=0&gvList%24ctl13%24hid_id=41314&gvList%24ctl13%24hidinid=0&gvList%24ctl14%24hid_id=41293&gvList%24ctl14%24hidinid=0&gvList%24ctl15%24hid_id=41257&gvList%24ctl15%24hidinid=0&gvList%24ctl16%24hid_id=41256&gvList%24ctl16%24hidinid=0&ctl17=";

            var request = new Request("http://www.qzggzy.com/FrontWeb/ggshow.aspx?Type=zfcg&BigType=10&findtxt=")
            {
                Method = "POST",
                Cookie = "ASP.NET_SessionId=4dsvttfwmcriljen221jaabg"
            };

            var viewState       = System.Web.HttpUtility.UrlEncode("");
            var eventValidation = System.Web.HttpUtility.UrlEncode("zOmeaeSdclb5sF7Bh3dF2xYtRF7gNRE18XHHmxRF6s9WyDO7V3zKTsZP67G51gLrXnPzp/fA8RjyxNawmvHc4X1OiAj7ghNU3LAJlsF+0/4+onuVdpPAKKmJZdDtog0CAwS40SphESndm/dwXYu5GncCtSCoX6qyHyo9k8gDJ6L8ArPJ7X5mJX8aicMMimCBZkGu0TYTtqFI+vvDUOuoKVYTCPGRmzt3dyio9QfPC9reAkkAgGIou/PNvSXaH/3WiFR+wN7YVQrj3DdhuXW+wQqvc85+6sDzOLgsOMPeeZcVkqWkSHNzuRN9DJPQ6EamtchsPibFBj+wzYfd28R4Iw==");

            request.Data = string.Format(formTemplate, viewState, eventValidation);

            var response    = new RuiJiCrawler().Request(request);
            var headerCount = response.Headers.Count;

            Assert.IsTrue(headerCount > 0);
        }
Exemplo n.º 8
0
        /// <summary>
        /// Downloads a proxy-list page, extracts its table rows (with <c>ip</c> and
        /// <c>port</c> metas) plus the paging links, and, when both paging links and
        /// tiles are present, hands the result to <c>PagingExtractor.CrawlPage</c>,
        /// storing each callback's tiles as JSON in a <c>FileStorage</c> under
        /// "www/download". The final assertion always passes.
        /// </summary>
        public void TestPaging()
        {
            var rules = @"
[tile]
	css table.table-bordered tr:gt(0):ohtml

	[meta]
	#ip
	css td[data-title='IP']:text

    #port
    css td[data-title='PORT']:text

[paging]
css #listnav a[href]";

            var crawler = new RuiJiCrawler();
            var request = new Request("https://www.kuaidaili.com/free/inha/10");
            var html    = crawler.Request(request).Data.ToString();

            var block     = RuiJiBlockParser.ParserBlock(rules);
            var extracted = RuiJiExtractor.Extract(html, block);

            var hasPagingLinks = extracted.Paging != null && extracted.Paging.Count > 0;
            if (hasPagingLinks && extracted.Tiles != null)
            {
                var downloadDir = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "www", "download");
                var storage     = new FileStorage(downloadDir);

                PagingExtractor.CrawlPage(request.Uri, extracted, block, (pageUri, pageResult) =>
                {
                    // One stored record per callback invocation, keyed by the URI path.
                    storage.Insert(new DownloadContentModel
                    {
                        Url   = pageUri.AbsolutePath.Trim(),
                        IsRaw = false,
                        Data  = JsonConvert.SerializeObject(pageResult.Tiles)
                    });
                }, int.MaxValue);
            }

            Assert.True(true);
        }
Exemplo n.º 9
0
        /// <summary>
        /// Times a request to https://www.baidu.com/ sent through the proxy record
        /// returned by <c>ProxyLiteDb.Get</c> for <paramref name="id"/>.
        /// </summary>
        /// <param name="id">Identifier passed to <c>ProxyLiteDb.Get</c>.</param>
        /// <returns>
        /// An anonymous object with three members: <c>elspsed</c> (total elapsed
        /// milliseconds), <c>code</c> and <c>msg</c>. On success <c>code</c> is the
        /// response status code and <c>msg</c> its string form; if anything throws,
        /// <c>code</c> is -1 and <c>msg</c> is the exception message.
        /// </returns>
        public object ProxyPing(int id)
        {
            var watch = Stopwatch.StartNew();

            try
            {
                var crawler = new RuiJiCrawler();
                var request = new Request("https://www.baidu.com/");
                request.Timeout = 15000;

                // If no record is returned for the id, the member access below throws
                // and the failure is reported through the catch branch.
                var proxy = ProxyLiteDb.Get(id);
                request.Proxy        = new RequestProxy(proxy.Ip, proxy.Port, proxy.UserName, proxy.Password);
                request.Proxy.Scheme = proxy.Type == ProxyTypeEnum.HTTP ? "http" : "https";

                var response = crawler.Request(request);

                watch.Stop();

                return(new
                {
                    // Total elapsed time. TimeSpan.Milliseconds would only give the
                    // 0-999 component and wrap around for pings of one second or more.
                    // The member name (including its spelling) is kept for existing consumers.
                    elspsed = (int)watch.ElapsedMilliseconds,
                    code = response.StatusCode,
                    msg = response.StatusCode.ToString()
                });
            }
            catch (Exception ex)
            {
                watch.Stop();

                return(new
                {
                    elspsed = (int)watch.ElapsedMilliseconds,
                    code = -1,
                    msg = ex.Message
                });
            }
        }
Exemplo n.º 10
0
        /// <summary>
        /// Downloads an article page, runs <c>CssProcessor.ProcessNeed</c> with the
        /// selector <c>div.f12:first</c> in text mode over the page content, and
        /// asserts that the processed content does not contain "&gt;&gt;".
        /// </summary>
        public void TestCss()
        {
            var crawler = new RuiJiCrawler();
            var request = new Request("http://www.legaldaily.com.cn/index_article/content/2016-08/17/content_6765457.htm?node=5955");
            var html    = crawler.Request(request).Data.ToString();

            var processor = new CssProcessor();
            var selector  = new CssSelector
            {
                Value = "div.f12:first",
                Type  = Core.Extractor.Enum.CssTypeEnum.Text
            };

            // Seed the processing pipeline with the raw page as its only match.
            var input = new ProcessResult();
            input.Matches.Add(html);

            var output = processor.ProcessNeed(selector, input);

            Assert.IsTrue(output.Content.IndexOf(">>") < 0);
        }
Exemplo n.º 11
0
        /// <summary>
        /// Downloads a category page, parses a tile rule set (article tiles with
        /// <c>title</c> and <c>summary</c> metas) via <c>RuiJiParser.ParseExtract</c>,
        /// and runs the extraction with the parsed result. The extraction output is
        /// not inspected; the final assertion always passes.
        /// </summary>
        public void TestExtractTile()
        {
            var rules = @"[tile]
css article:html

    [meta]
	#title
	css .entry-header:text

	#summary
	css .entry-header + p:text
	ex /Read more »/ -e";

            var page = new RuiJiCrawler().Request(new Request("http://www.ruijihg.com/archives/category/tech/bigdata"));
            var html = page.Data.ToString();

            var parsed = new RuiJiParser().ParseExtract(rules);

            // The return value is captured but never checked.
            var extracted = RuiJiExtractor.Extract(html, parsed.Result);

            Assert.True(true);
        }
Exemplo n.º 12
0
        public static Response Request(Request request, bool usecp = false)
        {
            if (NodeConfigurationSection.Standalone)
            {
                var crawler  = new RuiJiCrawler();
                var response = crawler.Request(request);

                if (string.IsNullOrEmpty(request.Ip))
                {
                    var e = CrawlerServerManager.Instance.ElectIP(request.Uri);
                    if (e != null)
                    {
                        request.Ip = e.ClientIp;
                    }
                }

                var    maxRefresh = 2;
                string refreshUrl;

                while (HasRefreshMeta(response, out refreshUrl) && maxRefresh > 0)
                {
                    crawler     = new RuiJiCrawler();
                    request.Uri = new Uri(refreshUrl);
                    response    = crawler.Request(request);

                    maxRefresh--;
                }

                return(response);
            }
            else
            {
                var proxyUrl = ProxyManager.Instance.Elect(NodeProxyTypeEnum.FEEDPROXY);
                if (string.IsNullOrEmpty(proxyUrl))
                {
                    throw new Exception("no available crawler proxy servers");
                }

                proxyUrl = IPHelper.FixLocalUrl(proxyUrl);

                if (usecp)
                {
                    var client      = new RestClient("http://" + proxyUrl);
                    var restRequest = new RestRequest("api/cp/crawl");
                    restRequest.Method = Method.POST;
                    restRequest.AddJsonBody(request);
                    restRequest.Timeout = request.Timeout;

                    var restResponse = client.Execute(restRequest);

                    var response = JsonConvert.DeserializeObject <Response>(restResponse.Content);

                    return(response);
                }
                else
                {
                    var elect = Elect(new CrawlerElectRequest
                    {
                        ElectIp    = string.IsNullOrEmpty(request.Ip),
                        ElectProxy = request.Proxy is null,
                        Uri        = request.Uri
                    });