Ejemplo n.º 1
0
        /// <summary>
        /// Executes a crawl for the given request and logs the outcome.
        /// </summary>
        /// <param name="request">The request to execute; its <c>Uri</c> is written to the log.</param>
        /// <returns>
        /// The response returned by the crawler (which may be <c>null</c>), or <c>null</c>
        /// when the crawl throws.
        /// </returns>
        protected override Response DoTask(Request request)
        {
            try
            {
                Logger.GetLogger("").Info("do task -> request address " + request.Uri);

                var response = new RuiJiCrawler().Request(request);

                if (response == null)
                {
                    Logger.GetLogger("").Error("request " + request.Uri + " response is null");
                }
                else
                {
                    Logger.GetLogger("").Info("request " + request.Uri + " response code is " + response.StatusCode);
                }

                return response;
            }
            catch (Exception ex)
            {
                // A failed crawl is an error condition, so report it at Error level
                // (it was previously logged as Info). The best-effort contract is kept:
                // the exception is not rethrown and the caller receives null.
                Logger.GetLogger("").Error("do task -> request address failed " + ex.Message);

                return null;
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Probes a proxy by requesting https://www.baidu.com/ through it, first with the
        /// proxy scheme set to "https" and then with "http".
        /// </summary>
        /// <param name="addr">Proxy address in <c>host:port</c> form.</param>
        /// <returns>
        /// "https" or "http" for the first scheme that yields an HTTP 200 response, an empty
        /// string when neither does, or the exception message when the probe throws
        /// (for example when <paramref name="addr"/> has no port part).
        /// </returns>
        private string Ping(string addr)
        {
            try
            {
                var crawler = new RuiJiCrawler();
                var request = new Request("https://www.baidu.com/");
                request.Timeout = 5000;

                var parts = addr.Split(':');
                request.Proxy = new RequestProxy(parts[0], Convert.ToInt32(parts[1]));

                foreach (var scheme in new[] { "https", "http" })
                {
                    request.Proxy.Scheme = scheme;

                    var response = crawler.Request(request);

                    // Treat a null response as a failed probe for this scheme. Previously it
                    // raised a NullReferenceException whose message was returned as the
                    // result, and the remaining scheme was never tried.
                    if (response != null && response.StatusCode == System.Net.HttpStatusCode.OK)
                    {
                        return scheme;
                    }
                }
            }
            catch (Exception ex)
            {
                return ex.Message;
            }

            return "";
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Downloads a huanqiu.com article, extracts title/date/content metas plus paging
        /// links, and merges the paged content when both paging and a "content" meta exist.
        /// </summary>
        public void TestPaging2()
        {
            var crawler = new RuiJiCrawler();
            var request = new Request("https://3w.huanqiu.com/a/4e2d56fd7f51/7DHitRASkPC?p=1&agt=8");

            var response = crawler.Request(request);
            var content  = response.Data.ToString();

            var exp = @"
[meta]
	#title
	css h1.a-title

	#date_dt
	css .time:text

	#content
	css .a-con:ohtml

[paging]
css .a-page
css a[href]";

            var block  = RuiJiBlockParser.ParserBlock(exp);
            var result = RuiJiExtractor.Extract(content, block);

            var hasPaging  = result.Paging != null && result.Paging.Count > 0;
            var hasContent = result.Metas != null && result.Metas.ContainsKey("content");

            if (hasPaging && hasContent)
            {
                result = PagingExtractor.MergeContent(request.Uri, result, block);
            }

            // The original Assert.True(true) could never fail; at least require a result.
            Assert.NotNull(result);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Executes a crawl request, either locally (when the current node is a crawler node)
        /// or by delegating to <c>Crawler.Request</c>.
        /// </summary>
        /// <param name="request">
        /// The crawl request. When refresh targets are followed its <c>Uri</c> is overwritten
        /// with the refresh target.
        /// </param>
        /// <returns>The response of the last request that was executed.</returns>
        public Response Crawl(Request request)
        {
            // Note: "Request" here is the member exposing RequestUri, not the "request" parameter.
            var node = ServerManager.Get(Request.RequestUri.Authority);

            if (node.NodeType != Node.NodeTypeEnum.CRAWLER)
            {
                return Crawler.Request(request);
            }

            var response = new RuiJiCrawler().Request(request);

            string refreshUrl;

            // Follow at most two refresh targets. The budget is checked first so that
            // HasRefreshMeta is not evaluated again once the budget is exhausted.
            for (var remaining = 2; remaining > 0 && HasRefreshMeta(response, out refreshUrl); remaining--)
            {
                // Resolve against the current URI: an absolute refresh URL is used as-is,
                // while a relative one no longer throws UriFormatException.
                request.Uri = new Uri(request.Uri, refreshUrl);

                // A fresh crawler instance is used for every follow-up request, as before.
                response = new RuiJiCrawler().Request(request);
            }

            return response;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Downloads the first kuaidaili.com free-proxy page and extracts tiles (ip/port metas)
        /// plus paging links from it.
        /// </summary>
        public void TestPaging()
        {
            var crawler = new RuiJiCrawler();
            var request = new Request("https://www.kuaidaili.com/free/inha/1/");

            var response = crawler.Request(request);
            var content  = response.Data.ToString();

            // The unused ExtractBlock local from the original was dropped; only the parsed
            // block below is passed to the extractor.
            var parsedBlock = RuiJiBlockParser.ParserBlock(@"
[tile]
	css table.table-bordered tr:gt(0):ohtml

	[meta]
	#ip
	css td[data-title='IP']:text

    # port
    css td[data-title='PORT']:text

[paging]
css #listnav a:[href]
");

            var result = RuiJiExtractor.Extract(content, parsedBlock);

            // The original Assert.True(true) could never fail; at least require a result.
            Assert.NotNull(result);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Downloads an oschina.net blog post and extracts the title, author, date, words
        /// and content metas described by a RuiJi expression.
        /// </summary>
        public void TestExtractMeta()
        {
            var crawler = new RuiJiCrawler();
            var request = new Request("https://my.oschina.net/zhupingqi/blog/1826317");

            var response = crawler.Request(request);
            var content  = response.Data.ToString();

            var parser = new RuiJiParser();
            var eb     = parser.ParseExtract(@"
[meta]
	#title
	css h1.header:text

	#author
	css div.blog-meta .avatar + span:text

	#date
	css div.blog-meta > div.item:first:text
	regS /发布于/ 1

	#words_i
	css div.blog-meta > div.item:eq(1):text
	regS / / 1

	#content
	css #articleContent:html");

            var result = RuiJiExtractor.Extract(content, eb.Result);

            // The original Assert.True(true) could never fail; at least require a result.
            Assert.NotNull(result);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Requests baidu over plain HTTP without setting a source IP and checks that the
        /// final response URI is the HTTPS home page.
        /// </summary>
        public void NoIpMethod()
        {
            // No Ip is assigned on the request in this scenario.
            var plainHttpRequest = new Request("http://www.baidu.com");

            var finalUri = new RuiJiCrawler().Request(plainHttpRequest).ResponseUri;

            Assert.Equal("https://www.baidu.com/", finalUri.ToString());
        }
Ejemplo n.º 8
0
        public static Response Request(Request request)
        {
            if (RuiJiConfiguration.Standalone)
            {
                if (string.IsNullOrEmpty(request.Ip))
                {
                    var e = CrawlerServerManager.Instance.ElectIP(request.Uri);
                    if (e != null)
                    {
                        request.Ip = e.ClientIp;
                    }
                    else
                    {
                        request.Ip = IPHelper.GetDefaultIPAddress().ToString();
                    }
                }

                var crawler  = new RuiJiCrawler();
                var response = crawler.Request(request);

                var    maxRefresh = 2;
                string refreshUrl;

                while (HasRefreshMeta(response, out refreshUrl) && maxRefresh > 0)
                {
                    crawler     = new RuiJiCrawler();
                    request.Uri = new Uri(refreshUrl);
                    response    = crawler.Request(request);

                    maxRefresh--;
                }

                return(response);
            }
            else
            {
                var proxyUrl = ProxyManager.Instance.Elect(NodeProxyTypeEnum.CRAWLERPROXY);
                if (string.IsNullOrEmpty(proxyUrl))
                {
                    throw new Exception("no available crawler proxy servers");
                }

                proxyUrl = IPHelper.FixLocalUrl(proxyUrl);

                if (!request.Session)
                {
                    request = (Request)request.Clone();
                }

                var elect = Elect(new CrawlerElectRequest
                {
                    ElectIp    = string.IsNullOrEmpty(request.Ip),
                    ElectProxy = request.Proxy is null,
                    Uri        = request.Uri
                });
Ejemplo n.º 9
0
        /// <summary>
        /// Requests a cannews.com.cn article with the request's Ip set to 192.168.31.32 and
        /// compares the final response URI with the expected value.
        /// </summary>
        public void IpMethod()
        {
            // NOTE(review): the expected URI below is the baidu home page although the
            // request targets cannews.com.cn — looks copied from another test; confirm.
            var boundRequest = new Request("http://www.cannews.com.cn/2018/1121/185448.shtml")
            {
                Ip = "192.168.31.32"
            };

            var finalUri = new RuiJiCrawler().Request(boundRequest).ResponseUri;

            Assert.Equal("https://www.baidu.com/", finalUri.ToString());
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Requests baidu through the proxy 223.93.172.248:3128 and checks the final
        /// response URI.
        /// </summary>
        public void TestRequestProxy()
        {
            var proxiedRequest = new Request("https://www.baidu.com")
            {
                Proxy = new RequestProxy("223.93.172.248", 3128)
            };

            var finalUri = new RuiJiCrawler().Request(proxiedRequest).ResponseUri;

            // NOTE(review): if ResponseUri is a System.Uri, ToString() ends with a trailing
            // slash, which this expected value lacks — confirm.
            Assert.Equal("https://www.baidu.com", finalUri.ToString());
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Requests baidu with the request's Ip set to 192.168.31.196 and compares the final
        /// response URI with the expected value.
        /// </summary>
        public void IpMethod()
        {
            // This scenario sets an explicit Ip on the request (the old "no ip" comment was wrong).
            var crawler = new RuiJiCrawler();
            var request = new Request("https://www.baidu.com");

            request.Ip = "192.168.31.196";
            var response = crawler.Request(request);

            // Assert.AreEqual takes (expected, actual); the arguments were reversed, which
            // produces misleading failure messages.
            // NOTE(review): if ResponseUri is a System.Uri, ToString() ends with a trailing
            // slash, which this expected value lacks — confirm.
            Assert.AreEqual("http://www.baidu.com", response.ResponseUri.ToString());
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Requests a ghotel.ly.com hotel page with RunJS enabled and checks that the
        /// response data is not empty.
        /// </summary>
        public void TestTC()
        {
            var jsRequest = new Request("http://ghotel.ly.com/hd-centara-watergate-pavillion-hotel-bangkok-14638/?spm0=10002.2024.206898039.2.3.1.1")
            {
                RunJS = true
            };

            var body = new RuiJiCrawler().Request(jsRequest).Data.ToString();

            Assert.True(body.Length > 0);
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Requests the baidu home page with RunJS enabled and checks that the response
        /// data is not empty.
        /// </summary>
        public void TestMethod1()
        {
            var jsRequest = new Request("http://www.baidu.com")
            {
                RunJS = true
            };

            var body = new RuiJiCrawler().Request(jsRequest).Data.ToString();

            Assert.True(body.Length > 0);
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Requests the RuiJi.Net gitee page with RunJS enabled and checks that the
        /// response data is not empty.
        /// </summary>
        public void TestMethod2()
        {
            var jsRequest = new Request("https://gitee.com/zhupingqi/RuiJi.Net")
            {
                RunJS = true
            };

            var body = new RuiJiCrawler().Request(jsRequest).Data.ToString();

            Assert.True(body.Length > 0);
        }
Ejemplo n.º 15
0
        /// <summary>
        /// Walks the paging links of an already extracted page, downloading and extracting
        /// each page not yet visited and reporting every page to <paramref name="handler"/>.
        /// </summary>
        /// <param name="uri">Address of the first page; also the base for resolving paging links.</param>
        /// <param name="result">Extraction result of the first page; its <c>Paging</c> entries seed the walk.</param>
        /// <param name="block">Extract block applied to every downloaded page.</param>
        /// <param name="handler">Invoked for the first page and for each further page that yields paging links.</param>
        /// <param name="maxRetry">
        /// Total number of downloads without paging links tolerated before the walk stops.
        /// Values of zero or less stop at the first such download.
        /// </param>
        public static void DownloadPage(Uri uri, ExtractResult result, ExtractBlock block, PageDownloadHandler handler, int maxRetry = 10)
        {
            handler(uri, result);

            // Pages already handled, keyed by absolute URL.
            var pages = new Dictionary<string, ExtractResult>();
            pages.Add(uri.ToString(), result);

            var lines  = String.Join("\n", result.Paging.Distinct());
            var reader = new StringReader(lines);

            var crawler     = new RuiJiCrawler();
            var diffBuilder = new InlineDiffBuilder(new Differ());

            var url = reader.ReadLine();

            while (!string.IsNullOrEmpty(url))
            {
                var u = new Uri(uri, url);
                if (pages.ContainsKey(u.ToString()))
                {
                    // Already handled: move on to the next candidate link.
                    url = reader.ReadLine();
                    continue;
                }

                var request = new Request(u);

                var response = crawler.Request(request);
                var content  = response.Data.ToString();

                var r = RuiJiExtractor.Extract(content, block);
                if (r.Paging == null || r.Paging.Count == 0)
                {
                    // No paging links: retry the same URL after a pause. The budget is
                    // checked with "<= 0" so a non-positive maxRetry can no longer loop
                    // indefinitely, and it is checked before sleeping so the final failed
                    // attempt does not wait five seconds for nothing.
                    if (--maxRetry <= 0)
                    {
                        break;
                    }

                    Thread.Sleep(5000);
                    continue;
                }

                pages.Add(u.ToString(), r);
                handler(u, r);

                // The next candidates come from the diff between the first page's paging
                // list and the paging list of the page just downloaded.
                var nlines = String.Join("\n", r.Paging.Distinct());
                var diff   = diffBuilder.BuildDiffModel(lines, nlines);

                nlines = string.Join("\n", diff.Lines.Select(m => m.Text));
                reader = new StringReader(nlines);
                url    = reader.ReadLine();
            }
        }
Ejemplo n.º 16
0
        /// <summary>
        /// Requests baidu through the proxy 115.223.233.34:9000 and compares the final
        /// response URI with the expected value.
        /// </summary>
        public void TestRequestProxy()
        {
            var crawler = new RuiJiCrawler();
            var request = new Request("http://www.baidu.com");

            request.Proxy = new RequestProxy("115.223.233.34", 9000);

            var response = crawler.Request(request);

            // Assert.AreEqual takes (expected, actual); the arguments were reversed, which
            // produces misleading failure messages.
            // NOTE(review): if ResponseUri is a System.Uri, ToString() ends with a trailing
            // slash, which this expected value lacks — confirm.
            Assert.AreEqual("http://www.baidu.com", response.ResponseUri.ToString());
        }
Ejemplo n.º 17
0
        /// <summary>
        /// Downloads a ruijihg.com listing page and extracts it with a programmatically
        /// built block: page content, tiles, and "title"/"url" metas per tile.
        /// </summary>
        public void TestLocalExtract()
        {
            var pageRequest = new Request("http://www.ruijihg.com/%e5%bc%80%e5%8f%91/");
            var html        = new RuiJiCrawler().Request(pageRequest).Data.ToString();

            var block = new ExtractBlock
            {
                // Selector for the block content.
                Selectors = new List<ISelector>
                {
                    new CssSelector(".entry-content", CssTypeEnum.INNERHTML)
                },

                // Selector for the individual tiles.
                TileSelector = new ExtractTile
                {
                    Selectors = new List<ISelector>
                    {
                        new CssSelector(".pt-cv-content-item", CssTypeEnum.INNERHTML)
                    }
                }
            };

            var tileMetas = block.TileSelector.Metas;

            tileMetas.AddMeta("title", new List<ISelector>
            {
                new CssSelector(".pt-cv-title")
            });

            tileMetas.AddMeta("url", new List<ISelector>
            {
                new CssSelector(".pt-cv-readmore", "href")
            });

            var extracted = RuiJiExtractor.Extract(html, block);

            Assert.IsTrue(extracted.Content.ToString().Length > 0);
            Assert.IsTrue(extracted.Tiles.Count > 0);
        }
Ejemplo n.º 18
0
        /// <summary>
        /// Requests the RuiJi.Net gitee page with RunJS enabled and a preset cookie string,
        /// and checks that the response data is not empty.
        /// </summary>
        public void TestMethod4()
        {
            var cookieRequest = new Request("https://gitee.com/zhupingqi/RuiJi.Net")
            {
                RunJS  = true,
                Cookie = "oschina_new_user=false;expires=Wed, 16 Jun 2038 06:57:20 GMT; domain=gitee.com; path=/,aliyungf_tc=AQAAAMt2pVc2cQkACw8UZUJNd5CbXTu0;expires=Wed, 16 Jun 2038 06:57:20 GMT; domain=gitee.com; path=/,oschina_new_user=false;expires=Wed, 16 Jun 2038 06:57:20 GMT; domain=gitee.com; path=/,user_locale=zh-CN;expires=Wed, 16 Jun 2038 06:57:20 GMT; domain=gitee.com; path=/,gitee-session-n=BAh7CEkiD3Nlc3Npb25faWQGOgZFVEkiJTVmYzc3OTQ4ZTRhNGM1MWM5MzI2YjQyOTI1MjRhOGMzBjsAVEkiF21vYnlsZXR0ZV9vdmVycmlkZQY7AEY6CG5pbEkiEF9jc3JmX3Rva2VuBjsARkkiMThCakFMNzlvVXhnNExxcmIwZWxWVFJzS2JMbFRWTHlzcGlJdVpqZWJiaHc9BjsARg%3D%3D--aff6f894a55d2ce1a7be4b3fa036bb95b2b0c68a;expires=Wed, 16 Jun 2038 06:57:20 GMT; domain=.gitee.com; path=/"
            };

            var body = new RuiJiCrawler().Request(cookieRequest).Data.ToString();

            Assert.True(body.Length > 0);
        }
Ejemplo n.º 19
0
        /// <summary>
        /// Sends a request with a JSON Content-Type header and an empty cookie to the
        /// miaojian.net classify endpoint and checks that response headers came back.
        /// </summary>
        public void TestJsonGet()
        {
            var jsonRequest = new Request("http://s.miaojian.net/api/client/classify?id=");

            var contentType = new WebHeader("Content-Type", "application/json");
            jsonRequest.Headers.Add(contentType);
            jsonRequest.Cookie = "";

            var reply = new RuiJiCrawler().Request(jsonRequest);

            Assert.True(reply.Headers.Count > 0);
        }
Ejemplo n.º 20
0
        /// <summary>
        /// Downloads the oschina.net blog index and extracts the blog links matching
        /// the <c>https://my.oschina.net/*/blog/*</c> expression.
        /// </summary>
        public void TestExtract2()
        {
            var crawler = new RuiJiCrawler();
            var request = new Request("https://www.oschina.net/blog");

            var response = crawler.Request(request);
            var content  = response.Data.ToString();

            var parser = new RuiJiParser();
            var eb     = parser.ParseExtract("css a.blog-title-link:[href]\nexp https://my.oschina.net/*/blog/*");
            var result = RuiJiExtractor.Extract(content, eb.Result);

            // The original Assert.True(true) could never fail; at least require a result.
            Assert.NotNull(result);
        }
Ejemplo n.º 21
0
        /// <summary>
        /// Posts a JSON query to the miaojian.net clipping endpoint and checks that
        /// response headers came back.
        /// </summary>
        public void TestPost()
        {
            var postRequest = new Request("http://s.miaojian.net/api/client/clipping")
            {
                Method      = "POST",
                ContentType = "application/json",
                Data        = "{\"page\":1,\"rows\":15,\"orderby\":\"newsDate\",\"sort\":\"desc\",\"meger\":true,\"filter\":{\"mediaTypeIds\":[1983],\"dateRange\":{\"type\":\"month\",\"value\":[]}},\"classifyId\":\"100\"}"
            };

            var reply = new RuiJiCrawler().Request(postRequest);

            Assert.True(reply.Headers.Count > 0);
        }
Ejemplo n.º 22
0
        /// <summary>
        /// Downloads a ruijihg.com listing page and extracts it with a block whose tile
        /// selectors are parsed from the expression <c>css a:[href]</c>.
        /// </summary>
        public void TestExtract()
        {
            var crawler = new RuiJiCrawler();
            var request = new Request("http://www.ruijihg.com/%e5%bc%80%e5%8f%91/");

            var response = crawler.Request(request);
            var content  = response.Data.ToString();

            var block         = new ExtractBlock();
            var tileSelectors = RuiJiBlockParser.ParserBase("css a:[href]").Selectors;

            block.TileSelector.Selectors.AddRange(tileSelectors);
            var result = RuiJiExtractor.Extract(content, block);

            // The original Assert.True(true) could never fail; at least require a result.
            Assert.NotNull(result);
        }
Ejemplo n.º 23
0
        /// <summary>
        /// Posts a JSON filter to the miaojian.net industry-stats endpoint and checks that
        /// response headers came back.
        /// </summary>
        public void TestPost()
        {
            var url = "http://s.miaojian.net/api/client/stats/industry?type=0&top=5";

            var request = new Request(url);

            request.Method = "POST";
            request.Headers.Add(new WebHeader("Content-Type", "application/json"));

            // A commented-out Cookie assignment containing session and authentication
            // cookie values was removed here: credentials do not belong in source,
            // even inside comments.
            request.Data = "{\"filter\":{\"dateRange\":{\"type\":\"month\",\"value\":[]},\"toneIds\":[25]},\"classifyId\":\"100\"}";

            var crawler  = new RuiJiCrawler();
            var response = crawler.Request(request);

            Assert.IsTrue(response.Headers.Count > 0);
        }
Ejemplo n.º 24
0
        /// <summary>
        /// Posts a JSON filter with a JSON Content-Type header and an empty cookie to the
        /// miaojian.net industry-stats endpoint and checks that response headers came back.
        /// </summary>
        public void TestJsonPost()
        {
            var postRequest = new Request("http://s.miaojian.net/api/client/stats/industry?type=0&top=5");
            postRequest.Method = "POST";

            var contentType = new WebHeader("Content-Type", "application/json");
            postRequest.Headers.Add(contentType);

            postRequest.Cookie = "";
            postRequest.Data   = "{\"filter\":{\"dateRange\":{\"type\":\"month\",\"value\":[]},\"toneIds\":[25]},\"classifyId\":\"100\"}";

            var reply = new RuiJiCrawler().Request(postRequest);

            Assert.True(reply.Headers.Count > 0);
        }
Ejemplo n.º 25
0
        /// <summary>
        /// Requests ruijihg.com through an HTTP proxy (223.93.172.248:3128) with RunJS
        /// enabled and checks that the response data is not empty.
        /// </summary>
        public void TestMethod3()
        {
            var proxiedRequest = new Request("http://www.ruijihg.com/")
            {
                Proxy = new RequestProxy
                {
                    Ip     = "223.93.172.248",
                    Port   = 3128,
                    Scheme = "http"
                },
                RunJS = true
            };

            var body = new RuiJiCrawler().Request(proxiedRequest).Data.ToString();

            Assert.True(body.Length > 0);
        }
Ejemplo n.º 26
0
        /// <summary>
        /// Requests the RuiJi.Net gitee page through an HTTPS proxy (163.125.223.118:8118)
        /// with RunJS enabled and Timeout set to 15000, and checks that the response data
        /// is not empty.
        /// </summary>
        public void TestMethod6()
        {
            var proxiedRequest = new Request("https://gitee.com/zhupingqi/RuiJi.Net")
            {
                Proxy = new RequestProxy
                {
                    Ip     = "163.125.223.118",
                    Port   = 8118,
                    Scheme = "https"
                },
                RunJS   = true,
                Timeout = 15000
            };

            var body = new RuiJiCrawler().Request(proxiedRequest).Data.ToString();

            Assert.True(body.Length > 0);
        }
Ejemplo n.º 27
0
        /// <summary>
        /// Posts an ASP.NET postback form (empty __VIEWSTATE, URL-encoded __EVENTVALIDATION)
        /// with a session cookie to qzggzy.com and checks that response headers came back.
        /// </summary>
        public void TestPost2()
        {
            var formRequest = new Request("http://www.qzggzy.com/FrontWeb/ggshow.aspx?Type=zfcg&BigType=10&findtxt=")
            {
                Method = "POST",
                Cookie = "ASP.NET_SessionId=4dsvttfwmcriljen221jaabg"
            };

            // {0} is the encoded view state (empty), {1} the encoded event validation token.
            var viewState       = System.Web.HttpUtility.UrlEncode("");
            var eventValidation = System.Web.HttpUtility.UrlEncode("zOmeaeSdclb5sF7Bh3dF2xYtRF7gNRE18XHHmxRF6s9WyDO7V3zKTsZP67G51gLrXnPzp/fA8RjyxNawmvHc4X1OiAj7ghNU3LAJlsF+0/4+onuVdpPAKKmJZdDtog0CAwS40SphESndm/dwXYu5GncCtSCoX6qyHyo9k8gDJ6L8ArPJ7X5mJX8aicMMimCBZkGu0TYTtqFI+vvDUOuoKVYTCPGRmzt3dyio9QfPC9reAkkAgGIou/PNvSXaH/3WiFR+wN7YVQrj3DdhuXW+wQqvc85+6sDzOLgsOMPeeZcVkqWkSHNzuRN9DJPQ6EamtchsPibFBj+wzYfd28R4Iw==");

            formRequest.Data = string.Format("__EVENTTARGET=ctl15&__EVENTARGUMENT=&__VIEWSTATE={0}&__VIEWSTATEENCRYPTED=&__EVENTVALIDATION={1}&zp__PRONAME=&gvList%24ctl02%24hid_id=41447&gvList%24ctl02%24hidinid=0&gvList%24ctl03%24hid_id=41421&gvList%24ctl03%24hidinid=0&gvList%24ctl04%24hid_id=41409&gvList%24ctl04%24hidinid=0&gvList%24ctl05%24hid_id=41395&gvList%24ctl05%24hidinid=0&gvList%24ctl06%24hid_id=41382&gvList%24ctl06%24hidinid=0&gvList%24ctl07%24hid_id=41343&gvList%24ctl07%24hidinid=0&gvList%24ctl08%24hid_id=41346&gvList%24ctl08%24hidinid=0&gvList%24ctl09%24hid_id=41332&gvList%24ctl09%24hidinid=0&gvList%24ctl10%24hid_id=41345&gvList%24ctl10%24hidinid=0&gvList%24ctl11%24hid_id=41350&gvList%24ctl11%24hidinid=0&gvList%24ctl12%24hid_id=41313&gvList%24ctl12%24hidinid=0&gvList%24ctl13%24hid_id=41314&gvList%24ctl13%24hidinid=0&gvList%24ctl14%24hid_id=41293&gvList%24ctl14%24hidinid=0&gvList%24ctl15%24hid_id=41257&gvList%24ctl15%24hidinid=0&gvList%24ctl16%24hid_id=41256&gvList%24ctl16%24hidinid=0&ctl17=", viewState, eventValidation);

            var reply = new RuiJiCrawler().Request(formRequest);

            Assert.IsTrue(reply.Headers.Count > 0);
        }
Ejemplo n.º 28
0
        /// <summary>
        /// Requests a JSONP roll feed from cannews.com.cn, strips the JSONP wrapper with a
        /// regex, selects every <c>url</c> via JSONPath and checks the extracted content is
        /// not empty.
        /// </summary>
        public void TestJsonPExtract()
        {
            var feedUrl = "http://app.cannews.com.cn/roll.php?do=query&callback=jsonp1475197217819&_={# ticks() #}&date={# now(\"yyyy-MM-dd\") #}&size=20&page=1";

            // NOTE(review): the compiler is created but never applied, so the URL is sent
            // with its {# ... #} placeholders as written — confirm this is intended.
            var urlCompiler = new UrlCompile();

            var feedRequest = new Request(feedUrl);
            var reply       = new RuiJiCrawler().Request(feedRequest);

            var block = RuiJiBlockParser.ParserBlock(@"
reg /jsonp[\d]+?\((.*)\)/ 1
jpath $..url
");
            var extracted = RuiJiExtractor.Extract(reply.Data.ToString(), block);

            Assert.IsTrue(extracted.Content.ToString().Length > 0);
        }
Ejemplo n.º 29
0
        /// <summary>
        /// Issues three requests through one crawler instance and checks that only the
        /// first response carries a Set-Cookie header.
        /// </summary>
        public void TestSessionCrawler()
        {
            // No servers are started; the crawler is used directly.
            var sharedCrawler = new RuiJiCrawler();

            var first = sharedCrawler.Request(new Request("http://www.baidu.com/"));
            Assert.True(first.Headers.Any(h => h.Name == "Set-Cookie"));

            var second = sharedCrawler.Request(new Request("http://www.baidu.com/about/"));
            Assert.False(second.Headers.Any(h => h.Name == "Set-Cookie"));

            var third = sharedCrawler.Request(new Request("http://www.kuaidaili.com/"));
            Assert.False(third.Headers.Any(h => h.Name == "Set-Cookie"));
        }
Ejemplo n.º 30
0
        /// <summary>
        /// Checks the <c>IsRaw</c> flag of responses: expected true for two image URLs and
        /// false for the baidu home page.
        /// </summary>
        public void TestMime()
        {
            var sharedCrawler = new RuiJiCrawler();

            var jpegReply = sharedCrawler.Request(new Request("http://img10.jiuxian.com/2018/0111/cd51bb851410404388155b3ec2c505cf4.jpg"));

            // Read exactly as the original does; the value itself is not asserted on.
            var extensions = jpegReply.Extensions;

            Assert.True(jpegReply.IsRaw);

            var avatarReply = sharedCrawler.Request(new Request("https://avatars0.githubusercontent.com/u/16769087?s=460&v=4"));

            Assert.True(avatarReply.IsRaw);

            var htmlReply = sharedCrawler.Request(new Request("http://www.baidu.com/"));

            Assert.False(htmlReply.IsRaw);
        }