/// <summary>
/// Checks that, after <c>PrepareHttpClient</c> runs on a pooled client entry,
/// the client's timeout equals 8 seconds.
/// </summary>
public void SetTimeout()
        {
            var sut         = new HttpClientDownloader();
            var pooledEntry = HttpClientDownloader.HttpClientPool.GetHttpClient("a");

            sut.PrepareHttpClient(pooledEntry);

            var timeout = pooledEntry.Client.Timeout;
            Assert.Equal(8, timeout.TotalSeconds);
        }
        /// <summary>
        /// Downloads a JD item URL and checks the page's target URL: it must not
        /// contain the requested item id, and it must contain either
        /// "www.jd.com/" or "global.jd.com".
        /// </summary>
        public void GetTargetUrlWhenRedirect()
        {
            var client  = new HttpClientDownloader();
            var request = new Request("http://item.jd.com/1231222221111123.html", null);
            var page    = client.Download(request);

            Assert.DoesNotContain("1231222221111123", page.TargetUrl);

            var landedOnKnownHost = page.TargetUrl.Contains("www.jd.com/") ||
                                    page.TargetUrl.Contains("global.jd.com");
            Assert.True(landedOnKnownHost);
        }
        /// <summary>
        /// Issues 100 sequential downloads of the same URL through a single
        /// <c>HttpClientDownloader</c> instance. The results are discarded.
        /// </summary>
        public void Ports()
        {
            var downloader = new HttpClientDownloader();

            var remaining = 100;
            while (remaining > 0)
            {
                downloader.Download(new Request("http://www.163.com", null));
                remaining--;
            }
        }
示例#4
0
 /// <summary>
 /// Queues a Sina news search request for the keyword "赵丽颖" covering a date
 /// window from seven years and one day ago until tomorrow, then registers the
 /// console pipeline, the HTTP downloader with a <c>ReplaceHandler</c>, and the
 /// <c>SinaNews</c> entity type.
 /// </summary>
 /// <param name="arguments">Not read by this method.</param>
 protected override void OnInit(params string[] arguments)
 {
     // stime/etime are machine-readable query parameters, so format them with the
     // invariant culture: the current culture may use a non-Gregorian calendar and
     // would then emit a different year. A single clock snapshot keeps both ends
     // of the window consistent if the call straddles midnight.
     var invariant = System.Globalization.CultureInfo.InvariantCulture;
     var now       = DateTime.Now;
     var startDate = now.AddYears(-7).AddDays(-1).ToString("yyyy-MM-dd", invariant);
     var endDate   = now.AddDays(1).ToString("yyyy-MM-dd", invariant);

     AddRequest($"http://api.search.sina.com.cn/?c=news&t=&q=赵丽颖&pf=2136012948&ps=2130770082&page=0&stime={startDate}&etime={endDate}&sort=rel&highlight=1&num=10&ie=utf-8&callback=jQuery1720001955628746606708_1508996230766&_=1508996681484", new Dictionary <string, dynamic> {
         { "keyword", "赵丽颖" }
     });
     AddPipeline(new ConsoleEntityPipeline());
     Downloader = new HttpClientDownloader();
     Downloader.AddAfterDownloadCompleteHandler(new ReplaceHandler());
     AddEntityType <SinaNews>();
 }
示例#5
0
        /// <summary>
        /// Downloads a JD item URL through a default spider with an unconfigured
        /// site and checks that the page's target URL ends with "www.jd.com/?d".
        /// </summary>
        public void GetTargetUrlWhenRedirect()
        {
            var site       = new Site();
            var downloader = new HttpClientDownloader();
            var request    = new Request("http://item.jd.com/1231222221111123.html", null);
            var spider     = new DefaultSpider("test", site);

            var page = downloader.Download(request, spider);

            Assert.EndsWith("www.jd.com/?d", page.TargetUrl);
        }
        /// <summary>
        /// Runs 100 sequential downloads of the same URL with one downloader and
        /// one spider, reading <c>Result</c> of each download before the next.
        /// </summary>
        public void Ports()
        {
            var downloader = new HttpClientDownloader();
            var spider     = new DefaultSpider("abcd", new Site());

            var remaining = 100;
            while (remaining > 0)
            {
                var page = downloader.Download(new Request("http://www.163.com", null), spider).Result;
                remaining--;
            }
        }
示例#7
0
 /// <summary>
 /// Downloads a page with a request that carries a single custom header
 /// ("Cookies" = "a=b"). The response is not inspected.
 /// </summary>
 public void Download()
 {
     var downloader = new HttpClientDownloader();
     var request    = new Request("http://www.163.com")
     {
         Headers = new System.Collections.Generic.Dictionary <string, object>
         {
             { "Cookies", "a=b" }
         }
     };

     downloader.Download(request);
 }
示例#8
0
        /// <summary>
        /// Sends a request built by the downloader through a client entry created
        /// by the same downloader, then prints the status code and at most the
        /// first 500 characters of the response content to the console.
        /// </summary>
        static void RunTest_GetPageWithCustomRequestAndSite()
        {
            const int previewLength = 500;

            var downloader      = new HttpClientDownloader();
            var httpClientEntry = downloader.CreateHttpClientEntry();

            var request     = new Request("http://google.co.kr");
            var site        = new Site();
            var httpMessage = downloader.GenerateHttpRequestMessage(request, site);
            var result      = httpClientEntry.Client.SendAsync(httpMessage).Result;

            // Substring(0, 500) throws ArgumentOutOfRangeException when the body is
            // shorter than 500 characters (error pages, redirects, empty bodies),
            // so clamp the preview to the actual length.
            string content = downloader.ReadContent(site, result) ?? string.Empty;
            string preview = content.Length > previewLength
                ? content.Substring(0, previewLength)
                : content;

            System.Console.WriteLine($"[{result.StatusCode}] {preview}");
        }
        /// <summary>
        /// Runs 100 sequential downloads of the same URL with one downloader and
        /// a spider whose site has <c>Timeout</c> set to 5000.
        /// </summary>
        public void Ports()
        {
            var site       = new Site { Timeout = 5000 };
            var downloader = new HttpClientDownloader();
            var spider     = new DefaultSpider("abcd", site);

            var remaining = 100;
            while (remaining > 0)
            {
                downloader.Download(new Request("http://www.163.com", 0, null), spider);
                remaining--;
            }
        }
示例#10
0
        /// <summary>
        /// Creates the shared <c>HttpClient</c> downloader and registers two
        /// cookies on it for the ".qichacha.com" domain.
        /// </summary>
        static MyDownloader()
        {
            HttpClient = new HttpClientDownloader();

            // NOTE(review): the session cookie is hard-coded here; consider
            // loading it from configuration instead of source.
            var cookiePairs = new List <string>
            {
                "QCCSESSID=u1qlm3camss8fo0cg61ltd6010",
                "_uab_collina=154147884445273729213581"
            };

            var cookieString = string.Join(";", cookiePairs);
            HttpClient.AddCookies(cookieString, ".qichacha.com");
        }
示例#11
0
        /// <summary>
        /// Gives the spider a timestamped identity, installs an HTTP downloader
        /// with an <c>IncrementTargetUrlsBuilder</c> for "index_1.shtml", sets the
        /// thread count to 1, and registers two start URLs and two entity types.
        /// </summary>
        /// <param name="arguments">Not read by this method.</param>
        protected override void MyInit(params string[] arguments)
        {
            Identity = $"qidian_{DateTime.Now:yyyy_MM_dd_HHmmss}";

            var httpDownloader = new HttpClientDownloader();
            httpDownloader.AddAfterDownloadCompleteHandler(new IncrementTargetUrlsBuilder("index_1.shtml"));

            Downloader = httpDownloader;
            ThreadNum  = 1;

            foreach (var startUrl in new[]
            {
                "http://www.cas.cn/kx/kpwz/index.shtml",
                "http://www.cas.cn/kx/kpwz/index_1.shtml"
            })
            {
                AddStartUrl(startUrl);
            }

            AddEntityType(typeof(ArticleSummary));
            AddEntityType(typeof(Article));
        }
        /// <summary>
        /// Sets up a single-threaded spider that downloads with the HTTP client,
        /// writes entities through a MySQL pipeline, and starts from one JD list
        /// page tagged with "name" and "cat3" extras.
        /// </summary>
        /// <param name="arguments">Not read by this method.</param>
        protected override void MyInit(params string[] arguments)
        {
            ThreadNum  = 1;
            Downloader = new HttpClientDownloader();

            // The connection string sets SslMode=None explicitly.
            var pipeline = new MySqlEntityPipeline("Database='mysql';Data Source=localhost;User ID=root;Password=;Port=3306;SslMode=None;");
            AddPipeline(pipeline);

            var extras = new Dictionary <string, object>
            {
                { "name", "手机" },
                { "cat3", "655" }
            };
            AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", extras);

            AddEntityType(typeof(Product));
        }
        /// <summary>
        /// Runs three downloads in parallel (max degree of parallelism 3), each
        /// through its own clone of a single <c>HttpClientDownloader</c>.
        /// </summary>
        public void ParallelDownloader()
        {
            var downloader = new HttpClientDownloader();

            Parallel.For(0, 3, new ParallelOptions
            {
                MaxDegreeOfParallelism = 3
            }, i =>
            {
                // Download through the per-iteration clone. Previously the clone
                // was created and then ignored, so every iteration went through
                // the shared instance and the clone path was never exercised.
                var d = downloader.Clone();
                d.Download(new Request("http://www.163.com"));
            });
        }
        /// <summary>
        /// Downloads the same URL with <c>HttpClientDownloader</c> and
        /// <c>HttpClientDownloader2</c>, expecting the first result to report
        /// <c>ContentType.Html</c> and the second <c>ContentType.Json</c>.
        /// </summary>
        public void DetectDownloadContent()
        {
            var htmlDownloader = new HttpClientDownloader();
            var htmlResult     = htmlDownloader.Download(new Request("http://www.163.com", null));
            Assert.Equal(ContentType.Html, htmlResult.ContentType);

            var jsonDownloader = new HttpClientDownloader2();
            var jsonResult     = jsonDownloader.Download(new Request("http://www.163.com", null));
            Assert.Equal(ContentType.Json, jsonResult.ContentType);
        }
示例#15
0
        /// <summary>
        /// Sets up a single-threaded spider that downloads with the HTTP client,
        /// saves entities through a MySQL pipeline, and starts from one JD list
        /// page tagged with "name" and "cat3" extras.
        /// </summary>
        /// <param name="arguments">Not read by this method.</param>
        protected override void MyInit(params string[] arguments)
        {
            ThreadNum  = 1;
            Downloader = new HttpClientDownloader();

            // NOTE(review): database credentials are embedded in this literal;
            // consider moving them to configuration.
            var pipeline = new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306");
            AddPipeline(pipeline);

            var extras = new Dictionary <string, object>
            {
                { "name", "手机" },
                { "cat3", "655" }
            };
            AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", extras);

            AddEntityType(typeof(Product));
        }
示例#16
0
        /// <summary>
        /// One-time component setup, skipped when <c>_init</c> is already set.
        /// Hooks the console cancel event, supplies a default downloader and a
        /// default file pipeline when none are configured, initialises every
        /// pipeline, and hands the start requests to the scheduler.
        /// </summary>
        public void InitComponent()
        {
            if (_init)
            {
                return;
            }

            Console.CancelKeyPress += ConsoleCancelKeyPress;

            if (Downloader == null)
            {
                Downloader = new HttpClientDownloader();
            }

            if (Pipelines.Count == 0)
            {
                Pipelines.Add(new FilePipeline());
            }

            foreach (var pipeline in Pipelines)
            {
                pipeline.InitPipeline(this);
            }

            var hasStartRequests = StartRequests != null && StartRequests.Count > 0;
            if (!hasStartRequests)
            {
                Logger.Info("添加链接到调度中心, 数量: 0.");
            }
            else
            {
                Logger.Info($"添加链接到调度中心, 数量: {StartRequests.Count}.");

                var pushOneByOne = (Scheduler is QueueDuplicateRemovedScheduler) || (Scheduler is PriorityScheduler);
                if (pushOneByOne)
                {
                    // These two scheduler types receive requests one by one,
                    // pushed from up to four parallel workers.
                    var options = new ParallelOptions { MaxDegreeOfParallelism = 4 };
                    Parallel.ForEach(StartRequests, options, request =>
                    {
                        Scheduler.Push(request);
                    });
                }
                else
                {
                    // Any other scheduler loads the whole set at once; only this
                    // branch clears the start requests afterwards.
                    Scheduler.Load(new HashSet <Request>(StartRequests));
                    ClearStartRequests();
                }
            }

            _init = true;
        }
        /// <summary>
        /// Downloads a JD item URL through a spider whose site sends a "Chrome"
        /// User-Agent header and checks that the page's target URL ends with
        /// "www.jd.com/?d".
        /// </summary>
        public void GetTargetUrlWhenRedirect()
        {
            var site = new Site
            {
                Headers = new Dictionary <string, string> { { "User-Agent", "Chrome" } }
            };
            var downloader = new HttpClientDownloader();
            var request    = new Request("http://item.jd.com/1231222221111123.html", null);
            var spider     = new DefaultSpider("test", site);

            var page = downloader.Download(request, spider);

            Assert.EndsWith("www.jd.com/?d", page.TargetUrl);
        }
示例#18
0
 /// <summary>
 /// Configures a Taobao search spider: site headers and cookies, a Redis
 /// scheduler, an HTTP downloader with a content-trimming handler and a
 /// target-URL incrementer, 20 threads, and (unless "noprepare" is passed)
 /// start URLs prepared from a database query.
 /// </summary>
 /// <param name="arguments">When it contains "noprepare", start-URL preparation is not configured.</param>
 protected override void MyInit(params string[] arguments)
 {
     var headers = new Dictionary <string, string>
     {
         { "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8" },
         { "Referer", "https://www.taobao.com/?spm=a230r.1.0.0.ebb2eb2VkWVc7" }
     };
     // NOTE(review): a captured browser cookie string is embedded below;
     // consider loading it from configuration.
     Site = new Site
     {
         Headers           = headers,
         CookiesStringPart = "thw=cn; miid=715530502217916458; tracknick=style9898123; _cc_=VT5L2FSpdA%3D%3D; tg=0; t=fdf1eb945c2d6b41909558f5c373c37e; cookie2=1cb7771c61122989bb7327f9116858cb; v=0; mt=ci=-1_0; cna=wBEiEVwsTwoCAXTrIc4M/zwX; _tb_token_=e38beee05307e; l=AhoatVWMG7a9HNd5Ar0vu7CJ6so0I54m; isg=AlhY8tnotW2k3pghow1NKSZGIYbqQbzLLM8WWZJJ0RNGLfgXOlGMW27LMVzj; uc3=nk2=EEomLiIV%2BYptPBTr&id2=VyySWWIEs2Gx&vt3=F8dARV%2Bke6706b8vtTM%3D&lg2=VT5L2FSpMGV7TQ%3D%3D; existShop=MTQ5NTYxOTEwMA%3D%3D; lgc=style9898123; skt=57e445e7876bfe9c; publishItemObj=Ng%3D%3D; _m_user_unitinfo_=unit|unzbyun; _m_unitapi_v_=1492572565585; _m_h5_tk=a64b9ef97931dc791ae1708fa1293e93_1496410667055; _m_h5_tk_enc=ade4c443f5c9b6358cfb9821ccf02282; UM_distinctid=15c39e8263a835-05097c28e0b965-37624605-1fa400-15c39e8263bbcd; ali_ab=116.235.37.69.1495620049800.4; linezing_session=3vGYfK3a2T0nRJgCZKSJS15W_1497875606644xXAh_3; uc2=wuf=https%3A%2F%2Fpassport.alibaba.com%2Fac%2Fpassword_reset.htm%3FfromSite%3D6%26appName%3Daliyun%26lang%3Dzh_CN; uc1=cookie14=UoW%2BsOlp%2B6aVYg%3D%3D"
     };

     Scheduler = new RedisScheduler(Configuration.RedisConnectString);

     var pageConfigExtractor = new SubContentHandler
     {
         StartOffset = 16,
         EndOffset   = 22,
         Start       = "g_page_config = {",
         End         = "g_srp_loadCss();"
     };
     var pager = new IncrementTargetUrlsCreator("&s=0", null, 44);
     Downloader = new HttpClientDownloader
     {
         DownloadCompleteHandlers = new IDownloadCompleteHandler[] { pageConfigExtractor, pager }
     };

     ThreadNum             = 20;
     SkipWhenResultIsEmpty = true;

     if (!arguments.Contains("noprepare"))
     {
         var keywordSource = new BaseDbPrepareStartUrls
         {
             BulkInsert    = true,
             ConnectString = Configuration.ConnectString,
             QueryString   = "SELECT * FROM taobao.result_keywords",
             Columns       = new []
             {
                 new DataColumn("bidwordstr"),
                 new DataColumn("tab")
             },
             FormateStrings = new List <string>
             {
                 "https://s.taobao.com/search?q={0}&imgfile=&js=1&stats_click=search_radio_all%3A1&ie=utf8&sort=sale-desc&s=0&tab={1}"
             }
         };
         PrepareStartUrls = new PrepareStartUrls[] { keywordSource };
     }

     AddEntityType(typeof(Item), new MyDataHanlder());
 }
        /// <summary>
        /// Downloads a JD item URL through a spider whose site sends a "Chrome"
        /// User-Agent header and checks that the page's target URL contains
        /// either "www.jd.com/2017?t=" or "global.jd.com".
        /// </summary>
        public void GetTargetUrlWhenRedirect()
        {
            var site = new Site
            {
                Headers = new Dictionary <string, string> { { "User-Agent", "Chrome" } }
            };
            var downloader = new HttpClientDownloader();
            var request    = new Request("http://item.jd.com/1231222221111123.html", null);
            var spider     = new DefaultSpider("test", site);

            var page = downloader.Download(request, spider);

            var landedOnKnownHost = page.TargetUrl.Contains("www.jd.com/2017?t=") ||
                                    page.TargetUrl.Contains("global.jd.com");
            Assert.True(landedOnKnownHost);
        }
示例#20
0
        /// <summary>
        /// Prepares the spider's components: initialises the scheduler, supplies a
        /// default downloader, file pipeline and thread pool where none is set,
        /// pushes cloned start requests to the scheduler, and (once only) wires up
        /// console handling.
        /// </summary>
        protected void InitComponent()
        {
            Scheduler.Init(this);

            // Fall back to the HTTP client downloader when none was configured.
            if (Downloader == null)
            {
                Downloader = new HttpClientDownloader();
            }

            Downloader.SetThreadNum(ThreadNum);

            // Fall back to a file pipeline when no pipeline was added.
            if (Pipelines.Count == 0)
            {
                Pipelines.Add(new FilePipeline());
            }
            // (Re)create the thread pool when it is missing or already shut down.
            if (ThreadPool == null || ThreadPool.IsShutdown)
            {
                ThreadPool = new CountableThreadPool(ThreadNum);
            }
            if (StartRequests != null)
            {
                // Each start request is cloned before being pushed, using up to
                // 100 parallel workers.
                Parallel.ForEach(StartRequests, new ParallelOptions()
                {
                    MaxDegreeOfParallelism = 100
                }, request =>
                {
                    Scheduler.Push((Request)request.Clone(), this);
                });

                ClearStartRequests();
                Logger.InfoFormat("Push Request to Scheduler success.");
            }

            // Console wiring is done once, guarded by _registConsoleCtrlHandler.
            if (!_registConsoleCtrlHandler)
            {
                Console.Title             = Identify;
                Console.CancelKeyPress   += Console_CancelKeyPress;
                _registConsoleCtrlHandler = true;

                // Locate the console window by its title.
                int windowHandler = FindWindow(null, Identify);
                // Look up the menu that holds the close button.
                IntPtr closeMenu = GetSystemMenu((IntPtr)windowHandler, IntPtr.Zero);
                int    SC_CLOSE  = 0xF060;
                // Disable the close button.
                RemoveMenu(closeMenu, SC_CLOSE, 0x0);
            }
        }
        /// <summary>
        /// Downloads the same URL with <c>HttpClientDownloader</c> and
        /// <c>HttpClientDownloader2</c>, each through its own spider, and checks
        /// that the spider's site content type is Html after the first download
        /// and Json after the second.
        /// </summary>
        public void DetectDownloadContent()
        {
            var htmlDownloader = new HttpClientDownloader();
            var htmlSpider     = new DefaultSpider("abcd", new Site());

            htmlDownloader.Download(new Request("http://www.163.com", null), htmlSpider);
            Assert.Equal(Core.Infrastructure.ContentType.Html, htmlSpider.Site.ContentType);

            var jsonDownloader = new HttpClientDownloader2();
            var jsonSpider     = new DefaultSpider("abcd", new Site());

            jsonDownloader.Download(new Request("http://www.163.com", null), jsonSpider);
            Assert.Equal(Core.Infrastructure.ContentType.Json, jsonSpider.Site.ContentType);
        }
示例#22
0
        /// <summary>
        /// Create a new spider to fetch data from some website.
        /// See usage examples on the GitHub page.
        /// Components not supplied through <paramref name="params"/> (configuration,
        /// cacher, downloader, and the link collector when anchor auto-collection is
        /// enabled) are replaced by default instances.
        /// </summary>
        /// <param name="spiderName">A unique name for this spider. Folder will be created with that name</param>
        /// <param name="baseUri">The base Uri of the website. Pages outside this Host will not be fetched</param>
        /// <param name="params">Additional initialization parameters; may be null</param>
        public SimpleSpider(string spiderName, Uri baseUri, InitializationParams @params = null)
        {
            SpiderName = spiderName;
            BaseUri    = baseUri;

            // Take the caller-supplied components first; both stay null when
            // @params is null or leaves them unset (defaults are applied below).
            Cacher     = @params?.Cacher;
            Downloader = @params?.Downloader;

            // Use the supplied configuration prototype, or a fresh Configuration.
            Configuration = @params?.ConfigurationPrototype ?? new Configuration();
            initializeConfiguration(spiderName, @params);

            // A default link processor is only created when anchor auto-collection
            // is enabled and the caller did not provide a collector.
            LinkCollector = @params?.LinkCollector;
            if (Configuration.Auto_AnchorsLinks && LinkCollector == null)
            {
                LinkCollector = new LinkProcessors.SimpleProcessor();
            }

            initializeQueues();
            // initialize read-only
            if (Cacher == null)
            {
                Cacher = new ContentCacher();
            }
            if (Downloader == null)
            {
                Downloader = new HttpClientDownloader();
            }

            initializeFetchers();
            FetchCompleted += fetchCompleted_AutoCollect;
            FetchRewrite   += fetchRewrite_AutoRewrite;

            // Start with an empty parser list and append any caller-supplied parsers.
            Parsers = new List <IParserBase>();
            if (@params?.Parsers != null)
            {
                Parsers.AddRange(@params.Parsers);
            }

            // Storage is optional; when provided it is initialized with the
            // spider's configuration.
            if (@params?.StorageEngine != null)
            {
                Storage = @params.StorageEngine;
                Storage.Initialize(Configuration);
            }

            logInitialStatus();
        }
示例#23
0
        /// <summary>
        /// Registers two JD chat-status start URLs, installs an HTTP downloader
        /// whose <c>SubContentHandler</c> is configured with the "json(" and ");"
        /// markers of the JSONP response, and adds a MySQL pipeline and the
        /// <c>ProductUpdater</c> entity type.
        /// </summary>
        /// <param name="arguments">Not read by this method.</param>
        protected override void MyInit(params string[] arguments)
        {
            Site.AddStartUrl("http://chat1.jd.com/api/checkChat?my=list&pidList=3355984&callback=json");
            Site.AddStartUrl("http://chat1.jd.com/api/checkChat?my=list&pidList=3682523&callback=json");
            var downloader = new HttpClientDownloader();

            downloader.AddAfterDownloadCompleteHandler(new SubContentHandler
            {
                Start       = "json(",
                End         = ");",
                StartOffset = 5,
                EndOffset   = 2
            });

            // Install the configured downloader. Without this assignment the local
            // instance, and the handler attached to it, was discarded and never
            // used by the spider.
            Downloader = downloader;

            AddPipeline(new MySqlEntityPipeline("Database='mysql';Data Source=localhost ;User ID=root;Password=1qazZAQ!;Port=3306"));
            AddEntityType(typeof(ProductUpdater));
        }
示例#24
0
        /// <summary>
        /// Downloads one page through a freshly created spider, prints its
        /// content, then starts, stops and restarts the spider with a
        /// ten-second pause after each step.
        /// </summary>
        public void TestStartAndStop()
        {
            const int pauseMilliseconds = 10000;

            var downloader = new HttpClientDownloader();

            var site      = new Site { EncodingName = "UTF-8" };
            var processor = new SimplePageProcessor("http://www.oschina.net/", "http://www.oschina.net/*");
            Core.Spider spider = Core.Spider.Create(site, processor)
                                            .AddPipeline(new TestPipeline())
                                            .SetThreadNum(1);

            var  request = new Request("http://www.baidu.com/", 2, new Dictionary <string, dynamic>());
            Page page    = downloader.Download(request, spider);
            Console.WriteLine(page.Content);

            spider.Start();
            Thread.Sleep(pauseMilliseconds);

            spider.Stop();
            Thread.Sleep(pauseMilliseconds);

            spider.Start();
            Thread.Sleep(pauseMilliseconds);
        }
示例#25
0
 /// <summary>
 /// Installs an HTTP downloader with an <c>IncrementTargetUrlsCreator</c> for
 /// "index_1.shtml", uses 10 threads, gives the spider a timestamped
 /// identity, and registers a MySQL pipeline, two start URLs and two entity
 /// types.
 /// </summary>
 protected override void MyInit()
 {
     var pager = new IncrementTargetUrlsCreator("index_1.shtml");
     Downloader = new HttpClientDownloader
     {
         DownloadCompleteHandlers = new IDownloadCompleteHandler[] { pager }
     };

     SetThreadNum(10);
     Identity = $"qidian_{DateTime.Now:yyyy_MM_dd_HHmmss}";

     // NOTE(review): database credentials are embedded in this literal;
     // consider moving them to configuration.
     AddPipeline(new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));

     foreach (var startUrl in new[]
     {
         "http://www.cas.cn/kx/kpwz/index.shtml",
         "http://www.cas.cn/kx/kpwz/index_1.shtml"
     })
     {
         AddStartUrl(startUrl);
     }

     AddEntityType(typeof(ArticleSummary));
     AddEntityType(typeof(Article));
 }
示例#26
0
        /// <summary>
        /// Downloads a favicon into a memory stream and checks the awaited
        /// progress value: when a non-zero size is reported, the downloaded
        /// amount must equal it and nothing may remain; the state must be
        /// <c>Finished</c> in every case.
        /// </summary>
        public async Task Download()
        {
            var downloader = new HttpClientDownloader();

            using (var target = new MemoryStream())
            {
                var transfer = downloader.Download(new Uri(@"https://google.com/favicon.ico"), target);

                using (transfer.Connect())
                {
                    var finalProgress = await transfer;

                    var sizeIsKnown = finalProgress.Size.Bits != 0;
                    if (sizeIsKnown)
                    {
                        finalProgress.Downloaded.Bits.Should().Be(finalProgress.Size.Bits);
                        finalProgress.Remaining.Bits.Should().Be(0);
                    }

                    finalProgress.State.Should().Be(DownloadProgress.TransferState.Finished);
                }
            }
        }
示例#27
0
        /// <summary>
        /// Writes a two-line cookie file (a host line, then "a=b;c=d"), injects
        /// it into a downloader through <c>FileCookieInject</c>, and checks that
        /// the downloader then reports cookies a=b and c=d for that host.
        /// </summary>
        public void InjectCookies()
        {
            const string cookieFile = "a.cookies";

            // Start from a clean file so earlier runs cannot leak lines into it.
            if (File.Exists(cookieFile))
            {
                File.Delete(cookieFile);
            }
            File.AppendAllLines(cookieFile, new[] { "www.baidu.com", "a=b;c=d" });

            var injector   = new FileCookieInject(cookieFile);
            var downloader = new HttpClientDownloader();
            injector.Inject(downloader);

            var cookies = downloader.GetCookies(new Uri("http://www.baidu.com"));

            Assert.Equal("b", cookies["a"].Value);
            Assert.Equal("d", cookies["c"].Value);
        }
示例#28
0
        /// <summary>
        /// Configures a single-threaded Taobao search spider: a Redis scheduler,
        /// an HTTP downloader that replaces "\/" with "/" in downloaded content
        /// and increments target URLs, and (unless "noprepare" is passed) a
        /// database-backed start-URL builder.
        /// </summary>
        /// <param name="arguments">When it contains "noprepare", no start-URL builder is added.</param>
        protected override void MyInit(params string[] arguments)
        {
            Scheduler = new RedisScheduler();

            var httpDownloader = new HttpClientDownloader();
            var slashUnescaper = new ReplaceContentHandler
            {
                NewValue = "/",
                OldValue = "\\/",
            };
            httpDownloader.AddAfterDownloadCompleteHandler(slashUnescaper);
            httpDownloader.AddAfterDownloadCompleteHandler(new IncrementTargetUrlsBuilder("&s=0", 44));

            Downloader            = httpDownloader;
            ThreadNum             = 1;
            SkipWhenResultIsEmpty = true;

            var prepareStartUrls = !arguments.Contains("noprepare");
            if (prepareStartUrls)
            {
                var keywordUrls = new DbStartUrlBuilder(
                    Database.MySql,
                    Env.DataConnectionStringSettings.ConnectionString,
                    "SELECT * FROM taobao.result_keywords limit 10000",
                    new[] { "bidwordstr", "tab" },
                    "https://s.taobao.com/search?q={0}&imgfile=&js=1&stats_click=search_radio_all%3A1&ie=utf8&sort=sale-desc&s=0&tab={1}");
                AddStartUrlBuilder(keywordUrls);
            }

            AddEntityType(typeof(Item), new MyDataHanlder());
        }
示例#29
0
 /// <summary>
 /// Configures a single-threaded Taobao search spider: a Redis scheduler, an
 /// HTTP downloader with a content-trimming handler and a target-URL
 /// incrementer, and (unless "noprepare" is passed) start URLs prepared from
 /// a database query.
 /// </summary>
 /// <param name="arguments">When it contains "noprepare", start-URL preparation is not configured.</param>
 protected override void MyInit(params string[] arguments)
 {
     Scheduler = new RedisScheduler(Config.RedisConnectString);

     var pageConfigExtractor = new SubContentHandler
     {
         StartOffset = 16,
         EndOffset   = 22,
         Start       = "g_page_config = {",
         End         = "g_srp_loadCss();"
     };
     var pager = new IncrementTargetUrlsCreator("&s=0", null, 44);
     Downloader = new HttpClientDownloader
     {
         DownloadCompleteHandlers = new IDownloadCompleteHandler[] { pageConfigExtractor, pager }
     };

     ThreadNum             = 1;
     SkipWhenResultIsEmpty = true;

     if (!arguments.Contains("noprepare"))
     {
         var keywordSource = new BaseDbPrepareStartUrls
         {
             BulkInsert    = true,
             ConnectString = Config.ConnectString,
             QueryString   = "SELECT * FROM taobao.result_keywords limit 10000",
             Columns       = new []
             {
                 new DataColumn("bidwordstr"),
                 new DataColumn("tab")
             },
             FormateStrings = new List <string>
             {
                 "https://s.taobao.com/search?q={0}&imgfile=&js=1&stats_click=search_radio_all%3A1&ie=utf8&sort=sale-desc&s=0&tab={1}"
             }
         };
         PrepareStartUrls = new PrepareStartUrls[] { keywordSource };
     }

     AddEntityType(typeof(Item), new MyDataHanlder());
 }
        /// <summary>
        /// Downloads the same URL twice with one downloader, first through a site
        /// carrying a "Content-Type" header and then through a site carrying a
        /// "ContentType" header. No assertions are made on the results.
        /// </summary>
        public void SetContentType()
        {
            var hyphenatedSite = new Site
            {
                Headers = new Dictionary <string, string> { { "Content-Type", "abcd" } }
            };
            var unhyphenatedSite = new Site
            {
                Headers = new Dictionary <string, string> { { "ContentType", "abcd" } }
            };

            var downloader = new HttpClientDownloader();

            downloader.Download(new Request("http://163.com", null), new DefaultSpider("test", hyphenatedSite));
            downloader.Download(new Request("http://163.com", null), new DefaultSpider("test", unhyphenatedSite));
        }