/// <summary>
/// Checks that a pooled client entry reports an 8 second timeout once
/// <c>PrepareHttpClient</c> has been applied to it.
/// </summary>
public void SetTimeout()
{
    var httpDownloader = new HttpClientDownloader();
    var pooledEntry = HttpClientDownloader.HttpClientPool.GetHttpClient("a");

    httpDownloader.PrepareHttpClient(pooledEntry);

    var configuredTimeout = pooledEntry.Client.Timeout;
    Assert.Equal(8, configuredTimeout.TotalSeconds);
}
/// <summary>
/// Downloads a jd.com item URL and checks that the page's target URL no longer
/// contains the requested item id and points at one of the two accepted hosts.
/// </summary>
public void GetTargetUrlWhenRedirect()
{
    var httpDownloader = new HttpClientDownloader();
    var itemRequest = new Request("http://item.jd.com/1231222221111123.html", null);

    var page = httpDownloader.Download(itemRequest);

    Assert.DoesNotContain("1231222221111123", page.TargetUrl);

    // Either landing host is accepted.
    bool landedOnAcceptedHost = page.TargetUrl.Contains("www.jd.com/")
                                || page.TargetUrl.Contains("global.jd.com");
    Assert.True(landedOnAcceptedHost);
}
/// <summary>
/// Issues 100 sequential downloads of the same URL through a single downloader instance.
/// The test has no assertions; it only passes if none of the downloads throws.
/// </summary>
public void Ports()
{
    var httpDownloader = new HttpClientDownloader();

    int remaining = 100;
    while (remaining > 0)
    {
        // The returned page is intentionally ignored.
        httpDownloader.Download(new Request("http://www.163.com", null));
        remaining--;
    }
}
/// <summary>
/// Configures the spider: queues one Sina news search request, registers a console
/// pipeline, installs an HTTP downloader with a <c>ReplaceHandler</c> and registers
/// the <c>SinaNews</c> entity type.
/// </summary>
/// <param name="arguments">Start-up arguments; not read by this method.</param>
protected override void OnInit(params string[] arguments)
{
    // The stime/etime query parameters span from 7 years and 1 day ago until tomorrow.
    var startDate = DateTime.Now.AddYears(-7).AddDays(-1).ToString("yyyy-MM-dd");
    var endDate = DateTime.Now.AddDays(1).ToString("yyyy-MM-dd");

    var searchUrl = $"http://api.search.sina.com.cn/?c=news&t=&q=赵丽颖&pf=2136012948&ps=2130770082&page=0&stime={startDate}&etime={endDate}&sort=rel&highlight=1&num=10&ie=utf-8&callback=jQuery1720001955628746606708_1508996230766&_=1508996681484";
    var requestExtras = new Dictionary<string, dynamic>
    {
        { "keyword", "赵丽颖" }
    };
    AddRequest(searchUrl, requestExtras);

    AddPipeline(new ConsoleEntityPipeline());

    Downloader = new HttpClientDownloader();
    Downloader.AddAfterDownloadCompleteHandler(new ReplaceHandler());

    AddEntityType<SinaNews>();
}
/// <summary>
/// Downloads a jd.com item URL through a default spider with an empty site and
/// checks that the page's target URL ends with "www.jd.com/?d".
/// </summary>
public void GetTargetUrlWhenRedirect()
{
    var site = new Site();
    var httpDownloader = new HttpClientDownloader();

    var itemRequest = new Request("http://item.jd.com/1231222221111123.html", null);
    var spider = new DefaultSpider("test", site);
    var page = httpDownloader.Download(itemRequest, spider);

    Assert.EndsWith("www.jd.com/?d", page.TargetUrl);
}
/// <summary>
/// Issues 100 sequential downloads of the same URL through one downloader and one spider,
/// blocking on each result before starting the next.
/// The test has no assertions; it only passes if none of the downloads throws.
/// </summary>
public void Ports()
{
    var httpDownloader = new HttpClientDownloader();
    var spider = new DefaultSpider("abcd", new Site());

    for (int attempt = 1; attempt <= 100; attempt++)
    {
        var pending = httpDownloader.Download(new Request("http://www.163.com", null), spider);

        // Reading Result waits for this download to finish; the value itself is not inspected.
        var finished = pending.Result;
    }
}
/// <summary>
/// Downloads a URL using a request that carries a single custom "Cookies" header.
/// The test has no assertions; it only passes if the download does not throw.
/// </summary>
public void Download()
{
    var httpDownloader = new HttpClientDownloader();

    var customHeaders = new System.Collections.Generic.Dictionary<string, object>();
    customHeaders.Add("Cookies", "a=b");

    var request = new Request("http://www.163.com")
    {
        Headers = customHeaders
    };

    // The response is intentionally ignored.
    httpDownloader.Download(request);
}
/// <summary>
/// Sends a request built by the downloader through a freshly created client entry and
/// prints the response status code followed by at most the first 500 characters of the body.
/// </summary>
static void RunTest_GetPageWithCustomRequestAndSite()
{
    const int previewLength = 500;

    var downloader = new HttpClientDownloader();
    var httpClientEntry = downloader.CreateHttpClientEntry();
    var request = new Request("http://google.co.kr");
    var site = new Site();
    var httpMessage = downloader.GenerateHttpRequestMessage(request, site);

    // Blocks until the response arrives; this is a synchronous console test helper.
    var result = httpClientEntry.Client.SendAsync(httpMessage).Result;

    // Substring(0, 500) throws when the body is shorter than 500 characters (or null),
    // so clamp the preview to the length that is actually available.
    var content = downloader.ReadContent(site, result) ?? string.Empty;
    var preview = content.Length > previewLength
        ? content.Substring(0, previewLength)
        : content;

    System.Console.WriteLine($"[{result.StatusCode}] {preview}");
}
/// <summary>
/// Issues 100 downloads of the same URL through one downloader, using a spider whose
/// site has <c>Timeout</c> set to 5000.
/// The test has no assertions; it only passes if none of the downloads throws.
/// </summary>
public void Ports()
{
    var httpDownloader = new HttpClientDownloader();
    var site = new Site { Timeout = 5000 };
    var spider = new DefaultSpider("abcd", site);

    for (int round = 1; round <= 100; round++)
    {
        var request = new Request("http://www.163.com", 0, null);
        httpDownloader.Download(request, spider);
    }
}
/// <summary>
/// Creates the shared HTTP downloader and seeds it with the cookies for ".qichacha.com".
/// </summary>
static MyDownloader()
{
    HttpClient = new HttpClientDownloader();

    // NOTE(review): the session cookie value is hard-coded in source; it looks like a
    // captured browser session - confirm whether it should come from configuration.
    // An "acw_tc" cookie and a single-cookie AddCookie call were previously disabled here.
    var cookiePairs = new List<string>
    {
        "QCCSESSID=u1qlm3camss8fo0cg61ltd6010",
        "_uab_collina=154147884445273729213581"
    };

    // Cookies are handed over as one ';'-separated string.
    var cookieHeader = string.Join(";", cookiePairs);
    HttpClient.AddCookies(cookieHeader, ".qichacha.com");
}
/// <summary>
/// Configures the spider: sets a time-stamped identity, installs an HTTP downloader with an
/// <c>IncrementTargetUrlsBuilder</c> handler, runs on one thread, queues two cas.cn start
/// URLs and registers the article entity types.
/// </summary>
/// <param name="arguments">Start-up arguments; not read by this method.</param>
protected override void MyInit(params string[] arguments)
{
    var timestamp = DateTime.Now.ToString("yyyy_MM_dd_HHmmss");
    Identity = "qidian_" + timestamp;

    var httpDownloader = new HttpClientDownloader();
    var pagingHandler = new IncrementTargetUrlsBuilder("index_1.shtml");
    httpDownloader.AddAfterDownloadCompleteHandler(pagingHandler);
    Downloader = httpDownloader;

    ThreadNum = 1;

    AddStartUrl("http://www.cas.cn/kx/kpwz/index.shtml");
    AddStartUrl("http://www.cas.cn/kx/kpwz/index_1.shtml");

    AddEntityType(typeof(ArticleSummary));
    AddEntityType(typeof(Article));
}
/// <summary>
/// Configures the spider: one thread, an HTTP downloader, a MySQL entity pipeline,
/// one jd.com listing start URL with extra values, and the <c>Product</c> entity type.
/// </summary>
/// <param name="arguments">Start-up arguments; not read by this method.</param>
protected override void MyInit(params string[] arguments)
{
    ThreadNum = 1;

    // Pages are downloaded with the HTTP client downloader.
    Downloader = new HttpClientDownloader();

    // Results are stored in MySQL. According to the original author the MySQL entity
    // pipeline is the default, so this line is optional; keep SslMode in the connection string.
    var mySqlPipeline = new MySqlEntityPipeline("Database='mysql';Data Source=localhost;User ID=root;Password=;Port=3306;SslMode=None;");
    AddPipeline(mySqlPipeline);

    var startUrlExtras = new Dictionary<string, object>
    {
        { "name", "手机" },
        { "cat3", "655" }
    };
    AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", startUrlExtras);

    AddEntityType(typeof(Product));
}
/// <summary>
/// Runs three downloads of the same URL in parallel, each through its own clone of a
/// shared downloader. The test has no assertions; it only passes if nothing throws.
/// </summary>
public void ParallelDownloader()
{
    var downloader = new HttpClientDownloader();
    var parallelOptions = new ParallelOptions { MaxDegreeOfParallelism = 3 };

    Parallel.For(0, 3, parallelOptions, i =>
    {
        // Download through the per-iteration clone. Previously the clone was created but
        // the shared instance was used, so the cloned downloader was never exercised.
        // The cast keeps the call on HttpClientDownloader whatever Clone() is declared to return.
        var clone = (HttpClientDownloader)downloader.Clone();
        clone.Download(new Request("http://www.163.com"));
    });
}
/// <summary>
/// Downloads the same URL with <c>HttpClientDownloader</c> and <c>HttpClientDownloader2</c>
/// and checks that the resulting pages report Html and Json content types respectively.
/// </summary>
public void DetectDownloadContent()
{
    var htmlDownloader = new HttpClientDownloader();
    var htmlPage = htmlDownloader.Download(new Request("http://www.163.com", null));
    Assert.Equal(ContentType.Html, htmlPage.ContentType);

    var jsonDownloader = new HttpClientDownloader2();
    var jsonPage = jsonDownloader.Download(new Request("http://www.163.com", null));
    Assert.Equal(ContentType.Json, jsonPage.ContentType);
}
/// <summary>
/// Configures the spider: one thread, an HTTP downloader, a MySQL entity pipeline,
/// one jd.com listing start URL with extra values, and the <c>Product</c> entity type.
/// </summary>
/// <param name="arguments">Start-up arguments; not read by this method.</param>
protected override void MyInit(params string[] arguments)
{
    ThreadNum = 1;

    // Pages are downloaded with the HTTP client downloader.
    Downloader = new HttpClientDownloader();

    // Results are saved to MySQL.
    // NOTE(review): the database password is embedded in source - confirm it is a throwaway value.
    var mySqlPipeline = new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306");
    AddPipeline(mySqlPipeline);

    var startUrlExtras = new Dictionary<string, object>
    {
        { "name", "手机" },
        { "cat3", "655" }
    };
    AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", startUrlExtras);

    AddEntityType(typeof(Product));
}
/// <summary>
/// One-time component initialisation: hooks the console cancel event, applies the default
/// downloader and pipeline when none are configured, initialises every pipeline and hands
/// the start requests to the scheduler. Calls after the first return immediately.
/// </summary>
public void InitComponent()
{
    if (_init)
    {
        return;
    }

    Console.CancelKeyPress += ConsoleCancelKeyPress;

    // Fall back to the HTTP client downloader and the file pipeline when nothing was configured.
    if (Downloader == null)
    {
        Downloader = new HttpClientDownloader();
    }

    if (Pipelines.Count == 0)
    {
        Pipelines.Add(new FilePipeline());
    }

    foreach (var registeredPipeline in Pipelines)
    {
        registeredPipeline.InitPipeline(this);
    }

    bool nothingToSchedule = StartRequests == null || !(StartRequests.Count > 0);
    if (nothingToSchedule)
    {
        // Log message: "adding links to the scheduler, count: 0".
        Logger.Info("添加链接到调度中心, 数量: 0.");
    }
    else
    {
        // Log message: "adding links to the scheduler, count: N".
        Logger.Info($"添加链接到调度中心, 数量: {StartRequests.Count}.");

        bool pushOneByOne = (Scheduler is QueueDuplicateRemovedScheduler) || (Scheduler is PriorityScheduler);
        if (pushOneByOne)
        {
            // These two scheduler types receive the requests individually, at most 4 pushes at a time.
            // The start requests are not cleared on this path.
            var pushOptions = new ParallelOptions() { MaxDegreeOfParallelism = 4 };
            Parallel.ForEach(StartRequests, pushOptions, startRequest =>
            {
                Scheduler.Push(startRequest);
            });
        }
        else
        {
            // Any other scheduler loads the whole set at once, after which the start requests are cleared.
            Scheduler.Load(new HashSet<Request>(StartRequests));
            ClearStartRequests();
        }
    }

    _init = true;
}
/// <summary>
/// Downloads a jd.com item URL through a spider whose site sends a "Chrome" User-Agent
/// header and checks that the page's target URL ends with "www.jd.com/?d".
/// </summary>
public void GetTargetUrlWhenRedirect()
{
    var siteHeaders = new Dictionary<string, string>
    {
        { "User-Agent", "Chrome" }
    };
    var site = new Site { Headers = siteHeaders };
    var httpDownloader = new HttpClientDownloader();

    var itemRequest = new Request("http://item.jd.com/1231222221111123.html", null);
    var spider = new DefaultSpider("test", site);
    var page = httpDownloader.Download(itemRequest, spider);

    Assert.EndsWith("www.jd.com/?d", page.TargetUrl);
}
/// <summary>
/// Configures the Taobao search spider: site headers and cookies, a Redis scheduler, an HTTP
/// downloader with content-trimming and paging handlers, 20 threads, and - unless the
/// "noprepare" argument is given - start URLs built from the keyword table.
/// </summary>
/// <param name="arguments">Start-up arguments; "noprepare" skips the start-URL preparation.</param>
protected override void MyInit(params string[] arguments)
{
    var siteHeaders = new Dictionary<string, string>
    {
        { "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8" },
        { "Referer", "https://www.taobao.com/?spm=a230r.1.0.0.ebb2eb2VkWVc7" }
    };

    // NOTE(review): these look like cookies captured from a real browser session and are
    // embedded in source - confirm whether they should be loaded from configuration.
    Site = new Site
    {
        Headers = siteHeaders,
        CookiesStringPart = "thw=cn; miid=715530502217916458; tracknick=style9898123; _cc_=VT5L2FSpdA%3D%3D; tg=0; t=fdf1eb945c2d6b41909558f5c373c37e; cookie2=1cb7771c61122989bb7327f9116858cb; v=0; mt=ci=-1_0; cna=wBEiEVwsTwoCAXTrIc4M/zwX; _tb_token_=e38beee05307e; l=AhoatVWMG7a9HNd5Ar0vu7CJ6so0I54m; isg=AlhY8tnotW2k3pghow1NKSZGIYbqQbzLLM8WWZJJ0RNGLfgXOlGMW27LMVzj; uc3=nk2=EEomLiIV%2BYptPBTr&id2=VyySWWIEs2Gx&vt3=F8dARV%2Bke6706b8vtTM%3D&lg2=VT5L2FSpMGV7TQ%3D%3D; existShop=MTQ5NTYxOTEwMA%3D%3D; lgc=style9898123; skt=57e445e7876bfe9c; publishItemObj=Ng%3D%3D; _m_user_unitinfo_=unit|unzbyun; _m_unitapi_v_=1492572565585; _m_h5_tk=a64b9ef97931dc791ae1708fa1293e93_1496410667055; _m_h5_tk_enc=ade4c443f5c9b6358cfb9821ccf02282; UM_distinctid=15c39e8263a835-05097c28e0b965-37624605-1fa400-15c39e8263bbcd; ali_ab=116.235.37.69.1495620049800.4; linezing_session=3vGYfK3a2T0nRJgCZKSJS15W_1497875606644xXAh_3; uc2=wuf=https%3A%2F%2Fpassport.alibaba.com%2Fac%2Fpassword_reset.htm%3FfromSite%3D6%26appName%3Daliyun%26lang%3Dzh_CN; uc1=cookie14=UoW%2BsOlp%2B6aVYg%3D%3D"
    };

    Scheduler = new RedisScheduler(Configuration.RedisConnectString);

    // Handlers run after each download: the first is configured with the start/end markers
    // of the embedded page config, the second with the "&s=0" paging token.
    var pageConfigHandler = new SubContentHandler
    {
        StartOffset = 16,
        EndOffset = 22,
        Start = "g_page_config = {",
        End = "g_srp_loadCss();"
    };
    var pagingHandler = new IncrementTargetUrlsCreator("&s=0", null, 44);
    Downloader = new HttpClientDownloader
    {
        DownloadCompleteHandlers = new IDownloadCompleteHandler[] { pageConfigHandler, pagingHandler }
    };

    ThreadNum = 20;
    SkipWhenResultIsEmpty = true;

    bool skipPreparation = arguments.Contains("noprepare");
    if (!skipPreparation)
    {
        var keywordStartUrls = new BaseDbPrepareStartUrls
        {
            BulkInsert = true,
            ConnectString = Configuration.ConnectString,
            QueryString = "SELECT * FROM taobao.result_keywords",
            Columns = new[]
            {
                new DataColumn("bidwordstr"),
                new DataColumn("tab")
            },
            FormateStrings = new List<string>
            {
                "https://s.taobao.com/search?q={0}&imgfile=&js=1&stats_click=search_radio_all%3A1&ie=utf8&sort=sale-desc&s=0&tab={1}"
            }
        };
        PrepareStartUrls = new PrepareStartUrls[] { keywordStartUrls };
    }

    AddEntityType(typeof(Item), new MyDataHanlder());
}
/// <summary>
/// Downloads a jd.com item URL through a spider whose site sends a "Chrome" User-Agent
/// header and checks that the page's target URL contains one of the two accepted fragments.
/// </summary>
public void GetTargetUrlWhenRedirect()
{
    var siteHeaders = new Dictionary<string, string>
    {
        { "User-Agent", "Chrome" }
    };
    var site = new Site { Headers = siteHeaders };
    var httpDownloader = new HttpClientDownloader();

    var itemRequest = new Request("http://item.jd.com/1231222221111123.html", null);
    var spider = new DefaultSpider("test", site);
    var page = httpDownloader.Download(itemRequest, spider);

    // Either landing URL is accepted.
    bool landedOnAcceptedUrl = page.TargetUrl.Contains("www.jd.com/2017?t=")
                               || page.TargetUrl.Contains("global.jd.com");
    Assert.True(landedOnAcceptedUrl);
}
/// <summary>
/// Prepares the spider's components: initialises the scheduler, applies the default
/// downloader, pipeline and thread pool where needed, pushes clones of the start requests
/// to the scheduler and, once only, hooks the console cancel event and removes the
/// close item from the console window's system menu.
/// </summary>
protected void InitComponent()
{
    Scheduler.Init(this);

    if (Downloader == null)
    {
        Downloader = new HttpClientDownloader();
    }
    Downloader.SetThreadNum(ThreadNum);

    if (Pipelines.Count == 0)
    {
        Pipelines.Add(new FilePipeline());
    }

    bool needsThreadPool = ThreadPool == null || ThreadPool.IsShutdown;
    if (needsThreadPool)
    {
        ThreadPool = new CountableThreadPool(ThreadNum);
    }

    if (StartRequests != null)
    {
        // Each start request is cloned before being pushed; up to 100 pushes run concurrently.
        var pushOptions = new ParallelOptions() { MaxDegreeOfParallelism = 100 };
        Parallel.ForEach(StartRequests, pushOptions, startRequest =>
        {
            Scheduler.Push((Request)startRequest.Clone(), this);
        });
        ClearStartRequests();
        Logger.InfoFormat("Push Request to Scheduler success.");
    }

    // The console hook-up below happens only once.
    if (_registConsoleCtrlHandler)
    {
        return;
    }

    Console.Title = Identify;
    Console.CancelKeyPress += Console_CancelKeyPress;
    _registConsoleCtrlHandler = true;

    // Locate the console window by its title.
    int consoleWindow = FindWindow(null, Identify);

    // Fetch the system menu that holds the close item.
    IntPtr systemMenu = GetSystemMenu((IntPtr)consoleWindow, IntPtr.Zero);
    int closeCommand = 0xF060;

    // Remove the close item from the menu.
    RemoveMenu(systemMenu, closeCommand, 0x0);
}
/// <summary>
/// Downloads the same URL with <c>HttpClientDownloader</c> and <c>HttpClientDownloader2</c>,
/// each through its own spider, and checks that the spider's site content type ends up as
/// Html and Json respectively.
/// </summary>
public void DetectDownloadContent()
{
    var htmlDownloader = new HttpClientDownloader();
    var htmlSpider = new DefaultSpider("abcd", new Site());
    htmlDownloader.Download(new Request("http://www.163.com", null), htmlSpider);
    Assert.Equal(Core.Infrastructure.ContentType.Html, htmlSpider.Site.ContentType);

    var jsonDownloader = new HttpClientDownloader2();
    var jsonSpider = new DefaultSpider("abcd", new Site());
    jsonDownloader.Download(new Request("http://www.163.com", null), jsonSpider);
    Assert.Equal(Core.Infrastructure.ContentType.Json, jsonSpider.Site.ContentType);
}
/// <summary>
/// Creates a spider that fetches data from a website.
/// Usage examples are available on the GitHub page.
/// </summary>
/// <param name="spiderName">Unique name of this spider; a folder with that name will be created.</param>
/// <param name="baseUri">Base Uri of the website; pages outside this host will not be fetched.</param>
/// <param name="params">Optional initialization parameters; components left unset get defaults.</param>
public SimpleSpider(string spiderName, Uri baseUri, InitializationParams @params = null)
{
    SpiderName = spiderName;
    BaseUri = baseUri;

    // Take the caller-supplied components first; missing ones are defaulted further down.
    Cacher = @params?.Cacher;
    Downloader = @params?.Downloader;

    Configuration = @params?.ConfigurationPrototype ?? new Configuration();
    initializeConfiguration(spiderName, @params);

    LinkCollector = @params?.LinkCollector;
    bool needsDefaultCollector = Configuration.Auto_AnchorsLinks && LinkCollector == null;
    if (needsDefaultCollector)
    {
        LinkCollector = new LinkProcessors.SimpleProcessor();
    }

    initializeQueues();

    // Defaults for the read-only components.
    if (Cacher == null)
    {
        Cacher = new ContentCacher();
    }
    if (Downloader == null)
    {
        Downloader = new HttpClientDownloader();
    }

    initializeFetchers();

    FetchCompleted += fetchCompleted_AutoCollect;
    FetchRewrite += fetchRewrite_AutoRewrite;

    Parsers = new List<IParserBase>();
    var suppliedParsers = @params?.Parsers;
    if (suppliedParsers != null)
    {
        Parsers.AddRange(suppliedParsers);
    }

    var suppliedStorage = @params?.StorageEngine;
    if (suppliedStorage != null)
    {
        Storage = suppliedStorage;
        Storage.Initialize(Configuration);
    }

    logInitialStatus();
}
/// <summary>
/// Configures the spider: two jd.com chat-status start URLs, an HTTP downloader whose
/// <c>SubContentHandler</c> is configured with the <c>json(</c> … <c>);</c> markers,
/// a MySQL entity pipeline and the <c>ProductUpdater</c> entity type.
/// </summary>
/// <param name="arguments">Start-up arguments; not read by this method.</param>
protected override void MyInit(params string[] arguments)
{
    Site.AddStartUrl("http://chat1.jd.com/api/checkChat?my=list&pidList=3355984&callback=json");
    Site.AddStartUrl("http://chat1.jd.com/api/checkChat?my=list&pidList=3682523&callback=json");

    var downloader = new HttpClientDownloader();
    downloader.AddAfterDownloadCompleteHandler(new SubContentHandler
    {
        Start = "json(",
        End = ");",
        StartOffset = 5,
        EndOffset = 2
    });

    // Install the configured downloader. Previously it was built and then dropped,
    // so the SubContentHandler was never applied to any download.
    Downloader = downloader;

    // NOTE(review): the database password is embedded in source - confirm it is a throwaway value.
    AddPipeline(new MySqlEntityPipeline("Database='mysql';Data Source=localhost ;User ID=root;Password=1qazZAQ!;Port=3306"));
    AddEntityType(typeof(ProductUpdater));
}
/// <summary>
/// Downloads one page directly and prints its content, then starts the spider, stops it
/// and starts it again, waiting 10 seconds after each step.
/// </summary>
public void TestStartAndStop()
{
    var httpDownloader = new HttpClientDownloader();

    var site = new Site() { EncodingName = "UTF-8" };
    var processor = new SimplePageProcessor("http://www.oschina.net/", "http://www.oschina.net/*");
    Core.Spider spider = Core.Spider.Create(site, processor)
        .AddPipeline(new TestPipeline())
        .SetThreadNum(1);

    var probeRequest = new Request("http://www.baidu.com/", 2, new Dictionary<string, dynamic>());
    Page probePage = httpDownloader.Download(probeRequest, spider);
    Console.WriteLine(probePage.Content);

    const int pauseMilliseconds = 10000;

    spider.Start();
    Thread.Sleep(pauseMilliseconds);

    spider.Stop();
    Thread.Sleep(pauseMilliseconds);

    spider.Start();
    Thread.Sleep(pauseMilliseconds);
}
/// <summary>
/// Configures the spider: an HTTP downloader with an <c>IncrementTargetUrlsCreator</c>
/// handler, 10 threads, a time-stamped identity, a MySQL entity pipeline, two cas.cn
/// start URLs and the article entity types.
/// </summary>
protected override void MyInit()
{
    var pagingHandler = new IncrementTargetUrlsCreator("index_1.shtml");
    Downloader = new HttpClientDownloader
    {
        DownloadCompleteHandlers = new IDownloadCompleteHandler[] { pagingHandler }
    };

    SetThreadNum(10);

    var timestamp = DateTime.Now.ToString("yyyy_MM_dd_HHmmss");
    Identity = "qidian_" + timestamp;

    // NOTE(review): the database password is embedded in source - confirm it is a throwaway value.
    var mySqlPipeline = new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306");
    AddPipeline(mySqlPipeline);

    AddStartUrl("http://www.cas.cn/kx/kpwz/index.shtml");
    AddStartUrl("http://www.cas.cn/kx/kpwz/index_1.shtml");

    AddEntityType(typeof(ArticleSummary));
    AddEntityType(typeof(Article));
}
/// <summary>
/// Downloads a favicon into a memory stream and checks the final progress report:
/// the state must be Finished and, when a non-zero size was reported, the downloaded
/// amount must equal that size with nothing remaining.
/// </summary>
public async Task Download()
{
    var httpDownloader = new HttpClientDownloader();

    using (var target = new MemoryStream())
    {
        var transfer = httpDownloader.Download(new Uri("https://google.com/favicon.ico"), target);

        // The connection is held for as long as the awaited result is being checked.
        using (var connection = transfer.Connect())
        {
            var finalProgress = await transfer;

            bool sizeReported = finalProgress.Size.Bits != 0;
            if (sizeReported)
            {
                finalProgress.Downloaded.Bits.Should().Be(finalProgress.Size.Bits);
                finalProgress.Remaining.Bits.Should().Be(0);
            }

            finalProgress.State.Should().Be(DownloadProgress.TransferState.Finished);
        }
    }
}
/// <summary>
/// Writes a two-line cookie file (host, then cookie pairs), injects it into a downloader
/// through <c>FileCookieInject</c> and checks that both cookies are returned for that host.
/// </summary>
public void InjectCookies()
{
    const string cookieFile = "a.cookies";

    // Start from a clean file so earlier runs cannot leak into this one.
    if (File.Exists(cookieFile))
    {
        File.Delete(cookieFile);
    }

    // First line: host. Second line: ';'-separated cookie pairs.
    var fileLines = new[] { "www.baidu.com", "a=b;c=d" };
    File.AppendAllLines(cookieFile, fileLines);

    var injector = new FileCookieInject(cookieFile);
    var httpDownloader = new HttpClientDownloader();
    injector.Inject(httpDownloader);

    var injectedCookies = httpDownloader.GetCookies(new Uri("http://www.baidu.com"));
    Assert.Equal("b", injectedCookies["a"].Value);
    Assert.Equal("d", injectedCookies["c"].Value);
}
/// <summary>
/// Configures the Taobao search spider: a Redis scheduler, an HTTP downloader with a
/// content-replacing handler and a paging handler, one thread, and - unless the
/// "noprepare" argument is given - a database-backed start-URL builder.
/// </summary>
/// <param name="arguments">Start-up arguments; "noprepare" skips the start-URL builder.</param>
protected override void MyInit(params string[] arguments)
{
    Scheduler = new RedisScheduler();

    var httpDownloader = new HttpClientDownloader();

    // Handler configured to replace escaped slashes ("\/") with "/".
    var unescapeSlashes = new ReplaceContentHandler
    {
        NewValue = "/",
        OldValue = "\\/",
    };
    httpDownloader.AddAfterDownloadCompleteHandler(unescapeSlashes);
    httpDownloader.AddAfterDownloadCompleteHandler(new IncrementTargetUrlsBuilder("&s=0", 44));
    Downloader = httpDownloader;

    ThreadNum = 1;
    SkipWhenResultIsEmpty = true;

    bool skipPreparation = arguments.Contains("noprepare");
    if (!skipPreparation)
    {
        // The builder receives the query, two column names and the URL template ({0}, {1}).
        var keywordColumns = new[] { "bidwordstr", "tab" };
        var startUrlBuilder = new DbStartUrlBuilder(
            Database.MySql,
            Env.DataConnectionStringSettings.ConnectionString,
            "SELECT * FROM taobao.result_keywords limit 10000",
            keywordColumns,
            "https://s.taobao.com/search?q={0}&imgfile=&js=1&stats_click=search_radio_all%3A1&ie=utf8&sort=sale-desc&s=0&tab={1}");
        AddStartUrlBuilder(startUrlBuilder);
    }

    AddEntityType(typeof(Item), new MyDataHanlder());
}
/// <summary>
/// Configures the Taobao search spider: a Redis scheduler, an HTTP downloader with
/// content-trimming and paging handlers, one thread, and - unless the "noprepare"
/// argument is given - start URLs built from the keyword table.
/// </summary>
/// <param name="arguments">Start-up arguments; "noprepare" skips the start-URL preparation.</param>
protected override void MyInit(params string[] arguments)
{
    Scheduler = new RedisScheduler(Config.RedisConnectString);

    // Handlers run after each download: the first is configured with the start/end markers
    // of the embedded page config, the second with the "&s=0" paging token.
    var pageConfigHandler = new SubContentHandler
    {
        StartOffset = 16,
        EndOffset = 22,
        Start = "g_page_config = {",
        End = "g_srp_loadCss();"
    };
    var pagingHandler = new IncrementTargetUrlsCreator("&s=0", null, 44);
    Downloader = new HttpClientDownloader
    {
        DownloadCompleteHandlers = new IDownloadCompleteHandler[] { pageConfigHandler, pagingHandler }
    };

    ThreadNum = 1;
    SkipWhenResultIsEmpty = true;

    bool skipPreparation = arguments.Contains("noprepare");
    if (!skipPreparation)
    {
        var keywordStartUrls = new BaseDbPrepareStartUrls
        {
            BulkInsert = true,
            ConnectString = Config.ConnectString,
            QueryString = "SELECT * FROM taobao.result_keywords limit 10000",
            Columns = new[]
            {
                new DataColumn("bidwordstr"),
                new DataColumn("tab")
            },
            FormateStrings = new List<string>
            {
                "https://s.taobao.com/search?q={0}&imgfile=&js=1&stats_click=search_radio_all%3A1&ie=utf8&sort=sale-desc&s=0&tab={1}"
            }
        };
        PrepareStartUrls = new PrepareStartUrls[] { keywordStartUrls };
    }

    AddEntityType(typeof(Item), new MyDataHanlder());
}
/// <summary>
/// Downloads the same URL twice, once with a site carrying a "Content-Type" header and once
/// with a "ContentType" header. The test has no assertions; it only passes if neither
/// download throws.
/// </summary>
public void SetContentType()
{
    var hyphenatedHeaders = new Dictionary<string, string>() { { "Content-Type", "abcd" } };
    var site1 = new Site { Headers = hyphenatedHeaders };

    var compactHeaders = new Dictionary<string, string>() { { "ContentType", "abcd" } };
    var site2 = new Site { Headers = compactHeaders };

    var httpDownloader = new HttpClientDownloader();

    // Both results are intentionally ignored.
    httpDownloader.Download(new Request("http://163.com", null), new DefaultSpider("test", site1));
    httpDownloader.Download(new Request("http://163.com", null), new DefaultSpider("test", site2));
}