/// <summary>
/// Serves a page from the on-disk cache when a cache file for the request URL
/// exists and its header line matches; otherwise delegates to
/// <c>DownloadWhenMiss</c>.
/// </summary>
/// <param name="request">Request whose URL selects (via its MD5) the cache file.</param>
/// <param name="task">Task whose <c>Identify</c> value names the cache sub-directory.</param>
/// <returns>
/// The cached page on a hit; on a miss, whatever <c>DownloadWhenMiss</c> returns.
/// </returns>
public Page Download(Request request, ITask task)
{
    string path = BasePath + "/" + task.Identify + "/";
    Page page = null;
    try
    {
        FileInfo file = GetFile(path + Encrypt.Md5Encrypt(request.Url));

        // Dispose the reader (and its underlying stream) even if parsing throws.
        using (StreamReader reader = new StreamReader(file.OpenRead()))
        {
            string line = reader.ReadLine();

            // The first line of the cache file must name the URL it was stored for.
            if (("url:\t" + request.Url).Equals(line))
            {
                string html = GetHtml(reader);
                page = new Page(request);
                page.SetUrl(PlainText.Create(request.Url));
                page.SetHtml(Html.Create(html));
            }
        }
    }
    catch (IOException e)
    {
        // A missing cache file is an expected miss; any other I/O failure is a warning.
        if (e is FileNotFoundException)
        {
            _logger.Info("File not exist for url " + request.Url);
        }
        else
        {
            _logger.Warn("File read error for url " + request.Url, e);
        }
    }

    // Fall back only on a miss; previously the cached page was always discarded.
    if (page == null)
    {
        page = DownloadWhenMiss(request, task);
    }

    return page;
}
/// <summary>
/// Fixture setup: creates a <c>ResultItems</c> holding one "content" entry and
/// a request for http://www.baidu.com, plus a fresh <c>DefaultSpider</c>.
/// </summary>
public void Before()
{
    _resultItems = new ResultItems();
    _resultItems.AddOrUpdateResultItem("content", "爬虫工具");
    _resultItems.Request = new Request("http://www.baidu.com", 1, null);

    _spider = new DefaultSpider();
}
/// <summary>
/// Builds a page from a local file: the file at <c>request.Url.LocalPath</c>
/// is read as text and used as the page content.
/// </summary>
/// <param name="request">Request whose URL points at the local file.</param>
/// <param name="spider">Spider supplying the site's content type.</param>
/// <returns>A page with status code 200 whose URL and target URL are the request URL.</returns>
public Page Download(Request request, ISpider spider)
{
    return new Page(request, spider.Site.ContentType)
    {
        Content = File.ReadAllText(request.Url.LocalPath),
        TargetUrl = request.Url.ToString(),
        Url = request.Url.ToString(),
        StatusCode = 200
    };
}
/// <summary>
/// Creates a new <c>Request</c> from this instance's <c>Url</c> and a copy of
/// its <c>Extras</c> dictionary.
/// </summary>
/// <returns>
/// The new request. The dictionary is copied entry by entry; the values
/// themselves are not cloned.
/// </returns>
public object Clone()
{
    IDictionary<string, dynamic> copiedExtras = new Dictionary<string, dynamic>();
    foreach (var pair in Extras)
    {
        copiedExtras.Add(pair.Key, pair.Value);
    }

    return new Request(Url, copiedExtras);
}
/// <summary>
/// Smoke test: pushes one request carrying an extra into a Redis scheduler on
/// localhost and polls once. Nothing is asserted on the polled value.
/// </summary>
public void RedisTest()
{
    var scheduler = new RedisScheduler("localhost", "");
    ITask task = new TestTask();

    var request = new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/", null);
    request.PutExtra("1", "2");

    scheduler.Push(request, task);
    scheduler.Poll(task);
}
/// <summary>
/// Builds a sample request for http://www.taobao.com with one extra
/// ("Test" = "Forever"), method "get" and priority 1.
/// </summary>
/// <returns>The newly created request.</returns>
public static Request GetRequest()
{
    return new Request(
        "http://www.taobao.com",
        2,
        new Dictionary<string, dynamic> { { "Test", "Forever" } })
    {
        Method = "get",
        Priority = 1
    };
}
/// <summary>
/// Creates a single-threaded <c>OoSpider</c> for the <c>Ganji</c> model, queues
/// one POST request and runs the spider.
/// </summary>
public static void RunTask()
{
    Site site = new Site { SleepTime = 10000 };
    OoSpider spider = OoSpider.Create(site, new CollectorPageModelToDbPipeline(), typeof(Ganji));
    spider.SetThreadNum(1);

    Request startRequest = new Request("http://mobds.ganji.com/datashare/", null)
    {
        Method = "POST"
    };
    spider.AddRequest(startRequest);

    spider.Run();
}
/// <summary>
/// Checks whether the request URL is already in the task's Redis set and adds
/// it when it is not.
/// </summary>
/// <param name="request">Request whose URL is tested.</param>
/// <param name="task">Task passed to <c>GetSetKey</c> to pick the set.</param>
/// <returns><c>true</c> if the URL was already in the set; otherwise <c>false</c>.</returns>
public bool IsDuplicate(Request request, ITask task)
{
    using (var client = _pool.GetClient())
    {
        client.Password = _password;

        if (client.SetContainsItem(GetSetKey(task), request.Url))
        {
            return true;
        }

        // First sighting: record the URL so later calls report a duplicate.
        client.AddItemToSet(GetSetKey(task), request.Url);
        return false;
    }
}
/// <summary>
/// Pushes one request into a cleared Redis scheduler and verifies that the
/// first poll returns it and the second poll returns null.
/// </summary>
public void RedisTest()
{
    RedisScheduler redisScheduler = new RedisScheduler("localhost", "");
    // NOTE(review): 'spider' is never read in this test; kept in case
    // DefaultSpider construction has side effects — confirm and remove.
    ISpider spider = new DefaultSpider();
    try
    {
        redisScheduler.Clear();

        Request request = new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/", 1, null);
        request.PutExtra("1", "2");
        redisScheduler.Push(request);

        Request result = redisScheduler.Poll();
        Assert.AreEqual("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/", result.Url.ToString());

        Request result1 = redisScheduler.Poll();
        Assert.IsNull(result1);
    }
    finally
    {
        // Clean up before disposing (the original called Clear() on an already
        // disposed scheduler), and do so even when an assertion fails.
        try
        {
            redisScheduler.Clear();
        }
        finally
        {
            redisScheduler.Dispose();
        }
    }
}
/// <summary>
/// Pushes four requests and verifies that the first two polls return the
/// last-pushed requests in reverse push order (4, then 3).
/// </summary>
public void Redis_QueueTest()
{
    RedisScheduler redisScheduler = new RedisScheduler("localhost", "");
    // NOTE(review): 'spider' is never read in this test; kept in case
    // DefaultSpider construction has side effects — confirm and remove.
    ISpider spider = new DefaultSpider();
    try
    {
        // Start from an empty scheduler so leftovers from earlier runs cannot
        // change what Poll() returns.
        redisScheduler.Clear();

        Request request1 = new Request("http://www.ibm.com/1", 1, null);
        Request request2 = new Request("http://www.ibm.com/2", 1, null);
        Request request3 = new Request("http://www.ibm.com/3", 1, null);
        Request request4 = new Request("http://www.ibm.com/4", 1, null);
        redisScheduler.Push(request1);
        redisScheduler.Push(request2);
        redisScheduler.Push(request3);
        redisScheduler.Push(request4);

        Request result = redisScheduler.Poll();
        Assert.AreEqual("http://www.ibm.com/4", result.Url.ToString());

        Request result1 = redisScheduler.Poll();
        Assert.AreEqual("http://www.ibm.com/3", result1.Url.ToString());
    }
    finally
    {
        // Remove the two requests still queued, then release the scheduler —
        // also when an assertion fails.
        try
        {
            redisScheduler.Clear();
        }
        finally
        {
            redisScheduler.Dispose();
        }
    }
}
/// <summary>
/// Serves a page from the on-disk cache when a cache file for the request URL
/// exists and its header line matches; otherwise delegates to
/// <c>DownloadWhenMiss</c>.
/// </summary>
/// <param name="request">Request whose URL selects (via its MD5) the cache file.</param>
/// <param name="spider">
/// Spider supplying the cache sub-directory (<c>Identity</c>), the content
/// type and the logger.
/// </param>
/// <returns>
/// The cached page on a hit; on a miss, whatever <c>DownloadWhenMiss</c>
/// returns, which is null when no fallback downloader is configured.
/// </returns>
public Page Download(Request request, ISpider spider)
{
    string path = BasePath + "/" + spider.Identity + "/";
    Page page = null;
    try
    {
        FileInfo file = PrepareFile(path + Encrypt.Md5Encrypt(request.Url.ToString()));

        // Dispose the reader (and its underlying stream) even if parsing throws.
        using (StreamReader reader = new StreamReader(file.OpenRead()))
        {
            string line = reader.ReadLine();

            // The first line of the cache file must name the URL it was stored for.
            if (("url:\t" + request.Url).Equals(line))
            {
                string html = GetHtml(reader);
                page = new Page(request, spider.Site.ContentType);
                page.Url = request.Url.ToString();
                page.Content = html;
            }
        }
    }
    catch (IOException e)
    {
        // 'is' works on every target framework, so no conditional compilation
        // is needed; the former IsInstanceOfType(typeof(...)) test never matched.
        if (e is FileNotFoundException)
        {
            spider.Logger.Info("File not exist for url: " + request.Url);
        }
        else
        {
            spider.Logger.Warn("File read error for url " + request.Url, e);
        }
    }

    // Fall back only on a miss; previously the cached page was always discarded.
    if (page == null)
    {
        page = DownloadWhenMiss(request, spider);
    }

    return page;
}
/// <summary>
/// Checks whether the request's identity is already in the spider's Redis set
/// and adds it when it is not.
/// </summary>
/// <param name="request">Request whose <c>Identity</c> is tested.</param>
/// <param name="spider">Spider whose <c>Identity</c> selects the set key.</param>
/// <returns><c>true</c> if the identity was already in the set; otherwise <c>false</c>.</returns>
public bool IsDuplicate(Request request, ISpider spider)
{
    // NOTE(review): presumably SafeExecutor retries the delegate up to 30 times — confirm.
    return SafeExecutor.Execute(30, () =>
    {
        string setKey = GetSetKey(spider.Identity);
        if (Redis.SetContains(setKey, request.Identity))
        {
            return true;
        }

        Redis.SetAdd(setKey, request.Identity);
        return false;
    });
}
/// <summary>
/// Returns an HTML page with empty content for the request; no I/O is performed.
/// </summary>
/// <param name="request">Request the page is created for.</param>
/// <param name="spider">Not used.</param>
/// <returns>A new page whose <c>Content</c> is the empty string.</returns>
public Page Download(Request request, ISpider spider)
{
    return new Page(request, ContentType.Html) { Content = "" };
}
/// <summary>
/// No-op: the body is empty, so the request is neither stored nor forwarded.
/// </summary>
/// <param name="request">Not used.</param>
public void Push(Request request) { }
/// <summary>
/// Stores the serialized request in the item hash and then appends its
/// identity to the queue list.
/// </summary>
/// <param name="request">Request to enqueue; its <c>Identity</c> is the hash field and the queue entry.</param>
protected override void PushWhenNoDuplicate(Request request)
{
    // NOTE(review): presumably SafeExecutor retries the delegate up to 30 times — confirm.
    SafeExecutor.Execute(30, () =>
    {
        string identity = request.Identity;
        string payload = JsonConvert.SerializeObject(request);

        // Write the payload before publishing the identity, so a consumer that
        // pops the identity always finds its payload, and a retry after a
        // failed hash write does not enqueue the identity twice.
        Db.HashSet(ItemKey, identity, payload);
        Db.ListRightPush(QueueKey, identity);
    });
}
/// <summary>
/// Adds a start request and, when <c>Domain</c> is not yet set and the request
/// has a URL, sets <c>Domain</c> from that URL.
/// </summary>
/// <param name="startRequest">Request to add to the start list.</param>
/// <returns>This instance, to allow call chaining.</returns>
public Site AddStartRequest(Request startRequest)
{
    _startRequests.Add(startRequest);

    // Nothing to derive when the domain is already known or there is no URL.
    if (Domain != null || startRequest.Url == null)
    {
        return this;
    }

    Domain = UrlUtils.GetDomain(startRequest.Url);
    return this;
}
/// <summary>
/// Stores the serialized request in the spider's item hash and then appends
/// its identity to the spider's queue list.
/// </summary>
/// <param name="request">Request to enqueue; its <c>Identity</c> is the hash field and the queue entry.</param>
/// <param name="spider">Spider whose <c>Identity</c> selects the queue and item keys.</param>
protected override void PushWhenNoDuplicate(Request request, ISpider spider)
{
    // NOTE(review): presumably SafeExecutor retries the delegate up to 30 times — confirm.
    SafeExecutor.Execute(30, () =>
    {
        string identity = request.Identity;
        string payload = JsonConvert.SerializeObject(request);

        // Write the payload before publishing the identity, so a consumer that
        // pops the identity always finds its payload, and a retry after a
        // failed hash write does not enqueue the identity twice.
        Redis.HashSet(GetItemKey(spider.Identity), identity, payload);
        Redis.ListRightPush(GetQueueKey(spider.Identity), identity);
    });
}
/// <summary>
/// Pops the lowest-scored URL from the task's sorted-set queue and returns the
/// request stored for it, or a bare request for the URL when none is found.
/// </summary>
/// <param name="task">Task whose queue is polled.</param>
/// <returns>
/// <c>null</c> when the queue is empty; otherwise the deserialized stored
/// request, or a new request built from the URL alone.
/// </returns>
public override Request Poll(ITask task)
{
    using (var client = _pool.GetClient())
    {
        client.Password = _password;

        string url = client.PopItemWithLowestScoreFromSortedSet(GetQueueKey(task));
        if (url == null)
        {
            return null;
        }

        string hashId = ItemPrefix + task.Identify;
        string field = Encrypt.Md5Encrypt(url);

        // NOTE(review): the original comment here was mis-encoded; it appears
        // to say that Redis may fail to return the data, hence the retries.
        // Up to 10 reads, sleeping 150 ms after every empty read.
        string json = null;
        int attempt = 0;
        while (attempt < 10)
        {
            json = client.GetValueFromHash(hashId, field);
            if (!string.IsNullOrEmpty(json))
            {
                break;
            }

            Thread.Sleep(150);
            ++attempt;
        }

        if (string.IsNullOrEmpty(json))
        {
            // No stored payload: fall back to a request carrying only the URL.
            return new Request(url, null);
        }

        return JsonConvert.DeserializeObject<Request>(json);
    }
}
/// <summary>
/// Appends a request to this page's target-request collection.
/// </summary>
/// <param name="request">Request to add.</param>
public void AddTargetRequest(Request request) => _targetRequests.Add(request);
/// <summary>
/// Creates a page for the given request and content type; the request is also
/// stored on the page's <c>ResultItems</c>.
/// </summary>
/// <param name="request">Request this page belongs to.</param>
/// <param name="contentType">Content type of the page.</param>
public Page(Request request, ContentType contentType)
{
    this.Request = request;
    this.ResultItems.Request = request;
    this.ContentType = contentType;
}
/// <summary>
/// Records a failed request: stores its URL and increments the error counter.
/// </summary>
/// <param name="request">Request that failed.</param>
public void OnError(Request request)
{
    var failedUrl = request.Url;
    _errorUrls.Add(failedUrl);
    _errorCount.Inc();
}
/// <summary>
/// Delegates the download to the fallback downloader, if one is configured.
/// </summary>
/// <param name="request">Request to download.</param>
/// <param name="spider">Spider passed through to the fallback downloader.</param>
/// <returns>The fallback downloader's page, or <c>null</c> when no fallback is set.</returns>
private Page DownloadWhenMiss(Request request, ISpider spider)
{
    if (_downloaderWhenFileMiss == null)
    {
        return null;
    }

    return _downloaderWhenFileMiss.Download(request, spider);
}
/// <summary>
/// Adds a request to fetch, while holding the lock on this page instance.
/// </summary>
/// <param name="request">Request to add to <c>TargetRequests</c>.</param>
public void AddTargetRequest(Request request)
{
    // Uses the instance itself as the monitor.
    lock (this)
    {
        TargetRequests.Add(request);
    }
}
/// <summary>
/// Checks whether the request's identity is already in the Redis set and adds
/// it when it is not.
/// </summary>
/// <param name="request">Request whose <c>Identity</c> is tested.</param>
/// <returns><c>true</c> if the identity was already in the set; otherwise <c>false</c>.</returns>
public bool IsDuplicate(Request request)
{
    // NOTE(review): presumably SafeExecutor retries the delegate up to 30 times — confirm.
    return SafeExecutor.Execute(30, () =>
    {
        bool alreadySeen = Db.SetContains(SetKey, request.Identity);
        if (alreadySeen)
        {
            return true;
        }

        Db.SetAdd(SetKey, request.Identity);
        return false;
    });
}
/// <summary>
/// Adds URLs to fetch. Empty entries, "#" and "javascript:" links are skipped;
/// every other entry is canonicalized against this page's URL and queued one
/// depth level below the current request, with its extras.
/// </summary>
/// <param name="requests">Candidate URLs.</param>
/// <param name="priority">Priority assigned to every created request.</param>
public void AddTargetRequests(IList<string> requests, int priority)
{
    lock (this)
    {
        foreach (string candidate in requests)
        {
            bool usable = !string.IsNullOrEmpty(candidate)
                          && !candidate.Equals("#")
                          && !candidate.StartsWith("javascript:");
            if (!usable)
            {
                continue;
            }

            string canonicalUrl = UrlUtils.CanonicalizeUrl(candidate, Url);
            Request target = new Request(canonicalUrl, Request.NextDepth, Request.Extras)
            {
                Priority = priority
            };
            TargetRequests.Add(target);
        }
    }
}
/// <summary>
/// Delegates the download to the fallback downloader, if one is configured.
/// </summary>
/// <param name="request">Request to download.</param>
/// <param name="task">Task passed through to the fallback downloader.</param>
/// <returns>The fallback downloader's page, or <c>null</c> when no fallback is set.</returns>
private Page DownloadWhenMiss(Request request, ITask task)
{
    return _downloaderWhenFileMiss != null
        ? _downloaderWhenFileMiss.Download(request, task)
        : null;
}
/// <summary>
/// Creates a page for the given request; the request is also stored on the
/// page's result items.
/// </summary>
/// <param name="request">Request this page belongs to.</param>
public Page(Request request)
{
    this._request = request;
    this._resultItems.Request = request;
}
/// <summary>
/// Records a failed request: stores its URL and increments the error counter.
/// Outside NET_CORE builds, it also inserts the request into the MongoDB error
/// collection when the spider saves status and a connection string is set.
/// </summary>
/// <param name="request">Request that failed.</param>
public void OnError(Request request)
{
    string failedUrl = request.Url.ToString();
    _errorUrls.Add(failedUrl);
    _errorCount.Inc();

#if !NET_CORE
    if (!_spider.SaveStatus || string.IsNullOrEmpty(MongoConnectString))
    {
        return;
    }

    var client = new MongoClient(MongoConnectString);
    var database = client.GetDatabase(_mongoDatabaseName);
    var errorRequests = database.GetCollection<Request>(_errorRequestCollection);
    errorRequests.InsertOne(request);
#endif
}
/// <summary>
/// Adds URLs to fetch. Empty entries, "#" and "javascript:" links are skipped;
/// every other entry is canonicalized against this page's URL and queued with
/// the current request's extras (null when there is no current request).
/// </summary>
/// <param name="requests">Candidate URLs.</param>
/// <param name="priority">Priority assigned to every created request.</param>
public void AddTargetRequests(IList<string> requests, long priority)
{
    foreach (string link in requests)
    {
        if (!string.IsNullOrEmpty(link) && !link.Equals("#") && !link.StartsWith("javascript:"))
        {
            string absoluteUrl = UrlUtils.CanonicalizeUrl(link, _url.ToString());
            _targetRequests.Add(new Request(absoluteUrl, _request?.Extras) { Priority = priority });
        }
    }
}
/// <summary>
/// Increments the success counter; the request itself is not inspected.
/// </summary>
/// <param name="request">Request that succeeded (unused).</param>
public void OnSuccess(Request request) => _successCount.Inc();