Exemple #1
0
        /// <summary>
        /// Serves <paramref name="request"/> from the on-disk cache when a matching cache
        /// file exists; otherwise falls back to <c>DownloadWhenMiss</c>.
        /// </summary>
        /// <param name="request">Request whose URL is used to locate and validate the cache file.</param>
        /// <param name="task">Task whose <c>Identify</c> names the cache sub-directory under <c>BasePath</c>.</param>
        /// <returns>
        /// The page rebuilt from the cache file on a hit, or whatever <c>DownloadWhenMiss</c>
        /// returns on a miss or read error.
        /// </returns>
        public Page Download(Request request, ITask task)
        {
            string path = BasePath + "/" + task.Identify + "/";
            Page page = null;
            try
            {
                FileInfo file = GetFile(path + Encrypt.Md5Encrypt(request.Url));

                // Dispose the reader (and its underlying stream) so the cache file handle is released.
                using (StreamReader bufferedReader = new StreamReader(file.OpenRead()))
                {
                    // The first line of a cache file must name the URL it was stored for.
                    string line = bufferedReader.ReadLine();
                    if (("url:\t" + request.Url).Equals(line))
                    {
                        string html = GetHtml(bufferedReader);
                        Page cached = new Page(request);
                        cached.SetUrl(PlainText.Create(request.Url));
                        cached.SetHtml(Html.Create(html));
                        // Publish only a fully populated page.
                        page = cached;
                    }
                }
            }
            catch (IOException e)
            {
                // A missing cache file is an ordinary miss; anything else is worth a warning.
                if (e is FileNotFoundException)
                {
                    _logger.Info("File not exist for url " + request.Url);
                }
                else
                {
                    _logger.Warn("File read error for url " + request.Url, e);
                }
            }

            // Only consult the fallback downloader when the cache did not produce a page.
            if (page == null)
            {
                page = DownloadWhenMiss(request, task);
            }
            return page;
        }
 /// <summary>
 /// Prepares the shared fixtures: a <c>ResultItems</c> holding one "content" entry
 /// and its originating request, plus a fresh <c>DefaultSpider</c>.
 /// </summary>
 public void Before()
 {
     var items = new ResultItems();
     items.AddOrUpdateResultItem("content", "爬虫工具");
     items.Request = new Request("http://www.baidu.com", 1, null);

     _resultItems = items;
     _spider = new DefaultSpider();
 }
        /// <summary>
        /// Builds a page whose content is read from the local file addressed by the request URL.
        /// </summary>
        /// <param name="request">Request whose <c>Url.LocalPath</c> points at the file to read.</param>
        /// <param name="spider">Spider whose site supplies the page content type.</param>
        /// <returns>A page with status code 200 carrying the file's text.</returns>
        public Page Download(Request request, ISpider spider)
        {
            var result = new Page(request, spider.Site.ContentType)
            {
                Content = File.ReadAllText(request.Url.LocalPath),
                TargetUrl = request.Url.ToString(),
                Url = request.Url.ToString(),
                StatusCode = 200
            };

            return result;
        }
Exemple #4
0
 /// <summary>
 /// Creates a new <c>Request</c> for the same URL with its own copy of the extras dictionary.
 /// </summary>
 /// <returns>The copied request, typed as <see cref="object"/>.</returns>
 public object Clone()
 {
     IDictionary<string, dynamic> copiedExtras = new Dictionary<string, dynamic>();
     foreach (var pair in Extras)
     {
         // Entry-by-entry copy: the dictionary is new, the values themselves are shared.
         copiedExtras.Add(pair.Key, pair.Value);
     }

     return new Request(Url, copiedExtras);
 }
        /// <summary>
        /// Smoke test: pushes one request with an extra into the Redis scheduler and polls it back.
        /// Nothing is asserted; the test only checks that both calls complete.
        /// </summary>
        public void RedisTest()
        {
            var scheduler = new RedisScheduler("localhost", "");
            ITask task = new TestTask();

            var pushed = new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/", null);
            pushed.PutExtra("1", "2");

            scheduler.Push(pushed, task);
            scheduler.Poll(task);
        }
 /// <summary>
 /// Builds a sample request with one extra ("Test" = "Forever"), method "get" and priority 1.
 /// </summary>
 /// <returns>The populated sample request.</returns>
 public static Request GetRequest()
 {
     var extras = new Dictionary<string, dynamic> { { "Test", "Forever" } };

     var sample = new Request("http://www.taobao.com", 2, extras);
     sample.Method = "get";
     sample.Priority = 1;
     return sample;
 }
 /// <summary>
 /// Configures a single-threaded <c>OoSpider</c> for the <c>Ganji</c> model, queues one
 /// POST request and runs the spider.
 /// </summary>
 public static void RunTask()
 {
     var site = new Site { SleepTime = 10000 };
     OoSpider crawler = OoSpider.Create(site, new CollectorPageModelToDbPipeline(), typeof(Ganji));
     crawler.SetThreadNum(1);

     var startRequest = new Request("http://mobds.ganji.com/datashare/", null) { Method = "POST" };
     crawler.AddRequest(startRequest);

     crawler.Run();
 }
 /// <summary>
 /// Reports whether the request URL is already in the task's Redis set, adding it when it is not.
 /// </summary>
 /// <param name="request">Request whose URL is checked.</param>
 /// <param name="task">Task used to derive the set key.</param>
 /// <returns><c>true</c> if the URL was already present; otherwise <c>false</c>.</returns>
 public bool IsDuplicate(Request request, ITask task)
 {
     using (var client = _pool.GetClient())
     {
         client.Password = _password;

         if (client.SetContainsItem(GetSetKey(task), request.Url))
         {
             return true;
         }

         // First sighting: remember the URL so the next check reports a duplicate.
         client.AddItemToSet(GetSetKey(task), request.Url);
         return false;
     }
 }
        /// <summary>
        /// Verifies that a pushed request can be polled back exactly once from the Redis scheduler.
        /// </summary>
        public void RedisTest()
        {
            RedisScheduler redisScheduler = new RedisScheduler("localhost", "");
            try
            {
                // NOTE(review): spider is not referenced below; kept in case constructing it matters — confirm.
                ISpider spider = new DefaultSpider();
                redisScheduler.Clear();

                Request request = new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/", 1, null);
                request.PutExtra("1", "2");
                redisScheduler.Push(request);

                Request result = redisScheduler.Poll();
                Assert.AreEqual("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/", result.Url.ToString());

                // The queue held a single item, so a second poll must come back empty.
                Request result1 = redisScheduler.Poll();
                Assert.IsNull(result1);

                // Clean up while the scheduler is still usable (the original cleared after Dispose).
                redisScheduler.Clear();
            }
            finally
            {
                // Release the scheduler even when an assertion above fails.
                redisScheduler.Dispose();
            }
        }
        /// <summary>
        /// Pushes four requests and checks that the first two polls return the last two pushed
        /// (request 4, then request 3).
        /// </summary>
        public void Redis_QueueTest()
        {
            var scheduler = new RedisScheduler("localhost", "");

            ISpider spider = new DefaultSpider();
            Request[] queued =
            {
                new Request("http://www.ibm.com/1", 1, null),
                new Request("http://www.ibm.com/2", 1, null),
                new Request("http://www.ibm.com/3", 1, null),
                new Request("http://www.ibm.com/4", 1, null)
            };
            foreach (Request item in queued)
            {
                scheduler.Push(item);
            }

            Request firstPolled = scheduler.Poll();
            Assert.AreEqual("http://www.ibm.com/4", firstPolled.Url.ToString());
            Request secondPolled = scheduler.Poll();
            Assert.AreEqual("http://www.ibm.com/3", secondPolled.Url.ToString());
            scheduler.Dispose();
        }
Exemple #11
0
        /// <summary>
        /// Serves <paramref name="request"/> from the on-disk cache when a matching cache
        /// file exists; otherwise falls back to <c>DownloadWhenMiss</c>.
        /// </summary>
        /// <param name="request">Request whose URL is used to locate and validate the cache file.</param>
        /// <param name="spider">
        /// Spider whose <c>Identity</c> names the cache sub-directory and whose site supplies
        /// the content type; its logger receives miss and error messages.
        /// </param>
        /// <returns>
        /// The page rebuilt from the cache file on a hit, or whatever <c>DownloadWhenMiss</c>
        /// returns on a miss or read error.
        /// </returns>
        public Page Download(Request request, ISpider spider)
        {
            string path = BasePath + "/" + spider.Identity + "/";
            Page page = null;
            try
            {
                FileInfo file = PrepareFile(path + Encrypt.Md5Encrypt(request.Url.ToString()));

                // Dispose the reader (and its underlying stream) so the cache file handle is released.
                using (StreamReader bufferedReader = new StreamReader(file.OpenRead()))
                {
                    // The first line of a cache file must name the URL it was stored for.
                    string line = bufferedReader.ReadLine();
                    if (("url:\t" + request.Url).Equals(line))
                    {
                        string html = GetHtml(bufferedReader);
                        Page cached = new Page(request, spider.Site.ContentType);
                        cached.Url = request.Url.ToString();
                        cached.Content = html;
                        // Publish only a fully populated page.
                        page = cached;
                    }
                }
            }
            catch (IOException e)
            {
                // A plain type test works on every target framework, so no #if split is needed.
                if (e is FileNotFoundException)
                {
                    spider.Logger.Info("File not exist for url: " + request.Url);
                }
                else
                {
                    spider.Logger.Warn("File read error for url " + request.Url, e);
                }
            }

            // Only consult the fallback downloader when the cache did not produce a page.
            if (page == null)
            {
                page = DownloadWhenMiss(request, spider);
            }
            return page;
        }
 /// <summary>
 /// Reports whether the request identity is already in the spider's Redis set, adding it when it is not.
 /// The Redis calls run inside <c>SafeExecutor.Execute</c>.
 /// </summary>
 /// <param name="request">Request whose <c>Identity</c> is checked.</param>
 /// <param name="spider">Spider whose <c>Identity</c> selects the set key.</param>
 /// <returns><c>true</c> if the identity was already present; otherwise <c>false</c>.</returns>
 public bool IsDuplicate(Request request, ISpider spider)
 {
     return SafeExecutor.Execute(30, () =>
     {
         string setKey = GetSetKey(spider.Identity);
         if (Redis.SetContains(setKey, request.Identity))
         {
             return true;
         }

         // First sighting: remember the identity so the next check reports a duplicate.
         Redis.SetAdd(setKey, request.Identity);
         return false;
     });
 }
Exemple #13
0
 /// <summary>
 /// Returns an HTML page with empty content for the request, without performing any I/O.
 /// </summary>
 /// <param name="request">Request the page is created for.</param>
 /// <param name="spider">Not used by this implementation.</param>
 /// <returns>A page whose <c>Content</c> is the empty string.</returns>
 public Page Download(Request request, ISpider spider)
 {
     return new Page(request, ContentType.Html) { Content = "" };
 }
Exemple #14
0
 /// <summary>
 /// Accepts a request and discards it; this implementation stores nothing.
 /// </summary>
 /// <param name="request">Ignored.</param>
 public void Push(Request request)
 {
     // Intentionally empty.
 }
        //[MethodImpl(MethodImplOptions.Synchronized)]
        /// <summary>
        /// Appends the request identity to the queue list and stores the JSON-serialized request
        /// in the item hash under that identity. Both writes run inside <c>SafeExecutor.Execute</c>.
        /// </summary>
        /// <param name="request">Request to enqueue.</param>
        protected override void PushWhenNoDuplicate(Request request)
        {
            SafeExecutor.Execute(30, () =>
            {
                // Queue entry first, payload second, as in the original ordering.
                Db.ListRightPush(QueueKey, request.Identity);

                string hashField = request.Identity;
                string payload = JsonConvert.SerializeObject(request);
                Db.HashSet(ItemKey, hashField, payload);
            });
        }
Exemple #16
0
		/// <summary>
		/// Registers a start request and, if no domain is set yet, derives it from the request URL.
		/// </summary>
		/// <param name="startRequest">Request to add to the start list.</param>
		/// <returns>This site, to allow call chaining.</returns>
		public Site AddStartRequest(Request startRequest)
		{
			_startRequests.Add(startRequest);

			// Only the first request with a URL determines the domain.
			bool canInferDomain = Domain == null && startRequest.Url != null;
			if (canInferDomain)
			{
				Domain = UrlUtils.GetDomain(startRequest.Url);
			}

			return this;
		}
        //[MethodImpl(MethodImplOptions.Synchronized)]
        /// <summary>
        /// Appends the request identity to the spider's queue list and stores the JSON-serialized
        /// request in the spider's item hash under that identity. Both writes run inside
        /// <c>SafeExecutor.Execute</c>.
        /// </summary>
        /// <param name="request">Request to enqueue.</param>
        /// <param name="spider">Spider whose <c>Identity</c> selects the queue and item keys.</param>
        protected override void PushWhenNoDuplicate(Request request, ISpider spider)
        {
            SafeExecutor.Execute(30, () =>
            {
                // Queue entry first, payload second, as in the original ordering.
                Redis.ListRightPush(GetQueueKey(spider.Identity), request.Identity);

                string hashField = request.Identity;
                string payload = JsonConvert.SerializeObject(request);
                Redis.HashSet(GetItemKey(spider.Identity), hashField, payload);
            });
        }
Exemple #18
0
        /// <summary>
        /// Pops the lowest-scored URL from the task's sorted-set queue and returns the stored
        /// request for it, or a bare request for the URL when no stored JSON can be read.
        /// </summary>
        /// <param name="task">Task whose queue and item hash are consulted.</param>
        /// <returns>The next request, or <c>null</c> when the queue yields no URL.</returns>
        public override Request Poll(ITask task)
        {
            using (var client = _pool.GetClient())
            {
                client.Password = _password;

                string url = client.PopItemWithLowestScoreFromSortedSet(GetQueueKey(task));
                if (url == null)
                {
                    return null;
                }

                string hashId = ItemPrefix + task.Identify;
                string field = Encrypt.Md5Encrypt(url);

                // The hash read can come back empty, so try up to ten times,
                // pausing 150 ms after every empty read.
                string json = null;
                int attempt = 0;
                while (attempt < 10)
                {
                    json = client.GetValueFromHash(hashId, field);
                    if (!string.IsNullOrEmpty(json))
                    {
                        break;
                    }
                    Thread.Sleep(150);
                    ++attempt;
                }

                // No stored payload: fall back to a request that carries only the URL.
                if (string.IsNullOrEmpty(json))
                {
                    return new Request(url, null);
                }

                return JsonConvert.DeserializeObject<Request>(json);
            }
        }
Exemple #19
0
 /// <summary>
 /// Adds a request to this page's target-request collection.
 /// </summary>
 /// <param name="request">Request to add.</param>
 public void AddTargetRequest(Request request) => _targetRequests.Add(request);
Exemple #20
0
 /// <summary>
 /// Creates a page for the given request and content type.
 /// </summary>
 /// <param name="request">Request that produced this page; also recorded on <c>ResultItems</c>.</param>
 /// <param name="contentType">Content type assigned to the page.</param>
 public Page(Request request, ContentType contentType)
 {
     // The page and its result items both reference the originating request.
     Request = request;
     ResultItems.Request = request;

     ContentType = contentType;
 }
Exemple #21
0
 /// <summary>
 /// Records a failed request: stores its URL in the error list and increments the error counter.
 /// </summary>
 /// <param name="request">Request that failed.</param>
 public void OnError(Request request)
 {
     var failedUrl = request.Url;
     _errorUrls.Add(failedUrl);

     _errorCount.Inc();
 }
Exemple #22
0
 /// <summary>
 /// Delegates to the fallback downloader, if one is configured.
 /// </summary>
 /// <param name="request">Request to download.</param>
 /// <param name="spider">Spider passed through to the fallback downloader.</param>
 /// <returns>The fallback downloader's page, or <c>null</c> when no fallback is configured.</returns>
 private Page DownloadWhenMiss(Request request, ISpider spider)
 {
     if (_downloaderWhenFileMiss == null)
     {
         return null;
     }

     return _downloaderWhenFileMiss.Download(request, spider);
 }
Exemple #23
0
 /// <summary>
 /// Adds a single request to <c>TargetRequests</c> while holding this instance's lock.
 /// </summary>
 /// <param name="request">Request to add.</param>
 public void AddTargetRequest(Request request)
 {
     // Same monitor as the other target-request mutators on this instance.
     lock (this)
     {
         var targets = TargetRequests;
         targets.Add(request);
     }
 }
 /// <summary>
 /// Reports whether the request identity is already in the Redis set, adding it when it is not.
 /// The Redis calls run inside <c>SafeExecutor.Execute</c>.
 /// </summary>
 /// <param name="request">Request whose <c>Identity</c> is checked.</param>
 /// <returns><c>true</c> if the identity was already present; otherwise <c>false</c>.</returns>
 public bool IsDuplicate(Request request)
 {
     return SafeExecutor.Execute(30, () =>
     {
         if (Db.SetContains(SetKey, request.Identity))
         {
             return true;
         }

         // First sighting: remember the identity so the next check reports a duplicate.
         Db.SetAdd(SetKey, request.Identity);
         return false;
     });
 }
Exemple #25
0
 /// <summary>
 /// Queues the given URLs as target requests with the supplied priority.
 /// Empty entries, "#" and "javascript:" links are skipped; the rest are canonicalized
 /// against this page's URL.
 /// </summary>
 /// <param name="requests">Candidate URLs.</param>
 /// <param name="priority">Priority assigned to every created request.</param>
 public void AddTargetRequests(IList<string> requests, int priority)
 {
     lock (this)
     {
         foreach (string candidate in requests)
         {
             bool followable = !string.IsNullOrEmpty(candidate)
                               && !candidate.Equals("#")
                               && !candidate.StartsWith("javascript:");
             if (followable)
             {
                 string canonical = UrlUtils.CanonicalizeUrl(candidate, Url);
                 // New requests take the depth and extras exposed by the current request.
                 var next = new Request(canonical, Request.NextDepth, Request.Extras)
                 {
                     Priority = priority
                 };
                 TargetRequests.Add(next);
             }
         }
     }
 }
Exemple #26
0
 /// <summary>
 /// Delegates to the fallback downloader, if one is configured.
 /// </summary>
 /// <param name="request">Request to download.</param>
 /// <param name="task">Task passed through to the fallback downloader.</param>
 /// <returns>The fallback downloader's page, or <c>null</c> when no fallback is configured.</returns>
 private Page DownloadWhenMiss(Request request, ITask task)
 {
     if (_downloaderWhenFileMiss == null)
     {
         return null;
     }

     return _downloaderWhenFileMiss.Download(request, task);
 }
Exemple #27
0
 /// <summary>
 /// Creates a page for the given request and records the request on the page's result items.
 /// </summary>
 /// <param name="request">Request that produced this page.</param>
 public Page(Request request)
 {
     // Keep the page and its result items pointing at the same request.
     _request = request;
     _resultItems.Request = request;
 }
            /// <summary>
            /// Records a failed request: stores its URL, increments the error counter and, on
            /// non-NET_CORE builds with status saving enabled and a connection string set,
            /// inserts the request into the MongoDB error collection.
            /// </summary>
            /// <param name="request">Request that failed.</param>
            public void OnError(Request request)
            {
                _errorUrls.Add(request.Url.ToString());
                _errorCount.Inc();

                #if !NET_CORE
                // Persistence is optional: skip it unless status saving is on and Mongo is configured.
                if (!_spider.SaveStatus || string.IsNullOrEmpty(MongoConnectString))
                {
                    return;
                }

                MongoClient client = new MongoClient(MongoConnectString);
                var errorRequests = client.GetDatabase(_mongoDatabaseName).GetCollection<Request>(_errorRequestCollection);
                errorRequests.InsertOne(request);
                #endif
            }
Exemple #29
0
 /// <summary>
 /// Queues the given URLs as target requests with the supplied priority.
 /// Empty entries, "#" and "javascript:" links are skipped; the rest are canonicalized
 /// against this page's URL.
 /// </summary>
 /// <param name="requests">Candidate URLs.</param>
 /// <param name="priority">Priority assigned to every created request.</param>
 public void AddTargetRequests(IList<string> requests, long priority)
 {
     foreach (string candidate in requests)
     {
         bool followable = !string.IsNullOrEmpty(candidate)
                           && !candidate.Equals("#")
                           && !candidate.StartsWith("javascript:");
         if (followable)
         {
             string canonical = UrlUtils.CanonicalizeUrl(candidate, _url.ToString());
             // Extras are inherited from the current request when there is one.
             var next = new Request(canonical, _request?.Extras) { Priority = priority };
             _targetRequests.Add(next);
         }
     }
 }
 /// <summary>
 /// Records a successful request by incrementing the success counter.
 /// </summary>
 /// <param name="request">Request that succeeded; not otherwise used here.</param>
 public void OnSuccess(Request request) => _successCount.Inc();