示例#1
0
        /// <summary>
        ///     Marks the posted crawl step as modified and saves the change.
        /// </summary>
        /// <param name="id">Identifier sent with the form; must equal <c>crawlStep.Id</c>.</param>
        /// <param name="crawlStep">Crawl step bound from the form data.</param>
        /// <returns>
        ///     <c>BadRequest</c> when the two ids differ, <c>NotFound</c> when saving raises a concurrency
        ///     conflict and <c>CrawlStepExists</c> reports no such step, otherwise <c>NoContent</c>.
        /// </returns>
        public async Task <IActionResult> PutCrawlStep([FromForm] int id, [FromForm] CrawlStep crawlStep)
        {
            if (id != crawlStep.Id)
            {
                return BadRequest();
            }

            var tracked = _context.Entry(crawlStep);
            tracked.State = EntityState.Modified;

            try
            {
                await _context.SaveChangesAsync();
            }
            catch (DbUpdateConcurrencyException)
            {
                // Only a missing step is translated into a 404; any other conflict is rethrown untouched.
                if (CrawlStepExists(id))
                {
                    throw;
                }

                return NotFound();
            }

            return NoContent();
        }
示例#2
0
        /// <summary>
        ///     Builds the request state for <paramref name="crawlStep"/> and hands it to the
        ///     state-based <c>DownloadAsync</c> overload.
        /// </summary>
        /// <param name="crawlStep">Step to download; must not be null.</param>
        /// <param name="referrer">Step stored as the request's referrer.</param>
        /// <param name="method">Download method stored on the request state.</param>
        /// <param name="completed">Completion callback; must not be null.</param>
        /// <param name="progress">Progress callback stored on the request state.</param>
        /// <param name="state">Caller-supplied state stored on the request state.</param>
        public void DownloadAsync <T>(CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method,
                                      Action <RequestState <T> > completed, Action <DownloadProgressEventArgs> progress,
                                      T state)
        {
            AspectF.Define
                .NotNull(crawlStep, "crawlStep")
                .NotNull(completed, "completed");

            // Fall back to a generic agent string when none has been configured.
            if (UserAgent.IsNullOrEmpty())
            {
                UserAgent = "Mozilla/5.0";
            }

            var pending = new RequestState <T>();
            pending.DownloadTimer    = Stopwatch.StartNew();
            pending.Complete         = completed;
            pending.CrawlStep        = crawlStep;
            pending.Referrer         = referrer;
            pending.State            = state;
            pending.DownloadProgress = progress;
            // One attempt plus the configured retry count (no configured count means a single attempt).
            pending.Retry            = RetryCount.GetValueOrDefault() + 1;
            pending.Method           = method;

            DownloadAsync(pending, null);
        }
示例#3
0
        /// <summary>
        ///     Builds the cache file path for a crawl step / download method pair.
        /// </summary>
        /// <param name="step">Step whose <c>Uri</c> forms the first part of the file name.</param>
        /// <param name="method">Download method forming the second part of the file name.</param>
        /// <returns>The path under <c>m_CacheFolder</c>, passed through <c>Max(248)</c>.</returns>
        private string GetCacheFileName(CrawlStep step, DownloadMethod method)
        {
            string rawKey   = string.Format("{0}_{1}", step.Uri, method);
            string safeName = FileSystemHelpers.ToValidFileName(rawKey);
            string fullPath = Path.Combine(m_CacheFolder, safeName);

            // Max(248) is applied to the combined path, folder included
            // (presumably a length cap - confirm against the Max extension).
            return fullPath.Max(248);
        }
示例#4
0
        /// <summary>
        ///     Adds the posted crawl step to the context, saves it and answers with a
        ///     <c>CreatedAtAction</c> result that points at <c>GetCrawlStep</c>.
        /// </summary>
        /// <param name="crawlStep">Crawl step bound from the form data.</param>
        /// <returns>The created-at result carrying the saved crawl step.</returns>
        public async Task <ActionResult <CrawlStep> > PostCrawlStep([FromForm] CrawlStep crawlStep)
        {
            _context.CrawlSteps.Add(crawlStep);
            await _context.SaveChangesAsync();

            // The id is read only after the save has completed.
            var routeValues = new { id = crawlStep.Id };
            return CreatedAtAction("GetCrawlStep", routeValues, crawlStep);
        }
        /// <summary>
        ///     Builds the request state for <paramref name="crawlStep"/> and awaits the
        ///     state-based <c>DownloadAsync</c> overload.
        /// </summary>
        /// <param name="crawlStep">Step to download; must not be null.</param>
        /// <param name="referrer">Step stored as the request's referrer.</param>
        /// <param name="method">Download method, converted through <c>ConvertToHttpMethod</c>.</param>
        /// <param name="completed">Asynchronous completion callback; must not be null.</param>
        /// <param name="progress">Progress callback stored on the request state.</param>
        /// <param name="state">Caller-supplied state stored on the request state.</param>
        /// <returns>The request state returned by the state-based overload.</returns>
        public async Task <RequestState <T> > DownloadAsync <T>(CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method,
                                                                Func <RequestState <T>, Task> completed, Action <DownloadProgressEventArgs> progress,
                                                                T state)
        {
            AspectF.Define
                .NotNull(crawlStep, "crawlStep")
                .NotNull(completed, "completed");

            // Fall back to a generic agent string when none has been configured.
            if (UserAgent.IsNullOrEmpty())
            {
                UserAgent = "Mozilla/5.0";
            }

            RequestState <T> pending = new RequestState <T>();
            pending.DownloadTimer    = Stopwatch.StartNew();
            pending.Complete         = completed;
            pending.CrawlStep        = crawlStep;
            pending.Referrer         = referrer;
            pending.State            = state;
            pending.DownloadProgress = progress;
            // One attempt plus the configured retry count (no configured count means a single attempt).
            pending.Retry            = RetryCount.GetValueOrDefault() + 1;
            pending.Method           = ConvertToHttpMethod(method);

            var finished = await DownloadAsync(pending, null);
            return finished;
        }
		/// <summary>
		/// 	Decides whether the crawler should follow an url
		/// </summary>
		/// <param name = "uri">Url to check</param>
		/// <param name = "referrer">Step handed to the include and exclude filters together with the url</param>
		/// <returns>True if the crawler should follow the url, else false</returns>
		public virtual bool IsAllowedUrl(Uri uri, CrawlStep referrer)
		{
			// A configured maximum of 10 or less is ignored.
			if (m_Crawler.MaximumUrlSize.HasValue && m_Crawler.MaximumUrlSize.Value > 10)
			{
				if (uri.ToString().Length > m_Crawler.MaximumUrlSize.Value)
				{
					return false;
				}
			}

			// An include match wins over the exclude filters, the external-url check and the robot rules.
			bool included = !m_Crawler.IncludeFilter.IsNull() &&
				m_Crawler.IncludeFilter.Any(filter => filter.Match(uri, referrer));
			if (included)
			{
				return true;
			}

			bool excluded = !m_Crawler.ExcludeFilter.IsNull() &&
				m_Crawler.ExcludeFilter.Any(filter => filter.Match(uri, referrer));
			if (excluded || IsExternalUrl(uri))
			{
				return false;
			}

			// The robot is consulted only when the crawler is set to adhere to robot rules.
			if (m_Crawler.AdhereToRobotRules)
			{
				return m_Robot.IsAllowed(m_Crawler.UserAgent, uri);
			}

			return true;
		}
示例#7
0
        /// <summary>
        ///     Builds the async request state for <paramref name="crawlStep"/> and awaits the
        ///     state-based <c>DownloadAsync</c> overload.
        /// </summary>
        /// <param name="crawlStep">Step to download; must not be null.</param>
        /// <param name="referrer">Step stored as the request's referrer.</param>
        /// <param name="method">Download method stored on the request state.</param>
        /// <param name="completed">Not referenced anywhere in this method.</param>
        /// <param name="progress">Progress callback stored on the request state.</param>
        /// <param name="state">Caller-supplied state stored on the request state.</param>
        /// <returns>The request state returned by the state-based overload.</returns>
        public async Task <AsyncRequestState <T> > DownloadAsync <T>(
            CrawlStep crawlStep,
            CrawlStep referrer,
            DownloadMethod method,
            Action <AsyncRequestState <T> > completed,
            Action <DownloadProgressEventArgs> progress, T state)
        {
            AspectF.Define
                .NotNull(crawlStep, "crawlStep");

            // Fall back to a generic agent string when none has been configured.
            if (UserAgent.IsNullOrEmpty())
            {
                UserAgent = "Mozilla/5.0";
            }

            var pending = new AsyncRequestState <T>();
            pending.CrawlStep        = crawlStep;
            pending.Referrer         = referrer;
            pending.State            = state;
            pending.DownloadProgress = progress;
            // One attempt plus the configured retry count (no configured count means a single attempt).
            pending.Retry            = RetryCount.GetValueOrDefault() + 1;
            pending.Method           = method;

            var finished = await DownloadAsync(pending);
            return finished;
        }
        /// <summary>
        ///     Decides whether the crawler should follow an url
        /// </summary>
        /// <param name = "uri">Url to check</param>
        /// <param name = "referrer">Step handed to the include and exclude filters together with the url</param>
        /// <returns>True if the crawler should follow the url, else false</returns>
        public virtual async Task <bool> IsAllowedUrlAsync(Uri uri, CrawlStep referrer)
        {
            // A configured maximum of 10 or less is ignored.
            if (m_Crawler.MaximumUrlSize.HasValue && m_Crawler.MaximumUrlSize.Value > 10)
            {
                if (uri.ToString().Length > m_Crawler.MaximumUrlSize.Value)
                {
                    return false;
                }
            }

            // An include match wins over the exclude filters, the external-url check and the robot rules.
            bool included = !m_Crawler.IncludeFilter.IsNull() &&
                            m_Crawler.IncludeFilter.Any(filter => filter.Match(uri, referrer));
            if (included)
            {
                return true;
            }

            bool excluded = !m_Crawler.ExcludeFilter.IsNull() &&
                            m_Crawler.ExcludeFilter.Any(filter => filter.Match(uri, referrer));
            if (excluded || IsExternalUrl(uri))
            {
                return false;
            }

            // The robot is awaited only when the crawler is set to adhere to robot rules.
            if (m_Crawler.AdhereToRobotRules)
            {
                return await m_Robot.IsAllowed(m_Crawler.UserAgent, uri);
            }

            return true;
        }
示例#9
0
 /// <summary>
 /// Reports whether the url falls outside the cnblogs pattern.
 /// </summary>
 /// <param name="uri">Url whose <c>AbsoluteUri</c> is tested.</param>
 /// <param name="referrer">Not used by this filter.</param>
 /// <returns>True when the url does NOT match the pattern, false when it does.</returns>
 public bool Match(Uri uri, CrawlStep referrer)
 {
     bool matchesPattern = Regex.IsMatch(uri.AbsoluteUri, @"http://.*\.cnblogs\.com/.*");
     return !matchesPattern;
 }
示例#10
0
        /// <summary>
        ///     Rejects external urls outright and defers every other decision to the base rules.
        /// </summary>
        /// <param name="uri">Url to check.</param>
        /// <param name="referrer">Step passed through to the base implementation.</param>
        /// <returns>False for external urls, otherwise whatever the base implementation returns.</returns>
        public override bool IsAllowedUrl(Uri uri, CrawlStep referrer)
        {
            // Short-circuit: the base rules are not consulted for external urls.
            return !base.IsExternalUrl(uri) && base.IsAllowedUrl(uri, referrer);
        }
示例#11
0
		/// <summary>
		/// Counts a failed download, stops the crawl once the configured maximum number of
		/// download errors has been exceeded, and raises the DownloadException event
		/// </summary>
		/// <param name="exception">Exception raised by the download</param>
		/// <param name="crawlStep">Step that failed to download</param>
		/// <param name="referrer">Step handed to the event args as the referrer</param>
		private void OnDownloadException(Exception exception, CrawlStep crawlStep, CrawlStep referrer)
		{
			long downloadErrors = Interlocked.Increment(ref m_DownloadErrors);

			// The comparison used to be inverted (limit > errors), which cancelled the crawl on the
			// first error while the count was still below the limit and never once it was exceeded.
			if (MaximumHttpDownloadErrors.HasValue && downloadErrors > MaximumHttpDownloadErrors.Value)
			{
				m_Logger.Error("Number of maximum failed downloads exceeded({0}), cancelling crawl", MaximumHttpDownloadErrors.Value);
				StopCrawl();
			}

			m_Logger.Error("Download exception while downloading {0}, error was {1}", crawlStep.Uri, exception);
			DownloadException.ExecuteEvent(this, () => new DownloadExceptionEventArgs(crawlStep, referrer, exception));
		}
示例#12
0
        /// <summary>
        ///     Downloads <paramref name="crawlStep"/> through the callback-based <c>DownloadAsync</c> overload
        ///     and returns the resulting property bag once the completion callback has run.
        /// </summary>
        /// <param name="crawlStep">Step to download.</param>
        /// <param name="referrer">Step passed on as the referrer.</param>
        /// <param name="method">Download method passed on to <c>DownloadAsync</c>.</param>
        /// <returns>
        ///     The property bag reported by the callback; when it exposes a response, the response has been
        ///     copied into memory so <c>GetResponse</c> hands out a fresh <see cref="MemoryStream"/> on each call.
        /// </returns>
        /// <exception cref="Exception">The callback reported an exception; it is wrapped as the inner exception.</exception>
        private async Task <PropertyBag> DownloadInternalSync(CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method)
        {
            PropertyBag result = null;
            Exception   ex     = null;

            // Signalled by the completion callback. Awaiting it replaces the former
            // ManualResetEvent.WaitOne(), which blocked a thread inside an async method.
            var callbackFinished = new TaskCompletionSource <bool>();

            await DownloadAsync <object>(crawlStep, referrer, method,
                                         (RequestState <object> state) =>
            {
                if (state.Exception.IsNull())
                {
                    result = state.PropertyBag;
                    if (!result.GetResponse.IsNull())
                    {
                        using (var response = result.GetResponse())
                        {
                            byte[] data;
                            if (response is MemoryStream)
                            {
                                data = ((MemoryStream)response).ToArray();
                            }
                            else
                            {
                                using (var copy = response.CopyToMemory())
                                {
                                    data = copy.ToArray();
                                }
                            }

                            // The original stream is disposed here, so later readers get a stream over the buffered bytes.
                            result.GetResponse = () => new MemoryStream(data);
                        }
                    }
                }
                else
                {
                    ex = state.Exception;
                }

                // TrySetResult (rather than SetResult) tolerates the callback being invoked more than once.
                callbackFinished.TrySetResult(true);
                return(Task.FromResult(0));
            }, null, null);

            await callbackFinished.Task;

            if (!ex.IsNull())
            {
                throw new Exception("Error write downloading {0}".FormatWith(crawlStep.Uri), ex);
            }

            return(result);
        }
示例#13
0
		/// <summary>
		/// Raises the AfterDownload event for a downloaded step
		/// </summary>
		/// <param name="crawlStep">Step that was downloaded</param>
		/// <param name="response">Property bag handed to the event args</param>
		/// <returns>
		/// True to continue processing this url, else false. Without subscribers this is
		/// crawlStep.IsAllowed; with subscribers it is the negation of the event args' Cancel flag.
		/// </returns>
		private bool OnAfterDownload(CrawlStep crawlStep, PropertyBag response)
		{
			// Snapshot the delegate before testing and invoking it.
			EventHandler<AfterDownloadEventArgs> handler = AfterDownload;
			if (!handler.IsNull())
			{
				// The args are constructed with !IsAllowed as their first argument.
				var args = new AfterDownloadEventArgs(!crawlStep.IsAllowed, response);
				handler(this, args);
				return !args.Cancel;
			}

			return crawlStep.IsAllowed;
		}
示例#14
0
		/// <summary>
		/// Raises the BeforeDownload event for a step that is about to be downloaded
		/// </summary>
		/// <param name="crawlStep">Step about to be downloaded</param>
		/// <returns>
		/// True to continue crawling this url, else false. Without subscribers this is
		/// crawlStep.IsAllowed; with subscribers it is the negation of the event args' Cancel flag.
		/// </returns>
		private bool OnBeforeDownload(CrawlStep crawlStep)
		{
			// Snapshot the delegate before testing and invoking it.
			EventHandler<BeforeDownloadEventArgs> handler = BeforeDownload;
			if (!handler.IsNull())
			{
				// The args are constructed with !IsAllowed as their first argument.
				var args = new BeforeDownloadEventArgs(!crawlStep.IsAllowed, crawlStep);
				handler(this, args);
				return !args.Cancel;
			}

			return crawlStep.IsAllowed;
		}
示例#15
0
 /// <summary>
 /// Allows a url only when it is not external and the base rules allow it.
 /// </summary>
 /// <param name="uri">Url to check.</param>
 /// <param name="referrer">Step passed through to the base implementation.</param>
 /// <returns>True when both checks pass, else false.</returns>
 public override bool IsAllowedUrl(Uri uri, CrawlStep referrer)
 {
     // The external check runs first; the base rules are skipped for external urls.
     bool isInternal = !base.IsExternalUrl(uri);
     return isInternal && base.IsAllowedUrl(uri, referrer);
 }
示例#16
0
        /// <summary>
        ///     Evaluates the url with whichever match delegate has been set.
        /// </summary>
        /// <param name="uri">Url to evaluate.</param>
        /// <param name="referrer">Step passed to the two-argument delegate only.</param>
        /// <returns>
        ///     The result of <c>m_Match</c> when set, otherwise the result of <c>m_Match2</c> when set,
        ///     otherwise false.
        /// </returns>
        public bool Match(Uri uri, CrawlStep referrer)
        {
            if (m_Match.IsNull())
            {
                // Fall back to the uri-only delegate, or to false when neither is set.
                return !m_Match2.IsNull() && m_Match2(uri);
            }

            return m_Match(uri, referrer);
        }
示例#17
0
		/// <summary>
		/// Evaluates the url with whichever match delegate has been set
		/// </summary>
		/// <param name="uri">Url to evaluate</param>
		/// <param name="referrer">Step passed to the two-argument delegate only</param>
		/// <returns>
		/// The result of m_Match when set, otherwise the result of m_Match2 when set, otherwise false
		/// </returns>
		public bool Match(Uri uri, CrawlStep referrer)
		{
			// The two-argument delegate takes precedence over the uri-only one.
			bool hasFullMatcher = !m_Match.IsNull();
			if (hasFullMatcher)
			{
				return m_Match(uri, referrer);
			}

			bool hasUriMatcher = !m_Match2.IsNull();
			return hasUriMatcher ? m_Match2(uri) : false;
		}
示例#18
0
 /// <summary>
 /// Performs no download: builds a request state from the arguments and invokes
 /// <paramref name="completed"/> with it straight away on the calling thread.
 /// </summary>
 /// <param name="crawlStep">Step stored on the request state.</param>
 /// <param name="referrer">Referrer stored on the request state.</param>
 /// <param name="method">Download method stored on the request state.</param>
 /// <param name="completed">Callback that is stored on the state and invoked with it.</param>
 /// <param name="progress">Progress callback stored on the request state.</param>
 /// <param name="state">Caller-supplied state stored on the request state.</param>
 public void DownloadAsync <T>(CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method, Action <RequestState <T> > completed,
                               Action <DownloadProgressEventArgs> progress, T state)
 {
     var requestState = new RequestState <T>();
     requestState.DownloadTimer    = Stopwatch.StartNew();
     requestState.Complete         = completed;
     requestState.CrawlStep        = crawlStep;
     requestState.Referrer         = referrer;
     requestState.State            = state;
     requestState.DownloadProgress = progress;
     // One attempt plus the configured retry count (no configured count means a single attempt).
     requestState.Retry            = RetryCount.GetValueOrDefault() + 1;
     requestState.Method           = method;

     completed(requestState);
 }
示例#19
0
        /// <summary>
        ///     Performs no download: builds a request state from the arguments, runs
        ///     <paramref name="completed"/> on a background task and returns the state once the
        ///     callback's own task has finished.
        /// </summary>
        /// <param name="crawlStep">Step stored on the request state.</param>
        /// <param name="referrer">Referrer stored on the request state.</param>
        /// <param name="method">Download method, converted through <c>ConvertToHttpMethod</c>.</param>
        /// <param name="completed">Asynchronous callback that is stored on the state and invoked with it.</param>
        /// <param name="progress">Progress callback stored on the request state.</param>
        /// <param name="state">Caller-supplied state stored on the request state.</param>
        /// <returns>A task yielding the request state; it faults if the callback faults.</returns>
        public Task <RequestState <T> > DownloadAsync <T>(CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method, Func <RequestState <T>, Task> completed, Action <DownloadProgressEventArgs> progress, T state)
        {
            var result = new RequestState <T>
            {
                StartTime        = DateTime.UtcNow,
                Complete         = completed,
                CrawlStep        = crawlStep,
                Referrer         = referrer,
                State            = state,
                DownloadProgress = progress,
                Retry            = this.RetryCount.HasValue ? this.RetryCount.Value + 1 : 1,
                Method           = ConvertToHttpMethod(method),
            };

            // The task returned by the callback used to be discarded, so the returned task could
            // complete before the callback did and callback failures went unobserved. Awaiting it
            // ties the two together.
            return(Task.Run(async () =>
            {
                await completed(result).ConfigureAwait(false);
                return result;
            }));
        }
示例#20
0
        /// <summary>
        ///     Performs no network access: returns an already-completed task holding a fixed
        ///     "200 OK" property bag whose body comes from <c>Resources.ncrawler_codeplex_com</c>.
        /// </summary>
        /// <param name="crawlStep">Step stored on the bag; its <c>Uri</c> becomes the response uri.</param>
        /// <param name="referrer">Not used.</param>
        /// <param name="method">Not used; the bag always reports "GET".</param>
        /// <returns>A completed task holding the fixed property bag.</returns>
        public Task <PropertyBag> DownloadAsync(CrawlStep crawlStep, CrawlStep referrer = null, DownloadMethod method = DownloadMethod.GET)
        {
            var bag = new PropertyBag();
            bag.Step                    = crawlStep;
            bag.CharacterSet            = string.Empty;
            bag.ContentEncoding         = string.Empty;
            bag.ContentType             = "text/html";
            bag.Headers                 = null;
            bag.IsMutuallyAuthenticated = false;
            bag.IsFromCache             = false;
            bag.LastModified            = DateTime.UtcNow;
            bag.Method                  = "GET";
            bag.ProtocolVersion         = new Version(3, 0);
            bag.ResponseUri             = crawlStep.Uri;
            bag.Server                  = "N/A";
            bag.StatusCode              = HttpStatusCode.OK;
            bag.StatusDescription       = "OK";
            // Each call to GetResponse re-encodes the resource into a new stream.
            bag.GetResponse             = () => new MemoryStream(Encoding.UTF8.GetBytes(Resources.ncrawler_codeplex_com));
            bag.DownloadTime            = TimeSpan.FromSeconds(1);

            return Task.FromResult(bag);
        }
示例#21
0
        /// <summary>
        /// Performs no network access: returns a fixed "200 OK" property bag whose body is the
        /// UTF-8 encoding of <c>Resources.ncrawler_codeplex_com</c>.
        /// </summary>
        /// <param name="crawlStep">Step stored on the bag; its <c>Uri</c> becomes the response uri.</param>
        /// <param name="method">Not used; the bag always reports "GET".</param>
        /// <returns>The fixed property bag.</returns>
        public PropertyBag Download(CrawlStep crawlStep, DownloadMethod method)
        {
            var bag = new PropertyBag();
            bag.Step = crawlStep;
            bag.CharacterSet = string.Empty;
            bag.ContentEncoding = string.Empty;
            bag.ContentType = "text/html";
            bag.Headers = null;
            bag.IsMutuallyAuthenticated = false;
            bag.IsFromCache = false;
            bag.LastModified = DateTime.UtcNow;
            bag.Method = "GET";
            bag.ProtocolVersion = new Version(3, 0);
            bag.ResponseUri = crawlStep.Uri;
            bag.Server = "N/A";
            bag.StatusCode = HttpStatusCode.OK;
            bag.StatusDescription = "OK";
            bag.Response = Encoding.UTF8.GetBytes(Resources.ncrawler_codeplex_com);
            bag.DownloadTime = TimeSpan.FromSeconds(1);

            return bag;
        }
示例#22
0
        /// <summary>
        ///     Performs no network access: returns a fixed "200 OK" property bag whose body is the
        ///     UTF-8 encoding of <c>Resources.ncrawler_codeplex_com</c>.
        /// </summary>
        /// <param name="crawlStep">Step stored on the bag; its <c>Uri</c> becomes the response uri.</param>
        /// <param name="method">Not used; the bag always reports "GET".</param>
        /// <returns>The fixed property bag.</returns>
        public PropertyBag Download(CrawlStep crawlStep, DownloadMethod method)
        {
            var canned = new PropertyBag();

            // Request identity
            canned.Step                    = crawlStep;
            canned.CharacterSet            = string.Empty;
            canned.ContentEncoding         = string.Empty;
            canned.ContentType             = "text/html";
            canned.Headers                 = null;
            canned.IsMutuallyAuthenticated = false;
            canned.IsFromCache             = false;
            canned.LastModified            = DateTime.UtcNow;
            canned.Method                  = "GET";
            canned.ProtocolVersion         = new Version(3, 0);
            canned.ResponseUri             = crawlStep.Uri;

            // Fixed server answer
            canned.Server                  = "N/A";
            canned.StatusCode              = HttpStatusCode.OK;
            canned.StatusDescription       = "OK";
            canned.Response                = Encoding.UTF8.GetBytes(Resources.ncrawler_codeplex_com);
            canned.DownloadTime            = TimeSpan.FromSeconds(1);

            return canned;
        }
示例#23
0
        /// <summary>
        /// Downloads the url of a crawl step synchronously and returns the response as a property bag.
        /// When caching is enabled, an existing cache entry is returned instead of issuing a request,
        /// and a fresh download is written to the cache before it is returned.
        /// </summary>
        /// <param name="crawlStep">Step whose Uri is requested; must not be null.</param>
        /// <param name="method">Download method; its string form is used as the HTTP method.</param>
        /// <returns>Property bag built from the response headers and the copied response body.</returns>
        /// <exception cref="WebException">Rethrown when the failed request carries no HTTP response.</exception>
        public PropertyBag Download(CrawlStep crawlStep, DownloadMethod method)
        {
            AspectF.Define.
                NotNull(crawlStep, "crawlStep");

            // Fall back to a generic agent string when none has been configured.
            if (UserAgent.IsNullOrEmpty())
            {
                UserAgent = "Mozilla/5.0";
            }

            // Cache hit: no request is made at all.
            if (m_CacheEnabled)
            {
                if (CacheEntryExists(crawlStep, method))
                {
                    return GetCacheEntry(crawlStep, method);
                }
            }

            HttpWebRequest req = (HttpWebRequest) WebRequest.Create(crawlStep.Uri);
            req.Method = method.ToString();
            req.AllowAutoRedirect = true;
            req.UserAgent = UserAgent;
            req.Accept = "*/*";
            req.KeepAlive = true;
            if (ConnectionTimeout.HasValue)
            {
                req.Timeout = Convert.ToInt32(ConnectionTimeout.Value.TotalMilliseconds);
            }

            if (ReadTimeout.HasValue)
            {
                req.ReadWriteTimeout = Convert.ToInt32(ReadTimeout.Value.TotalMilliseconds);
            }

            if (UseCookies)
            {
                req.CookieContainer = CookieContainer;
            }

            Stopwatch downloadTimer = Stopwatch.StartNew();
            HttpWebResponse resp;
            try
            {
                resp = (HttpWebResponse) req.GetResponse();
            }
            catch (WebException we)
            {
                // A WebException that carries an HTTP response is processed like any other
                // response; only a failure without a response is rethrown.
                resp = we.Response as HttpWebResponse;
                if (resp.IsNull())
                {
                    throw;
                }
            }

            using (resp)
            using (Stream responseStream = resp.GetResponseStream())
            {
                // The timer is stopped before the body is copied below, so DownloadTime
                // does not include the time spent in CopyStreamToMemory.
                downloadTimer.Stop();
                PropertyBag result = new PropertyBag
                    {
                        Step = crawlStep,
                        CharacterSet = resp.CharacterSet,
                        ContentEncoding = resp.ContentEncoding,
                        ContentType = resp.ContentType,
                        Headers = resp.Headers,
                        IsMutuallyAuthenticated = resp.IsMutuallyAuthenticated,
                        IsFromCache = resp.IsFromCache,
                        LastModified = resp.LastModified,
                        Method = resp.Method,
                        ProtocolVersion = resp.ProtocolVersion,
                        ResponseUri = resp.ResponseUri,
                        Server = resp.Server,
                        StatusCode = resp.StatusCode,
                        StatusDescription = resp.StatusDescription,
                        Response = CopyStreamToMemory(responseStream, MaximumContentSize),
                        DownloadTime = downloadTimer.Elapsed,
                    };

                if (m_CacheEnabled)
                {
                    WriteCacheEntry(crawlStep, method, result);
                }

                return result;
            }
        }
 /// <summary>
 /// Creates the event args for a failed download.
 /// </summary>
 /// <param name="crawlStep">Value stored in <c>CrawlStep</c>.</param>
 /// <param name="exception">Value stored in <c>Exception</c>.</param>
 public DownloadExceptionEventArgs(CrawlStep crawlStep, Exception exception)
 {
     this.CrawlStep = crawlStep;
     this.Exception = exception;
 }
示例#25
0
 /// <summary>
 /// Runs <paramref name="original"/> through every entry of <c>Substitutions</c> in turn.
 /// </summary>
 /// <param name="original">Text to rewrite.</param>
 /// <param name="crawlStep">Step handed to each substitution.</param>
 /// <returns>The rewritten text, or <paramref name="original"/> unchanged when <c>HasSubstitutionRules</c> is false.</returns>
 protected string Substitute(string original, CrawlStep crawlStep)
 {
     if (!this.HasSubstitutionRules)
     {
         return original;
     }

     // Each substitution receives the output of the previous one.
     return this.Substitutions.Aggregate(
         original,
         (text, rule) => rule.Substitute(text, crawlStep));
 }
示例#26
0
 /// <summary>
 /// Runs <paramref name="original"/> through every entry of <c>Substitutions</c> in turn.
 /// </summary>
 /// <param name="original">Text to rewrite.</param>
 /// <param name="crawlStep">Step handed to each substitution.</param>
 /// <returns>The rewritten text, or <paramref name="original"/> unchanged when <c>HasSubstitutionRules</c> is false.</returns>
 protected string Substitute(string original, CrawlStep crawlStep)
 {
     string rewritten = original;
     if (HasSubstitutionRules)
     {
         // Each substitution receives the output of the previous one.
         rewritten = Substitutions.Aggregate(original, (soFar, rule) => rule.Substitute(soFar, crawlStep));
     }

     return rewritten;
 }
示例#27
0
 /// <summary>
 /// Builds the cache file path for a crawl step / download method pair.
 /// </summary>
 /// <param name="step">Step whose <c>Uri</c> forms the first part of the file name.</param>
 /// <param name="method">Download method forming the second part of the file name.</param>
 /// <returns>The path under <c>m_CacheFolder</c>, passed through <c>Max(248)</c>.</returns>
 private string GetCacheFileName(CrawlStep step, DownloadMethod method)
 {
     string key = string.Format("{0}_{1}", step.Uri, method);
     string validName = FileSystemHelpers.ToValidFileName(key);
     string combined = Path.Combine(m_CacheFolder, validName);

     // Max(248) is applied to the combined path, folder included
     // (presumably a length cap - confirm against the Max extension).
     return combined.Max(248);
 }
 /// <summary>
 /// Creates the event args raised before a step is downloaded.
 /// </summary>
 /// <param name="cancel">Value forwarded to the base constructor.</param>
 /// <param name="crawlStep">Value stored in <c>CrawlStep</c>.</param>
 internal BeforeDownloadEventArgs(bool cancel, CrawlStep crawlStep)
     : base(cancel)
 {
     this.CrawlStep = crawlStep;
 }
示例#29
0
 /// <summary>
 /// Reads the cache file for a step / method pair and converts its bytes back into a property bag.
 /// </summary>
 /// <param name="step">Step identifying the cache entry.</param>
 /// <param name="method">Download method identifying the cache entry.</param>
 /// <returns>The property bag produced by <c>FromBinary</c>.</returns>
 private PropertyBag GetCacheEntry(CrawlStep step, DownloadMethod method)
 {
     string cachePath = GetCacheFileName(step, method);
     byte[] stored = File.ReadAllBytes(cachePath);
     return stored.FromBinary<PropertyBag>();
 }
示例#30
0
        /// <summary>
        /// Downloads the content of a crawl step with a downloader obtained from the factory
        /// </summary>
        /// <param name="step">Step in crawler that contains url to download</param>
        /// <returns>
        /// Downloaded content, or null when the download threw; in that case the exception
        /// has been passed to OnDownloadException
        /// </returns>
        private PropertyBag Download(CrawlStep step)
        {
            PropertyBag downloaded = null;

            try
            {
                IWebDownloader downloader = m_DownloaderFactory.GetDownloader();
                m_Logger.Verbose("Downloading {0}", step.Uri);
                downloaded = downloader.Download(step, DownloadMethod.Get);
            }
            catch (Exception failure)
            {
                // Failures are reported through the handler rather than propagated to the caller.
                OnDownloadException(failure, step);
            }

            return downloaded;
        }
示例#31
0
文件: Crawler.cs 项目: fzhenmei/study
        /// <summary>
        /// Finishes a download: reports a failure, or runs the pipeline on the downloaded
        /// property bag, then disposes the counter cookie and processes the queue again.
        /// </summary>
        /// <param name="crawlStep">Step that was downloaded.</param>
        /// <param name="propertyBag">Downloaded content; nothing is processed when it is null.</param>
        /// <param name="exception">Download failure, or null on success.</param>
        /// <param name="counterCookie">Cookie disposed when processing ends; also supplies the queue entry.</param>
        private void EndDownload(CrawlStep crawlStep, PropertyBag propertyBag, Exception exception,
            ThreadSafeCounter.ThreadSafeCounterCookie counterCookie)
        {
            using (counterCookie)
            {
                if (exception == null)
                {
                    if (!propertyBag.IsNull())
                    {
                        propertyBag.Referrer = crawlStep;

                        // Copy the queue entry's initial properties onto the property bag
                        var initialProperties = counterCookie.CrawlerQueueEntry.Properties;
                        if (!initialProperties.IsNull())
                        {
                            initialProperties.ForEach(pair => propertyBag[pair.Key].Value = pair.Value);
                        }

                        // The pipeline steps run one after another for each downloaded content,
                        // and only when the AfterDownload hook lets the step continue.
                        bool shouldProcess = OnAfterDownload(crawlStep, propertyBag);
                        if (shouldProcess)
                        {
                            Pipeline.ForEach(stage => ExecutePipeLineStep(stage, propertyBag));
                        }
                    }
                }
                else
                {
                    OnDownloadException(exception, crawlStep);
                }
            }

            ProcessQueue();
        }
示例#32
0
 /// <summary>
 /// Applies <c>m_Match.Value.Replace</c> to <paramref name="original"/> with <c>m_Replacement</c>.
 /// </summary>
 /// <param name="original">Text to rewrite.</param>
 /// <param name="crawlStep">Not used by this substitution.</param>
 /// <returns>The result of the replace call.</returns>
 public string Substitute(string original, CrawlStep crawlStep)
 {
     var matcher = m_Match.Value;
     return matcher.Replace(original, m_Replacement);
 }
示例#33
0
		/// <summary>
		/// Tests the string form of the url against m_Regex
		/// </summary>
		/// <param name="uri">Url to test</param>
		/// <param name="referrer">Not used by this filter</param>
		/// <returns>True when the match succeeds, else false</returns>
		public bool Match(Uri uri, CrawlStep referrer)
		{
			string candidate = uri.ToString();
			var outcome = m_Regex.Value.Match(candidate);
			return outcome.Success;
		}
示例#34
0
 /// <summary>
 /// Awaits <c>DownloadInternalSync</c> for the given step and returns its result.
 /// </summary>
 /// <param name="crawlStep">Step to download.</param>
 /// <param name="referrer">Step passed on as the referrer.</param>
 /// <param name="method">Download method passed on unchanged.</param>
 /// <returns>The property bag produced by <c>DownloadInternalSync</c>.</returns>
 public async Task <PropertyBag> DownloadAsync(CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method)
 {
     var downloaded = await DownloadInternalSync(crawlStep, referrer, method);
     return downloaded;
 }
示例#35
0
 /// <summary>
 /// Writes the binary form of <paramref name="result"/> to the cache file for the step / method pair.
 /// </summary>
 /// <param name="step">Step identifying the cache entry.</param>
 /// <param name="method">Download method identifying the cache entry.</param>
 /// <param name="result">Property bag converted with <c>ToBinary</c>.</param>
 private void WriteCacheEntry(CrawlStep step, DownloadMethod method, PropertyBag result)
 {
     string cachePath = GetCacheFileName(step, method);
     var payload = result.ToBinary();
     File.WriteAllBytes(cachePath, payload);
 }
示例#36
0
 /// <summary>
 /// Reads the cache file for a step / method pair and converts its bytes back into a property bag.
 /// </summary>
 /// <param name="step">Step identifying the cache entry.</param>
 /// <param name="method">Download method identifying the cache entry.</param>
 /// <returns>The property bag produced by <c>FromBinary</c>.</returns>
 private PropertyBag GetCacheEntry(CrawlStep step, DownloadMethod method)
 {
     string entryPath = GetCacheFileName(step, method);
     byte[] raw = File.ReadAllBytes(entryPath);
     return raw.FromBinary <PropertyBag>();
 }
 /// <summary>
 /// Creates the event args for a failed download.
 /// </summary>
 /// <param name="crawlStep">Value stored in <c>CrawlStep</c>.</param>
 /// <param name="exception">Value stored in <c>Exception</c>.</param>
 public DownloadExceptionEventArgs(CrawlStep crawlStep, Exception exception)
 {
     this.CrawlStep = crawlStep;
     this.Exception = exception;
 }
示例#38
0
		/// <summary>
		/// Applies m_Match.Value.Replace to the original text with m_Replacement
		/// </summary>
		/// <param name="original">Text to rewrite</param>
		/// <param name="crawlStep">Not used by this substitution</param>
		/// <returns>The result of the replace call</returns>
		public string Substitute(string original, CrawlStep crawlStep)
		{
			var matcher = m_Match.Value;
			return matcher.Replace(original, m_Replacement);
		}
		/// <summary>
		/// Creates the event args for a failed download
		/// </summary>
		/// <param name="crawlStep">Value stored in CrawlStep</param>
		/// <param name="referrrer">Value stored in Referrer</param>
		/// <param name="exception">Value stored in Exception</param>
		public DownloadExceptionEventArgs(CrawlStep crawlStep, CrawlStep referrrer, Exception exception)
		{
			this.CrawlStep = crawlStep;
			this.Referrer = referrrer;
			this.Exception = exception;
		}
示例#40
0
 /// <summary>
 /// Downloads the step by delegating to <c>DownloadInternalSync</c>.
 /// </summary>
 /// <param name="crawlStep">Step to download.</param>
 /// <param name="referrer">Step passed on as the referrer.</param>
 /// <param name="method">Download method passed on unchanged.</param>
 /// <returns>The property bag returned by <c>DownloadInternalSync</c>.</returns>
 public PropertyBag Download(CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method)
 {
     PropertyBag downloaded = DownloadInternalSync(crawlStep, referrer, method);
     return downloaded;
 }
示例#41
0
 /// <summary>
 /// Applies <c>_match.Value.Replace</c> to <paramref name="original"/> with <c>_replacement</c>.
 /// </summary>
 /// <param name="original">Text to rewrite.</param>
 /// <param name="crawlStep">Not used by this substitution.</param>
 /// <returns>The result of the replace call.</returns>
 public string Substitute(string original, CrawlStep crawlStep)
 {
     var matcher = _match.Value;
     return matcher.Replace(original, _replacement);
 }
示例#42
0
 /// <summary>
 /// Writes the binary form of <paramref name="result"/> to the cache file for the step / method pair.
 /// </summary>
 /// <param name="step">Step identifying the cache entry.</param>
 /// <param name="method">Download method identifying the cache entry.</param>
 /// <param name="result">Property bag converted with <c>ToBinary</c>.</param>
 private void WriteCacheEntry(CrawlStep step, DownloadMethod method, PropertyBag result)
 {
     string destination = GetCacheFileName(step, method);
     var serialized = result.ToBinary();
     File.WriteAllBytes(destination, serialized);
 }
示例#43
0
 /// <summary>
 /// Tests the string form of the url against <c>m_Regex</c>.
 /// </summary>
 /// <param name="uri">Url to test.</param>
 /// <param name="referrer">Not used by this filter.</param>
 /// <returns>True when the match succeeds, else false.</returns>
 public bool Match(Uri uri, CrawlStep referrer)
 {
     string candidate = uri.ToString();
     var outcome = m_Regex.Value.Match(candidate);
     return outcome.Success;
 }
示例#44
0
        /// <summary>
        /// Downloads the url of a crawl step synchronously and returns the response as a property bag.
        /// When caching is enabled, an existing cache entry is returned instead of issuing a request,
        /// and a fresh download is written to the cache before it is returned.
        /// </summary>
        /// <param name="crawlStep">Step whose Uri is requested; must not be null.</param>
        /// <param name="method">Download method; its string form is used as the HTTP method.</param>
        /// <returns>Property bag built from the response headers and the copied response body.</returns>
        /// <exception cref="WebException">Rethrown when the failed request carries no HTTP response.</exception>
        public PropertyBag Download(CrawlStep crawlStep, DownloadMethod method)
        {
            AspectF.Define.
            NotNull(crawlStep, "crawlStep");

            // Fall back to a generic agent string when none has been configured.
            if (UserAgent.IsNullOrEmpty())
            {
                UserAgent = "Mozilla/5.0";
            }

            // Cache hit: no request is made at all.
            if (m_CacheEnabled)
            {
                if (CacheEntryExists(crawlStep, method))
                {
                    return(GetCacheEntry(crawlStep, method));
                }
            }

            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(crawlStep.Uri);

            req.Method            = method.ToString();
            req.AllowAutoRedirect = true;
            req.UserAgent         = UserAgent;
            req.Accept            = "*/*";
            req.KeepAlive         = true;
            if (ConnectionTimeout.HasValue)
            {
                req.Timeout = Convert.ToInt32(ConnectionTimeout.Value.TotalMilliseconds);
            }

            if (ReadTimeout.HasValue)
            {
                req.ReadWriteTimeout = Convert.ToInt32(ReadTimeout.Value.TotalMilliseconds);
            }

            if (UseCookies)
            {
                req.CookieContainer = CookieContainer;
            }

            Stopwatch       downloadTimer = Stopwatch.StartNew();
            HttpWebResponse resp;

            try
            {
                resp = (HttpWebResponse)req.GetResponse();
            }
            catch (WebException we)
            {
                // A WebException that carries an HTTP response is processed like any other
                // response; only a failure without a response is rethrown.
                resp = we.Response as HttpWebResponse;
                if (resp.IsNull())
                {
                    throw;
                }
            }

            using (resp)
                using (Stream responseStream = resp.GetResponseStream())
                {
                    // The timer is stopped before the body is copied below, so DownloadTime
                    // does not include the time spent in CopyStreamToMemory.
                    downloadTimer.Stop();
                    PropertyBag result = new PropertyBag
                    {
                        Step                    = crawlStep,
                        CharacterSet            = resp.CharacterSet,
                        ContentEncoding         = resp.ContentEncoding,
                        ContentType             = resp.ContentType,
                        Headers                 = resp.Headers,
                        IsMutuallyAuthenticated = resp.IsMutuallyAuthenticated,
                        IsFromCache             = resp.IsFromCache,
                        LastModified            = resp.LastModified,
                        Method                  = resp.Method,
                        ProtocolVersion         = resp.ProtocolVersion,
                        ResponseUri             = resp.ResponseUri,
                        Server                  = resp.Server,
                        StatusCode              = resp.StatusCode,
                        StatusDescription       = resp.StatusDescription,
                        Response                = CopyStreamToMemory(responseStream, MaximumContentSize),
                        DownloadTime            = downloadTimer.Elapsed,
                    };

                    if (m_CacheEnabled)
                    {
                        WriteCacheEntry(crawlStep, method, result);
                    }

                    return(result);
                }
        }
示例#45
0
        /// <summary>
        /// 	Queue a new step on the crawler queue, after filtering the url against the
        /// 	block/store lists read from OriginalWebSite.txt in the application base directory
        /// </summary>
        /// <remarks>
        /// 	The site filter runs before the "crawler must be running" check, so a url rejected by
        /// 	the filter returns silently even when the crawler is not running.
        /// </remarks>
        /// <param name = "uri">url to crawl</param>
        /// <param name = "depth">depth of the url</param>
        /// <param name = "referrer">Step which the url was located</param>
        /// <param name = "properties">Custom properties</param>
        /// <exception cref="InvalidOperationException">The url passed the site filter but the crawler is not running</exception>
        public void AddStep(Uri uri, int depth, CrawlStep referrer, Dictionary<string, object> properties)
        {
            // The raw text of the filter file is cached under a key derived from the base directory.
            var jsonStr = cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite") as string;
            if (jsonStr == null)
            {
                using (var stream = new StreamReader(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite.txt", Encoding.UTF8))
                {
                    jsonStr = stream.ReadToEnd();
                    // Cache entry: not removable, expires one day from now.
                    var policy = new CacheItemPolicy();
                    policy.Priority = CacheItemPriority.NotRemovable;
                    policy.AbsoluteExpiration = DateTimeOffset.Now.AddDays(1);
                    cache.Set(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite", jsonStr, policy);
                    Console.WriteLine("cache --" + AppDomain.CurrentDomain.BaseDirectory + " :" + cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite"));
                }
            }
            // Only the text is cached; it is deserialized again on every call.
            var json = JsonConvert.DeserializeObject<OriginalWebSiteTxt>(jsonStr);
            var storeRegex = new List<string>();
            var blockRegex = new List<string>();
            if (json.StoreRegex != null && json.StoreRegex.Count > 0)
            {
                storeRegex = json.StoreRegex;
            }
            if (json.BlockRegex != null && json.BlockRegex.Count > 0)
            {
                blockRegex = json.BlockRegex;
            }
            bool needToCrawl = false;
            // Despite the name, block entries are compared with a plain substring test, not as regexes.
            if (blockRegex != null && blockRegex.Count > 0)
            {
                foreach (var key in blockRegex)
                {
                    if (uri.AbsoluteUri.Contains(key))
                    {
                        return;
                    }
                }
            }
            // Store entries are case-insensitive regexes; an empty store list accepts every url.
            if (storeRegex != null && storeRegex.Count > 0)
            {
                foreach (var regex in storeRegex)
                {
                    if (Regex.IsMatch(uri.AbsoluteUri, regex, RegexOptions.IgnoreCase))
                    {
                        needToCrawl = true;
                        break;
                    }
                }
            }
            else
            {
                needToCrawl = true;
            }
            if (!needToCrawl) return;

            if (!m_Crawling)
            {
                throw new InvalidOperationException("Crawler must be running before adding steps");
            }

            if (m_CrawlStopped)
            {
                return;
            }

            // The checks short-circuit in order, so the url is registered in the crawler history
            // only after it has passed the scheme, depth and rule checks.
            if ((uri.Scheme != Uri.UriSchemeHttps && uri.Scheme != Uri.UriSchemeHttp) || // Only accept http(s) schema
                (MaximumCrawlDepth.HasValue && MaximumCrawlDepth.Value > 0 && depth >= MaximumCrawlDepth.Value) ||
                !m_CrawlerRules.IsAllowedUrl(uri, referrer) ||
                !m_CrawlerHistory.Register(uri.GetUrlKeyString(UriSensitivity)))
            {
                // A rejected depth-0 url stops the whole crawl.
                if (depth == 0)
                {
                    StopCrawl();
                }

                return;
            }

            // Make new crawl step
            CrawlStep crawlStep = new CrawlStep(uri, depth)
                {
                    IsExternalUrl = m_CrawlerRules.IsExternalUrl(uri),
                    IsAllowed = true,
                };
            m_CrawlerQueue.Push(new CrawlerQueueEntry
                {
                    CrawlStep = crawlStep,
                    Referrer = referrer,
                    Properties = properties
                });
            m_Logger.Verbose("Added {0} to queue referred from {1}",
                crawlStep.Uri, referrer.IsNull() ? string.Empty : referrer.Uri.ToString());
            ProcessQueue();
        }
示例#46
0
        /// <summary>
        /// 	Queue a new step on the crawler queue
        /// </summary>
        /// <param name = "uri">url to crawl</param>
        /// <param name = "depth">depth of the url</param>
        /// <param name = "referrer">Step which the url was located</param>
        /// <param name = "properties">Custom properties</param>
        /// <exception cref="InvalidOperationException">The crawler is not running</exception>
        public void AddStep(Uri uri, int depth, CrawlStep referrer, Dictionary<string, object> properties)
        {
            if (!m_Crawling)
            {
                throw new InvalidOperationException("Crawler must be running before adding steps");
            }

            if (m_CrawlStopped)
            {
                return;
            }

            // Only accept http(s) schema
            bool isWebScheme = uri.Scheme == Uri.UriSchemeHttps || uri.Scheme == Uri.UriSchemeHttp;

            // The checks short-circuit in order, so the url is registered in the crawler history
            // only after it has passed the scheme, depth and rule checks.
            bool rejected =
                !isWebScheme ||
                (MaximumCrawlDepth.HasValue && MaximumCrawlDepth.Value > 0 && depth >= MaximumCrawlDepth.Value) ||
                !m_CrawlerRules.IsAllowedUrl(uri, referrer) ||
                !m_CrawlerHistory.Register(uri.GetUrlKeyString(UriSensitivity));

            if (rejected)
            {
                // A rejected depth-0 url stops the whole crawl.
                if (depth == 0)
                {
                    StopCrawl();
                }

                return;
            }

            // Make new crawl step
            var queuedStep = new CrawlStep(uri, depth);
            queuedStep.IsExternalUrl = m_CrawlerRules.IsExternalUrl(uri);
            queuedStep.IsAllowed = true;

            var queueEntry = new CrawlerQueueEntry();
            queueEntry.CrawlStep = queuedStep;
            queueEntry.Referrer = referrer;
            queueEntry.Properties = properties;
            m_CrawlerQueue.Push(queueEntry);

            m_Logger.Verbose("Added {0} to queue referred from {1}",
                queuedStep.Uri, referrer.IsNull() ? string.Empty : referrer.Uri.ToString());
            ProcessQueue();
        }
		/// <summary>
		/// Creates the event args raised before a step is downloaded
		/// </summary>
		/// <param name="cancel">Value forwarded to the base constructor</param>
		/// <param name="crawlStep">Value stored in CrawlStep</param>
		internal BeforeDownloadEventArgs(bool cancel, CrawlStep crawlStep)
			: base(cancel)
		{
			this.CrawlStep = crawlStep;
		}
示例#48
0
		/// <summary>
		/// Creates the event args for a failed download
		/// </summary>
		/// <param name="crawlStep">Value stored in CrawlStep</param>
		/// <param name="referrrer">Value stored in Referrer</param>
		/// <param name="exception">Value stored in Exception</param>
		public DownloadExceptionEventArgs(CrawlStep crawlStep, CrawlStep referrrer, Exception exception)
		{
			CrawlStep = crawlStep;
			Referrer = referrrer;
			Exception = exception;
		}
示例#49
0
 /// <summary>
 /// Creates the event args for a failed download.
 /// </summary>
 /// <param name="crawlStep">Value stored in <c>CrawlStep</c>.</param>
 /// <param name="referrrer">Value stored in <c>Referrer</c>.</param>
 /// <param name="exception">Value stored in <c>Exception</c>.</param>
 public DownloadExceptionEventArgs(CrawlStep crawlStep, CrawlStep referrrer, Exception exception)
 {
     this.CrawlStep = crawlStep;
     this.Referrer = referrrer;
     this.Exception = exception;
 }
示例#50
0
 /// <summary>
 /// Awaits <c>DownloadInternalSync</c> for the given step, without capturing the current
 /// context, and returns its result.
 /// </summary>
 /// <param name="crawlStep">Step to download.</param>
 /// <param name="referrer">Step passed on as the referrer.</param>
 /// <param name="method">Download method passed on unchanged.</param>
 /// <returns>The property bag produced by <c>DownloadInternalSync</c>.</returns>
 public async Task <PropertyBag> DownloadAsync(CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method)
 {
     var download = DownloadInternalSync(crawlStep, referrer, method);
     var downloaded = await download.ConfigureAwait(false);
     return downloaded;
 }
示例#51
0
 /// <summary>
 /// Checks whether the cache file for a step / method pair exists.
 /// </summary>
 /// <param name="step">Step identifying the cache entry.</param>
 /// <param name="method">Download method identifying the cache entry.</param>
 /// <returns>The result of <c>FileSystemHelpers.FileExists</c> for the cache file name.</returns>
 private bool CacheEntryExists(CrawlStep step, DownloadMethod method)
 {
     string cachePath = GetCacheFileName(step, method);
     return FileSystemHelpers.FileExists(cachePath);
 }
示例#52
0
 /// <summary>
 /// Checks whether the cache file for a step / method pair exists.
 /// </summary>
 /// <param name="step">Step identifying the cache entry.</param>
 /// <param name="method">Download method identifying the cache entry.</param>
 /// <returns>The result of <c>FileSystemHelpers.FileExists</c> for the cache file name.</returns>
 private bool CacheEntryExists(CrawlStep step, DownloadMethod method)
 {
     string entryPath = GetCacheFileName(step, method);
     return FileSystemHelpers.FileExists(entryPath);
 }