/// <summary>
/// Updates an existing <see cref="CrawlStep"/> entity.
/// </summary>
/// <param name="id">Identifier from the route/form; must match the body's Id.</param>
/// <param name="crawlStep">Updated entity posted by the client.</param>
/// <returns>204 on success, 400 on id mismatch, 404 when the entity was deleted concurrently.</returns>
public async Task<IActionResult> PutCrawlStep([FromForm] int id, [FromForm] CrawlStep crawlStep)
{
    if (id != crawlStep.Id)
    {
        return BadRequest();
    }

    _context.Entry(crawlStep).State = EntityState.Modified;

    try
    {
        await _context.SaveChangesAsync();
    }
    catch (DbUpdateConcurrencyException)
    {
        // A concurrent delete is reported as 404; any other concurrency
        // conflict is rethrown for the framework to surface.
        if (!CrawlStepExists(id))
        {
            return NotFound();
        }

        throw;
    }

    return NoContent();
}
/// <summary>
/// Starts an asynchronous download of <paramref name="crawlStep"/> and invokes
/// <paramref name="completed"/> when the request finishes.
/// </summary>
/// <param name="crawlStep">Step to download; must not be null.</param>
/// <param name="referrer">Step the url was discovered on; may be null.</param>
/// <param name="method">Http method to use for the request.</param>
/// <param name="completed">Completion callback; must not be null.</param>
/// <param name="progress">Optional progress callback.</param>
/// <param name="state">Caller state carried through the request.</param>
public void DownloadAsync<T>(CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method,
    Action<RequestState<T>> completed, Action<DownloadProgressEventArgs> progress, T state)
{
    AspectF.Define.
        NotNull(crawlStep, "crawlStep").
        NotNull(completed, "completed");

    // Fall back to a generic agent string when none has been configured.
    if (UserAgent.IsNullOrEmpty())
    {
        UserAgent = "Mozilla/5.0";
    }

    var request = new RequestState<T>
    {
        DownloadTimer = Stopwatch.StartNew(),
        Complete = completed,
        CrawlStep = crawlStep,
        Referrer = referrer,
        State = state,
        DownloadProgress = progress,
        Retry = RetryCount.GetValueOrDefault(0) + 1,
        Method = method,
    };

    DownloadAsync(request, null);
}
/// <summary>
/// Builds the on-disk cache file path for a step/method combination,
/// capped at 248 characters to respect path-length limits.
/// </summary>
private string GetCacheFileName(CrawlStep step, DownloadMethod method)
{
    string rawName = string.Format("{0}_{1}", step.Uri, method);
    string safeName = FileSystemHelpers.ToValidFileName(rawName);
    return Path.Combine(m_CacheFolder, safeName).Max(248);
}
/// <summary>
/// Creates a new <see cref="CrawlStep"/> and returns 201 Created pointing at GetCrawlStep.
/// </summary>
/// <param name="crawlStep">Entity to persist.</param>
public async Task<ActionResult<CrawlStep>> PostCrawlStep([FromForm] CrawlStep crawlStep)
{
    _context.CrawlSteps.Add(crawlStep);
    await _context.SaveChangesAsync();

    var routeValues = new { id = crawlStep.Id };
    return CreatedAtAction("GetCrawlStep", routeValues, crawlStep);
}
/// <summary>
/// Starts an asynchronous download of <paramref name="crawlStep"/> and returns
/// the request state once the download completes.
/// </summary>
/// <param name="crawlStep">Step to download; must not be null.</param>
/// <param name="referrer">Step the url was discovered on; may be null.</param>
/// <param name="method">Download method, converted to an http method for the request.</param>
/// <param name="completed">Async completion callback; must not be null.</param>
/// <param name="progress">Optional progress callback.</param>
/// <param name="state">Caller state carried through the request.</param>
public async Task<RequestState<T>> DownloadAsync<T>(CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method,
    Func<RequestState<T>, Task> completed, Action<DownloadProgressEventArgs> progress, T state)
{
    AspectF.Define.
        NotNull(crawlStep, "crawlStep").
        NotNull(completed, "completed");

    // Default to a generic agent string when none is configured.
    if (this.UserAgent.IsNullOrEmpty())
    {
        this.UserAgent = "Mozilla/5.0";
    }

    var pendingRequest = new RequestState<T>
    {
        DownloadTimer = Stopwatch.StartNew(),
        Complete = completed,
        CrawlStep = crawlStep,
        Referrer = referrer,
        State = state,
        DownloadProgress = progress,
        Retry = this.RetryCount.GetValueOrDefault(0) + 1,
        Method = ConvertToHttpMethod(method),
    };

    return await this.DownloadAsync(pendingRequest, null);
}
/// <summary>
/// Checks if the crawler should follow an url
/// </summary>
/// <param name="uri">Url to check</param>
/// <param name="referrer">Step the url was discovered on</param>
/// <returns>True if the crawler should follow the url, else false</returns>
public virtual bool IsAllowedUrl(Uri uri, CrawlStep referrer)
{
    // Reject urls longer than the configured maximum (limit only applies when > 10).
    bool hasSizeLimit = m_Crawler.MaximumUrlSize.HasValue && m_Crawler.MaximumUrlSize.Value > 10;
    if (hasSizeLimit && uri.ToString().Length > m_Crawler.MaximumUrlSize.Value)
    {
        return false;
    }

    // An include-filter match wins outright, before exclusion is considered.
    if (!m_Crawler.IncludeFilter.IsNull() && m_Crawler.IncludeFilter.Any(f => f.Match(uri, referrer)))
    {
        return true;
    }

    if (!m_Crawler.ExcludeFilter.IsNull() && m_Crawler.ExcludeFilter.Any(f => f.Match(uri, referrer)))
    {
        return false;
    }

    if (IsExternalUrl(uri))
    {
        return false;
    }

    // Finally defer to robots.txt when the crawler is configured to honor it.
    if (m_Crawler.AdhereToRobotRules)
    {
        return m_Robot.IsAllowed(m_Crawler.UserAgent, uri);
    }

    return true;
}
/// <summary>
/// Creates an <see cref="AsyncRequestState{T}"/> for the given step and awaits the download.
/// </summary>
/// <param name="crawlStep">Step to download; must not be null.</param>
/// <param name="referrer">Step the url was discovered on; may be null.</param>
/// <param name="method">Http method to use for the request.</param>
/// <param name="completed">Completion callback supplied by the caller.</param>
/// <param name="progress">Optional progress callback.</param>
/// <param name="state">Caller state carried through the request.</param>
/// <returns>The request state once the download has finished.</returns>
public async Task<AsyncRequestState<T>> DownloadAsync<T>(
    CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method,
    Action<AsyncRequestState<T>> completed, Action<DownloadProgressEventArgs> progress, T state)
{
    AspectF.Define.
        NotNull(crawlStep, "crawlStep");

    // Default to a generic agent string when none has been configured.
    if (UserAgent.IsNullOrEmpty())
    {
        UserAgent = "Mozilla/5.0";
    }

    // NOTE(review): unlike the sibling overloads, "completed" is neither
    // null-checked nor assigned onto the request state here — confirm whether
    // the awaited DownloadAsync(requestState) call invokes it via another path.
    AsyncRequestState<T> requestState = new AsyncRequestState<T>
    {
        CrawlStep = crawlStep,
        Referrer = referrer,
        State = state,
        DownloadProgress = progress,
        Retry = RetryCount.HasValue ? RetryCount.Value + 1 : 1,
        Method = method,
    };

    return (await DownloadAsync(requestState));
}
/// <summary>
/// Checks if the crawler should follow an url
/// </summary>
/// <param name="uri">Url to check</param>
/// <param name="referrer">Step the url was discovered on</param>
/// <returns>True if the crawler should follow the url, else false</returns>
public virtual async Task<bool> IsAllowedUrlAsync(Uri uri, CrawlStep referrer)
{
    // Reject urls longer than the configured maximum (limit only applies when > 10).
    var maxSize = this.m_Crawler.MaximumUrlSize;
    if (maxSize.HasValue && maxSize.Value > 10 && uri.ToString().Length > maxSize.Value)
    {
        return false;
    }

    // An include-filter match wins outright, before exclusion is considered.
    if (!this.m_Crawler.IncludeFilter.IsNull())
    {
        if (this.m_Crawler.IncludeFilter.Any(f => f.Match(uri, referrer)))
        {
            return true;
        }
    }

    if (!this.m_Crawler.ExcludeFilter.IsNull())
    {
        if (this.m_Crawler.ExcludeFilter.Any(f => f.Match(uri, referrer)))
        {
            return false;
        }
    }

    if (IsExternalUrl(uri))
    {
        return false;
    }

    // Finally defer to robots.txt when the crawler is configured to honor it.
    if (this.m_Crawler.AdhereToRobotRules)
    {
        return await this.m_Robot.IsAllowed(this.m_Crawler.UserAgent, uri);
    }

    return true;
}
/// <summary>
/// Filter that admits every url except those under http://*.cnblogs.com/.
/// </summary>
/// <param name="uri">Url to test.</param>
/// <param name="referrer">Unused; present to satisfy the filter contract.</param>
public bool Match(Uri uri, CrawlStep referrer)
{
    bool isCnblogs = Regex.IsMatch(uri.AbsoluteUri, @"http://.*\.cnblogs\.com/.*");
    return !isCnblogs;
}
/// <summary>
/// Allows only internal urls that also pass the base crawler rules.
/// </summary>
public override bool IsAllowedUrl(Uri uri, CrawlStep referrer)
{
    return !base.IsExternalUrl(uri) && base.IsAllowedUrl(uri, referrer);
}
/// <summary>
/// Executes the DownloadException event, and stops the crawl once the number of
/// failed downloads exceeds the configured maximum.
/// </summary>
/// <param name="exception">The exception raised while downloading.</param>
/// <param name="crawlStep">The step whose download failed.</param>
/// <param name="referrer">The step that referred to <paramref name="crawlStep"/>.</param>
private void OnDownloadException(Exception exception, CrawlStep crawlStep, CrawlStep referrer)
{
    long downloadErrors = Interlocked.Increment(ref m_DownloadErrors);

    // BUGFIX: the original condition (Max > downloadErrors) fired while the error
    // count was still BELOW the limit; the crawl must stop only once the limit is
    // actually exceeded, matching the log message below.
    if (MaximumHttpDownloadErrors.HasValue && downloadErrors > MaximumHttpDownloadErrors.Value)
    {
        m_Logger.Error("Number of maximum failed downloads exceeded({0}), cancelling crawl",
            MaximumHttpDownloadErrors.Value);
        StopCrawl();
    }

    m_Logger.Error("Download exception while downloading {0}, error was {1}", crawlStep.Uri, exception);
    DownloadException.ExecuteEvent(this, () => new DownloadExceptionEventArgs(crawlStep, referrer, exception));
}
/// <summary>
/// Downloads a step and buffers the entire response into memory so the returned
/// <see cref="PropertyBag"/> can be re-read after the underlying stream is gone.
/// (Original XML summary was a copy-paste about cookies and did not describe this method.)
/// </summary>
/// <param name="crawlStep">Step to download.</param>
/// <param name="referrer">Step the url was discovered on; may be null.</param>
/// <param name="method">Http method to use for the request.</param>
/// <returns>The downloaded property bag with a replayable response stream.</returns>
/// <exception cref="Exception">Wraps any download error with the failing url.</exception>
private async Task<PropertyBag> DownloadInternalSync(CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method)
{
    PropertyBag result = null;
    Exception ex = null;
    // NOTE(review): the callback signals this event before the outer await
    // resumes; WaitOne below then blocks until the callback has run — confirm
    // this mixed await/blocking pattern cannot deadlock on the caller's context.
    using (var resetEvent = new ManualResetEvent(false))
    {
        await DownloadAsync<object>(crawlStep, referrer, method,
            (RequestState<object> state) =>
            {
                if (state.Exception.IsNull())
                {
                    result = state.PropertyBag;
                    if (!result.GetResponse.IsNull())
                    {
                        // Drain the live response stream into a byte array so the
                        // property bag can hand out fresh MemoryStreams on demand.
                        using (var response = result.GetResponse())
                        {
                            byte[] data;
                            if (response is MemoryStream)
                            {
                                data = ((MemoryStream)response).ToArray();
                            }
                            else
                            {
                                using (var copy = response.CopyToMemory())
                                {
                                    data = copy.ToArray();
                                }
                            }

                            result.GetResponse = () => new MemoryStream(data);
                        }
                    }
                }
                else
                {
                    // Capture the failure; it is rethrown (wrapped) after the wait.
                    ex = state.Exception;
                }

                resetEvent.Set();
                return Task.FromResult(0);
            }, null, null);
        resetEvent.WaitOne();
    }

    if (!ex.IsNull())
    {
        throw new Exception("Error write downloading {0}".FormatWith(crawlStep.Uri), ex);
    }

    return result;
}
/// <summary>
/// Raises the AfterDownload event and reports whether the crawl of this url
/// should continue.
/// </summary>
/// <param name="crawlStep">Step that was downloaded.</param>
/// <param name="response">Downloaded content handed to subscribers.</param>
/// <returns>False when a subscriber cancelled the step, otherwise true.</returns>
private bool OnAfterDownload(CrawlStep crawlStep, PropertyBag response)
{
    var handler = AfterDownload;
    if (handler.IsNull())
    {
        // No subscribers: fall back to the step's own permission flag.
        return crawlStep.IsAllowed;
    }

    var args = new AfterDownloadEventArgs(!crawlStep.IsAllowed, response);
    handler(this, args);
    return !args.Cancel;
}
/// <summary>
/// Raises the BeforeDownload event and reports whether the crawl of this url
/// should continue.
/// </summary>
/// <param name="crawlStep">Step about to be downloaded.</param>
/// <returns>False when a subscriber cancelled the step, otherwise true.</returns>
private bool OnBeforeDownload(CrawlStep crawlStep)
{
    var handler = BeforeDownload;
    if (handler.IsNull())
    {
        // No subscribers: fall back to the step's own permission flag.
        return crawlStep.IsAllowed;
    }

    var args = new BeforeDownloadEventArgs(!crawlStep.IsAllowed, crawlStep);
    handler(this, args);
    return !args.Cancel;
}
/// <summary>
/// Rejects external urls outright; otherwise defers to the base rules.
/// </summary>
public override bool IsAllowedUrl(Uri uri, CrawlStep referrer)
{
    // True if origin base uri is not equal to the crawler uri
    if (base.IsExternalUrl(uri))
    {
        return false;
    }

    return base.IsAllowedUrl(uri, referrer);
}
/// <summary>
/// Evaluates whichever delegate filter was supplied; the two-argument delegate
/// takes precedence, and no delegate at all means no match.
/// </summary>
public bool Match(Uri uri, CrawlStep referrer)
{
    if (!m_Match.IsNull())
    {
        return m_Match(uri, referrer);
    }

    return !m_Match2.IsNull() && m_Match2(uri);
}
/// <summary>
/// Evaluates whichever delegate filter was supplied; the two-argument delegate
/// takes precedence, and no delegate at all means no match.
/// </summary>
public bool Match(Uri uri, CrawlStep referrer)
{
    if (!m_Match.IsNull())
    {
        return m_Match(uri, referrer);
    }

    if (m_Match2.IsNull())
    {
        return false;
    }

    return m_Match2(uri);
}
/// <summary>
/// Test downloader: invokes the completion callback immediately on the calling
/// thread with a freshly built request state.
/// </summary>
public void DownloadAsync<T>(CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method,
    Action<RequestState<T>> completed, Action<DownloadProgressEventArgs> progress, T state)
{
    var requestState = new RequestState<T>
    {
        DownloadTimer = Stopwatch.StartNew(),
        Complete = completed,
        CrawlStep = crawlStep,
        Referrer = referrer,
        State = state,
        DownloadProgress = progress,
        Retry = RetryCount.GetValueOrDefault(0) + 1,
        Method = method,
    };

    completed(requestState);
}
/// <summary>
/// Test downloader: builds a request state and invokes the completion callback
/// on a background task.
/// </summary>
/// <returns>A task that yields the request state once the callback has finished.</returns>
public Task<RequestState<T>> DownloadAsync<T>(CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method,
    Func<RequestState<T>, Task> completed, Action<DownloadProgressEventArgs> progress, T state)
{
    var result = new RequestState<T>
    {
        StartTime = DateTime.UtcNow,
        Complete = completed,
        CrawlStep = crawlStep,
        Referrer = referrer,
        State = state,
        DownloadProgress = progress,
        Retry = this.RetryCount.HasValue ? this.RetryCount.Value + 1 : 1,
        Method = ConvertToHttpMethod(method),
    };

    // BUGFIX: the original discarded the Task returned by "completed", so the
    // outer task could complete before the callback finished and any exception
    // thrown by the callback was unobserved. Task.Run unwraps the async lambda,
    // so the returned Task<RequestState<T>> now completes (or faults) with it.
    return Task.Run(async () =>
    {
        await completed(result);
        return result;
    });
}
/// <summary>
/// Fake downloader: returns a canned, successful html response for any step.
/// </summary>
/// <param name="crawlStep">Step whose Uri is echoed back in the result.</param>
/// <param name="referrer">Ignored.</param>
/// <param name="method">Ignored; the result always reports "GET".</param>
public Task<PropertyBag> DownloadAsync(CrawlStep crawlStep, CrawlStep referrer = null, DownloadMethod method = DownloadMethod.GET)
{
    var canned = new PropertyBag
    {
        Step = crawlStep,
        CharacterSet = string.Empty,
        ContentEncoding = string.Empty,
        ContentType = "text/html",
        Headers = null,
        IsMutuallyAuthenticated = false,
        IsFromCache = false,
        LastModified = DateTime.UtcNow,
        Method = "GET",
        ProtocolVersion = new Version(3, 0),
        ResponseUri = crawlStep.Uri,
        Server = "N/A",
        StatusCode = HttpStatusCode.OK,
        StatusDescription = "OK",
        // Each call to GetResponse yields a fresh stream over the embedded resource.
        GetResponse = () => new MemoryStream(Encoding.UTF8.GetBytes(Resources.ncrawler_codeplex_com)),
        DownloadTime = TimeSpan.FromSeconds(1),
    };

    return Task.FromResult(canned);
}
/// <summary>
/// Fake downloader: returns a canned, successful html response for any step.
/// </summary>
/// <param name="crawlStep">Step whose Uri is echoed back in the result.</param>
/// <param name="method">Ignored; the result always reports "GET".</param>
public PropertyBag Download(CrawlStep crawlStep, DownloadMethod method)
{
    var canned = new PropertyBag
    {
        Step = crawlStep,
        CharacterSet = string.Empty,
        ContentEncoding = string.Empty,
        ContentType = "text/html",
        Headers = null,
        IsMutuallyAuthenticated = false,
        IsFromCache = false,
        LastModified = DateTime.UtcNow,
        Method = "GET",
        ProtocolVersion = new Version(3, 0),
        ResponseUri = crawlStep.Uri,
        Server = "N/A",
        StatusCode = HttpStatusCode.OK,
        StatusDescription = "OK",
        Response = Encoding.UTF8.GetBytes(Resources.ncrawler_codeplex_com),
        DownloadTime = TimeSpan.FromSeconds(1),
    };

    return canned;
}
/// <summary>
/// Fake downloader: returns a canned, successful html response for any step.
/// </summary>
/// <param name="crawlStep">Step whose Uri is echoed back in the result.</param>
/// <param name="method">Ignored; the result always reports "GET".</param>
public PropertyBag Download(CrawlStep crawlStep, DownloadMethod method)
{
    var fakeBody = Encoding.UTF8.GetBytes(Resources.ncrawler_codeplex_com);

    return new PropertyBag
    {
        Step = crawlStep,
        CharacterSet = string.Empty,
        ContentEncoding = string.Empty,
        ContentType = "text/html",
        Headers = null,
        IsMutuallyAuthenticated = false,
        IsFromCache = false,
        LastModified = DateTime.UtcNow,
        Method = "GET",
        ProtocolVersion = new Version(3, 0),
        ResponseUri = crawlStep.Uri,
        Server = "N/A",
        StatusCode = HttpStatusCode.OK,
        StatusDescription = "OK",
        Response = fakeBody,
        DownloadTime = TimeSpan.FromSeconds(1),
    };
}
/// <summary>
/// Downloads the content of <paramref name="crawlStep"/> with the given http
/// method and wraps the response in a <see cref="PropertyBag"/>. Serves from,
/// and writes to, the file cache when caching is enabled.
/// (Original XML summary was a copy-paste about cookies and did not describe this method.)
/// </summary>
/// <param name="crawlStep">Step whose Uri is downloaded; must not be null.</param>
/// <param name="method">Http method used for the request.</param>
/// <returns>Property bag describing the response, possibly served from cache.</returns>
public PropertyBag Download(CrawlStep crawlStep, DownloadMethod method)
{
    AspectF.Define.
        NotNull(crawlStep, "crawlStep");

    // Fall back to a generic agent string when none has been configured.
    if (UserAgent.IsNullOrEmpty())
    {
        UserAgent = "Mozilla/5.0";
    }

    // Cache hit short-circuits the network entirely.
    if (m_CacheEnabled)
    {
        if (CacheEntryExists(crawlStep, method))
        {
            return GetCacheEntry(crawlStep, method);
        }
    }

    HttpWebRequest req = (HttpWebRequest) WebRequest.Create(crawlStep.Uri);
    req.Method = method.ToString();
    req.AllowAutoRedirect = true;
    req.UserAgent = UserAgent;
    req.Accept = "*/*";
    req.KeepAlive = true;
    if (ConnectionTimeout.HasValue)
    {
        req.Timeout = Convert.ToInt32(ConnectionTimeout.Value.TotalMilliseconds);
    }
    if (ReadTimeout.HasValue)
    {
        req.ReadWriteTimeout = Convert.ToInt32(ReadTimeout.Value.TotalMilliseconds);
    }
    if (UseCookies)
    {
        req.CookieContainer = CookieContainer;
    }

    Stopwatch downloadTimer = Stopwatch.StartNew();
    HttpWebResponse resp;
    try
    {
        resp = (HttpWebResponse) req.GetResponse();
    }
    catch (WebException we)
    {
        // Http error statuses (4xx/5xx) still carry a response worth recording;
        // rethrow only when there is no response at all (e.g. DNS/connect failure).
        resp = we.Response as HttpWebResponse;
        if (resp.IsNull())
        {
            throw;
        }
    }

    using (resp)
    using (Stream responseStream = resp.GetResponseStream())
    {
        downloadTimer.Stop();
        PropertyBag result = new PropertyBag
        {
            Step = crawlStep,
            CharacterSet = resp.CharacterSet,
            ContentEncoding = resp.ContentEncoding,
            ContentType = resp.ContentType,
            Headers = resp.Headers,
            IsMutuallyAuthenticated = resp.IsMutuallyAuthenticated,
            IsFromCache = resp.IsFromCache,
            LastModified = resp.LastModified,
            Method = resp.Method,
            ProtocolVersion = resp.ProtocolVersion,
            ResponseUri = resp.ResponseUri,
            Server = resp.Server,
            StatusCode = resp.StatusCode,
            StatusDescription = resp.StatusDescription,
            // Body is copied to memory, truncated at MaximumContentSize when set.
            Response = CopyStreamToMemory(responseStream, MaximumContentSize),
            DownloadTime = downloadTimer.Elapsed,
        };
        if (m_CacheEnabled)
        {
            WriteCacheEntry(crawlStep, method, result);
        }
        return result;
    }
}
/// <summary>
/// Event payload describing a download failure for a crawl step.
/// </summary>
/// <param name="crawlStep">Step whose download failed.</param>
/// <param name="exception">Exception raised by the download.</param>
public DownloadExceptionEventArgs(CrawlStep crawlStep, Exception exception)
{
    this.CrawlStep = crawlStep;
    this.Exception = exception;
}
/// <summary>
/// Applies every configured substitution rule to the text, in order.
/// Returns the input unchanged when no rules are configured.
/// </summary>
protected string Substitute(string original, CrawlStep crawlStep)
{
    if (!this.HasSubstitutionRules)
    {
        return original;
    }

    var text = original;
    foreach (var substitution in this.Substitutions)
    {
        text = substitution.Substitute(text, crawlStep);
    }

    return text;
}
/// <summary>
/// Applies every configured substitution rule to the text, in order.
/// Returns the input unchanged when no rules are configured.
/// </summary>
protected string Substitute(string original, CrawlStep crawlStep)
{
    if (HasSubstitutionRules)
    {
        return Substitutions.Aggregate(original,
            (current, substitution) => substitution.Substitute(current, crawlStep));
    }

    return original;
}
/// <summary>
/// Builds the on-disk cache file path for a step/method combination,
/// capped at 248 characters to respect path-length limits.
/// </summary>
private string GetCacheFileName(CrawlStep step, DownloadMethod method)
{
    string combined = string.Format("{0}_{1}", step.Uri, method);
    return Path.Combine(m_CacheFolder, FileSystemHelpers.ToValidFileName(combined)).Max(248);
}
/// <summary>
/// Event payload raised before a crawl step is downloaded.
/// </summary>
/// <param name="cancel">Initial cancellation state for the event.</param>
/// <param name="crawlStep">Step about to be downloaded.</param>
internal BeforeDownloadEventArgs(bool cancel, CrawlStep crawlStep)
    : base(cancel)
{
    this.CrawlStep = crawlStep;
}
/// <summary>
/// Loads and deserializes the cached <see cref="PropertyBag"/> for a step/method.
/// </summary>
private PropertyBag GetCacheEntry(CrawlStep step, DownloadMethod method)
{
    byte[] cachedBytes = File.ReadAllBytes(GetCacheFileName(step, method));
    return cachedBytes.FromBinary<PropertyBag>();
}
/// <summary>
/// Download content from a url
/// </summary>
/// <param name="step">Step in crawler that contains url to download</param>
/// <returns>Downloaded content, or null when the download failed</returns>
private PropertyBag Download(CrawlStep step)
{
    try
    {
        IWebDownloader webDownloader = m_DownloaderFactory.GetDownloader();
        m_Logger.Verbose("Downloading {0}", step.Uri);
        return webDownloader.Download(step, DownloadMethod.Get);
    }
    catch (Exception ex)
    {
        // Failures are reported through the exception event; the crawl continues.
        OnDownloadException(ex, step);
        return null;
    }
}
/// <summary>
/// Finalizes a single download: reports errors, propagates the queue entry's
/// properties onto the property bag, runs the pipeline, releases the in-flight
/// counter, and then continues processing the queue.
/// </summary>
/// <param name="crawlStep">Step that was downloaded.</param>
/// <param name="propertyBag">Downloaded content; may be null when the download failed.</param>
/// <param name="exception">Non-null when the download failed.</param>
/// <param name="counterCookie">In-flight counter token; disposed when this step completes.</param>
private void EndDownload(CrawlStep crawlStep, PropertyBag propertyBag, Exception exception, ThreadSafeCounter.ThreadSafeCounterCookie counterCookie)
{
    using (counterCookie)
    {
        if (exception != null)
        {
            OnDownloadException(exception, crawlStep);
        }
        else if (!propertyBag.IsNull())
        {
            propertyBag.Referrer = crawlStep;

            // Assign initial properties to propertybag
            if (!counterCookie.CrawlerQueueEntry.Properties.IsNull())
            {
                counterCookie.CrawlerQueueEntry.Properties.
                    ForEach(key => propertyBag[key.Key].Value = key.Value);
            }

            if (OnAfterDownload(crawlStep, propertyBag))
            {
                // Executes all the pipelines sequentially for each downloaded content
                // in the crawl process. Used to extract data from content, like which
                // url's to follow, email addresses, aso.
                Pipeline.ForEach(pipelineStep => ExecutePipeLineStep(pipelineStep, propertyBag));
            }
        }
    }

    // Always pump the queue, even when this download failed or was cancelled.
    ProcessQueue();
}
/// <summary>
/// Replaces every match of the configured pattern with the replacement text.
/// </summary>
public string Substitute(string original, CrawlStep crawlStep)
{
    var pattern = this.m_Match.Value;
    return pattern.Replace(original, this.m_Replacement);
}
/// <summary>
/// True when the url matches the configured regular expression.
/// </summary>
public bool Match(Uri uri, CrawlStep referrer)
{
    var candidate = uri.ToString();
    return m_Regex.Value.Match(candidate).Success;
}
/// <summary>
/// Downloads the step and returns the resulting property bag.
/// </summary>
public async Task<PropertyBag> DownloadAsync(CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method)
{
    var propertyBag = await DownloadInternalSync(crawlStep, referrer, method);
    return propertyBag;
}
/// <summary>
/// Serializes the result and writes it to the cache file for the step/method.
/// </summary>
private void WriteCacheEntry(CrawlStep step, DownloadMethod method, PropertyBag result)
{
    string cachePath = GetCacheFileName(step, method);
    File.WriteAllBytes(cachePath, result.ToBinary());
}
/// <summary>
/// Loads and deserializes the cached <see cref="PropertyBag"/> for a step/method.
/// </summary>
private PropertyBag GetCacheEntry(CrawlStep step, DownloadMethod method)
{
    string cachePath = GetCacheFileName(step, method);
    return File.ReadAllBytes(cachePath).FromBinary<PropertyBag>();
}
/// <summary>
/// Replaces every match of the configured pattern with the replacement text.
/// </summary>
public string Substitute(string original, CrawlStep crawlStep)
{
    var pattern = m_Match.Value;
    return pattern.Replace(original, m_Replacement);
}
/// <summary>
/// Event payload describing a download failure, including the referring step.
/// </summary>
/// <param name="crawlStep">Step whose download failed.</param>
/// <param name="referrrer">Referring step (parameter name kept for caller compatibility).</param>
/// <param name="exception">Exception raised by the download.</param>
public DownloadExceptionEventArgs(CrawlStep crawlStep, CrawlStep referrrer, Exception exception)
{
    this.CrawlStep = crawlStep;
    this.Referrer = referrrer;
    this.Exception = exception;
}
/// <summary>
/// Synchronously downloads the step and returns the resulting property bag.
/// </summary>
public PropertyBag Download(CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method)
{
    PropertyBag result = DownloadInternalSync(crawlStep, referrer, method);
    return result;
}
/// <summary>
/// Replaces every match of the configured pattern with the replacement text.
/// </summary>
public string Substitute(string original, CrawlStep crawlStep)
{
    var replaced = _match.Value.Replace(original, _replacement);
    return replaced;
}
/// <summary>
/// True when the url matches the configured regular expression.
/// </summary>
public bool Match(Uri uri, CrawlStep referrer)
{
    // IsMatch is equivalent to Match(...).Success without allocating a Match object.
    return m_Regex.Value.IsMatch(uri.ToString());
}
/// <summary>
/// Downloads the content of <paramref name="crawlStep"/> with the given http
/// method and wraps the response in a <see cref="PropertyBag"/>. Serves from,
/// and writes to, the file cache when caching is enabled.
/// (Original XML summary was a copy-paste about cookies and did not describe this method.)
/// </summary>
/// <param name="crawlStep">Step whose Uri is downloaded; must not be null.</param>
/// <param name="method">Http method used for the request.</param>
/// <returns>Property bag describing the response, possibly served from cache.</returns>
public PropertyBag Download(CrawlStep crawlStep, DownloadMethod method)
{
    AspectF.Define.
        NotNull(crawlStep, "crawlStep");

    // Fall back to a generic agent string when none has been configured.
    if (UserAgent.IsNullOrEmpty())
    {
        UserAgent = "Mozilla/5.0";
    }

    // Cache hit short-circuits the network entirely.
    if (m_CacheEnabled)
    {
        if (CacheEntryExists(crawlStep, method))
        {
            return(GetCacheEntry(crawlStep, method));
        }
    }

    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(crawlStep.Uri);
    req.Method = method.ToString();
    req.AllowAutoRedirect = true;
    req.UserAgent = UserAgent;
    req.Accept = "*/*";
    req.KeepAlive = true;
    if (ConnectionTimeout.HasValue)
    {
        req.Timeout = Convert.ToInt32(ConnectionTimeout.Value.TotalMilliseconds);
    }
    if (ReadTimeout.HasValue)
    {
        req.ReadWriteTimeout = Convert.ToInt32(ReadTimeout.Value.TotalMilliseconds);
    }
    if (UseCookies)
    {
        req.CookieContainer = CookieContainer;
    }

    Stopwatch downloadTimer = Stopwatch.StartNew();
    HttpWebResponse resp;
    try
    {
        resp = (HttpWebResponse)req.GetResponse();
    }
    catch (WebException we)
    {
        // Http error statuses (4xx/5xx) still carry a response worth recording;
        // rethrow only when there is no response at all (e.g. DNS/connect failure).
        resp = we.Response as HttpWebResponse;
        if (resp.IsNull())
        {
            throw;
        }
    }

    using (resp)
    using (Stream responseStream = resp.GetResponseStream())
    {
        downloadTimer.Stop();
        PropertyBag result = new PropertyBag
        {
            Step = crawlStep,
            CharacterSet = resp.CharacterSet,
            ContentEncoding = resp.ContentEncoding,
            ContentType = resp.ContentType,
            Headers = resp.Headers,
            IsMutuallyAuthenticated = resp.IsMutuallyAuthenticated,
            IsFromCache = resp.IsFromCache,
            LastModified = resp.LastModified,
            Method = resp.Method,
            ProtocolVersion = resp.ProtocolVersion,
            ResponseUri = resp.ResponseUri,
            Server = resp.Server,
            StatusCode = resp.StatusCode,
            StatusDescription = resp.StatusDescription,
            // Body is copied to memory, truncated at MaximumContentSize when set.
            Response = CopyStreamToMemory(responseStream, MaximumContentSize),
            DownloadTime = downloadTimer.Elapsed,
        };
        if (m_CacheEnabled)
        {
            WriteCacheEntry(crawlStep, method, result);
        }
        return(result);
    }
}
/// <summary>
/// Queue a new step on the crawler queue
/// </summary>
/// <param name = "uri">url to crawl</param>
/// <param name = "depth">depth of the url</param>
/// <param name = "referrer">Step which the url was located</param>
/// <param name = "properties">Custom properties</param>
public void AddStep(Uri uri, int depth, CrawlStep referrer, Dictionary<string, object> properties)
{
    // Load the site configuration from the in-memory cache, falling back to the
    // OriginalWebSite.txt file next to the executable and caching it for a day.
    var jsonStr = cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite") as string;
    if (jsonStr == null)
    {
        using (var stream = new StreamReader(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite.txt", Encoding.UTF8))
        {
            jsonStr = stream.ReadToEnd();
            var policy = new CacheItemPolicy();
            policy.Priority = CacheItemPriority.NotRemovable;
            policy.AbsoluteExpiration = DateTimeOffset.Now.AddDays(1);
            cache.Set(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite", jsonStr, policy);
            Console.WriteLine("cache --" + AppDomain.CurrentDomain.BaseDirectory + " :" + cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite"));
        }
    }
    var json = JsonConvert.DeserializeObject<OriginalWebSiteTxt>(jsonStr);
    var storeRegex = new List<string>();
    var blockRegex = new List<string>();
    if (json.StoreRegex != null && json.StoreRegex.Count > 0)
    {
        storeRegex = json.StoreRegex;
    }
    if (json.BlockRegex != null && json.BlockRegex.Count > 0)
    {
        blockRegex = json.BlockRegex;
    }
    bool needToCrawl = false;
    // Block list wins first: any substring match drops the url outright.
    if (blockRegex != null && blockRegex.Count > 0)
    {
        foreach (var key in blockRegex)
        {
            if (uri.AbsoluteUri.Contains(key))
            {
                return;
            }
        }
    }
    // Store list is an allow list of regexes; an empty list allows everything.
    if (storeRegex != null && storeRegex.Count > 0)
    {
        foreach (var regex in storeRegex)
        {
            if (Regex.IsMatch(uri.AbsoluteUri, regex, RegexOptions.IgnoreCase))
            {
                needToCrawl = true;
                break;
            }
        }
    }
    else
    {
        needToCrawl = true;
    }
    if (!needToCrawl)
        return;
    if (!m_Crawling)
    {
        throw new InvalidOperationException("Crawler must be running before adding steps");
    }
    if (m_CrawlStopped)
    {
        return;
    }
    // NOTE: the order of this chain matters — Register has the side effect of
    // recording the url in the crawl history and must stay last.
    if ((uri.Scheme != Uri.UriSchemeHttps && uri.Scheme != Uri.UriSchemeHttp) || // Only accept http(s) schema
        (MaximumCrawlDepth.HasValue && MaximumCrawlDepth.Value > 0 && depth >= MaximumCrawlDepth.Value) ||
        !m_CrawlerRules.IsAllowedUrl(uri, referrer) ||
        !m_CrawlerHistory.Register(uri.GetUrlKeyString(UriSensitivity)))
    {
        // A rejected seed (depth 0) means nothing can be crawled at all.
        if (depth == 0)
        {
            StopCrawl();
        }
        return;
    }

    // Make new crawl step
    CrawlStep crawlStep = new CrawlStep(uri, depth)
    {
        IsExternalUrl = m_CrawlerRules.IsExternalUrl(uri),
        IsAllowed = true,
    };
    m_CrawlerQueue.Push(new CrawlerQueueEntry
    {
        CrawlStep = crawlStep,
        Referrer = referrer,
        Properties = properties
    });
    m_Logger.Verbose("Added {0} to queue referred from {1}",
        crawlStep.Uri, referrer.IsNull() ? string.Empty : referrer.Uri.ToString());
    ProcessQueue();
}
/// <summary>
/// Queue a new step on the crawler queue
/// </summary>
/// <param name = "uri">url to crawl</param>
/// <param name = "depth">depth of the url</param>
/// <param name = "referrer">Step which the url was located</param>
/// <param name = "properties">Custom properties</param>
public void AddStep(Uri uri, int depth, CrawlStep referrer, Dictionary<string, object> properties)
{
    if (!m_Crawling)
    {
        throw new InvalidOperationException("Crawler must be running before adding steps");
    }

    if (m_CrawlStopped)
    {
        return;
    }

    // NOTE: the order of this chain matters — Register has the side effect of
    // recording the url in the crawl history and must stay last.
    if ((uri.Scheme != Uri.UriSchemeHttps && uri.Scheme != Uri.UriSchemeHttp) || // Only accept http(s) schema
        (MaximumCrawlDepth.HasValue && MaximumCrawlDepth.Value > 0 && depth >= MaximumCrawlDepth.Value) ||
        !m_CrawlerRules.IsAllowedUrl(uri, referrer) ||
        !m_CrawlerHistory.Register(uri.GetUrlKeyString(UriSensitivity)))
    {
        // A rejected seed (depth 0) means nothing can be crawled at all.
        if (depth == 0)
        {
            StopCrawl();
        }
        return;
    }

    // Make new crawl step
    CrawlStep crawlStep = new CrawlStep(uri, depth)
    {
        IsExternalUrl = m_CrawlerRules.IsExternalUrl(uri),
        IsAllowed = true,
    };
    m_CrawlerQueue.Push(new CrawlerQueueEntry
    {
        CrawlStep = crawlStep,
        Referrer = referrer,
        Properties = properties
    });
    m_Logger.Verbose("Added {0} to queue referred from {1}",
        crawlStep.Uri, referrer.IsNull() ? string.Empty : referrer.Uri.ToString());
    ProcessQueue();
}
/// <summary>
/// Event payload describing a download failure, including the referring step.
/// </summary>
/// <param name="crawlStep">Step whose download failed.</param>
/// <param name="referrrer">Referring step (parameter name kept for caller compatibility).</param>
/// <param name="exception">Exception raised by the download.</param>
public DownloadExceptionEventArgs(CrawlStep crawlStep, CrawlStep referrrer, Exception exception)
{
    CrawlStep = crawlStep;
    Referrer = referrrer;
    Exception = exception;
}
/// <summary>
/// Downloads the step asynchronously and returns the resulting property bag.
/// </summary>
public async Task<PropertyBag> DownloadAsync(CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method)
{
    var downloadTask = this.DownloadInternalSync(crawlStep, referrer, method);
    return await downloadTask.ConfigureAwait(false);
}
/// <summary>
/// True when a cache file exists on disk for the given step/method combination.
/// </summary>
private bool CacheEntryExists(CrawlStep step, DownloadMethod method)
{
    string cachePath = GetCacheFileName(step, method);
    return FileSystemHelpers.FileExists(cachePath);
}
/// <summary>
/// Determines whether a cached response is present on disk for this step/method.
/// </summary>
private bool CacheEntryExists(CrawlStep step, DownloadMethod method)
{
    var fileName = GetCacheFileName(step, method);
    var exists = FileSystemHelpers.FileExists(fileName);
    return exists;
}