public override bool IsExternalUrl(Uri uri)
{
    // Is the url external at all?
    if (!base.IsExternalUrl(uri))
    {
        return false;
    }

    // Yes, check if we have crawled it before
    if (!m_CrawlerHistory.Register(uri.GetUrlKeyString(m_Crawler.UriSensitivity)))
    {
        return true;
    }

    // Create child crawler to traverse external site with max 2 levels
    using (Crawler externalCrawler = new Crawler(uri,
        new HtmlDocumentProcessor(), // Process html
        new DumperStep())
        {
            MaximumThreadCount = 1,
            MaximumCrawlDepth = 2,
            MaximumCrawlCount = 10,
            ExcludeFilter = Program.ExtensionsToSkip,
        })
    {
        // Crawl external site
        externalCrawler.Crawl();
    }

    // Do not follow link on this crawler
    return true;
}
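For context, this override belongs in a custom rules class: the virtual IsExternalUrl lives on the crawler's rules service, the override registers each external url in the shared crawl history so an external site is visited at most once, spawns a short-lived child Crawler for it, and returns true so the outer crawler never follows the link itself. A minimal sketch of the surrounding class, assuming NCrawler's CrawlerRulesService base class; the class name, constructor shape, namespaces, and the injected ICrawlerHistory are assumptions here, not verified library code:

using System;
using NCrawler;               // namespaces assumed
using NCrawler.Interfaces;
using NCrawler.Services;

// Hypothetical host class for the IsExternalUrl override above; only the
// members the override actually uses (m_Crawler, m_CrawlerHistory) are shown.
public class CustomCrawlerRules : CrawlerRulesService
{
    private readonly Crawler m_Crawler;
    private readonly ICrawlerHistory m_CrawlerHistory;

    public CustomCrawlerRules(Crawler crawler, IRobot robot, Uri baseUri,
        ICrawlerHistory crawlerHistory)
        : base(crawler, robot, baseUri)
    {
        m_Crawler = crawler;
        m_CrawlerHistory = crawlerHistory;
    }

    // IsExternalUrl override from above goes here
}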
/// <summary>
/// Queue a new step on the crawler queue
/// </summary>
/// <param name="uri">url to crawl</param>
/// <param name="depth">depth of the url</param>
/// <param name="referrer">Step on which the url was located</param>
/// <param name="properties">Custom properties</param>
public void AddStep(Uri uri, int depth, CrawlStep referrer, Dictionary<string, object> properties)
{
    if (!m_Crawling)
    {
        throw new InvalidOperationException("Crawler must be running before adding steps");
    }

    if (m_CrawlStopped)
    {
        return;
    }

    if ((uri.Scheme != Uri.UriSchemeHttps && uri.Scheme != Uri.UriSchemeHttp) || // Only accept http(s) schemes
        (MaximumCrawlDepth.HasValue && MaximumCrawlDepth.Value > 0 && depth >= MaximumCrawlDepth.Value) ||
        !m_CrawlerRules.IsAllowedUrl(uri, referrer) ||
        !m_CrawlerHistory.Register(uri.GetUrlKeyString(UriSensitivity)))
    {
        if (depth == 0)
        {
            StopCrawl();
        }

        return;
    }

    // Make new crawl step
    CrawlStep crawlStep = new CrawlStep(uri, depth)
    {
        IsExternalUrl = m_CrawlerRules.IsExternalUrl(uri),
        IsAllowed = true,
    };
    m_CrawlerQueue.Push(new CrawlerQueueEntry
    {
        CrawlStep = crawlStep,
        Referrer = referrer,
        Properties = properties
    });
    m_Logger.Verbose("Added {0} to queue referred from {1}",
        crawlStep.Uri, referrer.IsNull() ? string.Empty : referrer.Uri.ToString());
    ProcessQueue();
}
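AddStep is the entry point by which discovered urls reach the queue, and it throws unless the crawler is already running, so it is typically called from inside a crawl, for example from a pipeline step that extracts links. A minimal sketch of such a step, assuming NCrawler's IPipelineStep interface and PropertyBag; the step name and the "DiscoveredLinks" property-bag key are illustrative assumptions, not NCrawler API:

using System;
using System.Collections.Generic;
using NCrawler;               // namespaces assumed
using NCrawler.Interfaces;

// Hypothetical pipeline step that feeds discovered links back into the
// crawler queue via AddStep; only Crawler.AddStep is taken from the code above.
public class LinkFeederStep : IPipelineStep
{
    public void Process(Crawler crawler, PropertyBag propertyBag)
    {
        // Assume an earlier step stored extracted links under a custom key
        IEnumerable<string> links = propertyBag["DiscoveredLinks"].Value as IEnumerable<string>;
        if (links == null)
        {
            return;
        }

        foreach (string link in links)
        {
            // Queue each link one level deeper than the referring page
            crawler.AddStep(new Uri(link), propertyBag.Step.Depth + 1,
                propertyBag.Step, null);
        }
    }
}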
/// <summary>
/// Queue a new step on the crawler queue
/// </summary>
/// <param name="uri">url to crawl</param>
/// <param name="depth">depth of the url</param>
/// <param name="referrer">Step on which the url was located</param>
/// <param name="properties">Custom properties</param>
public void AddStep(Uri uri, int depth, CrawlStep referrer, Dictionary<string, object> properties)
{
    // Load the filter configuration; cache the raw file contents for a day
    // so the file is not re-read on every call
    string cacheKey = AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite";
    var jsonStr = cache.Get(cacheKey) as string;
    if (jsonStr == null)
    {
        using (var stream = new StreamReader(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite.txt", Encoding.UTF8))
        {
            jsonStr = stream.ReadToEnd();
            var policy = new CacheItemPolicy
            {
                Priority = CacheItemPriority.NotRemovable,
                AbsoluteExpiration = DateTimeOffset.Now.AddDays(1)
            };
            cache.Set(cacheKey, jsonStr, policy);
            Console.WriteLine("cache --" + AppDomain.CurrentDomain.BaseDirectory + " :" + cache.Get(cacheKey));
        }
    }

    var json = JsonConvert.DeserializeObject<OriginalWebSiteTxt>(jsonStr);
    List<string> storeRegex = json.StoreRegex ?? new List<string>();
    List<string> blockRegex = json.BlockRegex ?? new List<string>();

    // Skip the url outright if it contains any block entry
    // (a plain substring match, despite the field name)
    foreach (var key in blockRegex)
    {
        if (uri.AbsoluteUri.Contains(key))
        {
            return;
        }
    }

    // Crawl the url only if it matches at least one store pattern;
    // an empty pattern list means everything is allowed
    bool needToCrawl = storeRegex.Count == 0;
    foreach (var regex in storeRegex)
    {
        if (Regex.IsMatch(uri.AbsoluteUri, regex, RegexOptions.IgnoreCase))
        {
            needToCrawl = true;
            break;
        }
    }

    if (!needToCrawl)
    {
        return;
    }

    if (!m_Crawling)
    {
        throw new InvalidOperationException("Crawler must be running before adding steps");
    }

    if (m_CrawlStopped)
    {
        return;
    }

    if ((uri.Scheme != Uri.UriSchemeHttps && uri.Scheme != Uri.UriSchemeHttp) || // Only accept http(s) schemes
        (MaximumCrawlDepth.HasValue && MaximumCrawlDepth.Value > 0 && depth >= MaximumCrawlDepth.Value) ||
        !m_CrawlerRules.IsAllowedUrl(uri, referrer) ||
        !m_CrawlerHistory.Register(uri.GetUrlKeyString(UriSensitivity)))
    {
        if (depth == 0)
        {
            StopCrawl();
        }

        return;
    }

    // Make new crawl step
    CrawlStep crawlStep = new CrawlStep(uri, depth)
    {
        IsExternalUrl = m_CrawlerRules.IsExternalUrl(uri),
        IsAllowed = true,
    };
    m_CrawlerQueue.Push(new CrawlerQueueEntry
    {
        CrawlStep = crawlStep,
        Referrer = referrer,
        Properties = properties
    });
    m_Logger.Verbose("Added {0} to queue referred from {1}",
        crawlStep.Uri, referrer.IsNull() ? string.Empty : referrer.Uri.ToString());
    ProcessQueue();
}
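The modified AddStep above deserializes OriginalWebSite.txt into an OriginalWebSiteTxt object and reads only its StoreRegex and BlockRegex lists, so a matching DTO would look roughly like this. This is a minimal sketch inferred from that usage; the real class may carry more members, and the example file contents in the comment are illustrative values:

using System.Collections.Generic;

// Example of what OriginalWebSite.txt could contain (illustrative values):
// {
//   "StoreRegex": [ "^https?://example\\.com/products/.*" ],
//   "BlockRegex": [ "/login", "/logout" ]
// }
public class OriginalWebSiteTxt
{
    public List<string> StoreRegex { get; set; } // regex patterns: urls to crawl
    public List<string> BlockRegex { get; set; } // substrings: urls to skip
}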