Example #1
        public override bool IsExternalUrl(Uri uri)
        {
            // If the base rules do not consider the url external, it can be followed normally
            if (!base.IsExternalUrl(uri))
            {
                return false;
            }

            // The url is external; Register returns false if it was seen before,
            // in which case we skip re-crawling the external site
            if (!m_CrawlerHistory.Register(uri.GetUrlKeyString(m_Crawler.UriSensitivity)))
            {
                return true;
            }

            // Create a child crawler to traverse the external site, at most 2 levels deep
            using (Crawler externalCrawler = new Crawler(uri,
                new HtmlDocumentProcessor(), // Process html
                new DumperStep())
                {
                    MaximumThreadCount = 1,
                    MaximumCrawlDepth = 2,
                    MaximumCrawlCount = 10,
                    ExcludeFilter = Program.ExtensionsToSkip,
                })
            {
                // Crawl external site
                externalCrawler.Crawl();
            }

            // Returning true marks the url as external, so this crawler does not follow the link itself
            return true;
        }
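
The override above belongs in a custom CrawlerRules subclass that the main crawler consults before following links. As a usage illustration, here is a minimal sketch of the outer crawl that would exercise it, reusing the constructor-plus-initializer pattern from the snippet; the seed url and the limits are assumptions, not taken from the example:

        using System;
        using NCrawler;
        using NCrawler.HtmlProcessor;

        class Program
        {
            static void Main()
            {
                // Sketch only: seed url and limits are placeholders
                using (Crawler crawler = new Crawler(new Uri("http://example.com"),
                    new HtmlDocumentProcessor(), // Parse html and extract links
                    new DumperStep())            // Dump each crawled step to the console
                    {
                        MaximumThreadCount = 2,
                        MaximumCrawlDepth = 3,
                        // ExcludeFilter = Program.ExtensionsToSkip, as in the example, could be set here
                    })
                {
                    crawler.Crawl(); // Blocks until the crawl completes
                }
            }
        }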
Example #2
        /// <summary>
        /// 	Queue a new step on the crawler queue
        /// </summary>
        /// <param name="uri">Url to crawl</param>
        /// <param name="depth">Depth of the url</param>
        /// <param name="referrer">Step in which the url was located</param>
        /// <param name="properties">Custom properties</param>
        public void AddStep(Uri uri, int depth, CrawlStep referrer, Dictionary<string, object> properties)
        {
            if (!m_Crawling)
            {
                throw new InvalidOperationException("Crawler must be running before adding steps");
            }

            if (m_CrawlStopped)
            {
                return;
            }

            if ((uri.Scheme != Uri.UriSchemeHttps && uri.Scheme != Uri.UriSchemeHttp) || // Only accept http(s) schemes
                (MaximumCrawlDepth.HasValue && MaximumCrawlDepth.Value > 0 && depth >= MaximumCrawlDepth.Value) || // Enforce the depth limit
                !m_CrawlerRules.IsAllowedUrl(uri, referrer) || // Respect the crawler rules
                !m_CrawlerHistory.Register(uri.GetUrlKeyString(UriSensitivity))) // Register returns false for urls seen before
            {
                // If the rejected url is the seed (depth 0), there is nothing to crawl
                if (depth == 0)
                {
                    StopCrawl();
                }

                return;
            }

            // Make new crawl step
            CrawlStep crawlStep = new CrawlStep(uri, depth)
                {
                    IsExternalUrl = m_CrawlerRules.IsExternalUrl(uri),
                    IsAllowed = true,
                };
            m_CrawlerQueue.Push(new CrawlerQueueEntry
                {
                    CrawlStep = crawlStep,
                    Referrer = referrer,
                    Properties = properties
                });
            m_Logger.Verbose("Added {0} to queue referred from {1}",
                crawlStep.Uri, referrer.IsNull() ? string.Empty : referrer.Uri.ToString());
            ProcessQueue();
        }
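
AddStep is the single entry point through which the seed url and every discovered link enter the queue, so pipeline steps call back into it as they find links. A hypothetical step illustrating that callback is sketched below; the IPipelineStep.Process signature and PropertyBag.Step are assumed from NCrawler's conventions, and ExtractLinks is a placeholder for real link extraction (normally done by HtmlDocumentProcessor):

        using System;
        using System.Collections.Generic;
        using NCrawler;

        public class LinkEnqueueStep : IPipelineStep
        {
            public void Process(Crawler crawler, PropertyBag propertyBag)
            {
                foreach (Uri link in ExtractLinks(propertyBag))
                {
                    // Each discovered link sits one level deeper than the referring step
                    crawler.AddStep(link, propertyBag.Step.Depth + 1, propertyBag.Step, null);
                }
            }

            private static IEnumerable<Uri> ExtractLinks(PropertyBag propertyBag)
            {
                yield break; // Link extraction elided in this sketch
            }
        }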
Example #3
        /// <summary>
        /// 	Queue a new step on the crawler queue
        /// </summary>
        /// <param name="uri">Url to crawl</param>
        /// <param name="depth">Depth of the url</param>
        /// <param name="referrer">Step in which the url was located</param>
        /// <param name="properties">Custom properties</param>
        public void AddStep(Uri uri, int depth, CrawlStep referrer, Dictionary<string, object> properties)
        {
            // Load the crawl configuration, preferring the cached copy over the file on disk
            var jsonStr = cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite") as string;
            if (jsonStr == null)
            {
                using (var stream = new StreamReader(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite.txt", Encoding.UTF8))
                {
                    jsonStr = stream.ReadToEnd();

                    // Cache the file content for a day so the file is not re-read on every step
                    var policy = new CacheItemPolicy();
                    policy.Priority = CacheItemPriority.NotRemovable;
                    policy.AbsoluteExpiration = DateTimeOffset.Now.AddDays(1);
                    cache.Set(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite", jsonStr, policy);
                    Console.WriteLine("cache --" + AppDomain.CurrentDomain.BaseDirectory + " :" + cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite"));
                }
            }

            var json = JsonConvert.DeserializeObject<OriginalWebSiteTxt>(jsonStr);
            var storeRegex = json.StoreRegex ?? new List<string>();
            var blockRegex = json.BlockRegex ?? new List<string>();

            // Skip any url containing a blocked token (a plain substring match, despite the name)
            foreach (var key in blockRegex)
            {
                if (uri.AbsoluteUri.Contains(key))
                {
                    return;
                }
            }

            // Crawl only urls matching at least one store pattern; with no patterns, crawl everything
            bool needToCrawl = storeRegex.Count == 0;
            foreach (var regex in storeRegex)
            {
                if (Regex.IsMatch(uri.AbsoluteUri, regex, RegexOptions.IgnoreCase))
                {
                    needToCrawl = true;
                    break;
                }
            }
            if (!needToCrawl)
            {
                return;
            }

            if (!m_Crawling)
            {
                throw new InvalidOperationException("Crawler must be running before adding steps");
            }

            if (m_CrawlStopped)
            {
                return;
            }

            if ((uri.Scheme != Uri.UriSchemeHttps && uri.Scheme != Uri.UriSchemeHttp) || // Only accept http(s) schemes
                (MaximumCrawlDepth.HasValue && MaximumCrawlDepth.Value > 0 && depth >= MaximumCrawlDepth.Value) || // Enforce the depth limit
                !m_CrawlerRules.IsAllowedUrl(uri, referrer) || // Respect the crawler rules
                !m_CrawlerHistory.Register(uri.GetUrlKeyString(UriSensitivity))) // Register returns false for urls seen before
            {
                // If the rejected url is the seed (depth 0), there is nothing to crawl
                if (depth == 0)
                {
                    StopCrawl();
                }

                return;
            }

            // Make new crawl step
            CrawlStep crawlStep = new CrawlStep(uri, depth)
                {
                    IsExternalUrl = m_CrawlerRules.IsExternalUrl(uri),
                    IsAllowed = true,
                };
            m_CrawlerQueue.Push(new CrawlerQueueEntry
                {
                    CrawlStep = crawlStep,
                    Referrer = referrer,
                    Properties = properties
                });
            m_Logger.Verbose("Added {0} to queue referred from {1}",
                crawlStep.Uri, referrer.IsNull() ? string.Empty : referrer.Uri.ToString());
            ProcessQueue();
        }
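
This variant deserializes OriginalWebSite.txt into an OriginalWebSiteTxt object whose definition is not shown. Its shape can be inferred from the two properties read above; the sketch below is an assumption, with only the property names StoreRegex and BlockRegex taken from the code:

        using System.Collections.Generic;

        // Inferred configuration DTO for Example #3; everything beyond the two
        // property names is assumed
        public class OriginalWebSiteTxt
        {
            // Regex patterns a url must match to be crawled; when empty, everything is crawled
            public List<string> StoreRegex { get; set; }

            // Plain substrings (despite the name) that cause a url to be skipped
            public List<string> BlockRegex { get; set; }
        }

        // A matching OriginalWebSite.txt could look like:
        // { "StoreRegex": [ "^https?://www\\.example\\.com/" ], "BlockRegex": [ "/logout" ] }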