Exemplo n.º 1
0
        /// <summary>
        /// Activates the crawler: marks it active, creates the HTTP client and robots handler,
        /// launches the worker tasks and finally raises <c>StateChanged</c> with <c>true</c>.
        /// </summary>
        /// <exception cref="InvalidOperationException">Thrown when <c>IsActive</c> is already set.</exception>
        public void Start()
        {
            if (IsActive)
            {
                throw new InvalidOperationException("Crawler already active!");
            }

            IsActive = true;

            // The same HttpClient instance is configured with the user agent and then
            // handed to the robots handler.
            httpClient = new HttpClient();
            httpClient.DefaultRequestHeaders.Add("User-Agent", Config.UserAgent);
            robots = new RobotsHandler(Config, httpClient);

            cancelSource = new CancellationTokenSource();

            // Start one long-running worker task per configured concurrency slot.
            int launched = 0;
            while (launched < Config.MaxConcurrency)
            {
                var worker = new Task(Work, cancelSource.Token, TaskCreationOptions.LongRunning);
                worker.Start();
                launched++;
            }

            // Read the handler once so the null check and the invocation see the same value.
            var listeners = StateChanged;
            if (listeners != null)
            {
                listeners.Invoke(this, true);
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Extracts URLs from <paramref name="content"/> and adds every URL that passes the
        /// exclusion, eligibility and already-crawled checks to the backlog of <paramref name="manager"/>.
        /// </summary>
        /// <remarks>
        /// If a plugin reports <c>FindUrlsImplemented</c>, the first such plugin performs the URL
        /// extraction; otherwise the built-in <c>FindUrls</c> is used. The exclusion check blocks
        /// synchronously on the task returned by <c>IsUrlExcluded</c>.
        /// </remarks>
        /// <param name="url">URL the content was obtained from; passed on to the URL finder.</param>
        /// <param name="content">Text content to scan for URLs.</param>
        /// <param name="config">Configuration passed to the built-in URL finder and to the exclusion check.</param>
        /// <param name="plugins">Plugin manager to search for a URL-finding plugin; may be <c>null</c>.</param>
        /// <param name="manager">Work manager that is queried for each URL and receives new backlog entries.</param>
        /// <param name="robotHandler">Handler asked whether a found URL is excluded.</param>
        /// <param name="cancelSource">Checked before each found URL; processing stops once cancellation is requested.</param>
        public static void ScanContentAndAddToManager(string url, string content,
                                                      WorkerConfiguration config, PluginManager plugins, WorkManager manager,
                                                      RobotsHandler robotHandler, CancellationTokenSource cancelSource)
        {
            // Handles a single found URL. Returns false when cancellation was requested
            // (the caller must stop iterating), true otherwise.
            bool ProcessFoundUrl(string foundUrl)
            {
                if (cancelSource.IsCancellationRequested)
                {
                    return false;
                }

                // Short-circuit order matters: excluded -> not eligible -> already crawled.
                bool skip = robotHandler.IsUrlExcluded(foundUrl, config).Result
                            || !manager.IsUrlEligibleForCrawl(foundUrl)
                            || manager.IsUrlCrawled(foundUrl);

                if (!skip)
                {
                    manager.AddToBacklog(foundUrl);
                }

                return true;
            }

            // Look for the first plugin that supplies its own FindUrls implementation.
            PluginInfo urlFinder = null;

            if (plugins != null)
            {
                foreach (var candidate in plugins.Plugins)
                {
                    if (!candidate.FindUrlsImplemented)
                    {
                        continue;
                    }

                    urlFinder = candidate;
                    break;
                }
            }

            if (urlFinder != null)
            {
                // A plugin overrides URL discovery.
                foreach (var foundUrl in urlFinder.FindUrls(url, content))
                {
                    if (!ProcessFoundUrl(foundUrl))
                    {
                        break;
                    }
                }
            }
            else
            {
                // No override available: use the built-in URL finder.
                foreach (var foundUrl in FindUrls(url, content, config))
                {
                    if (!ProcessFoundUrl(foundUrl))
                    {
                        break;
                    }
                }
            }
        }