Пример #1
0
        /// <summary>
        /// Drains URIs from <paramref name="reader"/> into queue items and hands a
        /// bounded selection of them to the prioritisation buffer.
        /// </summary>
        /// <param name="reader">Link source to pull URIs from; when null nothing is done.</param>
        /// <returns>A task that completes once the buffer call has finished.</returns>
        private async Task SendUris(ILinkExtractor reader)
        {
            if (reader == null)
            {
                return;
            }

            var collected = new List<QueueItem>();

            // Read until the reader runs dry or we hold ten times the per-resource limit.
            for (Uri next = reader.NextUri();
                 next != null && collected.Count < FetchoConfiguration.Current.MaxLinksToExtractFromOneResource * 10;
                 next = reader.NextUri())
            {
                collected.Add(new QueueItem()
                {
                    SourceUri = reader.CurrentSourceUri,
                    TargetUri = next
                });
            }

            // The counter reflects everything collected, not only what is forwarded below.
            LinksExtracted += collected.Count;

            // Forward at most the per-resource limit of the collected items
            // (Randomise presumably shuffles them - confirm against its definition).
            // Awaiting here effectively blocks until the URLs are accepted.
            await PrioritisationBuffer
                .SendOrWaitAsync(collected.Randomise().Take(FetchoConfiguration.Current.MaxLinksToExtractFromOneResource))
                .ConfigureAwait(false);
        }
Пример #2
0
        /// <summary>
        /// Writes every URI produced by <paramref name="reader"/> to the console,
        /// one per line, as "source&lt;TAB&gt;target".
        /// </summary>
        /// <param name="reader">Link source to read from; when null nothing is written.</param>
        private void OutputUris(ILinkExtractor reader)
        {
            if (reader == null)
            {
                return;
            }

            // NextUri returning null marks the end of the sequence.
            for (Uri target = reader.NextUri(); target != null; target = reader.NextUri())
            {
                Console.WriteLine("{0}\t{1}", reader.CurrentSourceUri, target);
            }
        }
Пример #3
0
        /// <summary>
        /// Creates a crawler wired to the given worker pool, request filter,
        /// link extractor and observer.
        /// </summary>
        /// <param name="workerPool">Worker pool stored for use by the crawler.</param>
        /// <param name="crawlRequestFilter">Filter stored for use by the crawler.</param>
        /// <param name="linkExtractor">Link extractor stored for use by the crawler.</param>
        /// <param name="observer">Observer stored for use by the crawler.</param>
        /// <param name="customHttpHeaders">Optional extra HTTP headers; an empty dictionary is used when null.</param>
        /// <param name="requestTimeout">Optional timeout applied to the HTTP client when supplied.</param>
        /// <param name="maxRetries">Maximum retry count; must be non-negative.</param>
        /// <param name="userAgent">Optional user agent string.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// Thrown when <paramref name="maxRetries"/> is negative.
        /// </exception>
        public Crawler(WorkerPool workerPool, ICrawlRequestFilter crawlRequestFilter, ILinkExtractor linkExtractor, ICrawlerObserver observer,
                       IReadOnlyDictionary <string, string> customHttpHeaders = null, TimeSpan?requestTimeout = null, int maxRetries = 0, string userAgent = null)
        {
            // Validate the argument itself: the field has not been assigned yet at
            // this point, so checking this.maxRetries would not see the caller's value.
            if (maxRetries < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(maxRetries), "Max retries must be non-negative");
            }

            this.crawlRequestFilter = crawlRequestFilter;
            this.linkExtractor      = linkExtractor;
            this.observer           = observer;
            this.maxRetries         = maxRetries;
            this.userAgent          = userAgent;
            this.customHttpHeaders  = customHttpHeaders ?? new Dictionary <string, string>();
            this.workerPool         = workerPool;

            // Only override the HTTP client's timeout when the caller supplied one.
            if (requestTimeout != null)
            {
                this.httpClient.Timeout = requestTimeout.Value;
            }
        }
Пример #4
0
 /// <summary>
 /// Creates a text processor that keeps references to the supplied extractors.
 /// </summary>
 /// <param name="linkExtractor">Extractor stored in <c>_linkExtractor</c>.</param>
 /// <param name="htmlExtractor">Extractor stored in <c>_htmlExtractor</c>.</param>
 /// <param name="metaExtractor">Extractor stored in <c>_metaExtractor</c>.</param>
 public TextProcessor(ILinkExtractor linkExtractor, IHtmlExtractor htmlExtractor, IMetaExtractor metaExtractor)
 {
     this._metaExtractor = metaExtractor;
     this._htmlExtractor = htmlExtractor;
     this._linkExtractor = linkExtractor;
 }
Пример #5
0
 /// <summary>
 /// Creates a crawl service with the given HTML provider and link extractor,
 /// starting with an empty queue of links to visit.
 /// </summary>
 /// <param name="htmlProvider">Provider stored in <c>_htmlProvider</c>.</param>
 /// <param name="linkExtractor">Extractor stored in <c>_linkExtractor</c>.</param>
 public CrawlService(IHTMLProvider htmlProvider, ILinkExtractor linkExtractor)
 {
     // The pending-links queue starts out empty.
     this.linksToVisit = new ConcurrentQueue <string>();

     this._linkExtractor = linkExtractor;
     this._htmlProvider = htmlProvider;
 }
Пример #6
0
 /// <summary>
 /// Creates a node factory that keeps a reference to the supplied link extractor.
 /// </summary>
 /// <param name="linkExtractor">Extractor stored in <c>_linkExtractor</c>.</param>
 public NodeFactory(ILinkExtractor linkExtractor)
 {
     this._linkExtractor = linkExtractor;
 }