/// <summary>
/// Pulls links from <paramref name="reader"/> into a list, adds the number collected to
/// <c>LinksExtracted</c>, and hands a capped subset to <c>PrioritisationBuffer</c>.
/// </summary>
/// <remarks>
/// Collection stops when the reader returns <c>null</c> or once the list holds
/// <c>MaxLinksToExtractFromOneResource * 10</c> items. At most
/// <c>MaxLinksToExtractFromOneResource</c> items are forwarded after <c>Randomise()</c>
/// (presumably a shuffle extension; its definition is not visible here).
/// </remarks>
/// <param name="reader">Source of links; when <c>null</c> the method does nothing.</param>
/// <returns>A task that completes when <c>SendOrWaitAsync</c> completes.</returns>
private async Task SendUris(ILinkExtractor reader)
{
    if (reader == null)
    {
        return;
    }

    var collected = new List<QueueItem>();

    // The cap is re-read from configuration on every iteration, exactly as the loop condition states.
    for (Uri next = reader.NextUri();
         next != null && collected.Count < FetchoConfiguration.Current.MaxLinksToExtractFromOneResource * 10;
         next = reader.NextUri())
    {
        collected.Add(new QueueItem() { SourceUri = reader.CurrentSourceUri, TargetUri = next });
    }

    // The counter reflects everything collected, not only the subset that is forwarded below.
    LinksExtracted += collected.Count;

    // effectively block until the URLs are accepted
    var sample = collected.Randomise().Take(FetchoConfiguration.Current.MaxLinksToExtractFromOneResource);
    await PrioritisationBuffer.SendOrWaitAsync(sample).ConfigureAwait(false);
}
/// <summary>
/// Writes every link produced by <paramref name="reader"/> to standard output, one per line,
/// as the reader's current source URI, a tab, and the link.
/// </summary>
/// <param name="reader">Source of links; when <c>null</c> nothing is written.</param>
private void OutputUris(ILinkExtractor reader)
{
    if (reader == null)
    {
        return;
    }

    // Keep pulling until the reader signals exhaustion by returning null.
    for (Uri next = reader.NextUri(); next != null; next = reader.NextUri())
    {
        Console.WriteLine("{0}\t{1}", reader.CurrentSourceUri, next);
    }
}
/// <summary>
/// Creates a crawler wired to the supplied collaborators and request settings.
/// </summary>
/// <param name="workerPool">Worker pool stored on the instance.</param>
/// <param name="crawlRequestFilter">Request filter stored on the instance.</param>
/// <param name="linkExtractor">Link extractor stored on the instance.</param>
/// <param name="observer">Observer stored on the instance.</param>
/// <param name="customHttpHeaders">
/// Extra HTTP headers; when <c>null</c> an empty dictionary is stored instead.
/// </param>
/// <param name="requestTimeout">
/// When supplied, assigned to the <c>Timeout</c> of the instance's <c>httpClient</c>.
/// </param>
/// <param name="maxRetries">Retry limit; must be non-negative.</param>
/// <param name="userAgent">User agent string stored on the instance; may be <c>null</c>.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="maxRetries"/> is negative.
/// </exception>
public Crawler(
    WorkerPool workerPool,
    ICrawlRequestFilter crawlRequestFilter,
    ILinkExtractor linkExtractor,
    ICrawlerObserver observer,
    IReadOnlyDictionary<string, string> customHttpHeaders = null,
    TimeSpan? requestTimeout = null,
    int maxRetries = 0,
    string userAgent = null)
{
    // Validate the incoming argument, not the field: the field has not been assigned yet
    // at this point, so checking it would never reject a negative argument.
    if (maxRetries < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(maxRetries), "Max retries must be non-negative");
    }

    this.crawlRequestFilter = crawlRequestFilter;
    this.linkExtractor = linkExtractor;
    this.observer = observer;
    this.maxRetries = maxRetries;
    this.userAgent = userAgent;
    this.customHttpHeaders = customHttpHeaders ?? new Dictionary<string, string>();
    this.workerPool = workerPool;

    // NOTE(review): assumes httpClient is already initialised (e.g. by a field initialiser) - confirm.
    if (requestTimeout != null)
    {
        this.httpClient.Timeout = requestTimeout.Value;
    }
}
/// <summary>
/// Creates a text processor that keeps references to the supplied extractors.
/// </summary>
/// <param name="linkExtractor">Link extractor stored in <c>_linkExtractor</c>.</param>
/// <param name="htmlExtractor">HTML extractor stored in <c>_htmlExtractor</c>.</param>
/// <param name="metaExtractor">Meta extractor stored in <c>_metaExtractor</c>.</param>
public TextProcessor(
    ILinkExtractor linkExtractor,
    IHtmlExtractor htmlExtractor,
    IMetaExtractor metaExtractor)
{
    // Arguments are stored as-is; no validation is performed here.
    this._linkExtractor = linkExtractor;
    this._htmlExtractor = htmlExtractor;
    this._metaExtractor = metaExtractor;
}
/// <summary>
/// Creates a crawl service with the supplied HTML provider and link extractor and an
/// empty queue of links to visit.
/// </summary>
/// <param name="htmlProvider">HTML provider stored in <c>_htmlProvider</c>.</param>
/// <param name="linkExtractor">Link extractor stored in <c>_linkExtractor</c>.</param>
public CrawlService(IHTMLProvider htmlProvider, ILinkExtractor linkExtractor)
{
    // Start with no pending links.
    this.linksToVisit = new ConcurrentQueue<string>();

    this._htmlProvider = htmlProvider;
    this._linkExtractor = linkExtractor;
}
/// <summary>
/// Creates a node factory that keeps a reference to the supplied link extractor.
/// </summary>
/// <param name="linkExtractor">Link extractor stored in <c>_linkExtractor</c>.</param>
public NodeFactory(ILinkExtractor linkExtractor)
{
    // Stored as-is; no validation is performed here.
    this._linkExtractor = linkExtractor;
}