public void Crawl(Uri url)
{
    _linkFilters = new List<ILinkFilter>()
    {
        new NoAnchorsFilter(),
        new NoMailToFilter(),
        new CacheHitFilter(_cache),
    };

    //Could be done with subclassing (class NoExternalLinkCrawler : Crawler),
    //but that adds far more complexity than it is worth.
    if (!_followExternalLinks)
        _linkFilters.Add(new NoExternalLinks(url));

    var fetcher = new Fetcher(url);

    //Adds the starting url to the queue (even if already downloaded).
    _queue.Enqueue(fetcher);

    foreach (Page cachedPage in _cache)
    {
        //Each page already in the cache is supplied to the OnPageLoaded
        //method, so its links can be processed. This basically resumes
        //(with a bit of overhead) the process where it was stopped.
        OnPageLoaded(cachedPage);
    }

    _queue.Process();
}
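The concrete filters above (NoAnchorsFilter, NoMailToFilter, CacheHitFilter, NoExternalLinks) are not shown in this excerpt. As a rough sketch, assuming the ILinkFilter contract is a single accept-style method (the member name Accept is a guess, not the project's actual signature), NoExternalLinks could look something like this:

using System;

//Assumed shape of the filter abstraction; the real interface is not
//shown in this excerpt, so the member name is a guess.
public interface ILinkFilter
{
    //Returns true when the link should be kept and crawled.
    bool Accept(Uri link);
}

//Hypothetical implementation: keeps only links on the same host as the
//starting url passed to the constructor, matching how NoExternalLinks(url)
//is used in Crawl above.
public class NoExternalLinks : ILinkFilter
{
    private readonly Uri _root;

    public NoExternalLinks(Uri root)
    {
        _root = root;
    }

    public bool Accept(Uri link)
    {
        return string.Equals(link.Host, _root.Host, StringComparison.OrdinalIgnoreCase);
    }
}

Composing filters in a list like this keeps the "no external links" policy out of a Crawler subclass, which is exactly the trade-off the comment in Crawl mentions.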
public void Enqueue(Fetcher fetcher)
{
    lock (_lockObject)
    {
        //If a fetcher with the same url is already in queue it is not
        //enqueued - it is assumed that two pages with the same url are
        //the same page, so no need to download them twice.
        if (_queue.All(x => x.Uri.AbsoluteUri != fetcher.Uri.AbsoluteUri))
        {
            _queue.Add(fetcher);
        }
    }
}
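One detail worth noting: Enumerable.All returns true for an empty sequence, so the duplicate check never blocks the very first fetcher from being enqueued. A small standalone snippet illustrating that behavior (the url is just an example):

using System;
using System.Collections.Generic;
using System.Linq;

class AllOnEmptyDemo
{
    static void Main()
    {
        var queue = new List<Uri>();

        //All() over an empty collection is true, so an empty queue
        //always accepts the incoming fetcher.
        bool canEnqueue = queue.All(x => x.AbsoluteUri != "http://example.com/");
        Console.WriteLine(canEnqueue); //True
    }
}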
//Callback invoked on error - it just removes the fetcher silently.
private void RemoveFetcher(Fetcher fetcher)
{
    lock (_lockObject)
    {
        _executing.Remove(fetcher);
    }
}
//Callback invoked on successful completion.
private void OnCompleted(Fetcher fetcher)
{
    RemoveFetcher(fetcher);
    _callback(fetcher.DownloadedPage);

    //If, after invoking the parent's callback, no jobs are in queue
    //and no jobs are executing, it means that the crawling is over,
    //so the ProcessingOver event is fired.
    if (!_queue.Any() && !_executing.Any() && ProcessingOver != null)
    {
        ProcessingOver();
    }
}
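The three queue methods above all touch the same handful of members. Purely as a sketch of the assumed surrounding class (the class name FetcherQueue, the List<Fetcher> backing collections and the Action-based delegates are guesses inferred from usage; Fetcher and Page are the project's own types), the shared state could be declared along these lines:

using System;
using System.Collections.Generic;

//Sketch only: names and types are inferred from how the members
//are used in Enqueue, RemoveFetcher and OnCompleted above.
public class FetcherQueue
{
    private readonly object _lockObject = new object();
    private readonly List<Fetcher> _queue = new List<Fetcher>();     //waiting to start
    private readonly List<Fetcher> _executing = new List<Fetcher>(); //downloads in flight
    private readonly Action<Page> _callback;                         //parent's per-page callback

    //Raised once both collections are empty, i.e. the crawl is finished.
    public event Action ProcessingOver;

    public FetcherQueue(Action<Page> callback)
    {
        _callback = callback;
    }
}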