Example #1
 public void Crawl(Uri url)
 {
     _linkFilters = new List<ILinkFilter>()
     {
         new NoAnchorsFilter(),
         new NoMailToFilter(),
         new CacheHitFilter(_cache),
     };
     //Could be done with subclassing
     //(class NoExternalLinkCrawler : Crawler)
     //but that adds more complexity than it is worth.
     if (!_followExternalLinks) _linkFilters.Add(new NoExternalLinks(url));
     var fetcher = new Fetcher(url);
     //adds the starting url to the queue (even if already downloaded)
     _queue.Enqueue(fetcher);
     foreach (Page cachedPage in _cache)
     {
         //Each page already in the cache is supplied to the OnPageLoaded
         //method so its links can be processed. This effectively resumes
         //the crawl (with a bit of overhead) from where it was stopped.
         OnPageLoaded(cachedPage);
     }
     _queue.Process();
 }
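The snippet above only shows how the link filters are composed; the ILinkFilter interface itself is not included. A minimal sketch of what it and the NoExternalLinks filter could look like, assuming each filter simply decides whether a candidate link should be kept (the Accept method name and the internals are assumptions, not the original code):

 public interface ILinkFilter
 {
     //Returns true if the link should be kept and queued for download.
     bool Accept(Uri link);
 }

 public class NoExternalLinks : ILinkFilter
 {
     private readonly Uri _root;

     public NoExternalLinks(Uri root)
     {
         _root = root;
     }

     //Keeps only links pointing to the same host as the starting url.
     public bool Accept(Uri link)
     {
         return Uri.Compare(_root, link, UriComponents.Host,
             UriFormat.SafeUnescaped, StringComparison.OrdinalIgnoreCase) == 0;
     }
 }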
Example #2
 public void Enqueue(Fetcher fetcher)
 {
     lock (_lockObject)
     {
         //If a fetcher with the same url is already in the queue it is
         //not enqueued - two fetchers with the same url are assumed to
         //point to the same page, so there is no need to download it twice.
         if (_queue.All(x => x.Uri.AbsoluteUri != fetcher.Uri.AbsoluteUri))
         {
             _queue.Add(fetcher);
         }
     }
 }
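The All scan above works, but it walks the whole queue on every call. If the same url should never be enqueued twice during a run, a set of already seen URIs makes the duplicate check O(1); a sketch of that variant (the _enqueuedUris field is illustrative and not part of the original code):

 private readonly HashSet<string> _enqueuedUris = new HashSet<string>();

 public void Enqueue(Fetcher fetcher)
 {
     lock (_lockObject)
     {
         //HashSet.Add returns false when the url was seen before,
         //so each page is queued for download only once.
         if (_enqueuedUris.Add(fetcher.Uri.AbsoluteUri))
         {
             _queue.Add(fetcher);
         }
     }
 }

Note the small semantic difference: with the set, a url that has already been dequeued will not be accepted again, whereas the original check only looks at what is currently waiting in the queue.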
Example #3
 //Callback invoked on error - it just removes the
 //fetcher silently.
 private void RemoveFetcher(Fetcher fetcher)
 {
     lock (_lockObject)
     {
         _executing.Remove(fetcher);
     }
 }
Example #4
 //Callback invoked on successful completion.
 private void OnCompleted(Fetcher fetcher)
 {
     RemoveFetcher(fetcher);
     _callback(fetcher.DownloadedPage);
     //If, after invoking the parent's callback, nothing is queued and
     //nothing is executing, the crawl is over, so the ProcessingOver
     //event is fired.
     if (!_queue.Any() && !_executing.Any())
     {
         ProcessingOver?.Invoke();
     }
 }
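ProcessingOver gives the caller a way to know when the crawl has drained completely. A small usage sketch, assuming ProcessingOver is a parameterless event or delegate as the invocation above suggests (the queue and crawler variables stand in for whatever wiring the real code uses):

 //Hypothetical wiring - construction of the queue and the crawler is
 //not shown in the examples above.
 var done = new ManualResetEventSlim();
 queue.ProcessingOver += () => done.Set();

 crawler.Crawl(new Uri("http://example.com/"));
 done.Wait(); //returns once nothing is queued and nothing is executing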