Beispiel #1
0
        /// <summary>
        /// Saves <paramref name="page"/> under a freshly generated GUID-based ".html" file name
        /// combined with <c>_outputPath</c>, and records the mapping from the page's absolute URI
        /// to that file name. A page whose absolute URI is already recorded is skipped.
        /// </summary>
        /// <param name="page">The page to persist; <c>page.Uri.AbsoluteUri</c> is the map key.</param>
        public void WriteFile(WebPage page)
        {
            string url = page.Uri.AbsoluteUri;
            string fileName = Guid.NewGuid().ToString() + ".html";

            // Claim the URL atomically BEFORE writing. The previous ContainsKey / save / TryAdd
            // sequence let two workers handling the same URL both write a file, with the
            // loser's TryAdd silently failing and its file left orphaned on disk.
            if (!_fileToDomainMap.TryAdd(url, fileName))
            {
                return;
            }

            try
            {
                page.SavePage(Path.Combine(_outputPath, fileName));
            }
            catch
            {
                // Roll back the claim so the map never points at a file that was not written.
                // NOTE(review): assumes _fileToDomainMap is a ConcurrentDictionary<string, string> - confirm.
                string discarded;
                _fileToDomainMap.TryRemove(url, out discarded);
                throw;
            }
        }
Beispiel #2
0
        /// <summary>
        /// Checks whether <paramref name="webpage"/> may be visited according to the robots.txt
        /// of its host and, if so, blocks the calling thread until the recorded next-allowed
        /// visit time for the page's server address has passed.
        /// </summary>
        /// <param name="webpage">The page about to be requested.</param>
        /// <returns>
        /// <c>true</c> when the page may be visited (after any required wait); <c>false</c> when
        /// the robots.txt could not be obtained (the URI is put back on the frontier) or when
        /// the robots.txt disallows the page.
        /// </returns>
        private bool EnsurePoliteVisit(WebPage webpage)
        {
            // Check robotstxt to see if we can visit the page
            string hostname = webpage.Uri.DnsSafeHost;

            // NOTE(review): the cache is keyed by the host name's hash code, so two hosts whose
            // hash codes collide would share one robots.txt entry. Keying by the host string
            // would avoid that but requires changing the cache's declaration.
            int hostKey = hostname.GetHashCode();

            // Single lookup instead of ContainsKey followed by the indexer.
            RobotsTxt robotsTxt;
            if (!_robotsTxts.TryGetValue(hostKey, out robotsTxt))
            {
                robotsTxt = _parser.Parse(hostname);
                if (robotsTxt == null)
                {
                    // We could not get the robotstxt, therefore we cannot visit; requeue the Uri.
                    _urlFrontier.AddUri(webpage.Uri);
                    return false;
                }

                _robotsTxts.TryAdd(hostKey, robotsTxt);
            }

            if (!robotsTxt.CanVisit("*", webpage.Uri))
            {
                // We are not allowed to visit this page
                return false;
            }

            // We are allowed to visit, ensure we are waiting enough time before next request
            int address = webpage.Address;
            long safeVisitTicks;
            if (_visitedServers.TryGetValue(address, out safeVisitTicks))
            {
                // The stored value is the tick count of the earliest permitted next visit.
                DateTime safeVisit = new DateTime(safeVisitTicks);

                int delay = (int) safeVisit.Subtract(DateTime.Now).TotalMilliseconds;
                if (delay > 0)
                {
                    Thread.Sleep(delay);
                }
            }

            return true;
        }
Beispiel #3
0
 /// <summary>
 /// Records the earliest time at which the server behind <paramref name="webpage"/> may be
 /// visited again: now plus <c>TIME_BETWEEN_VISITS</c> seconds, stored as ticks under
 /// <c>webpage.Address</c>.
 /// </summary>
 /// <param name="webpage">The page that was just requested.</param>
 private void UpdateTimestamp(WebPage webpage)
 {
     DateTime nextValidVisitTime = DateTime.Now.AddSeconds(TIME_BETWEEN_VISITS);

     // The indexer setter adds the key when absent and overwrites it when present, so the
     // former ContainsKey / TryAdd branching (and its check-then-act window, in which a
     // racing TryAdd could fail silently) is unnecessary.
     _visitedServers[webpage.Address] = nextValidVisitTime.Ticks;
 }
Beispiel #4
0
        /// <summary>
        /// Processes a single URI: checks politeness rules, loads the page, records the visit
        /// time, stores the page, bumps the crawl counter and queues the page's anchors.
        /// </summary>
        /// <param name="options">Parallel options whose <c>CancellationToken</c> signals that crawling should stop.</param>
        /// <param name="uri">The URI to process; <c>null</c> is ignored.</param>
        /// <exception cref="OperationCanceledException">
        /// Thrown when cancellation has been requested by the time the loaded page is about to be stored.
        /// </exception>
        private void ProcessUri(ParallelOptions options, Uri uri)
        {
            if (uri == null)
            {
                return;
            }

            WebPage webpage = new WebPage(uri);

            // Is it safe to visit the webpage?
            if (!EnsurePoliteVisit(webpage))
            {
                return;
            }

            // Visit webpage
            webpage.LoadPage();

            // Make sure to update or add the time for this visit, whether or not the load
            // succeeded, since the request was still issued.
            UpdateTimestamp(webpage);

            if (!webpage.IsLoaded)
            {
                return;
            }

            // Cooperative cancellation: Thread.Abort is obsolete, is not supported on modern
            // .NET, and would tear down a worker thread. Throwing OperationCanceledException
            // for the token is the supported way to stop work.
            options.CancellationToken.ThrowIfCancellationRequested();

            _store.WriteFile(webpage);
            _statistics.IncrementPagesCrawled();

            // Add extracted anchors to the queue
            _urlFrontier.AddUriRange(webpage.GetAnchors());
        }