Ejemplo n.º 1
0
        /// <summary>
        /// Marks the crawler as running and drives the crawl loop on a task started with
        /// <c>Task.Run</c> until cancellation is requested through <c>mCancellationTokenSource</c>.
        /// </summary>
        /// <param name="restart">
        /// When <c>true</c>, the URL frontier is replaced with a new <c>SimpleUrlFrontier</c>
        /// built from <c>mConfig</c> before the loop starts.
        /// </param>
        /// <returns>
        /// A task that completes when the crawl loop exits, or right away (after logging) when
        /// the crawler was already in the <c>RUNNING</c> state.
        /// </returns>
        public async Task StartAsync(bool restart = false)
        {
            // Upper bound, in milliseconds, on how long the loop idles when there is no URL
            // to fetch and no fetch in flight.
            const int idleWaitMilliseconds = 5000;

            mLogger.Log(nameof(Crawler), "Running");

            // Check-and-set under the lock so two concurrent callers cannot both start a loop.
            lock (mStatusSyncLock)
            {
                if (Status == CrawlerStatus.RUNNING)
                {
                    return;
                }
                Status = CrawlerStatus.RUNNING;
            }

            if (restart)
            {
                mUrlFrontier = new SimpleUrlFrontier(mConfig);
            }

            mTasks.Clear();
            mFetchSemaphore = new SemaphoreSlim(mConfig.MaxFetchingConcurrency);

            mCancellationTokenSource = new CancellationTokenSource();
            var token = mCancellationTokenSource.Token;
            await Task.Run(() =>
            {
                while (!token.IsCancellationRequested)
                {
                    string url = null;

                    try
                    {
                        url = mUrlFrontier.PopUrl();
                    }
                    catch (Exception e)
                    {
                        // A failed pop is reported to both loggers and then treated like
                        // "no URL available": the loop falls through to the wait below.
                        mLogger.LogException(nameof(Crawler), "Failed to pop url", e);
                        mErrorLogger.LogException(nameof(Crawler), "Failed to pop url", e);
                    }

                    if (url != null)
                    {
                        // Not awaited here.
                        // NOTE(review): presumably FetchUrlAsync registers its task in mTasks
                        // and throttles itself via mFetchSemaphore - confirm.
                        FetchUrlAsync(url);
                        continue;
                    }

                    try
                    {
                        // Snapshot the in-flight tasks once so the emptiness check and the
                        // wait operate on the same set.
                        var pendingFetches = mTasks.Keys.ToArray();
                        if (pendingFetches.Length > 0)
                        {
                            // Token-aware wait: a stop request no longer has to wait for a
                            // fetch to finish before the loop can exit.
                            Task.WaitAny(pendingFetches, token);
                        }
                        else
                        {
                            // Idle wait that ends early when cancellation is requested,
                            // instead of an uninterruptible Thread.Sleep.
                            // NOTE(review): assumes the token source is not disposed while
                            // this loop is still running - confirm against the stop path.
                            token.WaitHandle.WaitOne(idleWaitMilliseconds);
                        }
                    }
                    catch (OperationCanceledException)
                    {
                        // Cancellation observed while waiting: leave the loop normally so the
                        // returned task completes successfully, as it did before.
                        break;
                    }
                }
            }, token);
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Creates a crawler whose URL frontier is built from <paramref name="seeds"/>.
 /// </summary>
 /// <param name="seeds">Value handed unchanged to the <c>SimpleUrlFrontier</c> constructor.</param>
 public Crawler(string seeds)
 {
     // Collaborators, created in the same relative order as before.
     _logger = Log.Instance;
     _urlFrontier = new SimpleUrlFrontier(seeds);
     _store = new Store();
     _parser = new Parser();
     _cts = new CancellationTokenSource();

     // Concurrent lookup tables; all start out empty.
     _visitedServers = new ConcurrentDictionary<int, long>();
     _domainToIpMap = new ConcurrentDictionary<int, int>();
     _robotsTxts = new ConcurrentDictionary<int, RobotsTxt>();
 }
Ejemplo n.º 3
0
 /// <summary>
 /// Creates a crawler in the <c>STOPPED</c> state from the supplied configuration and
 /// collaborators, and opens its two log files under <c>config.LogDirectory</c>.
 /// </summary>
 /// <param name="config">Crawler configuration; its <c>LogDirectory</c> is where "Crawler.Log" and "Crawler Error.Log" are placed.</param>
 /// <param name="urlFrontier">URL frontier stored in <c>mUrlFrontier</c>.</param>
 /// <param name="fetcher">Fetcher stored in <c>mFetcher</c>.</param>
 /// <param name="similarContentManager">Similar-content manager stored in <c>mSimilarContentJudger</c>.</param>
 /// <param name="urlFilters">URL filters stored in <c>mUrlFilters</c>.</param>
 public Crawler(
     CrawlerConfig config,
     IUrlFrontier urlFrontier,
     IFetcher fetcher,
     ISimilarContentManager similarContentManager,
     List<IUrlFilter> urlFilters)
 {
     mConfig = config;
     Status = CrawlerStatus.STOPPED;

     // Injected collaborators.
     mUrlFilters = urlFilters;
     mSimilarContentJudger = similarContentManager;
     mFetcher = fetcher;
     mUrlFrontier = urlFrontier;

     // Both log files live in the configured log directory.
     string logDirectory = config.LogDirectory;
     string runtimeLogPath = Path.Combine(logDirectory, "Crawler.Log");
     string errorLogPath = Path.Combine(logDirectory, "Crawler Error.Log");
     mLogger = new RuntimeLogger(runtimeLogPath, true);
     mErrorLogger = new RuntimeLogger(errorLogPath, false);
 }