/// <summary>
/// Starts the crawl loop: repeatedly pops URLs from the frontier and
/// dispatches fetches until cancellation is requested via the internal
/// CancellationTokenSource. Idempotent — returns immediately if the
/// crawler is already running.
/// </summary>
/// <param name="restart">When true, the URL frontier is rebuilt from the
/// configuration, discarding any previously queued URLs.</param>
/// <returns>A task that completes when the crawl loop exits.</returns>
public async Task StartAsync(bool restart = false)
{
    mLogger.Log(nameof(Crawler), "Running");

    // Guard against concurrent/duplicate starts.
    lock (mStatusSyncLock)
    {
        if (Status == CrawlerStatus.RUNNING)
        {
            return;
        }
        Status = CrawlerStatus.RUNNING;
    }

    if (restart)
    {
        mUrlFrontier = new SimpleUrlFrontier(mConfig);
    }

    mTasks.Clear();
    mFetchSemaphore = new SemaphoreSlim(mConfig.MaxFetchingConcurrency);
    mCancellationTokenSource = new CancellationTokenSource();
    var token = mCancellationTokenSource.Token;

    await Task.Run(async () =>
    {
        while (!token.IsCancellationRequested)
        {
            string url = null;
            try
            {
                url = mUrlFrontier.PopUrl();
            }
            catch (Exception e)
            {
                // Best-effort: log and keep the loop alive; a transient
                // frontier failure should not kill the whole crawl.
                mLogger.LogException(nameof(Crawler), "Failed to pop url", e);
                mErrorLogger.LogException(nameof(Crawler), "Failed to pop url", e);
            }

            if (url != null)
            {
                // Fire-and-forget; presumably FetchUrlAsync registers its
                // task in mTasks so completion is observable below —
                // TODO confirm against FetchUrlAsync.
                FetchUrlAsync(url);
            }
            else if (mTasks.Keys.Any())
            {
                // Frontier is empty but fetches are in flight: wait for
                // any of them (they may enqueue new URLs).
                // BUGFIX: was the blocking Task.WaitAny, which pinned a
                // thread-pool thread; await asynchronously instead.
                await Task.WhenAny(mTasks.Keys.ToArray());
            }
            else
            {
                // Nothing queued and nothing in flight: idle briefly.
                // BUGFIX: was Thread.Sleep(5000), which both blocked a
                // pool thread and ignored cancellation for up to 5 s.
                try
                {
                    await Task.Delay(5000, token);
                }
                catch (OperationCanceledException)
                {
                    break;
                }
            }
        }
    }, token);
}
/// <summary>
/// Creates a crawler seeded with the given URLs, wiring up the default
/// frontier, store, parser and concurrent bookkeeping maps.
/// </summary>
/// <param name="seeds">Seed URLs used to initialize the URL frontier.</param>
public Crawler(string seeds)
{
    // Shared infrastructure.
    _logger = Log.Instance;
    _cts = new CancellationTokenSource();

    // Concurrent lookup tables (int-keyed; presumably hashed hosts/domains
    // — confirm against the code that populates them).
    _visitedServers = new ConcurrentDictionary<int, long>();
    _domainToIpMap = new ConcurrentDictionary<int, int>();
    _robotsTxts = new ConcurrentDictionary<int, RobotsTxt>();

    // Crawl pipeline components.
    _urlFrontier = new SimpleUrlFrontier(seeds);
    _store = new Store();
    _parser = new Parser();
}
/// <summary>
/// Creates a crawler from an explicit configuration and injected
/// collaborators (frontier, fetcher, similar-content judge, URL filters).
/// The crawler starts in the STOPPED state.
/// </summary>
/// <param name="config">Crawler configuration; its LogDirectory determines
/// where the runtime and error log files are created.</param>
/// <param name="urlFrontier">Source of URLs to crawl.</param>
/// <param name="fetcher">Component that downloads pages.</param>
/// <param name="similarContentManager">Judge used to detect near-duplicate content.</param>
/// <param name="urlFilters">Filters applied to discovered URLs.</param>
/// <exception cref="ArgumentNullException">Any argument is null.</exception>
public Crawler(CrawlerConfig config, IUrlFrontier urlFrontier, IFetcher fetcher,
    ISimilarContentManager similarContentManager, List<IUrlFilter> urlFilters)
{
    // Validate up front so a misconfigured caller fails with a clear
    // ArgumentNullException instead of a NullReferenceException below
    // (config.LogDirectory is dereferenced here) or much later at use sites.
    mConfig = config ?? throw new ArgumentNullException(nameof(config));
    mUrlFrontier = urlFrontier ?? throw new ArgumentNullException(nameof(urlFrontier));
    mFetcher = fetcher ?? throw new ArgumentNullException(nameof(fetcher));
    mSimilarContentJudger = similarContentManager ?? throw new ArgumentNullException(nameof(similarContentManager));
    mUrlFilters = urlFilters ?? throw new ArgumentNullException(nameof(urlFilters));

    Status = CrawlerStatus.STOPPED;

    mLogger = new RuntimeLogger(Path.Combine(config.LogDirectory, "Crawler.Log"), true);
    mErrorLogger = new RuntimeLogger(Path.Combine(config.LogDirectory, "Crawler Error.Log"), false);
}