static string PoolManager(CollectorPool pool, VisitedUrls urlHistory, VisitedDomains domainHistory)
{
    string link = pool.Next();
    Uri uri = link.ToUri();

    if (uri.IsNull())
        return string.Empty; // No more links in the pool

    // Skip links whose URL or domain has already been visited within this session,
    // as well as links belonging to an excluded domain
    while (urlHistory.ContainsUrl(uri.ToString())
        || domainHistory.ContainsDomain(uri.ToString())
        || IsExcludedDomain(pool, uri))
    {
        link = pool.Next();
        uri = link.ToUri();

        if (uri.IsNull())
            return string.Empty; // No more links in the pool
    }

    // Earlier approach, superseded by the domainHistory check in the loop above:
    // recursively re-queue the link while its host is still being processed.
    //if (domainHistory.ContainsDomain(uri.ToString()))
    //{
    //    // Store the link back to pool
    //    pool.Store(uri.ToString());
    //    return PoolManager(pool, urlHistory, domainHistory);
    //}

    // Link is ok, add to history
    urlHistory.Add(uri.ToString());
    domainHistory.Add(uri.ToString());

    return uri.ToString();
}
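// NOTE: PoolManager relies on ToUri() and IsNull() extension methods that are not shown in this
// listing. The class below is only a sketch, under the assumption that ToUri() parses a string
// into an absolute Uri (returning null on failure) and IsNull() is a null check that is safe to
// call even when the Uri reference is null. It would live at namespace level in its own file,
// since extension methods cannot be declared in a nested class.
static class UriExtensionsSketch
{
    // Hypothetical helper: parse a link into an absolute Uri, or return null if it is not a valid URL
    public static Uri ToUri(this string link)
    {
        Uri uri;
        return Uri.TryCreate(link, UriKind.Absolute, out uri) ? uri : null;
    }

    // Hypothetical helper: extension methods can be invoked on a null receiver, so this works
    // even when ToUri() returned null
    public static bool IsNull(this Uri uri)
    {
        return uri == null;
    }
}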
static void Main(string[] args)
{
    // Supported arguments (connection settings are currently hardcoded below):
    // --help
    // --provider=redis|mysql|sqlserver|mongodb
    // --server=hostname
    // --port=
    // --database=
    // --uid=
    // --pwd=
    // --parallel-count
    LogBuffer = new StringBuilder();

    int parallelCount = 70;

    //string provider = "redis";
    //string server = "127.0.0.1";
    //string port = "6379";
    string provider = "mongodb";
    string server = "50.62.1.71";
    string port = "27017";
    string database = "ls";

    Console.CancelKeyPress += new ConsoleCancelEventHandler(Console_CancelKeyPress);

    PrintAndClearHeaderArea();
    LogLine("Buffer width: " + Console.BufferWidth);
    LogLine("Buffer height: " + Console.BufferHeight);

    IPersistence persistence = PersistenceFactory.GetPersistence(
        provider,
        new Dictionary<string, string>
        {
            { "server", server },
            { "port", port },
            { "database", database }
        });

    if (!persistence.Ping())
    {
        LogLine("Unable to connect to the database. Aborting.");
        Environment.Exit(1);
    }

    Repository repository = new Repository(persistence);
    RobotService robots = new RobotService();
    CollectorPool pool;
    VisitedUrls history;
    VisitedDomains domainHistory = new VisitedDomains();
    int poolCount = 0;

    Log("Loading pool...");
    repository.Load(out pool);

    // Seed the pool with the starting domains
    pool.Store("squidoo.com");
    pool.Store("ezinearticles.com");
    pool.Store("hubpages.com");
    pool.Store("technorati.com");
    pool.Store("buzzle.com");
    pool.Store("suite101.com");
    pool.Store("goarticles.com");
    pool.Store("apsense.com");
    pool.Store("allaboutcounseling.com");
    pool.Store("digg.com");
    pool.Store("dmoz.org");
    pool.Store("dir.yahoo.com");

    poolCount = pool.Count;
    LogLine("done. Found " + poolCount);

    Log("Loading link data...");
    repository.LoadData();
    LogLine("done. Found " + repository.Links.Count);

    Log("Loading history...");
    repository.Load(out history, DateTime.Today);
    LogLine("done");

    Log("Loading TLD parser...");
    TldParser tldParser = new TldParser();
    LogLine("done");

    Thread.Sleep(5000);
    PrintAndClearHeaderArea();

    // Observe unhandled task exceptions so they do not tear down the process on finalization
    SynchronizationContext.SetSynchronizationContext(new SynchronizationContext());
    TaskScheduler scheduler = TaskScheduler.FromCurrentSynchronizationContext();
    TaskScheduler.UnobservedTaskException += (o, ea) =>
    {
        LogLine("Exception: {0}", ea.Exception.Message);
        ea.SetObserved();
    };

    DateTime start = DateTime.Now;

    CancellationTokenSource cancelTokenSource = new CancellationTokenSource();
    CancellationToken cancelToken = cancelTokenSource.Token;
    //CountdownEvent countdown = new CountdownEvent(parallelCount);

    // Spin up one long-running collector task per parallel slot
    Task[] tasks = new Task[parallelCount];
    for (int i = 0; i < parallelCount; i++)
    {
        TaskState state = new TaskState();
        state.Pool = pool;
        state.VisitedUrls = history;
        state.VisitedDomains = domainHistory;
        state.TldParser = tldParser;
        state.Robots = robots;
        state.Repository = repository;
        state.PoolManagerHandler = PoolManager;
        state.ProgressHandler = CollectorProgress;
        state.CancelToken = cancelToken;
        //state.Countdown = countdown;
        state.CollectorID = i;

        tasks[i] = new Task(CollectorProcessor, state, cancelToken, TaskCreationOptions.LongRunning);
        tasks[i].Start(scheduler);
        //tasks[i].Start();
    }

    //allDone.WaitOne();
    WriteXY(0, WORK_AREA_TOP + parallelCount + 2, "press [ENTER] to quit");
    Console.Read();

    cancelTokenSource.Cancel(true);

    try
    {
        // The wait itself is cancelled immediately by cancelToken; the Sleep below
        // gives the collectors time to observe the cancellation and shut down.
        Task.WaitAll(tasks, cancelToken);
        //countdown.Wait(cancelToken);
    }
    catch
    {
        LogLine("Collectors stopped.");
    }

    Thread.Sleep(3000);
    //countdown.Dispose();

    PrintAndClearHeaderArea();

    LogLine("Closing database...");
    repository.Commit<CollectorPool>(pool, null);
    ((IDisposable)persistence).Dispose();

    // Print statistics
    DateTime finish = DateTime.Now;
    TimeSpan elapsed = finish - start;
    LogLine("Pool count: before = {0} now = {1}", poolCount, pool.Count);

    int totalLinks = repository.CrawlDateLinks[DateTime.Today.ToString("yyMMdd")].Value.Count;

    // Use TotalSeconds instead of Seconds: Seconds is only the 0-59 component of the elapsed
    // time and would give a wrong rate for runs longer than a minute.
    LogLine("Links crawled: {0} in {1:0} seconds ({2:0.0}/sec)",
        totalLinks, elapsed.TotalSeconds, totalLinks / elapsed.TotalSeconds);

    // Dump data to log
    //LogLine("Anchors");
    //LogLine(repository.Anchors.JsonSerialize(true));
    //LogLine("AnchorTextExactRelations");
    //LogLine(repository.AnchorTextExactRelations.JsonSerialize(true));
    //LogLine("DomainOrSubdomains");
    //LogLine(repository.DomainOrSubdomains.JsonSerialize(true));
    //LogLine("Domains");
    //LogLine(repository.Domains.JsonSerialize(true));
    //LogLine("LinkCrawlDateCurrent");
    //LogLine(repository.LinkCrawlDateCurrent.JsonSerialize(true));
    //LogLine("LinkCrawlDateHistory");
    //LogLine(repository.LinkCrawlDateHistory.JsonSerialize(true));
    //LogLine("LinkRating");
    //LogLine(repository.LinkRating.JsonSerialize(true));
    //LogLine("Links");
    //LogLine(repository.Links.ToArray().Where((kv) => { return kv.Value.IsDirty; }).JsonSerialize(true));
    //LogLine("LinkStatusCurrent");
    //LogLine(repository.LinkStatusCurrent.JsonSerialize(true));
    //LogLine("LinkStatusHistory");
    //LogLine(repository.LinkStatusHistory.JsonSerialize(true));

    // Write log to file
    SaveLog();

    Environment.Exit(0);
}
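// NOTE: CollectorProcessor is not part of this listing. Because it is handed to the
// Task(Action<object>, object, CancellationToken, TaskCreationOptions) constructor above, it has
// to be a void method taking a single object argument (the TaskState). The skeleton below is only
// a sketch of that expected shape, assuming PoolManagerHandler is a delegate with the same
// signature as PoolManager; the real collector also fetches pages, honours robots.txt via
// state.Robots, extracts new links into state.Pool and reports progress through state.ProgressHandler.
static void CollectorProcessorSketch(object taskState)
{
    TaskState state = (TaskState)taskState;

    // Keep pulling links until cancellation is requested or the pool runs dry
    while (!state.CancelToken.IsCancellationRequested)
    {
        string link = state.PoolManagerHandler(state.Pool, state.VisitedUrls, state.VisitedDomains);
        if (string.IsNullOrEmpty(link))
            break; // PoolManager returns string.Empty when no more links are available

        // ... download the page for `link`, parse outgoing links, store them back into
        // state.Pool and persist results through state.Repository ...
    }
}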