예제 #1
0
파일: Program.cs 프로젝트: quartz12345/c
        /// <summary>
        /// Pulls links from <paramref name="pool"/> until one is found that is not
        /// in the URL history, not in the domain history, and not rejected by
        /// <c>IsExcludedDomain</c>; records that link in both histories and returns it.
        /// </summary>
        /// <param name="pool">Source of candidate links.</param>
        /// <param name="urlHistory">URLs already handed out; the accepted link is added to it.</param>
        /// <param name="domainHistory">Domain history; the accepted link is added to it.</param>
        /// <returns>
        /// The accepted link as a string, or <see cref="string.Empty"/> when the pool
        /// yields a link that does not convert to a URI (treated as "pool exhausted").
        /// </returns>
        static string PoolManager(
            CollectorPool pool,
            VisitedUrls urlHistory,
            VisitedDomains domainHistory)
        {
            while (true)
            {
                Uri candidate = pool.Next().ToUri();

                // A null URI means there are no more links in the pool
                if (candidate.IsNull())
                {
                    return string.Empty;
                }

                string url = candidate.ToString();

                // Skip anything already visited in this session or explicitly excluded
                bool alreadySeen =
                    urlHistory.ContainsUrl(url) ||
                    domainHistory.ContainsDomain(url) ||
                    IsExcludedDomain(pool, candidate);

                if (alreadySeen)
                {
                    continue;
                }

                // Link is ok, add to history before handing it to the caller
                urlHistory.Add(url);
                domainHistory.Add(url);

                return url;
            }
        }
예제 #2
0
파일: Program.cs 프로젝트: quartz12345/c
        /// <summary>
        /// Console entry point: connects to the persistence store, loads the link
        /// pool, link data and today's history, starts the collector tasks, waits
        /// for the user to press ENTER, then stops the collectors, commits the pool,
        /// prints crawl statistics, saves the log and exits.
        /// </summary>
        /// <param name="args">
        /// Command-line arguments. They are not parsed yet; all settings below are
        /// hard-coded.
        /// </param>
        static void Main(string[] args)
        {
            // Planned arguments (not parsed yet):
            // --help
            // --provider=redis|mysql|sqlserver
            // --server=hostname
            // --port=
            // --database=
            // --uid=
            // --pwd=
            // --parallel-count

            LogBuffer = new StringBuilder();

            int parallelCount = 70;
            string provider = "mongodb";
            string server = "50.62.1.71";
            string port = "27017";
            string database = "ls";

            Console.CancelKeyPress += new ConsoleCancelEventHandler(Console_CancelKeyPress);
            PrintAndClearHeaderArea();

            LogLine("Buffer width: " + Console.BufferWidth);
            LogLine("Buffer height: " + Console.BufferHeight);

            IPersistence persistence =
                PersistenceFactory.GetPersistence(provider,
                    new Dictionary<string, string>
                    {
                        { "server", server },
                        { "port", port },
                        { "database", database }
                    });

            // Nothing can be loaded or stored without the database, so bail out early.
            if (!persistence.Ping())
            {
                LogLine("Unable to connect to the database. Aborting.");
                Environment.Exit(1);
            }

            Repository repository = new Repository(persistence);

            RobotService robots = new RobotService();
            CollectorPool pool;
            VisitedUrls history;
            VisitedDomains domainHistory = new VisitedDomains();

            int poolCount = 0;
            Log("Loading pool...");
            repository.Load(out pool);

            // Seed sites stored on every start, in addition to whatever was loaded.
            pool.Store("squidoo.com");
            pool.Store("ezinearticles.com");
            pool.Store("hubpages.com");
            pool.Store("technorati.com");
            pool.Store("buzzle.com");
            pool.Store("suite101.com");
            pool.Store("goarticles.com");
            pool.Store("apsense.com");
            pool.Store("allaboutcounseling.com");
            pool.Store("digg.com");
            pool.Store("dmoz.org");
            pool.Store("dir.yahoo.com");

            poolCount = pool.Count;
            LogLine("done. Found " + poolCount);

            Log("Loading link data...");
            repository.LoadData();
            LogLine("done. Found " + repository.Links.Count);

            Log("Loading history...");
            repository.Load(out history, DateTime.Today);
            LogLine("done");

            Log("Loading TLD parser...");
            TldParser tldParser = new TldParser();
            LogLine("done");

            // Leave the start-up messages on screen for a moment before clearing them.
            Thread.Sleep(5000);
            PrintAndClearHeaderArea();

            // FromCurrentSynchronizationContext needs a current context, so install one first.
            SynchronizationContext.SetSynchronizationContext(new SynchronizationContext());
            TaskScheduler scheduler = TaskScheduler.FromCurrentSynchronizationContext();
            TaskScheduler.UnobservedTaskException += (o, ea) =>
            {
                LogLine("Exception: {0}", ea.Exception.Message);
                ea.SetObserved();
            };

            DateTime start = DateTime.Now;
            CancellationTokenSource cancelTokenSource = new CancellationTokenSource();
            CancellationToken cancelToken = cancelTokenSource.Token;

            // Every collector gets its own state object, but they all share the same
            // pool, histories, parser, robots service and repository instances.
            Task[] tasks = new Task[parallelCount];
            for (int i = 0; i < parallelCount; i++)
            {
                TaskState state = new TaskState();
                state.Pool = pool;
                state.VisitedUrls = history;
                state.VisitedDomains = domainHistory;
                state.TldParser = tldParser;
                state.Robots = robots;
                state.Repository = repository;
                state.PoolManagerHandler = PoolManager;
                state.ProgressHandler = CollectorProgress;
                state.CancelToken = cancelToken;
                state.CollectorID = i;

                tasks[i] = new Task(CollectorProcessor, state, cancelToken, TaskCreationOptions.LongRunning);
                tasks[i].Start(scheduler);
            }

            WriteXY(0, WORK_AREA_TOP + parallelCount + 2, "press [ENTER] to quit");
            Console.Read();

            cancelTokenSource.Cancel(true);
            try
            {
                // Do not pass the (now cancelled) token here: WaitAll would throw
                // immediately without waiting. Give the collectors up to 3 seconds
                // to wind down instead of sleeping blindly.
                Task.WaitAll(tasks, 3000);
            }
            catch (AggregateException)
            {
                // Cancelled or faulted collectors are reported here; either way
                // they are no longer running, which is all shutdown needs.
            }
            LogLine("Collectors stopped.");

            PrintAndClearHeaderArea();
            LogLine("Closing database...");
            repository.Commit<CollectorPool>(pool, null);
            ((IDisposable)persistence).Dispose();

            // Print statistics
            TimeSpan duration = DateTime.Now - start;

            // TotalSeconds is the whole run length; TimeSpan.Seconds is only the
            // 0-59 component and would be 0 on exact minute boundaries.
            int totalSeconds = (int)duration.TotalSeconds;

            LogLine("Pool count: before = {0} now = {1}", poolCount, pool.Count);

            // NOTE(review): presumably fails if no entry exists for today's key
            // (e.g. nothing was crawled, or the run crossed midnight) - confirm.
            int totalLinks = repository.CrawlDateLinks[DateTime.Today.ToString("yyMMdd")].Value.Count;

            // Guard against a zero-length run so the rate never divides by zero.
            int linksPerSecond = duration.TotalSeconds > 0
                ? (int)(totalLinks / duration.TotalSeconds)
                : 0;

            LogLine("Links crawled: {0} in {1} seconds ({2}/sec)",
                totalLinks, totalSeconds, linksPerSecond);

            // Write log to file
            SaveLog();

            Environment.Exit(0);
        }