Exemple #1
0
 /// <summary>
 /// Exercises <c>RobotService</c> against http://www.yahoo.com: asserts that
 /// <c>GetDenyUrls</c> returns at least one entry for the host, asserts that
 /// <c>IsAllowed</c> accepts the site URI when called with an empty first argument,
 /// and finally prints every deny entry to the console.
 /// </summary>
 public void DownloadRobots()
 {
     Uri site = new Uri("http://www.yahoo.com");

     using (RobotService service = new RobotService())
     {
         // The host must expose at least one deny entry.
         bool hasDenyEntries = service.GetDenyUrls(site.Host).Count > 0;
         Assert.IsTrue(hasDenyEntries);

         // The site URI itself must be reported as allowed.
         bool rootAllowed = service.IsAllowed(string.Empty, site);
         Assert.IsTrue(rootAllowed);

         // Dump the deny entries for manual inspection.
         foreach (var entry in service.GetDenyUrls(site.Host))
         {
             Console.WriteLine(entry);
         }
     }
 }
Exemple #2
0
        /// <summary>
        /// For every collector returned by <c>CreateCollectorPool</c>, downloads the collector's
        /// link, parses the response with <c>HtmlProcessor</c>, and for each extracted link that
        /// <paramref name="robots"/> allows, stores it in <paramref name="pool"/> and records it
        /// in <paramref name="repository"/>. Progress is written to the console row
        /// <c>WORK_AREA_TOP + c.SeqNo</c> of each collector.
        /// </summary>
        /// <param name="WORK_AREA_TOP">Console row offset added to each collector's sequence number.</param>
        /// <param name="parallelCount">Passed through to <c>CreateCollectorPool</c>.</param>
        /// <param name="repository">Receives a pair of <c>SaveLink</c> calls per accepted link.</param>
        /// <param name="robots">Consulted via <c>IsAllowed</c> before a link is accepted.</param>
        /// <param name="pool">Receives the href of every accepted link.</param>
        /// <param name="history">Receives the downloaded page's URL whenever a link is accepted.</param>
        /// <param name="tldParser">Handed to the downloader through its properties bag under the key "TldParser".</param>
        private static void TestRun1(int WORK_AREA_TOP, int parallelCount, Repository repository, RobotService robots, CollectorPool pool, VisitedUrls history, TldParser tldParser)
        {
            CreateCollectorPool(pool, history, parallelCount).ForEach(
                c =>
                {
                    //ManualResetEvent done = new ManualResetEvent(false);
                    Dictionary<string, object> properties = new Dictionary<string, object>();
                    properties.Add("TldParser", tldParser);

                    Uri uri = c.Link.ToUri();
                    WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} (fetching)", uri.ToUriShort());

                    WebDownloader web = new WebDownloader(uri, properties,
                        ea =>
                        {
                            // Build the outcome text defensively: the stream may be missing even
                            // when no exception was reported, so never touch Stream.Length blindly.
                            string outcome;
                            if (!ea.Exception.IsNull())
                            {
                                outcome = "exception: " + ea.Exception.Message;
                            }
                            else if (ea.Stream.IsNull())
                            {
                                outcome = "no response stream";
                            }
                            else
                            {
                                outcome = "responded: " + ea.Stream.Length + " bytes";
                            }

                            WriteXY(0, WORK_AREA_TOP + c.SeqNo,
                                "{0} [{1}] ({2})",
                                uri.ToUriShort(), ea.Status, outcome);

                            if (ea.Stream.IsNull())
                            {
                                // Nothing to parse; pause before this callback returns.
                                Thread.Sleep(5000);
                            }
                            else
                            {
                                try
                                {
                                    HtmlProcessor processor = new HtmlProcessor(
                                        uri.ToString(), ea.Stream,
                                        ((TldParser)ea.Properties["TldParser"]));

                                    WriteXY(0, WORK_AREA_TOP + c.SeqNo,
                                        "{0} (found={1})", uri.ToUriShort(), processor.Links.Count);

                                    int pushedLinks = 0;
                                    int linkCounter = 1;
                                    processor.Links.ForEach(l =>
                                    {
                                        // Domain is truncated to 10 characters to keep the status row short.
                                        WriteXY(0, WORK_AREA_TOP + c.SeqNo,
                                            "{0} (found={1}, prc={2} {3} ({4}))",
                                            uri.ToUriShort(),
                                            processor.Links.Count,
                                            (l.Domain.Length > 10 ? l.Domain.Substring(0, 10) : l.Domain),
                                            l.Tld,
                                            linkCounter);

                                        if (robots.IsAllowed(string.Empty, l.Href.ToUri()))
                                        {
                                            ++pushedLinks;
                                            pool.Store(l.Href);

                                            repository.SaveLink(uri.ToString(), l.Href, string.Empty, l);
                                            repository.SaveLink(l.Href, string.Empty, uri.ToString(), l);

                                            // NOTE(review): the page URL is added to the history once per
                                            // accepted link, and not at all when no link is accepted.
                                            // Confirm this is intended.
                                            history.Add(uri.ToString());
                                        }

                                        ++linkCounter;
                                    });

                                    WriteXY(0, WORK_AREA_TOP + c.SeqNo,
                                        "{0} (found={1}, added={2} links) [DONE]",
                                        uri.ToUriShort(), processor.Links.Count, pushedLinks);
                                }
                                finally
                                {
                                    // Release the response stream even if parsing or persistence throws.
                                    ea.Stream.Close();
                                }
                            }

                            //((ManualResetEvent)ea.Properties["State"]).Set();
                        });

                    web.Download();
                    //done.WaitOne();
                });
        }
Exemple #3
0
        /// <summary>
        /// Console entry point of the crawler: connects to the persistence store, loads the
        /// collector pool, link data and today's history, starts <c>parallelCount</c> collector
        /// tasks, waits for the user to press ENTER, cancels the collectors, commits the pool,
        /// prints statistics and saves the log.
        /// </summary>
        /// <param name="args">
        /// Command-line arguments. They are currently not parsed; the settings below are hard-coded.
        /// </param>
        static void Main(string[] args)
        {
            // arguments
            // --help
            // --provider=redis|mysql|sqlserver
            // --server=hostname
            // --port=
            // --database=
            // --uid=
            // --pwd=
            // --parallel-count

            LogBuffer = new StringBuilder();

            // NOTE(review): connection settings are hard-coded; the argument list above is not implemented.
            int parallelCount = 70;
            //string provider = "redis";
            //string server = "127.0.0.1";
            //string port = "6379";
            string provider = "mongodb";
            string server = "50.62.1.71";
            string port = "27017";
            string database = "ls";

            // Upper bound (in milliseconds) for waiting on collectors after cancellation.
            const int shutdownTimeoutMs = 3000;

            Console.CancelKeyPress += new ConsoleCancelEventHandler(Console_CancelKeyPress);
            PrintAndClearHeaderArea();

            LogLine("Buffer width: " + Console.BufferWidth);
            LogLine("Buffer height: " + Console.BufferHeight);

            IPersistence persistence =
                PersistenceFactory.GetPersistence(provider,
                    new Dictionary<string, string>
                    {
                        { "server", server },
                        { "port", port },
                        { "database", database }
                    });

            if (!persistence.Ping())
            {
                LogLine("Unable to connect to the database. Aborting.");
                Environment.Exit(1);
            }

            Repository repository = new Repository(persistence);

            RobotService robots = new RobotService();
            CollectorPool pool;
            VisitedUrls history;
            VisitedDomains domainHistory = new VisitedDomains();

            Log("Loading pool...");
            repository.Load(out pool);

            // Seed the pool with a fixed set of starting sites.
            pool.Store("squidoo.com");
            pool.Store("ezinearticles.com");
            pool.Store("hubpages.com");
            pool.Store("technorati.com");
            pool.Store("buzzle.com");
            pool.Store("suite101.com");
            pool.Store("goarticles.com");
            pool.Store("apsense.com");
            pool.Store("allaboutcounseling.com");
            pool.Store("digg.com");
            pool.Store("dmoz.org");
            pool.Store("dir.yahoo.com");

            // Remember the initial size so the statistics can show before/after.
            int poolCount = pool.Count;
            LogLine("done. Found " + poolCount);

            Log("Loading link data...");
            repository.LoadData();
            LogLine("done. Found " + repository.Links.Count);

            Log("Loading history...");
            repository.Load(out history, DateTime.Today);
            LogLine("done");

            Log("Loading TLD parser...");
            TldParser tldParser = new TldParser();
            LogLine("done");

            // Leave the start-up messages on screen for a moment before clearing them.
            Thread.Sleep(5000);
            PrintAndClearHeaderArea();

            SynchronizationContext.SetSynchronizationContext(new SynchronizationContext());
            TaskScheduler scheduler = TaskScheduler.FromCurrentSynchronizationContext();
            TaskScheduler.UnobservedTaskException += (o, ea) =>
            {
                LogLine("Exception: {0}", ea.Exception.Message);
                ea.SetObserved();
            };

            // UTC avoids skew from local clock adjustments (e.g. DST) during a long crawl.
            DateTime start = DateTime.UtcNow;
            CancellationTokenSource cancelTokenSource = new CancellationTokenSource();
            CancellationToken cancelToken = cancelTokenSource.Token;
            //CountdownEvent countdown = new CountdownEvent(parallelCount);

            Task[] tasks = new Task[parallelCount];
            for (int i = 0; i < parallelCount; i++)
            {
                // Every collector shares the same pool, histories, parser, robots service and repository.
                TaskState state = new TaskState();
                state.Pool = pool;
                state.VisitedUrls = history;
                state.VisitedDomains = domainHistory;
                state.TldParser = tldParser;
                state.Robots = robots;
                state.Repository = repository;
                state.PoolManagerHandler = PoolManager;
                state.ProgressHandler = CollectorProgress;
                state.CancelToken = cancelToken;
                //state.Countdown = countdown;
                state.CollectorID = i;

                tasks[i] = new Task(CollectorProcessor, state, cancelToken, TaskCreationOptions.LongRunning);
                tasks[i].Start(scheduler);
                //tasks[i].Start();
            }

            //allDone.WaitOne();
            WriteXY(0, WORK_AREA_TOP + parallelCount + 2, "press [ENTER] to quit");
            Console.Read();

            cancelTokenSource.Cancel(true);
            try
            {
                // Do not pass the (already cancelled) token here: WaitAll would throw at once
                // without waiting. A bounded wait gives collectors a chance to wind down.
                if (Task.WaitAll(tasks, shutdownTimeoutMs))
                {
                    LogLine("Collectors stopped.");
                }
                else
                {
                    LogLine("Timed out waiting for collectors to stop.");
                }
                //countdown.Wait(cancelToken);
            }
            catch (AggregateException aggregate)
            {
                // Cancelled tasks are expected after Cancel(); report anything else.
                foreach (Exception inner in aggregate.Flatten().InnerExceptions)
                {
                    if (!(inner is OperationCanceledException))
                    {
                        LogLine("Exception: {0}", inner.Message);
                    }
                }

                LogLine("Collectors stopped.");
            }
            // countdown.Dispose();

            PrintAndClearHeaderArea();
            LogLine("Closing database...");
            repository.Commit<CollectorPool>(pool, null);
            ((IDisposable)persistence).Dispose();

            // Print statistics
            TimeSpan duration = DateTime.UtcNow - start;

            // TotalSeconds is the whole duration; Seconds would only be the 0-59 component.
            // Clamp to 1 so the rate below can never divide by zero.
            int elapsedSeconds = Math.Max(1, (int)duration.TotalSeconds);

            LogLine("Pool count: before = {0} now = {1}", poolCount, pool.Count);
            int totalLinks = repository.CrawlDateLinks[DateTime.Today.ToString("yyMMdd")].Value.Count;
            LogLine("Links crawled: {0} in {1} seconds ({2}/sec)",
                totalLinks, elapsedSeconds, totalLinks / elapsedSeconds);

            // Dump data to log
            //LogLine("Anchors");
            //LogLine(repository.Anchors.JsonSerialize(true));
            //LogLine("AnchorTextExactRelations");
            //LogLine(repository.AnchorTextExactRelations.JsonSerialize(true));
            //LogLine("DomainOrSubdomains");
            //LogLine(repository.DomainOrSubdomains.JsonSerialize(true));
            //LogLine("Domains");
            //LogLine(repository.Domains.JsonSerialize(true));
            //LogLine("LinkCrawlDateCurrent");
            //LogLine(repository.LinkCrawlDateCurrent.JsonSerialize(true));
            //LogLine("LinkCrawlDateHistory");
            //LogLine(repository.LinkCrawlDateHistory.JsonSerialize(true));
            //LogLine("LinkRating");
            //LogLine(repository.LinkRating.JsonSerialize(true));
            //LogLine("Links");
            //LogLine(repository.Links.ToArray().Where((kv) => { return kv.Value.IsDirty; }).JsonSerialize(true));
            //LogLine("LinkStatusCurrent");
            //LogLine(repository.LinkStatusCurrent.JsonSerialize(true));
            //LogLine("LinkStatusHistory");
            //LogLine(repository.LinkStatusHistory.JsonSerialize(true));

            // Write log to file
            SaveLog();

            Environment.Exit(0);
        }