Example #1
0
        /// <summary>
        /// Runs one crawl starting at the base URI: resolves the crawl services,
        /// seeds or resumes the queue, and waits for the completion event unless the
        /// crawl has already been stopped.
        /// </summary>
        /// <returns>A task representing the crawl.</returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown when this crawler instance has already been used for a crawl.
        /// </exception>
        public virtual async Task CrawlAsync()
        {
            // An instance may only ever run a single crawl.
            if (m_OnlyOneCrawlPerInstance)
            {
                throw new InvalidOperationException("Crawler instance cannot be reused");
            }

            m_OnlyOneCrawlPerInstance = true;

            Parameter[] resolveArgs =
            {
                new TypedParameter(typeof(Uri), m_BaseUri),
                new NamedParameter("crawlStart", m_BaseUri),
                new TypedParameter(typeof(Crawler), this),
            };

            // Resolution order matters: each resolved service is appended to the
            // argument list so it is available to the services resolved after it.
            m_CrawlerQueue = m_LifetimeScope.Resolve<ICrawlerQueue>(resolveArgs);
            resolveArgs = resolveArgs.AddToEnd(new TypedParameter(typeof(ICrawlerQueue), m_CrawlerQueue)).ToArray();

            m_CrawlerHistory = m_LifetimeScope.Resolve<ICrawlerHistory>(resolveArgs);
            resolveArgs = resolveArgs.AddToEnd(new TypedParameter(typeof(ICrawlerHistory), m_CrawlerHistory)).ToArray();

            m_TaskRunner = m_LifetimeScope.Resolve<ITaskRunner>(resolveArgs);
            resolveArgs = resolveArgs.AddToEnd(new TypedParameter(typeof(ITaskRunner), m_TaskRunner)).ToArray();

            m_Logger = m_LifetimeScope.Resolve<ILog>(resolveArgs);
            resolveArgs = resolveArgs.AddToEnd(new TypedParameter(typeof(ILog), m_Logger)).ToArray();

            m_CrawlerRules = m_LifetimeScope.Resolve<ICrawlerRules>(resolveArgs);

            m_Logger.Verbose("Crawl started @ {0}", m_BaseUri);
            m_WebDownloaderFactory = m_LifetimeScope.Resolve<Func<IWebDownloader>>();

            using (m_CrawlCompleteEvent = new ManualResetEvent(false))
            {
                m_Crawling = true;
                m_Runtime = Stopwatch.StartNew();

                if (m_CrawlerQueue.Count <= 0)
                {
                    // Nothing queued: start from the base URI at depth 0.
                    await AddStepAsync(m_BaseUri, 0);
                }
                else
                {
                    // Resume enabled
                    ProcessQueue();
                }

                // NOTE(review): WaitOne blocks the calling thread even though this
                // method is async - confirm that this is intended.
                if (!m_CrawlStopped)
                {
                    m_CrawlCompleteEvent.WaitOne();
                }

                m_Runtime.Stop();
                m_Crawling = false;
            }

            if (m_Cancelled)
            {
                OnCancelled();
            }

            m_Logger.Verbose("Crawl ended @ {0} in {1}", m_BaseUri, m_Runtime.Elapsed);
            OnCrawlFinished();
        }
Example #2
0
        /// <summary>
        /// Verifies that the supplied crawler history is not null and reports a
        /// registered count of zero.
        /// </summary>
        /// <param name="crawlerHistory">
        /// History implementation under test; disposed afterwards when it implements
        /// <see cref="IDisposable"/>.
        /// </param>
        public void Test1(ICrawlerHistory crawlerHistory)
        {
            try
            {
                Assert.NotNull(crawlerHistory);
                Assert.AreEqual(0, crawlerHistory.RegisteredCount);
            }
            finally
            {
                // Dispose even when an assertion above throws, so a failing test
                // does not leave the history undisposed.
                IDisposable disposable = crawlerHistory as IDisposable;
                if (disposable != null)
                {
                    disposable.Dispose();
                }
            }
        }
Example #3
0
        /// <summary>
        /// Checks that the given crawler history exists and that its registered
        /// count is zero.
        /// </summary>
        /// <param name="crawlerHistory">
        /// History implementation under test; disposed at the end when it implements
        /// <see cref="IDisposable"/>.
        /// </param>
        public void Test1(ICrawlerHistory crawlerHistory)
        {
            try
            {
                Assert.NotNull(crawlerHistory);
                Assert.AreEqual(0, crawlerHistory.RegisteredCount);
            }
            finally
            {
                // The finally block guarantees disposal on assertion failure too.
                IDisposable disposable = crawlerHistory as IDisposable;
                if (disposable != null)
                {
                    disposable.Dispose();
                }
            }
        }
Example #4
0
        /// <summary>
        /// Verifies that registering the two distinct keys "123" and "1234" both
        /// return true.
        /// </summary>
        /// <param name="crawlerHistory">
        /// History implementation under test; disposed afterwards when it implements
        /// <see cref="IDisposable"/>.
        /// </param>
        public void Test4(ICrawlerHistory crawlerHistory)
        {
            try
            {
                Assert.NotNull(crawlerHistory);
                Assert.IsTrue(crawlerHistory.Register("123"));
                Assert.IsTrue(crawlerHistory.Register("1234"));
            }
            finally
            {
                // Dispose even when an assertion above throws, so a failing test
                // does not leave the history undisposed.
                IDisposable disposable = crawlerHistory as IDisposable;
                if (disposable != null)
                {
                    disposable.Dispose();
                }
            }
        }
Example #5
0
        /// <summary>
        /// Checks that "123" and "1234" are each accepted when registered for the
        /// first time in this test.
        /// </summary>
        /// <param name="crawlerHistory">
        /// History implementation under test; disposed at the end when it implements
        /// <see cref="IDisposable"/>.
        /// </param>
        public void Test4(ICrawlerHistory crawlerHistory)
        {
            try
            {
                Assert.NotNull(crawlerHistory);
                Assert.IsTrue(crawlerHistory.Register("123"));
                Assert.IsTrue(crawlerHistory.Register("1234"));
            }
            finally
            {
                // The finally block guarantees disposal on assertion failure too.
                IDisposable disposable = crawlerHistory as IDisposable;
                if (disposable != null)
                {
                    disposable.Dispose();
                }
            }
        }
Example #6
0
        /// <summary>
        /// For every URL produced by the pattern generator, verifies that the first
        /// registration is accepted, an immediate second registration is rejected,
        /// and the visited count equals the number of URLs processed so far.
        /// </summary>
        /// <param name="crawlerHistory">History implementation under test.</param>
        public void Test6(ICrawlerHistory crawlerHistory)
        {
            Assert.NotNull(crawlerHistory);

            const string urlPattern = "http://ncrawler[a,b,c,d,e,f].codeplex.com/view[0-10].aspx?param1=[a-c]&param2=[D-F]";
            var generator = new StringPatternGenerator(urlPattern);

            int expectedVisited = 0;
            foreach (string generatedUrl in generator)
            {
                var firstAttempt = crawlerHistory.Register(generatedUrl);
                Assert.IsTrue(firstAttempt);

                var secondAttempt = crawlerHistory.Register(generatedUrl);
                Assert.IsFalse(secondAttempt);

                expectedVisited += 1;
                Assert.AreEqual(expectedVisited, crawlerHistory.VisitedCount);
            }
        }
Example #7
0
        /// <summary>
        /// Walks every URL generated from the pattern and checks three things per
        /// URL: registering it returns true, registering it again returns false, and
        /// the visited count matches the running number of URLs seen.
        /// </summary>
        /// <param name="crawlerHistory">History implementation under test.</param>
        public void Test6(ICrawlerHistory crawlerHistory)
        {
            Assert.NotNull(crawlerHistory);

            var urls = new StringPatternGenerator("http://ncrawler[a,b,c,d,e,f].codeplex.com/view[0-10].aspx?param1=[a-c]&param2=[D-F]");
            int seen = 0;

            foreach (string candidate in urls)
            {
                // First registration of a URL is expected to succeed ...
                var acceptedFirst = crawlerHistory.Register(candidate);
                Assert.IsTrue(acceptedFirst);

                // ... and a repeat of the same URL is expected to be refused.
                var acceptedRepeat = crawlerHistory.Register(candidate);
                Assert.IsFalse(acceptedRepeat);

                ++seen;
                Assert.AreEqual(seen, crawlerHistory.VisitedCount);
            }
        }
Example #8
0
        /// <summary>
        /// Registers the numbers 0 to 9 as strings, checks that registering them a
        /// second time returns false, and then checks that the numbers 10 to 19
        /// return true.
        /// </summary>
        /// <param name="crawlerHistory">History implementation under test.</param>
        public void Test5(ICrawlerHistory crawlerHistory)
        {
            Assert.NotNull(crawlerHistory);

            // Seed the history; the result of this first pass is not asserted.
            int seed = 0;
            while (seed < 10)
            {
                crawlerHistory.Register(seed.ToString());
                seed++;
            }

            // Keys that were just seeded are expected to be rejected.
            for (int repeat = 0; repeat < 10; repeat++)
            {
                var acceptedAgain = crawlerHistory.Register(repeat.ToString());
                Assert.IsFalse(acceptedAgain);
            }

            // Keys not registered by this test so far are expected to be accepted.
            for (int fresh = 10; fresh < 20; fresh++)
            {
                var accepted = crawlerHistory.Register(fresh.ToString());
                Assert.IsTrue(accepted);
            }
        }
Example #9
0
        /// <summary>
        /// Exercises duplicate detection: the string forms of 0 to 9 are registered
        /// once without assertion, must then be refused on a second attempt, while
        /// the string forms of 10 to 19 must be accepted.
        /// </summary>
        /// <param name="crawlerHistory">History implementation under test.</param>
        public void Test5(ICrawlerHistory crawlerHistory)
        {
            Assert.NotNull(crawlerHistory);

            // Pass 1: populate, outcome ignored.
            for (int n = 0; n != 10; ++n)
            {
                string key = n.ToString();
                crawlerHistory.Register(key);
            }

            // Pass 2: the same keys again, each expected to return false.
            for (int n = 0; n != 10; ++n)
            {
                string key = n.ToString();
                Assert.IsFalse(crawlerHistory.Register(key));
            }

            // Pass 3: new keys, each expected to return true.
            for (int n = 10; n != 20; ++n)
            {
                string key = n.ToString();
                Assert.IsTrue(crawlerHistory.Register(key));
            }
        }
Example #10
0
        /// <summary>
        /// Start crawl process. Resolves the crawl services, adds the base URI as
        /// the first step, and waits for the completion event unless the crawl has
        /// already been stopped.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// Thrown when a crawl is already running on this instance.
        /// </exception>
        public virtual void Crawl()
        {
            if (m_Crawling)
            {
                throw new InvalidOperationException("Crawler already running");
            }

            Parameter[] parameters = new Parameter[]
            {
                new TypedParameter(typeof(Uri), m_BaseUri),
                new NamedParameter("crawlStart", m_BaseUri),
                new NamedParameter("resume", false),
                new NamedParameter("crawler", this),
            };
            m_CrawlerQueue   = m_LifetimeScope.Resolve <ICrawlerQueue>(parameters);
            m_CrawlerHistory = m_LifetimeScope.Resolve <ICrawlerHistory>(parameters);

            // A DummyRobot stands in when robot rules are not being adhered to.
            m_Robot          = AdhereToRobotRules ? m_LifetimeScope.Resolve <IRobot>(parameters) : new DummyRobot();
            m_TaskRunner     = m_LifetimeScope.Resolve <ITaskRunner>(parameters);
            m_Logger         = m_LifetimeScope.Resolve <ILog>(parameters);

            m_Logger.Verbose("Crawl started @ {0}", m_BaseUri);
            using (m_CrawlCompleteEvent = new ManualResetEvent(false))
            {
                m_Crawling = true;
                m_Runtime  = Stopwatch.StartNew();
                try
                {
                    AddStep(m_BaseUri, 0);
                    if (!m_CrawlStopped)
                    {
                        m_CrawlCompleteEvent.WaitOne();
                    }
                }
                finally
                {
                    // Always reset the running state: m_Crawling is the re-entry
                    // guard above, so leaving it set after an exception would make
                    // every later call fail with "Crawler already running".
                    m_Runtime.Stop();
                    m_Crawling = false;
                }
            }

            if (m_Cancelled)
            {
                OnCancelled();
            }

            m_Logger.Verbose("Crawl ended @ {0} in {1}", m_BaseUri, m_Runtime.Elapsed);
            OnCrawlFinished();
        }
Example #11
0
        /// <summary>
        /// Registers the numbers 0 to 9 as strings, verifies that registering them
        /// again returns false, and verifies that the numbers 10 to 19 return true.
        /// </summary>
        /// <param name="crawlerHistory">
        /// History implementation under test; disposed afterwards when it implements
        /// <see cref="IDisposable"/>.
        /// </param>
        public void Test5(ICrawlerHistory crawlerHistory)
        {
            try
            {
                Assert.NotNull(crawlerHistory);

                // Seed the history; the result of this first pass is not asserted.
                for (int i = 0; i < 10; i++)
                {
                    crawlerHistory.Register(i.ToString());
                }

                for (int i = 0; i < 10; i++)
                {
                    Assert.IsFalse(crawlerHistory.Register(i.ToString()));
                }

                for (int i = 10; i < 20; i++)
                {
                    Assert.IsTrue(crawlerHistory.Register(i.ToString()));
                }
            }
            finally
            {
                // Dispose even when an assertion above throws, so a failing test
                // does not leave the history undisposed.
                IDisposable disposable = crawlerHistory as IDisposable;
                if (disposable != null)
                {
                    disposable.Dispose();
                }
            }
        }
Example #12
0
        /// <summary>
        /// Exercises duplicate detection: the string forms of 0 to 9 are registered
        /// once, must be refused on a second attempt, and the string forms of 10 to
        /// 19 must be accepted.
        /// </summary>
        /// <param name="crawlerHistory">
        /// History implementation under test; disposed at the end when it implements
        /// <see cref="IDisposable"/>.
        /// </param>
        public void Test5(ICrawlerHistory crawlerHistory)
        {
            try
            {
                Assert.NotNull(crawlerHistory);

                // First pass only populates the history; its result is ignored.
                for (int i = 0; i < 10; i++)
                {
                    crawlerHistory.Register(i.ToString());
                }

                for (int i = 0; i < 10; i++)
                {
                    Assert.IsFalse(crawlerHistory.Register(i.ToString()));
                }

                for (int i = 10; i < 20; i++)
                {
                    Assert.IsTrue(crawlerHistory.Register(i.ToString()));
                }
            }
            finally
            {
                // The finally block guarantees disposal on assertion failure too.
                IDisposable disposable = crawlerHistory as IDisposable;
                if (disposable != null)
                {
                    disposable.Dispose();
                }
            }
        }
Example #13
0
 /// <summary>
 /// Creates the rules object, forwarding crawler, robot and base URI to the base
 /// class and keeping a reference to the crawler history.
 /// </summary>
 /// <param name="crawler">Crawler passed to the base constructor.</param>
 /// <param name="robot">Robot passed to the base constructor.</param>
 /// <param name="baseUri">Base URI passed to the base constructor.</param>
 /// <param name="crawlerHistory">History instance stored on this object.</param>
 public CustomCrawlerRules(
     Crawler crawler,
     IRobot robot,
     Uri baseUri,
     ICrawlerHistory crawlerHistory)
     : base(crawler, robot, baseUri)
 {
     this.m_CrawlerHistory = crawlerHistory;
 }
Example #14
0
        /// <summary>
        /// Start crawl process. Loads the contents of "OriginalWebSite.txt" from the
        /// application base directory into the cache, resolves the crawl services,
        /// seeds or resumes the queue, and waits for the completion event unless the
        /// crawl has already been stopped.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// Thrown when this crawler instance has already been used for a crawl.
        /// </exception>
        public virtual void Crawl()
        {
            string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;

            // Path.Combine produces a valid path whether or not the base directory
            // ends with a directory separator; plain concatenation did not.
            string sourcePath = Path.Combine(baseDirectory, "OriginalWebSite.txt");

            // The cache key is kept exactly as before, because other code may look
            // the entry up under this key.
            string cacheKey = baseDirectory + "OriginalWebSite";

            string jsonStr;
            using (var stream = new StreamReader(sourcePath, Encoding.UTF8))
            {
                jsonStr = stream.ReadToEnd();
            }

            var policy = new CacheItemPolicy
            {
                Priority = CacheItemPriority.NotRemovable,
                AbsoluteExpiration = DateTimeOffset.Now.AddDays(1),
            };
            cache.Set(cacheKey, jsonStr, policy);
            Console.WriteLine("cache --" + baseDirectory + " :" + cache.Get(cacheKey));

            if (m_OnlyOneCrawlPerInstance)
            {
                throw new InvalidOperationException("Crawler instance cannot be reused");
            }

            m_OnlyOneCrawlPerInstance = true;

            Parameter[] parameters = new Parameter[]
                {
                    new TypedParameter(typeof (Uri), m_BaseUri),
                    new NamedParameter("crawlStart", m_BaseUri),
                    new TypedParameter(typeof (Crawler), this),
                };

            // Resolution order matters: each resolved service is appended to the
            // parameter list so it is available to the services resolved after it.
            m_CrawlerQueue = m_LifetimeScope.Resolve<ICrawlerQueue>(parameters);
            parameters = parameters.AddToEnd(new TypedParameter(typeof(ICrawlerQueue), m_CrawlerQueue)).ToArray();
            m_CrawlerHistory = m_LifetimeScope.Resolve<ICrawlerHistory>(parameters);
            parameters = parameters.AddToEnd(new TypedParameter(typeof(ICrawlerHistory), m_CrawlerHistory)).ToArray();
            m_TaskRunner = m_LifetimeScope.Resolve<ITaskRunner>(parameters);
            parameters = parameters.AddToEnd(new TypedParameter(typeof(ITaskRunner), m_TaskRunner)).ToArray();
            m_Logger = m_LifetimeScope.Resolve<ILog>(parameters);
            parameters = parameters.AddToEnd(new TypedParameter(typeof(ILog), m_Logger)).ToArray();
            m_CrawlerRules = m_LifetimeScope.Resolve<ICrawlerRules>(parameters);
            m_Logger.Verbose("Crawl started @ {0}", m_BaseUri);
            m_WebDownloaderFactory = m_LifetimeScope.Resolve<Func<IWebDownloader>>();
            using (m_CrawlCompleteEvent = new ManualResetEvent(false))
            {
                m_Crawling = true;
                m_Runtime = Stopwatch.StartNew();
                try
                {
                    if (m_CrawlerQueue.Count > 0)
                    {
                        // Resume enabled
                        ProcessQueue();
                    }
                    else
                    {
                        AddStep(m_BaseUri, 0);
                    }

                    if (!m_CrawlStopped)
                    {
                        m_CrawlCompleteEvent.WaitOne();
                    }
                }
                finally
                {
                    // Reset the running state even when a step throws, so the
                    // stopwatch and the m_Crawling flag do not stay active.
                    m_Runtime.Stop();
                    m_Crawling = false;
                }
            }

            if (m_Cancelled)
            {
                OnCancelled();
            }

            m_Logger.Verbose("Crawl ended @ {0} in {1}", m_BaseUri, m_Runtime.Elapsed);
            OnCrawlFinished();
        }
Example #15
0
 /// <summary>
 /// Initializes the rules by passing crawler, robot and base URI to the base
 /// class and storing the supplied crawler history.
 /// </summary>
 /// <param name="crawler">Crawler passed to the base constructor.</param>
 /// <param name="robot">Robot passed to the base constructor.</param>
 /// <param name="baseUri">Base URI passed to the base constructor.</param>
 /// <param name="crawlerHistory">History instance stored on this object.</param>
 public CustomCrawlerRules(
     Crawler crawler,
     IRobot robot,
     Uri baseUri,
     ICrawlerHistory crawlerHistory)
     : base(crawler, robot, baseUri)
 {
     m_CrawlerHistory = crawlerHistory;
 }
Example #16
0
 /// <summary>
 /// Verifies that registering the two distinct keys "123" and "1234" both
 /// return true.
 /// </summary>
 /// <param name="crawlerHistory">History implementation under test.</param>
 public void Test4(ICrawlerHistory crawlerHistory)
 {
     Assert.NotNull(crawlerHistory);

     var shortKeyAccepted = crawlerHistory.Register("123");
     Assert.IsTrue(shortKeyAccepted);

     var longKeyAccepted = crawlerHistory.Register("1234");
     Assert.IsTrue(longKeyAccepted);
 }
Example #17
0
        /// <summary>
        /// Start crawl process. Resolves the crawl services, adds the base URI as
        /// the first step, and waits for the completion event unless the crawl has
        /// already been stopped.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// Thrown when a crawl is already running on this instance.
        /// </exception>
        public virtual void Crawl()
        {
            if (m_Crawling)
            {
                throw new InvalidOperationException("Crawler already running");
            }

            Parameter[] parameters = new Parameter[]
                {
                    new TypedParameter(typeof (Uri), m_BaseUri),
                    new NamedParameter("crawlStart", m_BaseUri),
                    new NamedParameter("resume", false),
                    new NamedParameter("crawler", this),
                };
            m_CrawlerQueue = m_LifetimeScope.Resolve<ICrawlerQueue>(parameters);
            m_CrawlerHistory = m_LifetimeScope.Resolve<ICrawlerHistory>(parameters);

            // A DummyRobot stands in when robot rules are not being adhered to.
            m_Robot = AdhereToRobotRules ? m_LifetimeScope.Resolve<IRobot>(parameters) : new DummyRobot();
            m_TaskRunner = m_LifetimeScope.Resolve<ITaskRunner>(parameters);
            m_Logger = m_LifetimeScope.Resolve<ILog>(parameters);

            m_Logger.Verbose("Crawl started @ {0}", m_BaseUri);
            using (m_CrawlCompleteEvent = new ManualResetEvent(false))
            {
                m_Crawling = true;
                m_Runtime = Stopwatch.StartNew();
                try
                {
                    AddStep(m_BaseUri, 0);
                    if (!m_CrawlStopped)
                    {
                        m_CrawlCompleteEvent.WaitOne();
                    }
                }
                finally
                {
                    // Always reset the running state: m_Crawling is the re-entry
                    // guard above, so leaving it set after an exception would make
                    // every later call fail with "Crawler already running".
                    m_Runtime.Stop();
                    m_Crawling = false;
                }
            }

            if (m_Cancelled)
            {
                OnCancelled();
            }

            m_Logger.Verbose("Crawl ended @ {0} in {1}", m_BaseUri, m_Runtime.Elapsed);
            OnCrawlFinished();
        }
Example #18
0
 /// <summary>
 /// Checks that "123" and "1234" are each accepted when registered for the
 /// first time in this test.
 /// </summary>
 /// <param name="crawlerHistory">History implementation under test.</param>
 public void Test4(ICrawlerHistory crawlerHistory)
 {
     Assert.NotNull(crawlerHistory);

     // Register and assert one key at a time, in the original order.
     var firstResult = crawlerHistory.Register("123");
     Assert.IsTrue(firstResult);

     var secondResult = crawlerHistory.Register("1234");
     Assert.IsTrue(secondResult);
 }
Example #19
0
 /// <summary>
 /// Verifies that the visited count is 1 after a single key has been
 /// registered.
 /// </summary>
 /// <param name="crawlerHistory">History implementation under test.</param>
 public void Test2(ICrawlerHistory crawlerHistory)
 {
     Assert.NotNull(crawlerHistory);

     // The return value of Register is deliberately not checked here.
     crawlerHistory.Register("123");

     var visited = crawlerHistory.VisitedCount;
     Assert.AreEqual(1, visited);
 }
Example #20
0
 /// <summary>
 /// Verifies that the supplied crawler history is not null and reports a
 /// visited count of zero.
 /// </summary>
 /// <param name="crawlerHistory">History implementation under test.</param>
 public void Test1(ICrawlerHistory crawlerHistory)
 {
     Assert.NotNull(crawlerHistory);

     var visited = crawlerHistory.VisitedCount;
     Assert.AreEqual(0, visited);
 }
Example #21
0
 /// <summary>
 /// Registers the key "123" and checks that the history then reports exactly
 /// one visited entry.
 /// </summary>
 /// <param name="crawlerHistory">History implementation under test.</param>
 public void Test2(ICrawlerHistory crawlerHistory)
 {
     Assert.NotNull(crawlerHistory);

     crawlerHistory.Register("123");

     // Read the counter once, then compare it with the expected value.
     var countAfterRegister = crawlerHistory.VisitedCount;
     Assert.AreEqual(1, countAfterRegister);
 }
Example #22
0
        /// <summary>
        /// Runs one crawl starting at the base URI: resolves the crawl services,
        /// seeds or resumes the queue, and waits for the completion event unless the
        /// crawl has already been stopped.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// Thrown when this crawler instance has already been used for a crawl.
        /// </exception>
        public virtual void Crawl()
        {
            // An instance may only ever run a single crawl.
            if (this.m_OnlyOneCrawlPerInstance)
            {
                throw new InvalidOperationException("Crawler instance cannot be reused");
            }

            this.m_OnlyOneCrawlPerInstance = true;

            Parameter[] resolveArgs =
            {
                new TypedParameter(typeof(Uri), this.m_BaseUri),
                new NamedParameter("crawlStart", this.m_BaseUri),
                new TypedParameter(typeof(Crawler), this),
            };

            // Resolution order matters: each resolved service is appended to the
            // argument list so it is available to the services resolved after it.
            this.m_CrawlerQueue = this.m_LifetimeScope.Resolve<ICrawlerQueue>(resolveArgs);
            resolveArgs = resolveArgs.AddToEnd(new TypedParameter(typeof(ICrawlerQueue), this.m_CrawlerQueue)).ToArray();

            this.m_CrawlerHistory = this.m_LifetimeScope.Resolve<ICrawlerHistory>(resolveArgs);
            resolveArgs = resolveArgs.AddToEnd(new TypedParameter(typeof(ICrawlerHistory), this.m_CrawlerHistory)).ToArray();

            this.m_TaskRunner = this.m_LifetimeScope.Resolve<ITaskRunner>(resolveArgs);
            resolveArgs = resolveArgs.AddToEnd(new TypedParameter(typeof(ITaskRunner), this.m_TaskRunner)).ToArray();

            this.m_Logger = this.m_LifetimeScope.Resolve<ILog>(resolveArgs);
            resolveArgs = resolveArgs.AddToEnd(new TypedParameter(typeof(ILog), this.m_Logger)).ToArray();

            this.m_CrawlerRules = this.m_LifetimeScope.Resolve<ICrawlerRules>(resolveArgs);

            this.m_Logger.Verbose("Crawl started @ {0}", this.m_BaseUri);
            this.m_WebDownloaderFactory = this.m_LifetimeScope.Resolve<Func<IWebDownloader>>();

            using (this.m_CrawlCompleteEvent = new ManualResetEvent(false))
            {
                this.m_Crawling = true;
                this.m_Runtime = Stopwatch.StartNew();

                if (this.m_CrawlerQueue.Count <= 0)
                {
                    // Nothing queued: start from the base URI at depth 0.
                    AddStep(this.m_BaseUri, 0);
                }
                else
                {
                    // Resume enabled
                    ProcessQueue();
                }

                // Block until completion is signalled, unless already stopped.
                if (!this.m_CrawlStopped)
                {
                    this.m_CrawlCompleteEvent.WaitOne();
                }

                this.m_Runtime.Stop();
                this.m_Crawling = false;
            }

            if (this.m_Cancelled)
            {
                OnCancelled();
            }

            this.m_Logger.Verbose("Crawl ended @ {0} in {1}", this.m_BaseUri, this.m_Runtime.Elapsed);
            OnCrawlFinished();
        }
Example #23
0
 /// <summary>
 /// Checks that the given crawler history exists and that its visited count
 /// is zero.
 /// </summary>
 /// <param name="crawlerHistory">History implementation under test.</param>
 public void Test1(ICrawlerHistory crawlerHistory)
 {
     Assert.NotNull(crawlerHistory);

     // Read the counter once, then compare it with the expected value.
     var initialCount = crawlerHistory.VisitedCount;
     Assert.AreEqual(0, initialCount);
 }