Example 1
        /// <summary>
        /// Start crawl process
        /// </summary>
        public virtual async Task CrawlAsync()
        {
            if (this.m_OnlyOneCrawlPerInstance)
            {
                throw new InvalidOperationException("Crawler instance cannot be reused");
            }

            this.m_OnlyOneCrawlPerInstance = true;

            var parameters = new Parameter[]
            {
                new TypedParameter(typeof(Uri), this.m_BaseUri),
                new NamedParameter("crawlStart", this.m_BaseUri),
                new TypedParameter(typeof(Crawler), this),
            };

            this.m_CrawlerQueue   = this.m_LifetimeScope.Resolve<ICrawlerQueue>(parameters);
            parameters            = parameters.AddToEnd(new TypedParameter(typeof(ICrawlerQueue), this.m_CrawlerQueue)).ToArray();
            this.m_CrawlerHistory = this.m_LifetimeScope.Resolve<ICrawlerHistory>(parameters);
            parameters            = parameters.AddToEnd(new TypedParameter(typeof(ICrawlerHistory), this.m_CrawlerHistory)).ToArray();
            this.m_TaskRunner     = this.m_LifetimeScope.Resolve<ITaskRunner>(parameters);
            parameters            = parameters.AddToEnd(new TypedParameter(typeof(ITaskRunner), this.m_TaskRunner)).ToArray();
            this.m_Logger         = this.m_LifetimeScope.Resolve<ILog>(parameters);
            parameters            = parameters.AddToEnd(new TypedParameter(typeof(ILog), this.m_Logger)).ToArray();
            this.m_CrawlerRules   = this.m_LifetimeScope.Resolve<ICrawlerRules>(parameters);
            this.m_Logger.Verbose("Crawl started @ {0}", this.m_BaseUri);
            this.m_WebDownloaderFactory = this.m_LifetimeScope.Resolve<Func<IWebDownloader>>();
            using (this.m_CrawlCompleteEvent = new ManualResetEvent(false))
            {
                this.m_Crawling = true;
                this.m_Runtime  = Stopwatch.StartNew();

                if (this.m_CrawlerQueue.Count > 0)
                {
                    // Resume enabled
                    ProcessQueue();
                }
                else
                {
                    await this.AddStepAsync(this.m_BaseUri, 0);
                }

                if (!this.m_CrawlStopped)
                {
                    this.m_CrawlCompleteEvent.WaitOne();
                }

                this.m_Runtime.Stop();
                this.m_Crawling = false;
            }

            if (this.m_Cancelled)
            {
                OnCancelled();
            }

            this.m_Logger.Verbose("Crawl ended @ {0} in {1}", this.m_BaseUri, this.m_Runtime.Elapsed);
            OnCrawlFinished();
        }
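
AddToEnd is not a BCL method; from its call sites above (parameters.AddToEnd(x).ToArray()) it evidently appends one Parameter to the existing sequence, so each newly resolved service becomes available as a typed parameter for the next Resolve call. A minimal sketch of such an extension, which is an assumption — the library's actual helper may differ:

        using System.Collections.Generic;

        public static class EnumerableExtensions
        {
            // Hypothetical sketch of AddToEnd inferred from its usage above:
            // yield the existing elements, then the new item.
            public static IEnumerable<T> AddToEnd<T>(this IEnumerable<T> source, T item)
            {
                foreach (var element in source)
                {
                    yield return element;
                }
                yield return item;
            }
        }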
Example 2
        public ImageCrawler(ICrawlerRules<HtmlDocument> rules, ICrawlerProcesser<HtmlDocument> processer, int threads)
        {
            // Assign the rules the crawler uses to validate
            // URLs and pages before crawling them.
            _rules = rules;

            // Assign the processer the crawler calls to handle
            // pages that the rules deem valid.
            _processer = processer;

            // Start a long-running task for crawling. It blocks
            // until URLs are added to the BlockingCollection.
            Task.Factory.StartNew(() => Crawl(threads), TaskCreationOptions.LongRunning);
        }
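
The constructor only starts the worker; the Crawl method it references is not shown. A minimal sketch of what such a worker loop might look like, assuming a BlockingCollection<string> field named _urls and leaving the download/validate/process details to the _rules and _processer collaborators — the field name and the method body are assumptions, not the library's code (requires System.Collections.Concurrent and System.Threading.Tasks):

        // Hypothetical: only the constructor appears above, so this field
        // and the Crawl body below are assumptions.
        private readonly BlockingCollection<string> _urls = new BlockingCollection<string>();

        private void Crawl(int threads)
        {
            var workers = new Task[threads];
            for (int i = 0; i < threads; i++)
            {
                workers[i] = Task.Run(() =>
                {
                    // GetConsumingEnumerable blocks while the collection is
                    // empty, which is why the task waits until URLs arrive.
                    foreach (var url in _urls.GetConsumingEnumerable())
                    {
                        // Download the page, validate it with _rules, then
                        // hand the resulting HtmlDocument to _processer.
                    }
                });
            }
            Task.WaitAll(workers);
        }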
Example 3
        /// <summary>
        /// Start crawl process
        /// </summary>
        public virtual void Crawl()
        {
            if (m_OnlyOneCrawlPerInstance)
            {
                throw new InvalidOperationException("Crawler instance cannot be reused");
            }

            m_OnlyOneCrawlPerInstance = true;

            Parameter[] parameters = new Parameter[]
                {
                    new TypedParameter(typeof(Uri), m_BaseUri),
                    new NamedParameter("crawlStart", m_BaseUri),
                    new TypedParameter(typeof(Crawler), this),
                };
            m_CrawlerQueue = m_LifetimeScope.Resolve<ICrawlerQueue>(parameters);
            parameters = parameters.AddToEnd(new TypedParameter(typeof(ICrawlerQueue), m_CrawlerQueue)).ToArray();
            m_CrawlerHistory = m_LifetimeScope.Resolve<ICrawlerHistory>(parameters);
            parameters = parameters.AddToEnd(new TypedParameter(typeof(ICrawlerHistory), m_CrawlerHistory)).ToArray();
            m_TaskRunner = m_LifetimeScope.Resolve<ITaskRunner>(parameters);
            parameters = parameters.AddToEnd(new TypedParameter(typeof(ITaskRunner), m_TaskRunner)).ToArray();
            m_Logger = m_LifetimeScope.Resolve<ILog>(parameters);
            parameters = parameters.AddToEnd(new TypedParameter(typeof(ILog), m_Logger)).ToArray();
            m_CrawlerRules = m_LifetimeScope.Resolve<ICrawlerRules>(parameters);
            m_Logger.Verbose("Crawl started @ {0}", m_BaseUri);
            m_WebDownloaderFactory = m_LifetimeScope.Resolve<Func<IWebDownloader>>();
            using (m_CrawlCompleteEvent = new ManualResetEvent(false))
            {
                m_Crawling = true;
                m_Runtime = Stopwatch.StartNew();

                if (m_CrawlerQueue.Count > 0)
                {
                    // Resume enabled
                    ProcessQueue();
                }
                else
                {
                    AddStep(m_BaseUri, 0);
                }

                if (!m_CrawlStopped)
                {
                    m_CrawlCompleteEvent.WaitOne();
                }

                m_Runtime.Stop();
                m_Crawling = false;
            }

            if (m_Cancelled)
            {
                OnCancelled();
            }

            m_Logger.Verbose("Crawl ended @ {0} in {1}", m_BaseUri, m_Runtime.Elapsed);
            OnCrawlFinished();
        }
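
The chain of Resolve calls assumes an Autofac lifetime scope in which each service interface has a registration. A minimal container setup that would satisfy them — the As<...>() interfaces are the ones resolved above, but the concrete class names here are placeholders, not the library's real types:

        // Hypothetical Autofac registration; concrete types are placeholders.
        var builder = new ContainerBuilder();
        builder.RegisterType<InMemoryCrawlerQueue>().As<ICrawlerQueue>();
        builder.RegisterType<InMemoryCrawlerHistory>().As<ICrawlerHistory>();
        builder.RegisterType<ThreadPoolTaskRunner>().As<ITaskRunner>();
        builder.RegisterType<ConsoleLog>().As<ILog>();
        builder.RegisterType<DefaultCrawlerRules>().As<ICrawlerRules>();
        builder.RegisterType<WebDownloader>().As<IWebDownloader>();
        var container = builder.Build();

Each Resolve<T>(parameters) call can then pick up the Uri, the Crawler instance, and previously resolved services by type; Autofac supplies Func<IWebDownloader> automatically from the IWebDownloader registration, which is what the m_WebDownloaderFactory line relies on.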
Example 4
        /// <summary>
        /// Start crawl process
        /// </summary>
        public virtual void Crawl()
        {
            // Prime the in-memory cache with the original site's contents so
            // later lookups do not re-read the file. The entry cannot be
            // evicted and expires after one day.
            using (var stream = new StreamReader(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite.txt", Encoding.UTF8))
            {
                var jsonStr = stream.ReadToEnd();
                var policy = new CacheItemPolicy
                {
                    Priority = CacheItemPriority.NotRemovable,
                    AbsoluteExpiration = DateTimeOffset.Now.AddDays(1),
                };
                cache.Set(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite", jsonStr, policy);
                Console.WriteLine("cache --" + AppDomain.CurrentDomain.BaseDirectory + " :" + cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite"));
            }
            
            if (m_OnlyOneCrawlPerInstance)
            {
                throw new InvalidOperationException("Crawler instance cannot be reused");
            }

            m_OnlyOneCrawlPerInstance = true;

            Parameter[] parameters = new Parameter[]
                {
                    new TypedParameter(typeof(Uri), m_BaseUri),
                    new NamedParameter("crawlStart", m_BaseUri),
                    new TypedParameter(typeof(Crawler), this),
                };
            m_CrawlerQueue = m_LifetimeScope.Resolve<ICrawlerQueue>(parameters);
            parameters = parameters.AddToEnd(new TypedParameter(typeof(ICrawlerQueue), m_CrawlerQueue)).ToArray();
            m_CrawlerHistory = m_LifetimeScope.Resolve<ICrawlerHistory>(parameters);
            parameters = parameters.AddToEnd(new TypedParameter(typeof(ICrawlerHistory), m_CrawlerHistory)).ToArray();
            m_TaskRunner = m_LifetimeScope.Resolve<ITaskRunner>(parameters);
            parameters = parameters.AddToEnd(new TypedParameter(typeof(ITaskRunner), m_TaskRunner)).ToArray();
            m_Logger = m_LifetimeScope.Resolve<ILog>(parameters);
            parameters = parameters.AddToEnd(new TypedParameter(typeof(ILog), m_Logger)).ToArray();
            m_CrawlerRules = m_LifetimeScope.Resolve<ICrawlerRules>(parameters);
            m_Logger.Verbose("Crawl started @ {0}", m_BaseUri);
            m_WebDownloaderFactory = m_LifetimeScope.Resolve<Func<IWebDownloader>>();
            using (m_CrawlCompleteEvent = new ManualResetEvent(false))
            {
                m_Crawling = true;
                m_Runtime = Stopwatch.StartNew();

                if (m_CrawlerQueue.Count > 0)
                {
                    // Resume enabled
                    ProcessQueue();
                }
                else
                {
                    AddStep(m_BaseUri, 0);
                }

                if (!m_CrawlStopped)
                {
                    m_CrawlCompleteEvent.WaitOne();
                }

                m_Runtime.Stop();
                m_Crawling = false;
            }

            if (m_Cancelled)
            {
                OnCancelled();
            }

            m_Logger.Verbose("Crawl ended @ {0} in {1}", m_BaseUri, m_Runtime.Elapsed);
            OnCrawlFinished();
        }
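
The cache field used in the priming block is not declared in the snippet. Given the CacheItemPolicy and CacheItemPriority types in play, a common choice is MemoryCache.Default from System.Runtime.Caching — an assumption about this code, not something the snippet confirms:

        // At the top of the file:
        // using System.Runtime.Caching;

        // Hypothetical declaration of the cache field used above (not shown
        // in the snippet); MemoryCache.Default is consistent with the
        // CacheItemPolicy and CacheItemPriority types used in the method.
        private static readonly ObjectCache cache = MemoryCache.Default;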