/// <summary>
/// Starts the crawl process asynchronously. May be called at most once per
/// instance; a second call throws <see cref="InvalidOperationException"/>.
/// Resolves all collaborators from the lifetime scope, seeds the queue (or
/// resumes a non-empty one), then waits until the crawl completes.
/// </summary>
/// <exception cref="InvalidOperationException">Thrown when the instance has already crawled.</exception>
public virtual async Task CrawlAsync()
{
    if (this.m_OnlyOneCrawlPerInstance)
    {
        throw new InvalidOperationException("Crawler instance cannot be reused");
    }

    this.m_OnlyOneCrawlPerInstance = true;

    // Each service is resolved in turn and appended to the parameter list so
    // that later resolutions can receive the earlier services as dependencies.
    var parameters = new Parameter[]
    {
        new TypedParameter(typeof(Uri), this.m_BaseUri),
        new NamedParameter("crawlStart", this.m_BaseUri),
        new TypedParameter(typeof(Crawler), this),
    };
    this.m_CrawlerQueue = this.m_LifetimeScope.Resolve<ICrawlerQueue>(parameters);
    parameters = parameters.AddToEnd(new TypedParameter(typeof(ICrawlerQueue), this.m_CrawlerQueue)).ToArray();
    this.m_CrawlerHistory = this.m_LifetimeScope.Resolve<ICrawlerHistory>(parameters);
    parameters = parameters.AddToEnd(new TypedParameter(typeof(ICrawlerHistory), this.m_CrawlerHistory)).ToArray();
    this.m_TaskRunner = this.m_LifetimeScope.Resolve<ITaskRunner>(parameters);
    parameters = parameters.AddToEnd(new TypedParameter(typeof(ITaskRunner), this.m_TaskRunner)).ToArray();
    this.m_Logger = this.m_LifetimeScope.Resolve<ILog>(parameters);
    parameters = parameters.AddToEnd(new TypedParameter(typeof(ILog), this.m_Logger)).ToArray();
    this.m_CrawlerRules = this.m_LifetimeScope.Resolve<ICrawlerRules>(parameters);

    this.m_Logger.Verbose("Crawl started @ {0}", this.m_BaseUri);

    this.m_WebDownloaderFactory = this.m_LifetimeScope.Resolve<Func<IWebDownloader>>();

    using (this.m_CrawlCompleteEvent = new ManualResetEvent(false))
    {
        this.m_Crawling = true;
        this.m_Runtime = Stopwatch.StartNew();

        if (this.m_CrawlerQueue.Count > 0)
        {
            // Resume enabled: queue already contains work from a prior run.
            ProcessQueue();
        }
        else
        {
            await this.AddStepAsync(this.m_BaseUri, 0);
        }

        if (!this.m_CrawlStopped)
        {
            // FIX: the original called WaitOne() directly, blocking the caller's
            // thread for the whole crawl inside an async method. Offload the
            // blocking wait so this method actually yields to its caller.
            await Task.Run(() => this.m_CrawlCompleteEvent.WaitOne());
        }

        this.m_Runtime.Stop();
        this.m_Crawling = false;
    }

    if (this.m_Cancelled)
    {
        OnCancelled();
    }

    this.m_Logger.Verbose("Crawl ended @ {0} in {1}", this.m_BaseUri, this.m_Runtime.Elapsed);
    OnCrawlFinished();
}
/// <summary>
/// Creates an image crawler and immediately launches the crawl loop on a
/// dedicated long-running background task.
/// </summary>
/// <param name="rules">Rules used to decide which urls and pages are eligible for crawling.</param>
/// <param name="processer">Receives every page that the rules accepted.</param>
/// <param name="threads">Number of worker threads the crawl loop will use.</param>
public ImageCrawler(ICrawlerRules<HtmlDocument> rules, ICrawlerProcesser<HtmlDocument> processer, int threads)
{
    _rules = rules;
    _processer = processer;

    // Start crawling right away on its own long-running task; the loop
    // blocks on the BlockingCollection until urls are added.
    Task.Factory.StartNew(
        () => Crawl(threads),
        TaskCreationOptions.LongRunning);
}
/// <summary>
/// Starts the crawl process. May be called at most once per instance; a
/// second call throws <see cref="InvalidOperationException"/>. Resolves all
/// collaborators from the lifetime scope, seeds the queue (or resumes a
/// non-empty one), then blocks until the crawl completes.
/// </summary>
/// <exception cref="InvalidOperationException">Thrown when the instance has already crawled.</exception>
public virtual void Crawl()
{
    if (this.m_OnlyOneCrawlPerInstance)
    {
        throw new InvalidOperationException("Crawler instance cannot be reused");
    }

    this.m_OnlyOneCrawlPerInstance = true;

    // Services are resolved one at a time; each freshly resolved service is
    // appended to the parameter list so subsequent resolutions can depend on it.
    var parameters = new Parameter[]
    {
        new TypedParameter(typeof(Uri), this.m_BaseUri),
        new NamedParameter("crawlStart", this.m_BaseUri),
        new TypedParameter(typeof(Crawler), this),
    };

    this.m_CrawlerQueue = this.m_LifetimeScope.Resolve<ICrawlerQueue>(parameters);
    parameters = parameters.Concat(new Parameter[] { new TypedParameter(typeof(ICrawlerQueue), this.m_CrawlerQueue) }).ToArray();

    this.m_CrawlerHistory = this.m_LifetimeScope.Resolve<ICrawlerHistory>(parameters);
    parameters = parameters.Concat(new Parameter[] { new TypedParameter(typeof(ICrawlerHistory), this.m_CrawlerHistory) }).ToArray();

    this.m_TaskRunner = this.m_LifetimeScope.Resolve<ITaskRunner>(parameters);
    parameters = parameters.Concat(new Parameter[] { new TypedParameter(typeof(ITaskRunner), this.m_TaskRunner) }).ToArray();

    this.m_Logger = this.m_LifetimeScope.Resolve<ILog>(parameters);
    parameters = parameters.Concat(new Parameter[] { new TypedParameter(typeof(ILog), this.m_Logger) }).ToArray();

    this.m_CrawlerRules = this.m_LifetimeScope.Resolve<ICrawlerRules>(parameters);

    this.m_Logger.Verbose("Crawl started @ {0}", this.m_BaseUri);

    this.m_WebDownloaderFactory = this.m_LifetimeScope.Resolve<Func<IWebDownloader>>();

    using (this.m_CrawlCompleteEvent = new ManualResetEvent(false))
    {
        this.m_Crawling = true;
        this.m_Runtime = Stopwatch.StartNew();

        if (this.m_CrawlerQueue.Count > 0)
        {
            // Resume enabled: the queue still holds work from a previous run.
            ProcessQueue();
        }
        else
        {
            AddStep(this.m_BaseUri, 0);
        }

        if (!this.m_CrawlStopped)
        {
            this.m_CrawlCompleteEvent.WaitOne();
        }

        this.m_Runtime.Stop();
        this.m_Crawling = false;
    }

    if (this.m_Cancelled)
    {
        OnCancelled();
    }

    this.m_Logger.Verbose("Crawl ended @ {0} in {1}", this.m_BaseUri, this.m_Runtime.Elapsed);
    OnCrawlFinished();
}
/// <summary>
/// Starts the crawl process. Before crawling begins, the saved snapshot of the
/// original web site is loaded from disk and pinned in the cache for one day.
/// May be called at most once per instance; a second call throws
/// <see cref="InvalidOperationException"/>.
/// </summary>
/// <exception cref="InvalidOperationException">Thrown when the instance has already crawled.</exception>
public virtual void Crawl()
{
    // Prime the cache with the persisted site snapshot.
    // FIX: build the path with Path.Combine instead of string concatenation,
    // and let File.ReadAllText handle open/read/dispose (same UTF-8/BOM
    // semantics as the original StreamReader + ReadToEnd).
    var snapshotPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "OriginalWebSite.txt");
    var jsonStr = File.ReadAllText(snapshotPath, Encoding.UTF8);
    var policy = new CacheItemPolicy
    {
        Priority = CacheItemPriority.NotRemovable,
        AbsoluteExpiration = DateTimeOffset.Now.AddDays(1),
    };
    // NOTE: the cache key deliberately remains the concatenated form used elsewhere.
    cache.Set(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite", jsonStr, policy);
    Console.WriteLine("cache --" + AppDomain.CurrentDomain.BaseDirectory + " :" + cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite"));

    if (m_OnlyOneCrawlPerInstance)
    {
        throw new InvalidOperationException("Crawler instance cannot be reused");
    }

    m_OnlyOneCrawlPerInstance = true;

    // Each service is resolved in turn and appended to the parameter list so
    // that later resolutions can receive the earlier services as dependencies.
    Parameter[] parameters = new Parameter[]
    {
        new TypedParameter(typeof(Uri), m_BaseUri),
        new NamedParameter("crawlStart", m_BaseUri),
        new TypedParameter(typeof(Crawler), this),
    };
    m_CrawlerQueue = m_LifetimeScope.Resolve<ICrawlerQueue>(parameters);
    parameters = parameters.AddToEnd(new TypedParameter(typeof(ICrawlerQueue), m_CrawlerQueue)).ToArray();
    m_CrawlerHistory = m_LifetimeScope.Resolve<ICrawlerHistory>(parameters);
    parameters = parameters.AddToEnd(new TypedParameter(typeof(ICrawlerHistory), m_CrawlerHistory)).ToArray();
    m_TaskRunner = m_LifetimeScope.Resolve<ITaskRunner>(parameters);
    parameters = parameters.AddToEnd(new TypedParameter(typeof(ITaskRunner), m_TaskRunner)).ToArray();
    m_Logger = m_LifetimeScope.Resolve<ILog>(parameters);
    parameters = parameters.AddToEnd(new TypedParameter(typeof(ILog), m_Logger)).ToArray();
    m_CrawlerRules = m_LifetimeScope.Resolve<ICrawlerRules>(parameters);

    m_Logger.Verbose("Crawl started @ {0}", m_BaseUri);

    m_WebDownloaderFactory = m_LifetimeScope.Resolve<Func<IWebDownloader>>();

    using (m_CrawlCompleteEvent = new ManualResetEvent(false))
    {
        m_Crawling = true;
        m_Runtime = Stopwatch.StartNew();

        if (m_CrawlerQueue.Count > 0)
        {
            // Resume enabled: the queue still holds work from a previous run.
            ProcessQueue();
        }
        else
        {
            AddStep(m_BaseUri, 0);
        }

        if (!m_CrawlStopped)
        {
            m_CrawlCompleteEvent.WaitOne();
        }

        m_Runtime.Stop();
        m_Crawling = false;
    }

    if (m_Cancelled)
    {
        OnCancelled();
    }

    m_Logger.Verbose("Crawl ended @ {0} in {1}", m_BaseUri, m_Runtime.Elapsed);
    OnCrawlFinished();
}