/// <summary>
/// Start crawl process.
/// Single-use: a second call on the same instance throws
/// <see cref="InvalidOperationException"/>.
/// </summary>
public virtual async Task CrawlAsync()
{
    if (this.m_OnlyOneCrawlPerInstance)
    {
        throw new InvalidOperationException("Crawler instance cannot be reused");
    }

    this.m_OnlyOneCrawlPerInstance = true;

    // Resolve collaborators in dependency order; each resolved service is
    // appended to the parameter list so later services can receive it.
    var parameters = new Parameter[]
        {
            new TypedParameter(typeof(Uri), this.m_BaseUri),
            new NamedParameter("crawlStart", this.m_BaseUri),
            new TypedParameter(typeof(Crawler), this),
        };
    this.m_CrawlerQueue = this.m_LifetimeScope.Resolve<ICrawlerQueue>(parameters);
    parameters = parameters.AddToEnd(new TypedParameter(typeof(ICrawlerQueue), this.m_CrawlerQueue)).ToArray();
    this.m_CrawlerHistory = this.m_LifetimeScope.Resolve<ICrawlerHistory>(parameters);
    parameters = parameters.AddToEnd(new TypedParameter(typeof(ICrawlerHistory), this.m_CrawlerHistory)).ToArray();
    this.m_TaskRunner = this.m_LifetimeScope.Resolve<ITaskRunner>(parameters);
    parameters = parameters.AddToEnd(new TypedParameter(typeof(ITaskRunner), this.m_TaskRunner)).ToArray();
    this.m_Logger = this.m_LifetimeScope.Resolve<ILog>(parameters);
    parameters = parameters.AddToEnd(new TypedParameter(typeof(ILog), this.m_Logger)).ToArray();
    this.m_CrawlerRules = this.m_LifetimeScope.Resolve<ICrawlerRules>(parameters);

    this.m_Logger.Verbose("Crawl started @ {0}", this.m_BaseUri);
    this.m_WebDownloaderFactory = this.m_LifetimeScope.Resolve<Func<IWebDownloader>>();

    using (this.m_CrawlCompleteEvent = new ManualResetEvent(false))
    {
        this.m_Crawling = true;
        this.m_Runtime = Stopwatch.StartNew();

        if (this.m_CrawlerQueue.Count > 0)
        {
            // Resume enabled
            ProcessQueue();
        }
        else
        {
            await this.AddStepAsync(this.m_BaseUri, 0);
        }

        if (!this.m_CrawlStopped)
        {
            // BUGFIX: the original called WaitOne() directly, synchronously
            // blocking the async method's thread for the entire crawl.
            // Off-load the wait so the caller's thread is released while the
            // crawl runs; completion semantics are unchanged.
            await Task.Run(() => this.m_CrawlCompleteEvent.WaitOne());
        }

        this.m_Runtime.Stop();
        this.m_Crawling = false;
    }

    if (this.m_Cancelled)
    {
        OnCancelled();
    }

    this.m_Logger.Verbose("Crawl ended @ {0} in {1}", this.m_BaseUri, this.m_Runtime.Elapsed);
    OnCrawlFinished();
}
/// <summary>
/// A freshly supplied history must exist and contain no registered entries.
/// </summary>
public void Test1(ICrawlerHistory crawlerHistory)
{
    Assert.NotNull(crawlerHistory);
    Assert.AreEqual(0, crawlerHistory.RegisteredCount);

    // Release the history if the implementation holds disposable resources.
    var disposable = crawlerHistory as IDisposable;
    if (disposable != null)
    {
        disposable.Dispose();
    }
}
/// <summary>
/// Registering two distinct keys must succeed for both.
/// </summary>
public void Test4(ICrawlerHistory crawlerHistory)
{
    Assert.NotNull(crawlerHistory);

    // Two different keys: each first-time registration returns true.
    Assert.IsTrue(crawlerHistory.Register("123"));
    Assert.IsTrue(crawlerHistory.Register("1234"));

    // Release the history if the implementation holds disposable resources.
    var disposable = crawlerHistory as IDisposable;
    if (disposable != null)
    {
        disposable.Dispose();
    }
}
/// <summary>
/// Every generated URL registers exactly once; the visited count must track
/// the number of unique registrations.
/// </summary>
public void Test6(ICrawlerHistory crawlerHistory)
{
    Assert.NotNull(crawlerHistory);

    int count = 0;
    // BUGFIX: the pattern previously read "...[a-c]¶m2=[D-F]" — the "&para"
    // prefix of "&param2" had been collapsed into the pilcrow character (¶)
    // by an HTML-entity/encoding round trip, silently dropping the second
    // query parameter from the generated URL set. Restored to "&param2".
    foreach (string url in new StringPatternGenerator("http://ncrawler[a,b,c,d,e,f].codeplex.com/view[0-10].aspx?param1=[a-c]&param2=[D-F]"))
    {
        Assert.IsTrue(crawlerHistory.Register(url));   // first registration succeeds
        Assert.IsFalse(crawlerHistory.Register(url));  // duplicate is rejected
        count++;
        Assert.AreEqual(count, crawlerHistory.VisitedCount);
    }
}
/// <summary>
/// Re-registering a known key fails; registering a brand-new key succeeds.
/// </summary>
public void Test5(ICrawlerHistory crawlerHistory)
{
    Assert.NotNull(crawlerHistory);

    // Seed the history with keys "0" through "9".
    for (int seed = 0; seed < 10; seed++)
    {
        crawlerHistory.Register(seed.ToString());
    }

    // Duplicates of the seeded keys are rejected.
    for (int duplicate = 0; duplicate < 10; duplicate++)
    {
        Assert.IsFalse(crawlerHistory.Register(duplicate.ToString()));
    }

    // Keys "10" through "19" are new and therefore accepted.
    for (int fresh = 10; fresh < 20; fresh++)
    {
        Assert.IsTrue(crawlerHistory.Register(fresh.ToString()));
    }
}
/// <summary>
/// Start crawl process. Throws if a crawl is already running on this
/// instance; otherwise resolves the crawler services, seeds the queue with
/// the base URI, and blocks until the crawl completes or is stopped.
/// </summary>
public virtual void Crawl()
{
    if (this.m_Crawling)
    {
        throw new InvalidOperationException("Crawler already running");
    }

    // Constructor arguments shared by every service resolved below.
    var resolveArgs = new Parameter[]
        {
            new TypedParameter(typeof(Uri), this.m_BaseUri),
            new NamedParameter("crawlStart", this.m_BaseUri),
            new NamedParameter("resume", false),
            new NamedParameter("crawler", this),
        };

    this.m_CrawlerQueue = this.m_LifetimeScope.Resolve<ICrawlerQueue>(resolveArgs);
    this.m_CrawlerHistory = this.m_LifetimeScope.Resolve<ICrawlerHistory>(resolveArgs);

    // Robots.txt support is optional; DummyRobot allows everything.
    if (AdhereToRobotRules)
    {
        this.m_Robot = this.m_LifetimeScope.Resolve<IRobot>(resolveArgs);
    }
    else
    {
        this.m_Robot = new DummyRobot();
    }

    this.m_TaskRunner = this.m_LifetimeScope.Resolve<ITaskRunner>(resolveArgs);
    this.m_Logger = this.m_LifetimeScope.Resolve<ILog>(resolveArgs);
    this.m_Logger.Verbose("Crawl started @ {0}", this.m_BaseUri);

    using (this.m_CrawlCompleteEvent = new ManualResetEvent(false))
    {
        this.m_Crawling = true;
        this.m_Runtime = Stopwatch.StartNew();

        AddStep(this.m_BaseUri, 0);

        // Block until the queue drains, unless a stop was already requested.
        if (!this.m_CrawlStopped)
        {
            this.m_CrawlCompleteEvent.WaitOne();
        }

        this.m_Runtime.Stop();
        this.m_Crawling = false;
    }

    if (this.m_Cancelled)
    {
        OnCancelled();
    }

    this.m_Logger.Verbose("Crawl ended @ {0} in {1}", this.m_BaseUri, this.m_Runtime.Elapsed);
    OnCrawlFinished();
}
/// <summary>
/// Duplicate registrations are rejected while new keys are accepted; the
/// history is disposed afterwards when it supports disposal.
/// </summary>
public void Test5(ICrawlerHistory crawlerHistory)
{
    Assert.NotNull(crawlerHistory);

    // Seed keys "0".."9", then verify duplicates fail and new keys pass.
    for (int seed = 0; seed < 10; seed++)
    {
        crawlerHistory.Register(seed.ToString());
    }
    for (int duplicate = 0; duplicate < 10; duplicate++)
    {
        Assert.IsFalse(crawlerHistory.Register(duplicate.ToString()));
    }
    for (int fresh = 10; fresh < 20; fresh++)
    {
        Assert.IsTrue(crawlerHistory.Register(fresh.ToString()));
    }

    // Clean up disposable implementations.
    var disposable = crawlerHistory as IDisposable;
    if (disposable != null)
    {
        disposable.Dispose();
    }
}
/// <summary>
/// Initializes rules that additionally consult the crawler history.
/// </summary>
/// <param name="crawler">Owning crawler, forwarded to the base rules.</param>
/// <param name="robot">Robots.txt handler, forwarded to the base rules.</param>
/// <param name="baseUri">Crawl start URI, forwarded to the base rules.</param>
/// <param name="crawlerHistory">History used by this subclass's rule checks.</param>
public CustomCrawlerRules(Crawler crawler, IRobot robot, Uri baseUri, ICrawlerHistory crawlerHistory)
    : base(crawler, robot, baseUri)
{
    m_CrawlerHistory = crawlerHistory;
}
/// <summary>
/// Start crawl process
/// </summary>
public virtual void Crawl()
{
    // Pre-load the original web-site snapshot from disk into the memory
    // cache: non-evictable, expires one day from now. NOTE(review): the path
    // is built by string concatenation; assumes BaseDirectory ends with a
    // separator — confirm before changing.
    using (var stream = new StreamReader(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite.txt", Encoding.UTF8))
    {
        var jsonStr = stream.ReadToEnd();
        var policy = new CacheItemPolicy();
        policy.Priority = CacheItemPriority.NotRemovable;
        policy.AbsoluteExpiration = DateTimeOffset.Now.AddDays(1);
        cache.Set(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite", jsonStr, policy);
        Console.WriteLine("cache --" + AppDomain.CurrentDomain.BaseDirectory + " :" + cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite"));
    }

    // Single-use guard: one crawl per instance.
    if (m_OnlyOneCrawlPerInstance)
    {
        throw new InvalidOperationException("Crawler instance cannot be reused");
    }

    m_OnlyOneCrawlPerInstance = true;

    // Resolve collaborators in dependency order; each resolved service is
    // appended to the parameter list so later services can receive it.
    Parameter[] parameters = new Parameter[]
        {
            new TypedParameter(typeof (Uri), m_BaseUri),
            new NamedParameter("crawlStart", m_BaseUri),
            new TypedParameter(typeof (Crawler), this),
        };
    m_CrawlerQueue = m_LifetimeScope.Resolve<ICrawlerQueue>(parameters);
    parameters = parameters.AddToEnd(new TypedParameter(typeof(ICrawlerQueue), m_CrawlerQueue)).ToArray();
    m_CrawlerHistory = m_LifetimeScope.Resolve<ICrawlerHistory>(parameters);
    parameters = parameters.AddToEnd(new TypedParameter(typeof(ICrawlerHistory), m_CrawlerHistory)).ToArray();
    m_TaskRunner = m_LifetimeScope.Resolve<ITaskRunner>(parameters);
    parameters = parameters.AddToEnd(new TypedParameter(typeof(ITaskRunner), m_TaskRunner)).ToArray();
    m_Logger = m_LifetimeScope.Resolve<ILog>(parameters);
    parameters = parameters.AddToEnd(new TypedParameter(typeof(ILog), m_Logger)).ToArray();
    m_CrawlerRules = m_LifetimeScope.Resolve<ICrawlerRules>(parameters);

    m_Logger.Verbose("Crawl started @ {0}", m_BaseUri);
    m_WebDownloaderFactory = m_LifetimeScope.Resolve<Func<IWebDownloader>>();

    using (m_CrawlCompleteEvent = new ManualResetEvent(false))
    {
        m_Crawling = true;
        m_Runtime = Stopwatch.StartNew();

        // A non-empty queue means we are resuming a previous crawl.
        if (m_CrawlerQueue.Count > 0)
        {
            // Resume enabled
            ProcessQueue();
        }
        else
        {
            AddStep(m_BaseUri, 0);
        }

        // Block until the queue drains (event is set elsewhere) unless a
        // stop was requested before the crawl got going.
        if (!m_CrawlStopped)
        {
            m_CrawlCompleteEvent.WaitOne();
        }

        m_Runtime.Stop();
        m_Crawling = false;
    }

    if (m_Cancelled)
    {
        OnCancelled();
    }

    m_Logger.Verbose("Crawl ended @ {0} in {1}", m_BaseUri, m_Runtime.Elapsed);
    OnCrawlFinished();
}
/// <summary>
/// Creates custom rules backed by a crawler history; base-class dependencies
/// are passed straight through.
/// </summary>
public CustomCrawlerRules(Crawler crawler, IRobot robot, Uri baseUri, ICrawlerHistory crawlerHistory)
    : base(crawler, robot, baseUri)
{
    m_CrawlerHistory = crawlerHistory;
}
/// <summary>
/// First-time registration of two distinct keys must succeed for each.
/// </summary>
public void Test4(ICrawlerHistory crawlerHistory)
{
    Assert.NotNull(crawlerHistory);

    // Each key is unseen, so both registrations return true.
    Assert.IsTrue(crawlerHistory.Register("123"));
    Assert.IsTrue(crawlerHistory.Register("1234"));
}
/// <summary>
/// Start crawl process
/// </summary>
public virtual void Crawl()
{
    // Guard: a crawl is already in flight on this instance.
    if (m_Crawling)
    {
        throw new InvalidOperationException("Crawler already running");
    }

    // Constructor arguments shared by every service resolved below.
    Parameter[] parameters = new Parameter[]
        {
            new TypedParameter(typeof (Uri), m_BaseUri),
            new NamedParameter("crawlStart", m_BaseUri),
            new NamedParameter("resume", false),
            new NamedParameter("crawler", this),
        };
    m_CrawlerQueue = m_LifetimeScope.Resolve<ICrawlerQueue>(parameters);
    m_CrawlerHistory = m_LifetimeScope.Resolve<ICrawlerHistory>(parameters);
    // Robots.txt support is optional; DummyRobot stands in when disabled.
    m_Robot = AdhereToRobotRules
        ? m_LifetimeScope.Resolve<IRobot>(parameters)
        : new DummyRobot();
    m_TaskRunner = m_LifetimeScope.Resolve<ITaskRunner>(parameters);
    m_Logger = m_LifetimeScope.Resolve<ILog>(parameters);
    m_Logger.Verbose("Crawl started @ {0}", m_BaseUri);

    using (m_CrawlCompleteEvent = new ManualResetEvent(false))
    {
        m_Crawling = true;
        m_Runtime = Stopwatch.StartNew();

        // Seed the crawl with the base URI at depth 0.
        AddStep(m_BaseUri, 0);

        // Block until the queue drains (event is set elsewhere) unless a
        // stop was requested before the crawl got going.
        if (!m_CrawlStopped)
        {
            m_CrawlCompleteEvent.WaitOne();
        }

        m_Runtime.Stop();
        m_Crawling = false;
    }

    if (m_Cancelled)
    {
        OnCancelled();
    }

    m_Logger.Verbose("Crawl ended @ {0} in {1}", m_BaseUri, m_Runtime.Elapsed);
    OnCrawlFinished();
}
/// <summary>
/// Registering a single key must raise the visited count to one.
/// </summary>
public void Test2(ICrawlerHistory crawlerHistory)
{
    Assert.NotNull(crawlerHistory);

    const string key = "123";
    crawlerHistory.Register(key);

    Assert.AreEqual(1, crawlerHistory.VisitedCount);
}
/// <summary>
/// A freshly supplied history must exist and report zero visits.
/// </summary>
public void Test1(ICrawlerHistory crawlerHistory)
{
    Assert.NotNull(crawlerHistory);

    Assert.AreEqual(0, crawlerHistory.VisitedCount);
}
/// <summary>
/// Start crawl process. The instance is single-use: a second call throws
/// <see cref="InvalidOperationException"/>. Resolves the crawler services in
/// dependency order, then runs until the queue drains or the crawl is stopped.
/// </summary>
public virtual void Crawl()
{
    if (this.m_OnlyOneCrawlPerInstance)
    {
        throw new InvalidOperationException("Crawler instance cannot be reused");
    }

    this.m_OnlyOneCrawlPerInstance = true;

    // Base constructor arguments; each resolved service is appended so the
    // services resolved later can depend on the earlier ones.
    var resolveArgs = new Parameter[]
        {
            new TypedParameter(typeof(Uri), this.m_BaseUri),
            new NamedParameter("crawlStart", this.m_BaseUri),
            new TypedParameter(typeof(Crawler), this),
        };

    this.m_CrawlerQueue = this.m_LifetimeScope.Resolve<ICrawlerQueue>(resolveArgs);
    resolveArgs = resolveArgs.AddToEnd(new TypedParameter(typeof(ICrawlerQueue), this.m_CrawlerQueue)).ToArray();

    this.m_CrawlerHistory = this.m_LifetimeScope.Resolve<ICrawlerHistory>(resolveArgs);
    resolveArgs = resolveArgs.AddToEnd(new TypedParameter(typeof(ICrawlerHistory), this.m_CrawlerHistory)).ToArray();

    this.m_TaskRunner = this.m_LifetimeScope.Resolve<ITaskRunner>(resolveArgs);
    resolveArgs = resolveArgs.AddToEnd(new TypedParameter(typeof(ITaskRunner), this.m_TaskRunner)).ToArray();

    this.m_Logger = this.m_LifetimeScope.Resolve<ILog>(resolveArgs);
    resolveArgs = resolveArgs.AddToEnd(new TypedParameter(typeof(ILog), this.m_Logger)).ToArray();

    this.m_CrawlerRules = this.m_LifetimeScope.Resolve<ICrawlerRules>(resolveArgs);

    this.m_Logger.Verbose("Crawl started @ {0}", this.m_BaseUri);
    this.m_WebDownloaderFactory = this.m_LifetimeScope.Resolve<Func<IWebDownloader>>();

    using (this.m_CrawlCompleteEvent = new ManualResetEvent(false))
    {
        this.m_Crawling = true;
        this.m_Runtime = Stopwatch.StartNew();

        // A non-empty queue means a previous crawl is being resumed;
        // otherwise seed the crawl with the base URI at depth 0.
        if (this.m_CrawlerQueue.Count > 0)
        {
            // Resume enabled
            ProcessQueue();
        }
        else
        {
            AddStep(this.m_BaseUri, 0);
        }

        if (!this.m_CrawlStopped)
        {
            this.m_CrawlCompleteEvent.WaitOne();
        }

        this.m_Runtime.Stop();
        this.m_Crawling = false;
    }

    if (this.m_Cancelled)
    {
        OnCancelled();
    }

    this.m_Logger.Verbose("Crawl ended @ {0} in {1}", this.m_BaseUri, this.m_Runtime.Elapsed);
    OnCrawlFinished();
}