/** -------------------------------------------------------------------- **/

/// <summary>
/// Normalises <paramref name="Url"/> according to the query-string and
/// hash-fragment preferences, places it on the URL list named queue unless
/// the job history has already seen it, and then passes the normalised URL
/// to <c>AddToProgress</c>.
/// </summary>
/// <param name="Url">The URL to enqueue.</param>
public void AddUrlQueueItem(string Url)
{
    string CleanedUrl = Url;

    if (MacroscopePreferencesManager.GetIgnoreQueries())
    {
        CleanedUrl = MacroscopeUrlUtils.StripQueryString(Url: CleanedUrl);
    }

    if (MacroscopePreferencesManager.GetIgnoreHashFragments())
    {
        CleanedUrl = MacroscopeUrlUtils.StripHashFragment(Url: CleanedUrl);
    }

    bool AlreadySeen = this.JobHistory.SeenHistoryItem(Url: CleanedUrl);

    if (!AlreadySeen)
    {
        try
        {
            this.NamedQueueJobItems.AddToNamedQueue(
                Name: MacroscopeConstants.NamedQueueUrlList,
                Item: new MacroscopeJobItem(Url: CleanedUrl)
            );
        }
        catch (MacroscopeNamedQueueException ex)
        {
            // A queue-level failure is only logged; the URL is still reported to progress below.
            this.DebugMsg(string.Format("AddUrlQueueItem: {0}", ex.Message));
        }
    }

    // Runs whether or not the URL was actually queued.
    this.AddToProgress(Url: CleanedUrl);
}
/// <summary>
/// Creates a named queue, adds a single job item and checks that the queue
/// reports exactly one item.
/// </summary>
public void TestCreateNamedQueue()
{
    var Queue = new MacroscopeNamedQueue<MacroscopeJobItem>();
    Queue.CreateNamedQueue(QUEUENAME);

    var OnlyItem = new MacroscopeJobItem("http://www.company.com/");
    Queue.AddToNamedQueue(QUEUENAME, OnlyItem);

    var QueuedCount = Queue.CountNamedQueueItems(QUEUENAME);
    Assert.AreEqual(1, QueuedCount);
}
/// <summary>
/// Adds two job items built from the same URL to a queue created in
/// USE_HISTORY mode and checks that only one item is counted.
/// </summary>
public void TestFailAddingDuplicateToNamedQueue()
{
    var Queue = new MacroscopeNamedQueue<MacroscopeJobItem>();
    Queue.CreateNamedQueue(
        QUEUENAME,
        MacroscopeNamedQueue<MacroscopeJobItem>.MODE.USE_HISTORY
    );

    var FirstItem = new MacroscopeJobItem("http://www.company.com/");
    var DuplicateItem = new MacroscopeJobItem("http://www.company.com/");

    Queue.AddToNamedQueue(QUEUENAME, FirstItem);
    Queue.AddToNamedQueue(QUEUENAME, DuplicateItem);

    var QueuedCount = Queue.CountNamedQueueItems(QUEUENAME);
    Assert.AreEqual(1, QueuedCount);
}
/// <summary>
/// Adds an item to a USE_HISTORY queue, takes it back off, asks the queue
/// to forget it, and then checks that the same item can be added again.
/// </summary>
public void TestAddRemoveAddAgainToNamedQueue()
{
    var Queue = new MacroscopeNamedQueue<MacroscopeJobItem>();
    Queue.CreateNamedQueue(
        QUEUENAME,
        MacroscopeNamedQueue<MacroscopeJobItem>.MODE.USE_HISTORY
    );

    var OriginalItem = new MacroscopeJobItem("http://www.company.com/");

    // First add: the queue should hold one item.
    Queue.AddToNamedQueue(QUEUENAME, OriginalItem);
    Assert.AreEqual(1, Queue.CountNamedQueueItems(QUEUENAME));

    // Taking the item off should leave the queue empty.
    MacroscopeJobItem DequeuedItem = Queue.GetNamedQueueItem(QUEUENAME);
    Assert.AreEqual(0, Queue.CountNamedQueueItems(QUEUENAME));

    // Forget the item, then add it a second time.
    bool WasForgotten = Queue.ForgetNamedQueueItem(QUEUENAME, OriginalItem);
    Queue.AddToNamedQueue(QUEUENAME, OriginalItem);
    Assert.AreEqual(1, Queue.CountNamedQueueItems(QUEUENAME));
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Normalises <paramref name="Url"/> according to the query-string and
/// hash-fragment preferences, then asks the URL list named queue to forget
/// a job item built from the normalised URL.
/// </summary>
/// <param name="Url">The URL whose queue entry should be forgotten.</param>
public void ForgetUrlQueueItem(string Url)
{
    string CleanedUrl = Url;

    if (MacroscopePreferencesManager.GetIgnoreQueries())
    {
        CleanedUrl = MacroscopeUrlUtils.StripQueryString(Url: CleanedUrl);
    }

    if (MacroscopePreferencesManager.GetIgnoreHashFragments())
    {
        CleanedUrl = MacroscopeUrlUtils.StripHashFragment(Url: CleanedUrl);
    }

    this.NamedQueueJobItems.ForgetNamedQueueItem(
        Name: MacroscopeConstants.NamedQueueUrlList,
        Item: new MacroscopeJobItem(Url: CleanedUrl)
    );
}
/**************************************************************************/

/// <summary>
/// Worker loop. Pulls up to <c>GetMaxFetchesPerWorker()</c> items from the
/// job master's URL queue, filters each URL against the include/exclude,
/// parent/child directory and depth preferences, and fetches the URLs that
/// survive, retrying on ERROR / NETWORK_ERROR. Calls
/// <c>NotifyWorkersDone</c> on the job master when the loop ends.
/// </summary>
/// <remarks>
/// The <c>async void</c> signature is kept so existing callers continue to
/// compile; note that an exception escaping this method cannot be observed
/// by the caller.
/// </remarks>
public async void Execute()
{
    int MaxFetches = MacroscopePreferencesManager.GetMaxFetchesPerWorker();

    while (MaxFetches > 0)
    {
        if (this.JobMaster.GetThreadsStop())
        {
            this.DebugMsg(string.Format("JobMaster.GetThreadsStop: {0}", this.JobMaster.GetThreadsStop()));
            break;
        }

        MacroscopeJobItem JobItem = this.JobMaster.GetUrlQueueItem();
        string Url = null;
        string RedirectedFromUrl = null;

        if (JobItem != null)
        {
            Url = JobItem.GetItemUrl();
            RedirectedFromUrl = JobItem.GetItemRedirectedFromUrl();
        }

        // From here on, a null Url means "skip this queue item".

        if (!string.IsNullOrEmpty(Url) && !this.CheckIncludeExcludeUrl(Url))
        {
            Url = null;
        }

        if (!string.IsNullOrEmpty(Url))
        {
            if (
                !MacroscopePreferencesManager.GetCrawlParentDirectories()
                && !MacroscopePreferencesManager.GetCrawlChildDirectories()
                && Url != this.JobMaster.GetStartUrl())
            {
                // Neither parent nor child crawling is enabled: only the start URL passes.
                Url = null;
            }
            else if (
                !MacroscopePreferencesManager.GetCrawlParentDirectories()
                || !MacroscopePreferencesManager.GetCrawlChildDirectories())
            {
                this.DebugMsg(string.Format("Running Parent/Child Check: {0}", Url));

                if (
                    MacroscopePreferencesManager.GetCrawlParentDirectories()
                    && !string.IsNullOrEmpty(Url))
                {
                    if (!MacroscopeHttpUrlUtils.IsWithinParentDirectory(StartUrl: this.JobMaster.GetParentStartingDirectory(), Url: Url))
                    {
                        Url = null;
                    }
                }

                if (
                    MacroscopePreferencesManager.GetCrawlChildDirectories()
                    && !string.IsNullOrEmpty(Url))
                {
                    if (!MacroscopeHttpUrlUtils.IsWithinChildDirectory(StartUrl: this.JobMaster.GetChildStartingDirectory(), Url: Url))
                    {
                        Url = null;
                    }
                }
            }
            else
            {
                this.DebugMsg(string.Format("Skipping Parent/Child Check: {0}", Url));
            }
        }

        if (!string.IsNullOrEmpty(Url))
        {
            // The depth limit only applies when the configured depth is non-negative.
            if (MacroscopePreferencesManager.GetDepth() >= 0)
            {
                if (MacroscopeHttpUrlUtils.FindUrlDepth(Url: Url) > MacroscopePreferencesManager.GetDepth())
                {
                    this.DebugMsg(string.Format("URL Too Deep: {0}", Url));
                    Url = null;
                }
            }
        }

        if (!string.IsNullOrEmpty(Url))
        {
            this.DebugMsg(string.Format("Execute: {0}", Url));

            int Tries = MacroscopePreferencesManager.GetMaxRetries();

            JobHistory.AddHistoryItem(Url: Url);

            do
            {
                this.DebugMsg(string.Format("Trying Fetch: {0} :: {1}", Tries, Url));

                MacroscopeConstants.FetchStatus FetchStatus = MacroscopeConstants.FetchStatus.VOID;

                try
                {
                    if (!string.IsNullOrEmpty(RedirectedFromUrl))
                    {
                        FetchStatus = await this.Fetch(Url, RedirectedFromUrl);
                    }
                    else
                    {
                        FetchStatus = await this.Fetch(Url);
                    }
                }
                catch (Exception ex)
                {
                    // NOTE: FetchStatus stays VOID here, so the switch below takes the
                    // default arm and the URL is reported as fetched without a retry.
                    this.DebugMsg(string.Format("FetchStatus: {0}", ex.Message));
                    this.DebugMsg(string.Format("Url: {0}", Url));
                    this.DebugMsg(string.Format("FetchStatus: {0}", FetchStatus));
                }

                switch (FetchStatus)
                {
                    case MacroscopeConstants.FetchStatus.ERROR:
                    case MacroscopeConstants.FetchStatus.NETWORK_ERROR:
                        this.DebugMsg(string.Format("Fetch Failed: {0} :: {1}", Tries, Url));
                        // Back off briefly without blocking the thread running this async method.
                        await System.Threading.Tasks.Task.Delay(25);
                        break;

                    default:
                        this.JobMaster.NotifyWorkersFetched(Url: Url);
                        // Force the retry loop to end after the decrement below.
                        Tries = 0;
                        break;
                }

                Tries--;
            } while (Tries > 0);

            if (this.CrawlDelay > 0)
            {
                this.DebugMsg(string.Format("CRAWL DELAY: Sleeping for {0} seconds...", this.CrawlDelay));
                await System.Threading.Tasks.Task.Delay(CrawlDelay * 1000);
            }
        }

        // An empty or filtered-out queue item still consumes one fetch slot.
        MaxFetches--;
    }

    this.JobMaster.NotifyWorkersDone();
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Returns whatever the URL list named queue hands back from
/// <c>GetNamedQueueItem</c>.
/// </summary>
/// <returns>The job item obtained from the URL list named queue.</returns>
public MacroscopeJobItem GetUrlQueueItem()
{
    return this.NamedQueueJobItems.GetNamedQueueItem(MacroscopeConstants.NamedQueueUrlList);
}
/**************************************************************************/

/// <summary>
/// Worker loop. Pulls up to <c>GetMaxFetchesPerWorker()</c> items from the
/// job master's URL queue, filters each URL against the include/exclude and
/// parent/child directory preferences, and fetches the URLs that survive,
/// retrying while the fetch reports ERROR. Calls <c>NotifyWorkersDone</c>
/// on the job master when the loop ends.
/// </summary>
public void Execute()
{
    int MaxFetches = MacroscopePreferencesManager.GetMaxFetchesPerWorker();

    while (MaxFetches > 0)
    {
        if (this.JobMaster.GetThreadsStop())
        {
            DebugMsg(string.Format("JobMaster.GetThreadsStop: {0}", this.JobMaster.GetThreadsStop()));
            break;
        }

        MacroscopeJobItem JobItem = this.JobMaster.GetUrlQueueItem();
        string Url = null;

        if (JobItem != null)
        {
            Url = JobItem.GetItemUrl();
        }

        // From here on, a null Url means "skip this queue item".

        if (!string.IsNullOrEmpty(Url) && !this.CheckIncludeExcludeUrl(Url))
        {
            Url = null;
        }

        if (!string.IsNullOrEmpty(Url))
        {
            if (
                !MacroscopePreferencesManager.GetCrawlParentDirectories()
                && !MacroscopePreferencesManager.GetCrawlChildDirectories()
                && Url != this.JobMaster.GetStartUrl())
            {
                // Neither parent nor child crawling is enabled: only the start URL passes.
                Url = null;
            }
            else if (
                !MacroscopePreferencesManager.GetCrawlParentDirectories()
                || !MacroscopePreferencesManager.GetCrawlChildDirectories())
            {
                DebugMsg(string.Format("Running Parent/Child Check: {0}", Url));

                if (
                    MacroscopePreferencesManager.GetCrawlParentDirectories()
                    && !string.IsNullOrEmpty(Url))
                {
                    if (!this.JobMaster.IsWithinParentDirectory(Url))
                    {
                        Url = null;
                    }
                }

                if (
                    MacroscopePreferencesManager.GetCrawlChildDirectories()
                    && !string.IsNullOrEmpty(Url))
                {
                    if (!this.JobMaster.IsWithinChildDirectory(Url))
                    {
                        Url = null;
                    }
                }
            }
            else
            {
                DebugMsg(string.Format("Skipping Parent/Child Check: {0}", Url));
            }
        }

        if (!string.IsNullOrEmpty(Url))
        {
            DebugMsg(string.Format("Execute: {0}", Url));

            int Tries = MacroscopePreferencesManager.GetMaxRetries();

            do
            {
                DebugMsg(string.Format("Trying Fetch: {0} :: {1}", Tries, Url));

                // Fetch exactly once per attempt. The previous condition called
                // Fetch(Url) on both sides of an ||, so every successful fetch was
                // immediately repeated and could be turned into a failure.
                var Status = this.Fetch(Url);

                if (Status == MacroscopeConstants.FetchStatus.ERROR)
                {
                    DebugMsg(string.Format("Fetch Failed: {0} :: {1}", Tries, Url));
                    Thread.Sleep(1000);
                }
                else
                {
                    this.JobMaster.NotifyWorkersFetched(Url);
                    break;
                }

                Tries--;
            } while (Tries > 0);

            if (this.CrawlDelay > 0)
            {
                DebugMsg(string.Format("CRAWL DELAY: Sleeping for {0} seconds...", this.CrawlDelay));
                Thread.Sleep(CrawlDelay * 1000);
            }
        }

        // An empty or filtered-out queue item still consumes one fetch slot.
        MaxFetches--;

        Thread.Yield();
    }

    this.JobMaster.NotifyWorkersDone();
}