public void TestFindUrlDepth()
{
    // Table of sample URLs paired with the depth FindUrlDepth is expected
    // to report for each of them.
    Dictionary<string, int> ExpectedDepths = new Dictionary<string, int>
    {
        { "https://nazuke.github.io/", 0 },
        { "https://nazuke.github.io/0.html", 0 },
        { "https://nazuke.github.io/0/1.html", 1 },
        { "https://nazuke.github.io/0/1/2.html", 2 },
        { "https://nazuke.github.io/0/1/2/", 2 },
        { "https://nazuke.github.io/0/1/2/3.html", 3 },
        { "https://nazuke.github.io/0/1/2/3.html/", 3 },
        { "https://nazuke.github.io/0/1/2/3/4.html?key=value", 4 }
    };

    foreach (string SampleUrl in ExpectedDepths.Keys)
    {
        int ExpectedDepth = ExpectedDepths[SampleUrl];

        // Log the case before asserting so a failure can be traced to its URL.
        this.DebugMsg(string.Format("{0}: {1}", ExpectedDepth, SampleUrl));

        int ActualDepth = MacroscopeHttpUrlUtils.FindUrlDepth(Url: SampleUrl);

        Assert.AreEqual(ExpectedDepth, ActualDepth);
    }
}
/**************************************************************************/

/// <summary>
/// Worker loop: pulls items from the JobMaster URL queue, filters them, and
/// fetches each surviving URL with retries.
/// </summary>
/// <remarks>
/// The loop runs for at most GetMaxFetchesPerWorker() iterations; every
/// iteration counts against that budget, including ones where the queue
/// yields nothing or the URL is filtered out. It ends early when
/// JobMaster.GetThreadsStop() returns true. JobMaster.NotifyWorkersDone()
/// is called once the loop has finished.
///
/// This method is declared <c>async void</c>, so callers cannot await it
/// and cannot observe exceptions that escape it; the signature is kept
/// as-is for compatibility with existing callers.
/// </remarks>
public async void Execute()
{
    for (
        int FetchesRemaining = MacroscopePreferencesManager.GetMaxFetchesPerWorker();
        FetchesRemaining > 0;
        FetchesRemaining--)
    {
        bool StopRequested = this.JobMaster.GetThreadsStop();

        if (StopRequested)
        {
            this.DebugMsg(string.Format("JobMaster.GetThreadsStop: {0}", StopRequested));
            break;
        }

        MacroscopeJobItem JobItem = this.JobMaster.GetUrlQueueItem();

        if (JobItem == null)
        {
            continue;
        }

        string Url = JobItem.GetItemUrl();
        string RedirectedFromUrl = JobItem.GetItemRedirectedFromUrl();

        if (string.IsNullOrEmpty(Url) || !this.IsUrlAllowedByCrawlFilters(Url))
        {
            continue;
        }

        this.DebugMsg(string.Format("Execute: {0}", Url));

        int Tries = MacroscopePreferencesManager.GetMaxRetries();

        JobHistory.AddHistoryItem(Url: Url);

        do
        {
            this.DebugMsg(string.Format("Trying Fetch: {0} :: {1}", Tries, Url));

            MacroscopeConstants.FetchStatus FetchStatus = MacroscopeConstants.FetchStatus.VOID;

            try
            {
                if (!string.IsNullOrEmpty(RedirectedFromUrl))
                {
                    FetchStatus = await this.Fetch(Url, RedirectedFromUrl);
                }
                else
                {
                    FetchStatus = await this.Fetch(Url);
                }
            }
            catch (Exception ex)
            {
                // The status stays VOID when Fetch throws, so the switch below
                // takes the default branch: the URL is reported as fetched and
                // is not retried.
                this.DebugMsg(string.Format("Fetch Exception: {0}", ex.Message));
                this.DebugMsg(string.Format("Url: {0}", Url));
                this.DebugMsg(string.Format("FetchStatus: {0}", FetchStatus));
            }

            switch (FetchStatus)
            {
                case MacroscopeConstants.FetchStatus.ERROR:
                case MacroscopeConstants.FetchStatus.NETWORK_ERROR:
                    this.DebugMsg(string.Format("Fetch Failed: {0} :: {1}", Tries, Url));
                    // Short back-off before the next attempt; awaited rather
                    // than slept so no thread is blocked while waiting.
                    await Task.Delay(25);
                    break;

                default:
                    this.JobMaster.NotifyWorkersFetched(Url: Url);
                    // Force the retry loop to terminate after this pass.
                    Tries = 0;
                    break;
            }

            Tries--;
        }
        while (Tries > 0);

        if (this.CrawlDelay > 0)
        {
            this.DebugMsg(string.Format("CRAWL DELAY: Sleeping for {0} seconds...", this.CrawlDelay));
            // CrawlDelay is expressed in seconds; Task.Delay takes milliseconds.
            await Task.Delay(this.CrawlDelay * 1000);
        }
    }

    this.JobMaster.NotifyWorkersDone();
}

/**************************************************************************/

/// <summary>
/// Decides whether a dequeued URL passes the include/exclude filter, the
/// parent/child directory scope preferences, and the maximum depth
/// preference.
/// </summary>
/// <param name="Url">Non-empty URL taken from the queue.</param>
/// <returns>True when the URL should be fetched; false when it is filtered out.</returns>
private bool IsUrlAllowedByCrawlFilters(string Url)
{
    if (!this.CheckIncludeExcludeUrl(Url))
    {
        return false;
    }

    bool CrawlParents = MacroscopePreferencesManager.GetCrawlParentDirectories();
    bool CrawlChildren = MacroscopePreferencesManager.GetCrawlChildDirectories();

    if (CrawlParents && CrawlChildren)
    {
        this.DebugMsg(string.Format("Skipping Parent/Child Check: {0}", Url));
    }
    else if (!CrawlParents && !CrawlChildren && Url != this.JobMaster.GetStartUrl())
    {
        // With both directions disabled, only the start URL itself is crawled.
        return false;
    }
    else
    {
        this.DebugMsg(string.Format("Running Parent/Child Check: {0}", Url));

        // At most one of the two flags is set on this path.
        if (CrawlParents
            && !MacroscopeHttpUrlUtils.IsWithinParentDirectory(
                StartUrl: this.JobMaster.GetParentStartingDirectory(),
                Url: Url))
        {
            return false;
        }

        if (CrawlChildren
            && !MacroscopeHttpUrlUtils.IsWithinChildDirectory(
                StartUrl: this.JobMaster.GetChildStartingDirectory(),
                Url: Url))
        {
            return false;
        }
    }

    // A negative depth preference disables the depth limit.
    int MaxDepth = MacroscopePreferencesManager.GetDepth();

    if (MaxDepth >= 0 && MacroscopeHttpUrlUtils.FindUrlDepth(Url: Url) > MaxDepth)
    {
        this.DebugMsg(string.Format("URL Too Deep: {0}", Url));
        return false;
    }

    return true;
}
/**************************************************************************/

/// <summary>
/// Fetches a single URL: obtains (or creates) its document in the
/// collection, applies hostname, robots, external, already-seen and depth
/// checks, executes the document, and then queues follow-up work
/// (credential requests, redirect targets, hreflang links and outlinks).
/// </summary>
/// <param name="Url">The URL to fetch.</param>
/// <param name="RedirectedFromUrl">
/// Optional URL that redirected to <paramref name="Url"/>; when non-empty it
/// is recorded on the document.
/// </param>
/// <returns>
/// VOID when the page limit has been reached; ALREADY_SEEN when the
/// collection holds a non-dirty document for the URL; SKIPPED when the URL
/// exceeds the depth preference; ERROR when the document's Execute() call
/// returns false; otherwise SUCCESS. The interim NETWORK_ERROR and
/// ROBOTS_DISALLOWED statuses are stored on the document but are superseded
/// in the return value by the outcome of Execute().
/// </returns>
private async Task<MacroscopeConstants.FetchStatus> Fetch(string Url, string RedirectedFromUrl = null)
{
    MacroscopeDocument msDoc = null;
    MacroscopeConstants.FetchStatus FetchStatus = MacroscopeConstants.FetchStatus.VOID;

    // A page limit of -1 or lower disables the limit.
    int PageLimit = MacroscopePreferencesManager.GetPageLimit();

    if (PageLimit > -1)
    {
        int PagesFound = this.JobMaster.GetPagesFound();

        if (PagesFound >= PageLimit)
        {
            this.DebugMsg(string.Format("PAGE LIMIT REACHED: {0} :: {1}", PageLimit, PagesFound));
            return (FetchStatus);
        }
    }

    if (this.DocCollection.ContainsDocument(Url: Url))
    {
        msDoc = this.DocCollection.GetDocumentByUrl(Url: Url);

        // A known document that reported a BASIC authentication realm is
        // recreated with stored credentials, when any are available.
        if (msDoc.GetAuthenticationRealm() != null
            && msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC)
        {
            MacroscopeCredential Credential = this.JobMaster.GetCredentialsHttp().GetCredential(
                msDoc.GetHostAndPort(),
                msDoc.GetAuthenticationRealm()
            );

            if (Credential != null)
            {
                msDoc = this.DocCollection.CreateDocument(
                    Credential: Credential,
                    Url: Url
                );
            }
        }
    }
    else
    {
        msDoc = this.DocCollection.CreateDocument(Url: Url);
    }

    if (!string.IsNullOrEmpty(RedirectedFromUrl))
    {
        msDoc.SetUrlRedirectFrom(Url: RedirectedFromUrl);
    }

    msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.OK);

    if (!MacroscopeDnsTools.CheckValidHostname(Url: Url))
    {
        this.DebugMsg(string.Format("Fetch :: CheckValidHostname: {0}", "NOT OK"));
        msDoc.SetStatusCode(HttpStatusCode.BadGateway);
        FetchStatus = MacroscopeConstants.FetchStatus.NETWORK_ERROR;
        msDoc.SetFetchStatus(FetchStatus);
    }

    msDoc.SetAllowedByRobots(await this.JobMaster.GetRobots().CheckRobotRule(Url: Url));

    // A false result from ApplyRobotRule is treated as "disallowed by robots.txt".
    bool AllowedByRobotsRule = await this.JobMaster.GetRobots().ApplyRobotRule(Url: Url);

    if (!AllowedByRobotsRule)
    {
        this.DebugMsg(string.Format("Disallowed by robots.txt: {0}", Url));
        this.JobMaster.AddToBlockedByRobots(Url);
        FetchStatus = MacroscopeConstants.FetchStatus.ROBOTS_DISALLOWED;
        msDoc.SetFetchStatus(FetchStatus);
        JobHistory.VisitedHistoryItem(Url: msDoc.GetUrl());
    }
    else
    {
        this.JobMaster.RemoveFromBlockedByRobots(Url);
    }

    if (this.AllowedHosts.IsExternalUrl(Url: Url))
    {
        this.DebugMsg(string.Format("IsExternalUrl: {0}", Url));
        msDoc.SetIsExternal(State: true);
    }

    if (this.DocCollection.ContainsDocument(Url: Url)
        && !this.DocCollection.GetDocumentByUrl(Url: Url).GetIsDirty())
    {
        FetchStatus = MacroscopeConstants.FetchStatus.ALREADY_SEEN;
        return (FetchStatus);
    }

    // A negative depth preference disables the depth limit.
    int MaxDepth = MacroscopePreferencesManager.GetDepth();

    if (MaxDepth >= 0)
    {
        int Depth = MacroscopeHttpUrlUtils.FindUrlDepth(Url: Url);

        if (Depth > MaxDepth)
        {
            this.DebugMsg(string.Format("URL Too Deep: {0}", Depth));
            FetchStatus = MacroscopeConstants.FetchStatus.SKIPPED;
            return (FetchStatus);
        }
    }

    /** ------------------------------------------------------------------ **/

    bool Executed = await msDoc.Execute();

    if (!Executed)
    {
        this.DebugMsg(string.Format("EXECUTE FAILED: {0}", Url));
        FetchStatus = MacroscopeConstants.FetchStatus.ERROR;
    }

    /** ------------------------------------------------------------------ **/

    if (msDoc.GetStatusCode() == HttpStatusCode.Unauthorized
        && msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC)
    {
        // Ask for credentials for this host/realm and put the URL back on
        // the queue so it is fetched again.
        MacroscopeCredentialsHttp CredentialsHttp = this.JobMaster.GetCredentialsHttp();

        CredentialsHttp.EnqueueCredentialRequest(
            Domain: msDoc.GetHostAndPort(),
            Realm: msDoc.GetAuthenticationRealm(),
            Url: msDoc.GetUrl()
        );

        this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrl());
    }

    if (msDoc.GetIsRedirect())
    {
        this.DebugMsg(string.Format("REDIRECTION DETECTED GetUrl: {0}", msDoc.GetUrl()));
        this.DebugMsg(string.Format("REDIRECTION DETECTED From: {0}", msDoc.GetUrlRedirectFrom()));

        if (MacroscopePreferencesManager.GetCheckRedirects())
        {
            string UrlRedirectTo = msDoc.GetUrlRedirectTo();
            string HostnameTo = MacroscopeAllowedHosts.ParseHostnameFromUrl(UrlRedirectTo);

            this.DebugMsg(string.Format("REDIRECTION DETECTED UrlRedirectTo: {0}", UrlRedirectTo));
            this.DebugMsg(string.Format("REDIRECTION DETECTED HostnameTo: {0}", HostnameTo));

            // When following redirects, the target is added to the allowed
            // hosts if external links are checked or the target is internal.
            if (MacroscopePreferencesManager.GetFollowRedirects()
                && (MacroscopePreferencesManager.GetCheckExternalLinks()
                    || this.AllowedHosts.IsInternalUrl(Url: UrlRedirectTo)))
            {
                this.AllowedHosts.AddFromUrl(Url: UrlRedirectTo);
            }
        }

        // The redirect target is queued regardless of the preferences above.
        this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrlRedirectTo());
    }
    else
    {
        this.ProcessHrefLangLanguages(msDoc); // Process Languages from HrefLang
        this.JobMaster.ProcessOutlinks(msDoc: msDoc); // Process Outlinks from document
    }

    // Report SUCCESS only when the document actually executed; otherwise the
    // ERROR status set above must reach the caller so that its retry
    // handling can act on it.
    if (Executed)
    {
        FetchStatus = MacroscopeConstants.FetchStatus.SUCCESS;
    }

    /** ------------------------------------------------------------------ **/

    if (DocCollection.ContainsDocument(msDoc: msDoc))
    {
        JobHistory.VisitedHistoryItem(Url: Url);
    }
    else
    {
        this.DebugMsg(string.Format("OOPS: {0}", Url));
    }

    /** ------------------------------------------------------------------ **/

    return (FetchStatus);
}