/**************************************************************************/ private async Task <MacroscopeConstants.FetchStatus> Fetch(string Url, string RedirectedFromUrl = null) { MacroscopeDocument msDoc = null; MacroscopeConstants.FetchStatus FetchStatus = MacroscopeConstants.FetchStatus.VOID; bool BlockedByRobotsRule; if (MacroscopePreferencesManager.GetPageLimit() > -1) { int PagesFound = this.JobMaster.GetPagesFound(); int PageLimit = MacroscopePreferencesManager.GetPageLimit(); if (PagesFound >= PageLimit) { this.DebugMsg(string.Format("PAGE LIMIT REACHED: {0} :: {1}", PageLimit, PagesFound)); return(FetchStatus); } } if (this.DocCollection.ContainsDocument(Url: Url)) { msDoc = this.DocCollection.GetDocumentByUrl(Url: Url); if (msDoc.GetAuthenticationRealm() != null) { if (msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC) { MacroscopeCredential Credential; Credential = this.JobMaster.GetCredentialsHttp().GetCredential( msDoc.GetHostAndPort(), msDoc.GetAuthenticationRealm() ); if (Credential != null) { msDoc = this.DocCollection.CreateDocument( Credential: Credential, Url: Url ); } } } } else { msDoc = this.DocCollection.CreateDocument(Url: Url); } if (!string.IsNullOrEmpty(RedirectedFromUrl)) { msDoc.SetUrlRedirectFrom(Url: RedirectedFromUrl); } msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.OK); if (!MacroscopeDnsTools.CheckValidHostname(Url: Url)) { this.DebugMsg(string.Format("Fetch :: CheckValidHostname: {0}", "NOT OK")); msDoc.SetStatusCode(HttpStatusCode.BadGateway); FetchStatus = MacroscopeConstants.FetchStatus.NETWORK_ERROR; msDoc.SetFetchStatus(FetchStatus); } if (await this.JobMaster.GetRobots().CheckRobotRule(Url: Url)) { msDoc.SetAllowedByRobots(true); } else { msDoc.SetAllowedByRobots(false); } BlockedByRobotsRule = await this.JobMaster.GetRobots().ApplyRobotRule(Url: Url); if (!BlockedByRobotsRule) { this.DebugMsg(string.Format("Disallowed by robots.txt: {0}", Url)); this.JobMaster.AddToBlockedByRobots(Url); FetchStatus = MacroscopeConstants.FetchStatus.ROBOTS_DISALLOWED; msDoc.SetFetchStatus(FetchStatus); JobHistory.VisitedHistoryItem(Url: msDoc.GetUrl()); } else { this.JobMaster.RemoveFromBlockedByRobots(Url); } if (this.AllowedHosts.IsExternalUrl(Url: Url)) { this.DebugMsg(string.Format("IsExternalUrl: {0}", Url)); msDoc.SetIsExternal(State: true); } if (this.DocCollection.ContainsDocument(Url: Url)) { if (!this.DocCollection.GetDocumentByUrl(Url: Url).GetIsDirty()) { FetchStatus = MacroscopeConstants.FetchStatus.ALREADY_SEEN; return(FetchStatus); } } if (MacroscopePreferencesManager.GetDepth() >= 0) { int Depth = MacroscopeHttpUrlUtils.FindUrlDepth(Url: Url); if (Depth > MacroscopePreferencesManager.GetDepth()) { this.DebugMsg(string.Format("URL Too Deep: {0}", Depth)); FetchStatus = MacroscopeConstants.FetchStatus.SKIPPED; return(FetchStatus); } } /** ------------------------------------------------------------------ **/ if (!await msDoc.Execute()) { this.DebugMsg(string.Format("EXECUTE FAILED: {0}", Url)); FetchStatus = MacroscopeConstants.FetchStatus.ERROR; } /** ------------------------------------------------------------------ **/ /** ------------------------------------------------------------------ **/ { if (msDoc.GetStatusCode() == HttpStatusCode.Unauthorized) { if (msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC) { MacroscopeCredentialsHttp CredentialsHttp = this.JobMaster.GetCredentialsHttp(); CredentialsHttp.EnqueueCredentialRequest( Domain: msDoc.GetHostAndPort(), Realm: msDoc.GetAuthenticationRealm(), Url: msDoc.GetUrl() ); this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrl()); } } if (msDoc.GetIsRedirect()) { this.DebugMsg(string.Format("REDIRECTION DETECTED GetUrl: {0}", msDoc.GetUrl())); this.DebugMsg(string.Format("REDIRECTION DETECTED From: {0}", msDoc.GetUrlRedirectFrom())); if (MacroscopePreferencesManager.GetCheckRedirects()) { string Hostname = msDoc.GetHostAndPort(); string HostnameFrom = MacroscopeAllowedHosts.ParseHostnameFromUrl(msDoc.GetUrlRedirectFrom()); string UrlRedirectTo = msDoc.GetUrlRedirectTo(); string HostnameTo = MacroscopeAllowedHosts.ParseHostnameFromUrl(UrlRedirectTo); this.DebugMsg(string.Format("REDIRECTION DETECTED UrlRedirectTo: {0}", UrlRedirectTo)); this.DebugMsg(string.Format("REDIRECTION DETECTED HostnameTo: {0}", HostnameTo)); if (MacroscopePreferencesManager.GetFollowRedirects()) { if (MacroscopePreferencesManager.GetCheckExternalLinks()) { this.AllowedHosts.AddFromUrl(Url: UrlRedirectTo); } else { if (this.AllowedHosts.IsInternalUrl(Url: UrlRedirectTo)) { this.AllowedHosts.AddFromUrl(Url: UrlRedirectTo); } } } } this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrlRedirectTo()); } else { this.ProcessHrefLangLanguages(msDoc); // Process Languages from HrefLang this.JobMaster.ProcessOutlinks(msDoc: msDoc); // Process Outlinks from document } FetchStatus = MacroscopeConstants.FetchStatus.SUCCESS; } /** ------------------------------------------------------------------ **/ if (DocCollection.ContainsDocument(msDoc: msDoc)) { JobHistory.VisitedHistoryItem(Url: Url); } else { this.DebugMsg(string.Format("OOPS: {0}", Url)); } /** ------------------------------------------------------------------ **/ return(FetchStatus); }
/**************************************************************************/ private MacroscopeConstants.FetchStatus Fetch(string Url) { MacroscopeDocument msDoc = this.DocCollection.GetDocument(Url); MacroscopeConstants.FetchStatus FetchStatus = MacroscopeConstants.FetchStatus.VOID; if (msDoc != null) { if (msDoc.GetAuthenticationRealm() != null) { if (msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC) { MacroscopeCredential Credential; Credential = this.JobMaster.GetCredentialsHttp().GetCredential( msDoc.GetHostAndPort(), msDoc.GetAuthenticationRealm() ); if (Credential != null) { msDoc = this.DocCollection.CreateDocument( Credential: Credential, Url: Url ); } } } } else { msDoc = this.DocCollection.CreateDocument(Url); } msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.OK); if (!MacroscopeDnsTools.CheckValidHostname(Url: Url)) { DebugMsg(string.Format("Fetch :: CheckValidHostname: {0}", "NOT OK")); msDoc.SetStatusCode(HttpStatusCode.BadGateway); FetchStatus = MacroscopeConstants.FetchStatus.NETWORK_ERROR; msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.NETWORK_ERROR); } if (!this.JobMaster.GetRobots().ApplyRobotRule(Url)) { DebugMsg(string.Format("Disallowed by robots.txt: {0}", Url)); this.JobMaster.AddToBlockedByRobots(Url); FetchStatus = MacroscopeConstants.FetchStatus.ROBOTS_DISALLOWED; msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.ROBOTS_DISALLOWED); this.JobMaster.GetJobHistory().VisitedHistoryItem(msDoc.GetUrl()); } else { this.JobMaster.RemoveFromBlockedByRobots(Url); } this.JobMaster.GetJobHistory().AddHistoryItem(Url); if (this.AllowedHosts.IsExternalUrl(Url: Url)) { DebugMsg(string.Format("IsExternalUrl: {0}", Url)); msDoc.SetIsExternal(State: true); } if (this.DocCollection.ContainsDocument(Url)) { if (!this.DocCollection.GetDocument(Url).GetIsDirty()) { FetchStatus = MacroscopeConstants.FetchStatus.ALREADY_SEEN; return(FetchStatus); } } if (this.JobMaster.GetDepth() > 0) { int Depth = MacroscopeUrlUtils.FindUrlDepth(Url); if (Depth > this.JobMaster.GetDepth()) { DebugMsg(string.Format("TOO DEEP: {0}", Depth)); FetchStatus = MacroscopeConstants.FetchStatus.SKIPPED; return(FetchStatus); } } if (msDoc.Execute()) { this.DocCollection.AddDocument(Url, msDoc); if (msDoc.GetStatusCode() == HttpStatusCode.Unauthorized) { if (msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC) { MacroscopeCredentialsHttp CredentialsHttp = this.JobMaster.GetCredentialsHttp(); CredentialsHttp.EnqueueCredentialRequest( Domain: msDoc.GetHostAndPort(), Realm: msDoc.GetAuthenticationRealm(), Url: msDoc.GetUrl() ); this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrl()); } } this.JobMaster.GetJobHistory().VisitedHistoryItem(msDoc.GetUrl()); this.JobMaster.IncPageLimitCount(); if (msDoc.GetIsRedirect()) { DebugMsg(string.Format("REDIRECTION DETECTED GetUrl: {0}", msDoc.GetUrl())); DebugMsg(string.Format("REDIRECTION DETECTED From: {0}", msDoc.GetUrlRedirectFrom())); if (MacroscopePreferencesManager.GetFollowRedirects()) { string Hostname = msDoc.GetHostAndPort(); string HostnameFrom = MacroscopeAllowedHosts.ParseHostnameFromUrl(msDoc.GetUrlRedirectFrom()); string UrlRedirectTo = msDoc.GetUrlRedirectTo(); string HostnameTo = MacroscopeAllowedHosts.ParseHostnameFromUrl(UrlRedirectTo); DebugMsg(string.Format("REDIRECTION DETECTED UrlRedirectTo: {0}", UrlRedirectTo)); DebugMsg(string.Format("REDIRECTION DETECTED HostnameTo: {0}", HostnameTo)); } this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrlRedirectTo()); } else { this.ProcessHrefLangLanguages(msDoc); // Process Languages from HrefLang this.ProcessOutlinks(msDoc); // Process Outlinks from document } FetchStatus = MacroscopeConstants.FetchStatus.SUCCESS; } else { DebugMsg(string.Format("EXECUTE FAILED: {0}", Url)); FetchStatus = MacroscopeConstants.FetchStatus.ERROR; } return(FetchStatus); }