// Checks MacroscopeDnsTools.CheckValidHostname() against a fixed table of
// URLs, each paired with the result the test expects for it.
// The table is a SortedDictionary, so URLs are checked in key order.
public void TestCheckValidHostname()
{
    SortedDictionary<string, bool> Expectations = new SortedDictionary<string, bool>
    {
        { "https://nazuke.github.io/SEOMacroscope/", true },
        { "https://bogus.bogus.com/some/path/index.html", false },
        { "https://www.google.com/", true }
    };

    foreach (KeyValuePair<string, bool> Expectation in Expectations)
    {
        string Url = Expectation.Key;

        Assert.AreEqual(
            Expectation.Value,
            MacroscopeDnsTools.CheckValidHostname(Url: Url),
            string.Format("FAIL: {0}", Url)
        );
    }
}
/** Fetch Robots Text *****************************************************/

// Retrieves the robots.txt body located at RobotsUri.
//
// Returns the response content as a string, or an empty string when:
//   - the hostname check fails,
//   - the request throws or yields no response (the URI is then recorded
//     in this.BadRobots, once), or
//   - the content cannot be read or is empty.
// No exception from the request or the content read escapes this method;
// each one is reported through DebugMsg instead.
private async Task<string> FetchRobotTextFile(Uri RobotsUri)
{
    string RobotsUrl = RobotsUri.ToString();

    if (!MacroscopeDnsTools.CheckValidHostname(Url: RobotsUrl))
    {
        DebugMsg(string.Format("FetchRobotTextFile :: CheckValidHostname: {0}", "NOT OK"));
        return "";
    }

    MacroscopeHttpTwoClientResponse Reply = null;

    try
    {
        Reply = await this.Client.Get(
            RobotsUri,
            this.ConfigureHeadRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
    }
    catch (MacroscopeDocumentException ex)
    {
        DebugMsg(string.Format("MacroscopeDocumentException: {0}", ex.Message));
        DebugMsg(string.Format("MacroscopeDocumentException: {0}", RobotsUrl));
    }
    catch (Exception ex)
    {
        DebugMsg(string.Format("Exception: {0}", ex.Message));
        DebugMsg(string.Format("Exception: {0}", RobotsUrl));
    }

    // A null reply covers both "request threw" and "client returned nothing".
    if (Reply == null)
    {
        lock (this.BadRobots)
        {
            if (!this.BadRobots.ContainsKey(RobotsUri))
            {
                this.BadRobots.Add(RobotsUri, true);
            }
        }
        return "";
    }

    string Content;

    try
    {
        Content = Reply.GetContentAsString();
    }
    catch (Exception ex)
    {
        DebugMsg(string.Format("FetchRobotTextFile: Exception: {0}", ex.Message));
        Content = "";
    }

    // Normalise a null body to the empty string.
    return string.IsNullOrEmpty(Content) ? "" : Content;
}
/** Fetch Robots Text *****************************************************/

// Synchronously downloads the robots.txt body located at RobotsUri.
//
// Returns the response body as a string, or an empty string when:
//   - the hostname check fails,
//   - the request fails (the URI is then recorded in this.BadRobots, once), or
//   - the body cannot be read or is empty.
// Failures are reported through DebugMsg; every exception raised by the
// request or the body read is caught inside this method.
private string FetchRobotTextFile(Uri RobotsUri)
{
    HttpWebResponse res = null;
    string RawData = "";

    if (!MacroscopeDnsTools.CheckValidHostname(Url: RobotsUri.ToString()))
    {
        DebugMsg(string.Format("FetchRobotTextFile :: CheckValidHostname: {0}", "NOT OK"));
        return "";
    }

    try
    {
        HttpWebRequest req = WebRequest.CreateHttp(RobotsUri);
        req.Method = "GET";
        // The "* 1000" suggests the preference is held in seconds; HttpWebRequest.Timeout is in milliseconds.
        req.Timeout = MacroscopePreferencesManager.GetRequestTimeout() * 1000;
        req.KeepAlive = false;
        req.UserAgent = this.UserAgent();
        // NOTE(review): Uri.Host carries no port, so a robots.txt served on a
        // non-default port is requested with a port-less Host header - confirm
        // this is intended.
        req.Host = RobotsUri.Host;
        req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
        MacroscopePreferencesManager.EnableHttpProxy(req);
        res = (HttpWebResponse)req.GetResponse();
    }
    catch (UriFormatException ex)
    {
        DebugMsg(string.Format("UriFormatException: {0}", ex.Message));
        DebugMsg(string.Format("Exception: {0}", RobotsUri.ToString()));
    }
    catch (WebException ex)
    {
        DebugMsg(string.Format("WebException: {0}", ex.Message));
        DebugMsg(string.Format("WebException: {0}", RobotsUri.ToString()));
        DebugMsg(string.Format("WebExceptionStatus: {0}", ex.Status));

        // An HTTP error status (e.g. 404 for a missing robots.txt) surfaces as a
        // WebException that still carries an open response; close it so that the
        // underlying connection is released instead of waiting for finalization.
        if (ex.Response != null)
        {
            ex.Response.Close();
        }
    }
    catch (NotSupportedException ex)
    {
        DebugMsg(string.Format("NotSupportedException: {0}", ex.Message));
        DebugMsg(string.Format("NotSupportedException: {0}", RobotsUri.ToString()));
    }
    catch (Exception ex)
    {
        DebugMsg(string.Format("Exception: {0}", ex.Message));
        DebugMsg(string.Format("Exception: {0}", RobotsUri.ToString()));
    }

    // No response object means the request failed: remember the URI as bad.
    if (res == null)
    {
        lock (this.BadRobots)
        {
            if (!this.BadRobots.ContainsKey(RobotsUri))
            {
                this.BadRobots.Add(RobotsUri, true);
            }
        }
        return "";
    }

    try
    {
        using (Stream ResponseStream = res.GetResponseStream())
        using (StreamReader ReadStream = new StreamReader(ResponseStream))
        {
            RawData = ReadStream.ReadToEnd();
        }
    }
    catch (WebException ex)
    {
        DebugMsg(string.Format("FetchRobotTextFile: WebException: {0}", ex.Message));
        RawData = "";
    }
    catch (Exception ex)
    {
        DebugMsg(string.Format("FetchRobotTextFile: Exception: {0}", ex.Message));
        RawData = "";
    }
    finally
    {
        // Always release the response, whatever happened while reading it.
        res.Close();
    }

    return string.IsNullOrEmpty(RawData) ? "" : RawData;
}
/**************************************************************************/

// Fetches and processes a single URL for the crawl.
//
// Steps, in order:
//   1. Stop with VOID if the configured page limit has been reached.
//   2. Look up the document for Url in the collection, or create it; an
//      existing document with a BASIC realm is re-created with a stored
//      credential when one is available.
//   3. Record redirect origin, hostname validity, robots.txt results and
//      whether the URL is external.
//   4. Stop early for already-seen (non-dirty) or too-deep URLs.
//   5. Execute the document, then queue credential requests, redirect
//      targets or outlinks depending on the outcome.
//
// Url:               the URL to fetch.
// RedirectedFromUrl: optional URL that redirected to Url.
// Returns the resulting MacroscopeConstants.FetchStatus.
private async Task<MacroscopeConstants.FetchStatus> Fetch(string Url, string RedirectedFromUrl = null)
{
    MacroscopeConstants.FetchStatus Outcome = MacroscopeConstants.FetchStatus.VOID;
    MacroscopeDocument Doc = null;

    /** Page limit ------------------------------------------------------- **/

    if (MacroscopePreferencesManager.GetPageLimit() > -1)
    {
        int PagesFound = this.JobMaster.GetPagesFound();
        int PageLimit = MacroscopePreferencesManager.GetPageLimit();

        if (PagesFound >= PageLimit)
        {
            this.DebugMsg(string.Format("PAGE LIMIT REACHED: {0} :: {1}", PageLimit, PagesFound));
            return Outcome;
        }
    }

    /** Resolve the document --------------------------------------------- **/

    if (!this.DocCollection.ContainsDocument(Url: Url))
    {
        Doc = this.DocCollection.CreateDocument(Url: Url);
    }
    else
    {
        Doc = this.DocCollection.GetDocumentByUrl(Url: Url);

        bool HasBasicRealm =
            (Doc.GetAuthenticationRealm() != null)
            && (Doc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC);

        if (HasBasicRealm)
        {
            MacroscopeCredential Credential = this.JobMaster.GetCredentialsHttp().GetCredential(
                Doc.GetHostAndPort(),
                Doc.GetAuthenticationRealm()
            );

            if (Credential != null)
            {
                Doc = this.DocCollection.CreateDocument(
                    Credential: Credential,
                    Url: Url
                );
            }
        }
    }

    if (!string.IsNullOrEmpty(RedirectedFromUrl))
    {
        Doc.SetUrlRedirectFrom(Url: RedirectedFromUrl);
    }

    Doc.SetFetchStatus(MacroscopeConstants.FetchStatus.OK);

    /** Hostname and robots.txt ------------------------------------------ **/

    // An invalid hostname is recorded on the document, but processing continues.
    if (!MacroscopeDnsTools.CheckValidHostname(Url: Url))
    {
        this.DebugMsg(string.Format("Fetch :: CheckValidHostname: {0}", "NOT OK"));
        Doc.SetStatusCode(HttpStatusCode.BadGateway);
        Outcome = MacroscopeConstants.FetchStatus.NETWORK_ERROR;
        Doc.SetFetchStatus(Outcome);
    }

    Doc.SetAllowedByRobots(await this.JobMaster.GetRobots().CheckRobotRule(Url: Url));

    // A false result from ApplyRobotRule() is treated as "disallowed".
    bool PermittedByRobots = await this.JobMaster.GetRobots().ApplyRobotRule(Url: Url);

    if (PermittedByRobots)
    {
        this.JobMaster.RemoveFromBlockedByRobots(Url);
    }
    else
    {
        this.DebugMsg(string.Format("Disallowed by robots.txt: {0}", Url));
        this.JobMaster.AddToBlockedByRobots(Url);
        Outcome = MacroscopeConstants.FetchStatus.ROBOTS_DISALLOWED;
        Doc.SetFetchStatus(Outcome);
        JobHistory.VisitedHistoryItem(Url: Doc.GetUrl());
    }

    if (this.AllowedHosts.IsExternalUrl(Url: Url))
    {
        this.DebugMsg(string.Format("IsExternalUrl: {0}", Url));
        Doc.SetIsExternal(State: true);
    }

    /** Early exits ------------------------------------------------------ **/

    if (this.DocCollection.ContainsDocument(Url: Url)
        && !this.DocCollection.GetDocumentByUrl(Url: Url).GetIsDirty())
    {
        return MacroscopeConstants.FetchStatus.ALREADY_SEEN;
    }

    if (MacroscopePreferencesManager.GetDepth() >= 0)
    {
        int Depth = MacroscopeHttpUrlUtils.FindUrlDepth(Url: Url);

        if (Depth > MacroscopePreferencesManager.GetDepth())
        {
            this.DebugMsg(string.Format("URL Too Deep: {0}", Depth));
            return MacroscopeConstants.FetchStatus.SKIPPED;
        }
    }

    /** Execute ---------------------------------------------------------- **/

    if (!await Doc.Execute())
    {
        this.DebugMsg(string.Format("EXECUTE FAILED: {0}", Url));
        Outcome = MacroscopeConstants.FetchStatus.ERROR;
    }

    /** Post-processing (runs whether or not Execute() succeeded) -------- **/

    bool NeedsBasicCredentials =
        (Doc.GetStatusCode() == HttpStatusCode.Unauthorized)
        && (Doc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC);

    if (NeedsBasicCredentials)
    {
        MacroscopeCredentialsHttp CredentialsHttp = this.JobMaster.GetCredentialsHttp();

        CredentialsHttp.EnqueueCredentialRequest(
            Domain: Doc.GetHostAndPort(),
            Realm: Doc.GetAuthenticationRealm(),
            Url: Doc.GetUrl()
        );

        // Re-queue the URL so that it is fetched again.
        this.JobMaster.AddUrlQueueItem(Url: Doc.GetUrl());
    }

    if (!Doc.GetIsRedirect())
    {
        this.ProcessHrefLangLanguages(Doc);           // Process Languages from HrefLang
        this.JobMaster.ProcessOutlinks(msDoc: Doc);   // Process Outlinks from document
    }
    else
    {
        this.DebugMsg(string.Format("REDIRECTION DETECTED GetUrl: {0}", Doc.GetUrl()));
        this.DebugMsg(string.Format("REDIRECTION DETECTED From: {0}", Doc.GetUrlRedirectFrom()));

        if (MacroscopePreferencesManager.GetCheckRedirects())
        {
            // SourceHost and OriginHost are not used below; the calls are kept as in the original flow.
            string SourceHost = Doc.GetHostAndPort();
            string OriginHost = MacroscopeAllowedHosts.ParseHostnameFromUrl(Doc.GetUrlRedirectFrom());
            string UrlRedirectTo = Doc.GetUrlRedirectTo();
            string TargetHost = MacroscopeAllowedHosts.ParseHostnameFromUrl(UrlRedirectTo);

            this.DebugMsg(string.Format("REDIRECTION DETECTED UrlRedirectTo: {0}", UrlRedirectTo));
            this.DebugMsg(string.Format("REDIRECTION DETECTED HostnameTo: {0}", TargetHost));

            // When following redirects, the target host is whitelisted either
            // unconditionally (external links are checked) or only if it is
            // already considered internal.
            if (MacroscopePreferencesManager.GetFollowRedirects()
                && (MacroscopePreferencesManager.GetCheckExternalLinks()
                    || this.AllowedHosts.IsInternalUrl(Url: UrlRedirectTo)))
            {
                this.AllowedHosts.AddFromUrl(Url: UrlRedirectTo);
            }
        }

        this.JobMaster.AddUrlQueueItem(Url: Doc.GetUrlRedirectTo());
    }

    // NOTE(review): this assignment is unconditional, so it overwrites any
    // ERROR / NETWORK_ERROR / ROBOTS_DISALLOWED value set above - confirm
    // that this is intended.
    Outcome = MacroscopeConstants.FetchStatus.SUCCESS;

    /** History ---------------------------------------------------------- **/

    if (DocCollection.ContainsDocument(msDoc: Doc))
    {
        JobHistory.VisitedHistoryItem(Url: Url);
    }
    else
    {
        this.DebugMsg(string.Format("OOPS: {0}", Url));
    }

    return Outcome;
}
/**************************************************************************/

// Synchronously fetches and processes a single URL for the crawl.
//
// Steps, in order:
//   1. Look up the document for Url in the collection, or create it; an
//      existing document with a BASIC realm is re-created with a stored
//      credential when one is available.
//   2. Record hostname validity, the robots.txt result, a history entry
//      and whether the URL is external.
//   3. Stop early for already-seen (non-dirty) or too-deep URLs.
//   4. Execute the document; on success store it and queue credential
//      requests, redirect targets or outlinks as appropriate.
//
// Returns the resulting MacroscopeConstants.FetchStatus.
private MacroscopeConstants.FetchStatus Fetch(string Url)
{
    MacroscopeDocument Doc = this.DocCollection.GetDocument(Url);

    /** Resolve the document --------------------------------------------- **/

    if (Doc == null)
    {
        Doc = this.DocCollection.CreateDocument(Url);
    }
    else if ((Doc.GetAuthenticationRealm() != null)
             && (Doc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC))
    {
        MacroscopeCredential Credential = this.JobMaster.GetCredentialsHttp().GetCredential(
            Doc.GetHostAndPort(),
            Doc.GetAuthenticationRealm()
        );

        if (Credential != null)
        {
            Doc = this.DocCollection.CreateDocument(
                Credential: Credential,
                Url: Url
            );
        }
    }

    Doc.SetFetchStatus(MacroscopeConstants.FetchStatus.OK);

    /** Hostname and robots.txt ------------------------------------------ **/

    // An invalid hostname is recorded on the document, but processing continues.
    if (!MacroscopeDnsTools.CheckValidHostname(Url: Url))
    {
        DebugMsg(string.Format("Fetch :: CheckValidHostname: {0}", "NOT OK"));
        Doc.SetStatusCode(HttpStatusCode.BadGateway);
        Doc.SetFetchStatus(MacroscopeConstants.FetchStatus.NETWORK_ERROR);
    }

    // A false result from ApplyRobotRule() is treated as "disallowed".
    if (this.JobMaster.GetRobots().ApplyRobotRule(Url))
    {
        this.JobMaster.RemoveFromBlockedByRobots(Url);
    }
    else
    {
        DebugMsg(string.Format("Disallowed by robots.txt: {0}", Url));
        this.JobMaster.AddToBlockedByRobots(Url);
        Doc.SetFetchStatus(MacroscopeConstants.FetchStatus.ROBOTS_DISALLOWED);
        this.JobMaster.GetJobHistory().VisitedHistoryItem(Doc.GetUrl());
    }

    this.JobMaster.GetJobHistory().AddHistoryItem(Url);

    if (this.AllowedHosts.IsExternalUrl(Url: Url))
    {
        DebugMsg(string.Format("IsExternalUrl: {0}", Url));
        Doc.SetIsExternal(State: true);
    }

    /** Early exits ------------------------------------------------------ **/

    if (this.DocCollection.ContainsDocument(Url)
        && !this.DocCollection.GetDocument(Url).GetIsDirty())
    {
        return MacroscopeConstants.FetchStatus.ALREADY_SEEN;
    }

    if (this.JobMaster.GetDepth() > 0)
    {
        int Depth = MacroscopeUrlUtils.FindUrlDepth(Url);

        if (Depth > this.JobMaster.GetDepth())
        {
            DebugMsg(string.Format("TOO DEEP: {0}", Depth));
            return MacroscopeConstants.FetchStatus.SKIPPED;
        }
    }

    /** Execute ---------------------------------------------------------- **/

    if (!Doc.Execute())
    {
        DebugMsg(string.Format("EXECUTE FAILED: {0}", Url));
        return MacroscopeConstants.FetchStatus.ERROR;
    }

    this.DocCollection.AddDocument(Url, Doc);

    bool NeedsBasicCredentials =
        (Doc.GetStatusCode() == HttpStatusCode.Unauthorized)
        && (Doc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC);

    if (NeedsBasicCredentials)
    {
        MacroscopeCredentialsHttp CredentialsHttp = this.JobMaster.GetCredentialsHttp();

        CredentialsHttp.EnqueueCredentialRequest(
            Domain: Doc.GetHostAndPort(),
            Realm: Doc.GetAuthenticationRealm(),
            Url: Doc.GetUrl()
        );

        // Re-queue the URL so that it is fetched again.
        this.JobMaster.AddUrlQueueItem(Url: Doc.GetUrl());
    }

    this.JobMaster.GetJobHistory().VisitedHistoryItem(Doc.GetUrl());
    this.JobMaster.IncPageLimitCount();

    if (!Doc.GetIsRedirect())
    {
        this.ProcessHrefLangLanguages(Doc);   // Process Languages from HrefLang
        this.ProcessOutlinks(Doc);            // Process Outlinks from document
    }
    else
    {
        DebugMsg(string.Format("REDIRECTION DETECTED GetUrl: {0}", Doc.GetUrl()));
        DebugMsg(string.Format("REDIRECTION DETECTED From: {0}", Doc.GetUrlRedirectFrom()));

        if (MacroscopePreferencesManager.GetFollowRedirects())
        {
            // SourceHost and OriginHost are not used below; the calls are kept as in the original flow.
            string SourceHost = Doc.GetHostAndPort();
            string OriginHost = MacroscopeAllowedHosts.ParseHostnameFromUrl(Doc.GetUrlRedirectFrom());
            string UrlRedirectTo = Doc.GetUrlRedirectTo();
            string TargetHost = MacroscopeAllowedHosts.ParseHostnameFromUrl(UrlRedirectTo);

            DebugMsg(string.Format("REDIRECTION DETECTED UrlRedirectTo: {0}", UrlRedirectTo));
            DebugMsg(string.Format("REDIRECTION DETECTED HostnameTo: {0}", TargetHost));
        }

        // The redirect target is queued regardless of the follow-redirects preference.
        this.JobMaster.AddUrlQueueItem(Url: Doc.GetUrlRedirectTo());
    }

    return MacroscopeConstants.FetchStatus.SUCCESS;
}