public void TestCheckValidHostname()
        {
            // Each URL under test is paired with the result that
            // MacroscopeDnsTools.CheckValidHostname is expected to return for it.
            SortedDictionary <string, bool> Expectations = new SortedDictionary <string, bool> ()
            {
                { "https://nazuke.github.io/SEOMacroscope/", true },
                { "https://bogus.bogus.com/some/path/index.html", false },
                { "https://www.google.com/", true }
            };

            // Walking the key/value pairs visits the entries in the same sorted key
            // order as walking Keys and indexing back into the dictionary.
            foreach (KeyValuePair <string, bool> Expectation in Expectations)
            {
                string Candidate = Expectation.Key;
                bool   Actual    = MacroscopeDnsTools.CheckValidHostname(Url: Candidate);

                Assert.AreEqual(Expectation.Value, Actual, string.Format("FAIL: {0}", Candidate));
            }
        }
        /** Fetch Robots Text *****************************************************/

        /// <summary>
        /// Requests <paramref name="RobotsUri"/> through this.Client and returns the
        /// response content as a string.
        /// </summary>
        /// <param name="RobotsUri">Location of the robots text file to request.</param>
        /// <returns>
        /// The response content when a response was obtained and its content is
        /// non-empty; otherwise an empty string.
        /// </returns>
        private async Task <string> FetchRobotTextFile(Uri RobotsUri)
        {
            MacroscopeHttpTwoClientResponse ClientResponse = null;
            string Content = "";

            // Do not issue the request at all when the hostname check fails.
            if (!MacroscopeDnsTools.CheckValidHostname(Url: RobotsUri.ToString()))
            {
                DebugMsg(string.Format("FetchRobotTextFile :: CheckValidHostname: {0}", "NOT OK"));
                return("");
            }

            try
            {
                ClientResponse = await this.Client.Get(
                    RobotsUri,
                    this.ConfigureHeadRequestHeadersCallback,
                    this.PostProcessRequestHttpHeadersCallback
                    );
            }
            catch (MacroscopeDocumentException ex)
            {
                DebugMsg(string.Format("MacroscopeDocumentException: {0}", ex.Message));
                DebugMsg(string.Format("MacroscopeDocumentException: {0}", RobotsUri.ToString()));
            }
            catch (Exception ex)
            {
                DebugMsg(string.Format("Exception: {0}", ex.Message));
                DebugMsg(string.Format("Exception: {0}", RobotsUri.ToString()));
            }

            // A null response covers both "the request threw" and "the client returned
            // nothing"; either way the URI is recorded in BadRobots exactly once.
            if (ClientResponse == null)
            {
                lock (this.BadRobots)
                {
                    if (!this.BadRobots.ContainsKey(RobotsUri))
                    {
                        this.BadRobots.Add(RobotsUri, true);
                    }
                }

                return("");
            }

            try
            {
                Content = ClientResponse.GetContentAsString();
            }
            catch (Exception ex)
            {
                DebugMsg(string.Format("FetchRobotTextFile: Exception: {0}", ex.Message));
                Content = "";
            }

            // Null or empty content is reported to the caller as an empty string.
            return(string.IsNullOrEmpty(Content) ? "" : Content);
        }
示例#3
0
        /** Fetch Robots Text *****************************************************/

        /// <summary>
        /// Downloads the robots text file at <paramref name="RobotsUri"/> with a
        /// blocking HTTP GET request.
        /// </summary>
        /// <param name="RobotsUri">Location of the robots text file to request.</param>
        /// <returns>
        /// The response body when the request succeeded and the body is non-empty;
        /// otherwise an empty string. Request and read failures are logged through
        /// DebugMsg and are not rethrown.
        /// </returns>
        private string FetchRobotTextFile(Uri RobotsUri)
        {
            bool            Proceed   = false;
            HttpWebRequest  req       = null;
            HttpWebResponse res       = null;
            string          RobotText = "";
            string          RawData   = "";

            // Do not issue the request at all when the hostname check fails.
            if (!MacroscopeDnsTools.CheckValidHostname(Url: RobotsUri.ToString()))
            {
                DebugMsg(string.Format("FetchRobotTextFile :: CheckValidHostname: {0}", "NOT OK"));
                return(RobotText);
            }

            try
            {
                req           = WebRequest.CreateHttp(RobotsUri);
                req.Method    = "GET";
                req.Timeout   = MacroscopePreferencesManager.GetRequestTimeout() * 1000;
                req.KeepAlive = false;
                req.UserAgent = this.UserAgent();
                // NOTE(review): Uri.Host does not include the port, so the Host header
                // omits it for URIs on a non-default port - confirm this is intended.
                req.Host      = RobotsUri.Host;
                req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                MacroscopePreferencesManager.EnableHttpProxy(req);

                res = ( HttpWebResponse )req.GetResponse();

                Proceed = true;
            }
            catch (UriFormatException ex)
            {
                DebugMsg(string.Format("UriFormatException: {0}", ex.Message));
                // Label the URL line with the exception name, as the other handlers do.
                DebugMsg(string.Format("UriFormatException: {0}", RobotsUri.ToString()));
            }
            catch (WebException ex)
            {
                DebugMsg(string.Format("WebException: {0}", ex.Message));
                DebugMsg(string.Format("WebException: {0}", RobotsUri.ToString()));
                DebugMsg(string.Format("WebExceptionStatus: {0}", ex.Status));
            }
            catch (NotSupportedException ex)
            {
                DebugMsg(string.Format("NotSupportedException: {0}", ex.Message));
                DebugMsg(string.Format("NotSupportedException: {0}", RobotsUri.ToString()));
            }
            catch (Exception ex)
            {
                DebugMsg(string.Format("Exception: {0}", ex.Message));
                DebugMsg(string.Format("Exception: {0}", RobotsUri.ToString()));
            }

            if ((Proceed) && (res != null))
            {
                // The using statements release the response, its stream and the reader
                // on every path, including when reading the body throws.
                using (res)
                {
                    try
                    {
                        using (Stream ResponseStream = res.GetResponseStream())
                        using (StreamReader ReadStream = new StreamReader(ResponseStream))
                        {
                            RawData = ReadStream.ReadToEnd();
                        }
                    }
                    catch (WebException ex)
                    {
                        DebugMsg(string.Format("FetchRobotTextFile: WebException: {0}", ex.Message));
                        RawData = "";
                    }
                    catch (Exception ex)
                    {
                        DebugMsg(string.Format("FetchRobotTextFile: Exception: {0}", ex.Message));
                        RawData = "";
                    }
                }
            }
            else
            {
                // The request failed: record the URI in BadRobots exactly once.
                lock (this.BadRobots)
                {
                    if (!this.BadRobots.ContainsKey(RobotsUri))
                    {
                        this.BadRobots.Add(RobotsUri, true);
                    }
                }
            }

            if (!string.IsNullOrEmpty(RawData))
            {
                RobotText = RawData;
            }

            return(RobotText);
        }
示例#4
0
        /**************************************************************************/

        /// <summary>
        /// Fetches the document for <paramref name="Url"/>: applies the page limit,
        /// hostname, robots and depth checks, executes the document, then queues
        /// follow-up work (credential requests, redirect targets, outlinks).
        /// </summary>
        /// <param name="Url">URL of the document to fetch.</param>
        /// <param name="RedirectedFromUrl">
        /// When non-empty, recorded on the document as the URL it was redirected from.
        /// </param>
        /// <returns>
        /// VOID when the page limit has been reached, ALREADY_SEEN when the collection
        /// holds a non-dirty document for the URL, SKIPPED when the URL is deeper than
        /// the configured depth, ERROR when executing the document fails, and SUCCESS
        /// otherwise.
        /// </returns>
        private async Task <MacroscopeConstants.FetchStatus> Fetch(string Url, string RedirectedFromUrl = null)
        {
            MacroscopeDocument msDoc = null;

            MacroscopeConstants.FetchStatus FetchStatus = MacroscopeConstants.FetchStatus.VOID;
            bool AllowedByRobotsRule;
            bool Executed;

            // A page limit of -1 or lower means no limit is enforced here.
            int PageLimit = MacroscopePreferencesManager.GetPageLimit();

            if (PageLimit > -1)
            {
                int PagesFound = this.JobMaster.GetPagesFound();
                if (PagesFound >= PageLimit)
                {
                    this.DebugMsg(string.Format("PAGE LIMIT REACHED: {0} :: {1}", PageLimit, PagesFound));
                    return(FetchStatus);
                }
            }

            if (this.DocCollection.ContainsDocument(Url: Url))
            {
                msDoc = this.DocCollection.GetDocumentByUrl(Url: Url);

                // A known document that reported a BASIC authentication realm is
                // re-created with the stored credential, when one is available.
                if (msDoc.GetAuthenticationRealm() != null)
                {
                    if (msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC)
                    {
                        MacroscopeCredential Credential;

                        Credential = this.JobMaster.GetCredentialsHttp().GetCredential(
                            msDoc.GetHostAndPort(),
                            msDoc.GetAuthenticationRealm()
                            );

                        if (Credential != null)
                        {
                            msDoc = this.DocCollection.CreateDocument(
                                Credential: Credential,
                                Url: Url
                                );
                        }
                    }
                }
            }
            else
            {
                msDoc = this.DocCollection.CreateDocument(Url: Url);
            }

            if (!string.IsNullOrEmpty(RedirectedFromUrl))
            {
                msDoc.SetUrlRedirectFrom(Url: RedirectedFromUrl);
            }

            msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.OK);

            // A failed hostname check is recorded on the document; processing continues.
            if (!MacroscopeDnsTools.CheckValidHostname(Url: Url))
            {
                this.DebugMsg(string.Format("Fetch :: CheckValidHostname: {0}", "NOT OK"));
                msDoc.SetStatusCode(HttpStatusCode.BadGateway);
                FetchStatus = MacroscopeConstants.FetchStatus.NETWORK_ERROR;
                msDoc.SetFetchStatus(FetchStatus);
            }

            msDoc.SetAllowedByRobots(await this.JobMaster.GetRobots().CheckRobotRule(Url: Url));

            // ApplyRobotRule yields false when the URL is disallowed by robots.txt.
            AllowedByRobotsRule = await this.JobMaster.GetRobots().ApplyRobotRule(Url: Url);

            if (!AllowedByRobotsRule)
            {
                this.DebugMsg(string.Format("Disallowed by robots.txt: {0}", Url));

                this.JobMaster.AddToBlockedByRobots(Url);

                FetchStatus = MacroscopeConstants.FetchStatus.ROBOTS_DISALLOWED;

                msDoc.SetFetchStatus(FetchStatus);

                JobHistory.VisitedHistoryItem(Url: msDoc.GetUrl());
            }
            else
            {
                this.JobMaster.RemoveFromBlockedByRobots(Url);
            }

            if (this.AllowedHosts.IsExternalUrl(Url: Url))
            {
                this.DebugMsg(string.Format("IsExternalUrl: {0}", Url));
                msDoc.SetIsExternal(State: true);
            }

            if (this.DocCollection.ContainsDocument(Url: Url))
            {
                if (!this.DocCollection.GetDocumentByUrl(Url: Url).GetIsDirty())
                {
                    FetchStatus = MacroscopeConstants.FetchStatus.ALREADY_SEEN;
                    return(FetchStatus);
                }
            }

            if (MacroscopePreferencesManager.GetDepth() >= 0)
            {
                int Depth = MacroscopeHttpUrlUtils.FindUrlDepth(Url: Url);
                if (Depth > MacroscopePreferencesManager.GetDepth())
                {
                    this.DebugMsg(string.Format("URL Too Deep: {0}", Depth));
                    FetchStatus = MacroscopeConstants.FetchStatus.SKIPPED;
                    return(FetchStatus);
                }
            }

            /** ------------------------------------------------------------------ **/

            Executed = await msDoc.Execute();

            if (!Executed)
            {
                this.DebugMsg(string.Format("EXECUTE FAILED: {0}", Url));
                FetchStatus = MacroscopeConstants.FetchStatus.ERROR;
            }

            /** ------------------------------------------------------------------ **/

            // The post-processing below runs whether or not Execute succeeded, as it
            // did before; only the final status now reflects the Execute result.

            if (msDoc.GetStatusCode() == HttpStatusCode.Unauthorized)
            {
                if (msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC)
                {
                    MacroscopeCredentialsHttp CredentialsHttp = this.JobMaster.GetCredentialsHttp();

                    CredentialsHttp.EnqueueCredentialRequest(
                        Domain: msDoc.GetHostAndPort(),
                        Realm: msDoc.GetAuthenticationRealm(),
                        Url: msDoc.GetUrl()
                        );

                    // Re-queue the URL so it is fetched again after the credential request.
                    this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrl());
                }
            }

            if (msDoc.GetIsRedirect())
            {
                this.DebugMsg(string.Format("REDIRECTION DETECTED GetUrl: {0}", msDoc.GetUrl()));
                this.DebugMsg(string.Format("REDIRECTION DETECTED From: {0}", msDoc.GetUrlRedirectFrom()));

                if (MacroscopePreferencesManager.GetCheckRedirects())
                {
                    // Hostname and HostnameFrom are not read below; they are kept so the
                    // calls that compute them still happen.
                    string Hostname      = msDoc.GetHostAndPort();
                    string HostnameFrom  = MacroscopeAllowedHosts.ParseHostnameFromUrl(msDoc.GetUrlRedirectFrom());
                    string UrlRedirectTo = msDoc.GetUrlRedirectTo();
                    string HostnameTo    = MacroscopeAllowedHosts.ParseHostnameFromUrl(UrlRedirectTo);

                    this.DebugMsg(string.Format("REDIRECTION DETECTED UrlRedirectTo: {0}", UrlRedirectTo));
                    this.DebugMsg(string.Format("REDIRECTION DETECTED HostnameTo: {0}", HostnameTo));

                    if (MacroscopePreferencesManager.GetFollowRedirects())
                    {
                        // With external link checking on, every redirect target is added
                        // to the allowed hosts; otherwise only internal targets are.
                        if (MacroscopePreferencesManager.GetCheckExternalLinks())
                        {
                            this.AllowedHosts.AddFromUrl(Url: UrlRedirectTo);
                        }
                        else
                        {
                            if (this.AllowedHosts.IsInternalUrl(Url: UrlRedirectTo))
                            {
                                this.AllowedHosts.AddFromUrl(Url: UrlRedirectTo);
                            }
                        }
                    }
                }

                this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrlRedirectTo());
            }
            else
            {
                this.ProcessHrefLangLanguages(msDoc);         // Process Languages from HrefLang

                this.JobMaster.ProcessOutlinks(msDoc: msDoc); // Process Outlinks from document
            }

            // Report SUCCESS only when Execute succeeded, so the ERROR status set above
            // is no longer overwritten.
            if (Executed)
            {
                FetchStatus = MacroscopeConstants.FetchStatus.SUCCESS;
            }

            /** ------------------------------------------------------------------ **/

            if (DocCollection.ContainsDocument(msDoc: msDoc))
            {
                JobHistory.VisitedHistoryItem(Url: Url);
            }
            else
            {
                this.DebugMsg(string.Format("OOPS: {0}", Url));
            }

            /** ------------------------------------------------------------------ **/

            return(FetchStatus);
        }
        /**************************************************************************/

        /// <summary>
        /// Fetches the document for <paramref name="Url"/>: applies the hostname,
        /// robots and depth checks, executes the document, then queues follow-up work
        /// (credential requests, redirect targets, outlinks).
        /// </summary>
        /// <param name="Url">URL of the document to fetch.</param>
        /// <returns>
        /// ALREADY_SEEN when the collection holds a non-dirty document for the URL,
        /// SKIPPED when the URL is deeper than the job's depth, ERROR when executing
        /// the document fails, and SUCCESS otherwise.
        /// </returns>
        private MacroscopeConstants.FetchStatus Fetch(string Url)
        {
            MacroscopeDocument Doc = this.DocCollection.GetDocument(Url);

            if (Doc == null)
            {
                Doc = this.DocCollection.CreateDocument(Url);
            }
            else if ((Doc.GetAuthenticationRealm() != null)
                     && (Doc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC))
            {
                // A known document that reported a BASIC authentication realm is
                // re-created with the stored credential, when one is available.
                MacroscopeCredential StoredCredential = this.JobMaster.GetCredentialsHttp().GetCredential(
                    Doc.GetHostAndPort(),
                    Doc.GetAuthenticationRealm()
                    );

                if (StoredCredential != null)
                {
                    Doc = this.DocCollection.CreateDocument(
                        Credential: StoredCredential,
                        Url: Url
                        );
                }
            }

            Doc.SetFetchStatus(MacroscopeConstants.FetchStatus.OK);

            // A failed hostname check is recorded on the document; processing continues.
            if (!MacroscopeDnsTools.CheckValidHostname(Url: Url))
            {
                DebugMsg(string.Format("Fetch :: CheckValidHostname: {0}", "NOT OK"));
                Doc.SetStatusCode(HttpStatusCode.BadGateway);
                Doc.SetFetchStatus(MacroscopeConstants.FetchStatus.NETWORK_ERROR);
            }

            // ApplyRobotRule yields false when the URL is disallowed by robots.txt.
            if (this.JobMaster.GetRobots().ApplyRobotRule(Url))
            {
                this.JobMaster.RemoveFromBlockedByRobots(Url);
            }
            else
            {
                DebugMsg(string.Format("Disallowed by robots.txt: {0}", Url));
                this.JobMaster.AddToBlockedByRobots(Url);
                Doc.SetFetchStatus(MacroscopeConstants.FetchStatus.ROBOTS_DISALLOWED);
                this.JobMaster.GetJobHistory().VisitedHistoryItem(Doc.GetUrl());
            }

            this.JobMaster.GetJobHistory().AddHistoryItem(Url);

            if (this.AllowedHosts.IsExternalUrl(Url: Url))
            {
                DebugMsg(string.Format("IsExternalUrl: {0}", Url));
                Doc.SetIsExternal(State: true);
            }

            // A document already in the collection is only fetched again when dirty.
            if (this.DocCollection.ContainsDocument(Url)
                && !this.DocCollection.GetDocument(Url).GetIsDirty())
            {
                return(MacroscopeConstants.FetchStatus.ALREADY_SEEN);
            }

            // The depth check is only applied when the job's depth is above zero.
            if (this.JobMaster.GetDepth() > 0)
            {
                int UrlDepth = MacroscopeUrlUtils.FindUrlDepth(Url);

                if (UrlDepth > this.JobMaster.GetDepth())
                {
                    DebugMsg(string.Format("TOO DEEP: {0}", UrlDepth));
                    return(MacroscopeConstants.FetchStatus.SKIPPED);
                }
            }

            if (!Doc.Execute())
            {
                DebugMsg(string.Format("EXECUTE FAILED: {0}", Url));
                return(MacroscopeConstants.FetchStatus.ERROR);
            }

            this.DocCollection.AddDocument(Url, Doc);

            if ((Doc.GetStatusCode() == HttpStatusCode.Unauthorized)
                && (Doc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC))
            {
                MacroscopeCredentialsHttp HttpCredentials = this.JobMaster.GetCredentialsHttp();

                HttpCredentials.EnqueueCredentialRequest(
                    Domain: Doc.GetHostAndPort(),
                    Realm: Doc.GetAuthenticationRealm(),
                    Url: Doc.GetUrl()
                    );

                // Re-queue the URL so it is fetched again after the credential request.
                this.JobMaster.AddUrlQueueItem(Url: Doc.GetUrl());
            }

            this.JobMaster.GetJobHistory().VisitedHistoryItem(Doc.GetUrl());

            this.JobMaster.IncPageLimitCount();

            if (!Doc.GetIsRedirect())
            {
                this.ProcessHrefLangLanguages(Doc); // Process Languages from HrefLang

                this.ProcessOutlinks(Doc);          // Process Outlinks from document
            }
            else
            {
                DebugMsg(string.Format("REDIRECTION DETECTED GetUrl: {0}", Doc.GetUrl()));
                DebugMsg(string.Format("REDIRECTION DETECTED From: {0}", Doc.GetUrlRedirectFrom()));

                // The follow-redirects preference only controls the extra logging here;
                // the redirect target is queued either way.
                if (MacroscopePreferencesManager.GetFollowRedirects())
                {
                    string SourceHost   = Doc.GetHostAndPort();
                    string OriginHost   = MacroscopeAllowedHosts.ParseHostnameFromUrl(Doc.GetUrlRedirectFrom());
                    string TargetUrl    = Doc.GetUrlRedirectTo();
                    string TargetHost   = MacroscopeAllowedHosts.ParseHostnameFromUrl(TargetUrl);

                    DebugMsg(string.Format("REDIRECTION DETECTED UrlRedirectTo: {0}", TargetUrl));
                    DebugMsg(string.Format("REDIRECTION DETECTED HostnameTo: {0}", TargetHost));
                }

                this.JobMaster.AddUrlQueueItem(Url: Doc.GetUrlRedirectTo());
            }

            return(MacroscopeConstants.FetchStatus.SUCCESS);
        }