예제 #1
0
        /** Crawl Delay ***********************************************************/

        /// <summary>
        /// Looks up the crawl delay declared by the robots rules that apply to
        /// <paramref name="Url"/>.
        /// </summary>
        /// <param name="Url">URL whose robots rules are fetched via FetchRobot.</param>
        /// <returns>
        /// The robots crawl-delay value divided by 1000 (integer division), or 0
        /// when the robots protocol is not followed, no robots rules could be
        /// fetched, or no positive delay is declared.
        /// </returns>
        public async Task <int> GetCrawlDelay(string Url)
        {
            if (!MacroscopePreferencesManager.GetFollowRobotsProtocol())
            {
                return(0);
            }

            Robots SiteRobots = await this.FetchRobot(Url : Url);

            if (SiteRobots == null)
            {
                return(0);
            }

            // Ask for the delay of our own user agent first; a result of 0 makes
            // us fall back to the wildcard ("*") user agent.
            long RobotsDelay = SiteRobots.CrawlDelay(userAgent: this.UserAgentName());

            if (RobotsDelay == 0)
            {
                RobotsDelay = SiteRobots.CrawlDelay("*");
            }

            // NOTE(review): the division by 1000 suggests the library reports
            // milliseconds and callers expect seconds - confirm. Fractions are truncated.
            int ScaledDelay = (RobotsDelay > 0) ? (int)(RobotsDelay / 1000) : 0;

            DebugMsg(string.Format("ROBOTS CrawlDelayTime: {0}", RobotsDelay));
            DebugMsg(string.Format("ROBOTS Delay: {0}", ScaledDelay));

            return(ScaledDelay);
        }
예제 #2
0
        /** Sitemaps **************************************************************/

        /// <summary>
        /// Collects the sitemap URLs listed by the robots rules that apply to
        /// <paramref name="Url"/>, each made absolute against <paramref name="Url"/>.
        /// </summary>
        /// <param name="Url">URL whose robots rules are fetched via FetchRobot.</param>
        /// <returns>
        /// A list of sitemap URLs; empty when the robots protocol is not followed,
        /// no robots rules are available, or no sitemaps are declared. Never null.
        /// </returns>
        public async Task <List <string> > GetSitemapsAsList(string Url)
        {
            List <string> Collected = new List <string>();

            if (!MacroscopePreferencesManager.GetFollowRobotsProtocol())
            {
                return(Collected);
            }

            // The fetch itself is deliberately outside the try block below:
            // only failures while reading the sitemap entries are swallowed.
            Robots SiteRobots = await this.FetchRobot(Url : Url);

            try
            {
                if ((SiteRobots == null) || (SiteRobots.Sitemaps == null))
                {
                    return(Collected);
                }

                foreach (Sitemap Entry in SiteRobots.Sitemaps)
                {
                    string DeclaredUrl = Entry.Url.ToString();

                    Collected.Add(MacroscopeHttpUrlUtils.MakeUrlAbsolute(BaseUrl: Url, Url: DeclaredUrl));

                    this.DebugMsg(string.Format("ROBOTS SitemapUrl: {0}", DeclaredUrl));
                }
            }
            catch (Exception ex)
            {
                // Best effort: entries gathered before the failure are still returned.
                this.DebugMsg(ex.Message);
            }

            return(Collected);
        }
예제 #3
0
        /**************************************************************************/

        /// <summary>
        /// Creates a worker bound to <paramref name="JobMaster"/>, copying the
        /// job master's document collection, allowed hosts and include/exclude
        /// URL settings, and choosing the crawl delay to use.
        /// </summary>
        /// <param name="JobMaster">Job master that owns this worker.</param>
        public MacroscopeJobWorker(MacroscopeJobMaster JobMaster)
        {
            this.SuppressDebugMsg = true;

            this.JobMaster          = JobMaster;
            this.DocCollection      = JobMaster.GetDocCollection();
            this.AllowedHosts       = JobMaster.GetAllowedHosts();
            this.IncludeExcludeUrls = JobMaster.GetIncludeExcludeUrls();

            // Start from the delay configured in the preferences, if any.
            if (MacroscopePreferencesManager.GetCrawlDelay() > 0)
            {
                this.CrawlDelay = MacroscopePreferencesManager.GetCrawlDelay();
            }

            // When the robots protocol is followed, a positive delay reported by
            // the job master takes precedence over the preferences value.
            if (MacroscopePreferencesManager.GetFollowRobotsProtocol() && (JobMaster.GetCrawlDelay() > 0))
            {
                this.CrawlDelay = JobMaster.GetCrawlDelay();
            }
        }
예제 #4
0
        /** -------------------------------------------------------------------- **/

        /// <summary>
        /// Decides whether <paramref name="Url"/> may be crawled under the robots
        /// protocol setting.
        /// </summary>
        /// <param name="Url">URL to check.</param>
        /// <returns>
        /// true when the robots protocol is not followed; otherwise the result of
        /// CheckRobotRule for <paramref name="Url"/>.
        /// </returns>
        public async Task <bool> ApplyRobotRule(string Url)
        {
            if (!MacroscopePreferencesManager.GetFollowRobotsProtocol())
            {
                return(true);
            }

            bool RuleResult = await this.CheckRobotRule(Url : Url);

            return(RuleResult);
        }
예제 #5
0
        /** Generate Robot URL ****************************************************/

        /// <summary>
        /// Builds the absolute robots.txt URL for the scheme, host and port of
        /// <paramref name="Url"/>.
        /// </summary>
        /// <param name="Url">Absolute URL of any resource on the target host.</param>
        /// <returns>
        /// The robots.txt URL, or null when the robots protocol is not followed,
        /// <paramref name="Url"/> is null, or <paramref name="Url"/> cannot be
        /// parsed as an absolute URI.
        /// </returns>
        public static string GenerateRobotUrl(string Url)
        {
            if (!MacroscopePreferencesManager.GetFollowRobotsProtocol())
            {
                // Log "disabled" only when the protocol really is disabled.
                DebugMsgStatic(string.Format("ROBOTS Disabled: {0}", Url));
                return(null);
            }

            if (Url == null)
            {
                // new Uri(null, ...) throws ArgumentNullException, which the
                // handlers below do not catch; treat it like any unparsable URL.
                DebugMsgStatic("GenerateRobotUrl: Url is null");
                return(null);
            }

            string RobotUrl = null;

            try
            {
                Uri    BaseUri     = new Uri(Url, UriKind.Absolute);
                string BaseUriPort = "";

                if (BaseUri.Port > 0)
                {
                    BaseUriPort = string.Format(":{0}", BaseUri.Port);
                }

                Uri RobotsUri = new Uri(
                    string.Format(
                        "{0}://{1}{2}{3}",
                        BaseUri.Scheme,
                        BaseUri.Host,
                        BaseUriPort,
                        "/robots.txt"
                        ),
                    UriKind.Absolute
                    );

                RobotUrl = RobotsUri.ToString();
            }
            catch (InvalidOperationException ex)
            {
                DebugMsgStatic(string.Format("GenerateRobotUrl: {0}", ex.Message));
            }
            catch (UriFormatException ex)
            {
                DebugMsgStatic(string.Format("GenerateRobotUrl: {0}", ex.Message));
            }

            // Preserve the original contract: an empty result is reported as null.
            return(string.IsNullOrEmpty(RobotUrl) ? null : RobotUrl);
        }
예제 #6
0
        /** ROBOT RULES ***********************************************************/

        /// <summary>
        /// Checks whether the path of <paramref name="Url"/> is allowed by the
        /// robots rules for the wildcard ("*") user agent.
        /// </summary>
        /// <param name="Url">URL to check.</param>
        /// <returns>
        /// true when the robots protocol is not followed or the path is allowed;
        /// false when the path is disallowed, the robots rules could not be
        /// obtained, or <paramref name="Url"/> could not be parsed as an absolute URI.
        /// </returns>
        public Boolean ApplyRobotRule(string Url)
        {
            if (!MacroscopePreferencesManager.GetFollowRobotsProtocol())
            {
                DebugMsg(string.Format("ROBOTS Disabled: {0}", Url));
                return(true);
            }

            Robots SiteRobots = this.FetchRobot(Url: Url);
            Uri    TargetUri  = null;

            try
            {
                TargetUri = new Uri(Url, UriKind.Absolute);
            }
            catch (Exception ex)
            {
                // Any parse failure is logged and leaves TargetUri null,
                // which is treated as "not allowed" below.
                DebugMsg(string.Format("ApplyRobotRule: {0}", ex.Message));
            }

            if ((SiteRobots == null) || (TargetUri == null))
            {
                return(false);
            }

            Boolean PathAllowed = SiteRobots.IsPathAllowed("*", TargetUri.AbsolutePath);

            if (!PathAllowed)
            {
                DebugMsg(string.Format("ROBOTS Disallowed: {0}", Url));
                DebugMsg(string.Format("ROBOTS AbsolutePath: {0}", TargetUri.AbsolutePath));
            }

            return(PathAllowed);
        }
        /**************************************************************************/

        // Populates the controls of the preferences form from the current values
        // held by MacroscopePreferencesManager. Nothing is written back to the
        // preferences here; every assignment flows from preferences to a control.
        public void SetPrefsFormControlFields()
        {
            { // Configure Display Options
                this.checkBoxPauseDisplayDuringScan.Checked = MacroscopePreferencesManager.GetPauseDisplayDuringScan();
                this.checkBoxShowProgressDialogues.Checked  = MacroscopePreferencesManager.GetShowProgressDialogues();
            }

            { // Configure the allowed ranges of numeric form fields
              // These bounds are set before the corresponding Value assignments in
              // the next block.
              // NOTE(review): presumably these are WinForms NumericUpDown controls,
              // whose Value must lie within [Minimum, Maximum] - confirm.
              /** Spidering Control ---------------------------------------------- **/

                // NOTE(review): a minimum of -1 presumably means "no limit" - confirm.
                this.numericUpDownDepth.Minimum = -1;
                this.numericUpDownDepth.Maximum = 10000;

                this.numericUpDownPageLimit.Minimum = -1;
                this.numericUpDownPageLimit.Maximum = 10000;

                this.numericUpDownCrawlDelay.Minimum = 0;
                this.numericUpDownCrawlDelay.Maximum = 60;

                this.numericUpDownMaxRetries.Minimum = 0;
                this.numericUpDownMaxRetries.Maximum = 10;
            }

            { // Copy the current preference values into the form controls
                /** WebProxy Options ----------------------------------------------- **/

                // The proxy type value is cast to int and used directly as the combo box index.
                this.comboBoxProxyType.SelectedIndex = (int)MacroscopePreferencesManager.GetProxyType();

                /** Server Certificate Options --------------------------------------- **/

                this.checkBoxServerCertificateValidation.Checked = MacroscopePreferencesManager.GetServerCertificateValidation();

                /** Spidering Control ---------------------------------------------- **/

                this.numericUpDownMaxThreads.Value     = MacroscopePreferencesManager.GetMaxThreads();
                this.numericUpDownDepth.Value          = MacroscopePreferencesManager.GetDepth();
                this.numericUpDownPageLimit.Value      = MacroscopePreferencesManager.GetPageLimit();
                this.numericUpDownCrawlDelay.Value     = MacroscopePreferencesManager.GetCrawlDelay();
                this.numericUpDownRequestTimeout.Value = (Decimal)MacroscopePreferencesManager.GetRequestTimeout();
                this.numericUpDownMaxRetries.Value     = (Decimal)MacroscopePreferencesManager.GetMaxRetries();

                this.checkBoxCrawlStrictUrlCheck.Checked = MacroscopePreferencesManager.GetCrawlStrictUrlCheck();

                this.checkBoxCheckExternalLinks.Checked = MacroscopePreferencesManager.GetCheckExternalLinks();
                this.checkBoxFetchExternalLinks.Checked = MacroscopePreferencesManager.GetFetchExternalLinks();

                this.checkBoxFollowRobotsProtocol.Checked = MacroscopePreferencesManager.GetFollowRobotsProtocol();
                this.checkBoxFollowSitemapLinks.Checked   = MacroscopePreferencesManager.GetFollowSitemapLinks();
                this.checkBoxProbeHumansText.Checked      = MacroscopePreferencesManager.GetProbeHumansText();

                this.checkBoxCheckRedirects.Checked  = MacroscopePreferencesManager.GetCheckRedirects();
                this.checkBoxFollowRedirects.Checked = MacroscopePreferencesManager.GetFollowRedirects();

                this.checkBoxFollowNoFollow.Checked       = MacroscopePreferencesManager.GetFollowNoFollow();
                this.checkBoxIgnoreQueries.Checked        = MacroscopePreferencesManager.GetIgnoreQueries();
                this.checkBoxIgnoreHashFragments.Checked  = MacroscopePreferencesManager.GetIgnoreHashFragments();
                this.checkBoxFollowCanonicalLinks.Checked = MacroscopePreferencesManager.GetFollowCanonicalLinks();
                this.checkBoxFollowAlternateLinks.Checked = MacroscopePreferencesManager.GetFollowAlternateLinks();
                this.checkBoxFollowHrefLangLinks.Checked  = MacroscopePreferencesManager.GetFollowHrefLangLinks();
                this.checkBoxDowncaseLinks.Checked        = MacroscopePreferencesManager.GetDowncaseLinks();

                this.checkBoxFetchStylesheets.Checked = MacroscopePreferencesManager.GetFetchStylesheets();
                this.checkBoxFetchJavascripts.Checked = MacroscopePreferencesManager.GetFetchJavascripts();
                this.checkBoxFetchImages.Checked      = MacroscopePreferencesManager.GetFetchImages();
                this.checkBoxFetchAudio.Checked       = MacroscopePreferencesManager.GetFetchAudio();
                this.checkBoxFetchVideo.Checked       = MacroscopePreferencesManager.GetFetchVideo();
                this.checkBoxFetchXml.Checked         = MacroscopePreferencesManager.GetFetchXml();
                this.checkBoxFetchBinaries.Checked    = MacroscopePreferencesManager.GetFetchBinaries();

                this.checkBoxScanSitesInList.Checked = MacroscopePreferencesManager.GetScanSitesInList();

                this.checkBoxProbeParentFolderUrls.Checked = MacroscopePreferencesManager.GetProbeParentFolderUrls();

                this.checkBoxProbeHead404sWithGet.Checked = MacroscopePreferencesManager.GetProbeHead404sWithGet();

                /** Analysis Options ----------------------------------------------- **/

                this.checkBoxResolveAddresses.Checked = MacroscopePreferencesManager.GetResolveAddresses();

                this.checkBoxCheckHreflangs.Checked = MacroscopePreferencesManager.GetCheckHreflangs();
                this.checkBoxDetectLanguage.Checked = MacroscopePreferencesManager.GetDetectLanguage();

                this.checkBoxProcessStylesheets.Checked = MacroscopePreferencesManager.GetProcessStylesheets();
                this.checkBoxProcessJavascripts.Checked = MacroscopePreferencesManager.GetProcessJavascripts();
                this.checkBoxProcessImages.Checked      = MacroscopePreferencesManager.GetProcessImages();
                this.checkBoxProcessPdfs.Checked        = MacroscopePreferencesManager.GetProcessPdfs();
                this.checkBoxProcessAudio.Checked       = MacroscopePreferencesManager.GetProcessAudio();
                this.checkBoxProcessVideo.Checked       = MacroscopePreferencesManager.GetProcessVideo();
                this.checkBoxProcessXml.Checked         = MacroscopePreferencesManager.GetProcessXml();
                this.checkBoxProcessBinaries.Checked    = MacroscopePreferencesManager.GetProcessBinaries();

                this.numericUpDownRedirectChainsMaxHops.Value = MacroscopePreferencesManager.GetRedirectChainsMaxHops();

                this.checkBoxWarnAboutInsecureLinks.Checked = MacroscopePreferencesManager.GetWarnAboutInsecureLinks();

                this.checkBoxEnableTextIndexing.Checked        = MacroscopePreferencesManager.GetEnableTextIndexing();
                this.checkBoxCaseSensitiveTextIndexing.Checked = MacroscopePreferencesManager.GetCaseSensitiveTextIndexing();

                this.checkBoxDisregardHtml5ElementNav.Checked    = MacroscopePreferencesManager.GetDisregardHtml5ElementNav();
                this.checkBoxDisregardHtml5ElementHeader.Checked = MacroscopePreferencesManager.GetDisregardHtml5ElementHeader();
                this.checkBoxDisregardHtml5ElementFooter.Checked = MacroscopePreferencesManager.GetDisregardHtml5ElementFooter();

                this.checkBoxDetectQrCodeInImage.Checked = MacroscopePreferencesManager.GetDetectQrCodeInImage();

                /** SEO Options ---------------------------------------------------- **/

                this.numericUpDownTitleMinLen.Value        = MacroscopePreferencesManager.GetTitleMinLen();
                this.numericUpDownTitleMaxLen.Value        = MacroscopePreferencesManager.GetTitleMaxLen();
                this.numericUpDownTitleMinWords.Value      = MacroscopePreferencesManager.GetTitleMinWords();
                this.numericUpDownTitleMaxWords.Value      = MacroscopePreferencesManager.GetTitleMaxWords();
                this.numericUpDownTitleMaxPixelWidth.Value = MacroscopePreferencesManager.GetTitleMaxPixelWidth();

                this.numericUpDownDescriptionMinLen.Value   = MacroscopePreferencesManager.GetDescriptionMinLen();
                this.numericUpDownDescriptionMaxLen.Value   = MacroscopePreferencesManager.GetDescriptionMaxLen();
                this.numericUpDownDescriptionMinWords.Value = MacroscopePreferencesManager.GetDescriptionMinWords();
                this.numericUpDownDescriptionMaxWords.Value = MacroscopePreferencesManager.GetDescriptionMaxWords();

                this.numericUpDownMaxHeadingDepth.Value = MacroscopePreferencesManager.GetMaxHeadingDepth();

                this.checkBoxAnalyzeKeywordsInText.Checked  = MacroscopePreferencesManager.GetAnalyzeKeywordsInText();
                this.checkBoxAnalyzeTextReadability.Checked = MacroscopePreferencesManager.GetAnalyzeTextReadability();
                this.comboBoxAnalyzeTextReadabilityEnglishAlgorithm.SelectedIndex = (int)MacroscopePreferencesManager.GetAnalyzeTextReadabilityEnglishAlgorithm();

                this.checkBoxEnableLevenshteinDeduplication.Checked  = MacroscopePreferencesManager.GetEnableLevenshteinDeduplication();
                this.comboBoxLevenshteinAnalysisLevel.SelectedIndex  = (int)MacroscopePreferencesManager.GetLevenshteinAnalysisLevel();
                this.numericUpDownMaxLevenshteinSizeDifference.Value = MacroscopePreferencesManager.GetMaxLevenshteinSizeDifference();
                this.numericUpDownMaxLevenshteinDistance.Value       = MacroscopePreferencesManager.GetMaxLevenshteinDistance();

                this.checkBoxAnalyzeClickPaths.Checked = MacroscopePreferencesManager.GetAnalyzeClickPaths();

                // TODO: Finish implementing click path analysis.
                // Until then the page navigation analysis group box is shown only
                // in DEBUG builds and hidden otherwise.
#if DEBUG
                this.groupBoxPageNavigationAnalysis.Visible = true;
#else
                this.groupBoxPageNavigationAnalysis.Visible = false;
#endif

                /** Custom Filter Options ------------------------------------------ **/

                this.checkBoxCustomFiltersEnable.Checked      = MacroscopePreferencesManager.GetCustomFiltersEnable();
                this.numericUpDownCustomFiltersMaxItems.Value = MacroscopePreferencesManager.GetCustomFiltersMaxItems();

                this.checkBoxCustomFiltersApplyToHtml.Checked        = MacroscopePreferencesManager.GetCustomFiltersApplyToHtml();
                this.checkBoxCustomFiltersApplyToCss.Checked         = MacroscopePreferencesManager.GetCustomFiltersApplyToCss();
                this.checkBoxCustomFiltersApplyToJavascripts.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToJavascripts();
                this.checkBoxCustomFiltersApplyToText.Checked        = MacroscopePreferencesManager.GetCustomFiltersApplyToText();
                this.checkBoxCustomFiltersApplyToXml.Checked         = MacroscopePreferencesManager.GetCustomFiltersApplyToXml();

                /** Extractor Options ---------------------------------------------- **/

                this.checkBoxDataExtractorsEnable.Checked          = MacroscopePreferencesManager.GetDataExtractorsEnable();
                this.checkBoxDataExtractorsCleanWhiteSpace.Checked = MacroscopePreferencesManager.GetDataExtractorsCleanWhiteSpace();

                this.numericUpDownDataExtractorsMaxItemsCssSelectors.Value = MacroscopePreferencesManager.GetDataExtractorsMaxItemsCssSelectors();
                this.numericUpDownDataExtractorsMaxItemsRegexes.Value      = MacroscopePreferencesManager.GetDataExtractorsMaxItemsRegexes();
                this.numericUpDownDataExtractorsMaxItemsXpaths.Value       = MacroscopePreferencesManager.GetDataExtractorsMaxItemsXpaths();

                this.checkBoxDataExtractorsApplyToHtml.Checked        = MacroscopePreferencesManager.GetDataExtractorsApplyToHtml();
                this.checkBoxDataExtractorsApplyToCss.Checked         = MacroscopePreferencesManager.GetDataExtractorsApplyToCss();
                this.checkBoxDataExtractorsApplyToJavascripts.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToJavascripts();
                this.checkBoxDataExtractorsApplyToText.Checked        = MacroscopePreferencesManager.GetDataExtractorsApplyToText();
                this.checkBoxDataExtractorsApplyToPdf.Checked         = MacroscopePreferencesManager.GetDataExtractorsApplyToPdf();
                this.checkBoxDataExtractorsApplyToXml.Checked         = MacroscopePreferencesManager.GetDataExtractorsApplyToXml();

                /** Export Options ------------------------------------------------- **/

                this.checkBoxSitemapIncludeLinkedPdfs.Checked = MacroscopePreferencesManager.GetSitemapIncludeLinkedPdfs();

                /** Ignore Errors Settings ----------------------------------------- **/

                this.checkBoxIgnoreErrors410.Checked = MacroscopePreferencesManager.GetIgnoreErrors410();
                this.checkBoxIgnoreErrors451.Checked = MacroscopePreferencesManager.GetIgnoreErrors451();
            }
        }
예제 #8
0
        /** Fetch Robot ***********************************************************/

        /// <summary>
        /// Returns the parsed robots rules for the host of <paramref name="Url"/>,
        /// fetching and caching them in RobotSquad on first use.
        /// </summary>
        /// <param name="Url">Absolute URL of any resource on the target host.</param>
        /// <returns>
        /// The cached or newly fetched Robots object; null when the robots
        /// protocol is not followed, the robots.txt URL cannot be built from
        /// <paramref name="Url"/>, or no robots.txt text could be fetched.
        /// </returns>
        public Robots FetchRobot(string Url)
        {
            if (!MacroscopePreferencesManager.GetFollowRobotsProtocol())
            {
                DebugMsg(string.Format("ROBOTS Disabled: {0}", Url));
                return(null);
            }

            Uri    RobotsLocation = null;
            string CacheKey       = null;

            try
            {
                Uri    SourceUri   = new Uri(Url, UriKind.Absolute);
                string PortSegment = "";

                if (SourceUri.Port > 0)
                {
                    PortSegment = string.Format(":{0}", SourceUri.Port);
                }

                RobotsLocation = new Uri(
                    string.Format(
                        "{0}://{1}{2}{3}",
                        SourceUri.Scheme,
                        SourceUri.Host,
                        PortSegment,
                        "/robots.txt"
                        ),
                    UriKind.Absolute
                    );

                CacheKey = RobotsLocation.ToString();
            }
            catch (InvalidOperationException ex)
            {
                DebugMsg(string.Format("FetchRobot: {0}", ex.Message));
            }
            catch (UriFormatException ex)
            {
                DebugMsg(string.Format("FetchRobot: {0}", ex.Message));
            }

            if (string.IsNullOrEmpty(CacheKey))
            {
                return(null);
            }

            Robots SiteRobots = null;

            // The lock is held across the fetch, so concurrent callers wait
            // rather than fetching the same robots.txt in parallel.
            // Failed fetches (null text) are not cached and are retried on the
            // next call; there is no negative cache of bad robots.txt URLs.
            lock (this.RobotSquad)
            {
                if (this.RobotSquad.ContainsKey(CacheKey))
                {
                    SiteRobots = this.RobotSquad[CacheKey];
                }
                else
                {
                    string FetchedText = this.FetchRobotTextFile(RobotsUri: RobotsLocation);

                    if (FetchedText != null)
                    {
                        SiteRobots = new Robots(content: FetchedText);
                        this.RobotSquad.Add(CacheKey, SiteRobots);
                    }
                }
            }

            return(SiteRobots);
        }