/** Crawl Delay ***********************************************************/

/// <summary>
/// Looks up the crawl delay declared by the robots data fetched for the given URL.
/// </summary>
/// <param name="Url">URL passed to <c>FetchRobot</c> to obtain the robots data.</param>
/// <returns>
/// The robots crawl delay divided by 1000, or 0 when the robots protocol is
/// not being followed, no robots data could be fetched, or no positive delay
/// is declared.
/// </returns>
public async Task<int> GetCrawlDelay(string Url)
{
    if (!MacroscopePreferencesManager.GetFollowRobotsProtocol())
    {
        return 0;
    }

    Robots robotsData = await this.FetchRobot(Url: Url);

    if (robotsData == null)
    {
        return 0;
    }

    // Ask for this crawler's own user agent first; a result of 0 triggers
    // a second lookup against the wildcard agent.
    long rawDelay = robotsData.CrawlDelay(userAgent: this.UserAgentName());

    if (rawDelay == 0)
    {
        rawDelay = robotsData.CrawlDelay("*");
    }

    // NOTE(review): the division by 1000 suggests the library reports
    // milliseconds and callers expect whole seconds; confirm against the
    // Robots library. Values below 1000 truncate to 0.
    int delayResult = (rawDelay > 0) ? (int)(rawDelay / 1000) : 0;

    DebugMsg(string.Format("ROBOTS CrawlDelayTime: {0}", rawDelay));
    DebugMsg(string.Format("ROBOTS Delay: {0}", delayResult));

    return delayResult;
}
/** Sitemaps **************************************************************/

/// <summary>
/// Collects the sitemap URLs listed in the robots data fetched for the given URL.
/// </summary>
/// <param name="Url">
/// URL used to fetch the robots data; it is also the base against which each
/// sitemap URL is made absolute.
/// </param>
/// <returns>
/// A list of absolute sitemap URLs. The list is empty when the robots protocol
/// is not being followed, or when no robots data or sitemap entries are available.
/// </returns>
public async Task<List<string>> GetSitemapsAsList(string Url)
{
    List<string> collected = new List<string>();

    if (!MacroscopePreferencesManager.GetFollowRobotsProtocol())
    {
        return collected;
    }

    Robots robotsData = await this.FetchRobot(Url: Url);

    try
    {
        if ((robotsData == null) || (robotsData.Sitemaps == null))
        {
            return collected;
        }

        foreach (Sitemap entry in robotsData.Sitemaps)
        {
            string rawUrl = entry.Url.ToString();
            string absoluteUrl = MacroscopeHttpUrlUtils.MakeUrlAbsolute(BaseUrl: Url, Url: rawUrl);

            collected.Add(absoluteUrl);

            this.DebugMsg(string.Format("ROBOTS SitemapUrl: {0}", rawUrl));
        }
    }
    catch (Exception ex)
    {
        // Best effort: any failure ends the enumeration, and whatever was
        // gathered before the failure is still returned.
        this.DebugMsg(ex.Message);
    }

    return collected;
}
/**************************************************************************/

/// <summary>
/// Creates a worker bound to the given job master, copying the master's
/// document collection, allowed hosts and include/exclude URL settings,
/// and choosing the crawl delay to use.
/// </summary>
/// <param name="JobMaster">The job master this worker takes its shared state from.</param>
public MacroscopeJobWorker(MacroscopeJobMaster JobMaster)
{
    this.SuppressDebugMsg = true;

    this.JobMaster = JobMaster;

    // Shared state is read back through the stored job master reference.
    this.DocCollection = this.JobMaster.GetDocCollection();
    this.AllowedHosts = this.JobMaster.GetAllowedHosts();
    this.IncludeExcludeUrls = this.JobMaster.GetIncludeExcludeUrls();

    // A positive delay from the preferences is applied first...
    if (MacroscopePreferencesManager.GetCrawlDelay() > 0)
    {
        this.CrawlDelay = MacroscopePreferencesManager.GetCrawlDelay();
    }

    // ...and is then replaced by the job master's delay when the robots
    // protocol is being followed and that delay is positive.
    if (MacroscopePreferencesManager.GetFollowRobotsProtocol() && (this.JobMaster.GetCrawlDelay() > 0))
    {
        this.CrawlDelay = this.JobMaster.GetCrawlDelay();
    }
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Decides whether the given URL may be crawled under the robots protocol.
/// </summary>
/// <param name="Url">The URL to check.</param>
/// <returns>
/// <c>true</c> when the robots protocol is not being followed; otherwise the
/// result of <c>CheckRobotRule</c> for the URL.
/// </returns>
public async Task<bool> ApplyRobotRule(string Url)
{
    if (!MacroscopePreferencesManager.GetFollowRobotsProtocol())
    {
        return true;
    }

    return await this.CheckRobotRule(Url: Url);
}
/** Generate Robot URL ****************************************************/

/// <summary>
/// Builds the absolute robots.txt URL for the site that serves the given URL.
/// </summary>
/// <param name="Url">
/// An absolute URL. A null value is not handled here and will surface as an
/// exception from the <see cref="Uri"/> constructor.
/// </param>
/// <returns>
/// A string of the form <c>scheme://host[:port]/robots.txt</c>, or <c>null</c>
/// when the robots protocol is not being followed or the URL cannot be parsed.
/// </returns>
public static string GenerateRobotUrl(string Url)
{
    if (!MacroscopePreferencesManager.GetFollowRobotsProtocol())
    {
        // Only report "disabled" when the protocol really is switched off.
        DebugMsgStatic(string.Format("ROBOTS Disabled: {0}", Url));
        return null;
    }

    string RobotUrl = null;

    try
    {
        Uri BaseUri = new Uri(Url, UriKind.Absolute);
        string BaseUriPort = "";

        // Uri.Port is -1 when neither the URL nor its scheme supplies a
        // port; in that case no port suffix is emitted.
        if (BaseUri.Port > 0)
        {
            BaseUriPort = string.Format(":{0}", BaseUri.Port);
        }

        Uri RobotsUri = new Uri(
            string.Format(
                "{0}://{1}{2}{3}",
                BaseUri.Scheme,
                BaseUri.Host,
                BaseUriPort,
                "/robots.txt"
            ),
            UriKind.Absolute
        );

        string RobotsTxtUrl = RobotsUri.ToString();

        if (!string.IsNullOrEmpty(RobotsTxtUrl))
        {
            RobotUrl = RobotsTxtUrl;
        }
    }
    catch (InvalidOperationException ex)
    {
        DebugMsgStatic(string.Format("GenerateRobotUrl: {0}", ex.Message));
    }
    catch (UriFormatException ex)
    {
        DebugMsgStatic(string.Format("GenerateRobotUrl: {0}", ex.Message));
    }

    return RobotUrl;
}
/** ROBOT RULES ***********************************************************/

/// <summary>
/// Decides whether the given URL may be crawled according to the robots data
/// fetched for it.
/// </summary>
/// <param name="Url">The absolute URL to check.</param>
/// <returns>
/// <c>true</c> when the robots protocol is not being followed, or when the
/// wildcard ("*") rules allow the URL's path. <c>false</c> when the path is
/// disallowed, when no robots data could be fetched, or when the URL cannot
/// be parsed as an absolute URI.
/// </returns>
public Boolean ApplyRobotRule(string Url)
{
    if (!MacroscopePreferencesManager.GetFollowRobotsProtocol())
    {
        DebugMsg(string.Format("ROBOTS Disabled: {0}", Url));
        return true;
    }

    Robots robotsData = this.FetchRobot(Url: Url);
    Uri targetUri = null;

    try
    {
        targetUri = new Uri(Url, UriKind.Absolute);
    }
    catch (Exception ex)
    {
        // Covers UriFormatException as well; every failure is only logged.
        DebugMsg(string.Format("ApplyRobotRule: {0}", ex.Message));
    }

    // Without both the robots data and a parsed URI, nothing is allowed.
    if ((robotsData == null) || (targetUri == null))
    {
        return false;
    }

    if (robotsData.IsPathAllowed("*", targetUri.AbsolutePath))
    {
        return true;
    }

    DebugMsg(string.Format("ROBOTS Disallowed: {0}", Url));
    DebugMsg(string.Format("ROBOTS AbsolutePath: {0}", targetUri.AbsolutePath));

    return false;
}
/**************************************************************************/

/// <summary>
/// Populates the preferences form controls from the current values held by
/// <c>MacroscopePreferencesManager</c>.
/// </summary>
public void SetPrefsFormControlFields()
{
    /** Display Options ------------------------------------------------ **/

    checkBoxPauseDisplayDuringScan.Checked = MacroscopePreferencesManager.GetPauseDisplayDuringScan();
    checkBoxShowProgressDialogues.Checked = MacroscopePreferencesManager.GetShowProgressDialogues();

    /** Spidering Control: numeric bounds ------------------------------ **/

    // The bounds are assigned before any of the corresponding values below.
    // NOTE(review): presumably because the numeric controls reject values
    // outside their current range; keep this ordering.
    numericUpDownDepth.Minimum = -1;
    numericUpDownDepth.Maximum = 10000;

    numericUpDownPageLimit.Minimum = -1;
    numericUpDownPageLimit.Maximum = 10000;

    numericUpDownCrawlDelay.Minimum = 0;
    numericUpDownCrawlDelay.Maximum = 60;

    numericUpDownMaxRetries.Minimum = 0;
    numericUpDownMaxRetries.Maximum = 10;

    /** WebProxy Options ----------------------------------------------- **/

    comboBoxProxyType.SelectedIndex = (int)MacroscopePreferencesManager.GetProxyType();

    /** Server Certificate Options ------------------------------------- **/

    checkBoxServerCertificateValidation.Checked = MacroscopePreferencesManager.GetServerCertificateValidation();

    /** Spidering Control: values -------------------------------------- **/

    numericUpDownMaxThreads.Value = MacroscopePreferencesManager.GetMaxThreads();
    numericUpDownDepth.Value = MacroscopePreferencesManager.GetDepth();
    numericUpDownPageLimit.Value = MacroscopePreferencesManager.GetPageLimit();
    numericUpDownCrawlDelay.Value = MacroscopePreferencesManager.GetCrawlDelay();
    numericUpDownRequestTimeout.Value = (decimal)MacroscopePreferencesManager.GetRequestTimeout();
    numericUpDownMaxRetries.Value = (decimal)MacroscopePreferencesManager.GetMaxRetries();

    checkBoxCrawlStrictUrlCheck.Checked = MacroscopePreferencesManager.GetCrawlStrictUrlCheck();
    checkBoxCheckExternalLinks.Checked = MacroscopePreferencesManager.GetCheckExternalLinks();
    checkBoxFetchExternalLinks.Checked = MacroscopePreferencesManager.GetFetchExternalLinks();

    checkBoxFollowRobotsProtocol.Checked = MacroscopePreferencesManager.GetFollowRobotsProtocol();
    checkBoxFollowSitemapLinks.Checked = MacroscopePreferencesManager.GetFollowSitemapLinks();
    checkBoxProbeHumansText.Checked = MacroscopePreferencesManager.GetProbeHumansText();

    checkBoxCheckRedirects.Checked = MacroscopePreferencesManager.GetCheckRedirects();
    checkBoxFollowRedirects.Checked = MacroscopePreferencesManager.GetFollowRedirects();
    checkBoxFollowNoFollow.Checked = MacroscopePreferencesManager.GetFollowNoFollow();
    checkBoxIgnoreQueries.Checked = MacroscopePreferencesManager.GetIgnoreQueries();
    checkBoxIgnoreHashFragments.Checked = MacroscopePreferencesManager.GetIgnoreHashFragments();

    checkBoxFollowCanonicalLinks.Checked = MacroscopePreferencesManager.GetFollowCanonicalLinks();
    checkBoxFollowAlternateLinks.Checked = MacroscopePreferencesManager.GetFollowAlternateLinks();
    checkBoxFollowHrefLangLinks.Checked = MacroscopePreferencesManager.GetFollowHrefLangLinks();
    checkBoxDowncaseLinks.Checked = MacroscopePreferencesManager.GetDowncaseLinks();

    checkBoxFetchStylesheets.Checked = MacroscopePreferencesManager.GetFetchStylesheets();
    checkBoxFetchJavascripts.Checked = MacroscopePreferencesManager.GetFetchJavascripts();
    checkBoxFetchImages.Checked = MacroscopePreferencesManager.GetFetchImages();
    checkBoxFetchAudio.Checked = MacroscopePreferencesManager.GetFetchAudio();
    checkBoxFetchVideo.Checked = MacroscopePreferencesManager.GetFetchVideo();
    checkBoxFetchXml.Checked = MacroscopePreferencesManager.GetFetchXml();
    checkBoxFetchBinaries.Checked = MacroscopePreferencesManager.GetFetchBinaries();

    checkBoxScanSitesInList.Checked = MacroscopePreferencesManager.GetScanSitesInList();
    checkBoxProbeParentFolderUrls.Checked = MacroscopePreferencesManager.GetProbeParentFolderUrls();
    checkBoxProbeHead404sWithGet.Checked = MacroscopePreferencesManager.GetProbeHead404sWithGet();

    /** Analysis Options ----------------------------------------------- **/

    checkBoxResolveAddresses.Checked = MacroscopePreferencesManager.GetResolveAddresses();
    checkBoxCheckHreflangs.Checked = MacroscopePreferencesManager.GetCheckHreflangs();
    checkBoxDetectLanguage.Checked = MacroscopePreferencesManager.GetDetectLanguage();

    checkBoxProcessStylesheets.Checked = MacroscopePreferencesManager.GetProcessStylesheets();
    checkBoxProcessJavascripts.Checked = MacroscopePreferencesManager.GetProcessJavascripts();
    checkBoxProcessImages.Checked = MacroscopePreferencesManager.GetProcessImages();
    checkBoxProcessPdfs.Checked = MacroscopePreferencesManager.GetProcessPdfs();
    checkBoxProcessAudio.Checked = MacroscopePreferencesManager.GetProcessAudio();
    checkBoxProcessVideo.Checked = MacroscopePreferencesManager.GetProcessVideo();
    checkBoxProcessXml.Checked = MacroscopePreferencesManager.GetProcessXml();
    checkBoxProcessBinaries.Checked = MacroscopePreferencesManager.GetProcessBinaries();

    numericUpDownRedirectChainsMaxHops.Value = MacroscopePreferencesManager.GetRedirectChainsMaxHops();

    checkBoxWarnAboutInsecureLinks.Checked = MacroscopePreferencesManager.GetWarnAboutInsecureLinks();

    checkBoxEnableTextIndexing.Checked = MacroscopePreferencesManager.GetEnableTextIndexing();
    checkBoxCaseSensitiveTextIndexing.Checked = MacroscopePreferencesManager.GetCaseSensitiveTextIndexing();

    checkBoxDisregardHtml5ElementNav.Checked = MacroscopePreferencesManager.GetDisregardHtml5ElementNav();
    checkBoxDisregardHtml5ElementHeader.Checked = MacroscopePreferencesManager.GetDisregardHtml5ElementHeader();
    checkBoxDisregardHtml5ElementFooter.Checked = MacroscopePreferencesManager.GetDisregardHtml5ElementFooter();

    checkBoxDetectQrCodeInImage.Checked = MacroscopePreferencesManager.GetDetectQrCodeInImage();

    /** SEO Options ---------------------------------------------------- **/

    numericUpDownTitleMinLen.Value = MacroscopePreferencesManager.GetTitleMinLen();
    numericUpDownTitleMaxLen.Value = MacroscopePreferencesManager.GetTitleMaxLen();
    numericUpDownTitleMinWords.Value = MacroscopePreferencesManager.GetTitleMinWords();
    numericUpDownTitleMaxWords.Value = MacroscopePreferencesManager.GetTitleMaxWords();
    numericUpDownTitleMaxPixelWidth.Value = MacroscopePreferencesManager.GetTitleMaxPixelWidth();

    numericUpDownDescriptionMinLen.Value = MacroscopePreferencesManager.GetDescriptionMinLen();
    numericUpDownDescriptionMaxLen.Value = MacroscopePreferencesManager.GetDescriptionMaxLen();
    numericUpDownDescriptionMinWords.Value = MacroscopePreferencesManager.GetDescriptionMinWords();
    numericUpDownDescriptionMaxWords.Value = MacroscopePreferencesManager.GetDescriptionMaxWords();

    numericUpDownMaxHeadingDepth.Value = MacroscopePreferencesManager.GetMaxHeadingDepth();

    checkBoxAnalyzeKeywordsInText.Checked = MacroscopePreferencesManager.GetAnalyzeKeywordsInText();

    checkBoxAnalyzeTextReadability.Checked = MacroscopePreferencesManager.GetAnalyzeTextReadability();
    comboBoxAnalyzeTextReadabilityEnglishAlgorithm.SelectedIndex = (int)MacroscopePreferencesManager.GetAnalyzeTextReadabilityEnglishAlgorithm();

    checkBoxEnableLevenshteinDeduplication.Checked = MacroscopePreferencesManager.GetEnableLevenshteinDeduplication();
    comboBoxLevenshteinAnalysisLevel.SelectedIndex = (int)MacroscopePreferencesManager.GetLevenshteinAnalysisLevel();
    numericUpDownMaxLevenshteinSizeDifference.Value = MacroscopePreferencesManager.GetMaxLevenshteinSizeDifference();
    numericUpDownMaxLevenshteinDistance.Value = MacroscopePreferencesManager.GetMaxLevenshteinDistance();

    checkBoxAnalyzeClickPaths.Checked = MacroscopePreferencesManager.GetAnalyzeClickPaths();

    // TODO: Finish implementing click path analysis.
    // Until then, the page navigation analysis group is only shown in DEBUG builds.
#if DEBUG
    groupBoxPageNavigationAnalysis.Visible = true;
#else
    groupBoxPageNavigationAnalysis.Visible = false;
#endif

    /** Custom Filter Options ------------------------------------------ **/

    checkBoxCustomFiltersEnable.Checked = MacroscopePreferencesManager.GetCustomFiltersEnable();
    numericUpDownCustomFiltersMaxItems.Value = MacroscopePreferencesManager.GetCustomFiltersMaxItems();

    checkBoxCustomFiltersApplyToHtml.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToHtml();
    checkBoxCustomFiltersApplyToCss.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToCss();
    checkBoxCustomFiltersApplyToJavascripts.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToJavascripts();
    checkBoxCustomFiltersApplyToText.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToText();
    checkBoxCustomFiltersApplyToXml.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToXml();

    /** Extractor Options ---------------------------------------------- **/

    checkBoxDataExtractorsEnable.Checked = MacroscopePreferencesManager.GetDataExtractorsEnable();
    checkBoxDataExtractorsCleanWhiteSpace.Checked = MacroscopePreferencesManager.GetDataExtractorsCleanWhiteSpace();

    numericUpDownDataExtractorsMaxItemsCssSelectors.Value = MacroscopePreferencesManager.GetDataExtractorsMaxItemsCssSelectors();
    numericUpDownDataExtractorsMaxItemsRegexes.Value = MacroscopePreferencesManager.GetDataExtractorsMaxItemsRegexes();
    numericUpDownDataExtractorsMaxItemsXpaths.Value = MacroscopePreferencesManager.GetDataExtractorsMaxItemsXpaths();

    checkBoxDataExtractorsApplyToHtml.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToHtml();
    checkBoxDataExtractorsApplyToCss.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToCss();
    checkBoxDataExtractorsApplyToJavascripts.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToJavascripts();
    checkBoxDataExtractorsApplyToText.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToText();
    checkBoxDataExtractorsApplyToPdf.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToPdf();
    checkBoxDataExtractorsApplyToXml.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToXml();

    /** Export Options ------------------------------------------------- **/

    checkBoxSitemapIncludeLinkedPdfs.Checked = MacroscopePreferencesManager.GetSitemapIncludeLinkedPdfs();

    /** Ignore Errors Settings ----------------------------------------- **/

    checkBoxIgnoreErrors410.Checked = MacroscopePreferencesManager.GetIgnoreErrors410();
    checkBoxIgnoreErrors451.Checked = MacroscopePreferencesManager.GetIgnoreErrors451();
}
/** Fetch Robot ***********************************************************/

/// <summary>
/// Returns the parsed robots data for the site serving the given URL, using
/// the <c>RobotSquad</c> cache keyed by robots.txt URL and fetching the file
/// on a cache miss.
/// </summary>
/// <param name="Url">An absolute URL on the site whose robots.txt is wanted.</param>
/// <returns>
/// The cached or newly parsed robots data, or <c>null</c> when the robots
/// protocol is not being followed, the URL cannot be parsed, or no robots
/// text could be fetched.
/// </returns>
public Robots FetchRobot(string Url)
{
    if (!MacroscopePreferencesManager.GetFollowRobotsProtocol())
    {
        DebugMsg(string.Format("ROBOTS Disabled: {0}", Url));
        return null;
    }

    Uri robotsLocation = null;
    string cacheKey = null;

    try
    {
        Uri pageUri = new Uri(Url, UriKind.Absolute);

        // Only append a port suffix when the URI reports a positive port.
        string portSuffix = (pageUri.Port > 0) ? string.Format(":{0}", pageUri.Port) : "";

        robotsLocation = new Uri(
            string.Format(
                "{0}://{1}{2}{3}",
                pageUri.Scheme,
                pageUri.Host,
                portSuffix,
                "/robots.txt"
            ),
            UriKind.Absolute
        );

        cacheKey = robotsLocation.ToString();
    }
    catch (InvalidOperationException ex)
    {
        DebugMsg(string.Format("FetchRobot: {0}", ex.Message));
    }
    catch (UriFormatException ex)
    {
        DebugMsg(string.Format("FetchRobot: {0}", ex.Message));
    }

    if (string.IsNullOrEmpty(cacheKey))
    {
        return null;
    }

    Robots robotsData;

    // The lock is held across the fetch as well as the cache lookup.
    lock (this.RobotSquad)
    {
        if (!this.RobotSquad.TryGetValue(cacheKey, out robotsData))
        {
            String fetchedText = this.FetchRobotTextFile(RobotsUri: robotsLocation);

            // A failed fetch (null text) is not cached, so it is retried
            // on the next call for the same site.
            if (fetchedText != null)
            {
                robotsData = new Robots(content: fetchedText);
                this.RobotSquad.Add(cacheKey, robotsData);
            }
        }
    }

    return robotsData;
}