/**************************************************************************/

        /// <summary>
        /// Populates the controls of the preferences form from the values currently
        /// reported by <c>MacroscopePreferencesManager</c>.
        /// </summary>
        /// <remarks>
        /// The Minimum/Maximum limits of the depth, page-limit, crawl-delay and
        /// max-retries spinners are assigned before any spinner value is set.
        /// NOTE(review): presumably so that the stored values fall inside the
        /// accepted range when they are assigned - confirm against the designer file.
        /// </remarks>
        public void SetPrefsFormControlFields()
        {
            // ---- Display options ------------------------------------------------

            checkBoxPauseDisplayDuringScan.Checked = MacroscopePreferencesManager.GetPauseDisplayDuringScan();
            checkBoxShowProgressDialogues.Checked = MacroscopePreferencesManager.GetShowProgressDialogues();

            // ---- Spidering control: numeric ranges ------------------------------

            numericUpDownDepth.Minimum = -1;
            numericUpDownDepth.Maximum = 10000;

            numericUpDownPageLimit.Minimum = -1;
            numericUpDownPageLimit.Maximum = 10000;

            numericUpDownCrawlDelay.Minimum = 0;
            numericUpDownCrawlDelay.Maximum = 60;

            numericUpDownMaxRetries.Minimum = 0;
            numericUpDownMaxRetries.Maximum = 10;

            // ---- Web proxy options ----------------------------------------------

            comboBoxProxyType.SelectedIndex = (int)MacroscopePreferencesManager.GetProxyType();

            // ---- Server certificate options -------------------------------------

            checkBoxServerCertificateValidation.Checked = MacroscopePreferencesManager.GetServerCertificateValidation();

            // ---- Spidering control: values --------------------------------------

            numericUpDownMaxThreads.Value = MacroscopePreferencesManager.GetMaxThreads();
            numericUpDownDepth.Value = MacroscopePreferencesManager.GetDepth();
            numericUpDownPageLimit.Value = MacroscopePreferencesManager.GetPageLimit();
            numericUpDownCrawlDelay.Value = MacroscopePreferencesManager.GetCrawlDelay();
            numericUpDownRequestTimeout.Value = (decimal)MacroscopePreferencesManager.GetRequestTimeout();
            numericUpDownMaxRetries.Value = (decimal)MacroscopePreferencesManager.GetMaxRetries();

            checkBoxCrawlStrictUrlCheck.Checked = MacroscopePreferencesManager.GetCrawlStrictUrlCheck();

            checkBoxCheckExternalLinks.Checked = MacroscopePreferencesManager.GetCheckExternalLinks();
            checkBoxFetchExternalLinks.Checked = MacroscopePreferencesManager.GetFetchExternalLinks();

            checkBoxFollowRobotsProtocol.Checked = MacroscopePreferencesManager.GetFollowRobotsProtocol();
            checkBoxFollowSitemapLinks.Checked = MacroscopePreferencesManager.GetFollowSitemapLinks();
            checkBoxProbeHumansText.Checked = MacroscopePreferencesManager.GetProbeHumansText();

            checkBoxCheckRedirects.Checked = MacroscopePreferencesManager.GetCheckRedirects();
            checkBoxFollowRedirects.Checked = MacroscopePreferencesManager.GetFollowRedirects();

            checkBoxFollowNoFollow.Checked = MacroscopePreferencesManager.GetFollowNoFollow();
            checkBoxIgnoreQueries.Checked = MacroscopePreferencesManager.GetIgnoreQueries();
            checkBoxIgnoreHashFragments.Checked = MacroscopePreferencesManager.GetIgnoreHashFragments();
            checkBoxFollowCanonicalLinks.Checked = MacroscopePreferencesManager.GetFollowCanonicalLinks();
            checkBoxFollowAlternateLinks.Checked = MacroscopePreferencesManager.GetFollowAlternateLinks();
            checkBoxFollowHrefLangLinks.Checked = MacroscopePreferencesManager.GetFollowHrefLangLinks();
            checkBoxDowncaseLinks.Checked = MacroscopePreferencesManager.GetDowncaseLinks();

            checkBoxFetchStylesheets.Checked = MacroscopePreferencesManager.GetFetchStylesheets();
            checkBoxFetchJavascripts.Checked = MacroscopePreferencesManager.GetFetchJavascripts();
            checkBoxFetchImages.Checked = MacroscopePreferencesManager.GetFetchImages();
            checkBoxFetchAudio.Checked = MacroscopePreferencesManager.GetFetchAudio();
            checkBoxFetchVideo.Checked = MacroscopePreferencesManager.GetFetchVideo();
            checkBoxFetchXml.Checked = MacroscopePreferencesManager.GetFetchXml();
            checkBoxFetchBinaries.Checked = MacroscopePreferencesManager.GetFetchBinaries();

            checkBoxScanSitesInList.Checked = MacroscopePreferencesManager.GetScanSitesInList();

            checkBoxProbeParentFolderUrls.Checked = MacroscopePreferencesManager.GetProbeParentFolderUrls();

            checkBoxProbeHead404sWithGet.Checked = MacroscopePreferencesManager.GetProbeHead404sWithGet();

            // ---- Analysis options -----------------------------------------------

            checkBoxResolveAddresses.Checked = MacroscopePreferencesManager.GetResolveAddresses();

            checkBoxCheckHreflangs.Checked = MacroscopePreferencesManager.GetCheckHreflangs();
            checkBoxDetectLanguage.Checked = MacroscopePreferencesManager.GetDetectLanguage();

            checkBoxProcessStylesheets.Checked = MacroscopePreferencesManager.GetProcessStylesheets();
            checkBoxProcessJavascripts.Checked = MacroscopePreferencesManager.GetProcessJavascripts();
            checkBoxProcessImages.Checked = MacroscopePreferencesManager.GetProcessImages();
            checkBoxProcessPdfs.Checked = MacroscopePreferencesManager.GetProcessPdfs();
            checkBoxProcessAudio.Checked = MacroscopePreferencesManager.GetProcessAudio();
            checkBoxProcessVideo.Checked = MacroscopePreferencesManager.GetProcessVideo();
            checkBoxProcessXml.Checked = MacroscopePreferencesManager.GetProcessXml();
            checkBoxProcessBinaries.Checked = MacroscopePreferencesManager.GetProcessBinaries();

            numericUpDownRedirectChainsMaxHops.Value = MacroscopePreferencesManager.GetRedirectChainsMaxHops();

            checkBoxWarnAboutInsecureLinks.Checked = MacroscopePreferencesManager.GetWarnAboutInsecureLinks();

            checkBoxEnableTextIndexing.Checked = MacroscopePreferencesManager.GetEnableTextIndexing();
            checkBoxCaseSensitiveTextIndexing.Checked = MacroscopePreferencesManager.GetCaseSensitiveTextIndexing();

            checkBoxDisregardHtml5ElementNav.Checked = MacroscopePreferencesManager.GetDisregardHtml5ElementNav();
            checkBoxDisregardHtml5ElementHeader.Checked = MacroscopePreferencesManager.GetDisregardHtml5ElementHeader();
            checkBoxDisregardHtml5ElementFooter.Checked = MacroscopePreferencesManager.GetDisregardHtml5ElementFooter();

            checkBoxDetectQrCodeInImage.Checked = MacroscopePreferencesManager.GetDetectQrCodeInImage();

            // ---- SEO options ----------------------------------------------------

            numericUpDownTitleMinLen.Value = MacroscopePreferencesManager.GetTitleMinLen();
            numericUpDownTitleMaxLen.Value = MacroscopePreferencesManager.GetTitleMaxLen();
            numericUpDownTitleMinWords.Value = MacroscopePreferencesManager.GetTitleMinWords();
            numericUpDownTitleMaxWords.Value = MacroscopePreferencesManager.GetTitleMaxWords();
            numericUpDownTitleMaxPixelWidth.Value = MacroscopePreferencesManager.GetTitleMaxPixelWidth();

            numericUpDownDescriptionMinLen.Value = MacroscopePreferencesManager.GetDescriptionMinLen();
            numericUpDownDescriptionMaxLen.Value = MacroscopePreferencesManager.GetDescriptionMaxLen();
            numericUpDownDescriptionMinWords.Value = MacroscopePreferencesManager.GetDescriptionMinWords();
            numericUpDownDescriptionMaxWords.Value = MacroscopePreferencesManager.GetDescriptionMaxWords();

            numericUpDownMaxHeadingDepth.Value = MacroscopePreferencesManager.GetMaxHeadingDepth();

            checkBoxAnalyzeKeywordsInText.Checked = MacroscopePreferencesManager.GetAnalyzeKeywordsInText();
            checkBoxAnalyzeTextReadability.Checked = MacroscopePreferencesManager.GetAnalyzeTextReadability();
            comboBoxAnalyzeTextReadabilityEnglishAlgorithm.SelectedIndex = (int)MacroscopePreferencesManager.GetAnalyzeTextReadabilityEnglishAlgorithm();

            checkBoxEnableLevenshteinDeduplication.Checked = MacroscopePreferencesManager.GetEnableLevenshteinDeduplication();
            comboBoxLevenshteinAnalysisLevel.SelectedIndex = (int)MacroscopePreferencesManager.GetLevenshteinAnalysisLevel();
            numericUpDownMaxLevenshteinSizeDifference.Value = MacroscopePreferencesManager.GetMaxLevenshteinSizeDifference();
            numericUpDownMaxLevenshteinDistance.Value = MacroscopePreferencesManager.GetMaxLevenshteinDistance();

            checkBoxAnalyzeClickPaths.Checked = MacroscopePreferencesManager.GetAnalyzeClickPaths();

            // TODO: Finish implementing click path analysis.
            // Until then the navigation-analysis group is shown in DEBUG builds only.
#if DEBUG
            groupBoxPageNavigationAnalysis.Visible = true;
#else
            groupBoxPageNavigationAnalysis.Visible = false;
#endif

            // ---- Custom filter options ------------------------------------------

            checkBoxCustomFiltersEnable.Checked = MacroscopePreferencesManager.GetCustomFiltersEnable();
            numericUpDownCustomFiltersMaxItems.Value = MacroscopePreferencesManager.GetCustomFiltersMaxItems();

            checkBoxCustomFiltersApplyToHtml.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToHtml();
            checkBoxCustomFiltersApplyToCss.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToCss();
            checkBoxCustomFiltersApplyToJavascripts.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToJavascripts();
            checkBoxCustomFiltersApplyToText.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToText();
            checkBoxCustomFiltersApplyToXml.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToXml();

            // ---- Data extractor options -----------------------------------------

            checkBoxDataExtractorsEnable.Checked = MacroscopePreferencesManager.GetDataExtractorsEnable();
            checkBoxDataExtractorsCleanWhiteSpace.Checked = MacroscopePreferencesManager.GetDataExtractorsCleanWhiteSpace();

            numericUpDownDataExtractorsMaxItemsCssSelectors.Value = MacroscopePreferencesManager.GetDataExtractorsMaxItemsCssSelectors();
            numericUpDownDataExtractorsMaxItemsRegexes.Value = MacroscopePreferencesManager.GetDataExtractorsMaxItemsRegexes();
            numericUpDownDataExtractorsMaxItemsXpaths.Value = MacroscopePreferencesManager.GetDataExtractorsMaxItemsXpaths();

            checkBoxDataExtractorsApplyToHtml.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToHtml();
            checkBoxDataExtractorsApplyToCss.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToCss();
            checkBoxDataExtractorsApplyToJavascripts.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToJavascripts();
            checkBoxDataExtractorsApplyToText.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToText();
            checkBoxDataExtractorsApplyToPdf.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToPdf();
            checkBoxDataExtractorsApplyToXml.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToXml();

            // ---- Export options -------------------------------------------------

            checkBoxSitemapIncludeLinkedPdfs.Checked = MacroscopePreferencesManager.GetSitemapIncludeLinkedPdfs();

            // ---- Ignored error statuses -----------------------------------------

            checkBoxIgnoreErrors410.Checked = MacroscopePreferencesManager.GetIgnoreErrors410();
            checkBoxIgnoreErrors451.Checked = MacroscopePreferencesManager.GetIgnoreErrors451();
        }
Пример #2
0
        /**************************************************************************/

        /// <summary>
        /// Fetches one URL as a <c>MacroscopeDocument</c>: applies the page-limit,
        /// hostname, robots and depth checks, executes the document, and then queues
        /// any follow-up work (credential requests, redirect targets, hreflang
        /// languages and outlinks).
        /// </summary>
        /// <param name="Url">URL to fetch.</param>
        /// <param name="RedirectedFromUrl">
        /// URL that redirected to <paramref name="Url"/>; recorded on the document
        /// when it is neither null nor empty.
        /// </param>
        /// <returns>
        /// <c>VOID</c> when the page limit has already been reached;
        /// <c>ALREADY_SEEN</c> when the collection holds a non-dirty document for the URL;
        /// <c>SKIPPED</c> when the URL is deeper than the configured depth;
        /// otherwise the first failure recorded while fetching
        /// (<c>NETWORK_ERROR</c>, <c>ROBOTS_DISALLOWED</c> or <c>ERROR</c>),
        /// or <c>SUCCESS</c> when no failure was recorded.
        /// </returns>
        private async Task <MacroscopeConstants.FetchStatus> Fetch(string Url, string RedirectedFromUrl = null)
        {
            MacroscopeDocument msDoc = null;

            MacroscopeConstants.FetchStatus FetchStatus = MacroscopeConstants.FetchStatus.VOID;

            // Read the limit once so that the comparison and the log message agree.
            int PageLimit = MacroscopePreferencesManager.GetPageLimit();

            if (PageLimit > -1)
            {
                int PagesFound = this.JobMaster.GetPagesFound();

                if (PagesFound >= PageLimit)
                {
                    this.DebugMsg(string.Format("PAGE LIMIT REACHED: {0} :: {1}", PageLimit, PagesFound));
                    return(FetchStatus);
                }
            }

            if (this.DocCollection.ContainsDocument(Url: Url))
            {
                msDoc = this.DocCollection.GetDocumentByUrl(Url: Url);

                // A known document that reported a BASIC authentication realm is
                // re-created with the stored credential, when one is available.
                if (msDoc.GetAuthenticationRealm() != null)
                {
                    if (msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC)
                    {
                        MacroscopeCredential Credential = this.JobMaster.GetCredentialsHttp().GetCredential(
                            msDoc.GetHostAndPort(),
                            msDoc.GetAuthenticationRealm()
                            );

                        if (Credential != null)
                        {
                            msDoc = this.DocCollection.CreateDocument(
                                Credential: Credential,
                                Url: Url
                                );
                        }
                    }
                }
            }
            else
            {
                msDoc = this.DocCollection.CreateDocument(Url: Url);
            }

            if (!string.IsNullOrEmpty(RedirectedFromUrl))
            {
                msDoc.SetUrlRedirectFrom(Url: RedirectedFromUrl);
            }

            msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.OK);

            if (!MacroscopeDnsTools.CheckValidHostname(Url: Url))
            {
                this.DebugMsg(string.Format("Fetch :: CheckValidHostname: {0}", "NOT OK"));
                msDoc.SetStatusCode(HttpStatusCode.BadGateway);
                FetchStatus = MacroscopeConstants.FetchStatus.NETWORK_ERROR;
                msDoc.SetFetchStatus(FetchStatus);
            }

            msDoc.SetAllowedByRobots(await this.JobMaster.GetRobots().CheckRobotRule(Url: Url));

            // ApplyRobotRule yields true when the URL may be crawled: the false
            // branch below is the one that records the URL as disallowed.
            bool AllowedByRobotsRule = await this.JobMaster.GetRobots().ApplyRobotRule(Url: Url);

            if (!AllowedByRobotsRule)
            {
                this.DebugMsg(string.Format("Disallowed by robots.txt: {0}", Url));

                this.JobMaster.AddToBlockedByRobots(Url);

                FetchStatus = MacroscopeConstants.FetchStatus.ROBOTS_DISALLOWED;

                msDoc.SetFetchStatus(FetchStatus);

                JobHistory.VisitedHistoryItem(Url: msDoc.GetUrl());
            }
            else
            {
                this.JobMaster.RemoveFromBlockedByRobots(Url);
            }

            if (this.AllowedHosts.IsExternalUrl(Url: Url))
            {
                this.DebugMsg(string.Format("IsExternalUrl: {0}", Url));
                msDoc.SetIsExternal(State: true);
            }

            if (this.DocCollection.ContainsDocument(Url: Url))
            {
                if (!this.DocCollection.GetDocumentByUrl(Url: Url).GetIsDirty())
                {
                    FetchStatus = MacroscopeConstants.FetchStatus.ALREADY_SEEN;
                    return(FetchStatus);
                }
            }

            int MaxDepth = MacroscopePreferencesManager.GetDepth();

            if (MaxDepth >= 0)
            {
                int Depth = MacroscopeHttpUrlUtils.FindUrlDepth(Url: Url);

                if (Depth > MaxDepth)
                {
                    this.DebugMsg(string.Format("URL Too Deep: {0}", Depth));
                    FetchStatus = MacroscopeConstants.FetchStatus.SKIPPED;
                    return(FetchStatus);
                }
            }

            /** ------------------------------------------------------------------ **/

            if (!await msDoc.Execute())
            {
                this.DebugMsg(string.Format("EXECUTE FAILED: {0}", Url));

                // Keep a more specific failure (NETWORK_ERROR, ROBOTS_DISALLOWED)
                // when one was already recorded above.
                if (FetchStatus == MacroscopeConstants.FetchStatus.VOID)
                {
                    FetchStatus = MacroscopeConstants.FetchStatus.ERROR;
                }
            }

            /** ------------------------------------------------------------------ **/

            if (msDoc.GetStatusCode() == HttpStatusCode.Unauthorized)
            {
                if (msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC)
                {
                    // Ask for credentials and put the URL back on the queue.
                    MacroscopeCredentialsHttp CredentialsHttp = this.JobMaster.GetCredentialsHttp();

                    CredentialsHttp.EnqueueCredentialRequest(
                        Domain: msDoc.GetHostAndPort(),
                        Realm: msDoc.GetAuthenticationRealm(),
                        Url: msDoc.GetUrl()
                        );

                    this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrl());
                }
            }

            if (msDoc.GetIsRedirect())
            {
                this.DebugMsg(string.Format("REDIRECTION DETECTED GetUrl: {0}", msDoc.GetUrl()));
                this.DebugMsg(string.Format("REDIRECTION DETECTED From: {0}", msDoc.GetUrlRedirectFrom()));

                if (MacroscopePreferencesManager.GetCheckRedirects())
                {
                    string Hostname      = msDoc.GetHostAndPort();
                    string HostnameFrom  = MacroscopeAllowedHosts.ParseHostnameFromUrl(msDoc.GetUrlRedirectFrom());
                    string UrlRedirectTo = msDoc.GetUrlRedirectTo();
                    string HostnameTo    = MacroscopeAllowedHosts.ParseHostnameFromUrl(UrlRedirectTo);

                    this.DebugMsg(string.Format("REDIRECTION DETECTED UrlRedirectTo: {0}", UrlRedirectTo));
                    this.DebugMsg(string.Format("REDIRECTION DETECTED HostnameTo: {0}", HostnameTo));

                    if (MacroscopePreferencesManager.GetFollowRedirects())
                    {
                        // The redirect target's host is allowed unconditionally when
                        // external links are checked, otherwise only when the target
                        // is already an internal URL.
                        if (MacroscopePreferencesManager.GetCheckExternalLinks())
                        {
                            this.AllowedHosts.AddFromUrl(Url: UrlRedirectTo);
                        }
                        else
                        {
                            if (this.AllowedHosts.IsInternalUrl(Url: UrlRedirectTo))
                            {
                                this.AllowedHosts.AddFromUrl(Url: UrlRedirectTo);
                            }
                        }
                    }
                }

                this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrlRedirectTo());
            }
            else
            {
                this.ProcessHrefLangLanguages(msDoc);         // Process Languages from HrefLang

                this.JobMaster.ProcessOutlinks(msDoc: msDoc); // Process Outlinks from document
            }

            // Report SUCCESS only when no earlier stage recorded a failure.
            // Assigning it unconditionally would hide ERROR / NETWORK_ERROR from
            // the caller, whose retry handling switches on exactly those values.
            if (FetchStatus == MacroscopeConstants.FetchStatus.VOID)
            {
                FetchStatus = MacroscopeConstants.FetchStatus.SUCCESS;
            }

            /** ------------------------------------------------------------------ **/

            if (DocCollection.ContainsDocument(msDoc: msDoc))
            {
                JobHistory.VisitedHistoryItem(Url: Url);
            }
            else
            {
                this.DebugMsg(string.Format("OOPS: {0}", Url));
            }

            /** ------------------------------------------------------------------ **/

            return(FetchStatus);
        }
Пример #3
0
        /**************************************************************************/

        /// <summary>
        /// Worker loop: repeatedly takes a job item from the job master's URL queue,
        /// filters it (include/exclude rules, parent/child directory rules, depth),
        /// fetches it with retries, and finally notifies the job master that this
        /// worker is done.
        /// </summary>
        /// <remarks>
        /// The loop runs at most <c>GetMaxFetchesPerWorker()</c> passes; every pass
        /// counts against that budget, including passes in which no usable URL was
        /// obtained. The loop also ends as soon as the job master reports that
        /// threads should stop.
        /// The method stays <c>async void</c> so that existing callers keep
        /// compiling; as a consequence callers cannot await it or observe
        /// exceptions that escape it.
        /// </remarks>
        public async void Execute()
        {
            // Pause between two attempts at the same URL after a failed fetch.
            const int RetryPauseMilliseconds = 25;

            int MaxFetches = MacroscopePreferencesManager.GetMaxFetchesPerWorker();

            while (MaxFetches > 0)
            {
                if (this.JobMaster.GetThreadsStop())
                {
                    this.DebugMsg(string.Format("JobMaster.GetThreadsStop: {0}", this.JobMaster.GetThreadsStop()));
                    break;
                }
                else
                {
                    MacroscopeJobItem JobItem           = this.JobMaster.GetUrlQueueItem();
                    string            Url               = null;
                    string            RedirectedFromUrl = null;

                    if (JobItem != null)
                    {
                        Url = JobItem.GetItemUrl();
                        RedirectedFromUrl = JobItem.GetItemRedirectedFromUrl();
                    }

                    // From here on, a null Url means "skip this item".

                    if (!string.IsNullOrEmpty(Url))
                    {
                        if (!this.CheckIncludeExcludeUrl(Url))
                        {
                            Url = null;
                        }
                    }

                    if (!string.IsNullOrEmpty(Url))
                    {
                        if (
                            !MacroscopePreferencesManager.GetCrawlParentDirectories() &&
                            !MacroscopePreferencesManager.GetCrawlChildDirectories() &&
                            Url != this.JobMaster.GetStartUrl())
                        {
                            // Neither direction is enabled: only the start URL itself is crawled.
                            Url = null;
                        }
                        else if (
                            !MacroscopePreferencesManager.GetCrawlParentDirectories() ||
                            !MacroscopePreferencesManager.GetCrawlChildDirectories())
                        {
                            this.DebugMsg(string.Format("Running Parent/Child Check: {0}", Url));

                            if (
                                MacroscopePreferencesManager.GetCrawlParentDirectories() &&
                                (!string.IsNullOrEmpty(Url)))
                            {
                                if (!MacroscopeHttpUrlUtils.IsWithinParentDirectory(StartUrl: this.JobMaster.GetParentStartingDirectory(), Url: Url))
                                {
                                    Url = null;
                                }
                            }

                            if (
                                MacroscopePreferencesManager.GetCrawlChildDirectories() &&
                                (!string.IsNullOrEmpty(Url)))
                            {
                                if (!MacroscopeHttpUrlUtils.IsWithinChildDirectory(StartUrl: this.JobMaster.GetChildStartingDirectory(), Url: Url))
                                {
                                    Url = null;
                                }
                            }
                        }
                        else
                        {
                            this.DebugMsg(string.Format("Skipping Parent/Child Check: {0}", Url));
                        }
                    }

                    if (!string.IsNullOrEmpty(Url))
                    {
                        if (MacroscopePreferencesManager.GetDepth() >= 0)
                        {
                            if (MacroscopeHttpUrlUtils.FindUrlDepth(Url: Url) > MacroscopePreferencesManager.GetDepth())
                            {
                                this.DebugMsg(string.Format("URL Too Deep: {0}", Url));
                                Url = null;
                            }
                        }
                    }

                    if (!string.IsNullOrEmpty(Url))
                    {
                        this.DebugMsg(string.Format("Execute: {0}", Url));

                        int Tries = MacroscopePreferencesManager.GetMaxRetries();

                        JobHistory.AddHistoryItem(Url: Url);

                        // At least one attempt is always made, even when the
                        // configured retry count is zero.
                        do
                        {
                            this.DebugMsg(string.Format("Trying Fetch: {0} :: {1}", Tries, Url));

                            MacroscopeConstants.FetchStatus FetchStatus = MacroscopeConstants.FetchStatus.VOID;

                            try
                            {
                                if (!string.IsNullOrEmpty(RedirectedFromUrl))
                                {
                                    FetchStatus = await this.Fetch(Url, RedirectedFromUrl);
                                }
                                else
                                {
                                    FetchStatus = await this.Fetch(Url);
                                }
                            }
                            catch (Exception ex)
                            {
                                // Best effort: the exception is logged and the status
                                // stays VOID, which ends the retry loop below.
                                this.DebugMsg(string.Format("FetchStatus: {0}", ex.Message));
                                this.DebugMsg(string.Format("Url: {0}", Url));
                                this.DebugMsg(string.Format("FetchStatus: {0}", FetchStatus));
                            }

                            switch (FetchStatus)
                            {
                            case MacroscopeConstants.FetchStatus.ERROR:
                            case MacroscopeConstants.FetchStatus.NETWORK_ERROR:
                                this.DebugMsg(string.Format("Fetch Failed: {0} :: {1}", Tries, Url));
                                // Non-blocking pause: does not hold a thread while waiting.
                                await System.Threading.Tasks.Task.Delay(RetryPauseMilliseconds);
                                break;

                            default:
                                this.JobMaster.NotifyWorkersFetched(Url: Url);
                                Tries = 0; // any other status ends the retry loop
                                break;
                            }

                            Tries--;
                        } while(Tries > 0);

                        if (this.CrawlDelay > 0)
                        {
                            this.DebugMsg(string.Format("CRAWL DELAY: Sleeping for {0} seconds...", this.CrawlDelay));
                            // CrawlDelay is in seconds; Task.Delay takes milliseconds.
                            await System.Threading.Tasks.Task.Delay(CrawlDelay * 1000);
                        }
                    }
                }

                MaxFetches--;
            }

            this.JobMaster.NotifyWorkersDone();
        }
Пример #4
0
        /**************************************************************************/

        /// <summary>
        /// Puts this job master into its initial state for a new job: stores the
        /// run-time mode, creates fresh document, host, queue, history, locale and
        /// robots containers, and resets the thread and page counters.
        /// </summary>
        /// <param name="JobRunTimeMode">Run-time mode stored in <c>RunTimeMode</c>.</param>
        private void InitializeJobMaster(MacroscopeConstants.RunTimeMode JobRunTimeMode)
        {
            GC.Collect();

            this.RunTimeMode = JobRunTimeMode;

            // Credentials are taken from the task controller when one is attached.
            if (this.TaskController != null)
            {
                this.CredentialsHttp = this.TaskController.IGetCredentialsHttp();
            }

            this.DocCollection = new MacroscopeDocumentCollection(JobMaster: this);
            this.AllowedHosts = new MacroscopeAllowedHosts();

            // ---- Named queues: job items ----------------------------------------

            this.NamedQueueJobItems = new MacroscopeNamedQueue <MacroscopeJobItem> ();

            this.NamedQueueJobItems.CreateNamedQueue(
                Name: MacroscopeConstants.NamedQueueUrlList,
                QueueMode: MacroscopeNamedQueue <MacroscopeJobItem> .MODE.USE_HISTORY
                );

            // ---- Named queues: display ------------------------------------------

            this.NamedQueue = new MacroscopeNamedQueue <string> ();

            // Created one by one, in exactly this order.
            var DisplayQueueNames = new[]
            {
                MacroscopeConstants.NamedQueueDisplayQueue,
                MacroscopeConstants.NamedQueueDisplayStructure,
                MacroscopeConstants.NamedQueueDisplayStructureLinkCounts,
                MacroscopeConstants.NamedQueueDisplayHierarchy,
                MacroscopeConstants.NamedQueueDisplayCanonicalAnalysis,
                MacroscopeConstants.NamedQueueDisplayHrefLang,
                MacroscopeConstants.NamedQueueDisplayErrors,
                MacroscopeConstants.NamedQueueDisplayHostnames,
                MacroscopeConstants.NamedQueueDisplayRedirectsAudit,
                MacroscopeConstants.NamedQueueDisplayLinks,
                MacroscopeConstants.NamedQueueDisplayHyperlinks,
                MacroscopeConstants.NamedQueueDisplayUriAnalysis,
                MacroscopeConstants.NamedQueueDisplayPageTitles,
                MacroscopeConstants.NamedQueueDisplayPageDescriptions,
                MacroscopeConstants.NamedQueueDisplayPageKeywords,
                MacroscopeConstants.NamedQueueDisplayPageHeadings,
                MacroscopeConstants.NamedQueueDisplayPageText,
                MacroscopeConstants.NamedQueueDisplayStylesheets,
                MacroscopeConstants.NamedQueueDisplayImages,
                MacroscopeConstants.NamedQueueDisplayJavascripts,
                MacroscopeConstants.NamedQueueDisplayAudios,
                MacroscopeConstants.NamedQueueDisplayVideos,
                MacroscopeConstants.NamedQueueDisplaySitemaps,
                MacroscopeConstants.NamedQueueDisplayEmailAddresses,
                MacroscopeConstants.NamedQueueDisplayTelephoneNumbers,
                MacroscopeConstants.NamedQueueDisplayCustomFilters,
                MacroscopeConstants.NamedQueueDisplayDataExtractorsCssSelectors,
                MacroscopeConstants.NamedQueueDisplayDataExtractorsRegexes,
                MacroscopeConstants.NamedQueueDisplayDataExtractorsXpaths,
                MacroscopeConstants.NamedQueueDisplayRemarks
            };

            foreach (var DisplayQueueName in DisplayQueueNames)
            {
                this.NamedQueue.CreateNamedQueue(Name: DisplayQueueName);
            }

            // ---- Threading ------------------------------------------------------

            this.CrawlDelay = 0;

            this.AdjustThreadsMax();
            this.ThreadsRunning = 0;
            this.ThreadsStop = false;
            this.ThreadsDict = new Dictionary <int, bool> ();

            // The semaphore starts empty and is then released up to ThreadsMax,
            // which leaves every slot available.
            this.SemaphoreWorkers = new Semaphore(0, this.ThreadsMax);
            this.SemaphoreWorkers.Release(this.ThreadsMax);

            // ---- Crawl limits and counters --------------------------------------

            this.Depth = MacroscopePreferencesManager.GetDepth();
            this.PageLimit = MacroscopePreferencesManager.GetPageLimit();
            this.PageLimitCount = 0;

            this.PagesFound = 0;

            this.ParentStartingDirectory = "";
            this.ChildStartingDirectory = "";

            // ---- Per-job bookkeeping --------------------------------------------

            this.JobHistory = new MacroscopeJobHistory();

            this.InitProgress();

            this.Locales = new Dictionary <string, string> (32);

            this.Robots = new MacroscopeRobots();
            this.BlockedByRobots = new Dictionary <string, bool> ();
        }