/**************************************************************************/

        /// <summary>
        /// Loads the current values from <c>MacroscopePreferencesManager</c> into the
        /// controls of the preferences form.
        /// </summary>
        /// <remarks>
        /// The range limits of the depth, page limit, crawl delay and max retries
        /// controls are set first, and only then are any values assigned.
        /// NOTE(review): presumably these are WinForms NumericUpDown controls, which
        /// reject values outside Minimum/Maximum - confirm before reordering anything.
        /// </remarks>
        public void SetPrefsFormControlFields()
        {
            /** Display Options ------------------------------------------------ **/

            checkBoxPauseDisplayDuringScan.Checked = MacroscopePreferencesManager.GetPauseDisplayDuringScan();
            checkBoxShowProgressDialogues.Checked = MacroscopePreferencesManager.GetShowProgressDialogues();

            /** Spidering Control: permitted ranges ---------------------------- **/

            numericUpDownDepth.Minimum = -1;
            numericUpDownDepth.Maximum = 10000;

            numericUpDownPageLimit.Minimum = -1;
            numericUpDownPageLimit.Maximum = 10000;

            numericUpDownCrawlDelay.Minimum = 0;
            numericUpDownCrawlDelay.Maximum = 60;

            numericUpDownMaxRetries.Minimum = 0;
            numericUpDownMaxRetries.Maximum = 10;

            /** WebProxy Options ----------------------------------------------- **/

            comboBoxProxyType.SelectedIndex = (int)MacroscopePreferencesManager.GetProxyType();

            /** Server Certificate Options ------------------------------------- **/

            checkBoxServerCertificateValidation.Checked = MacroscopePreferencesManager.GetServerCertificateValidation();

            /** Spidering Control: values -------------------------------------- **/

            numericUpDownMaxThreads.Value = MacroscopePreferencesManager.GetMaxThreads();
            numericUpDownDepth.Value = MacroscopePreferencesManager.GetDepth();
            numericUpDownPageLimit.Value = MacroscopePreferencesManager.GetPageLimit();
            numericUpDownCrawlDelay.Value = MacroscopePreferencesManager.GetCrawlDelay();
            numericUpDownRequestTimeout.Value = (decimal)MacroscopePreferencesManager.GetRequestTimeout();
            numericUpDownMaxRetries.Value = (decimal)MacroscopePreferencesManager.GetMaxRetries();

            checkBoxCrawlStrictUrlCheck.Checked = MacroscopePreferencesManager.GetCrawlStrictUrlCheck();

            checkBoxCheckExternalLinks.Checked = MacroscopePreferencesManager.GetCheckExternalLinks();
            checkBoxFetchExternalLinks.Checked = MacroscopePreferencesManager.GetFetchExternalLinks();

            checkBoxFollowRobotsProtocol.Checked = MacroscopePreferencesManager.GetFollowRobotsProtocol();
            checkBoxFollowSitemapLinks.Checked = MacroscopePreferencesManager.GetFollowSitemapLinks();
            checkBoxProbeHumansText.Checked = MacroscopePreferencesManager.GetProbeHumansText();

            checkBoxCheckRedirects.Checked = MacroscopePreferencesManager.GetCheckRedirects();
            checkBoxFollowRedirects.Checked = MacroscopePreferencesManager.GetFollowRedirects();

            checkBoxFollowNoFollow.Checked = MacroscopePreferencesManager.GetFollowNoFollow();
            checkBoxIgnoreQueries.Checked = MacroscopePreferencesManager.GetIgnoreQueries();
            checkBoxIgnoreHashFragments.Checked = MacroscopePreferencesManager.GetIgnoreHashFragments();
            checkBoxFollowCanonicalLinks.Checked = MacroscopePreferencesManager.GetFollowCanonicalLinks();
            checkBoxFollowAlternateLinks.Checked = MacroscopePreferencesManager.GetFollowAlternateLinks();
            checkBoxFollowHrefLangLinks.Checked = MacroscopePreferencesManager.GetFollowHrefLangLinks();
            checkBoxDowncaseLinks.Checked = MacroscopePreferencesManager.GetDowncaseLinks();

            checkBoxFetchStylesheets.Checked = MacroscopePreferencesManager.GetFetchStylesheets();
            checkBoxFetchJavascripts.Checked = MacroscopePreferencesManager.GetFetchJavascripts();
            checkBoxFetchImages.Checked = MacroscopePreferencesManager.GetFetchImages();
            checkBoxFetchAudio.Checked = MacroscopePreferencesManager.GetFetchAudio();
            checkBoxFetchVideo.Checked = MacroscopePreferencesManager.GetFetchVideo();
            checkBoxFetchXml.Checked = MacroscopePreferencesManager.GetFetchXml();
            checkBoxFetchBinaries.Checked = MacroscopePreferencesManager.GetFetchBinaries();

            checkBoxScanSitesInList.Checked = MacroscopePreferencesManager.GetScanSitesInList();

            checkBoxProbeParentFolderUrls.Checked = MacroscopePreferencesManager.GetProbeParentFolderUrls();

            checkBoxProbeHead404sWithGet.Checked = MacroscopePreferencesManager.GetProbeHead404sWithGet();

            /** Analysis Options ----------------------------------------------- **/

            checkBoxResolveAddresses.Checked = MacroscopePreferencesManager.GetResolveAddresses();

            checkBoxCheckHreflangs.Checked = MacroscopePreferencesManager.GetCheckHreflangs();
            checkBoxDetectLanguage.Checked = MacroscopePreferencesManager.GetDetectLanguage();

            checkBoxProcessStylesheets.Checked = MacroscopePreferencesManager.GetProcessStylesheets();
            checkBoxProcessJavascripts.Checked = MacroscopePreferencesManager.GetProcessJavascripts();
            checkBoxProcessImages.Checked = MacroscopePreferencesManager.GetProcessImages();
            checkBoxProcessPdfs.Checked = MacroscopePreferencesManager.GetProcessPdfs();
            checkBoxProcessAudio.Checked = MacroscopePreferencesManager.GetProcessAudio();
            checkBoxProcessVideo.Checked = MacroscopePreferencesManager.GetProcessVideo();
            checkBoxProcessXml.Checked = MacroscopePreferencesManager.GetProcessXml();
            checkBoxProcessBinaries.Checked = MacroscopePreferencesManager.GetProcessBinaries();

            numericUpDownRedirectChainsMaxHops.Value = MacroscopePreferencesManager.GetRedirectChainsMaxHops();

            checkBoxWarnAboutInsecureLinks.Checked = MacroscopePreferencesManager.GetWarnAboutInsecureLinks();

            checkBoxEnableTextIndexing.Checked = MacroscopePreferencesManager.GetEnableTextIndexing();
            checkBoxCaseSensitiveTextIndexing.Checked = MacroscopePreferencesManager.GetCaseSensitiveTextIndexing();

            checkBoxDisregardHtml5ElementNav.Checked = MacroscopePreferencesManager.GetDisregardHtml5ElementNav();
            checkBoxDisregardHtml5ElementHeader.Checked = MacroscopePreferencesManager.GetDisregardHtml5ElementHeader();
            checkBoxDisregardHtml5ElementFooter.Checked = MacroscopePreferencesManager.GetDisregardHtml5ElementFooter();

            checkBoxDetectQrCodeInImage.Checked = MacroscopePreferencesManager.GetDetectQrCodeInImage();

            /** SEO Options ---------------------------------------------------- **/

            numericUpDownTitleMinLen.Value = MacroscopePreferencesManager.GetTitleMinLen();
            numericUpDownTitleMaxLen.Value = MacroscopePreferencesManager.GetTitleMaxLen();
            numericUpDownTitleMinWords.Value = MacroscopePreferencesManager.GetTitleMinWords();
            numericUpDownTitleMaxWords.Value = MacroscopePreferencesManager.GetTitleMaxWords();
            numericUpDownTitleMaxPixelWidth.Value = MacroscopePreferencesManager.GetTitleMaxPixelWidth();

            numericUpDownDescriptionMinLen.Value = MacroscopePreferencesManager.GetDescriptionMinLen();
            numericUpDownDescriptionMaxLen.Value = MacroscopePreferencesManager.GetDescriptionMaxLen();
            numericUpDownDescriptionMinWords.Value = MacroscopePreferencesManager.GetDescriptionMinWords();
            numericUpDownDescriptionMaxWords.Value = MacroscopePreferencesManager.GetDescriptionMaxWords();

            numericUpDownMaxHeadingDepth.Value = MacroscopePreferencesManager.GetMaxHeadingDepth();

            checkBoxAnalyzeKeywordsInText.Checked = MacroscopePreferencesManager.GetAnalyzeKeywordsInText();
            checkBoxAnalyzeTextReadability.Checked = MacroscopePreferencesManager.GetAnalyzeTextReadability();
            comboBoxAnalyzeTextReadabilityEnglishAlgorithm.SelectedIndex = (int)MacroscopePreferencesManager.GetAnalyzeTextReadabilityEnglishAlgorithm();

            checkBoxEnableLevenshteinDeduplication.Checked = MacroscopePreferencesManager.GetEnableLevenshteinDeduplication();
            comboBoxLevenshteinAnalysisLevel.SelectedIndex = (int)MacroscopePreferencesManager.GetLevenshteinAnalysisLevel();
            numericUpDownMaxLevenshteinSizeDifference.Value = MacroscopePreferencesManager.GetMaxLevenshteinSizeDifference();
            numericUpDownMaxLevenshteinDistance.Value = MacroscopePreferencesManager.GetMaxLevenshteinDistance();

            checkBoxAnalyzeClickPaths.Checked = MacroscopePreferencesManager.GetAnalyzeClickPaths();

            // TODO: Finish implementing click path analysis.
            // Until then the page navigation analysis group is only shown in DEBUG builds.
#if DEBUG
            groupBoxPageNavigationAnalysis.Visible = true;
#else
            groupBoxPageNavigationAnalysis.Visible = false;
#endif

            /** Custom Filter Options ------------------------------------------ **/

            checkBoxCustomFiltersEnable.Checked = MacroscopePreferencesManager.GetCustomFiltersEnable();
            numericUpDownCustomFiltersMaxItems.Value = MacroscopePreferencesManager.GetCustomFiltersMaxItems();

            checkBoxCustomFiltersApplyToHtml.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToHtml();
            checkBoxCustomFiltersApplyToCss.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToCss();
            checkBoxCustomFiltersApplyToJavascripts.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToJavascripts();
            checkBoxCustomFiltersApplyToText.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToText();
            checkBoxCustomFiltersApplyToXml.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToXml();

            /** Extractor Options ---------------------------------------------- **/

            checkBoxDataExtractorsEnable.Checked = MacroscopePreferencesManager.GetDataExtractorsEnable();
            checkBoxDataExtractorsCleanWhiteSpace.Checked = MacroscopePreferencesManager.GetDataExtractorsCleanWhiteSpace();

            numericUpDownDataExtractorsMaxItemsCssSelectors.Value = MacroscopePreferencesManager.GetDataExtractorsMaxItemsCssSelectors();
            numericUpDownDataExtractorsMaxItemsRegexes.Value = MacroscopePreferencesManager.GetDataExtractorsMaxItemsRegexes();
            numericUpDownDataExtractorsMaxItemsXpaths.Value = MacroscopePreferencesManager.GetDataExtractorsMaxItemsXpaths();

            checkBoxDataExtractorsApplyToHtml.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToHtml();
            checkBoxDataExtractorsApplyToCss.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToCss();
            checkBoxDataExtractorsApplyToJavascripts.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToJavascripts();
            checkBoxDataExtractorsApplyToText.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToText();
            checkBoxDataExtractorsApplyToPdf.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToPdf();
            checkBoxDataExtractorsApplyToXml.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToXml();

            /** Export Options ------------------------------------------------- **/

            checkBoxSitemapIncludeLinkedPdfs.Checked = MacroscopePreferencesManager.GetSitemapIncludeLinkedPdfs();

            /** Ignore Errors Settings ----------------------------------------- **/

            checkBoxIgnoreErrors410.Checked = MacroscopePreferencesManager.GetIgnoreErrors410();
            checkBoxIgnoreErrors451.Checked = MacroscopePreferencesManager.GetIgnoreErrors451();
        }
Ejemplo n.º 2
0
        /**************************************************************************/

        /// <summary>
        /// Compares the original document against each document in the collection and
        /// returns those whose Levenshtein distance is within the comparison threshold.
        /// </summary>
        /// <param name="DocCollection">Collection of documents to compare against the original document.</param>
        /// <returns>
        /// The matching documents mapped to their distance. A document whose checksum equals
        /// the original document's checksum is recorded with a distance of 0.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// The fingerprint analyzer is null or is not exactly a <c>Levenshtein</c> instance, or
        /// the configured Levenshtein analysis level is neither 1 nor 2.
        /// </exception>
        /// <exception cref="ArgumentNullException"><paramref name="DocCollection"/> is null.</exception>
        public Dictionary<MacroscopeDocument, int> AnalyzeDocCollection(MacroscopeDocumentCollection DocCollection)
        {
            // A null analyzer previously surfaced as a NullReferenceException from GetType();
            // report it as the "not initialized" condition this guard is meant to catch.
            if ((this.AnalyzerFingerprint == null) || (this.AnalyzerFingerprint.GetType() != typeof(Levenshtein)))
            {
                throw new InvalidOperationException("MacroscopeLevenshteinAnalysis not initialized");
            }

            if (DocCollection == null)
            {
                throw new ArgumentNullException("DocCollection");
            }

            int TotalDocuments = DocCollection.CountDocuments();

            Dictionary<MacroscopeDocument, int> DocList = new Dictionary<MacroscopeDocument, int>(TotalDocuments);
            decimal DocListCount = (decimal)TotalDocuments;
            decimal Count = 0;

            // Level-2 analyzer for the original document's text. It does not depend on the
            // document being compared, so it is built once (on first use) and then reused.
            Levenshtein AnalyzerText = null;

            foreach (MacroscopeDocument msDocCompare in DocCollection.IterateDocuments())
            {
                string CompareFingerprint = msDocCompare.GetLevenshteinFingerprint();

                Count++;

                // Progress is reported for every document, including the ones skipped below.
                if ((this.PercentageDone != null) && (DocListCount > 0))
                {
                    this.PercentageDone.PercentageDone((((decimal)100 / DocListCount) * Count), msDocCompare.GetUrl());
                }

                if (this.CrossCheckDocuments(msDocCompare: msDocCompare))
                {
                    continue;
                }

                if (msDocCompare.GetIsExternal())
                {
                    continue;
                }

                if (msDocCompare.GetIsRedirect())
                {
                    continue;
                }

                if (!this.AllowedDocType(msDoc: msDocCompare))
                {
                    continue;
                }

                // Never compare the original document with itself.
                if (msDocCompare.GetUrl() == this.msDocOriginal.GetUrl())
                {
                    continue;
                }

                // Nothing to compare against; a null fingerprint is skipped like an empty one.
                if (string.IsNullOrEmpty(CompareFingerprint))
                {
                    continue;
                }

                // Identical checksums are recorded as a distance of 0 without computing it.
                if (msDocOriginal.GetChecksum() == msDocCompare.GetChecksum())
                {
                    DocList.Add(msDocCompare, 0);
                    continue;
                }

                // Only compute the distance when the fingerprint lengths are close enough.
                int SizeDifference = Math.Abs(CompareFingerprint.Length - this.Fingerprint.Length);

                if (SizeDifference <= this.ComparisonSizeDifference)
                {
                    int DistanceFingerprint = this.AnalyzerFingerprint.DistanceFrom(value: CompareFingerprint);

                    if (DistanceFingerprint <= this.ComparisonThreshold)
                    {
                        switch (MacroscopePreferencesManager.GetLevenshteinAnalysisLevel())
                        {
                        case 1:
                            // Level 1: accept the fingerprint distance as the result.
                            DocList.Add(msDocCompare, DistanceFingerprint);
                            break;

                        case 2:
                            // Level 2: confirm the fingerprint match against the lower-cased raw text.
                            if (AnalyzerText == null)
                            {
                                AnalyzerText = new Levenshtein(value: this.msDocOriginal.GetDocumentTextRaw().ToLower());
                            }

                            string CompareDocumentText  = msDocCompare.GetDocumentTextRaw().ToLower();
                            int    DistanceDocumentText = AnalyzerText.DistanceFrom(value: CompareDocumentText);

                            if (DistanceDocumentText <= this.ComparisonThreshold)
                            {
                                DocList.Add(msDocCompare, DistanceDocumentText);
                            }
                            break;

                        default:
                            throw new InvalidOperationException("Invalid Levenshtein Analysis Level");
                        }
                    }
                }

                Thread.Yield();
            }

            return DocList;
        }