Ejemplo n.º 1
0
        /** Execute Job ***********************************************************/

        /// <summary>
        /// Runs the job against the configured start URL.
        /// </summary>
        /// <remarks>
        /// Steps, in order: sanitize the start URL and store it back; hand it to the
        /// document collection; determine the starting directory; clear the
        /// thread-stop flag; register the start URL with the allowed hosts; seed the
        /// URL queue when <c>PeekUrlQueue()</c> returns false; probe the robots file;
        /// set the crawl delay; spawn workers; then notify the task controller
        /// (when one is attached) and queue the start URL for a display update.
        /// </remarks>
        /// <returns>Always <c>true</c>.</returns>
        public Boolean Execute()
        {
            DebugMsg(string.Format("Start URL: {0}", StartUrl));

            // Everything below works with the sanitized form of the start URL.
            StartUrl = MacroscopeUrlUtils.SanitizeUrl(Url: StartUrl);

            DocCollection.SetStartUrl(Url: StartUrl);
            DetermineStartingDirectory();
            SetThreadsStop(Stopped: false);
            AllowedHosts.AddFromUrl(Url: StartUrl);

            // Seed the queue only when PeekUrlQueue() reports false
            // (presumably meaning nothing is queued yet - confirm against PeekUrlQueue).
            if (!PeekUrlQueue())
            {
                string robotsUrl = MacroscopeRobots.GenerateRobotUrl(Url: StartUrl);

                // The generated robots URL is queued first, but only if one was produced.
                if (!string.IsNullOrEmpty(robotsUrl))
                {
                    AddUrlQueueItem(Url: robotsUrl);
                }

                IncludeExcludeUrls.AddExplicitIncludeUrl(Url: StartUrl);
                AddUrlQueueItem(Url: StartUrl);
            }

            ProbeRobotsFile(Url: StartUrl);
            SetCrawlDelay(Url: StartUrl);

            SpawnWorkers();

            DebugMsg(string.Format("Pages Found: {0}", GetPagesFound()));

            // The task controller is optional; report completion only when attached.
            if (TaskController != null)
            {
                TaskController.ICallbackScanComplete();
            }

            AddUpdateDisplayQueue(Url: StartUrl);

            return true;
        }
Ejemplo n.º 2
0
        /**************************************************************************/

        /// <summary>
        /// Resets this job master to its initial state for the given run-time mode:
        /// fetches HTTP credentials from the task controller (when attached), creates
        /// the document collection, allowed-hosts list and named queues, and resets
        /// threading, depth/page-limit, directory, history, progress, locale and
        /// robots state.
        /// </summary>
        /// <param name="JobRunTimeMode">Run-time mode stored in <c>RunTimeMode</c>.</param>
        private void InitializeJobMaster(MacroscopeConstants.RunTimeMode JobRunTimeMode)
        {
            // NOTE(review): explicit full collection kept as in the original;
            // forcing the GC is normally discouraged - confirm it is still needed.
            GC.Collect();

            // Event-log setup (JobMasterLog / JobGuid / "Starting Job" entry) is currently disabled.

            RunTimeMode = JobRunTimeMode;

            // Credentials are only available when a task controller is attached.
            if (TaskController != null)
            {
                CredentialsHttp = TaskController.IGetCredentialsHttp();
            }

            DocCollection = new MacroscopeDocumentCollection(JobMaster: this);
            AllowedHosts = new MacroscopeAllowedHosts();

            /** BEGIN: Named Queues *************************************************/

            // Job-item queue holding the URL list, created in USE_HISTORY mode.
            NamedQueueJobItems = new MacroscopeNamedQueue<MacroscopeJobItem>();
            NamedQueueJobItems.CreateNamedQueue(
                Name: MacroscopeConstants.NamedQueueUrlList,
                QueueMode: MacroscopeNamedQueue<MacroscopeJobItem>.MODE.USE_HISTORY
                );

            // String queues, created in this exact order with the single-argument overload.
            NamedQueue = new MacroscopeNamedQueue<string>();

            var displayQueueNames = new[]
            {
                MacroscopeConstants.NamedQueueDisplayQueue,

                MacroscopeConstants.NamedQueueDisplayStructure,
                MacroscopeConstants.NamedQueueDisplayStructureLinkCounts,

                MacroscopeConstants.NamedQueueDisplayHierarchy,
                MacroscopeConstants.NamedQueueDisplayCanonicalAnalysis,
                MacroscopeConstants.NamedQueueDisplayHrefLang,
                MacroscopeConstants.NamedQueueDisplayErrors,
                MacroscopeConstants.NamedQueueDisplayHostnames,
                MacroscopeConstants.NamedQueueDisplayRedirectsAudit,
                MacroscopeConstants.NamedQueueDisplayLinks,
                MacroscopeConstants.NamedQueueDisplayHyperlinks,
                MacroscopeConstants.NamedQueueDisplayUriAnalysis,
                MacroscopeConstants.NamedQueueDisplayPageTitles,
                MacroscopeConstants.NamedQueueDisplayPageDescriptions,
                MacroscopeConstants.NamedQueueDisplayPageKeywords,
                MacroscopeConstants.NamedQueueDisplayPageHeadings,
                MacroscopeConstants.NamedQueueDisplayPageText,
                MacroscopeConstants.NamedQueueDisplayStylesheets,
                MacroscopeConstants.NamedQueueDisplayImages,
                MacroscopeConstants.NamedQueueDisplayJavascripts,
                MacroscopeConstants.NamedQueueDisplayAudios,
                MacroscopeConstants.NamedQueueDisplayVideos,
                MacroscopeConstants.NamedQueueDisplaySitemaps,
                MacroscopeConstants.NamedQueueDisplayEmailAddresses,
                MacroscopeConstants.NamedQueueDisplayTelephoneNumbers,
                MacroscopeConstants.NamedQueueDisplayCustomFilters,

                MacroscopeConstants.NamedQueueDisplayDataExtractorsCssSelectors,
                MacroscopeConstants.NamedQueueDisplayDataExtractorsRegexes,
                MacroscopeConstants.NamedQueueDisplayDataExtractorsXpaths,

                MacroscopeConstants.NamedQueueDisplayRemarks
            };

            foreach (var queueName in displayQueueNames)
            {
                NamedQueue.CreateNamedQueue(Name: queueName);
            }

            /** END: Named Queues ***************************************************/

            CrawlDelay = 0;

            // AdjustThreadsMax() runs before ThreadsMax is read for the semaphore below.
            AdjustThreadsMax();
            ThreadsRunning = 0;
            ThreadsStop = false;
            ThreadsDict = new Dictionary<int, bool>();

            // The semaphore is created with zero free slots and then released up to
            // ThreadsMax, leaving every slot available.
            SemaphoreWorkers = new Semaphore(0, ThreadsMax);
            SemaphoreWorkers.Release(ThreadsMax);

            Depth = MacroscopePreferencesManager.GetDepth();
            PageLimit = MacroscopePreferencesManager.GetPageLimit();
            PageLimitCount = 0;

            PagesFound = 0;

            ParentStartingDirectory = "";
            ChildStartingDirectory = "";

            JobHistory = new MacroscopeJobHistory();

            InitProgress();

            // Locale map pre-sized to 32 entries.
            Locales = new Dictionary<string, string>(32);

            Robots = new MacroscopeRobots();
            BlockedByRobots = new Dictionary<string, bool>();
        }