示例#1
0
        /// <summary>
        ///     Validates both arguments, forwards them to ProcessFilesInternal and then raises OnFileProcessed.
        /// </summary>
        /// <param name = "sourceFileInfo">Passed through to ProcessFilesInternal; must not be null.</param>
        /// <param name = "destinationFileInfo">Passed through to ProcessFilesInternal; must not be null.</param>
        /// <exception cref = "ArgumentNullException">Either argument is null.</exception>
        public void ProcessFiles(FileInfo sourceFileInfo, FileInfo destinationFileInfo)
        {
            //reject null arguments up front, source first, before any work is done...
            if (sourceFileInfo is null)
            {
                throw new ArgumentNullException(nameof(sourceFileInfo));
            }

            if (destinationFileInfo is null)
            {
                throw new ArgumentNullException(nameof(destinationFileInfo));
            }

            ProcessFilesInternal(sourceFileInfo, destinationFileInfo);

            //read the handler once so the null check and the call see the same delegate...
            var fileProcessedHandler = OnFileProcessed;

            if (fileProcessedHandler != null)
            {
                fileProcessedHandler.Invoke(this, EventArgs.Empty);
            }
        }
        /// <summary>
        ///     Process a range of FileIDs after crawling.  Useful if crawled Files were not processed at crawl time according to the desired ApplicationSettings configuration.
        ///     Calling this method DOES change the 'LastDiscovered' fields where applicable.
        ///     This method is not used when crawling, but rather during post-processing.
        /// </summary>
        /// <remarks>
        ///     A FileID whose row has no Source and whose downloaded file is not on disk is reported as NOT processed and skipped.
        ///     Any exception raised for a single FileID is reported, stored through InsertException, and the loop moves on to the next FileID.
        /// </remarks>
        /// <param name = "crawler">Supplies the ApplicationSettings and WebSettings used to construct every manager; must not be null.</param>
        /// <param name = "fileIDLowerBound">The first FileID to process (inclusive).</param>
        /// <param name = "fileIDUpperBound">The last FileID to process (inclusive).  Nothing is processed when it is below <paramref name = "fileIDLowerBound" />.</param>
        /// <exception cref = "ArgumentNullException"><paramref name = "crawler" /> is null.</exception>
        public static void ProcessFiles(Crawler <TArachnodeDAO> crawler, long fileIDLowerBound, long fileIDUpperBound)
        {
            if (crawler == null)
            {
                throw new ArgumentNullException(nameof(crawler));
            }

            //do not assign the application settings.  doing so will override the ApplicationSetting you set before calling this method...
            TArachnodeDAO arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), crawler.ApplicationSettings.ConnectionString, crawler.ApplicationSettings, crawler.WebSettings, false, false);

            ConsoleManager <TArachnodeDAO> consoleManager = new ConsoleManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
            ActionManager <TArachnodeDAO>  actionManager  = new ActionManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
            CookieManager cookieManager = new CookieManager();
            MemoryManager <TArachnodeDAO>      memoryManager      = new MemoryManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
            RuleManager <TArachnodeDAO>        ruleManager        = new RuleManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
            CacheManager <TArachnodeDAO>       cacheManager       = new CacheManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
            CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager = new CrawlerPeerManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, null, arachnodeDAO);
            Cache <TArachnodeDAO> cache = new Cache <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, crawler, actionManager, cacheManager, crawlerPeerManager, memoryManager, ruleManager);

            DiscoveryManager <TArachnodeDAO> discoveryManager = new DiscoveryManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, cache, actionManager, cacheManager, memoryManager, ruleManager);

            //load the CrawlActions, CrawlRules and EngineActions...
            ruleManager.ProcessCrawlRules(crawler);
            actionManager.ProcessCrawlActions(crawler);
            actionManager.ProcessEngineActions(crawler);

            try
            {
                //these three methods are called in the Engine.
                UserDefinedFunctions.RefreshAllowedExtensions(true);
                UserDefinedFunctions.RefreshAllowedSchemes(true);
                UserDefinedFunctions.RefreshDisallowed();

                //instantiate a WebClient to access the ResponseHeaders...
                WebClient <TArachnodeDAO> webClient = new WebClient <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager, cookieManager, new ProxyManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager));

                //NOTE(review): the result of this request is discarded; presumably it primes response state on the WebClient that ProcessFile reads - confirm.
                webClient.GetHttpWebResponse("http://google.com", "GET", null, null, null, null);

                FileManager <TArachnodeDAO> fileManager = new FileManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager, arachnodeDAO);

                for (long i = fileIDLowerBound; i <= fileIDUpperBound; i++)
                {
                    ArachnodeDataSet.FilesRow filesRow = null;
                    string failureMessage = "FileID: " + i + " was NOT processed successfully.";

                    try
                    {
                        //get the File from the database.  we need the source data as we don't store this in the index.
                        //even though most of the fields are available in the Document, the File is the authoritative source, so we'll use that for all of the fields.
                        filesRow = arachnodeDAO.GetFile(i.ToString());

                        //no row for this FileID: nothing to process and nothing to report...
                        if (filesRow == null)
                        {
                            continue;
                        }

                        if (filesRow.Source == null || filesRow.Source.Length == 0)
                        {
                            //the row carries no Source; fall back to the downloaded copy on disk.  resolve the path once and reuse it...
                            string discoveryPath = discoveryManager.GetDiscoveryPath(crawler.ApplicationSettings.DownloadedFilesDirectory, filesRow.AbsoluteUri, filesRow.FullTextIndexType);

                            if (!File.Exists(discoveryPath))
                            {
                                Console.WriteLine(failureMessage);
                                OnFileProcessed?.BeginInvoke(filesRow, failureMessage, null, null);

                                //without Source there is nothing to process; skip so this FileID is not also reported as a success...
                                continue;
                            }

                            filesRow.Source = File.ReadAllBytes(discoveryPath);
                        }

                        ProcessFile(crawler.ApplicationSettings, crawler.WebSettings, crawler, filesRow, webClient, cache, actionManager, consoleManager, crawlerPeerManager, discoveryManager, fileManager, memoryManager, ruleManager, arachnodeDAO);

                        string successMessage = "FileID: " + i + " was processed successfully.";

                        Console.WriteLine(successMessage);
                        OnFileProcessed?.BeginInvoke(filesRow, successMessage, null, null);
                    }
                    catch (Exception exception)
                    {
                        //a failure for one FileID must not stop the rest of the range: report it, store it and carry on...
                        Console.WriteLine(failureMessage);
                        Console.WriteLine(exception.Message);

                        OnFileProcessed?.BeginInvoke(filesRow, failureMessage, null, null);
                        OnFileProcessed?.BeginInvoke(filesRow, exception.Message, null, null);

                        arachnodeDAO.InsertException(null, null, exception, false);
                    }
                }
            }
            finally
            {
                //stop the CrawlActions, CrawlRules and EngineActions, even when an exception escapes the loop above...
                ruleManager.Stop();
                actionManager.Stop();
            }
        }