コード例 #1
0
        /// <summary>
        ///     Routes a crawl request to the handler matching its <c>DataType.DiscoveryType</c> and then
        ///     removes the request's discovery from the crawl's <c>UnassignedDiscoveries</c>.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request to dispatch.</param>
        /// <param name = "fileManager">The file manager, forwarded to <c>ProcessFile</c>.</param>
        /// <param name = "imageManager">The image manager, forwarded to <c>ProcessImage</c>.</param>
        /// <param name = "webPageManager">The web page manager, forwarded to <c>ProcessWebPage</c>.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO, forwarded to the handlers and used to record disallowed absolute URIs.</param>
        public override void ProcessCrawlRequest(CrawlRequest <TArachnodeDAO> crawlRequest, FileManager <TArachnodeDAO> fileManager, ImageManager <TArachnodeDAO> imageManager, WebPageManager <TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
        {
            switch (crawlRequest.DataType.DiscoveryType)
            {
                case DiscoveryType.File:
                    ProcessFile(crawlRequest, fileManager, arachnodeDAO);
                    break;

                case DiscoveryType.Image:
                    ProcessImage(crawlRequest, imageManager, arachnodeDAO);
                    break;

                case DiscoveryType.WebPage:
                    /* Markup such as <img src="http://msn.com"></a> yields a WebPage response.  If the Crawl is
                     * restricted to fark.com and a fark.com page lists that Image, processing the response as a
                     * WebPage would create an incorrect record in the WebPages table, so such requests are skipped.
                     * A CurrentDepth of '0' means a request was made for a File or an Image.
                     *
                     * In non-DEMO builds an exception was once considered here instead of silently skipping:
                     * "A CrawlRequest was created for a File or an Image, but the HttpWebResponse returned a WebPage.
                     * This is typically indicative of invalid HTML."
                     */
                    //ANODET: Improve parsing... check the regular expressions...
                    bool skipWebPage = crawlRequest.Discovery.ExpectFileOrImage
                                       || _discoveryManager.IsCrawlRestricted(crawlRequest, crawlRequest.Parent.Uri.AbsoluteUri);

                    if (!skipWebPage)
                    {
                        ProcessWebPage(crawlRequest, webPageManager, arachnodeDAO);
                    }

                    break;

#if !DEMO
                case DiscoveryType.None:
                    // No DataType is assigned for this content type: optionally record the AbsoluteUri as disallowed.
                    if (ApplicationSettings.InsertDisallowedAbsoluteUris)
                    {
                        arachnodeDAO.InsertDisallowedAbsoluteUri(
                            crawlRequest.DataType.ContentTypeID,
                            (int)crawlRequest.DataType.DiscoveryType,
                            crawlRequest.Parent.Uri.AbsoluteUri,
                            crawlRequest.Discovery.Uri.AbsoluteUri,
                            "Disallowed by unassigned DataType.  (" + crawlRequest.DataType.ContentType + ")",
                            ApplicationSettings.ClassifyAbsoluteUris);
                    }

                    break;
#endif
            }

            // Single exit point: every path, including a skipped WebPage, releases the crawl's reference to this discovery.
            crawlRequest.Crawl.UnassignedDiscoveries.Remove(crawlRequest.Discovery.Uri.AbsoluteUri);
        }
コード例 #2
0
        /// <summary>
        ///     Handles a discovered web page: reports it on the console, increments the
        ///     web-pages-discovered counter by one, passes the request to the web page manager, and then
        ///     either queues the request on a discovery processor or processes its discoveries inline.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request whose response is a web page.</param>
        /// <param name = "webPageManager">The web page manager that manages the web page.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO, used when discoveries are processed inline.</param>
        protected override void ProcessWebPage(CrawlRequest <TArachnodeDAO> crawlRequest, WebPageManager <TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
        {
            _consoleManager.OutputWebPageDiscovered(crawlRequest.Crawl.CrawlInfo.ThreadNumber, crawlRequest);
            Counters.GetInstance().WebPagesDiscovered(1);

            webPageManager.ManageWebPage(crawlRequest);

            // Process inline when asynchronous processing is disabled, when this crawl is already
            // processing discoveries asynchronously, or when there is no Crawler
            // (the Crawler may(/will) be null if PostProcessing...).
            if (!ApplicationSettings.ProcessDiscoveriesAsynchronously || crawlRequest.Crawl.IsProcessingDiscoveriesAsynchronously || crawlRequest.Crawl.Crawler == null)
            {
                ProcessDiscoveries(crawlRequest, arachnodeDAO);

                return;
            }

            // Otherwise queue the request on the discovery processor indexed by this crawl's thread number.
            crawlRequest.Crawl.Crawler.Engine.DiscoveryProcessors[crawlRequest.Crawl.CrawlInfo.ThreadNumber].AddCrawlRequestToBeProcessed(crawlRequest);
        }

        /// <summary>
        ///     Processes the discoveries of a crawl request in a fixed order:
        ///     email addresses, then hyperlinks, then files and images.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request whose discoveries are processed.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO, forwarded to each processing step.</param>
        public override void ProcessDiscoveries(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            // The three steps run sequentially in the order below.

            // 1. Email Addresses
            ProcessEmailAddresses(crawlRequest, arachnodeDAO);

            // 2. HyperLinks
            ProcessHyperLinks(crawlRequest, arachnodeDAO);

            // 3. Files and Images
            ProcessFilesAndImages(crawlRequest, arachnodeDAO);
        }