Пример #1
0
        /// <summary>
        /// Makes sure the storage root directory and the website-specific
        /// content directory both exist on disk, and stores the latter in
        /// <c>ContentDirectory</c>.
        /// </summary>
        /// <remarks>
        /// The content directory is a child of <c>ExtractorParams.StorageDir</c>
        /// whose name is computed by <c>HtmlFileUtils.GetWebsitePathFromUri</c>
        /// from the extraction scope and the root url.
        /// </remarks>
        private void ConfigureStorageDirectories()
        {
            // Root directory shared by all extractions
            DirectoryInfo storageRoot = new DirectoryInfo(ExtractorParams.StorageDir);
            if (storageRoot.Exists == false)
            {
                storageRoot.Create();
            }

            // Sub-directory dedicated to the website being extracted
            string siteFolderName = HtmlFileUtils.GetWebsitePathFromUri(ExtractorParams.Scope, ExtractorParams.RootUrl);
            string contentPath    = Path.Combine(storageRoot.FullName, siteFolderName);

            ContentDirectory = new DirectoryInfo(contentPath);
            if (ContentDirectory.Exists == false)
            {
                ContentDirectory.Create();
            }
        }
Пример #2
0
        /// <summary>
        /// Builds the name of the storage sub-directory used for a website,
        /// depending on the extraction scope.
        /// </summary>
        /// <param name="scope">Extraction scope selecting how much of the root uri is used.</param>
        /// <param name="rootUri">Root uri of the extraction.</param>
        /// <returns>
        /// The result of <c>GetPathValidChars</c> applied to:
        /// the base domain (<c>Domain</c> scope), the sub-domain (<c>SubDomain</c> scope),
        /// or, for any other scope, the sub-domain followed by the root path
        /// in which every "/" is replaced by "_".
        /// </returns>
        public static string GetWebsitePathFromUri(ExtractionScope scope, Uri rootUri)
        {
            if (scope == ExtractionScope.Domain)
            {
                return HtmlFileUtils.GetPathValidChars(GetBaseDomain(rootUri));
            }

            if (scope == ExtractionScope.SubDomain)
            {
                return HtmlFileUtils.GetPathValidChars(GetSubDomain(rootUri));
            }

            // ExtractionScope.Path, and fallback for any other value
            string subDomain         = GetSubDomain(rootUri);
            string flattenedRootPath = GetRootPath(rootUri.AbsolutePath).Replace("/", "_");
            return HtmlFileUtils.GetPathValidChars(subDomain + flattenedRootPath);
        }
Пример #3
0
        /// <summary>
        /// Builds the crawl configuration, creates or restores the scheduler,
        /// then instantiates the polite web crawler and wires its callbacks.
        /// </summary>
        /// <remarks>
        /// When <c>DoContinue</c> is set, the scheduler is deserialized from the
        /// checkpoint file located in the logs sub-directory of <c>ContentDirectory</c>;
        /// otherwise a new scheduler is created.
        /// </remarks>
        private void ConfigureWebCrawler()
        {
            CrawlConfiguration config = new CrawlConfiguration
            {
                // --- Crawl behavior and limits ---
                // NOTE(review): the 0 values presumably mean "no limit" in Abot - confirm
                MaxConcurrentThreads                       = Environment.ProcessorCount,
                MaxPagesToCrawl                            = 0,
                MaxPagesToCrawlPerDomain                   = 0,
                MaxPageSizeInBytes                         = 0,
                UserAgentString                            = "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
                HttpProtocolVersion                        = HttpProtocolVersion.NotSpecified,
                CrawlTimeoutSeconds                        = 0,
                IsUriRecrawlingEnabled                     = false,
                IsExternalPageCrawlingEnabled              = false,
                IsExternalPageLinksCrawlingEnabled         = false,
                IsRespectUrlNamedAnchorOrHashbangEnabled   = false,
                DownloadableContentTypes                   = "text/html, text/plain",

                // --- Http requests ---
                HttpServicePointConnectionLimit            = 200,
                HttpRequestTimeoutInSeconds                = 15,
                HttpRequestMaxAutoRedirects                = 7,
                IsHttpRequestAutoRedirectsEnabled          = true,
                IsHttpRequestAutomaticDecompressionEnabled = true,
                IsSendingCookiesEnabled                    = false,
                IsSslCertificateValidationEnabled          = false,

                // --- Memory, depth, links and retries ---
                MinAvailableMemoryRequiredInMb             = 0,
                MaxMemoryUsageInMb                         = 0,
                MaxMemoryUsageCacheTimeInSeconds           = 0,
                MaxCrawlDepth                              = 1000,
                MaxLinksPerPage                            = 1000,
                IsForcedLinkParsingEnabled                 = false,
                MaxRetryCount                              = 0,
                MinRetryDelayInMilliseconds                = 0,

                // --- Politeness : robots.txt, nofollow and crawl delays ---
                IsRespectRobotsDotTextEnabled                = true,
                UrlPatternsToExclude                         = ExtractorParams.UrlPatternsToExclude,
                IsRespectMetaRobotsNoFollowEnabled           = true,
                IsRespectHttpXRobotsTagHeaderNoFollowEnabled = true,
                IsRespectAnchorRelNoFollowEnabled            = true,
                IsIgnoreRobotsDotTextIfRootDisallowedEnabled = false,
                RobotsDotTextUserAgentString                 = "bingbot",
                MinCrawlDelayPerDomainMilliSeconds           = ExtractorParams.MinCrawlDelay,
                MaxRobotsDotTextCrawlDelayInSeconds          = 5,

                // --- Authentication ---
                IsAlwaysLogin         = false,
                LoginUser             = "",
                LoginPassword         = "",
                UseDefaultCredentials = false
            };

            if (DoContinue)
            {
                // Resume a previous crawl : restore the scheduler from the last checkpoint file
                string checkpointPath = Path.Combine(ContentDirectory.FullName, LogsDirName, CheckpointFileName);
                using (FileStream checkpointStream = new FileStream(checkpointPath, FileMode.Open))
                {
                    scheduler = Scheduler.Deserialize(checkpointStream);
                }
            }
            else
            {
                // Fresh crawl : start with a new scheduler
                scheduler = new Scheduler(config.IsUriRecrawlingEnabled, null, null);
            }

            crawler = new PoliteWebCrawler(config, null, null, scheduler, null, null, null, null, null);

            // The extraction scope decides which uris are considered internal to the website
            crawler.IsInternalUri(
                (candidateUri, rootUri) => HtmlFileUtils.ShouldCrawlUri(ExtractorParams.Scope, candidateUri, rootUri));
            crawler.ShouldCrawlPageLinks(WebCrawler_ShouldCrawlPageLinks);
            crawler.PageCrawlCompletedAsync += WebCrawler_PageCrawlCompletedAsync;

            // DEBUG: uncomment to debug Abot crawl progress
            // crawler.PageCrawlStartingAsync += WebCrawler_PageCrawlStartingAsync;

            // DEBUG: uncomment to debug Abot crawling decisions
            // crawler.PageCrawlDisallowedAsync += WebCrawler_PageCrawlDisallowedAsync;
            // crawler.PageLinksCrawlDisallowedAsync += WebCrawler_PageLinksCrawlDisallowedAsync;
        }
Пример #4
0
        // => called each time a page has been crawled by the web crawler
        //
        // For each crawled page :
        // 1. logs and skips pages which were not crawled successfully or have no text content
        // 2. converts the Html document to a NLPTextDocument
        // 3. writes the document to disk if it contains new / unique text
        // 4. evaluates the stopping conditions (user cancel, duration, page count,
        //    % of unique text, size on disk)
        // 5. writes a scheduler checkpoint about once per minute, and on stop
        //
        // Any exception raised while processing the page is reported with WriteError
        // and does not propagate to the crawler.
        private void WebCrawler_PageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
        {
            try
            {
                CrawledPage crawledPage = e.CrawledPage;

                // Exit if the page wasn't crawled successfully.
                // The response is null-checked BEFORE its status code is read :
                // a failed request may have no response at all.
                if (crawledPage.WebException != null ||
                    crawledPage.HttpWebResponse == null ||
                    crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                {
                    LogRequest(crawledPage, 0);

                    if (crawledPage.WebException != null)
                    {
                        Perfs.AddCrawlError();
                    }
                    else if (crawledPage.HttpWebResponse != null)
                    {
                        // 404 responses are not counted as crawl errors
                        int statusCode = (int)crawledPage.HttpWebResponse.StatusCode;
                        if (statusCode != 404 && statusCode >= 400)
                        {
                            Perfs.AddCrawlError();
                        }
                    }
                    return;
                }

                // Exit if the page had no content
                if (string.IsNullOrEmpty(crawledPage.Content.Text))
                {
                    LogRequest(crawledPage, 0);
                    return;
                }

                // Get the page and its Css dependencies parsed by Abot with AngleSharp
                var htmlDocumentUri = crawledPage.HttpWebResponse.ResponseUri;
                var htmlDocument    = crawledPage.AngleSharpHtmlDocument;

                // Visit the Html page syntax tree and convert it to NLPTextDocument
                Stopwatch timer                  = Stopwatch.StartNew();
                var       htmlConverter          = new HtmlDocumentConverter(htmlDocumentUri.AbsoluteUri, htmlDocument);
                var       normalizedTextDocument = htmlConverter.ConvertToNLPTextDocument();
                timer.Stop();

                // Check the percentage of text blocks which are new & unique in this page
                var percentUnique = Perfs.SetPercentUniqueForLastDoc(normalizedTextDocument);

                // Log the request results
                LogRequest(crawledPage, percentUnique);

                // Write the NLPTextDocument as a text file on disk
                if (percentUnique > 0)
                {
                    var fileInfo = HtmlFileUtils.GetFilePathFromUri(ContentDirectory, htmlDocumentUri);
                    if (!fileInfo.Directory.Exists)
                    {
                        fileInfo.Directory.Create();
                    }
                    NLPTextDocumentWriter.WriteToFile(normalizedTextDocument, fileInfo.FullName);

                    // FileInfo caches the file system state : refresh it so that Length
                    // reflects the file which has just been written
                    fileInfo.Refresh();
                    Perfs.AddTextConversion(timer.ElapsedMilliseconds, fileInfo.Length);
                }

                // Test stopping conditions
                bool   stopCrawl   = false;
                string stopMessage = null;
                if (userCancelEventReceived)
                {
                    stopCrawl   = true;
                    stopMessage = "Extraction interrupted by the user";
                }
                // TotalMinutes (whole elapsed duration) and not Minutes (0-59 component only) :
                // otherwise a MaxDuration of 60 minutes or more would never be reached
                else if (ExtractorParams.MaxDuration > 0 && TimeSpan.FromMilliseconds(Perfs.ElapsedTime).TotalMinutes >= ExtractorParams.MaxDuration)
                {
                    stopCrawl   = true;
                    stopMessage = "Extraction stopped because the extraction duration exceeded " + ExtractorParams.MaxDuration + " minutes";
                }
                else if (ExtractorParams.MaxPageCount > 0 && Perfs.HtmlPagesCount >= ExtractorParams.MaxPageCount)
                {
                    stopCrawl   = true;
                    stopMessage = "Extraction stopped because the number of extracted pages exceeded " + ExtractorParams.MaxPageCount;
                }
                else if (ExtractorParams.MinUniqueText > 0 && Perfs.PercentUniqueForLastDocs < (ExtractorParams.MinUniqueText / 100.0))
                {
                    stopCrawl   = true;
                    stopMessage = "Extraction stopped because the % of new textblocks fell below " + ExtractorParams.MinUniqueText + "%";
                }
                else if (ExtractorParams.MaxSizeOnDisk > 0 && Perfs.TotalSizeOnDisk >= (ExtractorParams.MaxSizeOnDisk * 1024L * 1024L))
                {
                    stopCrawl   = true;
                    stopMessage = "Extraction stopped because the files size on disk exceeded " + ExtractorParams.MaxSizeOnDisk + " MB";
                }

                // Write current status to screen (and to file if stopping)
                if (!stopCrawl)
                {
                    Perfs.WriteStatus(Console.Out);
                }
                else
                {
                    DisplayMessages(Perfs.WriteStatus);
                }

                // Write one checkpoint every one minute
                // to enable the "continue" crawl feature
                // NOTE(review): locking on a string instance is fragile (a dedicated private
                // lock object is preferable) ; kept as is because other code outside of this
                // method may synchronize on the same object - confirm before changing
                lock (CheckpointFileName)
                {
                    // TotalMinutes : the Minutes component wraps back to 0 after each full hour
                    if (stopCrawl || DateTime.Now.Subtract(lastCheckpointTime).TotalMinutes >= 1)
                    {
                        lastCheckpointTime = DateTime.Now;
                        using (FileStream fs = new FileStream(Path.Combine(ContentDirectory.FullName, LogsDirName, CheckpointFileName), FileMode.Create))
                        {
                            scheduler.Serialize(fs);
                        }
                    }

                    if (stopCrawl)
                    {
                        DisplayMessages(WriteEndMessage, stopMessage);
                        Environment.Exit(0);
                    }
                }
            }
            catch (Exception ex)
            {
                // Safeguard to make sure that an error
                // during the processing of a single page
                // can't stop the whole crawl process.
                // The url is resolved defensively : the response may be null here,
                // and the error handler itself must never throw.
                WriteError("Error while processing the page : " + GetPageUrlForErrorMessage(e), ex);
            }
        }

        // Returns the best available url for a crawled page, to be used in error messages :
        // the response uri when a response is available, else the requested uri,
        // else a placeholder. Never dereferences a null reference.
        private static string GetPageUrlForErrorMessage(PageCrawlCompletedArgs e)
        {
            const string unknownUrl = "(unknown url)";

            CrawledPage page = (e != null) ? e.CrawledPage : null;
            if (page == null)
            {
                return unknownUrl;
            }
            if (page.HttpWebResponse != null && page.HttpWebResponse.ResponseUri != null)
            {
                return page.HttpWebResponse.ResponseUri.AbsoluteUri;
            }
            if (page.Uri != null)
            {
                return page.Uri.AbsoluteUri;
            }
            return unknownUrl;
        }