// Makes sure the root storage directory exists, then creates (if needed) the
// website-specific content directory below it and stores it in ContentDirectory.
private void ConfigureStorageDirectories()
{
    // Root directory shared by all extractions
    DirectoryInfo rootDirectory = new DirectoryInfo(ExtractorParams.StorageDir);
    if (!rootDirectory.Exists)
    {
        rootDirectory.Create();
    }

    // Sub-directory whose name is derived from the extraction scope and the root url
    string siteFolderName = HtmlFileUtils.GetWebsitePathFromUri(ExtractorParams.Scope, ExtractorParams.RootUrl);
    string contentPath = Path.Combine(rootDirectory.FullName, siteFolderName);

    ContentDirectory = new DirectoryInfo(contentPath);
    if (!ContentDirectory.Exists)
    {
        ContentDirectory.Create();
    }
}
// Computes the directory name used to store the pages of a website,
// depending on the extraction scope:
// - Domain    : based on the result of GetBaseDomain(rootUri)
// - SubDomain : based on the result of GetSubDomain(rootUri)
// - otherwise (ExtractionScope.Path) : sub domain followed by the root path,
//   where each "/" is replaced with "_"
// In all cases the name is passed through HtmlFileUtils.GetPathValidChars.
public static string GetWebsitePathFromUri(ExtractionScope scope, Uri rootUri)
{
    switch (scope)
    {
        case ExtractionScope.Domain:
            return HtmlFileUtils.GetPathValidChars(GetBaseDomain(rootUri));

        case ExtractionScope.SubDomain:
            return HtmlFileUtils.GetPathValidChars(GetSubDomain(rootUri));

        // case ExtractionScope.Path:
        default:
            var subDomain = GetSubDomain(rootUri);
            var flattenedRootPath = GetRootPath(rootUri.AbsolutePath).Replace("/", "_");
            return HtmlFileUtils.GetPathValidChars(subDomain + flattenedRootPath);
    }
}
// Builds the crawl configuration, creates a new scheduler (or restores it from
// the checkpoint file when DoContinue is set), then instantiates the crawler
// and registers the crawl decision callbacks and the page completed handler.
private void ConfigureWebCrawler()
{
    var config = new CrawlConfiguration
    {
        // General crawl limits
        MaxConcurrentThreads = Environment.ProcessorCount,
        MaxPagesToCrawl = 0,
        MaxPagesToCrawlPerDomain = 0,
        MaxPageSizeInBytes = 0,
        UserAgentString = "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
        HttpProtocolVersion = HttpProtocolVersion.NotSpecified,
        CrawlTimeoutSeconds = 0,
        IsUriRecrawlingEnabled = false,
        IsExternalPageCrawlingEnabled = false,
        IsExternalPageLinksCrawlingEnabled = false,
        IsRespectUrlNamedAnchorOrHashbangEnabled = false,
        DownloadableContentTypes = "text/html, text/plain",

        // Http requests
        HttpServicePointConnectionLimit = 200,
        HttpRequestTimeoutInSeconds = 15,
        HttpRequestMaxAutoRedirects = 7,
        IsHttpRequestAutoRedirectsEnabled = true,
        IsHttpRequestAutomaticDecompressionEnabled = true,
        IsSendingCookiesEnabled = false,
        IsSslCertificateValidationEnabled = false,

        // Memory, depth, links and retries
        MinAvailableMemoryRequiredInMb = 0,
        MaxMemoryUsageInMb = 0,
        MaxMemoryUsageCacheTimeInSeconds = 0,
        MaxCrawlDepth = 1000,
        MaxLinksPerPage = 1000,
        IsForcedLinkParsingEnabled = false,
        MaxRetryCount = 0,
        MinRetryDelayInMilliseconds = 0,

        // Politeness : robots.txt, nofollow directives and crawl delays
        IsRespectRobotsDotTextEnabled = true,
        UrlPatternsToExclude = ExtractorParams.UrlPatternsToExclude,
        IsRespectMetaRobotsNoFollowEnabled = true,
        IsRespectHttpXRobotsTagHeaderNoFollowEnabled = true,
        IsRespectAnchorRelNoFollowEnabled = true,
        IsIgnoreRobotsDotTextIfRootDisallowedEnabled = false,
        RobotsDotTextUserAgentString = "bingbot",
        MinCrawlDelayPerDomainMilliSeconds = ExtractorParams.MinCrawlDelay,
        MaxRobotsDotTextCrawlDelayInSeconds = 5,

        // Authentication
        IsAlwaysLogin = false,
        LoginUser = "",
        LoginPassword = "",
        UseDefaultCredentials = false
    };

    if (DoContinue)
    {
        // Continue a previous crawl : restore the scheduler from the checkpoint file
        string checkpointPath = Path.Combine(ContentDirectory.FullName, LogsDirName, CheckpointFileName);
        using (var checkpointStream = new FileStream(checkpointPath, FileMode.Open))
        {
            scheduler = Scheduler.Deserialize(checkpointStream);
        }
    }
    else
    {
        // New crawl : start with a fresh scheduler
        scheduler = new Scheduler(config.IsUriRecrawlingEnabled, null, null);
    }

    crawler = new PoliteWebCrawler(config, null, null, scheduler, null, null, null, null, null);

    // Crawl decisions : which uris are internal, and which pages should have their links crawled
    crawler.IsInternalUri((candidateUri, rootUri) => HtmlFileUtils.ShouldCrawlUri(ExtractorParams.Scope, candidateUri, rootUri));
    crawler.ShouldCrawlPageLinks(WebCrawler_ShouldCrawlPageLinks);

    // Page processing
    crawler.PageCrawlCompletedAsync += WebCrawler_PageCrawlCompletedAsync;

    // DEBUG: uncomment to debug Abot crawl progress
    // crawler.PageCrawlStartingAsync += WebCrawler_PageCrawlStartingAsync;

    // DEBUG: uncomment to debug Abot crawling decisions
    // crawler.PageCrawlDisallowedAsync += WebCrawler_PageCrawlDisallowedAsync;
    // crawler.PageLinksCrawlDisallowedAsync += WebCrawler_PageLinksCrawlDisallowedAsync;
}
// => called each time a page has been crawled by the web crawler.
// For a successfully crawled page with content :
// - converts the Html document to a NLPTextDocument
// - writes it to disk if it contains new & unique text blocks
// - tests the stopping conditions (user cancel, max duration, max page count,
//   min % of unique text, max size on disk)
// - writes a checkpoint of the scheduler (at most once per minute, and always
//   when stopping) to enable the "continue" crawl feature
// When a stopping condition is met, the process is terminated with Environment.Exit(0).
// Any exception raised while processing the page is logged with WriteError and swallowed.
private void WebCrawler_PageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
{
    try
    {
        CrawledPage crawledPage = e.CrawledPage;

        // Exit if the page wasn't crawled successfully.
        // The response is null-checked before reading its status code, so that
        // a page without exception and without response is handled as a failed
        // request instead of raising a NullReferenceException.
        if (crawledPage.WebException != null ||
            crawledPage.HttpWebResponse == null ||
            crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        {
            LogRequest(crawledPage, 0);

            if (crawledPage.WebException != null)
            {
                Perfs.AddCrawlError();
            }
            else if (crawledPage.HttpWebResponse != null)
            {
                // Status codes >= 400 are counted as crawl errors, except 404
                int statusCode = (int)crawledPage.HttpWebResponse.StatusCode;
                if (statusCode != 404 && statusCode >= 400)
                {
                    Perfs.AddCrawlError();
                }
            }
            return;
        }

        // Exit if the page had no content
        if (string.IsNullOrEmpty(crawledPage.Content.Text))
        {
            LogRequest(crawledPage, 0);
            return;
        }

        // Get the page and its Css dependencies parsed by Abot with Anglesharp
        var htmlDocumentUri = crawledPage.HttpWebResponse.ResponseUri;
        var htmlDocument = crawledPage.AngleSharpHtmlDocument;

        // Visit the Html page syntax tree and convert it to NLPTextDocument
        Stopwatch timer = Stopwatch.StartNew();
        var htmlConverter = new HtmlDocumentConverter(htmlDocumentUri.AbsoluteUri, htmlDocument);
        var normalizedTextDocument = htmlConverter.ConvertToNLPTextDocument();
        timer.Stop();

        // Check the percentage of text blocks which are new & unique in this page
        var percentUnique = Perfs.SetPercentUniqueForLastDoc(normalizedTextDocument);

        // Log the request results
        LogRequest(crawledPage, percentUnique);

        // Write the NLPTextDocument as a text file on disk
        if (percentUnique > 0)
        {
            var fileInfo = HtmlFileUtils.GetFilePathFromUri(ContentDirectory, htmlDocumentUri);
            if (!fileInfo.Directory.Exists)
            {
                fileInfo.Directory.Create();
            }
            NLPTextDocumentWriter.WriteToFile(normalizedTextDocument, fileInfo.FullName);

            // FileInfo caches the file state : refresh it to read the length
            // of the file which was just written
            fileInfo.Refresh();
            Perfs.AddTextConversion(timer.ElapsedMilliseconds, fileInfo.Length);
        }

        // Test stopping conditions
        bool stopCrawl = false;
        string stopMessage = null;
        if (userCancelEventReceived)
        {
            stopCrawl = true;
            stopMessage = "Extraction interrupted by the user";
        }
        // TotalMinutes (and not Minutes, which is only the 0-59 minutes component
        // of the time span) is required to support durations of one hour or more
        else if (ExtractorParams.MaxDuration > 0 && TimeSpan.FromMilliseconds(Perfs.ElapsedTime).TotalMinutes >= ExtractorParams.MaxDuration)
        {
            stopCrawl = true;
            stopMessage = "Extraction stopped because the extraction duration exceeded " + ExtractorParams.MaxDuration + " minutes";
        }
        else if (ExtractorParams.MaxPageCount > 0 && Perfs.HtmlPagesCount >= ExtractorParams.MaxPageCount)
        {
            stopCrawl = true;
            stopMessage = "Extraction stopped because the number of extracted pages exceeded " + ExtractorParams.MaxPageCount;
        }
        else if (ExtractorParams.MinUniqueText > 0 && Perfs.PercentUniqueForLastDocs < (ExtractorParams.MinUniqueText / 100.0))
        {
            stopCrawl = true;
            stopMessage = "Extraction stopped because the % of new textblocks fell below " + ExtractorParams.MinUniqueText + "%";
        }
        else if (ExtractorParams.MaxSizeOnDisk > 0 && Perfs.TotalSizeOnDisk >= (ExtractorParams.MaxSizeOnDisk * 1024L * 1024L))
        {
            stopCrawl = true;
            stopMessage = "Extraction stopped because the files size on disk exceeded " + ExtractorParams.MaxSizeOnDisk + " MB";
        }

        // Write current status to screen (and to file if stopping)
        if (!stopCrawl)
        {
            Perfs.WriteStatus(Console.Out);
        }
        else
        {
            DisplayMessages(Perfs.WriteStatus);
        }

        // Write one checkpoint every one minute
        // to enable the "continue" crawl feature
        // NOTE(review): locking on a string instance is discouraged; a dedicated
        // private lock object declared next to CheckpointFileName would be safer.
        lock (CheckpointFileName)
        {
            // TotalMinutes : the Minutes component wraps around to 0 every hour
            if (stopCrawl || DateTime.Now.Subtract(lastCheckpointTime).TotalMinutes >= 1)
            {
                lastCheckpointTime = DateTime.Now;
                using (FileStream fs = new FileStream(Path.Combine(ContentDirectory.FullName, LogsDirName, CheckpointFileName), FileMode.Create))
                {
                    scheduler.Serialize(fs);
                }
            }
            if (stopCrawl)
            {
                DisplayMessages(WriteEndMessage, stopMessage);
                Environment.Exit(0);
            }
        }
    }
    catch (Exception ex)
    {
        // Safeguard to make sure that an error
        // during the processing of a single page
        // can't stop the whole crawl process.
        // The page url is resolved defensively : the handler itself must never
        // throw, even if the page has no Http response.
        string pageUrl = "(unknown url)";
        if (e != null && e.CrawledPage != null)
        {
            if (e.CrawledPage.HttpWebResponse != null && e.CrawledPage.HttpWebResponse.ResponseUri != null)
            {
                pageUrl = e.CrawledPage.HttpWebResponse.ResponseUri.AbsoluteUri;
            }
            else if (e.CrawledPage.Uri != null)
            {
                // Fall back to the requested uri when no response is available
                pageUrl = e.CrawledPage.Uri.AbsoluteUri;
            }
        }
        WriteError("Error while processing the page : " + pageUrl, ex);
    }
}