/// <summary>
/// Appends an <c>HtmlDocumentProcessorPipelineStep</c> to the pipeline of the given configuration.
/// </summary>
/// <param name="crawlerConfiguration">Configuration the step is added to.</param>
/// <param name="maxDegreeOfParallelism">
/// Value handed to the step's constructor; when <c>null</c>, <see cref="Environment.ProcessorCount"/> is used.
/// </param>
/// <returns>The same <paramref name="crawlerConfiguration"/> instance, so calls can be chained.</returns>
public static CrawlerConfiguration HtmlProcessor(this CrawlerConfiguration crawlerConfiguration, int? maxDegreeOfParallelism = null)
{
    // Fall back to the machine's processor count when the caller gave no explicit limit.
    int parallelism = maxDegreeOfParallelism ?? Environment.ProcessorCount;
    var htmlStep = new HtmlDocumentProcessorPipelineStep(parallelism);

    crawlerConfiguration.AddPipelineStep(htmlStep);
    return crawlerConfiguration;
}
/// <summary>
/// Appends a <c>GoogleLanguageDetection</c> step to the pipeline of the given configuration.
/// </summary>
/// <param name="crawlerConfiguration">Configuration the step is added to.</param>
/// <param name="maxDegreeOfParallelism">
/// Value handed to the step's constructor; when <c>null</c>, <see cref="Environment.ProcessorCount"/> is used.
/// </param>
/// <returns>The same <paramref name="crawlerConfiguration"/> instance, so calls can be chained.</returns>
public static CrawlerConfiguration DetectLanguage(this CrawlerConfiguration crawlerConfiguration, int? maxDegreeOfParallelism = null)
{
    // Fall back to the machine's processor count when the caller gave no explicit limit.
    int parallelism = maxDegreeOfParallelism ?? Environment.ProcessorCount;
    var languageStep = new GoogleLanguageDetection(parallelism);

    crawlerConfiguration.AddPipelineStep(languageStep);
    return crawlerConfiguration;
}
/// <summary>
/// Appends a <c>SitemapProcessor</c> step to the pipeline of the given configuration.
/// </summary>
/// <remarks>
/// NOTE(review): the method name refers to PDF text extraction, yet the step registered here is a
/// <c>SitemapProcessor</c>. This may be a copy-paste slip; confirm which step is actually intended.
/// </remarks>
/// <param name="crawlerConfiguration">Configuration the step is added to.</param>
/// <param name="maxDegreeOfParallelism">
/// Value handed to the step's constructor; when <c>null</c>, <see cref="Environment.ProcessorCount"/> is used.
/// </param>
/// <returns>The same <paramref name="crawlerConfiguration"/> instance, so calls can be chained.</returns>
public static CrawlerConfiguration PdfTextExtractProcessor(this CrawlerConfiguration crawlerConfiguration, int? maxDegreeOfParallelism = null)
{
    // Fall back to the machine's processor count when the caller gave no explicit limit.
    int parallelism = maxDegreeOfParallelism ?? Environment.ProcessorCount;

    crawlerConfiguration.AddPipelineStep(new SitemapProcessor(parallelism));
    return crawlerConfiguration;
}
/// <summary>
/// Appends a <c>ToxyTextExtractorProcessorPipelineStep</c> to the pipeline of the given configuration.
/// </summary>
/// <param name="crawlerConfiguration">Configuration the step is added to.</param>
/// <param name="maxDegreeOfParallelism">
/// Value handed to the step's constructor; when <c>null</c>, <see cref="Environment.ProcessorCount"/> is used.
/// </param>
/// <returns>The same <paramref name="crawlerConfiguration"/> instance, so calls can be chained.</returns>
public static CrawlerConfiguration TextExtractProcessor(this CrawlerConfiguration crawlerConfiguration, int? maxDegreeOfParallelism = null)
{
    // Fall back to the machine's processor count when the caller gave no explicit limit.
    int parallelism = maxDegreeOfParallelism ?? Environment.ProcessorCount;

    crawlerConfiguration.AddPipelineStep(new ToxyTextExtractorProcessorPipelineStep(parallelism));
    return crawlerConfiguration;
}
/// <summary>
/// Appends a <c>RobotsPipelineStep</c> to the pipeline of the given configuration.
/// </summary>
/// <param name="crawlerConfiguration">
/// Configuration the step is added to; its <c>Logger</c> is passed on to the new step.
/// </param>
/// <param name="searchPath">
/// Forwarded unchanged to the <c>RobotsPipelineStep</c> constructor (may be <c>null</c>).
/// NOTE(review): presumably where robots rules are looked up — confirm against the step's implementation.
/// </param>
/// <returns>The same <paramref name="crawlerConfiguration"/> instance, so calls can be chained.</returns>
public static CrawlerConfiguration Robots(this CrawlerConfiguration crawlerConfiguration, string searchPath = null)
{
    var robotsStep = new RobotsPipelineStep(searchPath, crawlerConfiguration.Logger);

    crawlerConfiguration.AddPipelineStep(robotsStep);
    return crawlerConfiguration;
}