public static void Run()
{
    NCrawlerModule.Setup();
    Console.Out.WriteLine("\nSimple indexer demo");

    // Text-extraction filter: only markup between "<body" and "</body>" is
    // considered when extracting text. Custom markers such as
    // <!--BeginTextFiler--> / <!--EndTextFiler--> work just as well; typically
    // this is used to strip header and menu text from every page.
    var textExtractionFilter = new Dictionary<string, string> { { "<body", "</body>" } };

    // Link-following filter: links found between "<head" and "</head>" are NOT
    // followed. Custom markers such as <!--BeginNoFollow--> / <!--EndNoFollow-->
    // can be used the same way to control which links the crawler skips.
    var linkExclusionFilter = new Dictionary<string, string> { { "<head", "</head>" } };

    // Pipeline:
    // * Step 1 - HtmlDocumentProcessor: parses html, applies the filters above,
    //            extracts links and text
    // * Step 2 - IndexerDemo: custom step that forwards filtered content to an
    //            index or database
    using (var crawler = new Crawler(
        new Uri("http://ncrawler.codeplex.com"),
        new HtmlDocumentProcessor(textExtractionFilter, linkExclusionFilter),
        new IndexerDemo())
    {
        MaximumThreadCount = 2
    })
    {
        // Begin crawl
        crawler.Crawl();
    }
}
public static void Run()
{
    NCrawlerModule.Setup();

    // Swap in CustomCrawlerRules (defined below) as the ICrawlerRules
    // implementation resolved by the container.
    NCrawlerModule.Register(builder => builder
        .Register((container, parameters) =>
        {
            NCrawlerModule.Setup(); // Return to standard setup
            return new CustomCrawlerRules(
                parameters.TypedAs<Crawler>(),
                container.Resolve<IRobot>(parameters),
                parameters.TypedAs<Uri>(),
                parameters.TypedAs<ICrawlerHistory>());
        })
        .As<ICrawlerRules>()
        .InstancePerDependency());

    Console.Out.WriteLine("Advanced crawl demo");

    using (var crawler = new Crawler(
        new Uri("http://ncrawler.codeplex.com"),
        new HtmlDocumentProcessor(), // Process html
        new DumperStep())
    {
        MaximumThreadCount = 2,
        MaximumCrawlDepth = 2,
        ExcludeFilter = Program.ExtensionsToSkip,
    })
    {
        // Begin crawl
        crawler.Crawl();
    }
}
public static void Run()
{
    NCrawlerModule.Setup();
    Console.Out.WriteLine("Simple crawl demo");

    // Crawl http://ncrawler.codeplex.com with 10 threads and maximum depth 10,
    // using a 5-step pipeline:
    // * Step 1 - HtmlDocumentProcessor: parses html, extracts links, text and more
    // * Step 2 - iTextSharpPdfProcessor: extracts text from PDF files
    // * Step 3 - GoogleLanguageDetection: guesses page language from extracted text
    // * Step 4 - Mp3FileProcessor: handles mp3 files
    // * Step 5 - DumperStep: custom step that visualizes the crawl on the console
    using (var crawler = new Crawler(
        new Uri("http://ncrawler.codeplex.com"),
        new HtmlDocumentProcessor(),
        new iTextSharpPdfProcessor.iTextSharpPdfProcessor(),
        new GoogleLanguageDetection(),
        new Mp3FileProcessor(),
        new DumperStep())
    {
        MaximumThreadCount = 10,
        MaximumCrawlDepth = 10,
        ExcludeFilter = Program.ExtensionsToSkip,
    })
    {
        // Begin crawl
        crawler.Crawl();
    }
}
public static void SetupCustomCrawlerModule()
{
    // Install the custom crawler module as the active NCrawler module set.
    NCrawlerModule.Setup(new Module[] { new CustomNCrawlerModule() });
}
public static void Run()
{
    NCrawlerModule.Setup();
    Console.Out.WriteLine("Advanced crawl demo");

    // Crawl via a CustomCrawler with 10 threads and maximum depth 2:
    // * Step 1 - HtmlDocumentProcessor: parses html, extracts links and text
    // * Step 2 - DumperStep: custom step that visualizes the crawl
    using (Crawler crawler = new CustomCrawler(
        new Uri("http://ncrawler.codeplex.com"),
        new HtmlDocumentProcessor(),
        new DumperStep())
    {
        MaximumThreadCount = 10,
        MaximumCrawlDepth = 2,
        ExcludeFilter = Program.ExtensionsToSkip,
    })
    {
        // Begin crawl
        crawler.Crawl();
    }
}
public static void Run()
{
    NCrawlerModule.Setup();
    Console.Out.WriteLine("http://www.cefa.com/FundSelector/");

    // Crawl http://www.cefa.com/ with 4 threads and maximum depth 10000:
    // * Step 1 - HtmlDocumentProcessor: parses html, extracts links and text
    // * Step 2 - DumperStep: custom step that visualizes the crawl
    // (PDF text extraction and language detection steps are available but
    // disabled for this demo.)
    using (var crawler = new Crawler(
        new Uri("http://www.cefa.com/"),
        new HtmlDocumentProcessor(),
        //new iTextSharpPdfProcessor.iTextSharpPdfProcessor(),
        //new GoogleLanguageDetection(),
        new DumperStep())
    {
        MaximumThreadCount = 4,
        MaximumCrawlDepth = 10000,
        ExcludeFilter = Program.ExtensionsToSkip,
    })
    {
        // Begin crawl
        crawler.Crawl();
    }
}
public static void Run()
{
    // Demo 2 - Find broken links.
    // NOTE: the original called NCrawlerModule.Setup() twice in a row; the
    // duplicate call was redundant and has been removed.
    NCrawlerModule.Setup();
    Console.Out.WriteLine("\nFind broken links demo");

    // Crawl http://ncrawler.codeplex.com with 5 threads and maximum depth 2,
    // using a 2-step pipeline:
    // * Step 1 - HtmlDocumentProcessor: parses html, extracts links
    // * Step 2 - DumpBrokenLinksStep: custom step that reports broken links
    using (var c = new Crawler(
        new Uri("http://ncrawler.codeplex.com"),
        new HtmlDocumentProcessor(),
        new DumpBrokenLinksStep())
    {
        MaximumThreadCount = 5,
        MaximumCrawlDepth = 2,
    })
    {
        // Begin crawl
        c.Crawl();
    }
}
public static void Run()
{
    NCrawlerModule.Setup();

    // Swap in CustomCrawlerRules (defined below) as the ICrawlerRules
    // implementation resolved by the container.
    NCrawlerModule.Register(builder => builder
        .Register((container, parameters) =>
        {
            NCrawlerModule.Setup(); // Return to standard setup
            return new CustomCrawlerRules(
                parameters.TypedAs<Crawler>(),
                container.Resolve<IRobot>(parameters),
                parameters.TypedAs<Uri>(),
                parameters.TypedAs<ICrawlerHistory>());
        })
        .As<ICrawlerRules>()
        .InstancePerDependency());

    Console.Out.WriteLine("Advanced crawl demo");

    using (var crawler = new Crawler(
        new Uri("http://www.cefa.com/"),
        new HtmlDocumentProcessor(), // Process html
        new DumperStep())
    {
        MaximumThreadCount = 5,
        MaximumCrawlDepth = 3,
        MaximumCrawlCount = 10000,
        ExcludeFilter = Program.ExtensionsToSkip
        // An IncludeFilter (e.g. a RegexFilter restricting the crawl to
        // www.cefa.com) could be added here if needed.
    })
    {
        // Begin crawl
        crawler.Crawl();
    }
}
public static void Run(MainForm form, Uri uri, CookieContainer cc)
{
    MainForm = form;

    // Install a downloader module that carries the supplied cookie container.
    var modules = new Module[]
    {
        new CustomDownloaderModule(cc)
        //,new FileStorageModule(".", true)
    };
    NCrawlerModule.Setup(modules);

    using (var crawler = new Crawler(
        uri,
        new HtmlDocumentProcessor(),
        new ReviewStep(),
        new DumpStep()))
    {
        crawler.AfterDownload += c_AfterDownload;
        crawler.MaximumCrawlDepth = 1;
        crawler.MaximumThreadCount = 1;
        crawler.Crawl();
    }
}
public void pesquisa(string termo, List <Uri> seeds, bool flagTodosTermos)
{
    // NOTE(review): flagTodosTermos is currently unused in this method body.
    NCrawlerModule.Setup();

    // Crawl starting from the first seed url, honouring robots.txt, with up
    // to 2 threads, depth 'profundidade' and at most 20 pages, using:
    // * Step 1 - AddUrls: queues the remaining seed urls
    // * Step 2 - HtmlDocumentProcessor: parses html, extracts links and text
    // * Step 3 - GoogleLanguageDetection: guesses page language
    // * Step 4 - Web2TextStep: collects matches for 'termo' into 'paginas'
    using (var crawler = new Crawler(
        seeds[0],
        new AddUrls(seeds),
        new HtmlDocumentProcessor(),
        new GoogleLanguageDetection(),
        new Web2TextStep(termo, paginas))
    {
        AdhereToRobotRules = true,
        MaximumThreadCount = 2,
        MaximumCrawlDepth = profundidade,
        MaximumCrawlCount = 20,
        ExcludeFilter = ExtensionsToSkip,
    })
    {
        // Begin crawl
        crawler.Crawl();
    }
}
/// <summary>
/// Installs the Entity Framework services module as the main NCrawler module.
/// </summary>
/// <param name="resume">True if the module should resume its previous work; false otherwise.</param>
public static void Setup(bool resume)
{
    NCrawlerModule.Setup(new EfServicesModule(resume));
}
// NOTE(review): the method name says "EfServices" but this installs
// IsolatedStorageModule — looks like a copy/paste slip; confirm against
// callers before renaming.
public static void SetupEfServicesStorage() { NCrawlerModule.Setup(new IsolatedStorageModule(false), new TestModule()); }
public static void SetupInMemoryStorage()
{
    // The default NCrawlerModule keeps all crawl state in memory.
    NCrawlerModule.Setup(new NCrawlerModule(), new TestModule());
}
public static void SetupFileStorage()
{
    // Store crawl state on disk in the directory of the executing assembly.
    string assemblyDirectory = new FileInfo(Assembly.GetExecutingAssembly().Location).DirectoryName;
    NCrawlerModule.Setup(new FileStorageModule(assemblyDirectory, false), new TestModule());
}
public static void SetupFileServicesStorage()
{
    // Store crawl state on disk in the current working directory.
    NCrawlerModule.Setup(new FileStorageModule(Directory.GetCurrentDirectory(), false), new TestFileStorageModule());
}
// NOTE(review): the method name says "EfServices" but this installs
// EsentServicesModule / TestEsentModule — likely should be named
// SetupEsentServicesStorage; confirm against callers before renaming.
public static void SetupEfServicesStorage() { NCrawlerModule.Setup(new EsentServicesModule(false), new TestEsentModule()); }
public static void SetupDbServicesStorage()
{
    // Store crawl state via the database services module (resume disabled).
    NCrawlerModule.Setup(new DbServicesModule(false), new TestModule());
}