Example #1
        public static void Run()
        {
            NCrawlerModule.Setup();
            Console.Out.WriteLine("\nSimple indexer demo");

            // Setup crawler to crawl/index http://ncrawler.codeplex.com
            //  * Step 1 - The Html Processor, parses and extracts links, text and more from html
            //  * Step 2 - Custom step, that is supposed to send content to an Index or Database
            using (var c = new Crawler(new Uri("http://ncrawler.codeplex.com"),
                                       new HtmlDocumentProcessor( // Process html, filter links and content
                                           // Setup filter that removes all the text between <body and </body>
                                           // This can be custom tags like <!--BeginTextFilter--> and <!--EndTextFilter-->
                                           // or whatever you prefer. This way you can control what text is extracted on every page.
                                           // In most cases you just want to filter out the header information or menu text.
                                           new Dictionary<string, string>
                                           {
                                               { "<body", "</body>" }
                                           },
                                           // Setup filter that tells the crawler not to follow links between tags
                                           // that start with <head and end with </head>. This can be custom tags like
                                           // <!--BeginNoFollow--> and <!--EndNoFollow--> or whatever you prefer.
                                           // This way you can control which links the crawler should not follow.
                                           new Dictionary<string, string>
                                           {
                                               { "<head", "</head>" }
                                           }),
                                       new IndexerDemo()) // Custom step to send filtered content to an index
            {
                MaximumThreadCount = 2
            })
            {
                // Begin crawl
                c.Crawl();
            }
        }
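
The IndexerDemo step referenced above is not shown in this example. Below is a minimal sketch of such a custom pipeline step, assuming NCrawler's IPipelineStep interface with a Process(Crawler, PropertyBag) method and a PropertyBag exposing Title, Text and Step.Uri (verify these against your NCrawler version); the indexing call itself is left as a placeholder:

        using System;
        using NCrawler;
        using NCrawler.Interfaces;

        // Hypothetical custom pipeline step: receives the content already
        // filtered by HtmlDocumentProcessor and hands it to an index or database.
        public class IndexerDemo : IPipelineStep
        {
            public void Process(Crawler crawler, PropertyBag propertyBag)
            {
                // Replace this console dump with a call into your index or database.
                Console.Out.WriteLine("Url:   {0}", propertyBag.Step.Uri);
                Console.Out.WriteLine("Title: {0}", propertyBag.Title);
                Console.Out.WriteLine("Text:  {0}", propertyBag.Text);
            }
        }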
Example #2
        public static void Run()
        {
            NCrawlerModule.Setup();

            // Register new implementation for ICrawlerRules using our custom class CustomCrawlerRules defined below
            NCrawlerModule.Register(builder =>
                builder.Register((c, p) =>
                {
                    NCrawlerModule.Setup(); // Return to the standard setup
                    return new CustomCrawlerRules(p.TypedAs<Crawler>(), c.Resolve<IRobot>(p), p.TypedAs<Uri>(),
                        p.TypedAs<ICrawlerHistory>());
                })
                .As<ICrawlerRules>()
                .InstancePerDependency());

            Console.Out.WriteLine("Advanced crawl demo");

            using (var c = new Crawler(
                       new Uri("http://ncrawler.codeplex.com"),
                       new HtmlDocumentProcessor(),          // Process html
                       new DumperStep())
            {
                MaximumThreadCount = 2,
                MaximumCrawlDepth = 2,
                ExcludeFilter = Program.ExtensionsToSkip,
            })
            {
                // Begin crawl
                c.Crawl();
            }
        }
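
The CustomCrawlerRules class registered above is not shown. One plausible sketch, assuming a CrawlerRules base class whose constructor takes the same (Crawler, IRobot, Uri, ICrawlerHistory) arguments the Register lambda resolves, and which exposes a virtual IsAllowedUrl; the same-host restriction is just an illustrative policy:

        using System;
        using NCrawler;
        using NCrawler.Interfaces;

        // Hypothetical rules override: keeps the standard behavior, but refuses
        // to follow links that leave the host of the start uri.
        public class CustomCrawlerRules : CrawlerRules
        {
            private readonly Uri baseUri;

            public CustomCrawlerRules(Crawler crawler, IRobot robot, Uri baseUri, ICrawlerHistory crawlerHistory)
                : base(crawler, robot, baseUri, crawlerHistory)
            {
                this.baseUri = baseUri;
            }

            public override bool IsAllowedUrl(Uri uri, CrawlStep referrer)
            {
                if (!uri.Host.Equals(baseUri.Host, StringComparison.OrdinalIgnoreCase))
                {
                    return false; // Skip external links entirely
                }

                return base.IsAllowedUrl(uri, referrer);
            }
        }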
Example #3
		public static void Run()
		{
			NCrawlerModule.Setup();
			Console.Out.WriteLine("Simple crawl demo");

			// Setup crawler to crawl http://ncrawler.codeplex.com
			// with 10 threads adhering to robot rules, and a maximum depth
			// of 10 with 5 pipeline steps:
			//	* Step 1 - The Html Processor, parses and extracts links, text and more from html
			//  * Step 2 - Processes PDF files, extracting text
			//  * Step 3 - Tries to determine the page language from the extracted text, using google language detection
			//  * Step 4 - Processes MP3 files
			//  * Step 5 - Dumps the information to the console; this is a custom step, see the DumperStep class
			using (Crawler c = new Crawler(new Uri("http://ncrawler.codeplex.com"),
				new HtmlDocumentProcessor(), // Process html
				new iTextSharpPdfProcessor.iTextSharpPdfProcessor(), // Add PDF text extraction
				new GoogleLanguageDetection(), // Add language detection
				new Mp3FileProcessor(), // Add MP3 file processing
				new DumperStep()) // Custom step to visualize the crawl
				{
					MaximumThreadCount = 10,
					MaximumCrawlDepth = 10,
					ExcludeFilter = Program.ExtensionsToSkip,
				})
			{
				// Begin crawl
				c.Crawl();
			}
		}
Example #4
 public static void SetupCustomCrawlerModule()
 {
     NCrawlerModule.Setup(new Module[1]
     {
         new CustomNCrawlerModule()
     });
 }
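
NCrawlerModule.Setup accepts Autofac modules, so CustomNCrawlerModule is typically a class deriving from NCrawlerModule (itself an Autofac Module) that overrides Load to swap in alternative services. A minimal sketch; MyCrawlerHistoryService is a hypothetical stand-in for whatever implementation you actually register:

 using Autofac;
 using NCrawler;
 using NCrawler.Interfaces;

 public class CustomNCrawlerModule : NCrawlerModule
 {
     protected override void Load(ContainerBuilder builder)
     {
         // Keep the default registrations, then override selected services.
         base.Load(builder);

         // MyCrawlerHistoryService is hypothetical; substitute your own type.
         builder.RegisterType<MyCrawlerHistoryService>()
             .As<ICrawlerHistory>()
             .InstancePerDependency();
     }
 }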
Example #5
        public static void Run()
        {
            NCrawlerModule.Setup();
            Console.Out.WriteLine("Advanced crawl demo");

            using (Crawler c = new CustomCrawler(new Uri("http://ncrawler.codeplex.com"),
                                                 new HtmlDocumentProcessor(), // Process html
                                                 new DumperStep())
            {
                MaximumThreadCount = 10,
                MaximumCrawlDepth = 2,
                ExcludeFilter = Program.ExtensionsToSkip,
            })
            {
                // Begin crawl
                c.Crawl();
            }
        }
Example #6
        public static void Run()
        {
            NCrawlerModule.Setup();
            Console.Out.WriteLine("http://www.cefa.com/FundSelector/");

            /*
             * int count = 0;
             * foreach (string url in new StringPatternGenerator("http://ncrawler[a,b,c,d,e,f].codeplex.com/view[0-10].aspx?param1=[a-c]&param2=[D-F]"))
             * {
             *  Assert.IsTrue(crawlerHistory.Register(url));
             *  Assert.IsFalse(crawlerHistory.Register(url));
             *  count++;
             *  Assert.AreEqual(count, crawlerHistory.RegisteredCount);
             * }
             *
             * if (crawlerHistory is IDisposable)
             * {
             *  ((IDisposable)crawlerHistory).Dispose();
             * }
             */

            // Setup crawler to crawl http://www.cefa.com/
            // with 4 threads adhering to robot rules, and a maximum depth
            // of 10000 with 2 pipeline steps:
            //	* Step 1 - The Html Processor, parses and extracts links, text and more from html
            //  * Step 2 - Dumps the information to the console; this is a custom step, see the DumperStep class
            using (Crawler c = new Crawler(new Uri("http://www.cefa.com/"),
                                           new HtmlDocumentProcessor(), // Process html
                                           //new iTextSharpPdfProcessor.iTextSharpPdfProcessor(),
                                           //new GoogleLanguageDetection(),
                                           new DumperStep()) // Custom step to visualize the crawl
            {
                MaximumThreadCount = 4,
                MaximumCrawlDepth = 10000,
                ExcludeFilter = Program.ExtensionsToSkip,
            })
            {
                // Begin crawl
                c.Crawl();
            }
        }
Example #7
        public static void Run()
        {
            NCrawlerModule.Setup();
            // Demo 2 - Find broken links
            Console.Out.WriteLine("\nFind broken links demo");

            // Setup crawler to crawl http://ncrawler.codeplex.com
            // with 5 threads adhering to robot rules, and a maximum depth
            // of 2 with 2 pipeline steps
            using (var c = new Crawler(new Uri("http://ncrawler.codeplex.com"),
                                       new HtmlDocumentProcessor(), // Process html
                                       new DumpBrokenLinksStep())   // Custom pipeline Step
            {
                MaximumThreadCount = 5,
                MaximumCrawlDepth = 2,
            })
            {
                // Begin crawl
                c.Crawl();
            }
        }
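
DumpBrokenLinksStep is another custom pipeline step. A minimal sketch, assuming PropertyBag exposes the HTTP status code of each downloaded page; a real implementation would likely also record the referrer so the broken link can be located:

        using System;
        using System.Net;
        using NCrawler;
        using NCrawler.Interfaces;

        // Hypothetical step: report every crawled page that did not answer 200 OK.
        public class DumpBrokenLinksStep : IPipelineStep
        {
            public void Process(Crawler crawler, PropertyBag propertyBag)
            {
                if (propertyBag.StatusCode != HttpStatusCode.OK)
                {
                    Console.Out.WriteLine("Broken link: {0} ({1})",
                        propertyBag.Step.Uri, propertyBag.StatusCode);
                }
            }
        }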
Example #8
        public static void Run()
        {
            NCrawlerModule.Setup();

            // Register new implementation for ICrawlerRules using our custom class CustomCrawlerRules defined below
            NCrawlerModule.Register(builder =>
                builder.Register((c, p) =>
                {
                    NCrawlerModule.Setup(); // Return to the standard setup
                    return new CustomCrawlerRules(p.TypedAs<Crawler>(), c.Resolve<IRobot>(p), p.TypedAs<Uri>(),
                        p.TypedAs<ICrawlerHistory>());
                })
                .As<ICrawlerRules>()
                .InstancePerDependency());

            Console.Out.WriteLine("Advanced crawl demo");

            using (Crawler c = new Crawler(
                       new Uri("http://www.cefa.com/"),
                       new HtmlDocumentProcessor(),          // Process html
                       new DumperStep())
            {
                MaximumThreadCount = 5,
                MaximumCrawlDepth = 3,
                MaximumCrawlCount = 10000,
                ExcludeFilter = Program.ExtensionsToSkip
                //,
                //IncludeFilter = new[]
                //{
                //    (RegexFilter)new Regex(@"((^http://www.cefa.com/[a-zA-Z0-9\-\.]*)?()$)",
                //        RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase)
                //}
            })
            {
                // Begin crawl
                c.Crawl();
            }
        }
Example #9
        public static void Run(MainForm form, Uri uri, CookieContainer cc)
        {
            MainForm = form;
            var modules = new Module[] {
                new CustomDownloaderModule(cc)
                //,new FileStorageModule(".", true)
            };

            NCrawlerModule.Setup(modules);
            using (Crawler c = new Crawler(
                       uri,
                       new HtmlDocumentProcessor(),
                       new ReviewStep(),
                       new DumpStep()))
            {
                c.AfterDownload += c_AfterDownload;

                c.MaximumCrawlDepth  = 1;
                c.MaximumThreadCount = 1;
                c.Crawl();
            }
        }
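
The CustomDownloaderModule that carries the CookieContainer is not shown. One plausible shape, assuming it derives from NCrawlerModule and simply publishes the cookies through the IoC container so a cookie-aware downloader registration (omitted here) can resolve them:

        using System.Net;
        using Autofac;
        using NCrawler;

        // Hypothetical module: exposes the caller's CookieContainer through the
        // IoC container so other components (e.g. a custom downloader) can use it.
        public class CustomDownloaderModule : NCrawlerModule
        {
            private readonly CookieContainer cookieContainer;

            public CustomDownloaderModule(CookieContainer cookieContainer)
            {
                this.cookieContainer = cookieContainer;
            }

            protected override void Load(ContainerBuilder builder)
            {
                base.Load(builder);
                builder.RegisterInstance(cookieContainer);
                // A real module would also register the cookie-aware downloader here.
            }
        }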
Example #10
    public void pesquisa(string termo, List<Uri> seeds, bool flagTodosTermos)
    {
        NCrawlerModule.Setup();
        // Setup crawler to crawl the seed URLs
        // with 2 threads adhering to robot rules, and a maximum depth
        // of 'profundidade' with 4 pipeline steps:
        //	* Step 1 - AddUrls, a custom step that queues the remaining seed URLs
        //  * Step 2 - The Html Processor, parses and extracts links, text and more from html
        //  * Step 3 - Tries to determine the page language from the extracted text, using google language detection
        //  * Step 4 - Web2TextStep, a custom step that collects the extracted page text
        using (Crawler c = new Crawler(
                   //new Uri("http://forcaavense.com/"),
                   seeds[0],
                   new AddUrls(seeds),
                   new HtmlDocumentProcessor(),              // Process html
                   new GoogleLanguageDetection(),            // Add language detection
                   //new Mp3FileProcessor(),
                   //new DumperStep(),
                   new Web2TextStep(termo, paginas))
        {
            AdhereToRobotRules = true,

            MaximumThreadCount = 2,
            MaximumCrawlDepth = profundidade,
            MaximumCrawlCount = 20,
            ExcludeFilter = ExtensionsToSkip,

            //ExcludeFilter = new IFilter[]
            //{
            //    new LambdaFilter((uri, crawlStep) => !uri.ToString().Contains(""))
            //}
        })
        {
            // Begin crawl
            c.Crawl();
        }
    }
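
The AddUrls step at the front of the pipeline queues the remaining seeds once crawling starts. A minimal sketch, assuming Crawler exposes an AddStep(Uri, int) method for enqueuing additional URLs:

    using System;
    using System.Collections.Generic;
    using NCrawler;
    using NCrawler.Interfaces;

    // Hypothetical seeding step: on the first page processed, enqueue the
    // remaining seed URLs at depth 0 so they are crawled as additional roots.
    public class AddUrls : IPipelineStep
    {
        private readonly List<Uri> seeds;
        private bool seeded;

        public AddUrls(List<Uri> seeds)
        {
            this.seeds = seeds;
        }

        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            if (seeded)
            {
                return;
            }

            seeded = true;
            for (int i = 1; i < seeds.Count; i++) // seeds[0] was the start uri
            {
                crawler.AddStep(seeds[i], 0);
            }
        }
    }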
Example #11
 /// <summary>
 /// Setup module as main module.
 /// </summary>
 /// <param name="resume">True if the module should resume its work; false otherwise.</param>
 public static void Setup(bool resume)
 {
     NCrawlerModule.Setup(new EfServicesModule(resume));
 }
Example #12
 public static void SetupEfServicesStorage()
 {
     NCrawlerModule.Setup(new IsolatedStorageModule(false), new TestModule());
 }
Example #13
 public static void SetupInMemoryStorage()
 {
     NCrawlerModule.Setup(new NCrawlerModule(), new TestModule());
 }
Example #14
        public static void SetupFileStorage()
        {
            string storagePath = new FileInfo(Assembly.GetExecutingAssembly().Location).DirectoryName;

            NCrawlerModule.Setup(new FileStorageModule(storagePath, false), new TestModule());
        }
Example #15
 public static void SetupFileServicesStorage()
 {
     NCrawlerModule.Setup(new FileStorageModule(Directory.GetCurrentDirectory(), false), new TestFileStorageModule());
 }
Example #16
 public static void SetupEfServicesStorage()
 {
     NCrawlerModule.Setup(new EsentServicesModule(false), new TestEsentModule());
 }
Example #17
 public static void SetupDbServicesStorage()
 {
     NCrawlerModule.Setup(new DbServicesModule(false), new TestModule());
 }