public static void Run() { NCrawlerModule.Setup(); Console.Out.WriteLine("\nSimple indexer demo"); // Setup crawler to crawl/index http://ncrawler.codeplex.com // * Step 1 - The Html Processor, parses and extracts links, text and more from html // * Step 2 - Custom step, that is supposed to send content to an Index or Database using (Crawler c = new Crawler(new Uri("http://www.cefa.com"), new HtmlDocumentProcessor( // Process html, filter links and content // Setup filter that removed all the text between <body and </body> // This can be custom tags like <!--BeginTextFiler--> and <!--EndTextFiler--> // or whatever you prefer. This way you can control what text is extracted on every page // Most cases you want just to filter the header information or menu text new Dictionary <string, string> { { "<head", "</head>" } }, // Setup filter that tells the crawler not to follow links between tags // that start with <head and ends with </head>. This can be custom tags like // <!--BeginNoFollow--> and <!--EndNoFollow--> or whatever you prefer. // This was you can control what links the crawler should not follow new Dictionary <string, string> { { "<table class='Data_Table", "</table>" } }), new ContentIndexer()) { MaximumThreadCount = 2 }) // Custom Step to send filtered content to index { // Begin crawl c.Crawl(); } }
public static void Run()
{
    NCrawlerModule.Setup();

    // Register a new implementation of ICrawlerRules using our custom class CustomCrawlerRules defined below
    NCrawlerModule.Register(builder =>
        builder.Register((c, p) =>
            {
                NCrawlerModule.Setup(); // Return to standard setup
                return new CustomCrawlerRules(
                    p.TypedAs<Crawler>(),
                    c.Resolve<IRobot>(p),
                    p.TypedAs<Uri>(),
                    p.TypedAs<ICrawlerHistory>());
            }).
            As<ICrawlerRules>().
            InstancePerDependency());

    Console.Out.WriteLine("Advanced crawl demo");

    using (Crawler c = new Crawler(
        new Uri("http://www.cefa.com"),
        new HtmlDocumentProcessor(), // Process HTML
        new DumperStep())
        {
            MaximumThreadCount = 2,
            MaximumCrawlDepth = 2,
            MaximumCrawlCount = 1000,
            ExcludeFilter = LinkExtractor.ExtensionsToSkip,
        })
    {
        // Begin crawl
        c.Crawl();
    }
}
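// CustomCrawlerRules is referenced above but defined outside this snippet. A minimal sketch of the shape
// such a class typically takes: derive from NCrawler's CrawlerRules base class, keep the constructor
// signature used in the registration above, and override one of the rule checks. The "login" filter below
// is purely illustrative, and the overridden member is an assumption based on the ICrawlerRules interface.
public class CustomCrawlerRules : CrawlerRules
{
    public CustomCrawlerRules(Crawler crawler, IRobot robot, Uri baseUri, ICrawlerHistory crawlerHistory)
        : base(crawler, robot, baseUri, crawlerHistory)
    {
    }

    public override bool IsAllowedUrl(Uri uri, CrawlStep referrer)
    {
        // Example rule: skip login pages in addition to the default checks
        if (uri.PathAndQuery.IndexOf("login", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return false;
        }

        return base.IsAllowedUrl(uri, referrer);
    }
}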
public static void Run()
{
    List<string> urlList = new List<string>();

    // Query the database for the list of mandatory urls to be crawled
    //using (SqlConnection connection = new SqlConnection("Server=nabccrmdev.cloudapp.net;Database=CEF_db;User Id=cef_admin; Password = BW2016!; "))
    using (SqlConnection connection = new SqlConnection(BiggWhaleDataCollector.Properties.Settings.Default.NCrawlerConn))
    {
        using (SqlCommand command = connection.CreateCommand())
        {
            command.CommandText = "SELECT [Crawl Url] FROM [Crawl Urls]";
            try
            {
                connection.Open();
                using (SqlDataReader reader = command.ExecuteReader())
                {
                    while (reader.Read())
                    {
                        urlList.Add(reader["Crawl Url"].ToString());
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.ToString());
            }
        }
    }

    foreach (string url in urlList)
    {
        NCrawlerModule.Setup(new FileStorageModule(BiggWhaleDataCollector.Properties.Settings.Default.DataFolder, false));
        //FileStorageModule.Setup(BiggWhaleDataCollector.Properties.Settings.Default.DataFolder, false);

        using (Crawler c = new Crawler(
            new Uri(url),
            new HtmlDocumentProcessor(),
            new DumperStep()) // Custom step to visualize the crawl
            {
                MaximumThreadCount = 10,
                MaximumCrawlDepth = 1,
                ExcludeFilter = Program.ExtensionsToSkip,
            })
        {
            // Begin crawl
            c.Crawl();
        }
    }
}
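// Program.ExtensionsToSkip is not shown in this snippet. ExcludeFilter expects an array of IFilter, and the
// NCrawler samples build it from a regular expression over file extensions; the definition below is a sketch
// along those lines, assuming NCrawler's RegexFilter type, and the extension list is only an example.
public static IFilter[] ExtensionsToSkip = new IFilter[]
{
    new RegexFilter(
        new Regex(@"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png|\.ico)$",
            RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase))
};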