Example #1
        public static void Run()
        {
            NCrawlerModule.Setup();
            Console.Out.WriteLine("\nSimple indexer demo");

            // Setup crawler to crawl/index http://www.cefa.com
            //  * Step 1 - The Html Processor, which parses and extracts links, text and more from the html
            //  * Step 2 - Custom step that is supposed to send the content to an index or database
            using (Crawler c = new Crawler(new Uri("http://www.cefa.com"),
                                           new HtmlDocumentProcessor( // Process html, filter links and content
                                               // Setup filter that removes all the text between <head and </head>.
                                               // These can be custom tags like <!--BeginTextFilter--> and <!--EndTextFilter-->
                                               // or whatever you prefer. This way you can control what text is extracted from every page.
                                               // In most cases you just want to filter out the header information or menu text.
                                               new Dictionary<string, string>
                                               {
                                                   { "<head", "</head>" }
                                               },
                                               // Setup filter that tells the crawler not to follow links between tags
                                               // that start with <table class='Data_Table and end with </table>. These can be
                                               // custom tags like <!--BeginNoFollow--> and <!--EndNoFollow--> or whatever you prefer.
                                               // This way you can control which links the crawler should not follow.
                                               new Dictionary<string, string>
                                               {
                                                   { "<table class='Data_Table", "</table>" }
                                               }),
                                           new ContentIndexer()) // Custom step that sends the filtered content to an index
            {
                MaximumThreadCount = 2
            })
            {
                // Begin crawl
                c.Crawl();
            }
        }
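
The ContentIndexer used as Step 2 above is not part of NCrawler; it is the custom step the comments describe. A minimal sketch of what such a step might look like, assuming NCrawler's IPipelineStep interface and with the actual indexing call left as a placeholder:

using System;
using NCrawler;
using NCrawler.Interfaces;

// Sketch only: ContentIndexer is user code, not an NCrawler type. Process() is
// called once per downloaded page, after HtmlDocumentProcessor has filled in
// the Title and Text properties on the property bag.
public class ContentIndexer : IPipelineStep
{
    public void Process(Crawler crawler, PropertyBag propertyBag)
    {
        string url = propertyBag.Step.Uri.ToString();
        string title = propertyBag.Title;
        string text = propertyBag.Text;

        // Placeholder for the real work: push the document into your index or database.
        Console.Out.WriteLine("Indexing {0} - \"{1}\" ({2} chars)",
                              url, title, text == null ? 0 : text.Length);
    }
}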
Example #2
        public static void Run()
        {
            NCrawlerModule.Setup();

            // Register new implementation for ICrawlerRules using our custom class CustomCrawlerRules defined below
            NCrawlerModule.Register(builder =>
                builder.Register((c, p) =>
                    {
                        NCrawlerModule.Setup(); // Return to standard setup
                        return new CustomCrawlerRules(p.TypedAs<Crawler>(), c.Resolve<IRobot>(p), p.TypedAs<Uri>(),
                            p.TypedAs<ICrawlerHistory>());
                    }).
                    As<ICrawlerRules>().
                    InstancePerDependency());

            Console.Out.WriteLine("Advanced crawl demo");

            using (Crawler c = new Crawler(
                       new Uri("http://www.cefa.com"),
                       new HtmlDocumentProcessor(), // Process html
                       new DumperStep())
            {
                MaximumThreadCount = 2,
                MaximumCrawlDepth = 2,
                MaximumCrawlCount = 1000,
                ExcludeFilter = LinkExtractor.ExtensionsToSkip,
            })
            {
                // Begin crawl
                c.Crawl();
            }
        }
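
The CustomCrawlerRules class registered above is described as "defined below" but is not shown in this snippet. A sketch of what it might look like, assuming it follows the usual NCrawler demo pattern of deriving from CrawlerRules and overriding IsAllowedUrl (the constructor signature matches the parameters resolved in the Register call):

using System;
using NCrawler;
using NCrawler.Interfaces;

// Sketch only: the actual rules are whatever the original author implemented.
// The host check below is a hypothetical example of a custom rule.
public class CustomCrawlerRules : CrawlerRules
{
    public CustomCrawlerRules(Crawler crawler, IRobot robot, Uri baseUri, ICrawlerHistory crawlerHistory)
        : base(crawler, robot, baseUri, crawlerHistory)
    {
    }

    public override bool IsAllowedUrl(Uri uri, CrawlStep referrer)
    {
        // Hypothetical rule: stay on the cefa.com domain
        if (!uri.Host.EndsWith("cefa.com", StringComparison.OrdinalIgnoreCase))
        {
            return false;
        }

        return base.IsAllowedUrl(uri, referrer);
    }
}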
Example #3
        public static void Run()
        {
            List<string> urlList = new List<string>();

            // Query the database for the list of mandatory urls to be crawled
            //using (SqlConnection connection = new SqlConnection("Server=nabccrmdev.cloudapp.net;Database=CEF_db;User Id=cef_admin; Password = BW2016!; "))
            using (SqlConnection connection = new SqlConnection(BiggWhaleDataCollector.Properties.Settings.Default.NCrawlerConn))
            {
                using (SqlCommand command = connection.CreateCommand())
                {
                    command.CommandText = "SELECT [Crawl Url] FROM [Crawl Urls]";
                    try
                    {
                        connection.Open();
                        using (SqlDataReader reader = command.ExecuteReader())
                        {
                            while (reader.Read())
                            {
                                urlList.Add(reader["Crawl Url"].ToString());
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine(ex.ToString());
                    }
                }
            }
            foreach (String url in urlList)
            {
                NCrawlerModule.Setup(new FileStorageModule(BiggWhaleDataCollector.Properties.Settings.Default.DataFolder, false));
                //FileStorageModule.Setup(BiggWhaleDataCollector.Properties.Settings.Default.DataFolder, false);
                using (Crawler c = new Crawler(new Uri(url),
                                               new HtmlDocumentProcessor(),
                                               new DumperStep()) // Custom step to visualize the crawl
                {
                    MaximumThreadCount = 10,
                    MaximumCrawlDepth = 1,
                    ExcludeFilter = Program.ExtensionsToSkip,
                })
                {
                    // Begin crawl
                    c.Crawl();
                }
            }
        }
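
Program.ExtensionsToSkip, used as the ExcludeFilter above, is also not shown. In the NCrawler demos the equivalent value (LinkExtractor.ExtensionsToSkip, used in Example #2) is an array of regex-based filters that keep the crawler away from static and binary resources. A sketch under that assumption; the exact namespace of RegexFilter may differ between NCrawler versions, and in the real project this field would sit on the existing Program class:

using System.Text.RegularExpressions;
using NCrawler;            // RegexFilter (assumed location)
using NCrawler.Interfaces; // IFilter

static class Program
{
    // Sketch only: mirrors the demo-style extension filter; adjust the extension
    // list to whatever your crawl should skip.
    public static readonly IFilter[] ExtensionsToSkip =
    {
        new RegexFilter(new Regex(@"(\.jpg|\.jpeg|\.png|\.gif|\.ico|\.css|\.js|\.pdf|\.zip)$",
            RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase))
    };
}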