public static void Run()
{
    NCrawlerModule.Setup();
    Console.Out.WriteLine("Simple crawl demo");

    // Setup crawler to crawl http://ncrawler.codeplex.com
    // with 10 threads adhering to robot rules, and a maximum depth
    // of 10 with 5 pipeline steps:
    // * Step 1 - The Html Processor, parses and extracts links, text and more from html
    // * Step 2 - Processes PDF files, extracting text
    // * Step 3 - Tries to determine the page's language, based on the extracted text, using Google language detection
    // * Step 4 - Processes MP3 files
    // * Step 5 - Dumps the information to the console; this is a custom step, see the DumperStep class
    using (Crawler c = new Crawler(new Uri("http://ncrawler.codeplex.com"),
        new HtmlDocumentProcessor(), // Process html
        new iTextSharpPdfProcessor.iTextSharpPdfProcessor(), // Add PDF text extraction
        new GoogleLanguageDetection(), // Add language detection
        new Mp3FileProcessor(), // Add MP3 file processing
        new DumperStep()) // Custom step to visualize crawl
    {
        MaximumThreadCount = 10,
        MaximumCrawlDepth = 10,
        ExcludeFilter = Program.ExtensionsToSkip,
    })
    {
        // Begin crawl
        c.Crawl();
    }
}
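Several of these demos plug a custom DumperStep into the pipeline to print what was crawled, but the class itself is not shown here. Below is a minimal, hypothetical sketch of such a step, assuming NCrawler's IPipelineStep contract with a Process(Crawler, PropertyBag) method; the PropertyBag member names used are the usual ones but should be treated as assumptions.

using System;
using NCrawler;
using NCrawler.Interfaces;

// Hypothetical dumper-style step: prints basic facts about each downloaded page.
public class ConsoleDumperStep : IPipelineStep
{
    public void Process(Crawler crawler, PropertyBag propertyBag)
    {
        // PropertyBag describes the current crawl step and what earlier
        // pipeline steps (e.g. HtmlDocumentProcessor) extracted from it.
        Console.Out.WriteLine("Url:   {0}", propertyBag.Step.Uri);
        Console.Out.WriteLine("Depth: {0}", propertyBag.Step.Depth);
        Console.Out.WriteLine("Title: {0}", propertyBag.Title);
    }
}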
private void button1_Click(object sender, EventArgs e)
{
    Crawler = new Crawler(textBox1.Text);
    bindingSource1.DataSource = Crawler;
    Crawler.Crawl(textBox1.Text);
    bindingSource1.ResetBindings(false);
}
private void btnCrawl_Click(object sender, EventArgs e)
{
    folderBrowserDialog1.ShowDialog();
    string startingPath = folderBrowserDialog1.SelectedPath;

    try
    {
        if (startingPath != String.Empty)
        {
            crawler.Crawl(startingPath);
        }

        textBoxMessages.Text = $"Found {crawler.GetFiles().Count} music files.";

        if (crawler.GetFiles() != null)
        {
            textBoxMessages.Text = "Tagging all the files. Hang on.";
            var resultFiles = new Dictionary<string, File>();
            tagger = new Tagger(resultFiles);
            tagger.RunTagJob(crawler.GetFiles(), (list) =>
            {
                SetGridDataDelegate d = new SetGridDataDelegate(SetGridData);
                this.Invoke(d, new object[] { list });
            });
            buttonSaveAsPlaylist.Visible = true;
        }
    }
    catch (Exception ex)
    {
        ErrorLogger.LogError(ex);
    }
}
public static void Run(frmMain parentForm, Book book)
{
    form = parentForm;

    IsolatedStorageModule.Setup(false);

    currentBook = book;
    existingReviewIds = CrawlUtil.getNewContext().Reviews
        .Where(r => r.bookId == currentBook.id)
        .Select(r => r.id)
        .ToList();

    baseUri = "http://www.goodreads.com/api/reviews_widget_iframe?did=DEVELOPER_ID&format=html&isbn="
        + book.isbn + "&links=660&min_rating=&review_back=fff&stars=000&text=000";

    c = new Crawler(new Uri(baseUri),
        new HtmlDocumentProcessor(), // Process html
        new ReviewIFrameDumperStep()); // Custom step to visualize crawl

    c.MaximumThreadCount = 1; //** 2012-09-03 changed this from 2 to 1 in hopes that it'll fix the unknown (seemingly) random crashes.
    c.MaximumCrawlDepth = 1;
    c.ExcludeFilter = CrawlUtil.ExtensionsToSkip;
    c.AdhereToRobotRules = false;

    // Begin crawl
    c.Crawl();
}
public static void Run(frmMain parentForm, User user)
{
    form = parentForm;
    count = 0;
    maxPage = 1;

    //use in-memory storage
    baseUri = string.Format("http://www.goodreads.com/user/{0}/favorite_authors", user.id);

    Crawler c = new Crawler(new Uri(baseUri),
        new HtmlDocumentProcessor(), // Process html
        new CrawlFavouriteAuthors_DumperStep(user)); // Custom step to visualize crawl

    c.MaximumThreadCount = 1;
    c.MaximumCrawlDepth = 1;
    c.ExcludeFilter = CrawlUtil.ExtensionsAndPagesToSkip;
    c.BeforeDownload += new EventHandler<NCrawler.Events.BeforeDownloadEventArgs>(c_BeforeDownload);
    c.AdhereToRobotRules = false;

    // Begin crawl
    c.Crawl();
}
public static void Run(SavePageForm parentForm, string url)
{
    Form = parentForm;
    Url = url;

    c = new Crawler(new Uri(url),
        new HtmlDocumentProcessor(), // Process html
        new SaveFileStep());

    c.MaximumThreadCount = 1;
    c.MaximumCrawlDepth = 1;
    c.ExcludeFilter = CrawlUtil.ExtensionsToSkip;
    c.AdhereToRobotRules = false;
    c.CrawlFinished += new EventHandler<NCrawler.Events.CrawlFinishedEventArgs>(c_CrawlFinished);

    string ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);

    //if there are no unblocked user agents left then reset the tracker and retry
    if (ua == null)
    {
        UserAgentTracker = CrawlUtil.InitUserAgentTracker();
        ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
    }

    c.UserAgent = ua;

    // Begin crawl
    c.Crawl();
}
public static void Run()
{
    NCrawlerModule.Setup();

    // Register new implementation for ICrawlerRules using our custom class CustomCrawlerRules defined below
    NCrawlerModule.Register(builder =>
        builder.Register((c, p) =>
        {
            NCrawlerModule.Setup(); // Return to standard setup
            return new CustomCrawlerRules(p.TypedAs<Crawler>(), c.Resolve<IRobot>(p),
                p.TypedAs<Uri>(), p.TypedAs<ICrawlerHistory>());
        }).
        As<ICrawlerRules>().
        InstancePerDependency());

    Console.Out.WriteLine("Advanced crawl demo");

    using (Crawler c = new Crawler(
        new Uri("http://ncrawler.codeplex.com"),
        new HtmlDocumentProcessor(), // Process html
        new DumperStep())
    {
        MaximumThreadCount = 2,
        MaximumCrawlDepth = 2,
        ExcludeFilter = Program.ExtensionsToSkip,
    })
    {
        // Begin crawl
        c.Crawl();
    }
}
void SetSnapshot(DataRenderer dataRenderer, PackedMemorySnapshot snapshot)
{
    if (snapshot == null)
    {
        m_RawSnapshot = null;
        m_RawSchema = null;
        SchemaToDisplay = null;
        UpdateTableSelectionNames();
        return;
    }

    m_RawSnapshot = snapshot;

    ProgressBarDisplay.ShowBar(string.Format("Opening snapshot: {0}",
        System.IO.Path.GetFileNameWithoutExtension(snapshot.filePath)));

    var cachedSnapshot = new CachedSnapshot(snapshot);

    using (Profiling.GetMarker(Profiling.MarkerId.CrawlManagedData).Auto())
    {
        var crawling = Crawler.Crawl(cachedSnapshot);
        crawling.MoveNext(); //start execution

        var status = crawling.Current as EnumerationStatus;
        float progressPerStep = 1.0f / status.StepCount;
        while (crawling.MoveNext())
        {
            ProgressBarDisplay.UpdateProgress(status.CurrentStep * progressPerStep, status.StepStatus);
        }
    }

    ProgressBarDisplay.ClearBar();

    m_RawSchema = new RawSchema();
    m_RawSchema.SetupSchema(cachedSnapshot, dataRenderer);

    SchemaToDisplay = m_RawSchema;

    if (k_DefaultViewFilePath.Length > 0)
    {
        using (ScopeDebugContext.Func(() => { return "File '" + k_DefaultViewFilePath + "'"; }))
        {
            Database.View.ViewSchema.Builder builder = null;
            using (Profiling.GetMarker(Profiling.MarkerId.LoadViewDefinitionFile).Auto())
            {
                builder = Database.View.ViewSchema.Builder.LoadFromXMLFile(k_DefaultViewFilePath);
            }

            if (builder != null)
            {
                using (Profiling.GetMarker(Profiling.MarkerId.BuildViewDefinitionFile).Auto())
                {
                    ViewSchema = builder.Build(m_RawSchema);
                }

                if (ViewSchema != null)
                {
                    SchemaToDisplay = ViewSchema;
                }
            }
        }
    }

    UpdateTableSelectionNames();
}
public override bool IsExternalUrl(Uri uri)
{
    // Is External Url
    if (base.IsExternalUrl(uri))
    {
        // Yes, check if we have crawled it before
        if (!m_CrawlerHistory.Register(uri.GetUrlKeyString(UriSensitivity)))
        {
            // Create child crawler to traverse external site with max 2 levels
            using (Crawler externalCrawler = new Crawler(uri,
                new HtmlDocumentProcessor(), // Process html
                new DumperStep())
            {
                MaximumThreadCount = 1,
                MaximumCrawlDepth = 2,
                MaximumCrawlCount = 10,
                ExcludeFilter = Program.ExtensionsToSkip,
            })
            {
                // Crawl external site
                externalCrawler.Crawl();
            }
        }

        // Do not follow link on this crawler
        return true;
    }

    return false;
}
public static void Run(frmMain parentForm, User user)
{
    form = parentForm;

    //use in-memory storage
    baseUri = "http://www.goodreads.com/user/show/" + user.userIdString;
    //http://www.goodreads.com/user/show/104320-erin-beck
    //http://www.goodreads.com/author/show/3360351.Ryan_Dilbert

    Crawler c = new Crawler(new Uri(baseUri),
        new HtmlDocumentProcessor(), // Process html
        new UserProfileDumperStep(user)); // Custom step to visualize crawl

    c.MaximumThreadCount = 1;
    c.MaximumCrawlDepth = 1;
    c.ExcludeFilter = CrawlUtil.ExtensionsAndPagesToSkip;
    c.BeforeDownload += new EventHandler<NCrawler.Events.BeforeDownloadEventArgs>(c_BeforeDownload);
    c.AdhereToRobotRules = false;

    // Begin crawl
    c.Crawl();
}
static void Main(string[] args)
{
    var crawler = new Crawler("./Sample");
    var dir = crawler.Crawl(null, "./", "./");
    Console.WriteLine(JsonConvert.SerializeObject(dir, Formatting.Indented));
}
public ActionResult Crawl(int mappingCode)
{
    var mapping = _db.Mappings
        .Include(m => m.Urls)
        .Include(m => m.Properties)
        .SingleOrDefault(m => m.Id == mappingCode);

    List<RecordValueViewModel> result = Crawler.Crawl(Factory.Convert(mapping));

    return Json(result, JsonRequestBehavior.AllowGet);
    //return PartialView(result);
}
public override bool IsExternalUrl(Uri uri)
{
    // Is External Url
    if (!base.IsExternalUrl(uri))
    {
        return false;
    }

    // Yes, check if we have crawled it before
    if (!m_CrawlerHistory.Register(uri.GetUrlKeyString(m_Crawler.UriSensitivity)))
    {
        return true;
    }

    // Create child crawler to traverse external site with max 2 levels
    using (Crawler externalCrawler = new Crawler(uri,
        new HtmlDocumentProcessor(), // Process html
        new DumperStep())
    {
        MaximumThreadCount = 1,
        MaximumCrawlDepth = 2,
        MaximumCrawlCount = 10,
        ExcludeFilter = Program.ExtensionsToSkip,
    })
    {
        // Crawl external site
        externalCrawler.Crawl();
    }

    // Do not follow link on this crawler
    return true;
}
public static void Run(frmMain parentForm, User user)
{
    form = parentForm;
    count = 0;

    //use in-memory storage
    baseUri = "http://www.goodreads.com/list/user_votes/" + user.userIdString;

    Crawler c = new Crawler(new Uri(baseUri),
        new HtmlDocumentProcessor(), // Process html
        new CrawlListAndVotes_DumperStep(user)); // Custom step to visualize crawl

    c.MaximumThreadCount = 1;
    c.MaximumCrawlDepth = 1;
    c.ExcludeFilter = CrawlUtil.ExtensionsAndPagesToSkip;
    c.BeforeDownload += new EventHandler<NCrawler.Events.BeforeDownloadEventArgs>(c_BeforeDownload);
    c.AdhereToRobotRules = false;

    // Begin crawl
    c.Crawl();
}
static void Main(string[] args)
{
    string startUrl = "https://www.cnblogs.com/Xy--1/";
    Crawler myCrawler = new Crawler(startUrl);
    myCrawler.Crawl();
}
static void Main(string[] args)
{
    var helpArgs = new string[] { "help", "-help", "/help" };
    if (args.Any(x => helpArgs.Contains(x.ToLower())))
    {
        ShowUsage();
        return;
    }

    ICrawler crawler = new Crawler();

    Console.WriteLine(
        $"AssemblyName\tType\tPath" +
        $"\tIsPackage\tPackageId\tRepositoryUrl" +
        $"\tDescription\tTargetFrameworks");

    foreach (ProjInfo pi in crawler.Crawl(Environment.CurrentDirectory))
    {
        string relativePath = PathHelper.GetRelativePath(Environment.CurrentDirectory, pi.CsProjPath);
        Console.WriteLine(
            $"{pi.AssName}\t{pi.AssType}\t{relativePath}" +
            $"\t{pi.GeneratePackage}\t{pi.PackageId}\t{pi.RepositoryUrl}" +
            $"\t{pi.Description}\t{pi.TargetFrameworks}");
    }
}
public static void Run()
{
    IsolatedStorageModule.Setup(false);
    Console.Out.WriteLine("Simple crawl demo using IsolatedStorage");

    // Setup crawler to crawl http://ncrawler.codeplex.com
    // with 2 threads adhering to robot rules, and a maximum depth
    // of 10 with 4 pipeline steps:
    // * Step 1 - The Html Processor, parses and extracts links, text and more from html
    // * Step 2 - Processes PDF files, extracting text
    // * Step 3 - Tries to determine the page's language, based on the extracted text, using Google language detection
    // * Step 4 - Dumps the information to the console; this is a custom step, see the DumperStep class
    using (var c = new Crawler(new Uri("http://ncrawler.codeplex.com"),
        new HtmlDocumentProcessor(), // Process html
        new iTextSharpPdfProcessor.iTextSharpPdfProcessor(), // Add PDF text extraction
        new GoogleLanguageDetection(), // Add language detection
        new DumperStep()) // Custom step to visualize crawl
    {
        MaximumThreadCount = 2,
        MaximumCrawlDepth = 10,
        ExcludeFilter = Program.ExtensionsToSkip,
    })
    {
        // Begin crawl
        c.Crawl();
    }
}
public static void Run()
{
    NCrawlerModule.Setup();
    Console.Out.WriteLine("\nSimple indexer demo");

    // Setup crawler to crawl/index http://ncrawler.codeplex.com
    // * Step 1 - The Html Processor, parses and extracts links, text and more from html
    // * Step 2 - Custom step, that is supposed to send content to an Index or Database
    using (var c = new Crawler(new Uri("http://ncrawler.codeplex.com"),
        new HtmlDocumentProcessor( // Process html, filter links and content
            // Setup filter that removes all the text between <body and </body>.
            // This can be custom tags like <!--BeginTextFilter--> and <!--EndTextFilter-->
            // or whatever you prefer. This way you can control what text is extracted on every page.
            // In most cases you just want to filter out the header information or menu text.
            new Dictionary<string, string>
            {
                { "<body", "</body>" }
            },
            // Setup filter that tells the crawler not to follow links between tags
            // that start with <head and end with </head>. This can be custom tags like
            // <!--BeginNoFollow--> and <!--EndNoFollow--> or whatever you prefer.
            // This way you can control which links the crawler should not follow.
            new Dictionary<string, string>
            {
                { "<head", "</head>" }
            }),
        new IndexerDemo()) // Custom step to send filtered content to an index
    {
        MaximumThreadCount = 2
    })
    {
        // Begin crawl
        c.Crawl();
    }
}
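The IndexerDemo step referenced above is not shown either. As a rough, hypothetical sketch (again assuming NCrawler's IPipelineStep contract and the usual PropertyBag members), such a step would take the text that HtmlDocumentProcessor left in the PropertyBag and hand it to whatever index or database you use; here it just collects it in memory.

using System.Collections.Concurrent;
using NCrawler;
using NCrawler.Interfaces;

// Hypothetical indexer-style step: collects filtered page text keyed by URL.
// A real implementation would push to a search index or database instead.
public class SimpleIndexerStep : IPipelineStep
{
    private readonly ConcurrentDictionary<string, string> m_Index =
        new ConcurrentDictionary<string, string>();

    public void Process(Crawler crawler, PropertyBag propertyBag)
    {
        // propertyBag.Text holds the text that survived the body/head filters
        // configured on HtmlDocumentProcessor above.
        m_Index[propertyBag.Step.Uri.ToString()] = propertyBag.Text;
    }
}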
public void MaximumCrawlTime()
{
    TestModule.SetupInMemoryStorage();

    // Setup
    Stopwatch timer;
    using (Crawler c = new Crawler(new Uri("http://ncrawler.codeplex.com"),
        new HtmlDocumentProcessor()) // Process html
    {
        MaximumThreadCount = 10,
        MaximumCrawlDepth = 10,
        MaximumCrawlTime = TimeSpan.FromSeconds(2)
    })
    {
        timer = Stopwatch.StartNew();

        // Run
        c.Crawl();
        timer.Stop();
    }

    // Allow time for a graceful finish
    Assert.Less(timer.ElapsedMilliseconds, 10000);
}
public static void Main(string[] args)
{
    Crawler crawler = new Crawler(new Uri("http://allrecipes.com/"));
    crawler.LoadRobotsTxt().Wait();
    crawler.Crawl().Wait();
}
public static void CrawlWith(
    [Required] string address,
    [DefaultValue(true)] bool verbose,
    [DefaultValue(true)] bool includeImages,
    [DefaultValue(true)] bool includeLinks,
    [DefaultValue(true)] bool includeScripts,
    [DefaultValue(true)] bool includeStyles,
    [DefaultValue(true)] bool includeFailureCheck,
    [DefaultValue(true)] bool includeRobots,
    [DefaultValue(100)] int maxDepth,
    [DefaultValue(0)] int delay,
    [DefaultValue("")] string searchExpression,
    [DefaultValue("")] string partnerSites)
{
    var config = GetComplexConfig(address, verbose, includeImages, includeLinks, includeScripts,
        includeStyles, includeFailureCheck, includeRobots, maxDepth, delay, searchExpression, partnerSites);

    Console.WriteLine(JsonConvert.SerializeObject(config));

    Crawler.Crawl(config);

    if (config.Listener.GetCrawlResult().ErrorCount > 0)
    {
        Environment.Exit((int)ExitCode.CrawlError);
    }
}
public void TestMe()
{
    // The test is effectively disabled: everything after this throw is unreachable.
    throw new NotImplementedException();

    var crawler = new Crawler();
    crawler.Crawl("http://google.com");
    Thread.Sleep(3000);
}
public static void Main(string[] args)
{
    try
    {
        // Validate the argument count before indexing into args
        if (args.Length < 4)
        {
            throw new ArgumentException("Invalid number of arguments!");
        }

        string sequenceType = args[0];
        string start = args[1];
        long max = long.Parse(args[2]);
        int pause = int.Parse(args[3]); // Pause interval

        CrawlContext context = GetCrawlContext(sequenceType, start, max, pause);
        Crawler crawler = CreateCrawler(context);

        crawler.Crawl(max, context.QueryType);
    }
    catch (Exception e)
    {
        Console.WriteLine(e);
        Trace.TraceError(e.ToString());
    }
}
static void Main(string[] args)
{
    Crawler crawler = new Crawler();

    IObservable<Uri> observable1 = crawler.Crawl(new Uri("http://www.codinghorror.com/"));
    observable1.Subscribe(
        onNext: Console.WriteLine,
        onCompleted: () => Console.WriteLine("Crawling completed"));

    Console.ReadLine();
}
private async void button1_Click(object sender, EventArgs e)
{
    LogHelper log = new LogHelper(typeof(Form1));

    using (Crawler spider = new Crawler())
    {
        string html = await spider.Crawl(ConstVar.AreaUrl, Encoding.UTF8);
    }
}
public void Run()
{
    Console.WriteLine(typeof(DemoOne).Name);

    // Use the crawling framework; exceptions can be caught through the exception event
    string url = "https://www.baidu.com/";
    Crawler c = new Crawler(url);

    // This first crawl produces no output; the pipeline is configured below
    Console.WriteLine("No pipeline configured");
    c.Crawl();
    Console.WriteLine("First request finished\r");

    // Add a pipeline handler for the second request
    c.AddPipeline(new DemoOneDeal());
    c.Crawl();
    Console.WriteLine("Second request finished\r");

    Console.WriteLine();
}
static void Main(string[] args)
{
    Crawler crawler = new Crawler();

    IObservable<Uri> observable = crawler.Crawl(new Uri("https://dotnet.microsoft.com"));
    observable.Subscribe(
        onNext: Console.WriteLine,
        onCompleted: () => Console.WriteLine("Crawling completed"));

    Console.ReadLine();
}
public void StartCrawlers(Crawler mycrawler)
{
    Thread thread_1 = new Thread(() => mycrawler.Crawl());
    thread_1.Name = "Crawler thread 1";
    Thread thread_2 = new Thread(() => mycrawler.Crawl());
    thread_2.Name = "Crawler thread 2";
    Thread thread_3 = new Thread(() => mycrawler.Crawl());
    thread_3.Name = "Crawler thread 3";

    // Note: these tasks only start the threads, so Task.WaitAll returns once all
    // three threads have been started, not when the crawls themselves finish.
    Task[] tasks =
    {
        Task.Run(() => thread_1.Start()),
        Task.Run(() => thread_2.Start()),
        Task.Run(() => thread_3.Start()),
    };
    Task.WaitAll(tasks);
}
static void Main(string[] args)
{
    Uri uri = new Uri("http://www.csdn.net/");
    Crawler c = new Crawler(uri, new HtmlDocumentProcessor(), new DumperStep());
    c.MaximumThreadCount = 30; // Number of threads
    c.MaximumCrawlDepth = 2;   // Crawl depth
    c.Crawl();                 // Start crawling
}
public async Task<ActionResult> Deploy()
{
    var c = new Crawler();
    await c.Crawl(new Uri(Request.Url.GetLeftPart(UriPartial.Authority)),
        CloudConfigurationManager.GetSetting("BasicPassword"), "");

    return new ContentResult
    {
        Content = string.Join("<br>", c.CrawledUrls) + string.Join("<br>", c.log.entires)
    };
}
private Crawler TestCrawlerMethod(string path, int expectedCount, int recursionLimit)
{
    List<string> uriList = new List<String>();
    uriList.Add(path);

    Crawler crawler = new Crawler(uriList, new Browser(), recursionLimit);
    crawler.Crawl();

    foreach (HttpRequestResult result in crawler.HttpRequestResults)
    {
        try
        {
            if (result.Error != null)
            {
                Console.WriteLine("The error property indicated a {1}, at {0} with the message, \"{2}\"",
                    result.Error.AbsoluteUri.ToString() ?? "null",
                    result.Error.HttpCode.ToString() ?? "null",
                    result.Error.Message.ToString() ?? "null");
            }
            else if (result.ContentType != null && result.IsHtml && result.Content != null)
            {
                Console.WriteLine("Content for requestUrl, {0}, is as follows:\n{1}", result.RequestUrl, result.Content);
            }
            else if (result.ContentType == null)
            {
                Console.WriteLine("ContentType for requestUrl, {0}, is null.", result.RequestUrl);
            }
            else if (!result.IsHtml)
            {
                Console.WriteLine("ContentType for requestUrl, {0}, is not html.", result.RequestUrl);
            }
            else if (result.Content == null)
            {
                Console.WriteLine("Content for requestUrl, {0}, is null.", result.RequestUrl);
            }
            else
            {
                Console.WriteLine("Problem writing result information to console.");
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine("The following exception occurred while attempting to write information about the result.");
            Console.WriteLine(ex);
        }
    }

    Assert.AreEqual(expectedCount, crawler.HttpRequestResults.Count);
    AssertLinksFromRemoteSiteNotRetrieved(crawler);
    AssertLinksNullStateForCssAndHtmlTypes(crawler);
    AssertBadLinksHaveNullAbsoluteUriAndPopulatedEx(crawler);

    return crawler;
}
public void TheRootAddressShouldBeCrawled()
{
    var config = new CrawlerConfig
    {
        RootAddress = new Uri("http://localhost:51746/"),
        Listener = this,
        MaxDepth = 1,
        CrawlerFlags = CrawlerFlags.IncludeLinks | CrawlerFlags.IncludeFailureCheck
    };

    Crawler.Crawl(config);
}
//This is a basic example of how to use the crawler
//In case of a cache miss, it prints out the page's title and
//absolute URI, and saves the page data to the filesystem.
public static void Main(String[] args)
{
    if ((args.Length == 2 || args.Length == 3) && Uri.IsWellFormedUriString(args[0], UriKind.Absolute))
    {
        Uri startingUri = new Uri(args[0]);
        String targetDirectoryPath = args[1];
        bool followExternal = args.Length == 3 && args[2] == "--follow-external";

        Console.WriteLine("Loading from cache...");
        Cache cache = new Cache(startingUri, targetDirectoryPath);
        Console.WriteLine("Cache loaded - {0} pages stored in cache", cache.Count());

        Crawler crawler = new Crawler(cache, followExternal);
        Persister persister = new Persister(targetDirectoryPath, startingUri);

        //This event is fired when the crawler's process is over
        crawler.WorkComplete += () =>
        {
            Environment.Exit(0);
        };

        //This event is fired every time a valid page is downloaded
        crawler.NewPageFetched += (page) =>
        {
            Console.WriteLine(page.Title + " - " + page.Uri.AbsoluteUri);
            persister.Save(page);
        };

        //starts the crawler, on a different thread
        crawler.Crawl(startingUri);
        Console.WriteLine("Crawler started, press CTRL+C to interrupt");
        while (true) { }
    }
    else
    {
        Console.WriteLine("Crawler");
        Console.WriteLine("Usage:");
        Console.WriteLine("Tenteikura.Example.exe <starting_uri> <target_directory> [--options]");
        Console.WriteLine("<starting_uri> : a valid absolute URL which will be the starting point for the crawler");
        Console.WriteLine("<target_directory> : the directory where the page files will be saved");
        Console.WriteLine("");
        Console.WriteLine("OPTIONS:");
        Console.WriteLine("The only option available is --follow-external, which will make the crawler fetch non-local urls as well");
        Console.WriteLine("EXAMPLE: ");
        Console.WriteLine(@"Tenteikura.Example.exe http://telenor.com C:\mytargetdirectory --follow-external");
    }
}
static void Main(string[] args)
{
    ThreadPool.SetMinThreads(200, 200);

    var crawler = new Crawler();
    var fileStore = new FileStore(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Cache"));
    var cachingHandler = new CachingHandler(fileStore)
    {
        InnerHandler = new HttpClientHandler()
    };

    crawler.Requester = new HttpClient(cachingHandler);

    if (ConfigurationManager.ConnectionStrings["UrlStore"] != null)
        crawler.Store = new UrlStore();

    crawler.Crawl("http://YAHOO.COM");
    Console.Read();
}
public static void Run()
{
    NCrawlerModule.Setup();

    // Demo 2 - Find broken links
    Console.Out.WriteLine("\nFind broken links demo");

    // Setup crawler to crawl http://ncrawler.codeplex.com
    // with 5 threads adhering to robot rules, and a maximum depth
    // of 2 with 2 pipeline steps
    NCrawlerModule.Setup();
    using (Crawler c = new Crawler(new Uri("http://ncrawler.codeplex.com"),
        new HtmlDocumentProcessor(), // Process html
        new DumpBrokenLinksStep()) // Custom pipeline step
    {
        MaximumThreadCount = 5,
        MaximumCrawlDepth = 2,
    })
    {
        // Begin crawl
        c.Crawl();
    }
}
private static CollectorStep CollectionCrawl()
{
    CollectorStep collectorStep = new CollectorStep();
    HtmlDocumentProcessor htmlDocumentProcessor = new HtmlDocumentProcessor();

    using (Crawler crawler = new Crawler(new Uri("http://ncrawler.codeplex.com"), collectorStep, htmlDocumentProcessor))
    {
        Console.Out.WriteLine(crawler.GetType());
        crawler.MaximumThreadCount = 5;
        crawler.UriSensitivity = UriComponents.HttpRequestUrl;
        crawler.ExcludeFilter = new[]
        {
            new RegexFilter(
                new Regex(@"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png)",
                    RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase))
        };

        crawler.Crawl();
        return collectorStep;
    }
}